path: root/arm_compute
Diffstat (limited to 'arm_compute')
-rw-r--r--  arm_compute/Acl.h (renamed from arm_compute/core/NEON/kernels/convolution/common/activation.hpp)  28
-rw-r--r--  arm_compute/Acl.hpp  796
-rw-r--r--  arm_compute/AclDescriptors.h  63
-rw-r--r--  arm_compute/AclEntrypoints.h  265
-rw-r--r--  arm_compute/AclOpenClExt.h  113
-rw-r--r--  arm_compute/AclOperators.h  87
-rw-r--r--  arm_compute/AclTypes.h  215
-rw-r--r--  arm_compute/AclUtils.h (renamed from arm_compute/runtime/Utils.h)  57
-rw-r--r--  arm_compute/AclVersion.h  57
-rw-r--r--  arm_compute/BUILD.bazel  54
-rw-r--r--  arm_compute/core/AccessWindowAutoPadding.h  85
-rw-r--r--  arm_compute/core/AccessWindowStatic.h  101
-rw-r--r--  arm_compute/core/CL/CLCompileContext.h  46
-rw-r--r--  arm_compute/core/CL/CLCoreRuntimeContext.h  75
-rw-r--r--  arm_compute/core/CL/CLDevice.h  39
-rw-r--r--  arm_compute/core/CL/CLHelpers.h  98
-rw-r--r--  arm_compute/core/CL/CLKernelLibrary.h  16
-rw-r--r--  arm_compute/core/CL/CLKernels.h  165
-rw-r--r--  arm_compute/core/CL/CLTypes.h  22
-rw-r--r--  arm_compute/core/CL/CLValidate.h  61
-rw-r--r--  arm_compute/core/CL/ICLArray.h  17
-rw-r--r--  arm_compute/core/CL/ICLDistribution1D.h  102
-rw-r--r--  arm_compute/core/CL/ICLGEMMKernelConfiguration.h  68
-rw-r--r--  arm_compute/core/CL/ICLHOG.h  113
-rw-r--r--  arm_compute/core/CL/ICLKernel.h  387
-rw-r--r--  arm_compute/core/CL/ICLLut.h  94
-rw-r--r--  arm_compute/core/CL/ICLMultiHOG.h  56
-rw-r--r--  arm_compute/core/CL/ICLMultiImage.h  59
-rw-r--r--  arm_compute/core/CL/ICLSimple2DKernel.h  41
-rw-r--r--  arm_compute/core/CL/ICLSimple3DKernel.h  43
-rw-r--r--  arm_compute/core/CL/ICLSimpleKernel.h  66
-rw-r--r--  arm_compute/core/CL/ICLTensor.h  11
-rw-r--r--  arm_compute/core/CL/OpenCL.h  43
-rw-r--r--  arm_compute/core/CL/gemm/CLGEMMHelpers.h  53
-rw-r--r--  arm_compute/core/CL/gemm/native/CLGEMMNativeKernelConfiguration.h  65
-rw-r--r--  arm_compute/core/CL/gemm/native/CLGEMMNativeKernelConfigurationBifrost.h  56
-rw-r--r--  arm_compute/core/CL/gemm/native/CLGEMMNativeKernelConfigurationMidgard.h  51
-rw-r--r--  arm_compute/core/CL/gemm/native/CLGEMMNativeKernelConfigurationValhall.h  53
-rw-r--r--  arm_compute/core/CL/gemm/reshaped/CLGEMMReshapedKernelConfiguration.h  63
-rw-r--r--  arm_compute/core/CL/gemm/reshaped/CLGEMMReshapedKernelConfigurationBifrost.h  56
-rw-r--r--  arm_compute/core/CL/gemm/reshaped/CLGEMMReshapedKernelConfigurationValhall.h  53
-rw-r--r--  arm_compute/core/CL/gemm/reshaped_only_rhs/CLGEMMReshapedOnlyRHSKernelConfiguration.h  63
-rw-r--r--  arm_compute/core/CL/gemm/reshaped_only_rhs/CLGEMMReshapedOnlyRHSKernelConfigurationBifrost.h  59
-rw-r--r--  arm_compute/core/CL/gemm/reshaped_only_rhs/CLGEMMReshapedOnlyRHSKernelConfigurationValhall.h  53
-rw-r--r--  arm_compute/core/CL/kernels/CLAbsoluteDifferenceKernel.h  79
-rw-r--r--  arm_compute/core/CL/kernels/CLAccumulateKernel.h  114
-rw-r--r--  arm_compute/core/CL/kernels/CLActivationLayerKernel.h  89
-rw-r--r--  arm_compute/core/CL/kernels/CLArgMinMaxLayerKernel.h  106
-rw-r--r--  arm_compute/core/CL/kernels/CLBatchConcatenateLayerKernel.h  95
-rw-r--r--  arm_compute/core/CL/kernels/CLBatchNormalizationLayerKernel.h  120
-rw-r--r--  arm_compute/core/CL/kernels/CLBatchToSpaceLayerKernel.h  111
-rw-r--r--  arm_compute/core/CL/kernels/CLBitwiseAndKernel.h  76
-rw-r--r--  arm_compute/core/CL/kernels/CLBitwiseNotKernel.h  56
-rw-r--r--  arm_compute/core/CL/kernels/CLBitwiseOrKernel.h  76
-rw-r--r--  arm_compute/core/CL/kernels/CLBitwiseXorKernel.h  76
-rw-r--r--  arm_compute/core/CL/kernels/CLBoundingBoxTransformKernel.h  99
-rw-r--r--  arm_compute/core/CL/kernels/CLBox3x3Kernel.h  59
-rw-r--r--  arm_compute/core/CL/kernels/CLCannyEdgeKernel.h  187
-rw-r--r--  arm_compute/core/CL/kernels/CLChannelCombineKernel.h  102
-rw-r--r--  arm_compute/core/CL/kernels/CLChannelExtractKernel.h  95
-rw-r--r--  arm_compute/core/CL/kernels/CLChannelShuffleLayerKernel.h  82
-rw-r--r--  arm_compute/core/CL/kernels/CLCol2ImKernel.h  106
-rw-r--r--  arm_compute/core/CL/kernels/CLColorConvertKernel.h  121
-rw-r--r--  arm_compute/core/CL/kernels/CLComparisonKernel.h  89
-rw-r--r--  arm_compute/core/CL/kernels/CLConvertFullyConnectedWeightsKernel.h  90
-rw-r--r--  arm_compute/core/CL/kernels/CLConvolutionKernel.h  224
-rw-r--r--  arm_compute/core/CL/kernels/CLCopyKernel.h  86
-rw-r--r--  arm_compute/core/CL/kernels/CLDeconvolutionLayerUpsampleKernel.h  86
-rw-r--r--  arm_compute/core/CL/kernels/CLDeconvolutionReshapeOutputKernel.h  106
-rw-r--r--  arm_compute/core/CL/kernels/CLDepthConcatenateLayerKernel.h  95
-rw-r--r--  arm_compute/core/CL/kernels/CLDepthConvertLayerKernel.h  91
-rw-r--r--  arm_compute/core/CL/kernels/CLDepthToSpaceLayerKernel.h  84
-rw-r--r--  arm_compute/core/CL/kernels/CLDepthwiseConvolutionLayer3x3NCHWKernel.h  114
-rw-r--r--  arm_compute/core/CL/kernels/CLDepthwiseConvolutionLayer3x3NHWCKernel.h  114
-rw-r--r--  arm_compute/core/CL/kernels/CLDepthwiseConvolutionLayerNativeKernel.h  131
-rw-r--r--  arm_compute/core/CL/kernels/CLDepthwiseConvolutionLayerReshapeWeightsKernel.h  85
-rw-r--r--  arm_compute/core/CL/kernels/CLDerivativeKernel.h  83
-rw-r--r--  arm_compute/core/CL/kernels/CLDilateKernel.h  59
-rw-r--r--  arm_compute/core/CL/kernels/CLDirectConvolutionLayerKernel.h  125
-rw-r--r--  arm_compute/core/CL/kernels/CLElementWiseUnaryLayerKernel.h  68
-rw-r--r--  arm_compute/core/CL/kernels/CLElementwiseOperationKernel.h  224
-rw-r--r--  arm_compute/core/CL/kernels/CLErodeKernel.h  59
-rw-r--r--  arm_compute/core/CL/kernels/CLFFTDigitReverseKernel.h  89
-rw-r--r--  arm_compute/core/CL/kernels/CLFFTRadixStageKernel.h  97
-rw-r--r--  arm_compute/core/CL/kernels/CLFFTScaleKernel.h  86
-rw-r--r--  arm_compute/core/CL/kernels/CLFastCornersKernel.h  133
-rw-r--r--  arm_compute/core/CL/kernels/CLFillBorderKernel.h  86
-rw-r--r--  arm_compute/core/CL/kernels/CLFlattenLayerKernel.h  83
-rw-r--r--  arm_compute/core/CL/kernels/CLFloorKernel.h  81
-rw-r--r--  arm_compute/core/CL/kernels/CLFuseBatchNormalizationKernel.h  126
-rw-r--r--  arm_compute/core/CL/kernels/CLGEMMLowpMatrixMultiplyNativeKernel.h  108
-rw-r--r--  arm_compute/core/CL/kernels/CLGEMMLowpMatrixMultiplyReshapedKernel.h  123
-rw-r--r--  arm_compute/core/CL/kernels/CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel.h  155
-rw-r--r--  arm_compute/core/CL/kernels/CLGEMMLowpOffsetContributionKernel.h  116
-rw-r--r--  arm_compute/core/CL/kernels/CLGEMMLowpOffsetContributionOutputStageKernel.h  135
-rw-r--r--  arm_compute/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ScaleByFloatKernel.h  101
-rw-r--r--  arm_compute/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ScaleKernel.h  103
-rw-r--r--  arm_compute/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel.h  108
-rw-r--r--  arm_compute/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel.h  112
-rw-r--r--  arm_compute/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel.h  112
-rw-r--r--  arm_compute/core/CL/kernels/CLGEMMLowpReductionKernel.h  176
-rw-r--r--  arm_compute/core/CL/kernels/CLGEMMMatrixAccumulateBiasesKernel.h  78
-rw-r--r--  arm_compute/core/CL/kernels/CLGEMMMatrixMultiplyKernel.h  122
-rw-r--r--  arm_compute/core/CL/kernels/CLGEMMMatrixMultiplyNativeKernel.h  127
-rw-r--r--  arm_compute/core/CL/kernels/CLGEMMMatrixMultiplyReshapedKernel.h  150
-rw-r--r--  arm_compute/core/CL/kernels/CLGEMMMatrixMultiplyReshapedOnlyRHSKernel.h  130
-rw-r--r--  arm_compute/core/CL/kernels/CLGEMMMatrixVectorMultiplyKernel.h  84
-rw-r--r--  arm_compute/core/CL/kernels/CLGEMMReshapeLHSMatrixKernel.h  105
-rw-r--r--  arm_compute/core/CL/kernels/CLGEMMReshapeRHSMatrixKernel.h  100
-rw-r--r--  arm_compute/core/CL/kernels/CLGatherKernel.h  89
-rw-r--r--  arm_compute/core/CL/kernels/CLGaussian3x3Kernel.h  59
-rw-r--r--  arm_compute/core/CL/kernels/CLGaussian5x5Kernel.h  83
-rw-r--r--  arm_compute/core/CL/kernels/CLGaussianPyramidKernel.h  111
-rw-r--r--  arm_compute/core/CL/kernels/CLGenerateProposalsLayerKernel.h  85
-rw-r--r--  arm_compute/core/CL/kernels/CLHOGDescriptorKernel.h  122
-rw-r--r--  arm_compute/core/CL/kernels/CLHOGDetectorKernel.h  96
-rw-r--r--  arm_compute/core/CL/kernels/CLHarrisCornersKernel.h  100
-rw-r--r--  arm_compute/core/CL/kernels/CLHeightConcatenateLayerKernel.h  90
-rw-r--r--  arm_compute/core/CL/kernels/CLHistogramKernel.h  111
-rw-r--r--  arm_compute/core/CL/kernels/CLIm2ColKernel.h  136
-rw-r--r--  arm_compute/core/CL/kernels/CLInstanceNormalizationLayerKernel.h  90
-rw-r--r--  arm_compute/core/CL/kernels/CLIntegralImageKernel.h  86
-rw-r--r--  arm_compute/core/CL/kernels/CLL2NormalizeLayerKernel.h  100
-rw-r--r--  arm_compute/core/CL/kernels/CLLKTrackerKernel.h  240
-rw-r--r--  arm_compute/core/CL/kernels/CLLocallyConnectedMatrixMultiplyKernel.h  85
-rw-r--r--  arm_compute/core/CL/kernels/CLMagnitudePhaseKernel.h  90
-rw-r--r--  arm_compute/core/CL/kernels/CLMeanStdDevKernel.h  98
-rw-r--r--  arm_compute/core/CL/kernels/CLMeanStdDevNormalizationKernel.h  90
-rw-r--r--  arm_compute/core/CL/kernels/CLMedian3x3Kernel.h  59
-rw-r--r--  arm_compute/core/CL/kernels/CLMemsetKernel.h  85
-rw-r--r--  arm_compute/core/CL/kernels/CLMinMaxLayerKernel.h  87
-rw-r--r--  arm_compute/core/CL/kernels/CLMinMaxLocationKernel.h  124
-rw-r--r--  arm_compute/core/CL/kernels/CLNonLinearFilterKernel.h  77
-rw-r--r--  arm_compute/core/CL/kernels/CLNonMaximaSuppression3x3Kernel.h  60
-rw-r--r--  arm_compute/core/CL/kernels/CLNormalizationLayerKernel.h  90
-rw-r--r--  arm_compute/core/CL/kernels/CLNormalizePlanarYUVLayerKernel.h  94
-rw-r--r--  arm_compute/core/CL/kernels/CLPadLayerKernel.h  97
-rw-r--r--  arm_compute/core/CL/kernels/CLPermuteKernel.h  90
-rw-r--r--  arm_compute/core/CL/kernels/CLPixelWiseMultiplicationKernel.h  170
-rw-r--r--  arm_compute/core/CL/kernels/CLPoolingLayerKernel.h  96
-rw-r--r--  arm_compute/core/CL/kernels/CLPriorBoxLayerKernel.h  99
-rw-r--r--  arm_compute/core/CL/kernels/CLQLSTMLayerNormalizationKernel.h  88
-rw-r--r--  arm_compute/core/CL/kernels/CLQuantizationLayerKernel.h  86
-rw-r--r--  arm_compute/core/CL/kernels/CLROIAlignLayerKernel.h  110
-rw-r--r--  arm_compute/core/CL/kernels/CLROIPoolingLayerKernel.h  92
-rw-r--r--  arm_compute/core/CL/kernels/CLRangeKernel.h  92
-rw-r--r--  arm_compute/core/CL/kernels/CLReductionOperationKernel.h  99
-rw-r--r--  arm_compute/core/CL/kernels/CLRemapKernel.h  81
-rw-r--r--  arm_compute/core/CL/kernels/CLReorgLayerKernel.h  90
-rw-r--r--  arm_compute/core/CL/kernels/CLReshapeLayerKernel.h  81
-rw-r--r--  arm_compute/core/CL/kernels/CLReverseKernel.h  84
-rw-r--r--  arm_compute/core/CL/kernels/CLScaleKernel.h  98
-rw-r--r--  arm_compute/core/CL/kernels/CLScharr3x3Kernel.h  97
-rw-r--r--  arm_compute/core/CL/kernels/CLSelectKernel.h  94
-rw-r--r--  arm_compute/core/CL/kernels/CLSobel3x3Kernel.h  83
-rw-r--r--  arm_compute/core/CL/kernels/CLSobel5x5Kernel.h  139
-rw-r--r--  arm_compute/core/CL/kernels/CLSobel7x7Kernel.h  139
-rw-r--r--  arm_compute/core/CL/kernels/CLSoftmaxLayerKernel.h  241
-rw-r--r--  arm_compute/core/CL/kernels/CLSpaceToBatchLayerKernel.h  121
-rw-r--r--  arm_compute/core/CL/kernels/CLSpaceToDepthLayerKernel.h  84
-rw-r--r--  arm_compute/core/CL/kernels/CLStackLayerKernel.h  101
-rw-r--r--  arm_compute/core/CL/kernels/CLStridedSliceKernel.h  115
-rw-r--r--  arm_compute/core/CL/kernels/CLTableLookupKernel.h  55
-rw-r--r--  arm_compute/core/CL/kernels/CLThresholdKernel.h  69
-rw-r--r--  arm_compute/core/CL/kernels/CLTileKernel.h  88
-rw-r--r--  arm_compute/core/CL/kernels/CLTransposeKernel.h  64
-rw-r--r--  arm_compute/core/CL/kernels/CLUpsampleLayerKernel.h  89
-rw-r--r--  arm_compute/core/CL/kernels/CLWarpAffineKernel.h  62
-rw-r--r--  arm_compute/core/CL/kernels/CLWarpPerspectiveKernel.h  59
-rw-r--r--  arm_compute/core/CL/kernels/CLWeightsReshapeKernel.h  121
-rw-r--r--  arm_compute/core/CL/kernels/CLWidthConcatenate2TensorsKernel.h  87
-rw-r--r--  arm_compute/core/CL/kernels/CLWidthConcatenate4TensorsKernel.h  95
-rw-r--r--  arm_compute/core/CL/kernels/CLWidthConcatenateLayerKernel.h  89
-rw-r--r--  arm_compute/core/CL/kernels/CLWinogradFilterTransformKernel.h  115
-rw-r--r--  arm_compute/core/CL/kernels/CLWinogradInputTransformKernel.h  121
-rw-r--r--  arm_compute/core/CL/kernels/CLWinogradOutputTransformKernel.h  127
-rw-r--r--  arm_compute/core/CL/kernels/CLYOLOLayerKernel.h  98
-rw-r--r--  arm_compute/core/CL/kernels/ICLDepthwiseConvolutionLayer3x3Kernel.h  105
-rw-r--r--  arm_compute/core/CPP/CPPKernels.h  5
-rw-r--r--  arm_compute/core/CPP/CPPTypes.h  261
-rw-r--r--  arm_compute/core/CPP/ICPPKernel.h  41
-rw-r--r--  arm_compute/core/CPP/ICPPSimpleKernel.h  76
-rw-r--r--  arm_compute/core/CPP/Validate.h  117
-rw-r--r--  arm_compute/core/CPP/kernels/CPPBoxWithNonMaximaSuppressionLimitKernel.h  26
-rw-r--r--  arm_compute/core/CPP/kernels/CPPCornerCandidatesKernel.h  80
-rw-r--r--  arm_compute/core/CPP/kernels/CPPDetectionWindowNonMaximaSuppressionKernel.h  75
-rw-r--r--  arm_compute/core/CPP/kernels/CPPNonMaximumSuppressionKernel.h  20
-rw-r--r--  arm_compute/core/CPP/kernels/CPPPermuteKernel.h  10
-rw-r--r--  arm_compute/core/CPP/kernels/CPPSortEuclideanDistanceKernel.h  73
-rw-r--r--  arm_compute/core/CPP/kernels/CPPTopKVKernel.h  9
-rw-r--r--  arm_compute/core/CPP/kernels/CPPUpsampleKernel.h  6
-rw-r--r--  arm_compute/core/Coordinates.h  7
-rw-r--r--  arm_compute/core/CoreTypes.h  352
-rw-r--r--  arm_compute/core/Dimensions.h  44
-rw-r--r--  arm_compute/core/Error.h  145
-rw-r--r--  arm_compute/core/GLES_COMPUTE/GCCoreRuntimeContext.h  63
-rw-r--r--  arm_compute/core/GLES_COMPUTE/GCKernelLibrary.h  271
-rw-r--r--  arm_compute/core/GLES_COMPUTE/GCKernels.h  54
-rw-r--r--  arm_compute/core/GLES_COMPUTE/IGCKernel.h  165
-rw-r--r--  arm_compute/core/GLES_COMPUTE/IGCSimple2DKernel.h  41
-rw-r--r--  arm_compute/core/GLES_COMPUTE/IGCSimple3DKernel.h  43
-rw-r--r--  arm_compute/core/GLES_COMPUTE/IGCSimpleKernel.h  66
-rw-r--r--  arm_compute/core/GLES_COMPUTE/IGCTensor.h  113
-rw-r--r--  arm_compute/core/GLES_COMPUTE/OpenGLES.h  165
-rw-r--r--  arm_compute/core/GLES_COMPUTE/kernels/GCAbsoluteDifferenceKernel.h  71
-rw-r--r--  arm_compute/core/GLES_COMPUTE/kernels/GCActivationLayerKernel.h  74
-rw-r--r--  arm_compute/core/GLES_COMPUTE/kernels/GCArithmeticAdditionKernel.h  82
-rw-r--r--  arm_compute/core/GLES_COMPUTE/kernels/GCBatchNormalizationLayerKernel.h  98
-rw-r--r--  arm_compute/core/GLES_COMPUTE/kernels/GCCol2ImKernel.h  92
-rw-r--r--  arm_compute/core/GLES_COMPUTE/kernels/GCDepthConcatenateLayerKernel.h  74
-rw-r--r--  arm_compute/core/GLES_COMPUTE/kernels/GCDepthwiseConvolutionLayer3x3Kernel.h  77
-rw-r--r--  arm_compute/core/GLES_COMPUTE/kernels/GCDirectConvolutionLayerKernel.h  92
-rw-r--r--  arm_compute/core/GLES_COMPUTE/kernels/GCDropoutLayerKernel.h  79
-rw-r--r--  arm_compute/core/GLES_COMPUTE/kernels/GCFillBorderKernel.h  77
-rw-r--r--  arm_compute/core/GLES_COMPUTE/kernels/GCGEMMInterleave4x4Kernel.h  80
-rw-r--r--  arm_compute/core/GLES_COMPUTE/kernels/GCGEMMMatrixAccumulateBiasesKernel.h  64
-rw-r--r--  arm_compute/core/GLES_COMPUTE/kernels/GCGEMMMatrixAdditionKernel.h  70
-rw-r--r--  arm_compute/core/GLES_COMPUTE/kernels/GCGEMMMatrixMultiplyKernel.h  92
-rw-r--r--  arm_compute/core/GLES_COMPUTE/kernels/GCGEMMTranspose1xWKernel.h  67
-rw-r--r--  arm_compute/core/GLES_COMPUTE/kernels/GCIm2ColKernel.h  128
-rw-r--r--  arm_compute/core/GLES_COMPUTE/kernels/GCNormalizationLayerKernel.h  72
-rw-r--r--  arm_compute/core/GLES_COMPUTE/kernels/GCNormalizePlanarYUVLayerKernel.h  84
-rw-r--r--  arm_compute/core/GLES_COMPUTE/kernels/GCPixelWiseMultiplicationKernel.h  70
-rw-r--r--  arm_compute/core/GLES_COMPUTE/kernels/GCPoolingLayerKernel.h  85
-rw-r--r--  arm_compute/core/GLES_COMPUTE/kernels/GCScaleKernel.h  54
-rw-r--r--  arm_compute/core/GLES_COMPUTE/kernels/GCSoftmaxLayerKernel.h  109
-rw-r--r--  arm_compute/core/GLES_COMPUTE/kernels/GCTensorShiftKernel.h  84
-rw-r--r--  arm_compute/core/GLES_COMPUTE/kernels/GCTransposeKernel.h  52
-rw-r--r--  arm_compute/core/GLES_COMPUTE/kernels/GCWeightsReshapeKernel.h  90
-rw-r--r--  arm_compute/core/GPUTarget.h  61
-rw-r--r--  arm_compute/core/HOGInfo.h  152
-rw-r--r--  arm_compute/core/Helpers.h  682
-rw-r--r--  arm_compute/core/Helpers.inl  268
-rw-r--r--  arm_compute/core/IAccessWindow.h  17
-rw-r--r--  arm_compute/core/IArray.h  26
-rw-r--r--  arm_compute/core/IDevice.h  3
-rw-r--r--  arm_compute/core/IDistribution.h  59
-rw-r--r--  arm_compute/core/IDistribution1D.h  84
-rw-r--r--  arm_compute/core/IHOG.h  54
-rw-r--r--  arm_compute/core/IKernel.h  9
-rw-r--r--  arm_compute/core/ILut.h  69
-rw-r--r--  arm_compute/core/IMultiHOG.h  61
-rw-r--r--  arm_compute/core/IPyramid.h  56
-rw-r--r--  arm_compute/core/ITensor.h  10
-rw-r--r--  arm_compute/core/ITensorInfo.h  103
-rw-r--r--  arm_compute/core/ITensorPack.h  116
-rw-r--r--  arm_compute/core/KernelDescriptors.h  188
-rw-r--r--  arm_compute/core/Log.h  18
-rw-r--r--  arm_compute/core/MultiImageInfo.h  66
-rw-r--r--  arm_compute/core/NEON/INESimpleKernel.h  34
-rw-r--r--  arm_compute/core/NEON/NEAsymm.h  760
-rw-r--r--  arm_compute/core/NEON/NEAsymm.inl  92
-rw-r--r--  arm_compute/core/NEON/NEColorConvertHelper.inl  1045
-rw-r--r--  arm_compute/core/NEON/NEKernels.h  155
-rw-r--r--  arm_compute/core/NEON/NEMath.h  307
-rw-r--r--  arm_compute/core/NEON/NEMath.inl  529
-rw-r--r--  arm_compute/core/NEON/NESymm.h  256
-rw-r--r--  arm_compute/core/NEON/kernels/NEAbsoluteDifferenceKernel.h  86
-rw-r--r--  arm_compute/core/NEON/kernels/NEAccumulateKernel.h  139
-rw-r--r--  arm_compute/core/NEON/kernels/NEActivationLayerKernel.h  120
-rw-r--r--  arm_compute/core/NEON/kernels/NEArithmeticAdditionKernel.h  108
-rw-r--r--  arm_compute/core/NEON/kernels/NEArithmeticSubtractionKernel.h  109
-rw-r--r--  arm_compute/core/NEON/kernels/NEBatchConcatenateLayerKernel.h  91
-rw-r--r--  arm_compute/core/NEON/kernels/NEBatchNormalizationLayerKernel.h  152
-rw-r--r--  arm_compute/core/NEON/kernels/NEBatchToSpaceLayerKernel.h  101
-rw-r--r--  arm_compute/core/NEON/kernels/NEBitwiseAndKernel.h  72
-rw-r--r--  arm_compute/core/NEON/kernels/NEBitwiseNotKernel.h  70
-rw-r--r--  arm_compute/core/NEON/kernels/NEBitwiseOrKernel.h  72
-rw-r--r--  arm_compute/core/NEON/kernels/NEBitwiseXorKernel.h  72
-rw-r--r--  arm_compute/core/NEON/kernels/NEBoundingBoxTransformKernel.h  95
-rw-r--r--  arm_compute/core/NEON/kernels/NEBox3x3Kernel.h  71
-rw-r--r--  arm_compute/core/NEON/kernels/NECannyEdgeKernel.h  189
-rw-r--r--  arm_compute/core/NEON/kernels/NEChannelCombineKernel.h  129
-rw-r--r--  arm_compute/core/NEON/kernels/NEChannelExtractKernel.h  113
-rw-r--r--  arm_compute/core/NEON/kernels/NEChannelShuffleLayerKernel.h  80
-rw-r--r--  arm_compute/core/NEON/kernels/NECol2ImKernel.h  115
-rw-r--r--  arm_compute/core/NEON/kernels/NEColorConvertKernel.h  93
-rw-r--r--  arm_compute/core/NEON/kernels/NEConvertFullyConnectedWeightsKernel.h  95
-rw-r--r--  arm_compute/core/NEON/kernels/NEConvertQuantizedSignednessKernel.h  76
-rw-r--r--  arm_compute/core/NEON/kernels/NEConvolutionKernel.h  267
-rw-r--r--  arm_compute/core/NEON/kernels/NECopyKernel.h  78
-rw-r--r--  arm_compute/core/NEON/kernels/NECropKernel.h  114
-rw-r--r--  arm_compute/core/NEON/kernels/NECumulativeDistributionKernel.h  83
-rw-r--r--  arm_compute/core/NEON/kernels/NEDepthConcatenateLayerKernel.h  91
-rw-r--r--  arm_compute/core/NEON/kernels/NEDepthConvertLayerKernel.h  94
-rw-r--r--  arm_compute/core/NEON/kernels/NEDepthToSpaceLayerKernel.h  81
-rw-r--r--  arm_compute/core/NEON/kernels/NEDepthwiseConvolutionLayer3x3Kernel.h  96
-rw-r--r--  arm_compute/core/NEON/kernels/NEDepthwiseConvolutionLayerNativeKernel.h  129
-rw-r--r--  arm_compute/core/NEON/kernels/NEDequantizationLayerKernel.h  76
-rw-r--r--  arm_compute/core/NEON/kernels/NEDerivativeKernel.h  98
-rw-r--r--  arm_compute/core/NEON/kernels/NEDilateKernel.h  53
-rw-r--r--  arm_compute/core/NEON/kernels/NEDirectConvolutionLayerKernel.h  100
-rw-r--r--  arm_compute/core/NEON/kernels/NEDirectConvolutionLayerOutputStageKernel.h  102
-rw-r--r--  arm_compute/core/NEON/kernels/NEElementwiseOperationKernel.h  214
-rw-r--r--  arm_compute/core/NEON/kernels/NEElementwiseUnaryKernel.h  103
-rw-r--r--  arm_compute/core/NEON/kernels/NEErodeKernel.h  53
-rw-r--r--  arm_compute/core/NEON/kernels/NEFFTDigitReverseKernel.h  93
-rw-r--r--  arm_compute/core/NEON/kernels/NEFFTRadixStageKernel.h  103
-rw-r--r--  arm_compute/core/NEON/kernels/NEFFTScaleKernel.h  84
-rw-r--r--  arm_compute/core/NEON/kernels/NEFastCornersKernel.h  76
-rw-r--r--  arm_compute/core/NEON/kernels/NEFillArrayKernel.h  77
-rw-r--r--  arm_compute/core/NEON/kernels/NEFillBorderKernel.h  82
-rw-r--r--  arm_compute/core/NEON/kernels/NEFillInnerBorderKernel.h  79
-rw-r--r--  arm_compute/core/NEON/kernels/NEFlattenLayerKernel.h  81
-rw-r--r--  arm_compute/core/NEON/kernels/NEFloorKernel.h  60
-rw-r--r--  arm_compute/core/NEON/kernels/NEFuseBatchNormalizationKernel.h  116
-rw-r--r--  arm_compute/core/NEON/kernels/NEGEMMAssemblyBaseKernel.h  89
-rw-r--r--  arm_compute/core/NEON/kernels/NEGEMMInterleave4x4Kernel.h  91
-rw-r--r--  arm_compute/core/NEON/kernels/NEGEMMLowpMatrixMultiplyKernel.h  90
-rw-r--r--  arm_compute/core/NEON/kernels/NEGEMMLowpOffsetContributionKernel.h  103
-rw-r--r--  arm_compute/core/NEON/kernels/NEGEMMLowpOffsetContributionOutputStageKernel.h  137
-rw-r--r--  arm_compute/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ScaleKernel.h  112
-rw-r--r--  arm_compute/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel.h  116
-rw-r--r--  arm_compute/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel.h  119
-rw-r--r--  arm_compute/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel.h  119
-rw-r--r--  arm_compute/core/NEON/kernels/NEGEMMLowpReductionKernel.h  171
-rw-r--r--  arm_compute/core/NEON/kernels/NEGEMMMatrixAccumulateBiasesKernel.h  75
-rw-r--r--  arm_compute/core/NEON/kernels/NEGEMMMatrixAdditionKernel.h  96
-rw-r--r--  arm_compute/core/NEON/kernels/NEGEMMMatrixMultiplyKernel.h  94
-rw-r--r--  arm_compute/core/NEON/kernels/NEGEMMMatrixVectorMultiplyKernel.h  99
-rw-r--r--  arm_compute/core/NEON/kernels/NEGEMMTranspose1xWKernel.h  95
-rw-r--r--  arm_compute/core/NEON/kernels/NEGatherKernel.h  113
-rw-r--r--  arm_compute/core/NEON/kernels/NEGaussian3x3Kernel.h  54
-rw-r--r--  arm_compute/core/NEON/kernels/NEGaussian5x5Kernel.h  81
-rw-r--r--  arm_compute/core/NEON/kernels/NEGaussianPyramidKernel.h  105
-rw-r--r--  arm_compute/core/NEON/kernels/NEGenerateProposalsLayerKernel.h  85
-rw-r--r--  arm_compute/core/NEON/kernels/NEHOGDescriptorKernel.h  149
-rw-r--r--  arm_compute/core/NEON/kernels/NEHOGDetectorKernel.h  89
-rw-r--r--  arm_compute/core/NEON/kernels/NEHarrisCornersKernel.h  105
-rw-r--r--  arm_compute/core/NEON/kernels/NEHeightConcatenateLayerKernel.h  85
-rw-r--r--  arm_compute/core/NEON/kernels/NEHistogramKernel.h  135
-rw-r--r--  arm_compute/core/NEON/kernels/NEIm2ColKernel.h  139
-rw-r--r--  arm_compute/core/NEON/kernels/NEInstanceNormalizationLayerKernel.h  98
-rw-r--r--  arm_compute/core/NEON/kernels/NEIntegralImageKernel.h  54
-rw-r--r--  arm_compute/core/NEON/kernels/NEL2NormalizeLayerKernel.h  90
-rw-r--r--  arm_compute/core/NEON/kernels/NELKTrackerKernel.h  149
-rw-r--r--  arm_compute/core/NEON/kernels/NELocallyConnectedMatrixMultiplyKernel.h  77
-rw-r--r--  arm_compute/core/NEON/kernels/NEMagnitudePhaseKernel.h  101
-rw-r--r--  arm_compute/core/NEON/kernels/NEMeanStdDevKernel.h  83
-rw-r--r--  arm_compute/core/NEON/kernels/NEMeanStdDevNormalizationKernel.h  98
-rw-r--r--  arm_compute/core/NEON/kernels/NEMedian3x3Kernel.h  54
-rw-r--r--  arm_compute/core/NEON/kernels/NEMinMaxLayerKernel.h  90
-rw-r--r--  arm_compute/core/NEON/kernels/NEMinMaxLocationKernel.h  171
-rw-r--r--  arm_compute/core/NEON/kernels/NENonLinearFilterKernel.h  151
-rw-r--r--  arm_compute/core/NEON/kernels/NENonMaximaSuppression3x3Kernel.h  108
-rw-r--r--  arm_compute/core/NEON/kernels/NENormalizationLayerKernel.h  110
-rw-r--r--  arm_compute/core/NEON/kernels/NEPadLayerKernel.h  113
-rw-r--r--  arm_compute/core/NEON/kernels/NEPermuteKernel.h  102
-rw-r--r--  arm_compute/core/NEON/kernels/NEPixelWiseMultiplicationKernel.h  195
-rw-r--r--  arm_compute/core/NEON/kernels/NEPoolingLayerKernel.h  222
-rw-r--r--  arm_compute/core/NEON/kernels/NEPriorBoxLayerKernel.h  98
-rw-r--r--  arm_compute/core/NEON/kernels/NEQLSTMLayerNormalizationKernel.h  137
-rw-r--r--  arm_compute/core/NEON/kernels/NEQuantizationLayerKernel.h  102
-rw-r--r--  arm_compute/core/NEON/kernels/NEROIAlignLayerKernel.h  101
-rw-r--r--  arm_compute/core/NEON/kernels/NEROIPoolingLayerKernel.h  81
-rw-r--r--  arm_compute/core/NEON/kernels/NERangeKernel.h  90
-rw-r--r--  arm_compute/core/NEON/kernels/NEReductionOperationKernel.h  94
-rw-r--r--  arm_compute/core/NEON/kernels/NERemapKernel.h  83
-rw-r--r--  arm_compute/core/NEON/kernels/NEReorgLayerKernel.h  83
-rw-r--r--  arm_compute/core/NEON/kernels/NEReshapeLayerKernel.h  62
-rw-r--r--  arm_compute/core/NEON/kernels/NEReverseKernel.h  80
-rw-r--r--  arm_compute/core/NEON/kernels/NEScaleKernel.h  117
-rw-r--r--  arm_compute/core/NEON/kernels/NEScharr3x3Kernel.h  86
-rw-r--r--  arm_compute/core/NEON/kernels/NESelectKernel.h  103
-rw-r--r--  arm_compute/core/NEON/kernels/NESobel3x3Kernel.h  86
-rw-r--r--  arm_compute/core/NEON/kernels/NESobel5x5Kernel.h  126
-rw-r--r--  arm_compute/core/NEON/kernels/NESobel7x7Kernel.h  130
-rw-r--r--  arm_compute/core/NEON/kernels/NESoftmaxLayerKernel.h  139
-rw-r--r--  arm_compute/core/NEON/kernels/NESpaceToBatchLayerKernel.h  111
-rw-r--r--  arm_compute/core/NEON/kernels/NESpaceToDepthLayerKernel.h  81
-rw-r--r--  arm_compute/core/NEON/kernels/NEStackLayerKernel.h  93
-rw-r--r--  arm_compute/core/NEON/kernels/NEStridedSliceKernel.h  104
-rw-r--r--  arm_compute/core/NEON/kernels/NETableLookupKernel.h  80
-rw-r--r--  arm_compute/core/NEON/kernels/NEThresholdKernel.h  85
-rw-r--r--  arm_compute/core/NEON/kernels/NETileKernel.h  76
-rw-r--r--  arm_compute/core/NEON/kernels/NETransposeKernel.h  90
-rw-r--r--  arm_compute/core/NEON/kernels/NEUpsampleLayerKernel.h  100
-rw-r--r--  arm_compute/core/NEON/kernels/NEWarpKernel.h  129
-rw-r--r--  arm_compute/core/NEON/kernels/NEWeightsReshapeKernel.h  109
-rw-r--r--  arm_compute/core/NEON/kernels/NEWidthConcatenateLayerKernel.h  85
-rw-r--r--  arm_compute/core/NEON/kernels/NEWinogradConvolutionLayerKernel.h  596
-rw-r--r--  arm_compute/core/NEON/kernels/NEYOLOLayerKernel.h  106
-rw-r--r--  arm_compute/core/NEON/kernels/arm_gemm/ndrange.hpp  185
-rw-r--r--  arm_compute/core/NEON/kernels/assembly/Helpers.h  122
-rw-r--r--  arm_compute/core/NEON/kernels/assembly/INEGEMMWrapperKernel.h  108
-rw-r--r--  arm_compute/core/NEON/kernels/assembly/NEDepthwiseConvolutionAssemblyKernelWrapper.h  88
-rw-r--r--  arm_compute/core/NEON/kernels/assembly/NEGEMMAssemblyWrapperKernel.h  121
-rw-r--r--  arm_compute/core/NEON/kernels/assembly/arm_gemm.hpp  176
-rw-r--r--  arm_compute/core/NEON/kernels/assembly/arm_gemm_compute_iface.hpp  121
-rw-r--r--  arm_compute/core/NEON/kernels/assembly/arm_gemm_local.hpp  37
-rw-r--r--  arm_compute/core/NEON/kernels/assembly/gemm_common.hpp  201
-rw-r--r--  arm_compute/core/NEON/kernels/convolution/common/alloc.hpp  31
-rw-r--r--  arm_compute/core/NEON/kernels/convolution/common/arm.hpp  39
-rw-r--r--  arm_compute/core/NEON/kernels/convolution/common/convolution.hpp  29
-rw-r--r--  arm_compute/core/NEON/kernels/convolution/common/padding.hpp  91
-rw-r--r--  arm_compute/core/NEON/kernels/convolution/common/perf.h  32
-rw-r--r--  arm_compute/core/NEON/kernels/convolution/common/qasymm8.hpp  54
-rw-r--r--  arm_compute/core/NEON/kernels/convolution/common/qsymm8.hpp  76
-rw-r--r--  arm_compute/core/NEON/kernels/convolution/common/shims.hpp  749
-rw-r--r--  arm_compute/core/NEON/kernels/convolution/common/tensor.hpp  178
-rw-r--r--  arm_compute/core/NEON/kernels/convolution/common/tensor_utils.hpp  46
-rw-r--r--  arm_compute/core/NEON/kernels/convolution/common/utils.hpp  60
-rw-r--r--  arm_compute/core/NEON/kernels/convolution/depthwise/depthwise.hpp  551
-rw-r--r--  arm_compute/core/NEON/kernels/convolution/depthwise/depthwise_dilated.hpp  156
-rw-r--r--  arm_compute/core/NEON/kernels/convolution/depthwise/depthwise_quantized.hpp  291
-rw-r--r--  arm_compute/core/NEON/kernels/convolution/depthwise/depthwise_quantized_dilated.hpp  88
-rw-r--r--  arm_compute/core/NEON/kernels/convolution/winograd/winograd.hpp  621
-rw-r--r--  arm_compute/core/NEON/kernels/convolution/winograd/winograd_layer.hpp  207
-rw-r--r--  arm_compute/core/NEON/kernels/detail/NEActivationFunctionDetail.h  247
-rw-r--r--  arm_compute/core/NEON/kernels/detail/NEDirectConvolution3x3.h  170
-rw-r--r--  arm_compute/core/NEON/kernels/detail/NEDirectConvolutionDetail.h  965
-rw-r--r--  arm_compute/core/NEON/wrapper/intrinsics/abs.h  75
-rw-r--r--  arm_compute/core/NEON/wrapper/intrinsics/add.h  201
-rw-r--r--  arm_compute/core/NEON/wrapper/intrinsics/and.h  60
-rw-r--r--  arm_compute/core/NEON/wrapper/intrinsics/bsl.h  64
-rw-r--r--  arm_compute/core/NEON/wrapper/intrinsics/ceq.h  64
-rw-r--r--  arm_compute/core/NEON/wrapper/intrinsics/cge.h  64
-rw-r--r--  arm_compute/core/NEON/wrapper/intrinsics/cgt.h  64
-rw-r--r--  arm_compute/core/NEON/wrapper/intrinsics/clt.h  64
-rw-r--r--  arm_compute/core/NEON/wrapper/intrinsics/combine.h  53
-rw-r--r--  arm_compute/core/NEON/wrapper/intrinsics/cvt.h  80
-rw-r--r--  arm_compute/core/NEON/wrapper/intrinsics/div.h  73
-rw-r--r--  arm_compute/core/NEON/wrapper/intrinsics/dup_n.h  66
-rw-r--r--  arm_compute/core/NEON/wrapper/intrinsics/eor.h  56
-rw-r--r--  arm_compute/core/NEON/wrapper/intrinsics/exp.h  56
-rw-r--r--  arm_compute/core/NEON/wrapper/intrinsics/ext.h  62
-rw-r--r--  arm_compute/core/NEON/wrapper/intrinsics/gethigh.h  53
-rw-r--r--  arm_compute/core/NEON/wrapper/intrinsics/getlane.h  223
-rw-r--r--  arm_compute/core/NEON/wrapper/intrinsics/getlow.h  53
-rw-r--r--  arm_compute/core/NEON/wrapper/intrinsics/intrinsics.h  73
-rw-r--r--  arm_compute/core/NEON/wrapper/intrinsics/inv.h  62
-rw-r--r--  arm_compute/core/NEON/wrapper/intrinsics/invsqrt.h  61
-rw-r--r--  arm_compute/core/NEON/wrapper/intrinsics/load.h  73
-rw-r--r--  arm_compute/core/NEON/wrapper/intrinsics/log.h  56
-rw-r--r--  arm_compute/core/NEON/wrapper/intrinsics/max.h  64
-rw-r--r--  arm_compute/core/NEON/wrapper/intrinsics/min.h  64
-rw-r--r--  arm_compute/core/NEON/wrapper/intrinsics/mla.h  71
-rw-r--r--  arm_compute/core/NEON/wrapper/intrinsics/movl.h  49
-rw-r--r--  arm_compute/core/NEON/wrapper/intrinsics/movn.h  62
-rw-r--r--  arm_compute/core/NEON/wrapper/intrinsics/mul.h  64
-rw-r--r--  arm_compute/core/NEON/wrapper/intrinsics/neg.h  58
-rw-r--r--  arm_compute/core/NEON/wrapper/intrinsics/not.h  64
-rw-r--r--  arm_compute/core/NEON/wrapper/intrinsics/orr.h  60
-rw-r--r--  arm_compute/core/NEON/wrapper/intrinsics/pmax.h  53
-rw-r--r--  arm_compute/core/NEON/wrapper/intrinsics/pmin.h  53
-rw-r--r--  arm_compute/core/NEON/wrapper/intrinsics/pow.h  48
-rw-r--r--  arm_compute/core/NEON/wrapper/intrinsics/qmovun.h  46
-rw-r--r--  arm_compute/core/NEON/wrapper/intrinsics/reinterpret.h  49
-rw-r--r--  arm_compute/core/NEON/wrapper/intrinsics/rev64.h  64
-rw-r--r--  arm_compute/core/NEON/wrapper/intrinsics/round.h  56
-rw-r--r--  arm_compute/core/NEON/wrapper/intrinsics/setlane.h  208
-rw-r--r--  arm_compute/core/NEON/wrapper/intrinsics/sin.h  57
-rw-r--r--  arm_compute/core/NEON/wrapper/intrinsics/store.h  70
-rw-r--r--  arm_compute/core/NEON/wrapper/intrinsics/sub.h  103
-rw-r--r--  arm_compute/core/NEON/wrapper/intrinsics/tanh.h  47
-rw-r--r--  arm_compute/core/NEON/wrapper/intrinsics/tbl.h  45
-rw-r--r--  arm_compute/core/NEON/wrapper/scalar/add.h  62
-rw-r--r--  arm_compute/core/NEON/wrapper/scalar/scalar.h  30
-rw-r--r--  arm_compute/core/NEON/wrapper/scalar/sub.h  62
-rw-r--r--  arm_compute/core/NEON/wrapper/traits.h  140
-rw-r--r--  arm_compute/core/NEON/wrapper/wrapper.h  34
-rw-r--r--  arm_compute/core/PixelValue.h  81
-rw-r--r--  arm_compute/core/PyramidInfo.h  131
-rw-r--r--  arm_compute/core/QuantizationInfo.h  145
-rw-r--r--  arm_compute/core/Rounding.h  4
-rw-r--r--  arm_compute/core/Size2D.h  7
-rw-r--r--  arm_compute/core/Size3D.h  97
-rw-r--r--  arm_compute/core/Steps.h  7
-rw-r--r--  arm_compute/core/Strides.h  6
-rw-r--r--  arm_compute/core/SubTensorInfo.h  56
-rw-r--r--  arm_compute/core/TensorInfo.h  138
-rw-r--r--  arm_compute/core/TensorShape.h  50
-rw-r--r--  arm_compute/core/TracePoint.h  134
-rw-r--r--  arm_compute/core/Types.h  1299
-rw-r--r--  arm_compute/core/Utils.h  1239
-rw-r--r--  arm_compute/core/Validate.h  581
-rw-r--r--  arm_compute/core/Version.h  15
-rw-r--r--  arm_compute/core/Window.h  59
-rw-r--r--  arm_compute/core/Window.inl  87
-rw-r--r--  arm_compute/core/WindowIterator.h  37
-rw-r--r--  arm_compute/core/experimental/Types.h  126
-rw-r--r--  arm_compute/core/utils/ActivationFunctionUtils.h (renamed from arm_compute/core/NEON/wrapper/intrinsics/qmov.h)  36
-rw-r--r--  arm_compute/core/utils/DataLayoutUtils.h (renamed from arm_compute/core/NEON/NEFixedPoint.inl)  31
-rw-r--r--  arm_compute/core/utils/DataTypeUtils.h  549
-rw-r--r--  arm_compute/core/utils/FormatUtils.h  344
-rw-r--r--  arm_compute/core/utils/InterpolationPolicyUtils.h (renamed from arm_compute/core/NEON/NEFixedPoint.h)  22
-rw-r--r--  arm_compute/core/utils/StringUtils.h (renamed from arm_compute/core/GLES_COMPUTE/GCHelpers.h)  53
-rw-r--r--  arm_compute/core/utils/helpers/AdjustVecSize.h (renamed from arm_compute/graph/backends/GLES/GCNodeValidator.h)  47
-rw-r--r--  arm_compute/core/utils/helpers/bit_ops.h  52
-rw-r--r--  arm_compute/core/utils/helpers/fft.h  55
-rw-r--r--  arm_compute/core/utils/helpers/float_ops.h  116
-rw-r--r--  arm_compute/core/utils/helpers/tensor_info.h  57
-rw-r--r--  arm_compute/core/utils/helpers/tensor_transform.h  35
-rw-r--r--  arm_compute/core/utils/io/FileHandler.h  2
-rw-r--r--  arm_compute/core/utils/logging/FilePrinter.h  5
-rw-r--r--  arm_compute/core/utils/logging/Helpers.h  8
-rw-r--r--  arm_compute/core/utils/logging/IPrinter.h  5
-rw-r--r--  arm_compute/core/utils/logging/LogMsgDecorators.h  7
-rw-r--r--  arm_compute/core/utils/logging/Logger.h  8
-rw-r--r--  arm_compute/core/utils/logging/LoggerRegistry.h  15
-rw-r--r--  arm_compute/core/utils/logging/Macros.h  45
-rw-r--r--  arm_compute/core/utils/logging/Printers.h  2
-rw-r--r--  arm_compute/core/utils/logging/StdPrinter.h  2
-rw-r--r--  arm_compute/core/utils/logging/Types.h  8
-rw-r--r--  arm_compute/core/utils/math/Math.h  71
-rw-r--r--  arm_compute/core/utils/math/SafeOps.h  37
-rw-r--r--  arm_compute/core/utils/misc/Cast.h  119
-rw-r--r--  arm_compute/core/utils/misc/ICloneable.h  48
-rw-r--r--  arm_compute/core/utils/misc/InfoHelpers.h  62
-rw-r--r--  arm_compute/core/utils/misc/Iterable.h  108
-rw-r--r--  arm_compute/core/utils/misc/MMappedFile.h  6
-rw-r--r--  arm_compute/core/utils/misc/Macros.h  11
-rw-r--r--  arm_compute/core/utils/misc/Random.h  98
-rw-r--r--  arm_compute/core/utils/misc/Requires.h  51
-rw-r--r--  arm_compute/core/utils/misc/Rounding.h  205
-rw-r--r--  arm_compute/core/utils/misc/SaturateCast.h  218
-rw-r--r--  arm_compute/core/utils/misc/ShapeCalculator.h  779
-rw-r--r--  arm_compute/core/utils/misc/Traits.h  4
-rw-r--r--  arm_compute/core/utils/misc/Utility.h  57
-rw-r--r--  arm_compute/core/utils/quantization/AsymmHelpers.h  31
-rw-r--r--  arm_compute/dynamic_fusion/runtime/gpu/cl/ClWorkloadRuntime.h  91
-rw-r--r--  arm_compute/dynamic_fusion/sketch/MemoryDescriptor.h  96
-rw-r--r--  arm_compute/dynamic_fusion/sketch/attributes/CastAttributes.h (renamed from arm_compute/runtime/MultiHOG.h)  51
-rw-r--r--  arm_compute/dynamic_fusion/sketch/attributes/ClampAttributes.h (renamed from arm_compute/core/AccessWindowTranspose.h)  47
-rw-r--r--  arm_compute/dynamic_fusion/sketch/attributes/Conv2dAttributes.h (renamed from arm_compute/runtime/NEON/functions/NEDilate.h)  55
-rw-r--r--  arm_compute/dynamic_fusion/sketch/attributes/DepthwiseConv2dAttributes.h  76
-rw-r--r--  arm_compute/dynamic_fusion/sketch/attributes/MatMulAttributes.h (renamed from arm_compute/runtime/CL/CLMultiHOG.h)  54
-rw-r--r--  arm_compute/dynamic_fusion/sketch/attributes/Pool2dAttributes.h  82
-rw-r--r--  arm_compute/dynamic_fusion/sketch/attributes/ReshapeAttributes.h (renamed from arm_compute/runtime/NEON/functions/NEGEMMInterleave4x4.h)  42
-rw-r--r--  arm_compute/dynamic_fusion/sketch/attributes/ResizeAttributes.h  88
-rw-r--r--  arm_compute/dynamic_fusion/sketch/attributes/SoftmaxAttributes.h (renamed from arm_compute/runtime/CL/gemm/CLGEMMKernelSelectionValhall.h)  51
-rw-r--r--  arm_compute/dynamic_fusion/sketch/gpu/GpuWorkloadContext.h  121
-rw-r--r--  arm_compute/dynamic_fusion/sketch/gpu/GpuWorkloadSketch.h  85
-rw-r--r--  arm_compute/dynamic_fusion/sketch/gpu/operators/GpuAdd.h  86
-rw-r--r--  arm_compute/dynamic_fusion/sketch/gpu/operators/GpuCast.h  88
-rw-r--r--  arm_compute/dynamic_fusion/sketch/gpu/operators/GpuClamp.h  89
-rw-r--r--  arm_compute/dynamic_fusion/sketch/gpu/operators/GpuConv2d.h  99
-rw-r--r--  arm_compute/dynamic_fusion/sketch/gpu/operators/GpuDepthwiseConv2d.h  100
-rw-r--r--  arm_compute/dynamic_fusion/sketch/gpu/operators/GpuMatMul.h  132
-rw-r--r--  arm_compute/dynamic_fusion/sketch/gpu/operators/GpuMul.h  89
-rw-r--r--  arm_compute/dynamic_fusion/sketch/gpu/operators/GpuOutput.h  84
-rw-r--r--  arm_compute/dynamic_fusion/sketch/gpu/operators/GpuPool2d.h  106
-rw-r--r--  arm_compute/dynamic_fusion/sketch/gpu/operators/GpuReshape.h  88
-rw-r--r--  arm_compute/dynamic_fusion/sketch/gpu/operators/GpuResize.h  89
-rw-r--r--  arm_compute/dynamic_fusion/sketch/gpu/operators/GpuSigmoid.h  86
-rw-r--r--  arm_compute/dynamic_fusion/sketch/gpu/operators/GpuSoftmax.h  92
-rw-r--r--  arm_compute/dynamic_fusion/sketch/gpu/operators/GpuSub.h  88
-rw-r--r--  arm_compute/dynamic_fusion/sketch/gpu/operators/GpuTanh.h  86
-rw-r--r--  arm_compute/function_info/ActivationLayerInfo.h  136
-rw-r--r--  arm_compute/function_info/ConvolutionInfo.h  50
-rw-r--r--  arm_compute/function_info/FullyConnectedLayerInfo.h  71
-rw-r--r--  arm_compute/function_info/GEMMInfo.h  358
-rw-r--r--  arm_compute/function_info/MatMulInfo.h (renamed from arm_compute/core/utils/misc/CRTP.h)  47
-rw-r--r--  arm_compute/function_info/ScatterInfo.h (renamed from arm_compute/runtime/CL/tuners/Tuners.h)  44
-rw-r--r--  arm_compute/graph.h  2
-rw-r--r--  arm_compute/graph/DataLayerVisitor.h (renamed from arm_compute/graph/backends/GLES/GCFunctionFactory.h)  52
-rw-r--r--  arm_compute/graph/Edge.h  16
-rw-r--r--  arm_compute/graph/Graph.h  23
-rw-r--r--  arm_compute/graph/GraphBuilder.h  277
-rw-r--r--  arm_compute/graph/GraphContext.h  19
-rw-r--r--  arm_compute/graph/GraphManager.h  2
-rw-r--r--  arm_compute/graph/IDeviceBackend.h  7
-rw-r--r--  arm_compute/graph/IGraphMutator.h  2
-rw-r--r--  arm_compute/graph/IGraphPrinter.h  2
-rw-r--r--  arm_compute/graph/INode.h  9
-rw-r--r--  arm_compute/graph/INodeVisitor.h  150
-rw-r--r--  arm_compute/graph/ITensorAccessor.h  10
-rw-r--r--  arm_compute/graph/ITensorHandle.h  2
-rw-r--r--  arm_compute/graph/LayerDescriptors.h  54
-rw-r--r--  arm_compute/graph/Logger.h  18
-rw-r--r--  arm_compute/graph/PassManager.h  2
-rw-r--r--  arm_compute/graph/Tensor.h  5
-rw-r--r--  arm_compute/graph/TensorDescriptor.h  23
-rw-r--r--  arm_compute/graph/TypeLoader.h  25
-rw-r--r--  arm_compute/graph/TypePrinter.h  50
-rw-r--r--  arm_compute/graph/Types.h  71
-rw-r--r--  arm_compute/graph/Utils.h  15
-rw-r--r--  arm_compute/graph/Workload.h  15
-rw-r--r--  arm_compute/graph/algorithms/Algorithms.h  2
-rw-r--r--  arm_compute/graph/algorithms/TopologicalSort.h  2
-rw-r--r--  arm_compute/graph/backends/BackendRegistrar.h  6
-rw-r--r--  arm_compute/graph/backends/BackendRegistry.h  5
-rw-r--r--  arm_compute/graph/backends/CL/CLDeviceBackend.h  27
-rw-r--r--  arm_compute/graph/backends/CL/CLFunctionFactory.h  2
-rw-r--r--  arm_compute/graph/backends/CL/CLNodeValidator.h  2
-rw-r--r--  arm_compute/graph/backends/CL/CLSubTensorHandle.h  16
-rw-r--r--  arm_compute/graph/backends/CL/CLTensorHandle.h  11
-rw-r--r--  arm_compute/graph/backends/FunctionHelpers.h  988
-rw-r--r--  arm_compute/graph/backends/FusedConvolutionBatchNormalizationFunction.h  40
-rw-r--r--  arm_compute/graph/backends/FusedDepthwiseConvolutionBatchNormalizationFunction.h  38
-rw-r--r--  arm_compute/graph/backends/GLES/GCDeviceBackend.h  64
-rw-r--r--  arm_compute/graph/backends/GLES/GCTensorHandle.h  72
-rw-r--r--  arm_compute/graph/backends/NEON/NEDeviceBackend.h  23
-rw-r--r--  arm_compute/graph/backends/NEON/NEFunctionFactory.h  4
-rw-r--r--  arm_compute/graph/backends/NEON/NENodeValidator.h  2
-rw-r--r--  arm_compute/graph/backends/NEON/NESubTensorHandle.h  18
-rw-r--r--  arm_compute/graph/backends/NEON/NETensorHandle.h  13
-rw-r--r--  arm_compute/graph/backends/Utils.h  14
-rw-r--r--  arm_compute/graph/backends/ValidateHelpers.h  299
-rw-r--r--  arm_compute/graph/detail/CrossLayerMemoryManagerHelpers.h  2
-rw-r--r--  arm_compute/graph/detail/ExecutionHelpers.h  2
-rw-r--r--  arm_compute/graph/frontend/ILayer.h  2
-rw-r--r--  arm_compute/graph/frontend/IStream.h  6
-rw-r--r--  arm_compute/graph/frontend/IStreamOperators.h  2
-rw-r--r--  arm_compute/graph/frontend/Layers.h  559
-rw-r--r--  arm_compute/graph/frontend/Stream.h  5
-rw-r--r--  arm_compute/graph/frontend/SubStream.h  4
-rw-r--r--  arm_compute/graph/frontend/Types.h  31
-rw-r--r--  arm_compute/graph/mutators/DepthConcatSubTensorMutator.h  4
-rw-r--r--  arm_compute/graph/mutators/GraphMutators.h  2
-rw-r--r--  arm_compute/graph/mutators/GroupedConvolutionMutator.h  4
-rw-r--r--  arm_compute/graph/mutators/InPlaceOperationMutator.h  4
-rw-r--r--  arm_compute/graph/mutators/NodeExecutionMethodMutator.h  4
-rw-r--r--  arm_compute/graph/mutators/NodeFusionMutator.h  4
-rw-r--r--  arm_compute/graph/mutators/SplitLayerSubTensorMutator.h  4
-rw-r--r--  arm_compute/graph/mutators/SyntheticDataTypeMutator.h  9
-rw-r--r--  arm_compute/graph/nodes/ActivationLayerNode.h  7
-rw-r--r--  arm_compute/graph/nodes/ArgMinMaxLayerNode.h  81
-rw-r--r--  arm_compute/graph/nodes/BatchNormalizationLayerNode.h  4
-rw-r--r--  arm_compute/graph/nodes/BoundingBoxTransformLayerNode.h  4
-rw-r--r--  arm_compute/graph/nodes/ChannelShuffleLayerNode.h  4
-rw-r--r--  arm_compute/graph/nodes/ConcatenateLayerNode.h  7
-rw-r--r--  arm_compute/graph/nodes/ConstNode.h  4
-rw-r--r--  arm_compute/graph/nodes/ConvolutionLayerNode.h  9
-rw-r--r--  arm_compute/graph/nodes/DeconvolutionLayerNode.h  4
-rw-r--r--  arm_compute/graph/nodes/DepthToSpaceLayerNode.h (renamed from arm_compute/graph/nodes/UpsampleLayerNode.h)  42
-rw-r--r--  arm_compute/graph/nodes/DepthwiseConvolutionLayerNode.h  9
-rw-r--r--  arm_compute/graph/nodes/DequantizationLayerNode.h  6
-rw-r--r--  arm_compute/graph/nodes/DetectionOutputLayerNode.h  7
-rw-r--r--  arm_compute/graph/nodes/DetectionPostProcessLayerNode.h  6
-rw-r--r--  arm_compute/graph/nodes/DummyNode.h  6
-rw-r--r--  arm_compute/graph/nodes/EltwiseLayerNode.h  44
-rw-r--r--  arm_compute/graph/nodes/FlattenLayerNode.h  4
-rw-r--r--  arm_compute/graph/nodes/FullyConnectedLayerNode.h  21
-rw-r--r--  arm_compute/graph/nodes/FusedConvolutionBatchNormalizationNode.h  9
-rw-r--r--  arm_compute/graph/nodes/FusedDepthwiseConvolutionBatchNormalizationNode.h  6
-rw-r--r--  arm_compute/graph/nodes/GenerateProposalsLayerNode.h  4
-rw-r--r--  arm_compute/graph/nodes/InputNode.h  4
-rw-r--r--  arm_compute/graph/nodes/L2NormalizeLayerNode.h  79
-rw-r--r--  arm_compute/graph/nodes/Nodes.h  19
-rw-r--r--  arm_compute/graph/nodes/NodesFwd.h  15
-rw-r--r--  arm_compute/graph/nodes/NormalizationLayerNode.h  4
-rw-r--r--  arm_compute/graph/nodes/NormalizePlanarYUVLayerNode.h  4
-rw-r--r--  arm_compute/graph/nodes/OutputNode.h  4
-rw-r--r--  arm_compute/graph/nodes/PReluLayerNode.h  4
-rw-r--r--  arm_compute/graph/nodes/PadLayerNode.h  7
-rw-r--r--  arm_compute/graph/nodes/PermuteLayerNode.h  4
-rw-r--r--  arm_compute/graph/nodes/PoolingLayerNode.h  4
-rw-r--r--  arm_compute/graph/nodes/PrintLayerNode.h  8
-rw-r--r--  arm_compute/graph/nodes/PriorBoxLayerNode.h  7
-rw-r--r--  arm_compute/graph/nodes/QuantizationLayerNode.h  12
-rw-r--r--  arm_compute/graph/nodes/ROIAlignLayerNode.h  4
-rw-r--r--  arm_compute/graph/nodes/ReductionLayerNode.h (renamed from arm_compute/graph/nodes/YOLOLayerNode.h)  40
-rw-r--r--  arm_compute/graph/nodes/ReorgLayerNode.h  4
-rw-r--r--  arm_compute/graph/nodes/ReshapeLayerNode.h  4
-rw-r--r--  arm_compute/graph/nodes/ResizeLayerNode.h  4
-rw-r--r--  arm_compute/graph/nodes/SliceLayerNode.h  9
-rw-r--r--  arm_compute/graph/nodes/SoftmaxLayerNode.h  4
-rw-r--r--  arm_compute/graph/nodes/SplitLayerNode.h  24
-rw-r--r--  arm_compute/graph/nodes/StackLayerNode.h  4
-rw-r--r--  arm_compute/graph/nodes/StridedSliceLayerNode.h  97
-rw-r--r--  arm_compute/graph/printers/DotGraphPrinter.h  11
-rw-r--r--  arm_compute/graph/printers/Printers.h  2
-rw-r--r--  arm_compute/runtime/Allocator.h  9
-rw-r--r--  arm_compute/runtime/Array.h  19
-rw-r--r--  arm_compute/runtime/BlobLifetimeManager.h  5
-rw-r--r--  arm_compute/runtime/BlobMemoryPool.h  7
-rw-r--r--  arm_compute/runtime/CL/CLArray.h  22
-rw-r--r--  arm_compute/runtime/CL/CLBufferAllocator.h  23
-rw-r--r--  arm_compute/runtime/CL/CLDistribution1D.h  79
-rw-r--r--  arm_compute/runtime/CL/CLFunctions.h  78
-rw-r--r--  arm_compute/runtime/CL/CLGEMMHeuristicsHandle.h  72
-rw-r--r--  arm_compute/runtime/CL/CLHOG.h  80
-rw-r--r--  arm_compute/runtime/CL/CLHelpers.h  15
-rw-r--r--  arm_compute/runtime/CL/CLLut.h  89
-rw-r--r--  arm_compute/runtime/CL/CLLutAllocator.h  94
-rw-r--r--  arm_compute/runtime/CL/CLMemory.h  9
-rw-r--r--  arm_compute/runtime/CL/CLMemoryRegion.h  35
-rw-r--r--  arm_compute/runtime/CL/CLMultiImage.h  87
-rw-r--r--  arm_compute/runtime/CL/CLPyramid.h  82
-rw-r--r--  arm_compute/runtime/CL/CLRuntimeContext.h  17
-rw-r--r--  arm_compute/runtime/CL/CLScheduler.h  95
-rw-r--r--  arm_compute/runtime/CL/CLSubTensor.h  11
-rw-r--r--  arm_compute/runtime/CL/CLTensor.h  10
-rw-r--r--  arm_compute/runtime/CL/CLTensorAllocator.h  19
-rw-r--r--  arm_compute/runtime/CL/CLTuner.h  86
-rw-r--r--  arm_compute/runtime/CL/CLTunerTypes.h  22
-rw-r--r--  arm_compute/runtime/CL/CLTuningParams.h  140
-rw-r--r--  arm_compute/runtime/CL/CLTypes.h  35
-rw-r--r--  arm_compute/runtime/CL/ICLGEMMKernelSelection.h  8
-rw-r--r--  arm_compute/runtime/CL/ICLOperator.h (renamed from arm_compute/runtime/GLES_COMPUTE/IGCSimpleFunction.h)  46
-rw-r--r--  arm_compute/runtime/CL/ICLSimpleFunction.h  14
-rw-r--r--  arm_compute/runtime/CL/ICLTuner.h  12
-rw-r--r--  arm_compute/runtime/CL/Utils.h  2
-rw-r--r--  arm_compute/runtime/CL/functions/CLAbsoluteDifference.h  58
-rw-r--r--  arm_compute/runtime/CL/functions/CLAccumulate.h  96
-rw-r--r--  arm_compute/runtime/CL/functions/CLActivationLayer.h  44
-rw-r--r--  arm_compute/runtime/CL/functions/CLArgMinMaxLayer.h  52
-rw-r--r--  arm_compute/runtime/CL/functions/CLBatchNormalizationLayer.h  66
-rw-r--r--  arm_compute/runtime/CL/functions/CLBatchToSpaceLayer.h  68
-rw-r--r--  arm_compute/runtime/CL/functions/CLBitwiseAnd.h  20
-rw-r--r--  arm_compute/runtime/CL/functions/CLBitwiseNot.h  14
-rw-r--r--  arm_compute/runtime/CL/functions/CLBitwiseOr.h  20
-rw-r--r--  arm_compute/runtime/CL/functions/CLBitwiseXor.h  20
-rw-r--r--  arm_compute/runtime/CL/functions/CLBoundingBoxTransform.h  35
-rw-r--r--  arm_compute/runtime/CL/functions/CLBox3x3.h  64
-rw-r--r--  arm_compute/runtime/CL/functions/CLCannyEdge.h  107
-rw-r--r--  arm_compute/runtime/CL/functions/CLCast.h  83
-rw-r--r--  arm_compute/runtime/CL/functions/CLChannelCombine.h  77
-rw-r--r--  arm_compute/runtime/CL/functions/CLChannelExtract.h  72
-rw-r--r--  arm_compute/runtime/CL/functions/CLChannelShuffleLayer.h  19
-rw-r--r--  arm_compute/runtime/CL/functions/CLColorConvert.h  100
-rw-r--r--  arm_compute/runtime/CL/functions/CLComparison.h  29
-rw-r--r--  arm_compute/runtime/CL/functions/CLComputeAllAnchors.h  71
-rw-r--r--  arm_compute/runtime/CL/functions/CLConcatenateLayer.h  74
-rw-r--r--  arm_compute/runtime/CL/functions/CLConv3D.h  116
-rw-r--r--  arm_compute/runtime/CL/functions/CLConvertFullyConnectedWeights.h  69
-rw-r--r--  arm_compute/runtime/CL/functions/CLConvolution.h  168
-rw-r--r--  arm_compute/runtime/CL/functions/CLConvolutionLayer.h  123
-rw-r--r--  arm_compute/runtime/CL/functions/CLCopy.h  60
-rw-r--r--  arm_compute/runtime/CL/functions/CLCrop.h (renamed from arm_compute/core/CL/kernels/CLCropKernel.h)  89
-rw-r--r--  arm_compute/runtime/CL/functions/CLCropResize.h  58
-rw-r--r--  arm_compute/runtime/CL/functions/CLDeconvolutionLayer.h  71
-rw-r--r--  arm_compute/runtime/CL/functions/CLDeconvolutionLayerUpsample.h  34
-rw-r--r--  arm_compute/runtime/CL/functions/CLDepthConvertLayer.h  59
-rw-r--r--  arm_compute/runtime/CL/functions/CLDepthToSpaceLayer.h  18
-rw-r--r--  arm_compute/runtime/CL/functions/CLDepthwiseConvolutionLayer.h  344
-rw-r--r--  arm_compute/runtime/CL/functions/CLDequantizationLayer.h  44
-rw-r--r--  arm_compute/runtime/CL/functions/CLDerivative.h  72
-rw-r--r--  arm_compute/runtime/CL/functions/CLDilate.h  64
-rw-r--r--  arm_compute/runtime/CL/functions/CLDirectConvolutionLayer.h  60
-rw-r--r--  arm_compute/runtime/CL/functions/CLDirectDeconvolutionLayer.h  55
-rw-r--r--  arm_compute/runtime/CL/functions/CLElementwiseOperations.h  550
-rw-r--r--  arm_compute/runtime/CL/functions/CLElementwiseUnaryLayer.h (renamed from arm_compute/runtime/CL/functions/CLElementWiseUnaryLayer.h)  226
-rw-r--r--  arm_compute/runtime/CL/functions/CLEqualizeHistogram.h  79
-rw-r--r--  arm_compute/runtime/CL/functions/CLErode.h  64
-rw-r--r--  arm_compute/runtime/CL/functions/CLFFT1D.h  57
-rw-r--r--  arm_compute/runtime/CL/functions/CLFFT2D.h  35
-rw-r--r--  arm_compute/runtime/CL/functions/CLFFTConvolutionLayer.h  104
-rw-r--r--  arm_compute/runtime/CL/functions/CLFastCorners.h  105
-rw-r--r--  arm_compute/runtime/CL/functions/CLFill.h  69
-rw-r--r--  arm_compute/runtime/CL/functions/CLFillBorder.h  58
-rw-r--r--  arm_compute/runtime/CL/functions/CLFlattenLayer.h  41
-rw-r--r--  arm_compute/runtime/CL/functions/CLFloor.h  43
-rw-r--r--  arm_compute/runtime/CL/functions/CLFullyConnectedLayer.h  193
-rw-r--r--  arm_compute/runtime/CL/functions/CLFuseBatchNormalization.h  63
-rw-r--r--  arm_compute/runtime/CL/functions/CLGEMM.h  188
-rw-r--r--  arm_compute/runtime/CL/functions/CLGEMMConvolutionLayer.h  270
-rw-r--r--  arm_compute/runtime/CL/functions/CLGEMMDeconvolutionLayer.h  56
-rw-r--r--  arm_compute/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.h  101
-rw-r--r--  arm_compute/runtime/CL/functions/CLGEMMLowpOutputStage.h  426
-rw-r--r--  arm_compute/runtime/CL/functions/CLGather.h  28
-rw-r--r--  arm_compute/runtime/CL/functions/CLGaussian3x3.h  64
-rw-r--r--  arm_compute/runtime/CL/functions/CLGaussian5x5.h  86
-rw-r--r--  arm_compute/runtime/CL/functions/CLGaussianPyramid.h  131
-rw-r--r--  arm_compute/runtime/CL/functions/CLGenerateProposalsLayer.h  82
-rw-r--r--  arm_compute/runtime/CL/functions/CLHOGDescriptor.h  88
-rw-r--r--  arm_compute/runtime/CL/functions/CLHOGDetector.h  94
-rw-r--r--  arm_compute/runtime/CL/functions/CLHOGGradient.h  89
-rw-r--r--  arm_compute/runtime/CL/functions/CLHOGMultiDetection.h  131
-rw-r--r--  arm_compute/runtime/CL/functions/CLHarrisCorners.h  124
-rw-r--r--  arm_compute/runtime/CL/functions/CLHistogram.h  74
-rw-r--r--  arm_compute/runtime/CL/functions/CLIndirectConvolutionLayer.h  135
-rw-r--r--  arm_compute/runtime/CL/functions/CLInstanceNormalizationLayer.h  72
-rw-r--r--  arm_compute/runtime/CL/functions/CLIntegralImage.h  67
-rw-r--r--  arm_compute/runtime/CL/functions/CLL2NormalizeLayer.h  41
-rw-r--r--  arm_compute/runtime/CL/functions/CLLSTMLayer.h  295
-rw-r--r--  arm_compute/runtime/CL/functions/CLLSTMLayerQuantized.h  145
-rw-r--r--  arm_compute/runtime/CL/functions/CLLaplacianPyramid.h  98
-rw-r--r--  arm_compute/runtime/CL/functions/CLLaplacianReconstruct.h  107
-rw-r--r--  arm_compute/runtime/CL/functions/CLLocallyConnectedLayer.h  121
-rw-r--r--  arm_compute/runtime/CL/functions/CLLogicalAnd.h  130
-rw-r--r--  arm_compute/runtime/CL/functions/CLLogicalNot.h (renamed from arm_compute/core/CL/kernels/CLDequantizationLayerKernel.h)  75
-rw-r--r--  arm_compute/runtime/CL/functions/CLLogicalOr.h  130
-rw-r--r--  arm_compute/runtime/CL/functions/CLMagnitude.h  57
-rw-r--r--  arm_compute/runtime/CL/functions/CLMatMul.h  134
-rw-r--r--  arm_compute/runtime/CL/functions/CLMaxUnpoolingLayer.h  125
-rw-r--r--  arm_compute/runtime/CL/functions/CLMeanStdDev.h  102
-rw-r--r--  arm_compute/runtime/CL/functions/CLMeanStdDevNormalizationLayer.h  19
-rw-r--r--  arm_compute/runtime/CL/functions/CLMedian3x3.h  64
-rw-r--r--  arm_compute/runtime/CL/functions/CLMinMaxLocation.h  102
-rw-r--r--  arm_compute/runtime/CL/functions/CLNonLinearFilter.h  75
-rw-r--r--  arm_compute/runtime/CL/functions/CLNonMaximaSuppression3x3.h  67
-rw-r--r--  arm_compute/runtime/CL/functions/CLNormalizationLayer.h  49
-rw-r--r--  arm_compute/runtime/CL/functions/CLNormalizePlanarYUVLayer.h  31
-rw-r--r--  arm_compute/runtime/CL/functions/CLOpticalFlow.h  138
-rw-r--r--  arm_compute/runtime/CL/functions/CLPReluLayer.h  43
-rw-r--r--  arm_compute/runtime/CL/functions/CLPadLayer.h  48
-rw-r--r--  arm_compute/runtime/CL/functions/CLPermute.h  45
-rw-r--r--  arm_compute/runtime/CL/functions/CLPhase.h  57
-rw-r--r--  arm_compute/runtime/CL/functions/CLPixelWiseMultiplication.h  157
-rw-r--r--  arm_compute/runtime/CL/functions/CLPooling3dLayer.h  106
-rw-r--r--  arm_compute/runtime/CL/functions/CLPoolingLayer.h  66
-rw-r--r--  arm_compute/runtime/CL/functions/CLPriorBoxLayer.h  29
-rw-r--r--  arm_compute/runtime/CL/functions/CLQLSTMLayer.h  430
-rw-r--r--  arm_compute/runtime/CL/functions/CLQuantizationLayer.h  45
-rw-r--r--  arm_compute/runtime/CL/functions/CLRNNLayer.h  75
-rw-r--r--  arm_compute/runtime/CL/functions/CLROIAlignLayer.h  31
-rw-r--r--  arm_compute/runtime/CL/functions/CLROIPoolingLayer.h  53
-rw-r--r--  arm_compute/runtime/CL/functions/CLRange.h  23
-rw-r--r--  arm_compute/runtime/CL/functions/CLReduceMean.h  31
-rw-r--r--  arm_compute/runtime/CL/functions/CLReductionOperation.h  74
-rw-r--r--  arm_compute/runtime/CL/functions/CLRemap.h  73
-rw-r--r--  arm_compute/runtime/CL/functions/CLReorgLayer.h  13
-rw-r--r--  arm_compute/runtime/CL/functions/CLReshapeLayer.h  40
-rw-r--r--  arm_compute/runtime/CL/functions/CLReverse.h  53
-rw-r--r--  arm_compute/runtime/CL/functions/CLScale.h  102
-rw-r--r--  arm_compute/runtime/CL/functions/CLScatter.h  110
-rw-r--r--  arm_compute/runtime/CL/functions/CLScharr3x3.h  70
-rw-r--r--  arm_compute/runtime/CL/functions/CLSelect.h  21
-rw-r--r--  arm_compute/runtime/CL/functions/CLSlice.h  89
-rw-r--r--  arm_compute/runtime/CL/functions/CLSobel3x3.h  70
-rw-r--r--  arm_compute/runtime/CL/functions/CLSobel5x5.h  93
-rw-r--r--  arm_compute/runtime/CL/functions/CLSobel7x7.h  93
-rw-r--r--  arm_compute/runtime/CL/functions/CLSoftmaxLayer.h  109
-rw-r--r--  arm_compute/runtime/CL/functions/CLSpaceToBatchLayer.h  90
-rw-r--r--  arm_compute/runtime/CL/functions/CLSpaceToDepthLayer.h  33
-rw-r--r--  arm_compute/runtime/CL/functions/CLSplit.h  15
-rw-r--r--  arm_compute/runtime/CL/functions/CLStackLayer.h  36
-rw-r--r--  arm_compute/runtime/CL/functions/CLStridedSlice.h  128
-rw-r--r--  arm_compute/runtime/CL/functions/CLTableLookup.h  55
-rw-r--r--  arm_compute/runtime/CL/functions/CLThreshold.h  69
-rw-r--r--  arm_compute/runtime/CL/functions/CLTile.h  20
-rw-r--r--  arm_compute/runtime/CL/functions/CLTranspose.h  46
-rw-r--r--  arm_compute/runtime/CL/functions/CLUnstack.h  30
-rw-r--r--  arm_compute/runtime/CL/functions/CLUpsampleLayer.h  93
-rw-r--r--  arm_compute/runtime/CL/functions/CLWarpAffine.h  66
-rw-r--r--  arm_compute/runtime/CL/functions/CLWarpPerspective.h  64
-rw-r--r--  arm_compute/runtime/CL/functions/CLWinogradConvolutionLayer.h  71
-rw-r--r--  arm_compute/runtime/CL/functions/CLWinogradInputTransform.h  99
-rw-r--r--  arm_compute/runtime/CL/functions/CLYOLOLayer.h  81
-rw-r--r--  arm_compute/runtime/CL/gemm/CLGEMMKernelSelection.h  65
-rw-r--r--  arm_compute/runtime/CL/gemm/CLGEMMKernelSelectionBifrost.h  55
-rw-r--r--  arm_compute/runtime/CL/gemm/CLGEMMKernelSelectionMidgard.h  53
-rw-r--r--  arm_compute/runtime/CL/tuners/BifrostTuner.h  43
-rw-r--r--  arm_compute/runtime/CL/tuners/CLLWSList.h  213
-rw-r--r--  arm_compute/runtime/CL/tuners/CLTuningParametersList.h  90
-rw-r--r--  arm_compute/runtime/CL/tuners/MidgardTuner.h  43
-rw-r--r--  arm_compute/runtime/CPP/CPPFunctions.h  2
-rw-r--r--  arm_compute/runtime/CPP/CPPScheduler.h  41
-rw-r--r--  arm_compute/runtime/CPP/ICPPSimpleFunction.h  2
-rw-r--r--  arm_compute/runtime/CPP/functions/CPPBoxWithNonMaximaSuppressionLimit.h  31
-rw-r--r--  arm_compute/runtime/CPP/functions/CPPDetectionOutputLayer.h  26
-rw-r--r--  arm_compute/runtime/CPP/functions/CPPDetectionPostProcessLayer.h  24
-rw-r--r--  arm_compute/runtime/CPP/functions/CPPNonMaximumSuppression.h  20
-rw-r--r--  arm_compute/runtime/CPP/functions/CPPPermute.h  7
-rw-r--r--  arm_compute/runtime/CPP/functions/CPPSplit.h  64
-rw-r--r--  arm_compute/runtime/CPP/functions/CPPTopKV.h  12
-rw-r--r--  arm_compute/runtime/CPP/functions/CPPUpsample.h  11
-rw-r--r--  arm_compute/runtime/CPUUtils.h  44
-rw-r--r--  arm_compute/runtime/DeviceProperties.h  41
-rw-r--r--  arm_compute/runtime/Distribution1D.h  56
-rw-r--r--  arm_compute/runtime/FunctionDescriptors.h  79
-rw-r--r--  arm_compute/runtime/GLES_COMPUTE/GCBufferAllocator.h  49
-rw-r--r--  arm_compute/runtime/GLES_COMPUTE/GCFunctions.h  51
-rw-r--r--  arm_compute/runtime/GLES_COMPUTE/GCHelpers.h  39
-rw-r--r--  arm_compute/runtime/GLES_COMPUTE/GCMemory.h  85
-rw-r--r--  arm_compute/runtime/GLES_COMPUTE/GCMemoryRegion.h  108
-rw-r--r--  arm_compute/runtime/GLES_COMPUTE/GCRuntimeContext.h  63
-rw-r--r--  arm_compute/runtime/GLES_COMPUTE/GCScheduler.h  107
-rw-r--r--  arm_compute/runtime/GLES_COMPUTE/GCTensor.h  106
-rw-r--r--  arm_compute/runtime/GLES_COMPUTE/GCTensorAllocator.h  128
-rw-r--r--  arm_compute/runtime/GLES_COMPUTE/functions/GCAbsoluteDifference.h  52
-rw-r--r--  arm_compute/runtime/GLES_COMPUTE/functions/GCActivationLayer.h  67
-rw-r--r--  arm_compute/runtime/GLES_COMPUTE/functions/GCArithmeticAddition.h  64
-rw-r--r--  arm_compute/runtime/GLES_COMPUTE/functions/GCBatchNormalizationLayer.h  69
-rw-r--r--  arm_compute/runtime/GLES_COMPUTE/functions/GCConcatenateLayer.h  68
-rw-r--r--  arm_compute/runtime/GLES_COMPUTE/functions/GCConvolutionLayer.h  155
-rw-r--r--  arm_compute/runtime/GLES_COMPUTE/functions/GCDepthwiseConvolutionLayer.h  76
-rw-r--r--  arm_compute/runtime/GLES_COMPUTE/functions/GCDirectConvolutionLayer.h  80
-rw-r--r--  arm_compute/runtime/GLES_COMPUTE/functions/GCDropoutLayer.h  63
-rw-r--r--  arm_compute/runtime/GLES_COMPUTE/functions/GCFillBorder.h  52
-rw-r--r--  arm_compute/runtime/GLES_COMPUTE/functions/GCFullyConnectedLayer.h  110
-rw-r--r--  arm_compute/runtime/GLES_COMPUTE/functions/GCGEMM.h  114
-rw-r--r--  arm_compute/runtime/GLES_COMPUTE/functions/GCGEMMInterleave4x4.h  50
-rw-r--r--  arm_compute/runtime/GLES_COMPUTE/functions/GCGEMMTranspose1xW.h  47
-rw-r--r--  arm_compute/runtime/GLES_COMPUTE/functions/GCNormalizationLayer.h  73
-rw-r--r--  arm_compute/runtime/GLES_COMPUTE/functions/GCNormalizePlanarYUVLayer.h  75
-rw-r--r--  arm_compute/runtime/GLES_COMPUTE/functions/GCPixelWiseMultiplication.h  49
-rw-r--r--  arm_compute/runtime/GLES_COMPUTE/functions/GCPoolingLayer.h  76
-rw-r--r--  arm_compute/runtime/GLES_COMPUTE/functions/GCScale.h  56
-rw-r--r--  arm_compute/runtime/GLES_COMPUTE/functions/GCSoftmaxLayer.h  77
-rw-r--r--  arm_compute/runtime/GLES_COMPUTE/functions/GCTensorShift.h  51
-rw-r--r--  arm_compute/runtime/GLES_COMPUTE/functions/GCTranspose.h  50
-rw-r--r--  arm_compute/runtime/HOG.h  56
-rw-r--r--  arm_compute/runtime/IAllocator.h  5
-rw-r--r--  arm_compute/runtime/IAssetManager.h  2
-rw-r--r--  arm_compute/runtime/IFunction.h  6
-rw-r--r--  arm_compute/runtime/ILifetimeManager.h  2
-rw-r--r--  arm_compute/runtime/ILutAllocator.h  84
-rw-r--r--  arm_compute/runtime/IMemory.h  2
-rw-r--r--  arm_compute/runtime/IMemoryGroup.h  7
-rw-r--r--  arm_compute/runtime/IMemoryManager.h  4
-rw-r--r--  arm_compute/runtime/IMemoryPool.h  4
-rw-r--r--  arm_compute/runtime/IMemoryRegion.h  5
-rw-r--r--  arm_compute/runtime/IOperator.h (renamed from arm_compute/core/IMultiImage.h)  54
-rw-r--r--  arm_compute/runtime/IPoolManager.h  4
-rw-r--r--  arm_compute/runtime/IRuntimeContext.h  8
-rw-r--r--  arm_compute/runtime/IScheduler.h  68
-rw-r--r--  arm_compute/runtime/ISimpleLifetimeManager.h  20
-rw-r--r--  arm_compute/runtime/ITensorAllocator.h  17
-rw-r--r--  arm_compute/runtime/ITransformWeights.h  16
-rw-r--r--  arm_compute/runtime/IWeightsManager.h  24
-rw-r--r--  arm_compute/runtime/Lut.h  68
-rw-r--r--  arm_compute/runtime/LutAllocator.h  61
-rw-r--r--  arm_compute/runtime/Memory.h  7
-rw-r--r--  arm_compute/runtime/MemoryGroup.h  23
-rw-r--r--  arm_compute/runtime/MemoryManagerOnDemand.h  14
-rw-r--r--  arm_compute/runtime/MemoryRegion.h  31
-rw-r--r--  arm_compute/runtime/MultiImage.h  96
-rw-r--r--  arm_compute/runtime/NEON/INEOperator.h (renamed from arm_compute/core/NEON/kernels/NEMemsetKernel.h)  75
-rw-r--r--  arm_compute/runtime/NEON/INESimpleFunction.h  25
-rw-r--r--  arm_compute/runtime/NEON/INESimpleFunctionNoBorder.h  9
-rw-r--r--  arm_compute/runtime/NEON/NEFunctions.h  80
-rw-r--r--  arm_compute/runtime/NEON/NEScheduler.h  6
-rw-r--r--  arm_compute/runtime/NEON/functions/NEAbsoluteDifference.h  50
-rw-r--r--  arm_compute/runtime/NEON/functions/NEAccumulate.h  74
-rw-r--r--  arm_compute/runtime/NEON/functions/NEActivationLayer.h  39
-rw-r--r--  arm_compute/runtime/NEON/functions/NEAddMulAdd.h  115
-rw-r--r--  arm_compute/runtime/NEON/functions/NEArgMinMaxLayer.h  34
-rw-r--r--  arm_compute/runtime/NEON/functions/NEArithmeticAddition.h  73
-rw-r--r--  arm_compute/runtime/NEON/functions/NEArithmeticSubtraction.h  75
-rw-r--r--  arm_compute/runtime/NEON/functions/NEBatchNormalizationLayer.h  52
-rw-r--r--  arm_compute/runtime/NEON/functions/NEBatchToSpaceLayer.h  47
-rw-r--r--  arm_compute/runtime/NEON/functions/NEBitwiseAnd.h  22
-rw-r--r--  arm_compute/runtime/NEON/functions/NEBitwiseNot.h  10
-rw-r--r--  arm_compute/runtime/NEON/functions/NEBitwiseOr.h  10
-rw-r--r--  arm_compute/runtime/NEON/functions/NEBitwiseXor.h  10
-rw-r--r--  arm_compute/runtime/NEON/functions/NEBoundingBoxTransform.h  34
-rw-r--r--  arm_compute/runtime/NEON/functions/NEBox3x3.h  58
-rw-r--r--  arm_compute/runtime/NEON/functions/NECannyEdge.h  99
-rw-r--r--  arm_compute/runtime/NEON/functions/NECast.h  60
-rw-r--r--  arm_compute/runtime/NEON/functions/NEChannelCombine.h  58
-rw-r--r--  arm_compute/runtime/NEON/functions/NEChannelExtract.h  56
-rw-r--r--  arm_compute/runtime/NEON/functions/NEChannelShuffleLayer.h  13
-rw-r--r--  arm_compute/runtime/NEON/functions/NECol2Im.h  60
-rw-r--r--  arm_compute/runtime/NEON/functions/NEColorConvert.h  67
-rw-r--r--  arm_compute/runtime/NEON/functions/NEComputeAllAnchors.h  62
-rw-r--r--  arm_compute/runtime/NEON/functions/NEConcatenateLayer.h  60
-rw-r--r--  arm_compute/runtime/NEON/functions/NEConv3D.h  100
-rw-r--r--  arm_compute/runtime/NEON/functions/NEConvertFullyConnectedWeights.h  80
-rw-r--r--  arm_compute/runtime/NEON/functions/NEConvolution.h  132
-rw-r--r--  arm_compute/runtime/NEON/functions/NEConvolutionLayer.h  112
-rw-r--r--  arm_compute/runtime/NEON/functions/NECopy.h  38
-rw-r--r--  arm_compute/runtime/NEON/functions/NECropResize.h  33
-rw-r--r--  arm_compute/runtime/NEON/functions/NEDeconvolutionLayer.h  89
-rw-r--r--  arm_compute/runtime/NEON/functions/NEDepthConvertLayer.h  59
-rw-r--r--  arm_compute/runtime/NEON/functions/NEDepthToSpaceLayer.h  43
-rw-r--r--  arm_compute/runtime/NEON/functions/NEDepthwiseConvolutionLayer.h  205
-rw-r--r--  arm_compute/runtime/NEON/functions/NEDequantizationLayer.h  47
-rw-r--r--  arm_compute/runtime/NEON/functions/NEDerivative.h  70
-rw-r--r--arm_compute/runtime/NEON/functions/NEDetectionPostProcessLayer.h58
-rw-r--r--arm_compute/runtime/NEON/functions/NEDirectConvolutionLayer.h63
-rw-r--r--arm_compute/runtime/NEON/functions/NEElementwiseOperations.h322
-rw-r--r--arm_compute/runtime/NEON/functions/NEElementwiseUnaryLayer.h169
-rw-r--r--arm_compute/runtime/NEON/functions/NEEqualizeHistogram.h77
-rw-r--r--arm_compute/runtime/NEON/functions/NEErode.h55
-rw-r--r--arm_compute/runtime/NEON/functions/NEFFT1D.h51
-rw-r--r--arm_compute/runtime/NEON/functions/NEFFT2D.h25
-rw-r--r--arm_compute/runtime/NEON/functions/NEFFTConvolutionLayer.h81
-rw-r--r--arm_compute/runtime/NEON/functions/NEFastCorners.h84
-rw-r--r--arm_compute/runtime/NEON/functions/NEFill.h38
-rw-r--r--arm_compute/runtime/NEON/functions/NEFillBorder.h22
-rw-r--r--arm_compute/runtime/NEON/functions/NEFlattenLayer.h35
-rw-r--r--arm_compute/runtime/NEON/functions/NEFloor.h41
-rw-r--r--arm_compute/runtime/NEON/functions/NEFullyConnectedLayer.h178
-rw-r--r--arm_compute/runtime/NEON/functions/NEFuseBatchNormalization.h45
-rw-r--r--arm_compute/runtime/NEON/functions/NEGEMM.h109
-rw-r--r--arm_compute/runtime/NEON/functions/NEGEMMAssemblyDispatch.h106
-rw-r--r--arm_compute/runtime/NEON/functions/NEGEMMConv2d.h121
-rw-r--r--arm_compute/runtime/NEON/functions/NEGEMMConvolutionLayer.h363
-rw-r--r--arm_compute/runtime/NEON/functions/NEGEMMLowpAssemblyMatrixMultiplyCore.h69
-rw-r--r--arm_compute/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.h125
-rw-r--r--arm_compute/runtime/NEON/functions/NEGEMMLowpOutputStage.h283
-rw-r--r--arm_compute/runtime/NEON/functions/NEGEMMTranspose1xW.h58
-rw-r--r--arm_compute/runtime/NEON/functions/NEGather.h22
-rw-r--r--arm_compute/runtime/NEON/functions/NEGaussian3x3.h55
-rw-r--r--arm_compute/runtime/NEON/functions/NEGaussian5x5.h75
-rw-r--r--arm_compute/runtime/NEON/functions/NEGaussianPyramid.h122
-rw-r--r--arm_compute/runtime/NEON/functions/NEGenerateProposalsLayer.h72
-rw-r--r--arm_compute/runtime/NEON/functions/NEHOGDescriptor.h76
-rw-r--r--arm_compute/runtime/NEON/functions/NEHOGDetector.h57
-rw-r--r--arm_compute/runtime/NEON/functions/NEHOGGradient.h75
-rw-r--r--arm_compute/runtime/NEON/functions/NEHOGMultiDetection.h109
-rw-r--r--arm_compute/runtime/NEON/functions/NEHarrisCorners.h107
-rw-r--r--arm_compute/runtime/NEON/functions/NEHistogram.h63
-rw-r--r--arm_compute/runtime/NEON/functions/NEIm2Col.h85
-rw-r--r--arm_compute/runtime/NEON/functions/NEInstanceNormalizationLayer.h46
-rw-r--r--arm_compute/runtime/NEON/functions/NEIntegralImage.h45
-rw-r--r--arm_compute/runtime/NEON/functions/NEL2NormalizeLayer.h38
-rw-r--r--arm_compute/runtime/NEON/functions/NELSTMLayer.h254
-rw-r--r--arm_compute/runtime/NEON/functions/NELSTMLayerQuantized.h117
-rw-r--r--arm_compute/runtime/NEON/functions/NELaplacianPyramid.h85
-rw-r--r--arm_compute/runtime/NEON/functions/NELaplacianReconstruct.h91
-rw-r--r--arm_compute/runtime/NEON/functions/NELocallyConnectedLayer.h108
-rw-r--r--arm_compute/runtime/NEON/functions/NELogical.h186
-rw-r--r--arm_compute/runtime/NEON/functions/NEMagnitude.h47
-rw-r--r--arm_compute/runtime/NEON/functions/NEMatMul.h145
-rw-r--r--arm_compute/runtime/NEON/functions/NEMaxUnpoolingLayer.h104
-rw-r--r--arm_compute/runtime/NEON/functions/NEMeanStdDev.h64
-rw-r--r--arm_compute/runtime/NEON/functions/NEMeanStdDevNormalizationLayer.h25
-rw-r--r--arm_compute/runtime/NEON/functions/NEMedian3x3.h56
-rw-r--r--arm_compute/runtime/NEON/functions/NEMinMaxLocation.h71
-rw-r--r--arm_compute/runtime/NEON/functions/NENonLinearFilter.h61
-rw-r--r--arm_compute/runtime/NEON/functions/NENonMaximaSuppression3x3.h56
-rw-r--r--arm_compute/runtime/NEON/functions/NENormalizationLayer.h48
-rw-r--r--arm_compute/runtime/NEON/functions/NEOpticalFlow.h102
-rw-r--r--arm_compute/runtime/NEON/functions/NEPReluLayer.h41
-rw-r--r--arm_compute/runtime/NEON/functions/NEPadLayer.h72
-rw-r--r--arm_compute/runtime/NEON/functions/NEPermute.h42
-rw-r--r--arm_compute/runtime/NEON/functions/NEPhase.h47
-rw-r--r--arm_compute/runtime/NEON/functions/NEPixelWiseMultiplication.h117
-rw-r--r--arm_compute/runtime/NEON/functions/NEPooling3dLayer.h96
-rw-r--r--arm_compute/runtime/NEON/functions/NEPoolingLayer.h61
-rw-r--r--arm_compute/runtime/NEON/functions/NEPriorBoxLayer.h18
-rw-r--r--arm_compute/runtime/NEON/functions/NEQLSTMLayer.h441
-rw-r--r--arm_compute/runtime/NEON/functions/NEQuantizationLayer.h50
-rw-r--r--arm_compute/runtime/NEON/functions/NERNNLayer.h64
-rw-r--r--arm_compute/runtime/NEON/functions/NEROIAlignLayer.h39
-rw-r--r--arm_compute/runtime/NEON/functions/NEROIPoolingLayer.h60
-rw-r--r--arm_compute/runtime/NEON/functions/NERange.h35
-rw-r--r--arm_compute/runtime/NEON/functions/NEReduceMean.h30
-rw-r--r--arm_compute/runtime/NEON/functions/NEReductionOperation.h70
-rw-r--r--arm_compute/runtime/NEON/functions/NERemap.h60
-rw-r--r--arm_compute/runtime/NEON/functions/NEReorderLayer.h94
-rw-r--r--arm_compute/runtime/NEON/functions/NEReorgLayer.h12
-rw-r--r--arm_compute/runtime/NEON/functions/NEReshapeLayer.h41
-rw-r--r--arm_compute/runtime/NEON/functions/NEReverse.h47
-rw-r--r--arm_compute/runtime/NEON/functions/NEScale.h82
-rw-r--r--arm_compute/runtime/NEON/functions/NEScharr3x3.h59
-rw-r--r--arm_compute/runtime/NEON/functions/NESelect.h15
-rw-r--r--arm_compute/runtime/NEON/functions/NESimpleAssemblyFunction.h56
-rw-r--r--arm_compute/runtime/NEON/functions/NESlice.h77
-rw-r--r--arm_compute/runtime/NEON/functions/NESobel3x3.h59
-rw-r--r--arm_compute/runtime/NEON/functions/NESobel5x5.h79
-rw-r--r--arm_compute/runtime/NEON/functions/NESobel7x7.h79
-rw-r--r--arm_compute/runtime/NEON/functions/NESoftmaxLayer.h91
-rw-r--r--arm_compute/runtime/NEON/functions/NESpaceToBatchLayer.h66
-rw-r--r--arm_compute/runtime/NEON/functions/NESpaceToDepthLayer.h27
-rw-r--r--arm_compute/runtime/NEON/functions/NESplit.h15
-rw-r--r--arm_compute/runtime/NEON/functions/NEStackLayer.h35
-rw-r--r--arm_compute/runtime/NEON/functions/NEStridedSlice.h111
-rw-r--r--arm_compute/runtime/NEON/functions/NETableLookup.h47
-rw-r--r--arm_compute/runtime/NEON/functions/NEThreshold.h54
-rw-r--r--arm_compute/runtime/NEON/functions/NETile.h14
-rw-r--r--arm_compute/runtime/NEON/functions/NETranspose.h44
-rw-r--r--arm_compute/runtime/NEON/functions/NEUnstack.h29
-rw-r--r--arm_compute/runtime/NEON/functions/NEUpsampleLayer.h73
-rw-r--r--arm_compute/runtime/NEON/functions/NEWarpAffine.h53
-rw-r--r--arm_compute/runtime/NEON/functions/NEWarpPerspective.h52
-rw-r--r--arm_compute/runtime/NEON/functions/NEWinogradConvolutionLayer.h114
-rw-r--r--arm_compute/runtime/NEON/functions/NEYOLOLayer.h64
-rw-r--r--arm_compute/runtime/NEON/functions/assembly/NEDepthwiseConvolutionAssemblyDispatch.h123
-rw-r--r--arm_compute/runtime/OMP/OMPScheduler.h23
-rw-r--r--arm_compute/runtime/OffsetLifetimeManager.h5
-rw-r--r--arm_compute/runtime/OffsetMemoryPool.h7
-rw-r--r--arm_compute/runtime/OperatorList.h1052
-rw-r--r--arm_compute/runtime/OperatorTensor.h75
-rw-r--r--arm_compute/runtime/PoolManager.h14
-rw-r--r--arm_compute/runtime/Pyramid.h76
-rw-r--r--arm_compute/runtime/RuntimeContext.h13
-rw-r--r--arm_compute/runtime/Scheduler.h12
-rw-r--r--arm_compute/runtime/SchedulerFactory.h2
-rw-r--r--arm_compute/runtime/SingleThreadScheduler.h10
-rw-r--r--arm_compute/runtime/SubTensor.h4
-rw-r--r--arm_compute/runtime/Tensor.h4
-rw-r--r--arm_compute/runtime/TensorAllocator.h3
-rw-r--r--arm_compute/runtime/Types.h2
-rw-r--r--arm_compute/runtime/common/LSTMParams.h52
-rw-r--r--arm_compute/runtime/experimental/Types.h (renamed from arm_compute/core/NEON/INEKernel.h)21
1070 files changed, 24026 insertions, 70334 deletions
diff --git a/arm_compute/core/NEON/kernels/convolution/common/activation.hpp b/arm_compute/Acl.h
index 091b1652c9..3e99a00cbe 100644
--- a/arm_compute/core/NEON/kernels/convolution/common/activation.hpp
+++ b/arm_compute/Acl.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019 ARM Limited.
+ * Copyright (c) 2021 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -21,17 +21,25 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
-#pragma once
-
-namespace neon_convolution_kernels
+#ifndef ARM_COMPUTE_ACL_H_
+#define ARM_COMPUTE_ACL_H_
+
+#ifdef __cplusplus
+extern "C"
 {
-enum class ActivationFunction
-{
-  None,
-  ReLU,
-  ReLU6,
-};
+#endif /* __cplusplus */
+
+/* Core headers */
+#include "arm_compute/AclEntrypoints.h"
+#include "arm_compute/AclTypes.h"
+#include "arm_compute/AclUtils.h"
+#include "arm_compute/AclVersion.h"
+
+/* Operator headers */
+#include "arm_compute/AclDescriptors.h"
+#include "arm_compute/AclOperators.h"
+
+#ifdef __cplusplus
 }
+#endif /* __cplusplus */
+#endif /* ARM_COMPUTE_ACL_H_ */
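Since Acl.h is a plain C header wrapped in extern "C", the same umbrella include works from both C and C++ translation units. A minimal sketch of consuming it; note that passing a null options pointer to request defaults is an assumption, not something the header above guarantees:

    #include "arm_compute/Acl.h"

    int main()
    {
        AclContext ctx = nullptr;
        // Assumption: a null options pointer selects default context options.
        if (AclCreateContext(&ctx, AclCpu, nullptr) != AclSuccess)
        {
            return 1;
        }
        return AclDestroyContext(ctx) == AclSuccess ? 0 : 1;
    }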
diff --git a/arm_compute/Acl.hpp b/arm_compute/Acl.hpp
new file mode 100644
index 0000000000..6a9d585c14
--- /dev/null
+++ b/arm_compute/Acl.hpp
@@ -0,0 +1,796 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_ACL_HPP_
+#define ARM_COMPUTE_ACL_HPP_
+
+#include "arm_compute/Acl.h"
+
+#include <cstdlib>
+#include <memory>
+#include <string>
+#include <vector>
+
+#if defined(ARM_COMPUTE_EXCEPTIONS_ENABLED)
+#include <exception>
+#endif /* defined(ARM_COMPUTE_EXCEPTIONS_ENABLED) */
+
+// Helper Macros
+#define ARM_COMPUTE_IGNORE_UNUSED(x) (void)(x)
+
+namespace acl
+{
+// Forward declarations
+class Context;
+class Queue;
+class Tensor;
+class TensorPack;
+
+/**< Status code enum */
+enum class StatusCode
+{
+ Success = AclSuccess,
+ RuntimeError = AclRuntimeError,
+ OutOfMemory = AclOutOfMemory,
+ Unimplemented = AclUnimplemented,
+ UnsupportedTarget = AclUnsupportedTarget,
+ InvalidArgument = AclInvalidArgument,
+ InvalidTarget = AclInvalidTarget,
+ UnsupportedConfig = AclUnsupportedConfig,
+ InvalidObjectState = AclInvalidObjectState,
+};
+
+/**< Utility namespace containing helper functions */
+namespace detail
+{
+/** Construct to handle destruction of objects
+ *
+ * @tparam T Object base type
+ */
+template <typename T>
+struct ObjectDeleter
+{
+};
+
+#define OBJECT_DELETER(obj, func) \
+ template <> \
+ struct ObjectDeleter<obj> \
+ \
+ { \
+ static inline AclStatus Destroy(obj v) \
+ { \
+ return func(v); \
+ } \
+ };
+
+OBJECT_DELETER(AclContext, AclDestroyContext)
+OBJECT_DELETER(AclQueue, AclDestroyQueue)
+OBJECT_DELETER(AclTensor, AclDestroyTensor)
+OBJECT_DELETER(AclTensorPack, AclDestroyTensorPack)
+OBJECT_DELETER(AclOperator, AclDestroyOperator)
+
+#undef OBJECT_DELETER
+
+/** Convert a strongly typed enum to a plain old C enum
+ *
+ * @tparam E  Resulting plain old C enum
+ * @tparam SE Strongly typed source enum
+ *
+ * @param[in] v Value to convert
+ *
+ * @return A corresponding plain old C enumeration
+ */
+template <typename E, typename SE>
+constexpr E as_cenum(SE v) noexcept
+{
+ return static_cast<E>(static_cast<typename std::underlying_type<SE>::type>(v));
+}
+
+/** Convert plain old enumeration to a strongly typed enum
+ *
+ * @tparam SE Strongly typed resulting enum
+ * @tparam E Plain old C enum
+ *
+ * @param[in] val Value to convert
+ *
+ * @return A corresponding strongly typed enumeration
+ */
+template <typename SE, typename E>
+constexpr SE as_enum(E val) noexcept
+{
+ return static_cast<SE>(val);
+}
+
+/** Object base class for library objects
+ *
+ * Class is defining basic common interface for all the library objects
+ *
+ * @tparam T Object type to be templated on
+ */
+template <typename T>
+class ObjectBase
+{
+public:
+ /** Destructor */
+ ~ObjectBase() = default;
+ /** Copy constructor */
+ ObjectBase(const ObjectBase<T> &) = default;
+ /** Move Constructor */
+ ObjectBase(ObjectBase<T> &&) = default;
+ /** Copy assignment operator */
+ ObjectBase<T> &operator=(const ObjectBase<T> &) = default;
+ /** Move assignment operator */
+ ObjectBase<T> &operator=(ObjectBase<T> &&) = default;
+ /** Reset object value
+ *
+ * @param [in] val Value to set
+ */
+ void reset(T *val)
+ {
+ _object.reset(val, detail::ObjectDeleter<T *>::Destroy);
+ }
+    /** Access underlying object
+ *
+ * @return Underlying object
+ */
+ const T *get() const
+ {
+ return _object.get();
+ }
+    /** Access underlying object
+ *
+ * @return Underlying object
+ */
+ T *get()
+ {
+ return _object.get();
+ }
+
+protected:
+ /** Constructor */
+ ObjectBase() = default;
+
+protected:
+ std::shared_ptr<T> _object{nullptr}; /**< Library object */
+};
+
+/** Equality operator for library object
+ *
+ * @tparam T Parameter to template on
+ *
+ * @param[in] lhs Left hand-side argument
+ * @param[in] rhs Right hand-side argument
+ *
+ * @return True if objects are equal, else false
+ */
+template <typename T>
+bool operator==(const ObjectBase<T> &lhs, const ObjectBase<T> &rhs)
+{
+ return lhs.get() == rhs.get();
+}
+
+/** Inequality operator for library object
+ *
+ * @tparam T Parameter to template on
+ *
+ * @param[in] lhs Left hand-side argument
+ * @param[in] rhs Right hand-side argument
+ *
+ * @return True if objects are not equal, else false
+ */
+template <typename T>
+bool operator!=(const ObjectBase<T> &lhs, const ObjectBase<T> &rhs)
+{
+ return !(lhs == rhs);
+}
+} // namespace detail
+
+#if defined(ARM_COMPUTE_EXCEPTIONS_ENABLED)
+/** Status class
+ *
+ * Class is an extension of std::exception and contains the underlying
+ * status construct and an error explanatory message to be reported.
+ *
+ * @note Class is visible only when exceptions are enabled during compilation
+ */
+class Status : public std::exception
+{
+public:
+ /** Constructor
+ *
+ * @param[in] status Status returned
+ * @param[in] msg Error message to be bound with the exception
+ */
+ Status(StatusCode status, const std::string &msg) : _status(status), _msg(msg)
+ {
+ }
+ /** Returns an explanatory exception message
+ *
+ * @return Status message
+ */
+ const char *what() const noexcept override
+ {
+ return _msg.c_str();
+ }
+ /** Underlying status accessor
+ *
+ * @return Status code
+ */
+ StatusCode status() const
+ {
+ return _status;
+ }
+ /** Explicit status converter
+ *
+ * @return Status code
+ */
+ explicit operator StatusCode() const
+ {
+ return _status;
+ }
+
+private:
+ StatusCode _status; /**< Status code */
+ std::string _msg; /**< Status message */
+};
+
+/** Reports an error status and throws an exception object in case of failure
+ *
+ * @note This implementation is used when exceptions are enabled during compilation
+ *
+ * @param[in] status Status to report
+ * @param[in] msg    Explanatory error message
+ */
+static inline void report_status(StatusCode status, const std::string &msg)
+{
+ if (status != StatusCode::Success)
+ {
+ throw Status(status, msg);
+ }
+}
+#else /* defined(ARM_COMPUTE_EXCEPTIONS_ENABLED) */
+/** Reports a status code
+ *
+ * @note This implementation is used when exceptions are disabled during compilation
+ * @note Message is suppressed and not reported in this case
+ *
+ * @param[in] status Status to report
+ * @param[in] msg    Explanatory error message
+ */
+static inline void report_status(StatusCode status, const std::string &msg)
+{
+ ARM_COMPUTE_IGNORE_UNUSED(status);
+ ARM_COMPUTE_IGNORE_UNUSED(msg);
+}
+#endif /* defined(ARM_COMPUTE_EXCEPTIONS_ENABLED) */
+
+/**< Target enum */
+enum class Target
+{
+ Cpu = AclCpu, /**< Cpu target that leverages SIMD */
+ GpuOcl = AclGpuOcl /**< Gpu target that leverages OpenCL */
+};
+
+/**< Available execution modes */
+enum class ExecutionMode
+{
+ FastRerun =
+ AclPreferFastRerun, /**< Prefer minimum latency in consecutive runs, might introduce higher startup times */
+ FastStart = AclPreferFastStart, /**< Prefer minimizing startup time */
+};
+
+/** Context class
+ *
+ * Context acts as a central aggregate service for further objects created from it.
+ * It provides, internally, common facilities in order to avoid the use of global
+ * statically initialized objects that can lead to important side-effects under
+ * specific execution contexts.
+ *
+ * For example context contains allocators for object creation, for further backing memory allocation,
+ * any serialization interfaces and other modules that affect the construction of objects,
+ * like program caches for OpenCL.
+ */
+class Context : public detail::ObjectBase<AclContext_>
+{
+public:
+ /**< Context options */
+ struct Options
+ {
+ static constexpr int32_t num_threads_auto = -1; /**< Allow runtime to specify number of threads */
+
+ /** Default Constructor
+ *
+ * @note By default no precision loss is enabled for operators
+ * @note By default the preferred execution mode is to favor multiple consecutive reruns of an operator
+ */
+ Options()
+ : Options(ExecutionMode::FastRerun /* mode */,
+ AclCpuCapabilitiesAuto /* caps */,
+ false /* enable_fast_math */,
+ nullptr /* kernel_config */,
+ num_threads_auto /* max_compute_units */,
+ nullptr /* allocator */)
+ {
+ }
+ /** Constructor
+ *
+ * @param[in] mode Execution mode to be used
+ * @param[in] caps Capabilities to be used
+ * @param[in] enable_fast_math Allow precision loss in favor of performance
+ * @param[in] kernel_config Kernel configuration file containing construction tuning meta-data
+         * @param[in] max_compute_units Max compute units that are expected to be used
+ * @param[in] allocator Allocator to be used for internal memory allocation
+ */
+ Options(ExecutionMode mode,
+ AclTargetCapabilities caps,
+ bool enable_fast_math,
+ const char *kernel_config,
+ int32_t max_compute_units,
+ AclAllocator *allocator)
+ {
+ copts.mode = detail::as_cenum<AclExecutionMode>(mode);
+ copts.capabilities = caps;
+ copts.enable_fast_math = enable_fast_math;
+ copts.kernel_config_file = kernel_config;
+ copts.max_compute_units = max_compute_units;
+ copts.allocator = allocator;
+ }
+
+ AclContextOptions copts{};
+ };
+
+public:
+ /** Constructor
+ *
+ * @note Serves as a simpler delegate constructor
+ * @note As context options, default conservative options will be used
+ *
+ * @param[in] target Target to create context for
+ * @param[out] status Status information if requested
+ */
+ explicit Context(Target target, StatusCode *status = nullptr) : Context(target, Options(), status)
+ {
+ }
+ /** Constructor
+ *
+ * @param[in] target Target to create context for
+ * @param[in] options Context construction options
+ * @param[out] status Status information if requested
+ */
+ Context(Target target, const Options &options, StatusCode *status = nullptr)
+ {
+ AclContext ctx;
+ const auto st =
+ detail::as_enum<StatusCode>(AclCreateContext(&ctx, detail::as_cenum<AclTarget>(target), &options.copts));
+ reset(ctx);
+ report_status(st, "[Compute Library] Failed to create context");
+ if (status)
+ {
+ *status = st;
+ }
+ }
+};
+
+/**< Available tuning modes */
+enum class TuningMode
+{
+ Rapid = AclRapid,
+ Normal = AclNormal,
+ Exhaustive = AclExhaustive
+};
+
+/** Queue class
+ *
+ * Queue is responsible for execution-related aspects, its main responsibilities being
+ * the scheduling and tuning of operators.
+ *
+ * Multiple queues can be created from the same context, and the same operator can be scheduled on each concurrently.
+ *
+ * @note An operator might depend on the maximum number of compute units provided in the context,
+ * thus a queue whose number of scheduling units exceeds that maximum might lead to errors.
+ */
+class Queue : public detail::ObjectBase<AclQueue_>
+{
+public:
+ /**< Queue options */
+ struct Options
+ {
+ /** Default Constructor
+ *
+         * As default options, no tuning will be performed, and the number of scheduling units will
+         * depend on internal device discovery functionality
+ */
+        Options() : opts{AclTuningModeNone, 0} {}
+ /** Constructor
+ *
+ * @param[in] mode Tuning mode to be used
+ * @param[in] compute_units Number of scheduling units to be used
+ */
+ Options(TuningMode mode, int32_t compute_units) : opts{detail::as_cenum<AclTuningMode>(mode), compute_units}
+ {
+ }
+
+ AclQueueOptions opts;
+ };
+
+public:
+ /** Constructor
+ *
+ * @note Serves as a simpler delegate constructor
+ * @note As queue options, default conservative options will be used
+ *
+ * @param[in] ctx Context to create queue for
+ * @param[out] status Status information if requested
+ */
+ explicit Queue(Context &ctx, StatusCode *status = nullptr) : Queue(ctx, Options(), status)
+ {
+ }
+    /** Constructor
+     *
+     * @param[in]  ctx     Context from which the queue will be created
+     * @param[in]  options Queue options to be used
+     * @param[out] status  Status information if requested
+     */
+    explicit Queue(Context &ctx, const Options &options, StatusCode *status = nullptr)
+ {
+ AclQueue queue;
+ const auto st = detail::as_enum<StatusCode>(AclCreateQueue(&queue, ctx.get(), &options.opts));
+ reset(queue);
+ report_status(st, "[Compute Library] Failed to create queue!");
+ if (status)
+ {
+ *status = st;
+ }
+ }
+ /** Block until all the tasks of the queue have been marked as finished
+ *
+ * @return Status code
+ */
+ StatusCode finish()
+ {
+ return detail::as_enum<StatusCode>(AclQueueFinish(_object.get()));
+ }
+};
+
+/**< Data type enumeration */
+enum class DataType
+{
+ Unknown = AclDataTypeUnknown,
+ UInt8 = AclUInt8,
+ Int8 = AclInt8,
+ UInt16 = AclUInt16,
+ Int16 = AclInt16,
+ UInt32 = AclUint32,
+ Int32 = AclInt32,
+ Float16 = AclFloat16,
+ BFloat16 = AclBFloat16,
+ Float32 = AclFloat32,
+};
+
+/** Tensor Descriptor class
+ *
+ * Structure that contains all the required meta-data to represent a tensor
+ */
+class TensorDescriptor
+{
+public:
+ /** Constructor
+ *
+ * @param[in] shape Shape of the tensor
+ * @param[in] data_type Data type of the tensor
+ */
+ TensorDescriptor(const std::vector<int32_t> &shape, DataType data_type) : _shape(shape), _data_type(data_type)
+ {
+ _cdesc.ndims = _shape.size();
+ _cdesc.shape = _shape.data();
+ _cdesc.data_type = detail::as_cenum<AclDataType>(_data_type);
+ _cdesc.strides = nullptr;
+ _cdesc.boffset = 0;
+ }
+ /** Constructor
+ *
+ * @param[in] desc C-type descriptor
+ */
+ explicit TensorDescriptor(const AclTensorDescriptor &desc)
+ {
+ _cdesc = desc;
+ _data_type = detail::as_enum<DataType>(desc.data_type);
+ _shape.reserve(desc.ndims);
+ for (int32_t d = 0; d < desc.ndims; ++d)
+ {
+ _shape.emplace_back(desc.shape[d]);
+ }
+ }
+ /** Get underlying C tensor descriptor
+ *
+ * @return Underlying structure
+ */
+ const AclTensorDescriptor *get() const
+ {
+ return &_cdesc;
+ }
+    /** Equality operator to compare two TensorDescriptor objects
+ *
+ * @param[in] other The instance to compare against
+ *
+ * @return True if two instances have the same shape and data type
+ */
+ bool operator==(const TensorDescriptor &other)
+ {
+ bool is_same = true;
+
+ is_same &= _data_type == other._data_type;
+ is_same &= _shape.size() == other._shape.size();
+
+ if (is_same)
+ {
+ for (uint32_t d = 0; d < _shape.size(); ++d)
+ {
+ is_same &= _shape[d] == other._shape[d];
+ }
+ }
+
+ return is_same;
+ }
+
+private:
+ std::vector<int32_t> _shape{};
+ DataType _data_type{};
+ AclTensorDescriptor _cdesc{};
+};
+
+/** Import memory types */
+enum class ImportType
+{
+ Host = AclImportMemoryType::AclHostPtr
+};
+
+/** Tensor class
+ *
+ * Tensor is a mathematical construct that can represent an N-dimensional space.
+ *
+ * @note Maximum dimensionality support is 6 internally at the moment
+ */
+class Tensor : public detail::ObjectBase<AclTensor_>
+{
+public:
+ /** Constructor
+ *
+ * @note Tensor memory is allocated
+ *
+     * @param[in]  ctx    Context from which the tensor will be created
+ * @param[in] desc Tensor descriptor to be used
+ * @param[out] status Status information if requested
+ */
+ Tensor(Context &ctx, const TensorDescriptor &desc, StatusCode *status = nullptr) : Tensor(ctx, desc, true, status)
+ {
+ }
+ /** Constructor
+ *
+     * @param[in]  ctx      Context from which the tensor will be created
+ * @param[in] desc Tensor descriptor to be used
+ * @param[in] allocate Flag to indicate if the tensor needs to be allocated
+ * @param[out] status Status information if requested
+ */
+ Tensor(Context &ctx, const TensorDescriptor &desc, bool allocate, StatusCode *status)
+ {
+ AclTensor tensor;
+ const auto st = detail::as_enum<StatusCode>(AclCreateTensor(&tensor, ctx.get(), desc.get(), allocate));
+ reset(tensor);
+ report_status(st, "[Compute Library] Failed to create tensor!");
+ if (status)
+ {
+ *status = st;
+ }
+ }
+    /** Maps the tensor's backing memory so that the host can access its contents
+ *
+ * @return A valid non-zero pointer in case of success else nullptr
+ */
+ void *map()
+ {
+ void *handle = nullptr;
+ const auto st = detail::as_enum<StatusCode>(AclMapTensor(_object.get(), &handle));
+ report_status(st, "[Compute Library] Failed to map the tensor and extract the tensor's backing memory!");
+ return handle;
+ }
+ /** Unmaps tensor's memory
+ *
+ * @param[in] handle Handle to unmap
+ *
+ * @return Status code
+ */
+ StatusCode unmap(void *handle)
+ {
+ const auto st = detail::as_enum<StatusCode>(AclUnmapTensor(_object.get(), handle));
+ report_status(st, "[Compute Library] Failed to unmap the tensor!");
+ return st;
+ }
+ /** Import external memory to a given tensor object
+ *
+ * @param[in] handle External memory handle
+ * @param[in] type Type of memory to be imported
+ *
+ * @return Status code
+ */
+ StatusCode import(void *handle, ImportType type)
+ {
+ const auto st = detail::as_enum<StatusCode>(
+ AclTensorImport(_object.get(), handle, detail::as_cenum<AclImportMemoryType>(type)));
+ report_status(st, "[Compute Library] Failed to import external memory to tensor!");
+ return st;
+ }
+    /** Get the size of the tensor in bytes
+     *
+     * @note The size isn't based on allocated memory, but based on information in its descriptor (dimensions, data type, etc.).
+     *
+     * @return The size of the tensor in bytes
+ */
+ uint64_t get_size()
+ {
+ uint64_t size{0};
+ const auto st = detail::as_enum<StatusCode>(AclGetTensorSize(_object.get(), &size));
+ report_status(st, "[Compute Library] Failed to get the size of the tensor");
+ return size;
+ }
+ /** Get the descriptor of this tensor
+ *
+ * @return The descriptor describing the characteristics of this tensor
+ */
+ TensorDescriptor get_descriptor()
+ {
+ AclTensorDescriptor desc;
+ const auto st = detail::as_enum<StatusCode>(AclGetTensorDescriptor(_object.get(), &desc));
+ report_status(st, "[Compute Library] Failed to get the descriptor of the tensor");
+ return TensorDescriptor(desc);
+ }
+};
+
+/** Tensor pack class
+ *
+ * Pack is a utility construct that is used to create a collection of tensors that can then
+ * be passed into an operator as inputs.
+ */
+class TensorPack : public detail::ObjectBase<AclTensorPack_>
+{
+public:
+ /** Pack pair construct */
+ struct PackPair
+ {
+ /** Constructor
+ *
+ * @param[in] tensor_ Tensor to pack
+         * @param[in] slot_id_ Slot identification of the tensor with respect to the operator
+ */
+ PackPair(Tensor *tensor_, int32_t slot_id_) : tensor(tensor_), slot_id(slot_id_)
+ {
+ }
+
+ Tensor *tensor{nullptr}; /**< Tensor object */
+        int32_t slot_id{AclSlotUnknown}; /**< Slot id with respect to the operator */
+ };
+
+public:
+ /** Constructor
+ *
+     * @param[in]  ctx    Context from which the tensor pack will be created
+ * @param[out] status Status information if requested
+ */
+ explicit TensorPack(Context &ctx, StatusCode *status = nullptr)
+ {
+ AclTensorPack pack;
+ const auto st = detail::as_enum<StatusCode>(AclCreateTensorPack(&pack, ctx.get()));
+ reset(pack);
+ report_status(st, "[Compute Library] Failure during tensor pack creation");
+ if (status)
+ {
+ *status = st;
+ }
+ }
+ /** Add tensor to tensor pack
+ *
+     * @param[in] tensor  Tensor to be added to the pack
+     * @param[in] slot_id Slot id of the tensor with respect to the operator
+ *
+ * @return Status code
+ */
+ StatusCode add(Tensor &tensor, int32_t slot_id)
+ {
+ return detail::as_enum<StatusCode>(AclPackTensor(_object.get(), tensor.get(), slot_id));
+ }
+ /** Add a list of tensors to a tensor pack
+ *
+     * @param[in] packed Pack pairs to be added
+ *
+ * @return Status code
+ */
+ StatusCode add(std::initializer_list<PackPair> packed)
+ {
+ const size_t size = packed.size();
+ std::vector<int32_t> slots(size);
+ std::vector<AclTensor> tensors(size);
+ int i = 0;
+ for (auto &p : packed)
+ {
+ slots[i] = p.slot_id;
+            tensors[i] = p.tensor->get(); // extract the underlying AclTensor handle
+ ++i;
+ }
+ return detail::as_enum<StatusCode>(AclPackTensors(_object.get(), tensors.data(), slots.data(), size));
+ }
+};
+
+/** Operator class
+ *
+ * Operators are the basic algorithmic blocks responsible for performing distinct operations
+ */
+class Operator : public detail::ObjectBase<AclOperator_>
+{
+public:
+    /** Run an operator on a given input list
+     *
+     * @param[in,out] queue Queue to schedule the operator on
+     * @param[in]     pack  Tensor list to be used as input
+     *
+     * @return Status code
+     */
+    StatusCode run(Queue &queue, TensorPack &pack)
+    {
+        return detail::as_enum<StatusCode>(AclRunOperator(_object.get(), queue.get(), pack.get()));
+    }
+
+protected:
+ /** Constructor */
+ Operator() = default;
+};
+
+/// Operators
+using ActivationDesc = AclActivationDescriptor;
+class Activation : public Operator
+{
+public:
+ Activation(Context &ctx,
+ const TensorDescriptor &src,
+ const TensorDescriptor &dst,
+ const ActivationDesc &desc,
+ StatusCode *status = nullptr)
+ {
+ AclOperator op;
+ const auto st = detail::as_enum<StatusCode>(AclActivation(&op, ctx.get(), src.get(), dst.get(), desc));
+ reset(op);
+ report_status(st, "[Compute Library] Failure during Activation operator creation");
+ if (status)
+ {
+ *status = st;
+ }
+ }
+};
+} // namespace acl
+#undef ARM_COMPUTE_IGNORE_UNUSED
+#endif /* ARM_COMPUTE_ACL_HPP_ */
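Taken together, the classes above compose into a small eager-execution flow: a Context feeds a Queue, Tensors, a TensorPack, and an operator. A minimal sketch; the AclRelu activation type and the ACL_SRC/ACL_DST slot ids are assumed to be available from the C headers (the operator documentation references them), and error handling via the optional StatusCode out-parameters is elided:

    #include "arm_compute/Acl.hpp"

    int main()
    {
        acl::Context ctx(acl::Target::Cpu);   // aggregate service for everything below
        acl::Queue   queue(ctx);              // scheduling/tuning handle

        acl::TensorDescriptor desc({32, 16}, acl::DataType::Float32);
        acl::Tensor src(ctx, desc);           // backing memory allocated on construction
        acl::Tensor dst(ctx, desc);

        acl::ActivationDesc adesc{AclRelu, 0.f, 0.f, false};
        acl::Activation act(ctx, desc, desc, adesc);

        acl::TensorPack pack(ctx);
        pack.add({{&src, ACL_SRC}, {&dst, ACL_DST}}); // initializer_list<PackPair> overload

        act.run(queue, pack);                 // eager execution on the queue
        queue.finish();                       // block until completion
        return 0;
    }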
diff --git a/arm_compute/AclDescriptors.h b/arm_compute/AclDescriptors.h
new file mode 100644
index 0000000000..cdaf7c0dc8
--- /dev/null
+++ b/arm_compute/AclDescriptors.h
@@ -0,0 +1,63 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_ACL_DESCRIPTORS_H_
+#define ARM_COMPUTE_ACL_DESCRIPTORS_H_
+
+#ifdef __cplusplus
+extern "C"
+{
+#endif /** __cplusplus */
+
+ /**< Supported activation types */
+ typedef enum
+ {
+ AclActivationTypeNone = 0, /**< No activation */
+ AclIdentity = 1, /**< Identity */
+ AclLogistic = 2, /**< Logistic */
+ AclTanh = 3, /**< Hyperbolic tangent */
+ AclRelu = 4, /**< Rectifier */
+ AclBoundedRelu = 5, /**< Upper Bounded Rectifier */
+ AclLuBoundedRelu = 6, /**< Lower and Upper Bounded Rectifier */
+ AclLeakyRelu = 7, /**< Leaky Rectifier */
+ AclSoftRelu = 8, /**< Soft Rectifier */
+ AclElu = 9, /**< Exponential Linear Unit */
+ AclAbs = 10, /**< Absolute */
+ AclSquare = 11, /**< Square */
+ AclSqrt = 12, /**< Square root */
+ AclLinear = 13, /**< Linear */
+ AclHardSwish = 14, /**< Hard-swish */
+ } AclActivationType;
+
+ /**< Activation layer descriptor */
+ typedef struct
+ {
+ AclActivationType type; /**< Activation type */
+ float a; /**< Factor &alpha used by some activations */
+ float b; /**< Factor &beta used by some activations */
+ bool inplace; /**< Hint that src and dst tensors will be the same */
+ } AclActivationDescriptor;
+#ifdef __cplusplus
+}
+#endif /** __cplusplus */
+#endif /* ARM_COMPUTE_ACL_DESCRIPTORS_H_ */
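As a usage note, the a/b factors are only read by the parameterized activations. For instance, a ReLU6-style activation maps onto the upper bounded rectifier with a = 6; a sketch of filling the descriptor (the helper name is illustrative):

    #include "arm_compute/AclDescriptors.h"

    // Sketch: describe a ReLU6-style activation via the bounded rectifier.
    AclActivationDescriptor make_relu6_descriptor()
    {
        AclActivationDescriptor desc{};
        desc.type    = AclBoundedRelu; // f(x) = min(a, max(0, x))
        desc.a       = 6.0f;           // upper bound
        desc.b       = 0.0f;           // not read by this activation
        desc.inplace = false;          // hint that src and dst differ
        return desc;
    }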
diff --git a/arm_compute/AclEntrypoints.h b/arm_compute/AclEntrypoints.h
new file mode 100644
index 0000000000..0d4902a3d5
--- /dev/null
+++ b/arm_compute/AclEntrypoints.h
@@ -0,0 +1,265 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_ACL_ENTRYPOINTS_H_
+#define ARM_COMPUTE_ACL_ENTRYPOINTS_H_
+
+#include "arm_compute/AclTypes.h"
+
+#ifdef __cplusplus
+extern "C"
+{
+#endif /** __cplusplus */
+
+ /** Create a context object
+ *
+     * Context is responsible for retaining internal information and acting as an aggregate service mechanism
+ *
+ * @param[in, out] ctx A valid non-zero context object if no failure occurs
+ * @param[in] target Target to create the context for
+ * @param[in] options Context options to be used for all the kernels that are created under the context
+ *
+ * @return Status code
+ *
+ * Returns:
+ * - @ref AclSuccess if function was completed successfully
+ * - @ref AclOutOfMemory if there was a failure allocating memory resources
+ * - @ref AclUnsupportedTarget if the requested target is unsupported
+ * - @ref AclInvalidArgument if a given argument is invalid
+ */
+ AclStatus AclCreateContext(AclContext *ctx, AclTarget target, const AclContextOptions *options);
+
+ /** Destroy a given context object
+ *
+ * @param[in] ctx A valid context object to destroy
+ *
+ * @return Status code
+ *
+ * Returns:
+     * - @ref AclSuccess if function was completed successfully
+ * - @ref AclInvalidArgument if the provided context is invalid
+ */
+ AclStatus AclDestroyContext(AclContext ctx);
+
+ /** Create an operator queue
+ *
+     * Queue is responsible for any scheduling-related activities
+     *
+     * @param[in, out] queue   A valid non-zero queue object if no failures occur
+ * @param[in] ctx Context to be used
+ * @param[in] options Queue options to be used for the operators using the queue
+ *
+ * @return Status code
+ *
+ * Returns:
+ * - @ref AclSuccess if function was completed successfully
+ * - @ref AclOutOfMemory if there was a failure allocating memory resources
+ * - @ref AclUnsupportedTarget if the requested target is unsupported
+ * - @ref AclInvalidArgument if a given argument is invalid
+ */
+ AclStatus AclCreateQueue(AclQueue *queue, AclContext ctx, const AclQueueOptions *options);
+
+ /** Wait until all elements on the queue have been completed
+ *
+ * @param[in] queue Queue to wait on completion
+ *
+ * @return Status code
+ *
+ * Returns:
+     * - @ref AclSuccess if function was completed successfully
+ * - @ref AclInvalidArgument if the provided queue is invalid
+ * - @ref AclRuntimeError on any other runtime related error
+ */
+ AclStatus AclQueueFinish(AclQueue queue);
+
+ /** Destroy a given queue object
+ *
+     * @param[in] queue A valid queue object to destroy
+ *
+ * @return Status code
+ *
+ * Returns:
+     * - @ref AclSuccess if function was completed successfully
+     * - @ref AclInvalidArgument if the provided queue is invalid
+ */
+ AclStatus AclDestroyQueue(AclQueue queue);
+
+ /** Create a Tensor object
+ *
+     * Tensor is a generalized matrix construct that can represent up to N-dimensional data (where N = 6 for Compute Library).
+     * The object holds backing memory alongside it to operate on
+ *
+ * @param[in, out] tensor A valid non-zero tensor object if no failures occur
+ * @param[in] ctx Context to be used
+ * @param[in] desc Tensor representation meta-data
+     * @param[in]      allocate Instructs allocation of the tensor object
+     *
+     * @return Status code
+     *
+     * Returns:
+ * - @ref AclSuccess if function was completed successfully
+ * - @ref AclOutOfMemory if there was a failure allocating memory resources
+ * - @ref AclUnsupportedTarget if the requested target is unsupported
+ * - @ref AclInvalidArgument if a given argument is invalid
+ */
+ AclStatus AclCreateTensor(AclTensor *tensor, AclContext ctx, const AclTensorDescriptor *desc, bool allocate);
+
+ /** Map a tensor's backing memory to the host
+ *
+ * @param[in] tensor Tensor to be mapped
+ * @param[in, out] handle A handle to the underlying backing memory
+ *
+ * @return Status code
+ *
+ * Returns:
+ * - @ref AclSuccess if function was completed successfully
+ * - @ref AclInvalidArgument if a given argument is invalid
+ */
+ AclStatus AclMapTensor(AclTensor tensor, void **handle);
+
+ /** Unmap the tensor's backing memory
+ *
+     * @param[in] tensor Tensor to unmap memory from
+ * @param[in] handle Backing memory to be unmapped
+ *
+ * @return Status code
+ *
+ * Returns:
+ * - @ref AclSuccess if function was completed successfully
+ * - @ref AclInvalidArgument if a given argument is invalid
+ */
+ AclStatus AclUnmapTensor(AclTensor tensor, void *handle);
+
+ /** Import external memory to a given tensor object
+ *
+ * @param[in, out] tensor Tensor to import memory to
+ * @param[in] handle Backing memory to be imported
+     * @param[in]      type   Type of the imported memory
+     *
+     * @return Status code
+     *
+     * Returns:
+ * - @ref AclSuccess if function was completed successfully
+ * - @ref AclInvalidArgument if a given argument is invalid
+ */
+ AclStatus AclTensorImport(AclTensor tensor, void *handle, AclImportMemoryType type);
+
+ /** Destroy a given tensor object
+ *
+ * @param[in,out] tensor A valid tensor object to be destroyed
+ *
+ * @return Status code
+ *
+ * Returns:
+ * - @ref AclSuccess if function was completed successfully
+ * - @ref AclInvalidArgument if the provided tensor is invalid
+ */
+ AclStatus AclDestroyTensor(AclTensor tensor);
+
+ /** Creates a tensor pack
+ *
+ * Tensor packs are used to create a collection of tensors that can be passed around for operator execution
+ *
+ * @param[in,out] pack A valid non-zero tensor pack object if no failures occur
+ * @param[in] ctx Context to be used
+ *
+ * @return Status code
+ *
+ * Returns:
+ * - @ref AclSuccess if function was completed successfully
+ * - @ref AclOutOfMemory if there was a failure allocating memory resources
+ * - @ref AclInvalidArgument if a given argument is invalid
+ */
+ AclStatus AclCreateTensorPack(AclTensorPack *pack, AclContext ctx);
+
+ /** Add a tensor to a tensor pack
+ *
+ * @param[in,out] pack Pack to append a tensor to
+ * @param[in] tensor Tensor to pack
+     * @param[in]     slot_id Slot of the operator that the tensor corresponds to
+ *
+ * @return Status code
+ *
+ * Returns:
+ * - @ref AclSuccess if function was completed successfully
+ * - @ref AclOutOfMemory if there was a failure allocating memory resources
+ * - @ref AclInvalidArgument if a given argument is invalid
+ */
+ AclStatus AclPackTensor(AclTensorPack pack, AclTensor tensor, int32_t slot_id);
+
+    /** Add a list of tensors to a tensor pack
+ *
+ * @param[in,out] pack Pack to append the tensors to
+ * @param[in] tensors Tensors to append to the pack
+     * @param[in]     slot_ids Slot IDs of each tensor with respect to the operator
+ * @param[in] num_tensors Number of tensors that are passed
+ *
+ * @return Status code
+ *
+ * Returns:
+ * - @ref AclSuccess if function was completed successfully
+ * - @ref AclOutOfMemory if there was a failure allocating memory resources
+ * - @ref AclInvalidArgument if a given argument is invalid
+ */
+ AclStatus AclPackTensors(AclTensorPack pack, AclTensor *tensors, int32_t *slot_ids, size_t num_tensors);
+
+ /** Destroy a given tensor pack object
+ *
+ * @param[in,out] pack A valid tensor pack object to destroy
+ *
+ * @return Status code
+ *
+ * Returns:
+     * - @ref AclSuccess if function was completed successfully
+     * - @ref AclInvalidArgument if the provided tensor pack is invalid
+ */
+ AclStatus AclDestroyTensorPack(AclTensorPack pack);
+
+ /** Eager execution of a given operator on a list of inputs and outputs
+ *
+ * @param[in] op Operator to execute
+ * @param[in] queue Queue to schedule the operator on
+     * @param[in,out] tensors A list of input and output tensors to execute the operator on
+ *
+ * @return Status Code
+ *
+ * Returns:
+ * - @ref AclSuccess if function was completed successfully
+ * - @ref AclOutOfMemory if there was a failure allocating memory resources
+ * - @ref AclUnsupportedTarget if the requested target is unsupported
+ * - @ref AclInvalidArgument if a given argument is invalid
+ * - @ref AclRuntimeError on any other runtime related error
+ */
+ AclStatus AclRunOperator(AclOperator op, AclQueue queue, AclTensorPack tensors);
+
+ /** Destroy a given operator object
+ *
+ * @param[in,out] op A valid operator object to destroy
+ *
+ * @return Status code
+ *
+ * Returns:
+     * - @ref AclSuccess if function was completed successfully
+     * - @ref AclInvalidArgument if the provided operator is invalid
+ */
+ AclStatus AclDestroyOperator(AclOperator op);
+#ifdef __cplusplus
+}
+#endif /* __cplusplus */
+#endif /* ARM_COMPUTE_ACL_ENTRYPOINTS_H_ */
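The entry points above pair into create/destroy lifecycles, with AclMapTensor/AclUnmapTensor bracketing any host-side access to a tensor's backing memory. A sketch of that flow (the 2-D float32 shape and the function name are illustrative):

    #include "arm_compute/Acl.h"

    #include <cstring>

    bool fill_with_zeros(AclContext ctx)
    {
        int32_t shape[2] = {8, 8};

        AclTensorDescriptor desc{}; // strides/boffset stay zero for a packed layout
        desc.ndims     = 2;
        desc.shape     = shape;
        desc.data_type = AclFloat32;

        AclTensor tensor = nullptr;
        if (AclCreateTensor(&tensor, ctx, &desc, true) != AclSuccess)
        {
            return false;
        }

        void *mem = nullptr;
        if (AclMapTensor(tensor, &mem) == AclSuccess)
        {
            std::memset(mem, 0, 8 * 8 * sizeof(float)); // host-side access while mapped
            AclUnmapTensor(tensor, mem);
        }
        return AclDestroyTensor(tensor) == AclSuccess;
    }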
diff --git a/arm_compute/AclOpenClExt.h b/arm_compute/AclOpenClExt.h
new file mode 100644
index 0000000000..28e918d371
--- /dev/null
+++ b/arm_compute/AclOpenClExt.h
@@ -0,0 +1,113 @@
+/*
+ * Copyright (c) 2021-2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_ACL_OPENCL_EXT_H_
+#define ARM_COMPUTE_ACL_OPENCL_EXT_H_
+
+#include "arm_compute/AclTypes.h"
+
+#ifndef CL_TARGET_OPENCL_VERSION
+#define CL_TARGET_OPENCL_VERSION 300
+#define CL_USE_DEPRECATED_OPENCL_1_1_APIS
+#define CL_USE_DEPRECATED_OPENCL_1_2_APIS
+#endif /* CL_TARGET_OPENCL_VERSION */
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Weffc++"
+#include "include/CL/cl.h"
+#pragma GCC diagnostic pop
+
+#ifdef __cplusplus
+extern "C"
+{
+#endif /* __cplusplus */
+
+ /** Extract the underlying OpenCL context used by a given Compute Library context object
+ *
+ * @note @ref AclContext should be of an OpenCL backend target
+ *
+ * @param[in] ctx A valid non-zero context
+ * @param[out] opencl_context Underlying OpenCL context used
+ *
+ * @return Status code
+ */
+ AclStatus AclGetClContext(AclContext ctx, cl_context *opencl_context);
+
+ /** Extract the underlying OpenCL device id used by a given Compute Library context object
+ *
+ * @note @ref AclContext should be of an OpenCL backend target
+ *
+ * @param[in] ctx A valid non-zero context
+ * @param[out] opencl_device Underlying OpenCL device used
+ *
+ * @return Status code
+ */
+ AclStatus AclGetClDevice(AclContext ctx, cl_device_id *opencl_device);
+
+ /** Set the underlying OpenCL context to be used by a given Compute Library context object
+ *
+ * @note @ref AclContext should be of an OpenCL backend target
+ *
+ * @param[in] ctx A valid non-zero context object
+     * @param[in] opencl_context Underlying OpenCL context to be used
+ *
+ * @return Status code
+ */
+ AclStatus AclSetClContext(AclContext ctx, cl_context opencl_context);
+
+ /** Extract the underlying OpenCL queue used by a given Compute Library queue object
+ *
+ * @note @ref AclQueue should be of an OpenCL backend target
+ * @note @ref AclQueue refcount should be 0, meaning not used by other objects
+ *
+ * @param[in] queue A valid non-zero queue
+ * @param[out] opencl_queue Underlying OpenCL queue used
+ *
+ * @return Status code
+ */
+ AclStatus AclGetClQueue(AclQueue queue, cl_command_queue *opencl_queue);
+
+ /** Set the underlying OpenCL queue to be used by a given Compute Library queue object
+ *
+ * @note @ref AclQueue should be of an OpenCL backend target
+     * @note opencl_queue needs to be created from the same context as the AclContext that the queue will use
+ *
+ * @param[in] queue A valid non-zero queue object
+     * @param[in] opencl_queue Underlying OpenCL queue to be used
+ *
+ * @return Status code
+ */
+ AclStatus AclSetClQueue(AclQueue queue, cl_command_queue opencl_queue);
+
+    /** Extract the underlying OpenCL memory object used by a given Compute Library tensor object
+     *
+     * @param[in] tensor A valid non-zero tensor
+     * @param[out] opencl_mem Underlying OpenCL memory object
+ *
+ * @return Status code
+ */
+ AclStatus AclGetClMem(AclTensor tensor, cl_mem *opencl_mem);
+
+#ifdef __cplusplus
+}
+#endif /* __cplusplus */
+#endif /* ARM_COMPUTE_ACL_OPENCL_EXT_H_ */
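For interop, the getters above let an existing OpenCL pipeline share the library's handles. A sketch, assuming ctx was created for the AclGpuOcl target:

    #include "arm_compute/AclOpenClExt.h"

    bool query_cl_handles(AclContext ctx)
    {
        cl_context   cl_ctx = nullptr;
        cl_device_id cl_dev = nullptr;

        if (AclGetClContext(ctx, &cl_ctx) != AclSuccess)
        {
            return false; // e.g. ctx is not backed by the OpenCL target
        }
        return AclGetClDevice(ctx, &cl_dev) == AclSuccess;
    }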
diff --git a/arm_compute/AclOperators.h b/arm_compute/AclOperators.h
new file mode 100644
index 0000000000..4f6f46e9c8
--- /dev/null
+++ b/arm_compute/AclOperators.h
@@ -0,0 +1,87 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_ACL_OPERATORS_H_
+#define ARM_COMPUTE_ACL_OPERATORS_H_
+
+#include "arm_compute/AclDescriptors.h"
+#include "arm_compute/AclTypes.h"
+
+/** Used during an operator creation to validate its support */
+#define ARM_COMPUTE_VALIDATE_OPERATOR_SUPPORT ((AclOperator *)(size_t)-1)
+
+#ifdef __cplusplus
+extern "C"
+{
+#endif /** __cplusplus */
+
+ /** Create an activation operator
+ *
+     * Applies an activation function to a given tensor.
+     * Compute Library supports a wide range of activation functions; see @ref AclActivationType.
+     *
+     * The supported functions are summarized in the following table:
+ * Activation Function | Mathematical Expression |
+ * -------------------------|------------------------------------------------------------------------------------------------------------------------------------------|
+ * Identity | \f$ f(x)= x \f$ |
+ * Logistic | \f$ f(x) = \frac{1}{1 + e^{-x}} \f$ |
+ * Tanh | \f$ f(x) = a \cdot tanh(b \cdot x) \f$ |
+ * Relu | \f$ f(x) = max(0,x) \f$ |
+ * Bounded Relu | \f$ f(x) = min(a, max(0,x)) \f$ |
+ * Lower-Upper Bounded Relu | \f$ f(x) = min(a, max(b,x)) \f$ |
+ * Leaky Relu | \f$ f(x) = \begin{cases} \alpha x & \quad \text{if } x \text{ < 0}\\ x & \quad \text{if } x \geq \text{ 0 } \end{cases} \f$ |
+ * Soft Relu | \f$ f(x)= log(1+e^x) \f$ |
+     * Elu                      | \f$ f(x) = \begin{cases} \alpha (exp(x) - 1) & \quad \text{if } x \text{ < 0}\\ x & \quad \text{if } x \geq \text{ 0 } \end{cases} \f$ |
+ * Abs | \f$ f(x)= |x| \f$ |
+ * Square | \f$ f(x)= x^2 \f$ |
+ * Sqrt | \f$ f(x) = \sqrt{x} \f$ |
+ * Linear | \f$ f(x)= ax + b \f$ |
+ * Hard Swish | \f$ f(x) = (x * relu6(x+3))/6 \f$ |
+ *
+ * Backends:
+ * - OpenCL: ClActivationLayer
+ * - Cpu : CpuActivationLayer
+ *
+ * @param[in, out] op Operator construct to be created if creation was successful
+ * @param[in] ctx Context to be used for the creation of the operator
+ * @param[in] src Source tensor descriptor. Slot id: ACL_SRC
+ * @param[in] dst Destination tensor descriptor. Slot id: ACL_DST
+ * @param[in] info Activation meta-data
+ *
+ * @return Status code
+ *
+ * Returns:
+ * - @ref AclSuccess if function was completed successfully
+ * - @ref AclOutOfMemory if there was a failure allocating memory resources
+ * - @ref AclUnsupportedTarget if operator for the requested target is unsupported
+ * - @ref AclInvalidArgument if a given argument is invalid
+ */
+ AclStatus AclActivation(AclOperator *op,
+ AclContext ctx,
+ const AclTensorDescriptor *src,
+ const AclTensorDescriptor *dst,
+ const AclActivationDescriptor info);
+#ifdef __cplusplus
+}
+#endif /** __cplusplus */
+#endif /* ARM_COMPUTE_ACL_OPERATORS_H_ */
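The ARM_COMPUTE_VALIDATE_OPERATOR_SUPPORT sentinel turns a creation entry point into a capability query; per the macro's documentation it validates support during operator creation, and the sketch below assumes validation is all that happens when the sentinel is passed. Descriptors and context are taken as parameters:

    #include "arm_compute/Acl.h"

    bool activation_supported(AclContext ctx, const AclTensorDescriptor *src_desc, const AclTensorDescriptor *dst_desc)
    {
        AclActivationDescriptor info{AclTanh, 1.f, 1.f, false};

        // Passing the sentinel instead of a real operator out-parameter asks the
        // backend whether this configuration is supported.
        return AclActivation(ARM_COMPUTE_VALIDATE_OPERATOR_SUPPORT, ctx, src_desc, dst_desc, info) == AclSuccess;
    }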
diff --git a/arm_compute/AclTypes.h b/arm_compute/AclTypes.h
new file mode 100644
index 0000000000..9a002ad22c
--- /dev/null
+++ b/arm_compute/AclTypes.h
@@ -0,0 +1,215 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_ACL_TYPES_H_
+#define ARM_COMPUTE_ACL_TYPES_H_
+
+#include <stddef.h>
+#include <stdint.h>
+
+#ifdef __cplusplus
+extern "C"
+{
+#endif /* __cplusplus */
+
+ /**< Opaque Context object */
+ typedef struct AclContext_ *AclContext;
+ /**< Opaque Queue object */
+ typedef struct AclQueue_ *AclQueue;
+ /**< Opaque Tensor object */
+ typedef struct AclTensor_ *AclTensor;
+ /**< Opaque Tensor pack object */
+ typedef struct AclTensorPack_ *AclTensorPack;
+ /**< Opaque Operator object */
+ typedef struct AclOperator_ *AclOperator;
+
+ // Capabilities bitfield (Note: if multiple are enabled ComputeLibrary will pick the best possible)
+ typedef uint64_t AclTargetCapabilities;
+
+ /**< Error codes returned by the public entry-points */
+ typedef enum AclStatus : int32_t
+ {
+ AclSuccess = 0, /**< Call succeeded, leading to valid state for all involved objects/data */
+ AclRuntimeError = 1, /**< Call failed during execution */
+ AclOutOfMemory = 2, /**< Call failed due to failure to allocate resources */
+ AclUnimplemented = 3, /**< Call failed as requested capability is not implemented */
+ AclUnsupportedTarget = 4, /**< Call failed as an invalid backend was requested */
+ AclInvalidTarget = 5, /**< Call failed as an invalid target was passed */
+ AclInvalidArgument = 6, /**< Call failed as invalid argument was passed */
+ AclUnsupportedConfig = 7, /**< Call failed as configuration is unsupported */
+ AclInvalidObjectState = 8, /**< Call failed as an object has invalid state */
+ } AclStatus;
+
+ /**< Supported targets */
+ typedef enum AclTarget
+ {
+ AclCpu = 0, /**< Cpu target that uses SIMD extensions */
+ AclGpuOcl = 1, /**< OpenCL target for GPU */
+ } AclTarget;
+
+ /** Execution mode types */
+ typedef enum AclExecutionMode
+ {
+ AclPreferFastRerun = 0, /**< Prioritize performance when multiple iterations are performed */
+ AclPreferFastStart = 1, /**< Prioritize performance when a single iteration is expected to be performed */
+ } AclExecutionMode;
+
+ /** Available CPU capabilities */
+ typedef enum AclCpuCapabilities
+ {
+ AclCpuCapabilitiesAuto = 0, /**< Automatic discovery of capabilities */
+
+ AclCpuCapabilitiesNeon = (1 << 0), /**< Enable NEON optimized paths */
+ AclCpuCapabilitiesSve = (1 << 1), /**< Enable SVE optimized paths */
+ AclCpuCapabilitiesSve2 = (1 << 2), /**< Enable SVE2 optimized paths */
+ // Reserve 3, 4, 5, 6
+
+ AclCpuCapabilitiesFp16 = (1 << 7), /**< Enable float16 data-type support */
+ AclCpuCapabilitiesBf16 = (1 << 8), /**< Enable bfloat16 data-type support */
+ // Reserve 9, 10, 11, 12
+
+ AclCpuCapabilitiesDot = (1 << 13), /**< Enable paths that use the udot/sdot instructions */
+ AclCpuCapabilitiesMmlaInt8 = (1 << 14), /**< Enable paths that use the mmla integer instructions */
+ AclCpuCapabilitiesMmlaFp = (1 << 15), /**< Enable paths that use the mmla float instructions */
+
+ AclCpuCapabilitiesAll = ~0 /**< Enable all paths */
+ } AclCpuCapabilities;
+
+ /**< Allocator interface that can be passed to a context */
+ typedef struct AclAllocator
+ {
+ /** Allocate a block of size bytes of memory.
+ *
+ * @param[in] user_data User provided data that can be used by the allocator
+ * @param[in] size Size of the allocation
+ *
+ * @return A pointer to the allocated block if successful, else NULL
+ */
+ void *(*alloc)(void *user_data, size_t size);
+ /** Release a previously allocated block of memory.
+ *
+ * @param[in] user_data User provided data that can be used by the allocator
+ * @param[in] ptr Pointer to the block to release
+ */
+ void (*free)(void *user_data, void *ptr);
+ /** Allocate a block of size bytes of memory with a given alignment.
+ *
+ * @param[in] user_data User provided data that can be used by the allocator
+ * @param[in] size Size of the allocation
+ * @param[in] alignment Alignment of the allocation
+ *
+ * @return A pointer to the allocated block if successful, else NULL
+ */
+ void *(*aligned_alloc)(void *user_data, size_t size, size_t alignment);
+ /** Release a previously allocated aligned block of memory.
+ *
+ * @param[in] user_data User provided data that can be used by the allocator
+ * @param[in] ptr Pointer to the block to release
+ */
+ void (*aligned_free)(void *user_data, void *ptr);
+
+ /**< User provided information */
+ void *user_data;
+ } AclAllocator;
+
+ /**< Context options */
+ typedef struct AclContextOptions
+ {
+ AclExecutionMode mode; /**< Execution mode to use */
+ AclTargetCapabilities capabilities; /**< Target capabilities */
+ bool enable_fast_math; /**< Allow precision loss */
+ const char *kernel_config_file; /**< Kernel configuration file */
+ int32_t max_compute_units; /**< Max compute units that can be used by a queue created from the context.
+ If <=0 the system will use the hw concurrency instead */
+ AclAllocator *allocator; /**< Allocator to be used for all internal memory allocations */
+ } AclContextOptions;
+
+ /**< Supported tuning modes */
+ typedef enum
+ {
+ AclTuningModeNone = 0, /**< No tuning */
+ AclRapid = 1, /**< Fast tuning mode, testing a small portion of the tuning space */
+ AclNormal = 2, /**< Normal tuning mode, gives a good balance between tuning time and performance */
+ AclExhaustive = 3, /**< Exhaustive tuning mode, increased tuning time but with best results */
+ } AclTuningMode;
+
+ /**< Queue options */
+ typedef struct
+ {
+ AclTuningMode mode; /**< Tuning mode */
+ int32_t compute_units; /**< Compute Units that the queue will deploy */
+ } AclQueueOptions;
+
+ /**< Supported data types */
+ typedef enum AclDataType
+ {
+ AclDataTypeUnknown = 0, /**< Unknown data type */
+ AclUInt8 = 1, /**< 8-bit unsigned integer */
+ AclInt8 = 2, /**< 8-bit signed integer */
+ AclUInt16 = 3, /**< 16-bit unsigned integer */
+ AclInt16 = 4, /**< 16-bit signed integer */
+ AclUInt32 = 5, /**< 32-bit unsigned integer */
+ AclInt32 = 6, /**< 32-bit signed integer */
+ AclFloat16 = 7, /**< 16-bit floating point */
+ AclBFloat16 = 8, /**< 16-bit brain floating point */
+ AclFloat32 = 9, /**< 32-bit floating point */
+ } AclDataType;
+
+ /**< Supported data layouts for operations */
+ typedef enum AclDataLayout
+ {
+ AclDataLayoutUnknown = 0, /**< Unknown data layout */
+ AclNhwc = 1, /**< Native, performant, Compute Library data layout */
+ AclNchw = 2, /**< Data layout where width is the fastest changing dimension */
+ } AclDataLayout;
+
+ /** Type of memory to be imported */
+ typedef enum AclImportMemoryType
+ {
+ AclHostPtr = 0 /**< Host allocated memory */
+ } AclImportMemoryType;
+
+ /**< Tensor Descriptor */
+ typedef struct AclTensorDescriptor
+ {
+ int32_t ndims; /**< Number of dimensions */
+ int32_t *shape; /**< Tensor Shape */
+ AclDataType data_type; /**< Tensor Data type */
+ int64_t *strides; /**< Strides on each dimension. Linear memory is assumed if nullptr */
+ int64_t boffset; /**< Offset in terms of bytes for the first element */
+ } AclTensorDescriptor;
+
+ /**< Slot type of a tensor */
+ typedef enum
+ {
+ AclSlotUnknown = -1,
+ AclSrc = 0,
+ AclSrc0 = 0,
+ AclSrc1 = 1,
+ AclDst = 30,
+ AclSrcVec = 256,
+ } AclTensorSlot;
+
+#ifdef __cplusplus
+}
+#endif /* __cplusplus */
+#endif /* ARM_COMPUTE_ACL_TYPES_H_ */
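
To make the allocator and options contracts concrete, a minimal sketch built only from the types defined above (C++17; note std::aligned_alloc requires size to be a multiple of alignment, which a real allocator would normalize):

#include "arm_compute/AclTypes.h"
#include <cstdlib>

static void *my_alloc(void *, size_t size) { return std::malloc(size); }
static void my_free(void *, void *ptr) { std::free(ptr); }
static void *my_aligned_alloc(void *, size_t size, size_t alignment)
{
    return std::aligned_alloc(alignment, size);
}
static void my_aligned_free(void *, void *ptr) { std::free(ptr); }

AclContextOptions make_context_options(AclAllocator *alloc)
{
    // Field order matches the AclAllocator declaration above
    *alloc = AclAllocator{my_alloc, my_free, my_aligned_alloc, my_aligned_free, nullptr};

    AclContextOptions opts{};
    opts.mode               = AclPreferFastRerun;
    opts.capabilities       = AclCpuCapabilitiesNeon | AclCpuCapabilitiesFp16; // capability bitfield
    opts.enable_fast_math   = false;
    opts.kernel_config_file = nullptr;
    opts.max_compute_units  = 0; // <= 0: fall back to hardware concurrency
    opts.allocator          = alloc;
    return opts;
}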
diff --git a/arm_compute/runtime/Utils.h b/arm_compute/AclUtils.h
index 9a5b20eb26..61a93e6060 100644
--- a/arm_compute/runtime/Utils.h
+++ b/arm_compute/AclUtils.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2021 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -21,37 +21,44 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
-#ifndef ARM_COMPUTE_RUNTIME_UTILS_H
-#define ARM_COMPUTE_RUNTIME_UTILS_H
+#ifndef ARM_COMPUTE_ACL_UTILS_H_
+#define ARM_COMPUTE_ACL_UTILS_H_
-#include "arm_compute/runtime/IRuntimeContext.h"
-#include "arm_compute/runtime/Scheduler.h"
+#include "arm_compute/AclTypes.h"
-#include <string>
-
-namespace arm_compute
+#ifdef __cplusplus
+extern "C"
{
-/** Convert a Scheduler::Type into a string.
+#endif /* __cplusplus */
+
+ /** Get the size of an existing tensor in bytes
*
- * @param[in] t @ref Scheduler::Type to be translated to string.
+ * @note The size isn't based on allocated memory, but based on information in its descriptor (dimensions, data type, etc.).
*
- * @return The string describing the scheduler type.
- */
-const std::string &string_from_scheduler_type(Scheduler::Type t);
-
-/** Schedules a kernel using the context if not nullptr else uses the legacy scheduling flow.
+ * @param[in] tensor The tensor of interest
+ * @param[out] size The size of the tensor
+ *
+ * @return Status code
*
- * @param[in] ctx Context to use.
- * @param[in] kernel Kernel to schedule.
- * @param[in] hints Hints to use.
+ * - @ref AclSuccess if function was completed successfully
+ * - @ref AclInvalidArgument if a given argument is invalid
*/
-void schedule_kernel_on_ctx(IRuntimeContext *ctx, ICPPKernel *kernel, const IScheduler::Hints &hints);
+ AclStatus AclGetTensorSize(AclTensor tensor, uint64_t *size);
-/** Calculate number of stages for parallel implementations
+ /** Get the descriptor of an existing tensor
*
- * @param[in] input_x_dimension input tensor x dimension
- * @param[in] axis axis to be used
+ * @param[in] tensor The tensor of interest
+ * @param[out] desc The descriptor of the tensor
+ *
+ * @return Status code
+ *
+ * - @ref AclSuccess if function was completed successfully
+ * - @ref AclInvalidArgument if a given argument is invalid
*/
-unsigned int calculate_number_of_stages_only_x_axis(size_t input_x_dimension, unsigned int axis);
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_RUNTIME_UTILS_H */
+ AclStatus AclGetTensorDescriptor(AclTensor tensor, AclTensorDescriptor *desc);
+
+#ifdef __cplusplus
+}
+#endif /* __cplusplus */
+
+#endif /* ARM_COMPUTE_ACL_UTILS_H_ */
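
A short usage sketch of the two accessors above; the AclTensor handle is assumed to have been created elsewhere (e.g. via AclCreateTensor in AclEntrypoints.h, not shown in this excerpt):

#include "arm_compute/AclUtils.h"
#include <cstdio>

void print_tensor_info(AclTensor tensor)
{
    uint64_t size = 0;
    AclTensorDescriptor desc{};
    if (AclGetTensorSize(tensor, &size) == AclSuccess &&
        AclGetTensorDescriptor(tensor, &desc) == AclSuccess)
    {
        std::printf("%d-dim tensor, %llu bytes\n", desc.ndims, static_cast<unsigned long long>(size));
    }
}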
diff --git a/arm_compute/AclVersion.h b/arm_compute/AclVersion.h
new file mode 100644
index 0000000000..6eed13b924
--- /dev/null
+++ b/arm_compute/AclVersion.h
@@ -0,0 +1,57 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_ACL_VERSION_H_
+#define ARM_COMPUTE_ACL_VERSION_H_
+
+#ifdef __cplusplus
+extern "C"
+{
+#endif /* __cplusplus */
+
+ /** Semantic versioning information */
+ typedef struct AclVersion
+ {
+ int major; /**< Major version, is increased on API incompatible changes */
+ int minor; /**< Minor version, is increased on adding backward compatible functionality */
+ int patch; /**< Patch version, is increased when doing backward compatible fixes */
+ const char *build_info; /**< Build related information */
+ } AclVersion;
+
+/**< Major version, is increased on API incompatible changes */
+#define ARM_COMPUTE_LIBRARY_VERSION_MAJOR 0
+/**< Minor version, is increased on adding backward compatible functionality */
+#define ARM_COMPUTE_LIBRARY_VERSION_MINOR 1
+/**< Patch version, is increased when doing backward compatible fixes */
+#define ARM_COMPUTE_LIBRARY_VERSION_PATCH 0
+
+ /** Get library's version meta-data
+ *
+ * @return Version information
+ */
+ const AclVersion *AclVersionInfo();
+
+#ifdef __cplusplus
+}
+#endif /* __cplusplus */
+#endif /* ARM_COMPUTE_ACL_VERSION_H_ */
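
Both the struct and the macros support version checks; a small self-contained sketch:

#include "arm_compute/AclVersion.h"
#include <cstdio>

// Compile-time guard against incompatible headers
#if ARM_COMPUTE_LIBRARY_VERSION_MAJOR != 0
#error "This example targets the 0.x experimental C API"
#endif

int main()
{
    const AclVersion *v = AclVersionInfo();
    std::printf("Compute Library C API %d.%d.%d (%s)\n", v->major, v->minor, v->patch, v->build_info);
    return 0;
}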
diff --git a/arm_compute/BUILD.bazel b/arm_compute/BUILD.bazel
new file mode 100644
index 0000000000..d1219015b7
--- /dev/null
+++ b/arm_compute/BUILD.bazel
@@ -0,0 +1,54 @@
+# Copyright (c) 2023 Arm Limited.
+#
+# SPDX-License-Identifier: MIT
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to
+# deal in the Software without restriction, including without limitation the
+# rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+# sell copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in all
+# copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+
+cc_library(
+ name = "core_headers",
+ hdrs = glob([
+ "core/**/*.h",
+ "core/**/*.inl",
+ "*.h",
+ "*.hpp",
+ "dynamic_fusion/**/*.h",
+ "function_info/**/*.h",
+ ]),
+ visibility = ["//visibility:public"],
+)
+
+cc_library(
+ name = "runtime_headers",
+ hdrs = glob(["runtime/**/*.h"]),
+ visibility = ["//visibility:public"],
+)
+
+cc_library(
+ name = "graph_headers",
+ hdrs = glob(
+ [
+ "graph/**/*.h",
+ "graph.h",
+ ],
+ exclude = glob([
+ "graph/backends/CL/*.h",
+ ]),
+ ),
+ visibility = ["//visibility:public"],
+)
diff --git a/arm_compute/core/AccessWindowAutoPadding.h b/arm_compute/core/AccessWindowAutoPadding.h
deleted file mode 100644
index 8a182c6eb4..0000000000
--- a/arm_compute/core/AccessWindowAutoPadding.h
+++ /dev/null
@@ -1,85 +0,0 @@
-/*
- * Copyright (c) 2017-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_ACCESS_WINDOW_AUTO_PADDING_H
-#define ARM_COMPUTE_ACCESS_WINDOW_AUTO_PADDING_H
-
-#include "arm_compute/core/Coordinates.h"
-#include "arm_compute/core/IAccessWindow.h"
-#include "arm_compute/core/TensorShape.h"
-#include "arm_compute/core/Types.h"
-
-namespace arm_compute
-{
-class Window;
-class ITensorInfo;
-
-/** Dummy access window.
- *
- * This implementation always uses the auto padding of the tensor info and
- * never updates the window. The valid region is always set to cover the entire
- * tensor.
- *
- * @note This access window is only used during the migration to the new
- * padding system. It will be removed once all kernels have been ported.
- *
- * */
-class AccessWindowAutoPadding : public IAccessWindow
-{
-public:
- /** Default constructor.
- *
- * @param[in,out] info Tensor info of the accessed kernel.
- */
- AccessWindowAutoPadding(ITensorInfo *info);
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- AccessWindowAutoPadding(const AccessWindowAutoPadding &) = delete;
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- AccessWindowAutoPadding &operator=(const AccessWindowAutoPadding &) = delete;
- /** Allow instances of this class to be move constructed */
- AccessWindowAutoPadding(AccessWindowAutoPadding &&) = default;
- /** Allow instances of this class to be moved */
- AccessWindowAutoPadding &operator=(AccessWindowAutoPadding &&) = default;
- /** Default destructor */
- ~AccessWindowAutoPadding() = default;
-
- /** Set the valid region to match the entire tensor. */
- void set_valid_region();
-
- /** Return a valid region that spans across the entire tensor.
- *
- * @return a valid region.
- *
- */
- ValidRegion compute_valid_region() const;
-
- // Inherited methods overridden:
- bool update_window_if_needed(Window &window) const override;
- bool update_padding_if_needed(const Window &window) override;
- ValidRegion compute_valid_region(const Window &window, ValidRegion input_valid_region, bool border_undefined, BorderSize border_size) const override;
-
-private:
- ITensorInfo *_info;
-};
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_ACCESS_WINDOW_AUTO_PADDING_H*/
diff --git a/arm_compute/core/AccessWindowStatic.h b/arm_compute/core/AccessWindowStatic.h
deleted file mode 100644
index e40c188fcd..0000000000
--- a/arm_compute/core/AccessWindowStatic.h
+++ /dev/null
@@ -1,101 +0,0 @@
-/*
- * Copyright (c) 2017-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_IACCESS_WINDOW_STATIC_H
-#define ARM_COMPUTE_IACCESS_WINDOW_STATIC_H
-
-#include "arm_compute/core/Coordinates.h"
-#include "arm_compute/core/IAccessWindow.h"
-#include "arm_compute/core/TensorShape.h"
-#include "arm_compute/core/Types.h"
-
-#include <array>
-
-namespace arm_compute
-{
-class Window;
-class ITensorInfo;
-
-/** Implementation of a static rectangular access pattern.
- *
- * In this implementation the access offsets and sizes are not relative to the
- * current element. Instead they are considered to be absolute coordinates
- * within the accessed tensor's shape.
- *
- * */
-class AccessWindowStatic : public IAccessWindow
-{
-public:
- /** Constructor for a static access pattern.
- *
- * @param[in,out] info Tensor info of the accessed kernel.
- * @param[in] start_x Start of the access in X direction.
- * @param[in] start_y Start of the access in Y direction.
- * @param[in] end_x End of the access in X direction.
- * @param[in] end_y End of the access in Y direction.
- */
- AccessWindowStatic(ITensorInfo *info, int start_x, int start_y, int end_x, int end_y);
-
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- AccessWindowStatic(const AccessWindowStatic &) = delete;
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- AccessWindowStatic &operator=(const AccessWindowStatic &) = delete;
- /** Allow instances of this class to be move constructed */
- AccessWindowStatic(AccessWindowStatic &&) = default;
- /** Allow instances of this class to be moved */
- AccessWindowStatic &operator=(AccessWindowStatic &&) = default;
- /** Default destructor */
- ~AccessWindowStatic() = default;
-
- /** Set the valid region based on the static access pattern and valid
- * region of the inputs.
- *
- * @param[in] window Execution window of the kernel.
- * @param[in] input_valid_region Combined valid region of all inputs.
- */
- void set_valid_region(const Window &window, const ValidRegion &input_valid_region);
-
- /** Compute the valid region based on the static access pattern and valid region of the inputs.
- *
- * @param[in] window Execution window of the kernel.
- * @param[in] input_valid_region Combined valid region of all inputs.
- *
- * @return a valid region.
- *
- */
- ValidRegion compute_valid_region(const Window &window, ValidRegion input_valid_region) const;
-
- // Inherited methods overriden:
- bool update_window_if_needed(Window &window) const override;
- bool update_padding_if_needed(const Window &window) override;
- ValidRegion compute_valid_region(const Window &window, ValidRegion input_valid_region, bool border_undefined, BorderSize border_size) const override;
-
-private:
- ITensorInfo *_info;
- int _start_x;
- int _start_y;
- int _end_x;
- int _end_y;
-};
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_IACCESS_WINDOW_STATIC_H*/
diff --git a/arm_compute/core/CL/CLCompileContext.h b/arm_compute/core/CL/CLCompileContext.h
index 2b6d8cd2cb..dcd3b45670 100644
--- a/arm_compute/core/CL/CLCompileContext.h
+++ b/arm_compute/core/CL/CLCompileContext.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2020 ARM Limited.
+ * Copyright (c) 2020-2022 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -77,6 +77,8 @@ public:
*/
const StringSet &options() const;
+ bool operator==(const CLBuildOptions &other) const;
+
private:
StringSet _build_opts; /**< Build options set */
};
@@ -118,6 +120,14 @@ public:
{
return _name;
}
+ /** Returns program binary data.
+ *
+ * @return Program's binary data.
+ */
+ const std::vector<unsigned char> &binary() const
+ {
+ return _binary;
+ }
/** User-defined conversion to the underlying CL program.
*
* @return The CL program object.
@@ -240,8 +250,12 @@ public:
*
* @return The created kernel.
*/
- Kernel create_kernel(const std::string &kernel_name, const std::string &program_name, const std::string &program_source,
- const std::string &kernel_path, const StringSet &build_options_set, bool is_binary) const;
+ Kernel create_kernel(const std::string &kernel_name,
+ const std::string &program_name,
+ const std::string &program_source,
+ const std::string &kernel_path,
+ const StringSet &build_options_set,
+ bool is_binary) const;
/** Clear the library's cache of binary programs
*/
@@ -288,6 +302,24 @@ public:
*/
bool int64_base_atomics_supported() const;
+ /** Returns true if the workgroup batch size modifier parameter is supported on the cl device
+ *
+ * @return true if the workgroup batch size modifier parameter is supported, false otherwise
+ */
+ bool is_wbsm_supported() const;
+
+ /** Return the DDK version. If the DDK version cannot be detected, return -1.
+ *
+ * @return The DDK version.
+ */
+ int32_t get_ddk_version() const;
+
+ /** Return the Gpu target of the associated device
+ *
+ * @return GPUTarget
+ */
+ GPUTarget get_gpu_target() const;
+
private:
/** Load program and its dependencies.
*
@@ -295,7 +327,8 @@ private:
* @param[in] program_source Source of the program.
* @param[in] is_binary Flag to indicate if the program source is binary.
*/
- const Program &load_program(const std::string &program_name, const std::string &program_source, bool is_binary) const;
+ const Program &
+ load_program(const std::string &program_name, const std::string &program_source, bool is_binary) const;
/** Generates the build options given a string of user defined ones
*
@@ -315,10 +348,11 @@ private:
*/
std::string stringify_set(const StringSet &s, const std::string &kernel_path) const;
- cl::Context _context; /**< Underlying CL context. */
- CLDevice _device; /**< Underlying CL device. */
+ cl::Context _context; /**< Underlying CL context. */
+ CLDevice _device; /**< Underlying CL device. */
mutable std::map<std::string, const Program> _programs_map; /**< Map with all already loaded program data. */
mutable std::map<std::string, cl::Program> _built_programs_map; /**< Map with all already built program data. */
+ bool _is_wbsm_supported; /**< Whether the workgroup batch size modifier is supported */
};
} // namespace arm_compute
#endif /* ARM_COMPUTE_CLCOMPILECONTEXT_H */
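
One consumer the new Program::binary() accessor enables is a program-binary cache; a hedged sketch (the compile step that produced the Program and the cache path handling are assumed):

#include "arm_compute/core/CL/CLCompileContext.h"
#include <fstream>
#include <string>
#include <vector>

void save_program_binary(const arm_compute::Program &program, const std::string &path)
{
    const std::vector<unsigned char> &bin = program.binary();
    std::ofstream out(path, std::ios::binary);
    out.write(reinterpret_cast<const char *>(bin.data()), static_cast<std::streamsize>(bin.size()));
}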
diff --git a/arm_compute/core/CL/CLCoreRuntimeContext.h b/arm_compute/core/CL/CLCoreRuntimeContext.h
deleted file mode 100644
index 2b2269dece..0000000000
--- a/arm_compute/core/CL/CLCoreRuntimeContext.h
+++ /dev/null
@@ -1,75 +0,0 @@
-/*
- * Copyright (c) 2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CLCORERUNTIME_CONTEXT_H
-#define ARM_COMPUTE_CLCORERUNTIME_CONTEXT_H
-
-#include "arm_compute/core/CL/OpenCL.h"
-
-namespace arm_compute
-{
-// Forward declarations
-class CLKernelLibrary;
-
-/** Core runtime context for OpenCL */
-class CLCoreRuntimeContext final
-{
-public:
- /** Legacy constructor */
- CLCoreRuntimeContext();
-
- /** Constructor */
- CLCoreRuntimeContext(CLKernelLibrary *kernel_lib, cl::Context ctx, cl::CommandQueue queue);
- /** Destructor */
- ~CLCoreRuntimeContext() = default;
- /** Default copy constructor */
- CLCoreRuntimeContext(const CLCoreRuntimeContext &) = default;
- /** Default move constructor */
- CLCoreRuntimeContext(CLCoreRuntimeContext &&) = default;
- /** Default copy assignment */
- CLCoreRuntimeContext &operator=(const CLCoreRuntimeContext &) = default;
- /** Default move assignment operator */
- CLCoreRuntimeContext &operator=(CLCoreRuntimeContext &&) = default;
- /** Kernel Library accessor
- *
- * @return The kernel library instance used by the core context
- */
- CLKernelLibrary *kernel_library() const;
- /** OpenCL context accessor
- *
- * @return The OpenCL context used by the core context
- */
- cl::Context context();
- /** OpenCL command queue accessor
- *
- * @return The OpenCL queue used by the core context
- */
- cl::CommandQueue queue();
-
-private:
- CLKernelLibrary *_kernel_lib{ nullptr };
- cl::Context _ctx{};
- cl::CommandQueue _queue{};
-};
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_CLCORERUNTIME_CONTEXT_H */
diff --git a/arm_compute/core/CL/CLDevice.h b/arm_compute/core/CL/CLDevice.h
index 812834743d..ded6bb8493 100644
--- a/arm_compute/core/CL/CLDevice.h
+++ b/arm_compute/core/CL/CLDevice.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2020 ARM Limited.
+ * Copyright (c) 2020-2022 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -30,6 +30,7 @@
#include "arm_compute/core/IDevice.h"
#include <set>
+#include <sstream>
#include <string>
namespace arm_compute
@@ -43,8 +44,7 @@ class CLDevice : public IDevice
{
public:
/** Default Constructor */
- CLDevice()
- : _device(cl::Device()), _options()
+ CLDevice() : _device(cl::Device()), _options()
{
}
@@ -52,8 +52,7 @@ public:
*
* @param[in] cl_device OpenCL device
*/
- CLDevice(const cl::Device &cl_device)
- : _device(), _options()
+ CLDevice(const cl::Device &cl_device) : _device(), _options()
{
_device = cl_device;
@@ -65,13 +64,13 @@ public:
std::string extensions = _device.getInfo<CL_DEVICE_EXTENSIONS>();
std::istringstream iss(extensions);
- for(std::string s; iss >> s;)
+ for (std::string s; iss >> s;)
{
_options.extensions.insert(s);
}
// SW workaround for G76
- if(_options.gpu_target == GPUTarget::G76)
+ if (_options.gpu_target == GPUTarget::G76)
{
_options.extensions.insert("cl_arm_integer_dot_product_int8");
}
@@ -142,6 +141,32 @@ public:
return _options.extensions.count(extension) != 0;
}
+ /** Returns whether non-uniform work-groups are supported, together with the build options needed to enable them.
+ *
+ * If the feature is not supported, the returned build-options string is empty.
+ *
+ * @return A tuple (supported, build_options) indicating whether the feature
+ * is supported and the corresponding build options to enable it.
+ */
+ std::tuple<bool, std::string> is_non_uniform_workgroup_supported() const
+ {
+ if (version() == CLVersion::CL30 && get_cl_non_uniform_work_group_supported(_device))
+ {
+ return {true, " -cl-std=CL3.0 "};
+ }
+ else if (version() == CLVersion::CL20)
+ {
+ return {true, " -cl-std=CL2.0 "};
+ }
+ else if (supported("cl_arm_non_uniform_work_group_size"))
+ {
+ return {true, " -cl-arm-non-uniform-work-group-size "};
+ }
+
+ return {false, ""};
+ }
+
private:
cl::Device _device; /**< OpenCL device. */
struct CLDeviceOptions _options; /**< OpenCL device options */
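
A sketch (C++17) of consuming the new tuple-returning helper when assembling kernel build options; the surrounding device setup is assumed:

#include "arm_compute/core/CL/CLDevice.h"
#include <string>

std::string non_uniform_build_option(const arm_compute::CLDevice &device)
{
    auto [supported, option] = device.is_non_uniform_workgroup_supported();
    // Empty when unsupported: kernels must then round their NDRange up themselves.
    return supported ? option : std::string{};
}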
diff --git a/arm_compute/core/CL/CLHelpers.h b/arm_compute/core/CL/CLHelpers.h
index fc3f4d5db0..1a639e47f9 100644
--- a/arm_compute/core/CL/CLHelpers.h
+++ b/arm_compute/core/CL/CLHelpers.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2016-2020 ARM Limited.
+ * Copyright (c) 2016-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -26,13 +26,13 @@
#include "arm_compute/core/CL/CLTypes.h"
#include "arm_compute/core/CL/OpenCL.h"
+#include "arm_compute/core/Types.h"
#include <set>
#include <string>
namespace arm_compute
{
-class CLCoreRuntimeContext;
class CLCompileContext;
class CLBuildOptions;
@@ -41,6 +41,9 @@ enum class DataType;
/** Max vector width of an OpenCL vector */
static constexpr unsigned int max_cl_vector_width = 16;
+/** Max number of iterations for manual loop unrolling */
+static constexpr int max_manual_loop_unrolling = 128;
+
/** Translates a tensor data type to the appropriate OpenCL type.
*
* @param[in] dt @ref DataType to be translated to OpenCL type.
@@ -97,14 +100,6 @@ std::string get_cl_dot8_acc_type_from_data_type(const DataType &dt);
*/
std::string get_data_size_from_data_type(const DataType &dt);
-/** Translates fixed point tensor data type to the underlying OpenCL type.
- *
- * @param[in] dt @ref DataType to be translated to OpenCL type.
- *
- * @return The string specifying the underlying OpenCL type to be used.
- */
-std::string get_underlying_cl_type_from_data_type(const DataType &dt);
-
/** Helper function to get the GPU target from CL device
*
* @param[in] device A CL device
@@ -129,6 +124,14 @@ CLVersion get_cl_version(const cl::Device &device);
*/
size_t get_cl_image_pitch_alignment(const cl::Device &device);
+/** Helper function to check whether non-uniform work group is supported
+ *
+ * @param[in] device A CL device
+ *
+ * @return True if the feature is supported
+ */
+bool get_cl_non_uniform_work_group_supported(const cl::Device &device);
+
/** Helper function to check whether a given extension is supported
*
* @param[in] device A CL device
@@ -176,7 +179,9 @@ bool dot8_acc_supported(const cl::Device &device);
*
* @return True if the configuration is supported
*/
-bool cl_winograd_convolution_layer_supported(const Size2D &output_tile, const Size2D &kernel_size, DataLayout data_layout);
+bool cl_winograd_convolution_layer_supported(const Size2D &output_tile,
+ const Size2D &kernel_size,
+ DataLayout data_layout);
/** Helper function to get the preferred native vector width size for built-in scalar types that can be put into vectors
*
@@ -204,16 +209,6 @@ bool preferred_dummy_work_items_support(const cl::Device &device);
*/
bool image2d_from_buffer_supported(const cl::Device &device);
-/** Creates an opencl kernel
- *
- * @param[in] ctx A context to be used to create the opencl kernel.
- * @param[in] kernel_name The kernel name.
- * @param[in] build_opts The build options to be used for the opencl kernel compilation.
- *
- * @return An opencl kernel
- */
-cl::Kernel create_opencl_kernel(CLCoreRuntimeContext *ctx, const std::string &kernel_name, const CLBuildOptions &build_opts);
-
/** Creates an opencl kernel using a compile context
*
* @param[in] ctx A compile context to be used to create the opencl kernel.
@@ -222,7 +217,9 @@ cl::Kernel create_opencl_kernel(CLCoreRuntimeContext *ctx, const std::string &ke
*
* @return An opencl kernel
*/
-cl::Kernel create_kernel(const CLCompileContext &ctx, const std::string &kernel_name, const std::set<std::string> &build_opts = std::set<std::string>());
+cl::Kernel create_kernel(const CLCompileContext &ctx,
+ const std::string &kernel_name,
+ const std::set<std::string> &build_opts = std::set<std::string>());
/** Creates a suitable LWS hint object for parallel implementations. Sets the number of WG based on the input size.
* If input width is smaller than 128 we can use fewer threads than 8.
@@ -234,5 +231,62 @@ cl::Kernel create_kernel(const CLCompileContext &ctx, const std::string &kernel_
*/
cl::NDRange create_lws_hint_parallel_implementations(unsigned int input_dimension, unsigned int vector_size);
+/** Helper function to check if the workgroup batch size modifier parameter is supported on the cl device
+ *
+ * @param[in] device cl device to check for support
+ *
+ * @return true if the workgroup batch size modifier parameter is supported, false otherwise
+ */
+bool get_wbsm_support_info(const cl::Device &device);
+
+/** Helper function to set the workgroup batch size modifier parameter in the kernel
+ *
+ * @param[in] kernel cl kernel to set the workgroup batch size modifier parameter
+ * @param[in] wbsm_hint workgroup batch size modifier to use
+ */
+void set_wbsm(cl::Kernel &kernel, cl_int wbsm_hint);
+
+/** Helper function to check if we can export the tensor to cl_image
+ *
+ * @param[in] tensor The tensor to check
+ *
+ * @return true if we can export the tensor to cl_image
+ */
+bool export_to_cl_image(const ITensorInfo *tensor);
+
+/** Helper function to force unroll with pragma when any of the input values (iterations) are greater than @ref max_manual_loop_unrolling
+ *
+ * This function passes UNROLL_WITH_PRAGMA at compile time when any of the input values are greater than @ref max_manual_loop_unrolling
+ *
+ * @param[in] built_opts OpenCL kernel build options
+ * @param[in] values Input values (iterations)
+ *
+ */
+void set_unroll_with_pragma(CLBuildOptions &built_opts, std::initializer_list<int> values);
+
+/** Helper function to check whether the cl_arm_matrix_multiply extension is supported
+ *
+ * @param[in] device A CL device
+ *
+ * @return True if the extension is supported
+ */
+bool arm_matrix_multiply_supported(const cl::Device &device);
+
+/** Check whether cl_khr_command_buffer extension is supported by the specified CL device.
+ *
+ * @param[in] device The CL device
+ *
+ * @return True if the extension is supported by the CL device.
+ */
+bool command_buffer_supported(const cl::Device &device);
+
+/** Check whether cl_khr_command_buffer_mutable_dispatch extension is supported by the specified CL device.
+ *
+ * @param[in] device The CL device
+ *
+ * @return True if the extension is supported by the CL device.
+ */
+bool command_buffer_mutable_dispatch_supported(const cl::Device &device);
+
} // namespace arm_compute
#endif /* ARM_COMPUTE_CLHELPERS_H */
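
The two command-buffer queries above naturally compose into a single capability check, since mutable dispatch is layered on top of cl_khr_command_buffer; a small sketch:

#include "arm_compute/core/CL/CLHelpers.h"

bool can_use_mutable_command_buffers(const cl::Device &device)
{
    return arm_compute::command_buffer_supported(device) &&
           arm_compute::command_buffer_mutable_dispatch_supported(device);
}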
diff --git a/arm_compute/core/CL/CLKernelLibrary.h b/arm_compute/core/CL/CLKernelLibrary.h
index 6c5df6cb08..527733ccf1 100644
--- a/arm_compute/core/CL/CLKernelLibrary.h
+++ b/arm_compute/core/CL/CLKernelLibrary.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2016-2020 ARM Limited.
+ * Copyright (c) 2016-2021 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -47,7 +47,7 @@ private:
public:
/** Access the KernelLibrary singleton.
- * This method has been deprecated and will be removed in the next release.
+ * This method has been deprecated and will be removed in future releases.
* @return The KernelLibrary instance.
*/
static CLKernelLibrary &get();
@@ -148,6 +148,12 @@ public:
*/
std::string get_program_name(const std::string &kernel_name) const;
+ /** Returns true if the workgroup batch size modifier parameter is supported on the cl device
+ *
+ * @return true if the workgroup batch size modifier parameter is supported, false otherwise
+ */
+ bool is_wbsm_supported();
+
/** Sets the CL context used to create programs.
*
* @note Setting the context also resets the device to the
@@ -164,11 +170,7 @@ public:
CLCompileContext &get_compile_context();
private:
- CLCompileContext _compile_context; /**< Compile Context. */
- std::string _kernel_path; /**< Path to the kernels folder. */
- static const std::map<std::string, std::string> _kernel_program_map; /**< Map that associates kernel names with programs. */
- static const std::map<std::string, std::string> _program_source_map; /**< Contains sources for all programs.
- Used for compile-time kernel inclusion. >*/
+ CLCompileContext _compile_context; /**< Compile Context. */
};
} // namespace arm_compute
#endif /* ARM_COMPUTE_CLKERNELLIBRARY_H */
diff --git a/arm_compute/core/CL/CLKernels.h b/arm_compute/core/CL/CLKernels.h
deleted file mode 100644
index cd26399390..0000000000
--- a/arm_compute/core/CL/CLKernels.h
+++ /dev/null
@@ -1,165 +0,0 @@
-/*
- * Copyright (c) 2016-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CLKERNELS_H
-#define ARM_COMPUTE_CLKERNELS_H
-
-/* Header regrouping all the CL kernels */
-#include "arm_compute/core/CL/kernels/CLAbsoluteDifferenceKernel.h"
-#include "arm_compute/core/CL/kernels/CLAccumulateKernel.h"
-#include "arm_compute/core/CL/kernels/CLActivationLayerKernel.h"
-#include "arm_compute/core/CL/kernels/CLArgMinMaxLayerKernel.h"
-#include "arm_compute/core/CL/kernels/CLBatchConcatenateLayerKernel.h"
-#include "arm_compute/core/CL/kernels/CLBatchNormalizationLayerKernel.h"
-#include "arm_compute/core/CL/kernels/CLBatchToSpaceLayerKernel.h"
-#include "arm_compute/core/CL/kernels/CLBitwiseAndKernel.h"
-#include "arm_compute/core/CL/kernels/CLBitwiseNotKernel.h"
-#include "arm_compute/core/CL/kernels/CLBitwiseOrKernel.h"
-#include "arm_compute/core/CL/kernels/CLBitwiseXorKernel.h"
-#include "arm_compute/core/CL/kernels/CLBoundingBoxTransformKernel.h"
-#include "arm_compute/core/CL/kernels/CLBox3x3Kernel.h"
-#include "arm_compute/core/CL/kernels/CLCannyEdgeKernel.h"
-#include "arm_compute/core/CL/kernels/CLChannelCombineKernel.h"
-#include "arm_compute/core/CL/kernels/CLChannelExtractKernel.h"
-#include "arm_compute/core/CL/kernels/CLChannelShuffleLayerKernel.h"
-#include "arm_compute/core/CL/kernels/CLCol2ImKernel.h"
-#include "arm_compute/core/CL/kernels/CLColorConvertKernel.h"
-#include "arm_compute/core/CL/kernels/CLComparisonKernel.h"
-#include "arm_compute/core/CL/kernels/CLConvertFullyConnectedWeightsKernel.h"
-#include "arm_compute/core/CL/kernels/CLConvolutionKernel.h"
-#include "arm_compute/core/CL/kernels/CLCopyKernel.h"
-#include "arm_compute/core/CL/kernels/CLCropKernel.h"
-#include "arm_compute/core/CL/kernels/CLDeconvolutionLayerUpsampleKernel.h"
-#include "arm_compute/core/CL/kernels/CLDeconvolutionReshapeOutputKernel.h"
-#include "arm_compute/core/CL/kernels/CLDepthConcatenateLayerKernel.h"
-#include "arm_compute/core/CL/kernels/CLDepthConvertLayerKernel.h"
-#include "arm_compute/core/CL/kernels/CLDepthToSpaceLayerKernel.h"
-#include "arm_compute/core/CL/kernels/CLDepthwiseConvolutionLayer3x3NCHWKernel.h"
-#include "arm_compute/core/CL/kernels/CLDepthwiseConvolutionLayer3x3NHWCKernel.h"
-#include "arm_compute/core/CL/kernels/CLDepthwiseConvolutionLayerNativeKernel.h"
-#include "arm_compute/core/CL/kernels/CLDepthwiseConvolutionLayerReshapeWeightsKernel.h"
-#include "arm_compute/core/CL/kernels/CLDequantizationLayerKernel.h"
-#include "arm_compute/core/CL/kernels/CLDerivativeKernel.h"
-#include "arm_compute/core/CL/kernels/CLDilateKernel.h"
-#include "arm_compute/core/CL/kernels/CLDirectConvolutionLayerKernel.h"
-#include "arm_compute/core/CL/kernels/CLElementWiseUnaryLayerKernel.h"
-#include "arm_compute/core/CL/kernels/CLElementwiseOperationKernel.h"
-#include "arm_compute/core/CL/kernels/CLErodeKernel.h"
-#include "arm_compute/core/CL/kernels/CLFFTDigitReverseKernel.h"
-#include "arm_compute/core/CL/kernels/CLFFTRadixStageKernel.h"
-#include "arm_compute/core/CL/kernels/CLFFTScaleKernel.h"
-#include "arm_compute/core/CL/kernels/CLFastCornersKernel.h"
-#include "arm_compute/core/CL/kernels/CLFillBorderKernel.h"
-#include "arm_compute/core/CL/kernels/CLFlattenLayerKernel.h"
-#include "arm_compute/core/CL/kernels/CLFloorKernel.h"
-#include "arm_compute/core/CL/kernels/CLFuseBatchNormalizationKernel.h"
-#include "arm_compute/core/CL/kernels/CLGEMMLowpMatrixMultiplyNativeKernel.h"
-#include "arm_compute/core/CL/kernels/CLGEMMLowpMatrixMultiplyReshapedKernel.h"
-#include "arm_compute/core/CL/kernels/CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel.h"
-#include "arm_compute/core/CL/kernels/CLGEMMLowpOffsetContributionKernel.h"
-#include "arm_compute/core/CL/kernels/CLGEMMLowpOffsetContributionOutputStageKernel.h"
-#include "arm_compute/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ScaleByFloatKernel.h"
-#include "arm_compute/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ScaleKernel.h"
-#include "arm_compute/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel.h"
-#include "arm_compute/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel.h"
-#include "arm_compute/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel.h"
-#include "arm_compute/core/CL/kernels/CLGEMMLowpReductionKernel.h"
-#include "arm_compute/core/CL/kernels/CLGEMMMatrixAccumulateBiasesKernel.h"
-#include "arm_compute/core/CL/kernels/CLGEMMMatrixMultiplyKernel.h"
-#include "arm_compute/core/CL/kernels/CLGEMMMatrixMultiplyNativeKernel.h"
-#include "arm_compute/core/CL/kernels/CLGEMMMatrixMultiplyReshapedKernel.h"
-#include "arm_compute/core/CL/kernels/CLGEMMMatrixMultiplyReshapedOnlyRHSKernel.h"
-#include "arm_compute/core/CL/kernels/CLGEMMMatrixVectorMultiplyKernel.h"
-#include "arm_compute/core/CL/kernels/CLGEMMReshapeLHSMatrixKernel.h"
-#include "arm_compute/core/CL/kernels/CLGEMMReshapeRHSMatrixKernel.h"
-#include "arm_compute/core/CL/kernels/CLGatherKernel.h"
-#include "arm_compute/core/CL/kernels/CLGaussian3x3Kernel.h"
-#include "arm_compute/core/CL/kernels/CLGaussian5x5Kernel.h"
-#include "arm_compute/core/CL/kernels/CLGaussianPyramidKernel.h"
-#include "arm_compute/core/CL/kernels/CLGenerateProposalsLayerKernel.h"
-#include "arm_compute/core/CL/kernels/CLHOGDescriptorKernel.h"
-#include "arm_compute/core/CL/kernels/CLHOGDetectorKernel.h"
-#include "arm_compute/core/CL/kernels/CLHarrisCornersKernel.h"
-#include "arm_compute/core/CL/kernels/CLHeightConcatenateLayerKernel.h"
-#include "arm_compute/core/CL/kernels/CLHistogramKernel.h"
-#include "arm_compute/core/CL/kernels/CLIm2ColKernel.h"
-#include "arm_compute/core/CL/kernels/CLInstanceNormalizationLayerKernel.h"
-#include "arm_compute/core/CL/kernels/CLIntegralImageKernel.h"
-#include "arm_compute/core/CL/kernels/CLL2NormalizeLayerKernel.h"
-#include "arm_compute/core/CL/kernels/CLLKTrackerKernel.h"
-#include "arm_compute/core/CL/kernels/CLLocallyConnectedMatrixMultiplyKernel.h"
-#include "arm_compute/core/CL/kernels/CLMagnitudePhaseKernel.h"
-#include "arm_compute/core/CL/kernels/CLMeanStdDevKernel.h"
-#include "arm_compute/core/CL/kernels/CLMeanStdDevNormalizationKernel.h"
-#include "arm_compute/core/CL/kernels/CLMedian3x3Kernel.h"
-#include "arm_compute/core/CL/kernels/CLMemsetKernel.h"
-#include "arm_compute/core/CL/kernels/CLMinMaxLayerKernel.h"
-#include "arm_compute/core/CL/kernels/CLMinMaxLocationKernel.h"
-#include "arm_compute/core/CL/kernels/CLNonLinearFilterKernel.h"
-#include "arm_compute/core/CL/kernels/CLNonMaximaSuppression3x3Kernel.h"
-#include "arm_compute/core/CL/kernels/CLNormalizationLayerKernel.h"
-#include "arm_compute/core/CL/kernels/CLNormalizePlanarYUVLayerKernel.h"
-#include "arm_compute/core/CL/kernels/CLPadLayerKernel.h"
-#include "arm_compute/core/CL/kernels/CLPermuteKernel.h"
-#include "arm_compute/core/CL/kernels/CLPixelWiseMultiplicationKernel.h"
-#include "arm_compute/core/CL/kernels/CLPoolingLayerKernel.h"
-#include "arm_compute/core/CL/kernels/CLPriorBoxLayerKernel.h"
-#include "arm_compute/core/CL/kernels/CLQLSTMLayerNormalizationKernel.h"
-#include "arm_compute/core/CL/kernels/CLQuantizationLayerKernel.h"
-#include "arm_compute/core/CL/kernels/CLROIAlignLayerKernel.h"
-#include "arm_compute/core/CL/kernels/CLROIPoolingLayerKernel.h"
-#include "arm_compute/core/CL/kernels/CLRangeKernel.h"
-#include "arm_compute/core/CL/kernels/CLReductionOperationKernel.h"
-#include "arm_compute/core/CL/kernels/CLRemapKernel.h"
-#include "arm_compute/core/CL/kernels/CLReorgLayerKernel.h"
-#include "arm_compute/core/CL/kernels/CLReshapeLayerKernel.h"
-#include "arm_compute/core/CL/kernels/CLReverseKernel.h"
-#include "arm_compute/core/CL/kernels/CLScaleKernel.h"
-#include "arm_compute/core/CL/kernels/CLScharr3x3Kernel.h"
-#include "arm_compute/core/CL/kernels/CLSelectKernel.h"
-#include "arm_compute/core/CL/kernels/CLSobel3x3Kernel.h"
-#include "arm_compute/core/CL/kernels/CLSobel5x5Kernel.h"
-#include "arm_compute/core/CL/kernels/CLSobel7x7Kernel.h"
-#include "arm_compute/core/CL/kernels/CLSoftmaxLayerKernel.h"
-#include "arm_compute/core/CL/kernels/CLSpaceToBatchLayerKernel.h"
-#include "arm_compute/core/CL/kernels/CLSpaceToDepthLayerKernel.h"
-#include "arm_compute/core/CL/kernels/CLStackLayerKernel.h"
-#include "arm_compute/core/CL/kernels/CLStridedSliceKernel.h"
-#include "arm_compute/core/CL/kernels/CLTableLookupKernel.h"
-#include "arm_compute/core/CL/kernels/CLThresholdKernel.h"
-#include "arm_compute/core/CL/kernels/CLTileKernel.h"
-#include "arm_compute/core/CL/kernels/CLTransposeKernel.h"
-#include "arm_compute/core/CL/kernels/CLUpsampleLayerKernel.h"
-#include "arm_compute/core/CL/kernels/CLWarpAffineKernel.h"
-#include "arm_compute/core/CL/kernels/CLWarpPerspectiveKernel.h"
-#include "arm_compute/core/CL/kernels/CLWeightsReshapeKernel.h"
-#include "arm_compute/core/CL/kernels/CLWidthConcatenate2TensorsKernel.h"
-#include "arm_compute/core/CL/kernels/CLWidthConcatenate4TensorsKernel.h"
-#include "arm_compute/core/CL/kernels/CLWidthConcatenateLayerKernel.h"
-#include "arm_compute/core/CL/kernels/CLWinogradFilterTransformKernel.h"
-#include "arm_compute/core/CL/kernels/CLWinogradInputTransformKernel.h"
-#include "arm_compute/core/CL/kernels/CLWinogradOutputTransformKernel.h"
-#include "arm_compute/core/CL/kernels/CLYOLOLayerKernel.h"
-#include "arm_compute/core/CL/kernels/ICLDepthwiseConvolutionLayer3x3Kernel.h"
-
-#endif /* ARM_COMPUTE_CLKERNELS_H */
diff --git a/arm_compute/core/CL/CLTypes.h b/arm_compute/core/CL/CLTypes.h
index 3643b178d3..0f088e2b10 100644
--- a/arm_compute/core/CL/CLTypes.h
+++ b/arm_compute/core/CL/CLTypes.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2017-2022 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -41,7 +41,8 @@ enum class CLVersion
CL10, /* the OpenCL 1.0 */
CL11, /* the OpenCL 1.1 */
CL12, /* the OpenCL 1.2 */
- CL20, /* the OpenCL 2.0 and above */
+ CL20, /* the OpenCL 2.x */
+ CL30, /* the OpenCL 3.x */
UNKNOWN /* unkown version */
};
@@ -62,18 +63,27 @@ struct CLDeviceOptions
struct CLQuantization
{
/** Default Constructor */
- CLQuantization()
- : scale(nullptr), offset(nullptr) {};
+ CLQuantization() : scale(nullptr), offset(nullptr){};
/** Constructor
*
* @param[in] scale OpenCL scale array
* @param[in] offset OpenCL offset array
*/
- CLQuantization(const ICLFloatArray *scale, const ICLInt32Array *offset)
- : scale(scale), offset(offset) {};
+ CLQuantization(const ICLFloatArray *scale, const ICLInt32Array *offset) : scale(scale), offset(offset){};
const ICLFloatArray *scale; /**< Quantization scale array */
const ICLInt32Array *offset; /**< Quantization offset array */
};
+
+enum CLKernelType
+{
+ UNKNOWN, /**< Unknown CL kernel type */
+ DEPTHWISE, /**< Depthwise CL kernel type */
+ DIRECT, /**< Direct Convolution CL kernel type */
+ ELEMENTWISE, /**< Elementwise CL kernel type */
+ GEMM, /**< GEMM CL kernel type */
+ POOL, /**< Pool CL kernel type */
+ WINOGRAD /**< Winograd CL kernel type */
+};
} // namespace arm_compute
#endif /* ARM_COMPUTE_CL_TYPES_H */
diff --git a/arm_compute/core/CL/CLValidate.h b/arm_compute/core/CL/CLValidate.h
deleted file mode 100644
index 8f1733dcfe..0000000000
--- a/arm_compute/core/CL/CLValidate.h
+++ /dev/null
@@ -1,61 +0,0 @@
-/*
- * Copyright (c) 2018-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CL_VALIDATE_H
-#define ARM_COMPUTE_CL_VALIDATE_H
-
-#include "arm_compute/core/Validate.h"
-
-namespace arm_compute
-{
-#define ARM_COMPUTE_ERROR_ON_F16_UNSUPPORTED(tensor) \
- ARM_COMPUTE_ERROR_THROW_ON(::arm_compute::error_on_unsupported_fp16(__func__, __FILE__, __LINE__, tensor, CLKernelLibrary::get().fp16_supported()))
-
-#define ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(tensor) \
- ARM_COMPUTE_RETURN_ON_ERROR(::arm_compute::error_on_unsupported_fp16(__func__, __FILE__, __LINE__, tensor, CLKernelLibrary::get().fp16_supported()))
-
-/** Return an error if int64_base_atomics extension is not supported by the device.
- *
- * @param[in] function Function in which the error occurred.
- * @param[in] file Name of the file where the error occurred.
- * @param[in] line Line on which the error occurred.
- *
- * @return Status
- */
-inline arm_compute::Status error_on_unsupported_int64_base_atomics(const char *function, const char *file, const int line)
-{
- if(!CLKernelLibrary::get().int64_base_atomics_supported())
- {
- return ARM_COMPUTE_CREATE_ERROR_LOC(arm_compute::ErrorCode::UNSUPPORTED_EXTENSION_USE, function, file, line, "Atomic functions are not supported");
- }
- return arm_compute::Status{};
-}
-
-#define ARM_COMPUTE_ERROR_ON_INT64_BASE_ATOMICS_UNSUPPORTED() \
- ARM_COMPUTE_ERROR_THROW_ON(::arm_compute::error_on_unsupported_int64_base_atomics(__func__, __FILE__, __LINE__));
-
-#define ARM_COMPUTE_RETURN_ERROR_ON_INT64_BASE_ATOMICS_UNSUPPORTED() \
- ARM_COMPUTE_RETURN_ON_ERROR(::arm_compute::error_on_unsupported_int64_base_atomics(__func__, __FILE__, __LINE__));
-
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_CL_VALIDATE_H */
diff --git a/arm_compute/core/CL/ICLArray.h b/arm_compute/core/CL/ICLArray.h
index e11fb95bf8..a2b2baa5b3 100644
--- a/arm_compute/core/CL/ICLArray.h
+++ b/arm_compute/core/CL/ICLArray.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2016-2019 ARM Limited.
+ * Copyright (c) 2016-2021, 2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -40,8 +40,7 @@ public:
* @param[in] max_num_values Maximum size of the array.
*
*/
- explicit ICLArray(size_t max_num_values)
- : IArray<T>(max_num_values), _mapping(nullptr)
+ explicit ICLArray(size_t max_num_values) : IArray<T>(max_num_values), _mapping(nullptr)
{
}
@@ -66,8 +65,6 @@ public:
* @param[in] blocking If true, then the mapping will be ready to use by the time
* this method returns, else it is the caller's responsibility
* to flush the queue and wait for the mapping operation to have completed before using the returned mapping pointer.
- *
- * @return The mapping address.
*/
void map(cl::CommandQueue &q, bool blocking = true)
{
@@ -115,14 +112,6 @@ private:
uint8_t *_mapping;
};
-/** Interface for OpenCL Array of Key Points. */
-using ICLKeyPointArray = ICLArray<KeyPoint>;
-/** Interface for OpenCL Array of 2D Coordinates. */
-using ICLCoordinates2DArray = ICLArray<Coordinates2D>;
-/** Interface for OpenCL Array of Detection Windows. */
-using ICLDetectionWindowArray = ICLArray<DetectionWindow>;
-/** Interface for OpenCL Array of 2D Sizes. */
-using ICLSize2DArray = ICLArray<Size2D>;
/** Interface for OpenCL Array of uint8s. */
using ICLUInt8Array = ICLArray<cl_uchar>;
/** Interface for OpenCL Array of uint16s. */
@@ -135,5 +124,5 @@ using ICLInt16Array = ICLArray<cl_short>;
using ICLInt32Array = ICLArray<cl_int>;
/** Interface for OpenCL Array of floats. */
using ICLFloatArray = ICLArray<cl_float>;
-}
+} // namespace arm_compute
#endif /*ARM_COMPUTE_ICLARRAY_H*/
diff --git a/arm_compute/core/CL/ICLDistribution1D.h b/arm_compute/core/CL/ICLDistribution1D.h
deleted file mode 100644
index a9bafe3d5a..0000000000
--- a/arm_compute/core/CL/ICLDistribution1D.h
+++ /dev/null
@@ -1,102 +0,0 @@
-/*
- * Copyright (c) 2016-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_ICLDISTRIBUTION1D_H
-#define ARM_COMPUTE_ICLDISTRIBUTION1D_H
-
-#include "arm_compute/core/IDistribution1D.h"
-
-#include <cstddef>
-#include <cstdint>
-
-namespace cl
-{
-class Buffer;
-class CommandQueue;
-}
-
-namespace arm_compute
-{
-/** ICLDistribution1D interface class */
-class ICLDistribution1D : public IDistribution1D
-{
-public:
- /** Constructor: Creates a 1D CLDistribution of a consecutive interval [offset, offset + range - 1]
- * defined by a start offset and valid range, divided equally into num_bins parts.
- *
- * @param[in] num_bins The number of bins the distribution is divided in.
- * @param[in] offset The start of the values to use.
- * @param[in] range The total number of the consecutive values of the distribution interval.
- */
- ICLDistribution1D(size_t num_bins, int32_t offset, uint32_t range);
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- ICLDistribution1D(const ICLDistribution1D &) = delete;
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- const ICLDistribution1D &operator=(const ICLDistribution1D &) = delete;
- /** Enqueue a map operation of the allocated buffer on the given queue.
- *
- * @param[in,out] q The CL command queue to use for the mapping operation.
- * @param[in] blocking If true, then the mapping will be ready to use by the time
- * this method returns, else it is the caller's responsibility
- * to flush the queue and wait for the mapping operation to have completed before using the returned mapping pointer.
- */
- void map(cl::CommandQueue &q, bool blocking = true);
- /** Enqueue an unmap operation of the allocated and mapped buffer on the given queue.
- *
- * @note This method simply enqueues the unmap operation, it is the caller's responsibility to flush the queue and make sure the unmap is finished before
- * the memory is accessed by the device.
- *
- * @param[in,out] q The CL command queue to use for the mapping operation.
- */
- void unmap(cl::CommandQueue &q);
- /** Interface to be implemented by the child class to return a reference to the OpenCL buffer containing the distribution's data.
- *
- * @return A reference to an OpenCL buffer containing the distribution's data.
- */
- virtual cl::Buffer &cl_buffer() = 0;
- // Inherited methods overridden:
- uint32_t *buffer() const override;
-
-protected:
- /** Method to be implemented by the child class to map the OpenCL buffer
- *
- * @param[in,out] q The CL command queue to use for the mapping operation.
- * @param[in] blocking If true, then the mapping will be ready to use by the time
- * this method returns, else it is the caller's responsibility
- * to flush the queue and wait for the mapping operation to have completed before using the returned mapping pointer.
- */
- virtual uint32_t *do_map(cl::CommandQueue &q, bool blocking) = 0;
- /** Method to be implemented by the child class to unmap the OpenCL buffer
- *
- * @note This method simply enqueues the unmap operation, it is the caller's responsibility to flush the queue and make sure the unmap is finished before
- * the memory is accessed by the device.
- *
- * @param[in,out] q The CL command queue to use for the mapping operation.
- */
- virtual void do_unmap(cl::CommandQueue &q) = 0;
-
-protected:
- uint32_t *_mapping; /**< The distribution data. */
-};
-}
-#endif /* ARM_COMPUTE_ICLDISTRIBUTION1D_H */
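
The interface deleted above used the non-virtual-interface pattern: the public map()/unmap() cached the pointer returned by the protected do_map()/do_unmap() overrides in _mapping. A hedged sketch of what a child override looked like; MyCLDistribution1D is a placeholder name, and size() is assumed to report the buffer size in bytes per IDistribution1D:

    uint32_t *MyCLDistribution1D::do_map(cl::CommandQueue &q, bool blocking)
    {
        // Map the whole distribution buffer for reading and writing
        return static_cast<uint32_t *>(q.enqueueMapBuffer(cl_buffer(), blocking ? CL_TRUE : CL_FALSE,
                                                          CL_MAP_READ | CL_MAP_WRITE, 0, size()));
    }

    void MyCLDistribution1D::do_unmap(cl::CommandQueue &q)
    {
        // _mapping is the pointer the base class cached during map()
        q.enqueueUnmapMemObject(cl_buffer(), _mapping);
    }
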
diff --git a/arm_compute/core/CL/ICLGEMMKernelConfiguration.h b/arm_compute/core/CL/ICLGEMMKernelConfiguration.h
deleted file mode 100644
index e5f4a78297..0000000000
--- a/arm_compute/core/CL/ICLGEMMKernelConfiguration.h
+++ /dev/null
@@ -1,68 +0,0 @@
-/*
- * Copyright (c) 2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_ICLGEMMKERNELCONFIGURATION_H
-#define ARM_COMPUTE_ICLGEMMKERNELCONFIGURATION_H
-
-#include "arm_compute/core/GPUTarget.h"
-#include "arm_compute/core/Types.h"
-
-namespace arm_compute
-{
-/** Basic interface for the GEMM kernel configuration */
-class ICLGEMMKernelConfiguration
-{
-public:
- /** Constructor
- *
- * @param[in] arch GPU target
- */
- ICLGEMMKernelConfiguration(GPUTarget arch)
- : _target(arch)
- {
- }
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- ICLGEMMKernelConfiguration(const ICLGEMMKernelConfiguration &) = delete;
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- ICLGEMMKernelConfiguration &operator=(const ICLGEMMKernelConfiguration &) = delete;
- /** Default Move Constructor. */
- ICLGEMMKernelConfiguration(ICLGEMMKernelConfiguration &&) = default;
- /** Default move assignment operator */
- ICLGEMMKernelConfiguration &operator=(ICLGEMMKernelConfiguration &&) = default;
- /** Virtual destructor */
- virtual ~ICLGEMMKernelConfiguration() = default;
- /** Given M, N, K and B, this method returns the @ref GEMMLHSMatrixInfo and @ref GEMMRHSMatrixInfo to be used
- *
- * @param[in] m Number of rows of the LHS matrix
- * @param[in] n Number of columns of the RHS matrix
- * @param[in] k Number of columns of the LHS matrix (equal to the number of rows of the RHS matrix)
- * @param[in] b Batch size
- * @param[in] data_type Data type
- */
- virtual std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure(unsigned int m, unsigned int n, unsigned int k, unsigned int b, DataType data_type) = 0;
-
-protected:
- GPUTarget _target;
-};
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_ICLGEMMKERNELCONFIGURATION_H */
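
For reference, a concrete heuristic only had to override configure(). Below is a deliberately naive sketch that ignores the problem shape and returns fixed block sizes; real implementations branched on m, n, k, b, data_type and the _target member (field names follow GEMMLHSMatrixInfo and GEMMRHSMatrixInfo in core/Types.h):

    #include "arm_compute/core/Error.h"
    #include <utility>

    class FixedGEMMConfiguration final : public arm_compute::ICLGEMMKernelConfiguration
    {
    public:
        explicit FixedGEMMConfiguration(arm_compute::GPUTarget arch) : ICLGEMMKernelConfiguration(arch)
        {
        }
        std::pair<arm_compute::GEMMLHSMatrixInfo, arm_compute::GEMMRHSMatrixInfo>
        configure(unsigned int m, unsigned int n, unsigned int k, unsigned int b, arm_compute::DataType data_type) override
        {
            ARM_COMPUTE_UNUSED(m, n, k, b, data_type);
            arm_compute::GEMMLHSMatrixInfo lhs;
            lhs.m0 = 4; // rows of the output block per work-item
            lhs.k0 = 4; // inner accumulations per iteration
            arm_compute::GEMMRHSMatrixInfo rhs;
            rhs.n0 = 4; // columns of the output block per work-item
            rhs.k0 = 4; // must match lhs.k0
            return std::make_pair(lhs, rhs);
        }
    };
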
diff --git a/arm_compute/core/CL/ICLHOG.h b/arm_compute/core/CL/ICLHOG.h
deleted file mode 100644
index b42566ef11..0000000000
--- a/arm_compute/core/CL/ICLHOG.h
+++ /dev/null
@@ -1,113 +0,0 @@
-/*
- * Copyright (c) 2017-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_ICLHOG_H
-#define ARM_COMPUTE_ICLHOG_H
-
-#include "arm_compute/core/IHOG.h"
-
-#include <cstdint>
-
-namespace cl
-{
-class Buffer;
-class CommandQueue;
-}
-
-namespace arm_compute
-{
-/** Interface for OpenCL HOG data-object */
-class ICLHOG : public IHOG
-{
-public:
- /** Default constructor */
- ICLHOG();
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- ICLHOG(const ICLHOG &) = delete;
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- ICLHOG &operator=(const ICLHOG &) = delete;
- /** Allow instances of this class to be moved */
- ICLHOG(ICLHOG &&) = default;
- /** Allow instances of this class to be moved */
- ICLHOG &operator=(ICLHOG &&) = default;
- /** Default destructor */
- virtual ~ICLHOG() = default;
-
- /** Interface to be implemented by the child class to return a reference to the OpenCL buffer containing the hog's descriptor
- *
- * @return A reference to an OpenCL buffer containing the hog's descriptor
- */
- virtual const cl::Buffer &cl_buffer() const = 0;
-
- /** Enqueue a map operation of the allocated buffer on the given queue.
- *
- * @param[in,out] q The CL command queue to use for the mapping operation.
- * @param[in] blocking If true, then the mapping will be ready to use by the time
- * this method returns, else it is the caller's responsibility
- * to flush the queue and wait for the mapping operation to have completed before using the returned mapping pointer.
- *
- * @return The mapping address.
- */
- void map(cl::CommandQueue &q, bool blocking = true);
-
- /** Enqueue an unmap operation of the allocated and mapped buffer on the given queue.
- *
- * @note This method simply enqueues the unmap operation, it is the caller's responsibility to flush the queue and make sure the unmap is finished before
- * the memory is accessed by the device.
- *
- * @param[in,out] q The CL command queue to use for the mapping operation.
- */
- void unmap(cl::CommandQueue &q);
-
- /** Interface to be implemented by the child class to free the allocated cl buffer.
- *
- * @warning The buffer must have been allocated previously. Otherwise calling the function will fail.
- */
- virtual void free() = 0;
-
- // Inherited methods overridden:
- float *descriptor() const override;
-
-protected:
- /** Method to be implemented by the child class to map the OpenCL buffer
- *
- * @param[in,out] q The CL command queue to use for the mapping operation.
- * @param[in] blocking If true, then the mapping will be ready to use by the time
- * this method returns, else it is the caller's responsibility
- * to flush the queue and wait for the mapping operation to have completed before using the returned mapping pointer.
- */
- virtual uint8_t *do_map(cl::CommandQueue &q, bool blocking) = 0;
- /** Method to be implemented by the child class to unmap the OpenCL buffer
- *
- * @note This method simply enqueues the unmap operation, it is the caller's responsibility to flush the queue and make sure the unmap is finished before
- * the memory is accessed by the device.
- *
- * @param[in,out] q The CL command queue to use for the mapping operation.
- */
- virtual void do_unmap(cl::CommandQueue &q) = 0;
-
-private:
- uint8_t *_mapping;
-};
-}
-#endif /*ARM_COMPUTE_ICLHOG_H */
diff --git a/arm_compute/core/CL/ICLKernel.h b/arm_compute/core/CL/ICLKernel.h
deleted file mode 100644
index 3e545c61aa..0000000000
--- a/arm_compute/core/CL/ICLKernel.h
+++ /dev/null
@@ -1,387 +0,0 @@
-/*
- * Copyright (c) 2016-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_ICLKERNEL_H
-#define ARM_COMPUTE_ICLKERNEL_H
-
-#include "arm_compute/core/CL/CLKernelLibrary.h"
-#include "arm_compute/core/CL/CLTypes.h"
-#include "arm_compute/core/CL/OpenCL.h"
-#include "arm_compute/core/GPUTarget.h"
-#include "arm_compute/core/IKernel.h"
-
-#include <string>
-
-namespace arm_compute
-{
-template <typename T>
-class ICLArray;
-class ICLTensor;
-class Window;
-
-/** Common interface for all the OpenCL kernels */
-class ICLKernel : public IKernel
-{
-private:
- /** Returns the number of arguments enqueued per array object.
- *
- * @return The number of arguments enqueued per array object.
- */
- template <unsigned int dimension_size>
- constexpr static unsigned int num_arguments_per_array()
- {
- return num_arguments_per_tensor<dimension_size>();
- }
- /** Returns the number of arguments enqueued per tensor object.
- *
- * @return The number of arguments enqueued per tensor object.
- */
- template <unsigned int dimension_size>
- constexpr static unsigned int num_arguments_per_tensor()
- {
- return 2 + 2 * dimension_size;
- }
- using IKernel::configure; //Prevent children from calling IKernel::configure() directly
-protected:
- /** Configure the kernel's window and local workgroup size hint.
- *
- * @param[in] window The maximum window which will be returned by window()
- * @param[in] lws_hint (Optional) Local-Workgroup-Size to use.
- */
- void configure_internal(const Window &window, cl::NDRange lws_hint = CLKernelLibrary::get().default_ndrange())
- {
- _lws_hint = lws_hint;
- IKernel::configure(window);
- }
-
-public:
- /** Constructor */
- ICLKernel()
- : _kernel(nullptr), _target(GPUTarget::MIDGARD), _config_id(arm_compute::default_config_id), _max_workgroup_size(0), _lws_hint()
- {
- }
- /** Returns a reference to the OpenCL kernel of this object.
- *
- * @return A reference to the OpenCL kernel of this object.
- */
- cl::Kernel &kernel()
- {
- return _kernel;
- }
- /** Add the passed 1D array's parameters to the object's kernel's arguments starting from the index idx.
- *
- * @param[in,out] idx Index at which to start adding the array's arguments. Will be incremented by the number of kernel arguments set.
- * @param[in] array Array to set as an argument of the object's kernel.
- * @param[in] strides @ref Strides object containing stride of each dimension in bytes.
- * @param[in] num_dimensions Number of dimensions of the @p array.
- * @param[in] window Window the kernel will be executed on.
- */
- template <typename T>
- void add_1D_array_argument(unsigned int &idx, const ICLArray<T> *array, const Strides &strides, unsigned int num_dimensions, const Window &window)
- {
- add_array_argument<T, 1>(idx, array, strides, num_dimensions, window);
- }
- /** Add the passed 1D tensor's parameters to the object's kernel's arguments starting from the index idx.
- *
- * @param[in,out] idx Index at which to start adding the tensor's arguments. Will be incremented by the number of kernel arguments set.
- * @param[in] tensor Tensor to set as an argument of the object's kernel.
- * @param[in] window Window the kernel will be executed on.
- */
- void add_1D_tensor_argument(unsigned int &idx, const ICLTensor *tensor, const Window &window)
- {
- add_tensor_argument<1>(idx, tensor, window);
- }
- /** Add the passed 1D tensor's parameters to the object's kernel's arguments starting from the index idx if the condition is true.
- *
- * @param[in] cond Condition to check
- * @param[in,out] idx Index at which to start adding the tensor's arguments. Will be incremented by the number of kernel arguments set.
- * @param[in] tensor Tensor to set as an argument of the object's kernel.
- * @param[in] window Window the kernel will be executed on.
- */
- void add_1D_tensor_argument_if(bool cond, unsigned int &idx, const ICLTensor *tensor, const Window &window)
- {
- if(cond)
- {
- add_1D_tensor_argument(idx, tensor, window);
- }
- }
- /** Add the passed 2D tensor's parameters to the object's kernel's arguments starting from the index idx.
- *
- * @param[in,out] idx Index at which to start adding the tensor's arguments. Will be incremented by the number of kernel arguments set.
- * @param[in] tensor Tensor to set as an argument of the object's kernel.
- * @param[in] window Window the kernel will be executed on.
- */
- void add_2D_tensor_argument(unsigned int &idx, const ICLTensor *tensor, const Window &window)
- {
- add_tensor_argument<2>(idx, tensor, window);
- }
- /** Add the passed 2D tensor's parameters to the object's kernel's arguments starting from the index idx if the condition is true.
- *
- * @param[in] cond Condition to check
- * @param[in,out] idx Index at which to start adding the tensor's arguments. Will be incremented by the number of kernel arguments set.
- * @param[in] tensor Tensor to set as an argument of the object's kernel.
- * @param[in] window Window the kernel will be executed on.
- */
- void add_2D_tensor_argument_if(bool cond, unsigned int &idx, const ICLTensor *tensor, const Window &window)
- {
- if(cond)
- {
- add_2D_tensor_argument(idx, tensor, window);
- }
- }
- /** Add the passed 3D tensor's parameters to the object's kernel's arguments starting from the index idx.
- *
- * @param[in,out] idx Index at which to start adding the tensor's arguments. Will be incremented by the number of kernel arguments set.
- * @param[in] tensor Tensor to set as an argument of the object's kernel.
- * @param[in] window Window the kernel will be executed on.
- */
- void add_3D_tensor_argument(unsigned int &idx, const ICLTensor *tensor, const Window &window)
- {
- add_tensor_argument<3>(idx, tensor, window);
- }
- /** Add the passed 4D tensor's parameters to the object's kernel's arguments starting from the index idx.
- *
- * @param[in,out] idx Index at which to start adding the tensor's arguments. Will be incremented by the number of kernel arguments set.
- * @param[in] tensor Tensor to set as an argument of the object's kernel.
- * @param[in] window Window the kernel will be executed on.
- */
- void add_4D_tensor_argument(unsigned int &idx, const ICLTensor *tensor, const Window &window)
- {
- add_tensor_argument<4>(idx, tensor, window);
- }
- /** Returns the number of arguments enqueued per 1D array object.
- *
- * @return The number of arguments enqueued per 1D array object.
- */
- constexpr static unsigned int num_arguments_per_1D_array()
- {
- return num_arguments_per_array<1>();
- }
- /** Returns the number of arguments enqueued per 1D tensor object.
- *
- * @return The number of arguments enqueued per 1D tensor object.
- */
- constexpr static unsigned int num_arguments_per_1D_tensor()
- {
- return num_arguments_per_tensor<1>();
- }
- /** Returns the number of arguments enqueued per 2D tensor object.
- *
- * @return The number of arguments enqueued per 2D tensor object.
- */
- constexpr static unsigned int num_arguments_per_2D_tensor()
- {
- return num_arguments_per_tensor<2>();
- }
- /** Returns the number of arguments enqueued per 3D tensor object.
- *
- * @return The number of arguments enqueued per 3D tensor object.
- */
- constexpr static unsigned int num_arguments_per_3D_tensor()
- {
- return num_arguments_per_tensor<3>();
- }
- /** Returns the number of arguments enqueued per 4D tensor object.
- *
- * @return The number of arguments enqueued per 4D tensor object.
- */
- constexpr static unsigned int num_arguments_per_4D_tensor()
- {
- return num_arguments_per_tensor<4>();
- }
- /** Enqueue the OpenCL kernel to process the given window on the passed OpenCL command queue.
- *
- * @note The queue is *not* flushed by this method, and therefore the kernel will not have been executed by the time this method returns.
- *
- * @param[in] window Region on which to execute the kernel. (Must be a valid region of the window returned by window()).
- * @param[in,out] queue Command queue on which to enqueue the kernel.
- */
- virtual void run(const Window &window, cl::CommandQueue &queue) = 0;
- /** Add the passed parameters to the object's kernel's arguments starting from the index idx.
- *
- * @param[in,out] idx Index at which to start adding the arguments. Will be incremented by the number of kernel arguments set.
- * @param[in] value Value to set as an argument of the object's kernel.
- */
- template <typename T>
- void add_argument(unsigned int &idx, T value)
- {
- _kernel.setArg(idx++, value);
- }
-
- /** Set the Local-Workgroup-Size hint
- *
- * @note This method should be called after the configuration of the kernel
- *
- * @param[in] lws_hint Local-Workgroup-Size to use
- */
- void set_lws_hint(const cl::NDRange &lws_hint)
- {
- ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); // lws_hint will be overwritten by configure()
- _lws_hint = lws_hint;
- }
-
- /** Return the Local-Workgroup-Size hint
- *
- * @return Current lws hint
- */
- cl::NDRange lws_hint() const
- {
- return _lws_hint;
- }
-
- /** Get the configuration ID
- *
- * @note The configuration ID can be used by the caller to distinguish different calls of the same OpenCL kernel.
- * In particular, this method can be used by CLScheduler to keep track of the best LWS for each configuration of the same kernel.
- * The configuration ID should be provided only for the kernels potentially affected by the LWS geometry
- *
- * @note This method should be called after the configuration of the kernel
- *
- * @return configuration id string
- */
- const std::string &config_id() const
- {
- return _config_id;
- }
-
- /** Set the targeted GPU architecture
- *
- * @param[in] target The targeted GPU architecture
- */
- void set_target(GPUTarget target)
- {
- _target = target;
- }
-
- /** Set the targeted GPU architecture according to the CL device
- *
- * @param[in] device A CL device
- */
- void set_target(cl::Device &device);
-
- /** Get the targeted GPU architecture
- *
- * @return The targeted GPU architecture.
- */
- GPUTarget get_target() const
- {
- return _target;
- }
-
- /** Get the maximum workgroup size for the device the CLKernelLibrary uses.
- *
- * @return The maximum workgroup size value.
- */
- size_t get_max_workgroup_size();
- /** Get the global work size given an execution window
- *
- * @param[in] window Execution window
- *
- * @return Global work size of the given execution window
- */
- static cl::NDRange gws_from_window(const Window &window);
-
-private:
- /** Add the passed array's parameters to the object's kernel's arguments starting from the index idx.
- *
- * @param[in,out] idx Index at which to start adding the array's arguments. Will be incremented by the number of kernel arguments set.
- * @param[in] array Array to set as an argument of the object's kernel.
- * @param[in] strides @ref Strides object containing stride of each dimension in bytes.
- * @param[in] num_dimensions Number of dimensions of the @p array.
- * @param[in] window Window the kernel will be executed on.
- */
- template <typename T, unsigned int dimension_size>
- void add_array_argument(unsigned int &idx, const ICLArray<T> *array, const Strides &strides, unsigned int num_dimensions, const Window &window);
- /** Add the passed tensor's parameters to the object's kernel's arguments starting from the index idx.
- *
- * @param[in,out] idx Index at which to start adding the tensor's arguments. Will be incremented by the number of kernel arguments set.
- * @param[in] tensor Tensor to set as an argument of the object's kernel.
- * @param[in] window Window the kernel will be executed on.
- */
- template <unsigned int dimension_size>
- void add_tensor_argument(unsigned int &idx, const ICLTensor *tensor, const Window &window);
-
-protected:
- cl::Kernel _kernel; /**< OpenCL kernel to run */
- GPUTarget _target; /**< The targeted GPU */
- std::string _config_id; /**< Configuration ID */
- size_t _max_workgroup_size; /**< The maximum workgroup size for this kernel */
-private:
- cl::NDRange _lws_hint; /**< Local workgroup size hint for the OpenCL kernel */
-};
-
-/** Add the kernel to the command queue with the given window.
- *
- * @note Depending on the size of the window, this might translate into several jobs being enqueued.
- *
- * @note If kernel->kernel() is empty then the function will return without adding anything to the queue.
- *
- * @param[in,out] queue OpenCL command queue.
- * @param[in] kernel Kernel to enqueue
- * @param[in] window Window the kernel has to process.
- * @param[in] lws_hint (Optional) Local workgroup size requested. Default is based on the device target.
- * @param[in] use_dummy_work_items (Optional) Use dummy work items in order to have two dimensional power of two NDRange. Default is false
- * Note: it is the kernel's responsibility to check if the work-item is out-of-range
- *
- * @note If any dimension of the lws is greater than the global workgroup size then no lws will be passed.
- */
-void enqueue(cl::CommandQueue &queue, ICLKernel &kernel, const Window &window, const cl::NDRange &lws_hint = CLKernelLibrary::get().default_ndrange(), bool use_dummy_work_items = false);
-
-/** Add the passed array's parameters to the object's kernel's arguments starting from the index idx.
- *
- * @param[in,out] idx Index at which to start adding the array's arguments. Will be incremented by the number of kernel arguments set.
- * @param[in] array Array to set as an argument of the object's kernel.
- * @param[in] strides @ref Strides object containing stride of each dimension in bytes.
- * @param[in] num_dimensions Number of dimensions of the @p array.
- * @param[in] window Window the kernel will be executed on.
- */
-template <typename T, unsigned int dimension_size>
-void ICLKernel::add_array_argument(unsigned &idx, const ICLArray<T> *array, const Strides &strides, unsigned int num_dimensions, const Window &window)
-{
- ARM_COMPUTE_ERROR_ON(array == nullptr);
-
- // Calculate offset to the start of the window
- unsigned int offset_first_element = 0;
-
- for(unsigned int n = 0; n < num_dimensions; ++n)
- {
- offset_first_element += window[n].start() * strides[n];
- }
-
- unsigned int idx_start = idx;
- _kernel.setArg(idx++, array->cl_buffer());
-
- for(unsigned int dimension = 0; dimension < dimension_size; dimension++)
- {
- _kernel.setArg<cl_uint>(idx++, strides[dimension]);
- _kernel.setArg<cl_uint>(idx++, strides[dimension] * window[dimension].step());
- }
-
- _kernel.setArg<cl_uint>(idx++, offset_first_element);
-
- ARM_COMPUTE_ERROR_ON_MSG_VAR(idx_start + num_arguments_per_array<dimension_size>() != idx,
- "add_%dD_array_argument() is supposed to add exactly %d arguments to the kernel", dimension_size, num_arguments_per_array<dimension_size>());
- ARM_COMPUTE_UNUSED(idx_start);
-}
-}
-#endif /*ARM_COMPUTE_ICLKERNEL_H */
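
The argument bookkeeping in the removed ICLKernel above can be checked by hand: add_array_argument() sets one buffer argument, one (stride, stride * step) pair per dimension, and one first-element offset, which is exactly the 2 + 2 * dimension_size reported by num_arguments_per_tensor(). For a 3D tensor the layout enqueued by the loop is:

    // idx+0          buffer (cl_mem)
    // idx+1, idx+2   stride_x, stride_x * step_x
    // idx+3, idx+4   stride_y, stride_y * step_y
    // idx+5, idx+6   stride_z, stride_z * step_z
    // idx+7          offset_first_element_in_bytes
    static_assert(arm_compute::ICLKernel::num_arguments_per_3D_tensor() == 2 + 2 * 3,
                  "a 3D tensor contributes 8 kernel arguments");
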
diff --git a/arm_compute/core/CL/ICLLut.h b/arm_compute/core/CL/ICLLut.h
deleted file mode 100644
index 430adb8727..0000000000
--- a/arm_compute/core/CL/ICLLut.h
+++ /dev/null
@@ -1,94 +0,0 @@
-/*
- * Copyright (c) 2016-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_ICLLUT_H
-#define ARM_COMPUTE_ICLLUT_H
-
-#include "arm_compute/core/ILut.h"
-
-#include <cstdint>
-
-namespace cl
-{
-class Buffer;
-class CommandQueue;
-}
-
-namespace arm_compute
-{
-/** Interface for OpenCL LUT */
-class ICLLut : public ILut
-{
-public:
- ICLLut();
- ICLLut(const ICLLut &) = delete;
- ICLLut &operator=(const ICLLut &) = delete;
-
- /** Interface to be implemented by the child class to return a reference to the OpenCL buffer containing the lut's data.
- *
- * @return A reference to an OpenCL buffer containing the lut's data.
- */
- virtual const cl::Buffer &cl_buffer() const = 0;
- /** Enqueue a map operation of the allocated buffer on the given queue.
- *
- * @param[in,out] q The CL command queue to use for the mapping operation.
- * @param[in] blocking If true, then the mapping will be ready to use by the time
- * this method returns, else it is the caller's responsibility
- * to flush the queue and wait for the mapping operation to have completed before using the returned mapping pointer.
- */
- void map(cl::CommandQueue &q, bool blocking = true);
- /** Enqueue an unmap operation of the allocated and mapped buffer on the given queue.
- *
- * @note This method simply enqueues the unmap operation, it is the caller's responsibility to flush the queue and make sure the unmap is finished before
- * the memory is accessed by the device.
- *
- * @param[in,out] q The CL command queue to use for the mapping operation.
- */
- void unmap(cl::CommandQueue &q);
-
- // Inherited methods overridden:
- uint8_t *buffer() const override;
-
-protected:
- /** Method to be implemented by the child class to map the OpenCL buffer
- *
- * @param[in,out] q The CL command queue to use for the mapping operation.
- * @param[in] blocking If true, then the mapping will be ready to use by the time
- * this method returns, else it is the caller's responsibility
- * to flush the queue and wait for the mapping operation to have completed before using the returned mapping pointer.
- */
- virtual uint8_t *do_map(cl::CommandQueue &q, bool blocking) = 0;
- /** Method to be implemented by the child class to unmap the OpenCL buffer
- *
- * @note This method simply enqueues the unmap operation, it is the caller's responsibility to flush the queue and make sure the unmap is finished before
- * the memory is accessed by the device.
- *
- * @param[in,out] q The CL command queue to use for the mapping operation.
- */
- virtual void do_unmap(cl::CommandQueue &q) = 0;
-
-private:
- uint8_t *_mapping;
-};
-}
-#endif /*ARM_COMPUTE_ICLLUT_H */
diff --git a/arm_compute/core/CL/ICLMultiHOG.h b/arm_compute/core/CL/ICLMultiHOG.h
deleted file mode 100644
index f9213018a2..0000000000
--- a/arm_compute/core/CL/ICLMultiHOG.h
+++ /dev/null
@@ -1,56 +0,0 @@
-/*
- * Copyright (c) 2017-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_ICLMULTIHOG_H
-#define ARM_COMPUTE_ICLMULTIHOG_H
-
-#include "arm_compute/core/CL/ICLHOG.h"
-#include "arm_compute/core/IMultiHOG.h"
-
-namespace arm_compute
-{
-/** Interface for storing multiple HOG data-objects */
-class ICLMultiHOG : public IMultiHOG
-{
-public:
- /** Return a pointer to the requested OpenCL HOG model
- *
- * @param[in] index The index of the wanted OpenCL HOG model.
- *
- * @return A pointer to the HOG model
- */
- virtual ICLHOG *cl_model(size_t index) = 0;
- /** Return a constant pointer to the requested OpenCL HOG model
- *
- * @param[in] index The index of the wanted OpenCL HOG model.
- *
- * @return A constant pointer to the OpenCL HOG model
- */
- virtual const ICLHOG *cl_model(size_t index) const = 0;
-
- // Inherited methods overridden:
- IHOG *model(size_t index) override;
- const IHOG *model(size_t index) const override;
-};
-}
-#endif /*ARM_COMPUTE_ICLMULTIHOG_H */
diff --git a/arm_compute/core/CL/ICLMultiImage.h b/arm_compute/core/CL/ICLMultiImage.h
deleted file mode 100644
index 0233600e73..0000000000
--- a/arm_compute/core/CL/ICLMultiImage.h
+++ /dev/null
@@ -1,59 +0,0 @@
-/*
- * Copyright (c) 2016-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_ICLMULTIIMAGE_H
-#define ARM_COMPUTE_ICLMULTIIMAGE_H
-
-#include "arm_compute/core/IMultiImage.h"
-
-namespace arm_compute
-{
-class ICLTensor;
-/** Interface for OpenCL images */
-using ICLImage = ICLTensor;
-
-/** Interface for OpenCL multi-planar images */
-class ICLMultiImage : public IMultiImage
-{
-public:
- /** Return a pointer to the requested OpenCL plane of the image.
- *
- * @param[in] index The index of the wanted plane.
- *
- * @return A pointer to the OpenCL plane
- */
- virtual ICLImage *cl_plane(unsigned int index) = 0;
- /** Return a constant pointer to the requested OpenCL plane of the image.
- *
- * @param[in] index The index of the wanted plane.
- *
- * @return A constant pointer to the OpenCL plane
- */
- virtual const ICLImage *cl_plane(unsigned int index) const = 0;
-
- // Inherited methods overridden:
- IImage *plane(unsigned int index) override;
- const IImage *plane(unsigned int index) const override;
-};
-}
-#endif /*ARM_COMPUTE_ICLMULTIIMAGE_H */
diff --git a/arm_compute/core/CL/ICLSimple2DKernel.h b/arm_compute/core/CL/ICLSimple2DKernel.h
deleted file mode 100644
index bd423303bb..0000000000
--- a/arm_compute/core/CL/ICLSimple2DKernel.h
+++ /dev/null
@@ -1,41 +0,0 @@
-/*
- * Copyright (c) 2017-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_ICLSIMPLE2DKERNEL_H
-#define ARM_COMPUTE_ICLSIMPLE2DKERNEL_H
-
-#include "arm_compute/core/CL/ICLSimpleKernel.h"
-
-namespace arm_compute
-{
-class ICLTensor;
-
-/** Interface for simple OpenCL kernels having 1 tensor input and 1 tensor output. This interface can be used when the work-item processes a 2D tile */
-class ICLSimple2DKernel : public ICLSimpleKernel
-{
-public:
- // Inherited methods overridden:
- void run(const Window &window, cl::CommandQueue &queue) override;
-};
-}
-#endif /*ARM_COMPUTE_ICLSIMPLE2DKERNEL_H */
diff --git a/arm_compute/core/CL/ICLSimple3DKernel.h b/arm_compute/core/CL/ICLSimple3DKernel.h
deleted file mode 100644
index e25051f578..0000000000
--- a/arm_compute/core/CL/ICLSimple3DKernel.h
+++ /dev/null
@@ -1,43 +0,0 @@
-/*
- * Copyright (c) 2017-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_ICLSIMPLE3DKERNEL_H
-#define ARM_COMPUTE_ICLSIMPLE3DKERNEL_H
-
-#include "arm_compute/core/CL/ICLSimple2DKernel.h"
-
-namespace arm_compute
-{
-class ICLTensor;
-
-/** Interface for simple OpenCL kernels having 1 tensor input and 1 tensor output.
- * Both input tensor and output tensor must have at least 3 dimensions.
- */
-class ICLSimple3DKernel : public ICLSimple2DKernel
-{
-public:
- // Inherited methods overridden:
- void run(const Window &window, cl::CommandQueue &queue) override;
-};
-}
-#endif /*ARM_COMPUTE_ICLSIMPLE3DKERNEL_H */
diff --git a/arm_compute/core/CL/ICLSimpleKernel.h b/arm_compute/core/CL/ICLSimpleKernel.h
deleted file mode 100644
index e8b6f0a81c..0000000000
--- a/arm_compute/core/CL/ICLSimpleKernel.h
+++ /dev/null
@@ -1,66 +0,0 @@
-/*
- * Copyright (c) 2016-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_ICLSIMPLEKERNEL_H
-#define ARM_COMPUTE_ICLSIMPLEKERNEL_H
-
-#include "arm_compute/core/CL/ICLKernel.h"
-#include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/Helpers.h"
-
-namespace arm_compute
-{
-/** Interface for simple OpenCL kernels having 1 tensor input and 1 tensor output */
-class ICLSimpleKernel : public ICLKernel
-{
-public:
- /** Constructor. */
- ICLSimpleKernel();
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- ICLSimpleKernel(const ICLSimpleKernel &) = delete;
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- ICLSimpleKernel &operator=(const ICLSimpleKernel &) = delete;
- /** Allow instances of this class to be moved */
- ICLSimpleKernel(ICLSimpleKernel &&) = default;
- /** Allow instances of this class to be moved */
- ICLSimpleKernel &operator=(ICLSimpleKernel &&) = default;
- /** Default destructor */
- ~ICLSimpleKernel() = default;
-
- /** Configure the kernel
- *
- * @param[in] input Source tensor.
- * @param[out] output Destination tensor.
- * @param[in] num_elems_processed_per_iteration Number of processed elements per iteration.
- * @param[in] border_undefined (Optional) True if the border mode is undefined. False if it's replicate or constant.
- * @param[in] border_size (Optional) Size of the border.
- */
- void configure(const ICLTensor *input, ICLTensor *output, unsigned int num_elems_processed_per_iteration, bool border_undefined = false, const BorderSize &border_size = BorderSize());
-
-protected:
- const ICLTensor *_input;
- ICLTensor *_output;
-};
-}
-
-#endif /*ARM_COMPUTE_ICLSIMPLEKERNEL_H */
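
Derived kernels used ICLSimpleKernel::configure() to store the input/output pointers and set up the execution window. A hypothetical derived kernel follows; MyCopyKernel and the "copy_tensor" program name are placeholders, and the create_kernel() call is assumed to follow the CLKernelLibrary API of this release:

    #include "arm_compute/core/CL/CLKernelLibrary.h"
    #include <set>

    void MyCopyKernel::configure(const arm_compute::ICLTensor *input, arm_compute::ICLTensor *output)
    {
        std::set<std::string> build_opts{}; // no compile-time options in this sketch
        _kernel = static_cast<cl::Kernel>(arm_compute::CLKernelLibrary::get().create_kernel("copy_tensor", build_opts));
        constexpr unsigned int num_elems_processed_per_iteration = 16;
        ICLSimpleKernel::configure(input, output, num_elems_processed_per_iteration);
    }
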
diff --git a/arm_compute/core/CL/ICLTensor.h b/arm_compute/core/CL/ICLTensor.h
index 001f892231..8de5423762 100644
--- a/arm_compute/core/CL/ICLTensor.h
+++ b/arm_compute/core/CL/ICLTensor.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2016-2019 ARM Limited.
+ * Copyright (c) 2016-2019, 2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -24,9 +24,8 @@
#ifndef ARM_COMPUTE_ICLTENSOR_H
#define ARM_COMPUTE_ICLTENSOR_H
-#include "arm_compute/core/ITensor.h"
-
#include "arm_compute/core/CL/CLTypes.h"
+#include "arm_compute/core/ITensor.h"
#include <cstdint>
@@ -34,7 +33,7 @@ namespace cl
{
class Buffer;
class CommandQueue;
-}
+} // namespace cl
namespace arm_compute
{
@@ -71,8 +70,6 @@ public:
* @param[in] blocking If true, then the mapping will be ready to use by the time
* this method returns, else it is the caller's responsibility
* to flush the queue and wait for the mapping operation to have completed before using the returned mapping pointer.
- *
- * @return The mapping address.
*/
void map(cl::CommandQueue &q, bool blocking = true);
/** Enqueue an unmap operation of the allocated and mapped buffer on the given queue.
@@ -115,5 +112,5 @@ private:
};
using ICLImage = ICLTensor;
-}
+} // namespace arm_compute
#endif /*ARM_COMPUTE_ICLTENSOR_H */
diff --git a/arm_compute/core/CL/OpenCL.h b/arm_compute/core/CL/OpenCL.h
index 72cbb3d2b2..8b5bf97099 100644
--- a/arm_compute/core/CL/OpenCL.h
+++ b/arm_compute/core/CL/OpenCL.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2016-2020 ARM Limited.
+ * Copyright (c) 2016-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -21,8 +21,8 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
-#ifndef ARM_COMPUTE_OPENCL_H
-#define ARM_COMPUTE_OPENCL_H
+#ifndef ACL_ARM_COMPUTE_CORE_CL_OPENCL_H
+#define ACL_ARM_COMPUTE_CORE_CL_OPENCL_H
#include <string>
#include <utility>
@@ -31,8 +31,8 @@
#ifndef ARM_COMPUTE_NO_EXCEPTIONS
#define CL_HPP_ENABLE_EXCEPTIONS
#endif // ARM_COMPUTE_NO_EXCEPTIONS
-#define CL_TARGET_OPENCL_VERSION 200
-#define CL_HPP_TARGET_OPENCL_VERSION 110
+#define CL_TARGET_OPENCL_VERSION 300
+#define CL_HPP_TARGET_OPENCL_VERSION 110
#define CL_HPP_MINIMUM_OPENCL_VERSION 110
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Weffc++"
@@ -40,8 +40,8 @@
#pragma GCC diagnostic ignored "-Wunused-parameter"
#if defined(__GNUG__) && __GNUG__ >= 8
#pragma GCC diagnostic ignored "-Wcatch-value"
-#endif // defined(__GNUG__) && __GNUG__ >= 8
-#include <CL/cl2.hpp>
+#endif // defined(__GNUG__) && __GNUG__ >= 8
+#include <CL/opencl.hpp> // include new hpp header instead of cl2.hpp
#pragma GCC diagnostic pop
namespace cl
@@ -73,25 +73,27 @@ public:
* @return The static instance of CLSymbols.
*/
static CLSymbols &get();
- /** Load symbols from the given OpenCL library path.
+ /** Load the OpenCL symbols from the first of the provided libraries that loads successfully.
*
- * @param[in] library Path to the OpenCL library.
+ * @param[in] libraries_filenames Vector containing the filenames of the libraries to be loaded.
+ * @param[in] use_loader Use the symbol-loader function loadOpenCLPointer to resolve the symbols.
*
- * @return True if loading the library is successful.
+ * @return True if loading the library is successful. False if none of the provided libraries could be loaded.
*/
- bool load(const std::string &library);
+ bool load(const std::vector<std::string> &libraries_filenames, bool use_loader = false);
/** Load symbols from any of the default OpenCL library names.
+ * If none of the default libraries can be loaded, this method will print a warning message and return false.
*
* @return True if loading any library is successful.
*/
bool load_default();
-#define DECLARE_FUNCTION_PTR(func_name) \
- std::function<decltype(func_name)> func_name##_ptr = nullptr
+#define DECLARE_FUNCTION_PTR(func_name) std::function<decltype(func_name)> func_name##_ptr = nullptr
DECLARE_FUNCTION_PTR(clCreateContext);
DECLARE_FUNCTION_PTR(clCreateContextFromType);
DECLARE_FUNCTION_PTR(clCreateCommandQueue);
+ DECLARE_FUNCTION_PTR(clCreateCommandQueueWithProperties);
DECLARE_FUNCTION_PTR(clGetContextInfo);
DECLARE_FUNCTION_PTR(clBuildProgram);
DECLARE_FUNCTION_PTR(clEnqueueNDRangeKernel);
@@ -123,6 +125,7 @@ public:
DECLARE_FUNCTION_PTR(clGetDeviceIDs);
DECLARE_FUNCTION_PTR(clGetMemObjectInfo);
DECLARE_FUNCTION_PTR(clRetainEvent);
+ DECLARE_FUNCTION_PTR(clGetPlatformInfo);
DECLARE_FUNCTION_PTR(clGetPlatformIDs);
DECLARE_FUNCTION_PTR(clGetKernelWorkGroupInfo);
DECLARE_FUNCTION_PTR(clGetCommandQueueInfo);
@@ -135,6 +138,18 @@ public:
DECLARE_FUNCTION_PTR(clEnqueueMarker);
DECLARE_FUNCTION_PTR(clWaitForEvents);
DECLARE_FUNCTION_PTR(clCreateImage);
+ DECLARE_FUNCTION_PTR(clSetKernelExecInfo);
+ DECLARE_FUNCTION_PTR(clGetExtensionFunctionAddressForPlatform);
+
+ // Command buffer and mutable dispatch command buffer extensions
+ DECLARE_FUNCTION_PTR(clCreateCommandBufferKHR);
+ DECLARE_FUNCTION_PTR(clRetainCommandBufferKHR);
+ DECLARE_FUNCTION_PTR(clReleaseCommandBufferKHR);
+ DECLARE_FUNCTION_PTR(clFinalizeCommandBufferKHR);
+ DECLARE_FUNCTION_PTR(clEnqueueCommandBufferKHR);
+ DECLARE_FUNCTION_PTR(clCommandNDRangeKernelKHR);
+
+ DECLARE_FUNCTION_PTR(clUpdateMutableCommandsKHR);
// Third-party extensions
DECLARE_FUNCTION_PTR(clImportMemoryARM);
@@ -145,4 +160,4 @@ private:
std::pair<bool, bool> _loaded;
};
} // namespace arm_compute
-#endif /* ARM_COMPUTE_OPENCL_H */
+#endif // ACL_ARM_COMPUTE_CORE_CL_OPENCL_H
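
The reworked load() above takes a list of candidate filenames rather than a single path and stops at the first library that loads. A hedged usage sketch; the Linux library names are an assumption, and load_default() remains the usual entry point:

    #include "arm_compute/core/CL/OpenCL.h"

    bool init_opencl_symbols()
    {
        using arm_compute::CLSymbols;
        // Try explicit candidates first (first successful load wins), then fall
        // back to the library's built-in default names.
        if(CLSymbols::get().load({"libOpenCL.so", "libOpenCL.so.1"}, /* use_loader */ false))
        {
            return true;
        }
        return CLSymbols::get().load_default();
    }
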
diff --git a/arm_compute/core/CL/gemm/CLGEMMHelpers.h b/arm_compute/core/CL/gemm/CLGEMMHelpers.h
deleted file mode 100644
index dcda732c2d..0000000000
--- a/arm_compute/core/CL/gemm/CLGEMMHelpers.h
+++ /dev/null
@@ -1,53 +0,0 @@
-/*
- * Copyright (c) 2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CLGEMMHELPERS_H
-#define ARM_COMPUTE_CLGEMMHELPERS_H
-
-#include "arm_compute/core/Types.h"
-
-namespace arm_compute
-{
-namespace cl_gemm
-{
-/** Configure @ref GEMMLHSMatrixInfo and @ref GEMMRHSMatrixInfo
- *
- * @param[in] m Number of rows (M) in the LHS matrix not reshaped
- * @param[in] n Number of columns (N) in the RHS matrix not reshaped
- * @param[in] m0 Number of rows processed by each thread/work-item
- * @param[in] n0 Number of columns processed by each thread/work-item
- * @param[in] k0 Number of inner accumulations performed by each thread/work-item
- * @param[in] v0 Number of vertical blocks of size (m0xk0) stored on the same output row
- * @param[in] h0 Number of horizontal blocks of size (k0xn0) stored on the same output row
- * @param[in] lhs_interleave True if the v0 (m0xk0) blocks have to be interleaved in the output row
- * @param[in] rhs_interleave True if the h0 (k0xn0) blocks have to be interleaved in the output row
- * @param[in] lhs_transpose True if the (m0xk0) block has to be transposed before being stored
- * @param[in] rhs_transpose True if the (k0xn0) block has to be transposed before being stored
- *
- * @return @ref GEMMLHSMatrixInfo and @ref GEMMRHSMatrixInfo
- */
-std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure_lhs_rhs_info(unsigned int m, unsigned int n, unsigned int m0, unsigned int n0, unsigned int k0, unsigned int v0, unsigned int h0,
- bool lhs_interleave, bool rhs_interleave, bool lhs_transpose, bool rhs_transpose);
-} // namespace cl_gemm
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_CLGEMMHELPERS_H */
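
A hedged call sketch for the removed helper above; the block sizes are illustrative rather than tuned values:

    void pick_blocking()
    {
        using namespace arm_compute;
        // 4x4 output blocks with k0 = 4; v0/h0 = 1 means no block grouping, and
        // only the RHS blocks are transposed on store.
        const auto lhs_rhs = cl_gemm::configure_lhs_rhs_info(
            /* m */ 256, /* n */ 256,
            /* m0 */ 4, /* n0 */ 4, /* k0 */ 4,
            /* v0 */ 1, /* h0 */ 1,
            /* lhs_interleave */ false, /* rhs_interleave */ false,
            /* lhs_transpose */ false, /* rhs_transpose */ true);
        const GEMMLHSMatrixInfo &lhs_info = lhs_rhs.first;
        const GEMMRHSMatrixInfo &rhs_info = lhs_rhs.second;
        static_cast<void>(lhs_info);
        static_cast<void>(rhs_info);
    }
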
diff --git a/arm_compute/core/CL/gemm/native/CLGEMMNativeKernelConfiguration.h b/arm_compute/core/CL/gemm/native/CLGEMMNativeKernelConfiguration.h
deleted file mode 100644
index a6341e5094..0000000000
--- a/arm_compute/core/CL/gemm/native/CLGEMMNativeKernelConfiguration.h
+++ /dev/null
@@ -1,65 +0,0 @@
-/*
- * Copyright (c) 2019-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CLGEMMNATIVEKERNELCONFIGURATION_H
-#define ARM_COMPUTE_CLGEMMNATIVEKERNELCONFIGURATION_H
-
-#include "arm_compute/core/CL/ICLGEMMKernelConfiguration.h"
-#include "arm_compute/core/CL/gemm/native/CLGEMMNativeKernelConfigurationBifrost.h"
-#include "arm_compute/core/CL/gemm/native/CLGEMMNativeKernelConfigurationMidgard.h"
-#include "arm_compute/core/CL/gemm/native/CLGEMMNativeKernelConfigurationValhall.h"
-
-#include <memory>
-
-namespace arm_compute
-{
-namespace cl_gemm
-{
-/** CLGEMMNative factory class */
-class CLGEMMNativeKernelConfigurationFactory final
-{
-public:
- /** Static method to construct CLGEMMNative kernel object accordingly with the GPU target
- *
- * @param[in] gpu GPU target
- *
- * @return CLGEMMNative kernel configuration class
- */
- static std::unique_ptr<ICLGEMMKernelConfiguration> create(GPUTarget gpu)
- {
- switch(get_arch_from_target(gpu))
- {
- case GPUTarget::MIDGARD:
- return support::cpp14::make_unique<CLGEMMNativeKernelConfigurationMidgard>(gpu);
- case GPUTarget::BIFROST:
- return support::cpp14::make_unique<CLGEMMNativeKernelConfigurationBifrost>(gpu);
- case GPUTarget::VALHALL:
- return support::cpp14::make_unique<CLGEMMNativeKernelConfigurationValhall>(gpu);
- default:
- ARM_COMPUTE_ERROR("Not supported GPU target");
- }
- }
-};
-} // namespace cl_gemm
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_CLGEMMNATIVEKERNELCONFIGURATION_H */
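
Use of the removed factory above was a two-step lookup: build the per-architecture heuristic, then query it for a concrete problem shape. A sketch with illustrative sizes; GPUTarget::G76 resolves to the Bifrost heuristic through get_arch_from_target():

    auto make_native_info()
    {
        using namespace arm_compute;
        auto config = cl_gemm::CLGEMMNativeKernelConfigurationFactory::create(GPUTarget::G76);
        return config->configure(/* m */ 128, /* n */ 128, /* k */ 64, /* b */ 1, DataType::F32);
    }
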
diff --git a/arm_compute/core/CL/gemm/native/CLGEMMNativeKernelConfigurationBifrost.h b/arm_compute/core/CL/gemm/native/CLGEMMNativeKernelConfigurationBifrost.h
deleted file mode 100644
index 5b2abe6f0f..0000000000
--- a/arm_compute/core/CL/gemm/native/CLGEMMNativeKernelConfigurationBifrost.h
+++ /dev/null
@@ -1,56 +0,0 @@
-/*
- * Copyright (c) 2019-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CLGEMMNATIVEKERNELCONFIGURATIONBIFROST_H
-#define ARM_COMPUTE_CLGEMMNATIVEKERNELCONFIGURATIONBIFROST_H
-
-#include "arm_compute/core/CL/ICLGEMMKernelConfiguration.h"
-
-namespace arm_compute
-{
-namespace cl_gemm
-{
-/** Bifrost based OpenCL GEMMNative configuration */
-class CLGEMMNativeKernelConfigurationBifrost final : public ICLGEMMKernelConfiguration
-{
-public:
- /** Constructor
- *
- * @param[in] gpu GPU target
- */
- CLGEMMNativeKernelConfigurationBifrost(GPUTarget gpu);
-
- // Inherited overridden method
- std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure(unsigned int m, unsigned int n, unsigned int k, unsigned int b, DataType data_type) override;
-
-private:
- std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure_G71_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b);
- std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure_G71_u8(unsigned int m, unsigned int n, unsigned int k, unsigned int b);
- std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure_G76_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b);
- std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure_G76_u8(unsigned int m, unsigned int n, unsigned int k, unsigned int b);
- std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure_default_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b);
- std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure_default_u8(unsigned int m, unsigned int n, unsigned int k, unsigned int b);
-};
-} // namespace cl_gemm
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_CLGEMMNATIVEKERNELCONFIGURATIONBIFROST_H */
diff --git a/arm_compute/core/CL/gemm/native/CLGEMMNativeKernelConfigurationMidgard.h b/arm_compute/core/CL/gemm/native/CLGEMMNativeKernelConfigurationMidgard.h
deleted file mode 100644
index 0e95a15613..0000000000
--- a/arm_compute/core/CL/gemm/native/CLGEMMNativeKernelConfigurationMidgard.h
+++ /dev/null
@@ -1,51 +0,0 @@
-/*
- * Copyright (c) 2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CLGEMMNATIVEKERNELCONFIGURATIONMIDGARD_H
-#define ARM_COMPUTE_CLGEMMNATIVEKERNELCONFIGURATIONMIDGARD_H
-
-#include "arm_compute/core/CL/ICLGEMMKernelConfiguration.h"
-
-namespace arm_compute
-{
-namespace cl_gemm
-{
-/** Midgard based OpenCL GEMMNative configuration */
-class CLGEMMNativeKernelConfigurationMidgard final : public ICLGEMMKernelConfiguration
-{
-public:
- /** Constructor
- *
- * @param[in] gpu GPU target
- */
- CLGEMMNativeKernelConfigurationMidgard(GPUTarget gpu);
-
- // Inherited overridden method
- std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure(unsigned int m, unsigned int n, unsigned int k, unsigned int b, DataType data_type) override;
-
-private:
- std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> default_q8(unsigned int m, unsigned int n, unsigned int k, unsigned int b);
-};
-} // namespace cl_gemm
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_CLGEMMNATIVEKERNELCONFIGURATIONMIDGARD_H */
diff --git a/arm_compute/core/CL/gemm/native/CLGEMMNativeKernelConfigurationValhall.h b/arm_compute/core/CL/gemm/native/CLGEMMNativeKernelConfigurationValhall.h
deleted file mode 100644
index e739997b3a..0000000000
--- a/arm_compute/core/CL/gemm/native/CLGEMMNativeKernelConfigurationValhall.h
+++ /dev/null
@@ -1,53 +0,0 @@
-/*
- * Copyright (c) 2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CLGEMMNATIVEKERNELCONFIGURATIONVALHALL_H
-#define ARM_COMPUTE_CLGEMMNATIVEKERNELCONFIGURATIONVALHALL_H
-
-#include "arm_compute/core/CL/ICLGEMMKernelConfiguration.h"
-
-namespace arm_compute
-{
-namespace cl_gemm
-{
-/** Valhall based OpenCL GEMMNative configuration */
-class CLGEMMNativeKernelConfigurationValhall final : public ICLGEMMKernelConfiguration
-{
-public:
- /** Constructor
- *
- * @param[in] gpu GPU target
- */
- CLGEMMNativeKernelConfigurationValhall(GPUTarget gpu);
-
- // Inherited overridden method
- std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure(unsigned int m, unsigned int n, unsigned int k, unsigned int b, DataType data_type) override;
-
-private:
- std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure_G77_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b);
- std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure_G77_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b);
- std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure_G77_u8(unsigned int m, unsigned int n, unsigned int k, unsigned int b);
-};
-} // namespace cl_gemm
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_CLGEMMNATIVEKERNELCONFIGURATIONVALHALL_H */
diff --git a/arm_compute/core/CL/gemm/reshaped/CLGEMMReshapedKernelConfiguration.h b/arm_compute/core/CL/gemm/reshaped/CLGEMMReshapedKernelConfiguration.h
deleted file mode 100644
index 10dc9aefdb..0000000000
--- a/arm_compute/core/CL/gemm/reshaped/CLGEMMReshapedKernelConfiguration.h
+++ /dev/null
@@ -1,63 +0,0 @@
-/*
- * Copyright (c) 2019-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CLGEMMRESHAPEDKERNELCONFIGURATION_H
-#define ARM_COMPUTE_CLGEMMRESHAPEDKERNELCONFIGURATION_H
-
-#include "arm_compute/core/CL/ICLGEMMKernelConfiguration.h"
-#include "arm_compute/core/CL/gemm/reshaped/CLGEMMReshapedKernelConfigurationBifrost.h"
-#include "arm_compute/core/CL/gemm/reshaped/CLGEMMReshapedKernelConfigurationValhall.h"
-
-#include <memory>
-
-namespace arm_compute
-{
-namespace cl_gemm
-{
-/** CLGEMMReshaped factory class */
-class CLGEMMReshapedKernelConfigurationFactory final
-{
-public:
- /** Static method to construct the CLGEMMReshaped kernel configuration class according to the GPU target
- *
- * @param[in] gpu GPU target
- *
- * @return CLGEMMReshaped kernel configuration class
- */
- static std::unique_ptr<ICLGEMMKernelConfiguration> create(GPUTarget gpu)
- {
- switch(get_arch_from_target(gpu))
- {
- case GPUTarget::MIDGARD:
- case GPUTarget::BIFROST:
- return support::cpp14::make_unique<CLGEMMReshapedKernelConfigurationBifrost>(gpu);
- case GPUTarget::VALHALL:
- return support::cpp14::make_unique<CLGEMMReshapedKernelConfigurationValhall>(gpu);
- default:
- ARM_COMPUTE_ERROR("Unsupported GPU target");
- }
- }
-};
-} // namespace cl_gemm
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_CLGEMMRESHAPEDKERNELCONFIGURATION_H */
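
One detail worth noting relative to the native factory above: this factory has no dedicated Midgard class, so MIDGARD falls through to the Bifrost configuration. A hedged sketch:

    // Sketch only: MIDGARD and BIFROST both map to the Bifrost configuration
    // in this factory, per the switch statement above.
    #include "arm_compute/core/CL/gemm/reshaped/CLGEMMReshapedKernelConfiguration.h"

    #include <memory>

    using namespace arm_compute;
    using namespace arm_compute::cl_gemm;

    std::unique_ptr<ICLGEMMKernelConfiguration> midgard_reshaped_config()
    {
        return CLGEMMReshapedKernelConfigurationFactory::create(GPUTarget::MIDGARD);
    }
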
diff --git a/arm_compute/core/CL/gemm/reshaped/CLGEMMReshapedKernelConfigurationBifrost.h b/arm_compute/core/CL/gemm/reshaped/CLGEMMReshapedKernelConfigurationBifrost.h
deleted file mode 100644
index 55742e3e56..0000000000
--- a/arm_compute/core/CL/gemm/reshaped/CLGEMMReshapedKernelConfigurationBifrost.h
+++ /dev/null
@@ -1,56 +0,0 @@
-/*
- * Copyright (c) 2019-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CLGEMMRESHAPEDKERNELCONFIGURATIONBIFROST_H
-#define ARM_COMPUTE_CLGEMMRESHAPEDKERNELCONFIGURATIONBIFROST_H
-
-#include "arm_compute/core/CL/ICLGEMMKernelConfiguration.h"
-
-namespace arm_compute
-{
-namespace cl_gemm
-{
-/** Bifrost based OpenCL GEMMReshaped configuration */
-class CLGEMMReshapedKernelConfigurationBifrost final : public ICLGEMMKernelConfiguration
-{
-public:
- /** Constructor
- *
- * @param[in] gpu GPU target
- */
- CLGEMMReshapedKernelConfigurationBifrost(GPUTarget gpu);
-
- // Inherited overridden method
- std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure(unsigned int m, unsigned int n, unsigned int k, unsigned int b, DataType data_type) override;
-
-private:
- std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure_G7x_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b);
- std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure_G76_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b);
- std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure_G7x_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b);
- std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure_G76_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b);
- std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure_G7x_u8(unsigned int m, unsigned int n, unsigned int k, unsigned int b);
- std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure_G76_u8(unsigned int m, unsigned int n, unsigned int k, unsigned int b);
-};
-} // namespace cl_gemm
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_CLGEMMRESHAPEDKERNELCONFIGURATIONBIFROST_H */
diff --git a/arm_compute/core/CL/gemm/reshaped/CLGEMMReshapedKernelConfigurationValhall.h b/arm_compute/core/CL/gemm/reshaped/CLGEMMReshapedKernelConfigurationValhall.h
deleted file mode 100644
index e65974144d..0000000000
--- a/arm_compute/core/CL/gemm/reshaped/CLGEMMReshapedKernelConfigurationValhall.h
+++ /dev/null
@@ -1,53 +0,0 @@
-/*
- * Copyright (c) 2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CLGEMMRESHAPEDKERNELCONFIGURATIONVALHALL_H
-#define ARM_COMPUTE_CLGEMMRESHAPEDKERNELCONFIGURATIONVALHALL_H
-
-#include "arm_compute/core/CL/ICLGEMMKernelConfiguration.h"
-
-namespace arm_compute
-{
-namespace cl_gemm
-{
-/** Valhall based OpenCL GEMMReshaped configuration */
-class CLGEMMReshapedKernelConfigurationValhall final : public ICLGEMMKernelConfiguration
-{
-public:
- /** Constructor
- *
- * @param[in] gpu GPU target
- */
- CLGEMMReshapedKernelConfigurationValhall(GPUTarget gpu);
-
- // Inherited overridden method
- std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure(unsigned int m, unsigned int n, unsigned int k, unsigned int b, DataType data_type) override;
-
-private:
- std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure_G77_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b);
- std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure_G77_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b);
- std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure_G77_u8(unsigned int m, unsigned int n, unsigned int k, unsigned int b);
-};
-} // namespace cl_gemm
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_CLGEMMRESHAPEDKERNELCONFIGURATIONVALHALL_H */
diff --git a/arm_compute/core/CL/gemm/reshaped_only_rhs/CLGEMMReshapedOnlyRHSKernelConfiguration.h b/arm_compute/core/CL/gemm/reshaped_only_rhs/CLGEMMReshapedOnlyRHSKernelConfiguration.h
deleted file mode 100644
index 7909726164..0000000000
--- a/arm_compute/core/CL/gemm/reshaped_only_rhs/CLGEMMReshapedOnlyRHSKernelConfiguration.h
+++ /dev/null
@@ -1,63 +0,0 @@
-/*
- * Copyright (c) 2019-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CLGEMMRESHAPEDONLYRHSKERNELCONFIGURATION_H
-#define ARM_COMPUTE_CLGEMMRESHAPEDONLYRHSKERNELCONFIGURATION_H
-
-#include "arm_compute/core/CL/ICLGEMMKernelConfiguration.h"
-#include "arm_compute/core/CL/gemm/reshaped_only_rhs/CLGEMMReshapedOnlyRHSKernelConfigurationBifrost.h"
-#include "arm_compute/core/CL/gemm/reshaped_only_rhs/CLGEMMReshapedOnlyRHSKernelConfigurationValhall.h"
-
-#include <memory>
-
-namespace arm_compute
-{
-namespace cl_gemm
-{
-/** CLGEMMReshapedOnlyRHS factory class */
-class CLGEMMReshapedOnlyRHSKernelConfigurationFactory final
-{
-public:
- /** Static method to construct the CLGEMMReshapedOnlyRHS kernel configuration class according to the GPU target
- *
- * @param[in] gpu GPU target
- *
- * @return CLGEMMReshapedOnlyRHS kernel configuration class
- */
- static std::unique_ptr<ICLGEMMKernelConfiguration> create(GPUTarget gpu)
- {
- switch(get_arch_from_target(gpu))
- {
- case GPUTarget::MIDGARD:
- case GPUTarget::BIFROST:
- return support::cpp14::make_unique<CLGEMMReshapedOnlyRHSKernelConfigurationBifrost>(gpu);
- case GPUTarget::VALHALL:
- return support::cpp14::make_unique<CLGEMMReshapedOnlyRHSKernelConfigurationValhall>(gpu);
- default:
- ARM_COMPUTE_ERROR("Unsupported GPU target");
- }
- }
-};
-} // namespace cl_gemm
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_CLGEMMRESHAPEDONLYRHSKERNELCONFIGURATION_H */
diff --git a/arm_compute/core/CL/gemm/reshaped_only_rhs/CLGEMMReshapedOnlyRHSKernelConfigurationBifrost.h b/arm_compute/core/CL/gemm/reshaped_only_rhs/CLGEMMReshapedOnlyRHSKernelConfigurationBifrost.h
deleted file mode 100644
index 044bdc7b18..0000000000
--- a/arm_compute/core/CL/gemm/reshaped_only_rhs/CLGEMMReshapedOnlyRHSKernelConfigurationBifrost.h
+++ /dev/null
@@ -1,59 +0,0 @@
-/*
- * Copyright (c) 2019-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CLGEMMRESHAPEDONLYRHSKERNELCONFIGURATIONBIFROST_H
-#define ARM_COMPUTE_CLGEMMRESHAPEDONLYRHSKERNELCONFIGURATIONBIFROST_H
-
-#include "arm_compute/core/CL/ICLGEMMKernelConfiguration.h"
-
-namespace arm_compute
-{
-namespace cl_gemm
-{
-/** Bifrost based OpenCL GEMMReshapedOnlyRHS configuration */
-class CLGEMMReshapedOnlyRHSKernelConfigurationBifrost final : public ICLGEMMKernelConfiguration
-{
-public:
- /** Constructor
- *
- * @param[in] gpu GPU target
- */
- CLGEMMReshapedOnlyRHSKernelConfigurationBifrost(GPUTarget gpu);
-
- // Inherited overridden method
- std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure(unsigned int m, unsigned int n, unsigned int k, unsigned int b, DataType data_type) override;
-
-private:
- std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure_G7x_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b);
- std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure_G76_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b);
- std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure_G51_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b);
- std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure_G7x_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b);
- std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure_G76_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b);
- std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure_G51_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b);
- std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure_G7x_u8(unsigned int m, unsigned int n, unsigned int k, unsigned int b);
- std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure_G76_u8(unsigned int m, unsigned int n, unsigned int k, unsigned int b);
- std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure_G51_u8(unsigned int m, unsigned int n, unsigned int k, unsigned int b);
-};
-} // namespace cl_gemm
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_CLGEMMRESHAPEDONLYRHSKERNELCONFIGURATIONBIFROST_H */
diff --git a/arm_compute/core/CL/gemm/reshaped_only_rhs/CLGEMMReshapedOnlyRHSKernelConfigurationValhall.h b/arm_compute/core/CL/gemm/reshaped_only_rhs/CLGEMMReshapedOnlyRHSKernelConfigurationValhall.h
deleted file mode 100644
index 6dba6fdb00..0000000000
--- a/arm_compute/core/CL/gemm/reshaped_only_rhs/CLGEMMReshapedOnlyRHSKernelConfigurationValhall.h
+++ /dev/null
@@ -1,53 +0,0 @@
-/*
- * Copyright (c) 2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CLGEMMRESHAPEDONLYRHSKERNELCONFIGURATIONVALHALL_H
-#define ARM_COMPUTE_CLGEMMRESHAPEDONLYRHSKERNELCONFIGURATIONVALHALL_H
-
-#include "arm_compute/core/CL/ICLGEMMKernelConfiguration.h"
-
-namespace arm_compute
-{
-namespace cl_gemm
-{
-/** Valhall based OpenCL GEMMReshapedOnlyRHS configuration */
-class CLGEMMReshapedOnlyRHSKernelConfigurationValhall final : public ICLGEMMKernelConfiguration
-{
-public:
- /** Constructor
- *
- * @param[in] gpu GPU target
- */
- CLGEMMReshapedOnlyRHSKernelConfigurationValhall(GPUTarget gpu);
-
- // Inherited overridden method
- std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure(unsigned int m, unsigned int n, unsigned int k, unsigned int b, DataType data_type) override;
-
-private:
- std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure_G77_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b);
- std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure_G77_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b);
- std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure_G77_u8(unsigned int m, unsigned int n, unsigned int k, unsigned int b);
-};
-} // namespace cl_gemm
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_CLGEMMRESHAPEDONLYRHSKERNELCONFIGURATIONVALHALL_H */
diff --git a/arm_compute/core/CL/kernels/CLAbsoluteDifferenceKernel.h b/arm_compute/core/CL/kernels/CLAbsoluteDifferenceKernel.h
deleted file mode 100644
index 58dea3bdae..0000000000
--- a/arm_compute/core/CL/kernels/CLAbsoluteDifferenceKernel.h
+++ /dev/null
@@ -1,79 +0,0 @@
-/*
- * Copyright (c) 2016-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CLABSOLUTEDIFFERENCEKERNEL_H
-#define ARM_COMPUTE_CLABSOLUTEDIFFERENCEKERNEL_H
-
-#include "arm_compute/core/CL/ICLKernel.h"
-
-namespace arm_compute
-{
-class ICLTensor;
-
-/** Interface for the absolute difference kernel.
- *
- * Absolute difference is computed by:
- * @f[ output(x,y) = | input1(x,y) - input2(x,y) | @f]
- */
-class CLAbsoluteDifferenceKernel : public ICLKernel
-{
-public:
- /** Default constructor. */
- CLAbsoluteDifferenceKernel();
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- CLAbsoluteDifferenceKernel(const CLAbsoluteDifferenceKernel &) = delete;
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- CLAbsoluteDifferenceKernel &operator=(const CLAbsoluteDifferenceKernel &) = delete;
- /** Allow instances of this class to be moved */
- CLAbsoluteDifferenceKernel(CLAbsoluteDifferenceKernel &&) = default;
- /** Allow instances of this class to be moved */
- CLAbsoluteDifferenceKernel &operator=(CLAbsoluteDifferenceKernel &&) = default;
- /** Default destructor */
- ~CLAbsoluteDifferenceKernel() = default;
-
- /** Set the inputs and output images.
- *
- * @param[in] input1 Source tensor. Data types supported: U8/S16.
- * @param[in] input2 Source tensor. Data types supported: U8/S16.
- * @param[out] output Destination tensor. Data types supported: U8/S16.
- */
- void configure(const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output);
- /** Set the inputs and output images.
- *
- * @param[in] compile_context The compile context to be used.
- * @param[in] input1 Source tensor. Data types supported: U8/S16.
- * @param[in] input2 Source tensor. Data types supported: U8/S16.
- * @param[out] output Destination tensor. Data types supported: U8/S16.
- */
- void configure(const CLCompileContext &compile_context, const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output);
-
- // Inherited methods overridden:
- void run(const Window &window, cl::CommandQueue &queue) override;
-
-private:
- const ICLTensor *_input1; /**< Source tensor 1. */
- const ICLTensor *_input2; /**< Source tensor 2. */
- ICLTensor *_output; /**< Destination tensor. */
-};
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_CLABSOLUTEDIFFERENCEKERNEL_H */
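
The formula in the class comment is simple enough to state as a scalar reference. A sketch (plain C++, not the CL kernel; saturation behaviour at the S16 extremes is simplified):

    // Scalar reference for output(x,y) = |input1(x,y) - input2(x,y)|.
    #include <cstdint>
    #include <cstdlib>
    #include <vector>

    std::vector<int16_t> absolute_difference(const std::vector<int16_t> &in1,
                                             const std::vector<int16_t> &in2)
    {
        std::vector<int16_t> out(in1.size());
        for(std::size_t i = 0; i < in1.size(); ++i)
        {
            // Promote to int before subtracting to avoid int16 overflow.
            out[i] = static_cast<int16_t>(std::abs(static_cast<int>(in1[i]) - in2[i]));
        }
        return out;
    }
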
diff --git a/arm_compute/core/CL/kernels/CLAccumulateKernel.h b/arm_compute/core/CL/kernels/CLAccumulateKernel.h
deleted file mode 100644
index f639148e25..0000000000
--- a/arm_compute/core/CL/kernels/CLAccumulateKernel.h
+++ /dev/null
@@ -1,114 +0,0 @@
-/*
- * Copyright (c) 2016-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CLACCUMULATEKERNEL_H
-#define ARM_COMPUTE_CLACCUMULATEKERNEL_H
-
-#include "arm_compute/core/CL/ICLSimple2DKernel.h"
-
-#include <cstdint>
-
-namespace arm_compute
-{
-class ICLTensor;
-
-/** Interface for the accumulate kernel.
- *
- * Accumulation is computed by:
- * @f[ accum(x,y) = accum(x,y) + input(x,y) @f]
- */
-class CLAccumulateKernel : public ICLSimple2DKernel
-{
-public:
- /** Set the input and accumulation tensors.
- *
- * @param[in] input Source tensor. Data types supported: U8.
- * @param[out] accum Destination tensor. Data types supported: S16.
- */
- void configure(const ICLTensor *input, ICLTensor *accum);
- /** Set the input and accumulation tensors.
- *
- * @param[in] compile_context The compile context to be used.
- * @param[in] input Source tensor. Data types supported: U8.
- * @param[out] accum Destination tensor. Data types supported: S16.
- */
- void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *accum);
-};
-
-/** Interface for the accumulate weighted kernel.
- *
- * Weighted accumulation is computed:
- * @f[ accum(x,y) = (1 - \alpha)*accum(x,y) + \alpha*input(x,y) @f]
- *
- * Where @f$ 0 \le \alpha \le 1 @f$
- * Conceptually, the rounding for this is defined as:
- * @f[ output(x,y)= uint8( (1 - \alpha) * float32( int32( output(x,y) ) ) + \alpha * float32( int32( input(x,y) ) ) ) @f]
-*/
-class CLAccumulateWeightedKernel : public ICLSimple2DKernel
-{
-public:
- /** Set the input and accumulation images, and the scale value.
- *
- * @param[in] input Source tensor. Data types supported: U8.
- * @param[in] alpha Scalar value in the range [0, 1.0]. Data types supported: F32.
- * @param[in,out] accum Accumulated tensor. Data types supported: U8.
- */
- void configure(const ICLTensor *input, float alpha, ICLTensor *accum);
- /** Set the input and accumulation images, and the scale value.
- *
- * @param[in] compile_context The compile context to be used.
- * @param[in] input Source tensor. Data types supported: U8.
- * @param[in] alpha Scalar value in the range [0, 1.0]. Data types supported: F32.
- * @param[in,out] accum Accumulated tensor. Data types supported: U8.
- */
- void configure(const CLCompileContext &compile_context, const ICLTensor *input, float alpha, ICLTensor *accum);
-};
-
-/** Interface for the accumulate squared kernel.
- *
- * The accumulation of squares is computed:
- * @f[ accum(x,y) = saturate_{int16} ( (uint16) accum(x,y) + (((uint16)(input(x,y)^2)) >> (shift)) ) @f]
- *
- * Where @f$ 0 \le shift \le 15 @f$
-*/
-class CLAccumulateSquaredKernel : public ICLSimple2DKernel
-{
-public:
- /** Set the input and accumulation tensors and the shift value.
- *
- * @param[in] input Source tensor. Data types supported: U8.
- * @param[in] shift Shift value in the range [0, 15]. Data types supported: U32.
- * @param[in,out] accum Accumulated tensor. Data types supported: S16.
- */
- void configure(const ICLTensor *input, uint32_t shift, ICLTensor *accum);
- /** Set the input and accumulation tensors and the shift value.
- *
- * @param[in] compile_context The compile context to be used.
- * @param[in] input Source tensor. Data types supported: U8.
- * @param[in] shift Shift value in the range [0, 15]. Data types supported: U32.
- * @param[in,out] accum Accumulated tensor. Data types supported: S16.
- */
- void configure(const CLCompileContext &compile_context, const ICLTensor *input, uint32_t shift, ICLTensor *accum);
-};
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_CLACCUMULATEKERNEL_H */
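
The three accumulation formulas documented in this header translate directly into scalar references. A sketch (the rounding and saturation details of the actual CL kernels are simplified):

    // Scalar references for the three accumulation formulas above.
    #include <algorithm>
    #include <cstdint>

    // CLAccumulateKernel: accum(x,y) = accum(x,y) + input(x,y)
    void accumulate(int16_t &accum, uint8_t input)
    {
        accum = static_cast<int16_t>(accum + input);
    }

    // CLAccumulateWeightedKernel: accum(x,y) = (1 - alpha)*accum(x,y) + alpha*input(x,y)
    void accumulate_weighted(uint8_t &accum, uint8_t input, float alpha)
    {
        accum = static_cast<uint8_t>((1.0f - alpha) * accum + alpha * input);
    }

    // CLAccumulateSquaredKernel:
    // accum(x,y) = saturate_int16( (uint16)accum(x,y) + ((uint16)(input(x,y)^2) >> shift) )
    void accumulate_squared(int16_t &accum, uint8_t input, uint32_t shift)
    {
        const uint16_t sq  = static_cast<uint16_t>((input * input) >> shift);
        const int32_t  sum = static_cast<int32_t>(static_cast<uint16_t>(accum)) + sq;
        accum = static_cast<int16_t>(std::min<int32_t>(sum, INT16_MAX));
    }
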
diff --git a/arm_compute/core/CL/kernels/CLActivationLayerKernel.h b/arm_compute/core/CL/kernels/CLActivationLayerKernel.h
deleted file mode 100644
index 1e83a689cd..0000000000
--- a/arm_compute/core/CL/kernels/CLActivationLayerKernel.h
+++ /dev/null
@@ -1,89 +0,0 @@
-/*
- * Copyright (c) 2016-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CLACTIVATIONLAYERKERNEL_H
-#define ARM_COMPUTE_CLACTIVATIONLAYERKERNEL_H
-
-#include "arm_compute/core/CL/ICLKernel.h"
-
-namespace arm_compute
-{
-class ICLTensor;
-/** Interface for the activation layer kernel. */
-class CLActivationLayerKernel : public ICLKernel
-{
-public:
- /** Default constructor */
- CLActivationLayerKernel();
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- CLActivationLayerKernel(const CLActivationLayerKernel &) = delete;
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- CLActivationLayerKernel &operator=(const CLActivationLayerKernel &) = delete;
- /** Allow instances of this class to be moved */
- CLActivationLayerKernel(CLActivationLayerKernel &&) = default;
- /** Allow instances of this class to be moved */
- CLActivationLayerKernel &operator=(CLActivationLayerKernel &&) = default;
- /** Default destructor */
- ~CLActivationLayerKernel() = default;
- /** Set the input and output tensor.
- *
- * @note If the output tensor is a nullptr, the activation function will be performed in-place
- *
- * @param[in, out] input Source tensor. In case of @p output tensor = nullptr, this tensor will store the result
- * of the activation function. Data types supported: QASYMM8/QASYMM8_SIGNED/QSYMM16/F16/F32.
- * @param[out] output Destination tensor. Data type supported: same as @p input
- * @param[in] act_info Activation layer information.
- */
- void configure(ICLTensor *input, ICLTensor *output, ActivationLayerInfo act_info);
- /** Set the input and output tensor.
- *
- * @note If the output tensor is a nullptr, the activation function will be performed in-place
- *
- * @param[in] compile_context The compile context to be used.
- * @param[in, out] input Source tensor. In case of @p output tensor = nullptr, this tensor will store the result
- * of the activation function. Data types supported: QASYMM8/QASYMM8_SIGNED/QSYMM16/F16/F32.
- * @param[out] output Destination tensor. Data type supported: same as @p input
- * @param[in] act_info Activation layer information.
- */
- void configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, ActivationLayerInfo act_info);
- /** Static function to check if given info will lead to a valid configuration of @ref CLActivationLayerKernel
- *
- * @param[in] input Source tensor info. In case of @p output tensor info = nullptr, this tensor will store the result
- * of the activation function. Data types supported: QASYMM8/QASYMM8_SIGNED/QSYMM16/F16/F32.
- * @param[in] output Destination tensor info. Data type supported: same as @p input
- * @param[in] act_info Activation layer information.
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *input, const ITensorInfo *output, const ActivationLayerInfo &act_info);
-
- // Inherited methods overridden:
- void run(const Window &window, cl::CommandQueue &queue) override;
-
-private:
- ICLTensor *_input;
- ICLTensor *_output;
- bool _run_in_place;
-};
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_CLACTIVATIONLAYERKERNEL_H */
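
A hedged usage sketch of the validate-then-configure pattern, run in-place by passing nullptr as the output (per the @note above); `input` is assumed to be an already-initialised CL tensor:

    #include "arm_compute/core/CL/kernels/CLActivationLayerKernel.h"

    using namespace arm_compute;

    void relu_in_place(ICLTensor *input)
    {
        const ActivationLayerInfo relu(ActivationLayerInfo::ActivationFunction::RELU);
        CLActivationLayerKernel   kernel;
        if(bool(CLActivationLayerKernel::validate(input->info(), nullptr, relu)))
        {
            kernel.configure(input, nullptr, relu); // nullptr output -> in-place
        }
    }
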
diff --git a/arm_compute/core/CL/kernels/CLArgMinMaxLayerKernel.h b/arm_compute/core/CL/kernels/CLArgMinMaxLayerKernel.h
deleted file mode 100644
index 94e8baed13..0000000000
--- a/arm_compute/core/CL/kernels/CLArgMinMaxLayerKernel.h
+++ /dev/null
@@ -1,106 +0,0 @@
-/*
- * Copyright (c) 2019-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CLARGMINMAXLAYERKERNEL_H
-#define ARM_COMPUTE_CLARGMINMAXLAYERKERNEL_H
-
-#include "arm_compute/core/CL/ICLKernel.h"
-#include "arm_compute/core/Types.h"
-
-namespace arm_compute
-{
-class ICLTensor;
-
-/** Interface for the reduction operation kernel
- *
- * @note The default data type for an uninitialized output tensor is
- * signed 32-bit integer (S32). It is the user's responsibility to check
- * that the results do not overflow because the indices are computed
- * in unsigned 32-bit (U32).
- */
-class CLArgMinMaxLayerKernel : public ICLKernel
-{
-public:
- /** Default constructor */
- CLArgMinMaxLayerKernel();
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- CLArgMinMaxLayerKernel(const CLArgMinMaxLayerKernel &) = delete;
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- CLArgMinMaxLayerKernel &operator=(const CLArgMinMaxLayerKernel &) = delete;
- /** Allow instances of this class to be moved */
- CLArgMinMaxLayerKernel(CLArgMinMaxLayerKernel &&) = default;
- /** Allow instances of this class to be moved */
- CLArgMinMaxLayerKernel &operator=(CLArgMinMaxLayerKernel &&) = default;
- /** Default destructor */
- ~CLArgMinMaxLayerKernel() = default;
-
- /** Set the input and output tensors.
- *
- * @param[in] input Source tensor. Data types supported: S32/F16/F32.
- * @param[in] prev_output Destination tensor of the previous iterations of @ref CLArgMinMaxLayerKernel. Data types supported: U32/S32
- * Has to be nullptr for the first iteration
- * @param[out] output Destination tensor. Data types supported: U32/S32
- * Output will have the same number of dimensions as input.
- * @param[in] axis Axis along which to reduce. Supported reduction axes: 0, 1, 2, 3
- * @param[in] op Reduction operation to perform. Only ArgMin and ArgMax are supported.
- */
- void configure(const ICLTensor *input, const ICLTensor *prev_output, ICLTensor *output, unsigned int axis, ReductionOperation op);
- /** Set the input and output tensors.
- *
- * @param[in] compile_context The compile context to be used.
- * @param[in] input Source tensor. Data types supported: S32/F16/F32.
- * @param[in] prev_output Destination tensor of the previous iterations of @ref CLArgMinMaxLayerKernel. Data types supported: U32/S32
- * Has to be nullptr for the first iteration
- * @param[out] output Destination tensor. Data types supported: U32/S32
- * Output will have the same number of dimensions as input.
- * @param[in] axis Axis along which to reduce. Supported reduction axes: 0, 1, 2, 3
- * @param[in] op Reduction operation to perform. Only ArgMin and ArgMax are supported.
- */
- void configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *prev_output, ICLTensor *output, unsigned int axis, ReductionOperation op);
-
- /** Static function to check if given info will lead to a valid configuration of @ref CLArgMinMaxLayerKernel.
- *
- * @param[in] input Source tensor info. Data types supported: S32/F16/F32.
- * @param[in] prev_output Destination tensor info of the previous iterations. Data types supported: U32/S32
- * Has to be nullptr for the first iteration
- * @param[in] output Destination tensor info. Data types supported: U32/S32
- * Output will have the same number of dimensions as input.
- * @param[in] axis Axis along which to reduce. Supported reduction axes: 0, 1, 2, 3
- * @param[in] op Reduction operation to perform. Only ArgMin and ArgMax are supported.
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *input, const ITensorInfo *prev_output, const ITensorInfo *output, unsigned int axis, ReductionOperation op);
-
- // Inherited methods overridden:
- void run(const Window &window, cl::CommandQueue &queue) override;
-
-private:
- const ICLTensor *_input;
- const ICLTensor *_prev_output;
- ICLTensor *_output;
- unsigned int _reduction_axis;
- ReductionOperation _op;
-};
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_CLARGMINMAXLAYERKERNEL_H */
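
A hedged single-pass sketch: prev_output must be nullptr on the first iteration, per the parameter documentation above.

    #include "arm_compute/core/CL/kernels/CLArgMinMaxLayerKernel.h"

    using namespace arm_compute;

    void configure_argmax_axis0(const ICLTensor *input, ICLTensor *output)
    {
        CLArgMinMaxLayerKernel kernel;
        if(bool(CLArgMinMaxLayerKernel::validate(input->info(), nullptr, output->info(),
                                                 /*axis=*/0, ReductionOperation::ARG_IDX_MAX)))
        {
            kernel.configure(input, nullptr, output, /*axis=*/0, ReductionOperation::ARG_IDX_MAX);
        }
    }
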
diff --git a/arm_compute/core/CL/kernels/CLBatchConcatenateLayerKernel.h b/arm_compute/core/CL/kernels/CLBatchConcatenateLayerKernel.h
deleted file mode 100644
index 163666853c..0000000000
--- a/arm_compute/core/CL/kernels/CLBatchConcatenateLayerKernel.h
+++ /dev/null
@@ -1,95 +0,0 @@
-/*
- * Copyright (c) 2019-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#ifndef ARM_COMPUTE_CLBATCHCONCATENATEKERNEL_H
-#define ARM_COMPUTE_CLBATCHCONCATENATEKERNEL_H
-
-#include "arm_compute/core/CL/ICLKernel.h"
-#include "arm_compute/core/Types.h"
-
-namespace arm_compute
-{
-class ICLTensor;
-
-/** Interface for the batch concatenate kernel.
- * The input tensor will be concatenated into the output tensor.
- */
-class CLBatchConcatenateLayerKernel : public ICLKernel
-{
-public:
- /** Default constructor */
- CLBatchConcatenateLayerKernel();
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- CLBatchConcatenateLayerKernel(const CLBatchConcatenateLayerKernel &) = delete;
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- CLBatchConcatenateLayerKernel &operator=(const CLBatchConcatenateLayerKernel &) = delete;
- /** Allow instances of this class to be moved */
- CLBatchConcatenateLayerKernel(CLBatchConcatenateLayerKernel &&) = default;
- /** Allow instances of this class to be moved */
- CLBatchConcatenateLayerKernel &operator=(CLBatchConcatenateLayerKernel &&) = default;
- /** Default destructor */
- ~CLBatchConcatenateLayerKernel() = default;
- /** Initialise the kernel's inputs and output
- *
- * @param[in] input Input tensor. Data types supported: All.
- * @param[in] batch_offset The offset on axis 3.
- * @param[in,out] output Output tensor. Data types supported: Same as @p input.
- *
- * @note The output tensor's two lowest dimensions can't be smaller than the input's.
- * @note The difference between each of the two lowest dimensions of input and output must be divisible by 2.
- *
- */
- void configure(const ICLTensor *input, unsigned int batch_offset, ICLTensor *output);
- /** Initialise the kernel's inputs and output
- *
- * @param[in] compile_context The compile context to be used.
- * @param[in] input Input tensor. Data types supported: All.
- * @param[in] batch_offset The offset on axis 3.
- * @param[in,out] output Output tensor. Data types supported: Same as @p input.
- *
- * @note The output tensor's two lowest dimensions can't be smaller than the input's.
- * @note The difference between each of the two lowest dimensions of input and output must be divisible by 2.
- *
- */
- void configure(const CLCompileContext &compile_context, const ICLTensor *input, unsigned int batch_offset, ICLTensor *output);
- /** Static function to check if given info will lead to a valid configuration of @ref CLBatchConcatenateLayerKernel
- *
- * @param[in] input Input tensor info. Data types supported: All.
- * @param[in] batch_offset The offset on axis 3.
- * @param[in] output Output tensor info. Data types supported: Same as @p input.
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *input, unsigned int batch_offset, const ITensorInfo *output);
-
- // Inherited methods overridden:
- void run(const Window &window, cl::CommandQueue &queue) override;
-
-private:
- const ICLTensor *_input;
- ICLTensor *_output;
- unsigned int _batch_offset;
-};
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_CLBATCHCONCATENATEKERNEL_H */
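
A conceptual reference (not the CL kernel) for what concatenation at a batch offset means: place `input` into `output` starting at `batch_offset` along axis 3, with [W, H, C, N] tensors flattened row-major. Shape handling is simplified for illustration; the caller is assumed to have sized `output` to hold at least batch_offset + n_in batches.

    #include <algorithm>
    #include <cstddef>
    #include <vector>

    void concat_batches(const std::vector<float> &input, std::size_t w, std::size_t h,
                        std::size_t c, std::size_t n_in, std::size_t batch_offset,
                        std::vector<float> &output)
    {
        const std::size_t plane = w * h * c; // elements per batch
        for(std::size_t b = 0; b < n_in; ++b)
        {
            std::copy(input.begin() + b * plane, input.begin() + (b + 1) * plane,
                      output.begin() + (b + batch_offset) * plane);
        }
    }
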
diff --git a/arm_compute/core/CL/kernels/CLBatchNormalizationLayerKernel.h b/arm_compute/core/CL/kernels/CLBatchNormalizationLayerKernel.h
deleted file mode 100644
index 8eaaca845a..0000000000
--- a/arm_compute/core/CL/kernels/CLBatchNormalizationLayerKernel.h
+++ /dev/null
@@ -1,120 +0,0 @@
-/*
- * Copyright (c) 2017-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CLBATCHNORMALIZATIONLAYERKERNEL_H
-#define ARM_COMPUTE_CLBATCHNORMALIZATIONLAYERKERNEL_H
-
-#include "arm_compute/core/CL/ICLKernel.h"
-
-namespace arm_compute
-{
-class ICLTensor;
-
-/** Interface for the BatchNormalization layer kernel.
- */
-class CLBatchNormalizationLayerKernel : public ICLKernel
-{
-public:
- /** Constructor */
- CLBatchNormalizationLayerKernel();
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- CLBatchNormalizationLayerKernel(const CLBatchNormalizationLayerKernel &) = delete;
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- CLBatchNormalizationLayerKernel &operator=(const CLBatchNormalizationLayerKernel &) = delete;
- /** Default move constructor */
- CLBatchNormalizationLayerKernel(CLBatchNormalizationLayerKernel &&) = default;
- /** Default move assignment operator */
- CLBatchNormalizationLayerKernel &operator=(CLBatchNormalizationLayerKernel &&) = default;
- /** Default destructor */
- ~CLBatchNormalizationLayerKernel() = default;
-
- /** Set the input and output tensors.
- *
- * @note If the output tensor is a nullptr, the batch normalization function will be performed in-place
- *
- * @param[in, out] input Source tensor. In case of @p output tensor = nullptr, this tensor will store the result.
- * 3 lower dimensions represent a single input with dimensions [width, height, FM].
- * The rest are optional and used for representing batches. Data types supported: F16/F32. Data layouts supported: NCHW/NHWC.
- * @param[out] output Destination tensor. Output will have the same number of dimensions as input. Data type supported: same as @p input
- * @param[in] mean Mean values tensor. 1 dimension with size equal to the feature maps [FM]. Data types supported: Same as @p input
- * @param[in] var Variance values tensor. 1 dimension with size equal to the feature maps [FM]. Data types supported: Same as @p input
- * @param[in] beta (Optional) Beta values tensor info. 1 dimension with size equal to the feature maps [FM]. If not provided, default value for beta is 0. Data types supported: Same as @p input
- * @param[in] gamma (Optional) Gamma values tensor info. 1 dimension with size equal to the feature maps [FM]. If not provided, default value for gamma is 1. Data types supported: Same as @p input
- * @param[in] epsilon (Optional) Small value to avoid division by zero. Default value is 0.001f.
- * @param[in] act_info (Optional) Activation layer information in case of a fused activation. Only RELU, BOUNDED_RELU and LU_BOUNDED_RELU supported.
- */
- void configure(ICLTensor *input, ICLTensor *output, const ICLTensor *mean, const ICLTensor *var, const ICLTensor *beta = nullptr, const ICLTensor *gamma = nullptr, float epsilon = 0.001f,
- ActivationLayerInfo act_info = ActivationLayerInfo());
- /** Set the input and output tensors.
- *
- * @note If the output tensor is a nullptr, the batch normalization function will be performed in-place
- *
- * @param[in] compile_context The compile context to be used.
- * @param[in, out] input Source tensor. In case of @p output tensor = nullptr, this tensor will store the result.
- * 3 lower dimensions represent a single input with dimensions [width, height, FM].
- * The rest are optional and used for representing batches. Data types supported: F16/F32. Data layouts supported: NCHW/NHWC.
- * @param[out] output Destination tensor. Output will have the same number of dimensions as input. Data type supported: same as @p input
- * @param[in] mean Mean values tensor. 1 dimension with size equal to the feature maps [FM]. Data types supported: Same as @p input
- * @param[in] var Variance values tensor. 1 dimension with size equal to the feature maps [FM]. Data types supported: Same as @p input
- * @param[in] beta (Optional) Beta values tensor info. 1 dimension with size equal to the feature maps [FM]. If not provided, default value for beta is 0. Data types supported: Same as @p input
- * @param[in] gamma (Optional) Gamma values tensor info. 1 dimension with size equal to the feature maps [FM]. If not provided, default value for gamma is 1. Data types supported: Same as @p input
- * @param[in] epsilon (Optional) Small value to avoid division by zero. Default value is 0.001f.
- * @param[in] act_info (Optional) Activation layer information in case of a fused activation. Only RELU, BOUNDED_RELU and LU_BOUNDED_RELU supported.
- */
- void configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, const ICLTensor *mean, const ICLTensor *var, const ICLTensor *beta = nullptr,
- const ICLTensor *gamma = nullptr, float epsilon = 0.001f, ActivationLayerInfo act_info = ActivationLayerInfo());
- /** Static function to check if given info will lead to a valid configuration of @ref CLBatchNormalizationLayerKernel
- *
- * @param[in] input Source tensor info. In case of @p output tensor info = nullptr, this tensor will store the result.
- * 3 lower dimensions represent a single input with dimensions [width, height, FM].
- * The rest are optional and used for representing batches. Data types supported: F16/F32. Data layouts supported: NCHW/NHWC.
- * @param[in] output Destination tensor info. Output will have the same number of dimensions as input. Data type supported: same as @p input
- * @param[in] mean Mean values tensor info. 1 dimension with size equal to the feature maps [FM]. Data types supported: Same as @p input
- * @param[in] var Variance values tensor info. 1 dimension with size equal to the feature maps [FM]. Data types supported: Same as @p input
- * @param[in] beta (Optional) Beta values tensor info. 1 dimension with size equal to the feature maps [FM]. If not provided, default value for beta is 0. Data types supported: Same as @p input
- * @param[in] gamma (Optional) Gamma values tensor info. 1 dimension with size equal to the feature maps [FM]. If not provided, default value for gamma is 1. Data types supported: Same as @p input
- * @param[in] epsilon (Optional) Small value to avoid division by zero. Default value is 0.001f.
- * @param[in] act_info (Optional) Activation layer information in case of a fused activation. Only RELU, BOUNDED_RELU and LU_BOUNDED_RELU supported.
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *input, const ITensorInfo *output,
- const ITensorInfo *mean, const ITensorInfo *var,
- const ITensorInfo *beta = nullptr, const ITensorInfo *gamma = nullptr,
- float epsilon = 0.001f, ActivationLayerInfo act_info = ActivationLayerInfo());
-
- // Inherited methods overridden:
- void run(const Window &window, cl::CommandQueue &queue) override;
-
-private:
- ICLTensor *_input;
- ICLTensor *_output;
- const ICLTensor *_mean;
- const ICLTensor *_var;
- const ICLTensor *_beta;
- const ICLTensor *_gamma;
- float _epsilon;
- bool _run_in_place;
-};
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_CLBATCHNORMALIZATIONLAYERKERNEL_H */
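
For context, a scalar reference of standard batch normalization as applied per feature map by this kernel (the textbook formula, stated here for orientation rather than taken from this header; the fused activation is omitted):

    // out = gamma * (x - mean) / sqrt(var + epsilon) + beta
    #include <cmath>

    float batch_normalize(float x, float mean, float var,
                          float beta, float gamma, float epsilon = 0.001f)
    {
        return gamma * (x - mean) / std::sqrt(var + epsilon) + beta;
    }
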
diff --git a/arm_compute/core/CL/kernels/CLBatchToSpaceLayerKernel.h b/arm_compute/core/CL/kernels/CLBatchToSpaceLayerKernel.h
deleted file mode 100644
index 2b12ad094a..0000000000
--- a/arm_compute/core/CL/kernels/CLBatchToSpaceLayerKernel.h
+++ /dev/null
@@ -1,111 +0,0 @@
-/*
- * Copyright (c) 2018-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CLBATCHTOSPACELAYERKERNEL_H
-#define ARM_COMPUTE_CLBATCHTOSPACELAYERKERNEL_H
-
-#include "arm_compute/core/CL/ICLKernel.h"
-#include "arm_compute/core/Types.h"
-
-namespace arm_compute
-{
-class ICLTensor;
-
-/** Interface for the batch to space kernel */
-class CLBatchToSpaceLayerKernel : public ICLKernel
-{
-public:
- /** Default constructor */
- CLBatchToSpaceLayerKernel();
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- CLBatchToSpaceLayerKernel(const CLBatchToSpaceLayerKernel &) = delete;
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- CLBatchToSpaceLayerKernel &operator=(const CLBatchToSpaceLayerKernel &) = delete;
- /** Allow instances of this class to be moved */
- CLBatchToSpaceLayerKernel(CLBatchToSpaceLayerKernel &&) = default;
- /** Allow instances of this class to be moved */
- CLBatchToSpaceLayerKernel &operator=(CLBatchToSpaceLayerKernel &&) = default;
- /** Default destructor */
- ~CLBatchToSpaceLayerKernel() = default;
- /** Initialise the kernel's inputs and output.
- *
- * @param[in] input Tensor input. Supported tensor rank: 4. Data types supported: All.
- * @param[in] block_shape 1-D tensor with shape [M]. Data types supported: S32
- * @param[out] output Tensor output. Data types supported: same as @p input
- */
- void configure(const ICLTensor *input, const ICLTensor *block_shape, ICLTensor *output);
- /** Initialise the kernel's inputs and output.
- *
- * @param[in] compile_context The compile context to be used.
- * @param[in] input Tensor input. Supported tensor rank: 4. Data types supported: All.
- * @param[in] block_shape 1-D tensor with shape [M]. Data types supported: S32
- * @param[out] output Tensor output. Data types supported: same as @p input
- */
- void configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *block_shape, ICLTensor *output);
- /** Initialise the kernel's inputs and output (Static block shape).
- *
- * @param[in] input Tensor input. Supported tensor rank: 4. Data types supported: All.
- * @param[in] block_shape_x Block shape x value.
- * @param[in] block_shape_y Block shape y value.
- * @param[out] output Tensor output. Data types supported: same as @p input
- */
- void configure(const ICLTensor *input, const int32_t block_shape_x, const int32_t block_shape_y, ICLTensor *output);
- /** Initialise the kernel's inputs and output (Static block shape).
- *
- * @param[in] compile_context The compile context to be used.
- * @param[in] input Tensor input. Supported tensor rank: 4. Data types supported: All.
- * @param[in] block_shape_x Block shape x value.
- * @param[in] block_shape_y Block shape y value.
- * @param[out] output Tensor output. Data types supported: same as @p input
- */
- void configure(const CLCompileContext &compile_context, const ICLTensor *input, const int32_t block_shape_x, const int32_t block_shape_y, ICLTensor *output);
- /** Static function to check if given info will lead to a valid configuration of @ref CLBatchToSpaceLayerKernel
- *
- * @param[in] input Tensor input. Supported tensor rank: 4. Data types supported: All.
- * @param[in] block_shape 1-D tensor with shape [M]. Data types supported: S32
- * @param[in] output Tensor output. Data types supported: same as @p input
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *input, const ITensorInfo *block_shape, const ITensorInfo *output);
- /** Static function to check if given info will lead to a valid configuration of @ref CLBatchToSpaceLayerKernel (Static block shape).
- *
- * @param[in] input Tensor input. Supported tensor rank: 4. Data types supported: All.
- * @param[in] block_shape_x Block shape x value.
- * @param[in] block_shape_y Block shape y value.
- * @param[in] output Tensor output. Data types supported: same as @p input
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *input, const int32_t block_shape_x, const int32_t block_shape_y, const ITensorInfo *output);
-
- // Inherited methods overridden:
- void run(const Window &window, cl::CommandQueue &queue) override;
-
-private:
- const ICLTensor *_input; /**< Source tensor */
- const ICLTensor *_block_shape; /**< Block shape tensor */
- ICLTensor *_output; /**< Destination tensor */
-};
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_CLBATCHTOSPACELAYERKERNEL_H */
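A matching sketch for the static block-shape overloads above (same assumed setup as the earlier sketch; the 2x2 block shape is illustrative):

    CLTensor src, dst; // rank-4 tensors, any supported data type
    ARM_COMPUTE_ERROR_THROW_ON(
        CLBatchToSpaceLayerKernel::validate(src.info(), 2, 2, dst.info()));
    CLBatchToSpaceLayerKernel b2s;
    b2s.configure(&src, 2, 2, &dst); // block_shape_x = block_shape_y = 2
    CLScheduler::get().enqueue(b2s); // binds the kernel's window and runs it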
diff --git a/arm_compute/core/CL/kernels/CLBitwiseAndKernel.h b/arm_compute/core/CL/kernels/CLBitwiseAndKernel.h
deleted file mode 100644
index 8defe32862..0000000000
--- a/arm_compute/core/CL/kernels/CLBitwiseAndKernel.h
+++ /dev/null
@@ -1,76 +0,0 @@
-/*
- * Copyright (c) 2016-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CLBITWISEANDKERNEL_H
-#define ARM_COMPUTE_CLBITWISEANDKERNEL_H
-
-#include "arm_compute/core/CL/ICLKernel.h"
-
-namespace arm_compute
-{
-class ICLTensor;
-
-/** Interface for the bitwise AND operation kernel.
- *
- * Result is computed by:
- * @f[ output(x,y) = input1(x,y) \land input2(x,y) @f]
- */
-class CLBitwiseAndKernel : public ICLKernel
-{
-public:
- /** Default constructor. */
- CLBitwiseAndKernel();
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- CLBitwiseAndKernel(const CLBitwiseAndKernel &) = delete;
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- CLBitwiseAndKernel &operator=(const CLBitwiseAndKernel &) = delete;
- /** Allow instances of this class to be moved */
- CLBitwiseAndKernel(CLBitwiseAndKernel &&) = default;
- /** Allow instances of this class to be moved */
- CLBitwiseAndKernel &operator=(CLBitwiseAndKernel &&) = default;
- /** Set the inputs and output images
- *
- * @param[in] input1 Source tensor. Data types supported: U8.
- * @param[in] input2 Source tensor. Data types supported: U8.
- * @param[out] output Destination tensor. Data types supported: U8.
- */
- void configure(const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output);
- /** Set the inputs and output images
- *
- * @param[in] compile_context The compile context to be used.
- * @param[in] input1 Source tensor. Data types supported: U8.
- * @param[in] input2 Source tensor. Data types supported: U8.
- * @param[out] output Destination tensor. Data types supported: U8.
- */
- void configure(const CLCompileContext &compile_context, const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output);
-
- // Inherited methods overridden:
- void run(const Window &window, cl::CommandQueue &queue) override;
-
-private:
- const ICLTensor *_input1; /**< Source tensor 1 */
- const ICLTensor *_input2; /**< Source tensor 2 */
- ICLTensor *_output; /**< Destination tensor */
-};
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_CLBITWISEANDKERNEL_H */
diff --git a/arm_compute/core/CL/kernels/CLBitwiseNotKernel.h b/arm_compute/core/CL/kernels/CLBitwiseNotKernel.h
deleted file mode 100644
index b86ce7f173..0000000000
--- a/arm_compute/core/CL/kernels/CLBitwiseNotKernel.h
+++ /dev/null
@@ -1,56 +0,0 @@
-/*
- * Copyright (c) 2016-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CLBITWISENOTKERNEL_H
-#define ARM_COMPUTE_CLBITWISENOTKERNEL_H
-
-#include "arm_compute/core/CL/ICLSimple2DKernel.h"
-
-namespace arm_compute
-{
-class ICLTensor;
-
-/** Interface for the bitwise NOT operation kernel.
- *
- * Result is computed by:
- * @f[ output(x,y) = \lnot input(x,y) @f]
- */
-class CLBitwiseNotKernel : public ICLSimple2DKernel
-{
-public:
- /** Set the inputs and output images.
- *
- * @param[in] input Source tensor. Data types supported: U8.
- * @param[out] output Destination tensor. Data types supported: U8.
- */
- void configure(const ICLTensor *input, ICLTensor *output);
- /** Set the inputs and output images.
- *
- * @param[in] compile_context The compile context to be used.
- * @param[in] input Source tensor. Data types supported: U8.
- * @param[out] output Destination tensor. Data types supported: U8.
- */
- void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output);
-};
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_CLBITWISENOTKERNEL_H */
diff --git a/arm_compute/core/CL/kernels/CLBitwiseOrKernel.h b/arm_compute/core/CL/kernels/CLBitwiseOrKernel.h
deleted file mode 100644
index 65eb50f0fd..0000000000
--- a/arm_compute/core/CL/kernels/CLBitwiseOrKernel.h
+++ /dev/null
@@ -1,76 +0,0 @@
-/*
- * Copyright (c) 2016-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CLBITWISEORKERNEL_H
-#define ARM_COMPUTE_CLBITWISEORKERNEL_H
-
-#include "arm_compute/core/CL/ICLKernel.h"
-
-namespace arm_compute
-{
-class ICLTensor;
-
-/** Interface for the bitwise OR operation kernel.
- *
- * Result is computed by:
- * @f[ output(x,y) = input1(x,y) \lor input2(x,y) @f]
- */
-class CLBitwiseOrKernel : public ICLKernel
-{
-public:
- /** Default constructor. */
- CLBitwiseOrKernel();
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- CLBitwiseOrKernel(const CLBitwiseOrKernel &) = delete;
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- CLBitwiseOrKernel &operator=(const CLBitwiseOrKernel &) = delete;
- /** Allow instances of this class to be moved */
- CLBitwiseOrKernel(CLBitwiseOrKernel &&) = default;
- /** Allow instances of this class to be moved */
- CLBitwiseOrKernel &operator=(CLBitwiseOrKernel &&) = default;
- /** Set the inputs and output images
- *
- * @param[in] input1 Source tensor. Data types supported: U8.
- * @param[in] input2 Source tensor. Data types supported: U8.
- * @param[out] output Destination tensor. Data types supported: U8.
- */
- void configure(const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output);
- /** Set the inputs and output images
- *
- * @param[in] compile_context The compile context to be used.
- * @param[in] input1 Source tensor. Data types supported: U8.
- * @param[in] input2 Source tensor. Data types supported: U8.
- * @param[out] output Destination tensor. Data types supported: U8.
- */
- void configure(const CLCompileContext &compile_context, const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output);
-
- // Inherited methods overridden:
- void run(const Window &window, cl::CommandQueue &queue) override;
-
-private:
- const ICLTensor *_input1; /**< Source tensor 1 */
- const ICLTensor *_input2; /**< Source tensor 2 */
- ICLTensor *_output; /**< Destination tensor */
-};
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_CLBITWISEORKERNEL_H */
diff --git a/arm_compute/core/CL/kernels/CLBitwiseXorKernel.h b/arm_compute/core/CL/kernels/CLBitwiseXorKernel.h
deleted file mode 100644
index 5c63a7f22c..0000000000
--- a/arm_compute/core/CL/kernels/CLBitwiseXorKernel.h
+++ /dev/null
@@ -1,76 +0,0 @@
-/*
- * Copyright (c) 2016-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CLBITWISEXORKERNEL_H
-#define ARM_COMPUTE_CLBITWISEXORKERNEL_H
-
-#include "arm_compute/core/CL/ICLKernel.h"
-
-namespace arm_compute
-{
-class ICLTensor;
-
-/** Interface for the bitwise XOR operation kernel.
- *
- * Result is computed by:
- * @f[ output(x,y) = input1(x,y) \oplus input2(x,y) @f]
- */
-class CLBitwiseXorKernel : public ICLKernel
-{
-public:
- /** Default constructor. */
- CLBitwiseXorKernel();
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- CLBitwiseXorKernel(const CLBitwiseXorKernel &) = delete;
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- CLBitwiseXorKernel &operator=(const CLBitwiseXorKernel &) = delete;
- /** Allow instances of this class to be moved */
- CLBitwiseXorKernel(CLBitwiseXorKernel &&) = default;
- /** Allow instances of this class to be moved */
- CLBitwiseXorKernel &operator=(CLBitwiseXorKernel &&) = default;
- /** Set the inputs and output images
- *
- * @param[in] input1 Source tensor. Data types supported: U8.
- * @param[in] input2 Source tensor. Data types supported: U8.
- * @param[out] output Destination tensor. Data types supported: U8.
- */
- void configure(const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output);
- /** Set the inputs and output images
- *
- * @param[in] compile_context The compile context to be used.
- * @param[in] input1 Source tensor. Data types supported: U8.
- * @param[in] input2 Source tensor. Data types supported: U8.
- * @param[out] output Destination tensor. Data types supported: U8.
- */
- void configure(const CLCompileContext &compile_context, const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output);
-
- // Inherited methods overridden:
- void run(const Window &window, cl::CommandQueue &queue) override;
-
-private:
- const ICLTensor *_input1; /**< Source tensor 1 */
- const ICLTensor *_input2; /**< Source tensor 2 */
- ICLTensor *_output; /**< Destination tensor */
-};
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_CLBITWISEXORKERNEL_H */
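The three binary bitwise kernels above share an identical configure/run shape, and CLBitwiseNotKernel differs only in taking a single input, so one sketch covers the family (U8 tensors, illustrative 64x64 shape, same assumed setup as before):

    CLScheduler::get().default_init(); // once per process, before any enqueue
    CLTensor a, b, out;
    const TensorInfo info(TensorShape(64U, 64U), 1, DataType::U8);
    a.allocator()->init(info);
    b.allocator()->init(info);
    out.allocator()->init(info);
    a.allocator()->allocate();
    b.allocator()->allocate();
    out.allocator()->allocate();
    CLBitwiseAndKernel band; // CLBitwiseOrKernel/CLBitwiseXorKernel configure identically
    band.configure(&a, &b, &out);
    CLScheduler::get().enqueue(band);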
diff --git a/arm_compute/core/CL/kernels/CLBoundingBoxTransformKernel.h b/arm_compute/core/CL/kernels/CLBoundingBoxTransformKernel.h
deleted file mode 100644
index bbe11562ed..0000000000
--- a/arm_compute/core/CL/kernels/CLBoundingBoxTransformKernel.h
+++ /dev/null
@@ -1,99 +0,0 @@
-/*
- * Copyright (c) 2018-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CLBOUNDINGBOXTRANSFORMKERNEL_H
-#define ARM_COMPUTE_CLBOUNDINGBOXTRANSFORMKERNEL_H
-
-#include "arm_compute/core/CL/ICLKernel.h"
-
-namespace arm_compute
-{
-class ICLTensor;
-
-/** Interface for the bounding box kernel */
-class CLBoundingBoxTransformKernel : public ICLKernel
-{
-public:
- /** Default constructor */
- CLBoundingBoxTransformKernel();
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- CLBoundingBoxTransformKernel(const CLBoundingBoxTransformKernel &) = delete;
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- CLBoundingBoxTransformKernel &operator=(const CLBoundingBoxTransformKernel &) = delete;
- /** Allow instances of this class to be moved */
- CLBoundingBoxTransformKernel(CLBoundingBoxTransformKernel &&) = default;
- /** Allow instances of this class to be moved */
- CLBoundingBoxTransformKernel &operator=(CLBoundingBoxTransformKernel &&) = default;
- /** Default destructor */
- ~CLBoundingBoxTransformKernel() = default;
-
- /** Set the input and output tensors.
- *
- * @param[in] boxes Source tensor. Bounding box proposals in pixel coordinates. Size(M, 4), format [x1, y1, x2, y2]. Data types supported: QASYMM16/F16/F32.
- * @param[out] pred_boxes Destination tensor. Pixel coordinates of the transformed bounding boxes. Size (M, 4*K), format [x1, y1, x2, y2]. Data types supported: Same as @p boxes
- * @param[in] deltas Bounding box translations and scales. Size (M, 4*K), format [dx, dy, dw, dh], K is the number of classes.
- * Data types supported: QASYMM8 if @p boxes is QASYMM16, otherwise same as @p boxes
- * @param[in] info Contains BoundingBox operation information described in @ref BoundingBoxTransformInfo.
- *
- * @note Only single image prediction is supported. Height and Width (and scale) of the image will be contained in the BoundingBoxTransformInfo struct.
- *
- */
- void configure(const ICLTensor *boxes, ICLTensor *pred_boxes, const ICLTensor *deltas, const BoundingBoxTransformInfo &info);
- /** Set the input and output tensors.
- *
- * @param[in] compile_context The compile context to be used.
- * @param[in] boxes Source tensor. Bounding box proposals in pixel coordinates. Size(M, 4), format [x1, y1, x2, y2]. Data types supported: QASYMM16/F16/F32.
- * @param[out] pred_boxes Destination tensor. Pixel coordinates of the transformed bounding boxes. Size (M, 4*K), format [x1, y1, x2, y2]. Data types supported: Same as @p boxes
- * @param[in] deltas Bounding box translations and scales. Size (M, 4*K), format [dx, dy, dw, dh], K is the number of classes.
- * Data types supported: QASYMM8 if @p boxes is QASYMM16, otherwise same as @p boxes
- * @param[in] info Contains BoundingBox operation information described in @ref BoundingBoxTransformInfo.
- *
- * @note Only single image prediction is supported. Height and Width (and scale) of the image will be contained in the BoundingBoxTransformInfo struct.
- *
- */
- void configure(const CLCompileContext &compile_context, const ICLTensor *boxes, ICLTensor *pred_boxes, const ICLTensor *deltas, const BoundingBoxTransformInfo &info);
-
- /** Static function to check if given info will lead to a valid configuration of @ref CLBoundingBoxTransform
- *
- * @param[in] boxes Source tensor info. Bounding box proposals in pixel coordinates. Size(M, 4), format [x1, y1, x2, y2]. Data types supported: QASYMM16/F16/F32.
- * @param[in] pred_boxes Destination tensor info. Pixel coordinates of the transformed bounding boxes. Size (M, 4*K), format [x1, y1, x2, y2]. Data types supported: Same as @p boxes
- * @param[in] deltas Bounding box translations and scales. Size (M, 4*K), format [dx, dy, dw, dh], K is the number of classes.
- * Data types supported: QASYMM8 if @p boxes is QASYMM16, otherwise same as @p boxes
- * @param[in] info Contains BoundingBox operation information described in @ref BoundingBoxTransformInfo.
- *
- * @note Only single image prediction is supported. Height and Width (and scale) of the image will be contained in the BoundingBoxTransformInfo struct.
- *
- * @return a Status
- */
- static Status validate(const ITensorInfo *boxes, const ITensorInfo *pred_boxes, const ITensorInfo *deltas, const BoundingBoxTransformInfo &info);
-
- // Inherited methods overridden:
- void run(const Window &window, cl::CommandQueue &queue) override;
-
-private:
- const ICLTensor *_boxes;
- ICLTensor *_pred_boxes;
- const ICLTensor *_deltas;
-};
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_CLBOUNDINGBOXTRANSFORMKERNEL_H */
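A sketch of driving the bounding-box transform on a single image; the 800x600 image size and unit scale passed to BoundingBoxTransformInfo are illustrative assumptions, and tensor setup is elided as before:

    CLTensor boxes, pred_boxes, deltas; // sizes (M, 4), (M, 4*K), (M, 4*K)
    const BoundingBoxTransformInfo bb_info(800.f, 600.f, 1.f); // img_width, img_height, scale
    ARM_COMPUTE_ERROR_THROW_ON(CLBoundingBoxTransformKernel::validate(
        boxes.info(), pred_boxes.info(), deltas.info(), bb_info));
    CLBoundingBoxTransformKernel bbt;
    bbt.configure(&boxes, &pred_boxes, &deltas, bb_info);
    CLScheduler::get().enqueue(bbt);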
diff --git a/arm_compute/core/CL/kernels/CLBox3x3Kernel.h b/arm_compute/core/CL/kernels/CLBox3x3Kernel.h
deleted file mode 100644
index ea3c1c1f3e..0000000000
--- a/arm_compute/core/CL/kernels/CLBox3x3Kernel.h
+++ /dev/null
@@ -1,59 +0,0 @@
-/*
- * Copyright (c) 2016-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CLBOX3X3KERNEL_H
-#define ARM_COMPUTE_CLBOX3X3KERNEL_H
-
-#include "arm_compute/core/CL/ICLSimple2DKernel.h"
-
-namespace arm_compute
-{
-class ICLTensor;
-
-/** Interface for the box 3x3 filter kernel.
- *
- */
-class CLBox3x3Kernel : public ICLSimple2DKernel
-{
-public:
-    /** Initialise the kernel's input and output.
- *
- * @param[in] input An input tensor. Data types supported: U8
- * @param[out] output The output tensor. Data types supported: U8.
- * @param[in] border_undefined True if the border mode is undefined. False if it's replicate or constant.
- */
- void configure(const ICLTensor *input, ICLTensor *output, bool border_undefined);
-    /** Initialise the kernel's input and output.
- *
- * @param[in] compile_context The compile context to be used.
- * @param[in] input An input tensor. Data types supported: U8
- * @param[out] output The output tensor. Data types supported: U8.
- * @param[in] border_undefined True if the border mode is undefined. False if it's replicate or constant.
- */
- void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, bool border_undefined);
-
-    // Inherited methods overridden:
- BorderSize border_size() const override;
-};
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_CLBOX3X3KERNEL_H */
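A minimal sketch for the box filter above; passing border_undefined = false assumes the caller has already filled the border (for example with a border-handling kernel), which is an assumption of this sketch rather than something this header mandates:

    CLTensor src, dst; // U8 images of the same shape, pre-allocated
    CLBox3x3Kernel box;
    box.configure(&src, &dst, false /* border is replicate/constant, not undefined */);
    // box.border_size() reports the apron of pixels the 3x3 window reads around each element
    CLScheduler::get().enqueue(box);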
diff --git a/arm_compute/core/CL/kernels/CLCannyEdgeKernel.h b/arm_compute/core/CL/kernels/CLCannyEdgeKernel.h
deleted file mode 100644
index 40ad4dcd84..0000000000
--- a/arm_compute/core/CL/kernels/CLCannyEdgeKernel.h
+++ /dev/null
@@ -1,187 +0,0 @@
-/*
- * Copyright (c) 2017-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CLCANNYEDGEKERNEL_H
-#define ARM_COMPUTE_CLCANNYEDGEKERNEL_H
-
-#include "arm_compute/core/CL/ICLKernel.h"
-
-#include <cstdint>
-
-namespace arm_compute
-{
-class ICLTensor;
-
-/** OpenCL kernel to perform Gradient computation.
- */
-class CLGradientKernel : public ICLKernel
-{
-public:
- /** Constructor */
- CLGradientKernel();
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- CLGradientKernel(const CLGradientKernel &) = delete;
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- CLGradientKernel &operator=(const CLGradientKernel &) = delete;
- /** Initialise the kernel's sources, destinations and border mode.
- *
- * @note gx, gy and magnitude must all use the same bit depth (either 16 or 32 bits).
- *
- * @param[in] gx Source tensor - Gx component. Data types supported: S16/S32.
- * @param[in] gy Source tensor - Gy component. Data types supported: Same as gx.
- * @param[out] magnitude Destination tensor - Magnitude. Data types supported: U16/U32. Must match the pixel size of gx, gy.
- * @param[out] phase Destination tensor - Quantized phase. Data types supported: U8.
- * @param[in] norm_type Normalization type. If 1, L1-Norm is used, otherwise L2-Norm.
- */
- void configure(const ICLTensor *gx, const ICLTensor *gy, ICLTensor *magnitude, ICLTensor *phase, int32_t norm_type);
- /** Initialise the kernel's sources, destinations and border mode.
- *
- * @note gx, gy and magnitude must all use the same bit depth (either 16 or 32 bits).
- *
- * @param[in] compile_context The compile context to be used.
- * @param[in] gx Source tensor - Gx component. Data types supported: S16/S32.
- * @param[in] gy Source tensor - Gy component. Data types supported: Same as gx.
- * @param[out] magnitude Destination tensor - Magnitude. Data types supported: U16/U32. Must match the pixel size of gx, gy.
- * @param[out] phase Destination tensor - Quantized phase. Data types supported: U8.
- * @param[in] norm_type Normalization type. If 1, L1-Norm is used, otherwise L2-Norm.
- */
- void configure(const CLCompileContext &compile_context, const ICLTensor *gx, const ICLTensor *gy, ICLTensor *magnitude, ICLTensor *phase, int32_t norm_type);
-
- // Inherited methods overridden:
- void run(const Window &window, cl::CommandQueue &queue) override;
-
-private:
- const ICLTensor *_gx; /**< Source tensor - Gx component */
- const ICLTensor *_gy; /**< Source tensor - Gy component */
- ICLTensor *_magnitude; /**< Destination tensor - Magnitude */
- ICLTensor *_phase; /**< Destination tensor - Quantized phase */
-};
-
-/** OpenCL kernel to perform Non-Maxima suppression for Canny Edge.
- *
- * @note This kernel is meant to be used alongside CannyEdge and performs a non-maxima suppression using magnitude and phase of input
- * to characterize points as possible edges. The output buffer needs to be cleared before this kernel is executed.
- *
- * @note Hysteresis is computed in @ref CLEdgeTraceKernel
- */
-class CLEdgeNonMaxSuppressionKernel : public ICLKernel
-{
-public:
- /** Constructor */
- CLEdgeNonMaxSuppressionKernel();
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- CLEdgeNonMaxSuppressionKernel(const CLEdgeNonMaxSuppressionKernel &) = delete;
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- CLEdgeNonMaxSuppressionKernel &operator=(const CLEdgeNonMaxSuppressionKernel &) = delete;
- /** Initialise the kernel's sources, destination and border mode.
- *
- * @param[in] magnitude Source tensor - Magnitude. Data types supported: U16/U32.
- * @param[in] phase Source tensor - Quantized phase. Data types supported: U8.
- * @param[out] output Destination tensor. Data types supported: U16/U32.
- * @param[in] lower_thr Lower threshold.
- * @param[in] border_undefined True if the border mode is undefined. False if it's replicate or constant.
- */
- void configure(const ICLTensor *magnitude, const ICLTensor *phase, ICLTensor *output, int32_t lower_thr, bool border_undefined);
- /** Initialise the kernel's sources, destination and border mode.
- *
- * @param[in] compile_context The compile context to be used.
- * @param[in] magnitude Source tensor - Magnitude. Data types supported: U16/U32.
- * @param[in] phase Source tensor - Quantized phase. Data types supported: U8.
- * @param[out] output Destination tensor. Data types supported: U16/U32.
- * @param[in] lower_thr Lower threshold.
- * @param[in] border_undefined True if the border mode is undefined. False if it's replicate or constant.
- */
- void configure(const CLCompileContext &compile_context, const ICLTensor *magnitude, const ICLTensor *phase, ICLTensor *output, int32_t lower_thr, bool border_undefined);
-
- // Inherited methods overridden:
- void run(const Window &window, cl::CommandQueue &queue) override;
- BorderSize border_size() const override;
-
-private:
- const ICLTensor *_magnitude; /**< Source tensor - Magnitude. */
- const ICLTensor *_phase; /**< Source tensor - Quantized phase. */
- ICLTensor *_output; /**< Destination tensor. */
-};
-
-/** OpenCL kernel to perform Edge tracing.
- */
-class CLEdgeTraceKernel : public ICLKernel
-{
-public:
- /** Constructor */
- CLEdgeTraceKernel();
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- CLEdgeTraceKernel(const CLEdgeTraceKernel &) = delete;
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- CLEdgeTraceKernel &operator=(const CLEdgeTraceKernel &) = delete;
- /** Initialise the kernel's source, destination and border mode.
- *
- * @param[in] input Source tensor. Data types supported: U8.
- * @param[out] output Destination tensor. Data types supported: U8.
- * @param[in] upper_thr Upper threshold used for the hysteresis
- * @param[in] lower_thr Lower threshold used for the hysteresis
- * @param[in,out] visited Tensor for keeping the visited pixels. Data types supported: U32.
- * Expected to be initialized to 0 before each run.
- * @param[in,out] recorded Tensor for keeping the recorded pixels. Data types supported: U32
- * Expected to be initialized to 0 before each run.
- * @param[in,out] l1_stack Tensor with the L1 stack for each pixel. Data types supported: S32.
- * Expected to be initialized to 0 before each run.
- * @param[in,out] l1_stack_counter Tensor for counting the elements in the L1 stack of each pixel. Data types supported: U8.
- * Expected to be initialized to 0 before each run.
- */
- void configure(const ICLTensor *input, ICLTensor *output, int32_t upper_thr, int32_t lower_thr,
- ICLTensor *visited, ICLTensor *recorded, ICLTensor *l1_stack, ICLTensor *l1_stack_counter);
- /** Initialise the kernel's source, destination and border mode.
- *
- * @param[in] compile_context The compile context to be used.
- * @param[in] input Source tensor. Data types supported: U8.
- * @param[out] output Destination tensor. Data types supported: U8.
- * @param[in] upper_thr Upper threshold used for the hysteresis
- * @param[in] lower_thr Lower threshold used for the hysteresis
- * @param[in,out] visited Tensor for keeping the visited pixels. Data types supported: U32.
- * Expected to be initialized to 0 before each run.
- * @param[in,out] recorded Tensor for keeping the recorded pixels. Data types supported: U32
- * Expected to be initialized to 0 before each run.
- * @param[in,out] l1_stack Tensor with the L1 stack for each pixel. Data types supported: S32.
- * Expected to be initialized to 0 before each run.
- * @param[in,out] l1_stack_counter Tensor for counting the elements in the L1 stack of each pixel. Data types supported: U8.
- * Expected to be initialized to 0 before each run.
- */
- void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, int32_t upper_thr, int32_t lower_thr,
- ICLTensor *visited, ICLTensor *recorded, ICLTensor *l1_stack, ICLTensor *l1_stack_counter);
-
- // Inherited methods overridden:
- void run(const Window &window, cl::CommandQueue &queue) override;
-
-private:
- const ICLTensor *_input; /**< Source tensor. */
- ICLTensor *_output; /**< Destination tensor. */
- int32_t _lower_thr; /**< Lower threshold used for the hysteresis. */
- int32_t _upper_thr; /**< Upper threshold used for the hysteresis. */
- ICLTensor *_visited; /**< Marks visited elements */
- ICLTensor *_recorded; /**< Marks recorded elements */
-    ICLTensor *_l1_stack; /**< L1 hysteresis stack */
-    ICLTensor *_l1_stack_counter; /**< L1 hysteresis stack counter */
-};
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_CLCANNYEDGEKERNEL_H */
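Taken together, the three kernels above form the Canny pipeline: gradient, then non-maxima suppression, then hysteresis tracing. A sketch of the ordering, assuming all tensors are pre-allocated, the suppression output and the four scratch tensors are zeroed before each run as the docs above require, and the thresholds are illustrative:

    CLGradientKernel grad;
    grad.configure(&gx, &gy, &mag, &phase, 1 /* L1-Norm */);
    CLEdgeNonMaxSuppressionKernel nonmax;
    nonmax.configure(&mag, &phase, &suppressed, 50 /* lower_thr */, false);
    CLEdgeTraceKernel trace;
    trace.configure(&suppressed, &dst, 100 /* upper_thr */, 50 /* lower_thr */,
                    &visited, &recorded, &l1_stack, &l1_stack_counter);
    CLScheduler::get().enqueue(grad, false);   // no flush between stages
    CLScheduler::get().enqueue(nonmax, false);
    CLScheduler::get().enqueue(trace);         // flush on the last stage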
diff --git a/arm_compute/core/CL/kernels/CLChannelCombineKernel.h b/arm_compute/core/CL/kernels/CLChannelCombineKernel.h
deleted file mode 100644
index 32ddf152c3..0000000000
--- a/arm_compute/core/CL/kernels/CLChannelCombineKernel.h
+++ /dev/null
@@ -1,102 +0,0 @@
-/*
- * Copyright (c) 2016-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CLCHANNELCOMBINEKERNEL_H
-#define ARM_COMPUTE_CLCHANNELCOMBINEKERNEL_H
-
-#include "arm_compute/core/CL/ICLKernel.h"
-
-#include <array>
-#include <cstdint>
-
-namespace arm_compute
-{
-class ICLMultiImage;
-class ICLTensor;
-using ICLImage = ICLTensor;
-
-/** Interface for the channel combine kernel */
-class CLChannelCombineKernel : public ICLKernel
-{
-public:
- /** Default constructor */
- CLChannelCombineKernel();
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- CLChannelCombineKernel(const CLChannelCombineKernel &) = delete;
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- CLChannelCombineKernel &operator=(const CLChannelCombineKernel &) = delete;
- /** Allow instances of this class to be moved */
- CLChannelCombineKernel(CLChannelCombineKernel &&) = default;
- /** Allow instances of this class to be moved */
- CLChannelCombineKernel &operator=(CLChannelCombineKernel &&) = default;
- /** Default destructor */
- ~CLChannelCombineKernel() = default;
- /** Configure function's inputs and outputs.
- *
- * @param[in] plane0 The 2D plane that forms channel 0. Must be of U8 format.
- * @param[in] plane1 The 2D plane that forms channel 1. Must be of U8 format.
- * @param[in] plane2 The 2D plane that forms channel 2. Must be of U8 format.
- * @param[in] plane3 The 2D plane that forms channel 3. Must be of U8 format.
- * @param[out] output The single planar output tensor.
- */
- void configure(const ICLTensor *plane0, const ICLTensor *plane1, const ICLTensor *plane2, const ICLTensor *plane3, ICLTensor *output);
- /** Configure function's inputs and outputs.
- *
- * @param[in] compile_context The compile context to be used.
- * @param[in] plane0 The 2D plane that forms channel 0. Must be of U8 format.
- * @param[in] plane1 The 2D plane that forms channel 1. Must be of U8 format.
- * @param[in] plane2 The 2D plane that forms channel 2. Must be of U8 format.
- * @param[in] plane3 The 2D plane that forms channel 3. Must be of U8 format.
- * @param[out] output The single planar output tensor.
- */
- void configure(const CLCompileContext &compile_context, const ICLTensor *plane0, const ICLTensor *plane1, const ICLTensor *plane2, const ICLTensor *plane3, ICLTensor *output);
- /** Configure function's inputs and outputs.
- *
- * @param[in] plane0 The 2D plane that forms channel 0. Must be of U8 format.
- * @param[in] plane1 The 2D plane that forms channel 1. Must be of U8 format.
- * @param[in] plane2 The 2D plane that forms channel 2. Must be of U8 format.
- * @param[out] output The multi planar output tensor.
- */
- void configure(const ICLImage *plane0, const ICLImage *plane1, const ICLImage *plane2, ICLMultiImage *output);
- /** Configure function's inputs and outputs.
- *
- * @param[in] compile_context The compile context to be used.
- * @param[in] plane0 The 2D plane that forms channel 0. Must be of U8 format.
- * @param[in] plane1 The 2D plane that forms channel 1. Must be of U8 format.
- * @param[in] plane2 The 2D plane that forms channel 2. Must be of U8 format.
- * @param[out] output The multi planar output tensor.
- */
- void configure(const CLCompileContext &compile_context, const ICLImage *plane0, const ICLImage *plane1, const ICLImage *plane2, ICLMultiImage *output);
-
- // Inherited methods overridden:
- void run(const Window &window, cl::CommandQueue &queue) override;
-
-private:
- std::array<const ICLTensor *, 4> _planes;
- ICLTensor *_output;
- ICLMultiImage *_output_multi;
- std::array<uint32_t, 3> _x_subsampling;
- std::array<uint32_t, 3> _y_subsampling;
-};
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_CLCHANNELCOMBINEKERNEL_H */
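A sketch of the single-planar overload above, combining three U8 planes into one RGB888 image. That the unused fourth plane may be passed as nullptr for three-channel formats is an assumption of this sketch, not something the header states:

    CLTensor r, g, b, rgb; // r/g/b: U8 planes; rgb: RGB888, all pre-allocated
    CLChannelCombineKernel combine;
    combine.configure(&r, &g, &b, nullptr /* plane3 unused for RGB888 (assumed) */, &rgb);
    CLScheduler::get().enqueue(combine);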
diff --git a/arm_compute/core/CL/kernels/CLChannelExtractKernel.h b/arm_compute/core/CL/kernels/CLChannelExtractKernel.h
deleted file mode 100644
index 6a0c4bb94e..0000000000
--- a/arm_compute/core/CL/kernels/CLChannelExtractKernel.h
+++ /dev/null
@@ -1,95 +0,0 @@
-/*
- * Copyright (c) 2016-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CLCHANNELEXTRACTKERNEL_H
-#define ARM_COMPUTE_CLCHANNELEXTRACTKERNEL_H
-
-#include "arm_compute/core/CL/ICLKernel.h"
-#include "arm_compute/core/Types.h"
-
-#include <cstdint>
-
-namespace arm_compute
-{
-class ICLMultiImage;
-class ICLTensor;
-using ICLImage = ICLTensor;
-
-/** Interface for the channel extract kernel */
-class CLChannelExtractKernel : public ICLKernel
-{
-public:
- /** Default constructor */
- CLChannelExtractKernel();
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- CLChannelExtractKernel(const CLChannelExtractKernel &) = delete;
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- CLChannelExtractKernel &operator=(const CLChannelExtractKernel &) = delete;
- /** Allow instances of this class to be moved */
- CLChannelExtractKernel(CLChannelExtractKernel &&) = default;
- /** Allow instances of this class to be moved */
- CLChannelExtractKernel &operator=(CLChannelExtractKernel &&) = default;
- /** Default destructor */
- ~CLChannelExtractKernel() = default;
- /** Set the input and output of the kernel
- *
- * @param[in] input Source tensor. Formats supported: RGB888/RGBA8888/YUYV422/UYVY422
- * @param[in] channel Channel to extract.
- * @param[out] output Destination tensor. Must be of U8 format.
- */
- void configure(const ICLTensor *input, Channel channel, ICLTensor *output);
- /** Set the input and output of the kernel
- *
- * @param[in] compile_context The compile context to be used.
- * @param[in] input Source tensor. Formats supported: RGB888/RGBA8888/YUYV422/UYVY422
- * @param[in] channel Channel to extract.
- * @param[out] output Destination tensor. Must be of U8 format.
- */
- void configure(const CLCompileContext &compile_context, const ICLTensor *input, Channel channel, ICLTensor *output);
- /** Set the input and output of the kernel
- *
- * @param[in] input Multi-planar source image. Formats supported: NV12/NV21/IYUV/YUV444
- * @param[in] channel Channel to extract.
- * @param[out] output Single-planar 2D destination image. Must be of U8 format.
- */
- void configure(const ICLMultiImage *input, Channel channel, ICLImage *output);
- /** Set the input and output of the kernel
- *
- * @param[in] compile_context The compile context to be used.
- * @param[in] input Multi-planar source image. Formats supported: NV12/NV21/IYUV/YUV444
- * @param[in] channel Channel to extract.
- * @param[out] output Single-planar 2D destination image. Must be of U8 format.
- */
- void configure(const CLCompileContext &compile_context, const ICLMultiImage *input, Channel channel, ICLImage *output);
-
- // Inherited methods overridden:
- void run(const Window &window, cl::CommandQueue &queue) override;
-
-private:
- const ICLTensor *_input;
- ICLTensor *_output;
- uint32_t _num_elems_processed_per_iteration;
- uint32_t _subsampling;
-};
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_CLCHANNELEXTRACTKERNEL_H */
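The mirror of channel combine; a short sketch extracting the red plane of an RGB888 tensor (Channel comes from arm_compute/core/Types.h; setup elided):

    CLTensor rgb, red; // rgb: RGB888 source; red: U8 destination
    CLChannelExtractKernel extract;
    extract.configure(&rgb, Channel::R, &red);
    CLScheduler::get().enqueue(extract);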
diff --git a/arm_compute/core/CL/kernels/CLChannelShuffleLayerKernel.h b/arm_compute/core/CL/kernels/CLChannelShuffleLayerKernel.h
deleted file mode 100644
index 14b59d325f..0000000000
--- a/arm_compute/core/CL/kernels/CLChannelShuffleLayerKernel.h
+++ /dev/null
@@ -1,82 +0,0 @@
-/*
- * Copyright (c) 2018-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CLCHANNELSHUFFLELAYERKERNEL_H
-#define ARM_COMPUTE_CLCHANNELSHUFFLELAYERKERNEL_H
-
-#include "arm_compute/core/CL/ICLKernel.h"
-
-namespace arm_compute
-{
-class ICLTensor;
-
-/** Interface for the channel shuffle kernel */
-class CLChannelShuffleLayerKernel : public ICLKernel
-{
-public:
- /** Default constructor */
- CLChannelShuffleLayerKernel();
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- CLChannelShuffleLayerKernel(const CLChannelShuffleLayerKernel &) = delete;
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- CLChannelShuffleLayerKernel &operator=(const CLChannelShuffleLayerKernel &) = delete;
- /** Allow instances of this class to be moved */
- CLChannelShuffleLayerKernel(CLChannelShuffleLayerKernel &&) = default;
- /** Allow instances of this class to be moved */
- CLChannelShuffleLayerKernel &operator=(CLChannelShuffleLayerKernel &&) = default;
- /** Default destructor */
- ~CLChannelShuffleLayerKernel() = default;
- /** Configure function's inputs and outputs.
- *
- * @param[in] input Input tensor. Data types supported: All.
- * @param[out] output Output tensor. Data type supported: Same as @p input
- * @param[in] num_groups Number of groups. Must be greater than 1 and the number of channels of the tensors must be a multiple of the number of groups.
- */
- void configure(const ICLTensor *input, ICLTensor *output, unsigned int num_groups);
- /** Configure function's inputs and outputs.
- *
- * @param[in] compile_context The compile context to be used.
- * @param[in] input Input tensor. Data types supported: All.
- * @param[out] output Output tensor. Data type supported: Same as @p input
- * @param[in] num_groups Number of groups. Must be greater than 1 and the number of channels of the tensors must be a multiple of the number of groups.
- */
- void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, unsigned int num_groups);
- /** Static function to check if given info will lead to a valid configuration of @ref CLChannelShuffleLayerKernel
- *
- * @param[in] input Input tensor info. Data types supported: All.
- * @param[in] output Output tensor info. Data type supported: Same as @p input
- * @param[in] num_groups Number of groups. Must be greater than 1 and the number of channels of the tensors must be a multiple of the number of groups.
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *input, const ITensorInfo *output, unsigned int num_groups);
-
- // Inherited methods overridden:
- void run(const Window &window, cl::CommandQueue &queue) override;
-
-private:
- const ICLTensor *_input;
- ICLTensor *_output;
-};
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_CLCHANNELSHUFFLELAYERKERNEL_H */
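A sketch for the shuffle interface above; the 8-channel input split into 4 groups is illustrative and must satisfy the divisibility constraint the docs state:

    CLTensor src, dst; // e.g. NCHW tensors with 8 channels
    ARM_COMPUTE_ERROR_THROW_ON(
        CLChannelShuffleLayerKernel::validate(src.info(), dst.info(), 4));
    CLChannelShuffleLayerKernel shuffle;
    shuffle.configure(&src, &dst, 4 /* num_groups: 8 channels -> 4 groups of 2 */);
    CLScheduler::get().enqueue(shuffle);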
diff --git a/arm_compute/core/CL/kernels/CLCol2ImKernel.h b/arm_compute/core/CL/kernels/CLCol2ImKernel.h
deleted file mode 100644
index d0528ed21a..0000000000
--- a/arm_compute/core/CL/kernels/CLCol2ImKernel.h
+++ /dev/null
@@ -1,106 +0,0 @@
-/*
- * Copyright (c) 2017-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CLCOL2IMKERNEL_H
-#define ARM_COMPUTE_CLCOL2IMKERNEL_H
-
-#include "arm_compute/core/CL/ICLKernel.h"
-
-namespace arm_compute
-{
-class ICLTensor;
-
-/** Interface for the col2im reshaping kernel.
- *
- * Rearranges each matrix column into image blocks. It's the inverse operation of @ref CLIm2ColKernel.
- *
- * For example, a vector of 9 elements can be reshaped to a block (image) of 3x3:
- *
- * @f[
- * \left( \begin{array}{ccccccccc}
- * a0 & a1 & a2 & a3 & a4 & a5 & a6 & a7 & a8 \\
- * \end{array} \right)
- * \rightarrow
- * \left( \begin{array}{ccc}
- * a0 & a1 & a2 \\
- * a3 & a4 & a5 \\
- * a6 & a7 & a8 \\
- * \end{array} \right)
- * @f]
- */
-class CLCol2ImKernel : public ICLKernel
-{
-public:
- /** Default constructor */
- CLCol2ImKernel();
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- CLCol2ImKernel(const CLCol2ImKernel &) = delete;
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- CLCol2ImKernel &operator=(const CLCol2ImKernel &) = delete;
- /** Allow instances of this class to be moved */
- CLCol2ImKernel(CLCol2ImKernel &&) = default;
- /** Allow instances of this class to be moved */
- CLCol2ImKernel &operator=(CLCol2ImKernel &&) = default;
- /** Default destructor */
- ~CLCol2ImKernel() = default;
- /** Set the input and output of the kernel.
- *
- * @param[in] input The input tensor to convert. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32
- * @param[out] output The output tensor. 3 lower dimensions represent a single output [width, height, OFM],
- * while the rest represent batch of outputs. Data types supported: Same as @p input. Data layout: NCHW
- * @param[in] convolved_dims Output convolved dimensions.
- * @param[in] num_groups (Optional) Number of groups when performing a grouped convolution
- */
- void configure(const ICLTensor *input, ICLTensor *output, const Size2D &convolved_dims, unsigned int num_groups = 1);
- /** Set the input and output of the kernel.
- *
- * @param[in] compile_context The compile context to be used.
- * @param[in] input The input tensor to convert. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32
- * @param[out] output The output tensor. 3 lower dimensions represent a single output [width, height, OFM],
- * while the rest represent batch of outputs. Data types supported: Same as @p input. Data layout: NCHW
- * @param[in] convolved_dims Output convolved dimensions.
- * @param[in] num_groups (Optional) Number of groups when performing a grouped convolution
- */
- void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, const Size2D &convolved_dims, unsigned int num_groups = 1);
- /** Static function to check if given info will lead to a valid configuration of @ref CLCol2ImKernel
- *
- * @param[in] input The input tensor to convert. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32
- * @param[in] output The output tensor. 3 lower dimensions represent a single output [width, height, OFM],
- * while the rest represent batch of outputs. Data types supported: Same as @p input. Data layout: NCHW
- * @param[in] convolved_dims Output convolved dimensions.
- * @param[in] num_groups (Optional) Number of groups when performing a grouped convolution
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *input, const ITensorInfo *output, const Size2D &convolved_dims, unsigned int num_groups = 1);
-
- // Inherited methods overridden:
- void run(const Window &window, cl::CommandQueue &queue) override;
-
-public:
- const ICLTensor *_input;
- ICLTensor *_output;
- Size2D _convolved_dims;
-};
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_CLCOL2IMKERNEL_H */
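In a GEMM-based convolution this kernel folds the GEMM output columns back into the [width, height, OFM] image layout; a sketch under an assumed 28x28 convolved output, with tensor setup elided:

    CLTensor gemm_out, conv_out; // conv_out laid out as [28, 28, OFM], NCHW
    ARM_COMPUTE_ERROR_THROW_ON(CLCol2ImKernel::validate(
        gemm_out.info(), conv_out.info(), Size2D(28U, 28U)));
    CLCol2ImKernel col2im;
    col2im.configure(&gemm_out, &conv_out, Size2D(28U, 28U)); // convolved_dims = 28x28
    CLScheduler::get().enqueue(col2im);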
diff --git a/arm_compute/core/CL/kernels/CLColorConvertKernel.h b/arm_compute/core/CL/kernels/CLColorConvertKernel.h
deleted file mode 100644
index 2bcd141863..0000000000
--- a/arm_compute/core/CL/kernels/CLColorConvertKernel.h
+++ /dev/null
@@ -1,121 +0,0 @@
-/*
- * Copyright (c) 2016-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CLCOLORCONVERTKERNEL_H
-#define ARM_COMPUTE_CLCOLORCONVERTKERNEL_H
-
-#include "arm_compute/core/CL/ICLKernel.h"
-
-namespace arm_compute
-{
-class ICLMultiImage;
-class ICLTensor;
-using ICLImage = ICLTensor;
-
-/** Interface for the color convert kernel.
- *
- */
-class CLColorConvertKernel : public ICLKernel
-{
-public:
- /** Default constructor. */
- CLColorConvertKernel();
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- CLColorConvertKernel(const CLColorConvertKernel &) = delete;
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- CLColorConvertKernel &operator=(const CLColorConvertKernel &) = delete;
- /** Allow instances of this class to be moved */
- CLColorConvertKernel(CLColorConvertKernel &&) = default;
- /** Allow instances of this class to be moved */
- CLColorConvertKernel &operator=(CLColorConvertKernel &&) = default;
- /** Default destructor. */
- ~CLColorConvertKernel() = default;
-
- /** Set the input and output of the kernel
- *
- * @param[in] input Source tensor. Formats supported: RGBA8888/UYVY422/YUYV422/RGB888
- * @param[out] output Destination tensor. Formats supported: RGB888 (if the formats of @p input are RGBA8888/UYVY422/YUYV422),
- * RGBA8888 (if the formats of @p input are UYVY422/YUYV422/RGB888),
- * U8 (if the formats of @p input is RGB888)
- */
- void configure(const ICLTensor *input, ICLTensor *output);
- /** Set the input and output of the kernel
- *
- * @param[in] compile_context The compile context to be used.
- * @param[in] input Source tensor. Formats supported: RGBA8888/UYVY422/YUYV422/RGB888
- * @param[out] output Destination tensor. Formats supported: RGB888 (if the formats of @p input are RGBA8888/UYVY422/YUYV422),
- * RGBA8888 (if the formats of @p input are UYVY422/YUYV422/RGB888),
- * U8 (if the formats of @p input is RGB888)
- */
- void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output);
- /** Set the input and output of the kernel
- *
- * @param[in] input Multi-planar source image. Formats supported: NV12/NV21/IYUV
- * @param[out] output Single-planar destination image. Formats supported: RGB888/RGBA8888
- */
- void configure(const ICLMultiImage *input, ICLImage *output);
- /** Set the input and output of the kernel
- *
- * @param[in] compile_context The compile context to be used.
- * @param[in] input Multi-planar source image. Formats supported: NV12/NV21/IYUV
- * @param[out] output Single-planar destination image. Formats supported: RGB888/RGBA8888
- */
- void configure(const CLCompileContext &compile_context, const ICLMultiImage *input, ICLImage *output);
- /** Set the input and output of the kernel
- *
- * @param[in] input Single-planar source image. Formats supported: RGB888/RGBA8888/UYVY422/YUYV422
- * @param[out] output Multi-planar destination image. Formats supported: NV12/IYUV/YUV444 (if the formats of @p input are RGB888/RGBA8888)
- */
- void configure(const ICLImage *input, ICLMultiImage *output);
- /** Set the input and output of the kernel
- *
- * @param[in] compile_context The compile context to be used.
- * @param[in] input Single-planar source image. Formats supported: RGB888/RGBA8888/UYVY422/YUYV422
- * @param[out] output Multi-planar destination image. Formats supported: NV12/IYUV/YUV444 (if the formats of @p input are RGB888/RGBA8888)
- */
- void configure(const CLCompileContext &compile_context, const ICLImage *input, ICLMultiImage *output);
- /** Set the input and output of the kernel
- *
- * @param[in] input Multi-planar source image. Formats supported: NV12/NV21/IYUV
- * @param[out] output Multi-planar destination image. Formats supported: YUV444/IYUV (if the formats of @p input are NV12/NV21)/NV12 (if the format of @p input is IYUV)
- */
- void configure(const ICLMultiImage *input, ICLMultiImage *output);
- /** Set the input and output of the kernel
- *
- * @param[in] compile_context The compile context to be used.
- * @param[in] input Multi-planar source image. Formats supported: NV12/NV21/IYUV
- * @param[out] output Multi-planar destination image. Formats supported: YUV444/IYUV (if the formats of @p input are NV12/NV21)/NV12 (if the format of @p input is IYUV)
- */
- void configure(const CLCompileContext &compile_context, const ICLMultiImage *input, ICLMultiImage *output);
-
- // Inherited methods overridden:
- void run(const Window &window, cl::CommandQueue &queue) override;
-
-private:
- const ICLTensor *_input; /**< Pointer to the single-planar tensor input */
- ICLTensor *_output; /**< Pointer to the single-planar tensor output */
- const ICLMultiImage *_multi_input; /**< Pointer to the multi-planar input */
- ICLMultiImage *_multi_output; /**< Pointer to the multi-planar output */
-};
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_CLCOLORCONVERTKERNEL_H */
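
A minimal configure-and-run sketch for the single-planar overload above, assuming an OpenCL-capable device and the pre-removal include path; image size and formats are illustrative:

#include "arm_compute/core/CL/kernels/CLColorConvertKernel.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
#include "arm_compute/runtime/CL/CLTensor.h"

using namespace arm_compute;

int main()
{
    CLScheduler::get().default_init(); // default CL context and queue

    CLTensor src, dst;
    src.allocator()->init(TensorInfo(TensorShape(64U, 48U), Format::RGBA8888));
    dst.allocator()->init(TensorInfo(TensorShape(64U, 48U), Format::RGB888));

    CLColorConvertKernel convert;
    convert.configure(&src, &dst); // resolves to the RGBA8888 -> RGB888 path

    src.allocator()->allocate(); // pixel data left unfilled for brevity
    dst.allocator()->allocate();

    CLScheduler::get().enqueue(convert);
    CLScheduler::get().sync();
    return 0;
}
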
diff --git a/arm_compute/core/CL/kernels/CLComparisonKernel.h b/arm_compute/core/CL/kernels/CLComparisonKernel.h
deleted file mode 100644
index d5c5297c61..0000000000
--- a/arm_compute/core/CL/kernels/CLComparisonKernel.h
+++ /dev/null
@@ -1,89 +0,0 @@
-/*
- * Copyright (c) 2018-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CLCOMPARISONKERNEL_H
-#define ARM_COMPUTE_CLCOMPARISONKERNEL_H
-
-#include "arm_compute/core/CL/ICLKernel.h"
-#include "arm_compute/core/Types.h"
-
-namespace arm_compute
-{
-// Forward declarations
-class ICLTensor;
-
-/** Interface for the comparison kernel. */
-class CLComparisonKernel : public ICLKernel
-{
-public:
- /** Default constructor. */
- CLComparisonKernel();
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- CLComparisonKernel(const CLComparisonKernel &) = delete;
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- CLComparisonKernel &operator=(const CLComparisonKernel &) = delete;
- /** Allow instances of this class to be moved */
- CLComparisonKernel(CLComparisonKernel &&) = default;
- /** Allow instances of this class to be moved */
- CLComparisonKernel &operator=(CLComparisonKernel &&) = default;
- /** Default destructor */
- ~CLComparisonKernel() = default;
- /** Set the inputs and output tensors
- *
- * @param[in] input1 Source tensor. Data types supported: All.
- * @param[in] input2 Source tensor. Data types supported: Same as @p input1.
- * @param[out] output Destination tensor. Data types supported: U8.
- * @param[in] operation Comparison operation to use.
- */
- void configure(const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output, ComparisonOperation operation);
- /** Set the inputs and output tensors
- *
- * @param[in] compile_context The compile context to be used.
- * @param[in] input1 Source tensor. Data types supported: All.
- * @param[in] input2 Source tensor. Data types supported: Same as @p input1.
- * @param[out] output Destination tensor. Data types supported: U8.
- * @param[in] operation Comparison operation to use.
- */
- void configure(const CLCompileContext &compile_context, const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output, ComparisonOperation operation);
- /** Static function to check if given info will lead to a valid configuration of @ref CLComparisonKernel
- *
- * @param[in] input1 Source tensor. Data types supported: All.
- * @param[in] input2 Source tensor. Data types supported: Same as @p input1.
- * @param[in] output Destination tensor. Data types supported: U8.
- * @param[in] operation Comparison operation to use.
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, ComparisonOperation operation);
-
- // Inherited methods overridden:
- void run(const Window &window, cl::CommandQueue &queue) override;
- BorderSize border_size() const override;
-
-private:
- const ICLTensor *_input1; /**< Source tensor 1 */
- const ICLTensor *_input2; /**< Source tensor 2 */
- ICLTensor *_output; /**< Destination tensor */
-};
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_CLCOMPARISONKERNEL_H */
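
A minimal validate() sketch for the interface above; note the destination is always U8 regardless of the input type. Shapes are illustrative, and no OpenCL context is required for the check:

#include "arm_compute/core/CL/kernels/CLComparisonKernel.h"
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Types.h"

using namespace arm_compute;

int main()
{
    const TensorInfo lhs(TensorShape(16U, 8U), 1, DataType::F32);
    const TensorInfo rhs(TensorShape(16U, 8U), 1, DataType::F32);
    const TensorInfo out(TensorShape(16U, 8U), 1, DataType::U8); // comparison outputs are always U8

    const Status status = CLComparisonKernel::validate(&lhs, &rhs, &out, ComparisonOperation::Greater);
    return bool(status) ? 0 : 1;
}
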
diff --git a/arm_compute/core/CL/kernels/CLConvertFullyConnectedWeightsKernel.h b/arm_compute/core/CL/kernels/CLConvertFullyConnectedWeightsKernel.h
deleted file mode 100644
index d3e57a6738..0000000000
--- a/arm_compute/core/CL/kernels/CLConvertFullyConnectedWeightsKernel.h
+++ /dev/null
@@ -1,90 +0,0 @@
-/*
- * Copyright (c) 2018-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CLCONVERTFULLYCONNECTEDWEIGHTSKERNEL_H
-#define ARM_COMPUTE_CLCONVERTFULLYCONNECTEDWEIGHTSKERNEL_H
-
-#include "arm_compute/core/CL/ICLKernel.h"
-
-namespace arm_compute
-{
-class ICLTensor;
-
-/** Interface to convert the 2D Fully Connected weights from NCHW to NHWC or vice versa.
- *
- * @note This function can be applied to the 2D weights used by a Fully Connected layer if:
- * - It follows a Convolution layer
- * - The data layout used by the network does not match the one the model has been trained in.
- *
- * @note This function assumes the weights are already reshaped (transposed)
- */
-class CLConvertFullyConnectedWeightsKernel : public ICLKernel
-{
-public:
- /** Default constructor */
- CLConvertFullyConnectedWeightsKernel();
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- CLConvertFullyConnectedWeightsKernel(const CLConvertFullyConnectedWeightsKernel &) = delete;
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- CLConvertFullyConnectedWeightsKernel &operator=(const CLConvertFullyConnectedWeightsKernel &) = delete;
- /** Allow instances of this class to be moved */
- CLConvertFullyConnectedWeightsKernel(CLConvertFullyConnectedWeightsKernel &&) = default;
- /** Allow instances of this class to be moved */
- CLConvertFullyConnectedWeightsKernel &operator=(CLConvertFullyConnectedWeightsKernel &&) = default;
- /** Default destructor */
- ~CLConvertFullyConnectedWeightsKernel() = default;
- /** Set the input and output tensor.
- *
- * @param[in] input Source weights tensor to convert. Must be 2 dimensional. Data types supported: All.
- * @param[out] output The converted weights tensor. Shape and Data Type: Same as @p input.
- * @param[in] original_input_shape Shape of the original input tensor (the one entering fully connected layer).
- * @param[in] data_layout The data layout the weights have been trained in.
- */
- void configure(const ICLTensor *input, ICLTensor *output, const TensorShape &original_input_shape, DataLayout data_layout);
- /** Set the input and output tensor.
- *
- * @param[in] compile_context The compile context to be used.
- * @param[in] input Source weights tensor to convert. Must be 2 dimensional. Data types supported: All.
- * @param[out] output The converted weights tensor. Shape and Data Type: Same as @p input.
- * @param[in] original_input_shape Shape of the original input tensor (the one entering fully connected layer).
- * @param[in] data_layout The data layout the weights have been trained in.
- */
- void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, const TensorShape &original_input_shape, DataLayout data_layout);
- /** Static function to check if given info will lead to a valid configuration of @ref CLConvertFullyConnectedWeightsKernel
- *
- * @param[in] input Source weights tensor info to convert. Must be 2 dimensional. Data types supported: All.
- * @param[in] output The converted weights tensor info. Shape and Data Type: Same as @p input.
- * @param[in] original_input_shape Shape of the original input tensor (the one entering fully connected layer).
- * @param[in] data_layout The data layout the weights have been trained in.
- */
- static Status validate(const ITensorInfo *input, const ITensorInfo *output, const TensorShape &original_input_shape, DataLayout data_layout);
-
- // Inherited methods overridden:
- void run(const Window &window, cl::CommandQueue &queue) override;
-
-private:
- const ICLTensor *_input;
- ICLTensor *_output;
-};
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_CLCONVERTFULLYCONNECTEDWEIGHTSKERNEL_H */
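
A minimal validate() sketch for the interface above; the 2D weight shape and the original 4x4x8 input shape are illustrative assumptions, not values from this patch:

#include "arm_compute/core/CL/kernels/CLConvertFullyConnectedWeightsKernel.h"
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Types.h"

using namespace arm_compute;

int main()
{
    // Fully connected layer fed by a 4x4x8 feature map: 4 * 4 * 8 = 128 inputs, 10 outputs.
    // The [outputs, inputs] orientation below is an illustrative assumption.
    const TensorInfo weights(TensorShape(10U, 128U), 1, DataType::F32);
    const TensorInfo converted(TensorShape(10U, 128U), 1, DataType::F32);

    const Status status = CLConvertFullyConnectedWeightsKernel::validate(
        &weights, &converted, TensorShape(4U, 4U, 8U), DataLayout::NCHW);
    return bool(status) ? 0 : 1;
}
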
diff --git a/arm_compute/core/CL/kernels/CLConvolutionKernel.h b/arm_compute/core/CL/kernels/CLConvolutionKernel.h
deleted file mode 100644
index b6fe51dbaa..0000000000
--- a/arm_compute/core/CL/kernels/CLConvolutionKernel.h
+++ /dev/null
@@ -1,224 +0,0 @@
-/*
- * Copyright (c) 2016-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CLCONVOLUTIONKERNEL_H
-#define ARM_COMPUTE_CLCONVOLUTIONKERNEL_H
-
-#include "arm_compute/core/CL/ICLSimple2DKernel.h"
-
-#include <cstdint>
-
-namespace arm_compute
-{
-class ICLTensor;
-
-/****************************************************************************************\
- * Square Convolution *
-\****************************************************************************************/
-
-/** Interface for the kernel to run an arbitrary size convolution on a tensor. (Currently supports 3x3, 5x5, 7x7 and 9x9).
- * The client can supply a convolution matrix \f$ C_{m,n} \f$.
- * @f{eqnarray}{
- * k_0 &=& \frac{m}{2} \\
- * l_0 &=& \frac{n}{2} \\
- * sum &=& \sum_{k=0,l=0}^{k=m-1,l=n-1} input(x+k-k_0, y+l-l_0) C_{k,l}
- * @f}
- *
- * @note The above equation for this function is similar to the default OpenCV Filter2D function,
- * which actually computes a correlation rather than a convolution.
- * For a true convolution, the convolution matrix must be flipped both horizontally and vertically.
- */
-template <unsigned int matrix_size>
-class CLConvolutionKernel : public ICLSimple2DKernel
-{
-public:
- /** Initialise the kernel's input, output and border mode.
- *
- * @param[in] input Source tensor. Data types supported: U8.
- * @param[out] output Destination tensor. Data types supported: U8, S16.
- * @param[in] conv Convolution matrix to apply to the input tensor.
- * @param[in] scale Scale of the convolution matrix. If 0 is passed, it will be set to the sum of the coefficients of the convolution or 1 if they add up to 0.
- * @param[in] border_undefined True if the border mode is undefined. False if it's replicate or constant.
- */
- void configure(const ICLTensor *input, ICLTensor *output, const int16_t *conv, uint32_t scale, bool border_undefined);
- /** Initialise the kernel's input, output and border mode.
- *
- * @param[in] compile_context The compile context to be used.
- * @param[in] input Source tensor. Data types supported: U8.
- * @param[out] output Destination tensor. Data types supported: U8, S16.
- * @param[in] conv Convolution matrix to apply to the input tensor.
- * @param[in] scale Scale of the convolution matrix. If 0 is passed, it will be set to the sum of the coefficients of the convolution or 1 if they add up to 0.
- * @param[in] border_undefined True if the border mode is undefined. False if it's replicate or constant.
- */
- void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, const int16_t *conv, uint32_t scale, bool border_undefined);
-
- // Inherited methods overridden:
- BorderSize border_size() const override;
-};
-
-/** Interface for the kernel which applies a 3x3 convolution to a tensor. */
-using CLConvolution3x3Kernel = CLConvolutionKernel<3>;
-/** Interface for the kernel which applies a 5x5 convolution to a tensor. */
-using CLConvolution5x5Kernel = CLConvolutionKernel<5>;
-/** Interface for the kernel which applies a 7x7 convolution to a tensor. */
-using CLConvolution7x7Kernel = CLConvolutionKernel<7>;
-/** Interface for the kernel which applies a 9x9 convolution to a tensor. */
-using CLConvolution9x9Kernel = CLConvolutionKernel<9>;
-
-/****************************************************************************************\
- * Separable Square Convolution *
-\****************************************************************************************/
-
-/** Kernel for the Horizontal pass of a Separable Convolution. Currently supports 5x5, 7x7, 9x9 */
-template <unsigned int matrix_size>
-class CLSeparableConvolutionHorKernel : public ICLSimple2DKernel
-{
-public:
- /** Default Constructor */
- CLSeparableConvolutionHorKernel();
- /** Initialise the kernel's input, output and border mode.
- *
- * @param[in] input Source tensor. Data types supported: U8.
- * @param[out] output Destination tensor. Data types supported: S16.
- * @param[in] conv Convolution matrix to apply to the input tensor.
- * @param[in] border_undefined True if the border mode is undefined. False if it's replicate or constant.
- */
- void configure(const ICLTensor *input, ICLTensor *output, const int16_t *conv, bool border_undefined);
- /** Initialise the kernel's input, output and border mode.
- *
- * @param[in] compile_context The compile context to be used.
- * @param[in] input Source tensor. Data types supported: U8.
- * @param[out] output Destination tensor. Data types supported: S16.
- * @param[in] conv Convolution matrix to apply to the input tensor.
- * @param[in] border_undefined True if the border mode is undefined. False if it's replicate or constant.
- */
- void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, const int16_t *conv, bool border_undefined);
-
- // Inherited methods overridden:
- BorderSize border_size() const override;
-
-private:
- BorderSize _border_size; /**< Border size */
-};
-
-/** Interface for the kernel which applies a horizontal pass of 5x5 convolution to a tensor. */
-using CLSeparableConvolution5x5HorKernel = CLSeparableConvolutionHorKernel<5>;
-/** Interface for the kernel which applies a horizontal pass of 7x7 convolution to a tensor. */
-using CLSeparableConvolution7x7HorKernel = CLSeparableConvolutionHorKernel<7>;
-/** Interface for the kernel which applies a horizontal pass of 9x9 convolution to a tensor. */
-using CLSeparableConvolution9x9HorKernel = CLSeparableConvolutionHorKernel<9>;
-
-/** Kernel for the Vertical pass of a Separable Convolution. Currently supports 5x5, 7x7, 9x9 */
-template <unsigned int matrix_size>
-class CLSeparableConvolutionVertKernel : public ICLSimple2DKernel
-{
-public:
- /** Initialise the kernel's input, output and border mode.
- *
- * @param[in] input Source tensor. Data types supported: S16.
- * @param[out] output Destination tensor. Data types supported: U8, S16.
- * @param[in] conv Convolution matrix to apply to the input tensor.
- * @param[in] scale Scale of the convolution matrix.
- * @param[in] border_undefined True if the border mode is undefined. False if it's replicate or constant.
- * @param[in] data_type Data type to use for intermediate result. @sa data_type_for_convolution
- */
- void configure(const ICLTensor *input, ICLTensor *output, const int16_t *conv, uint32_t scale, bool border_undefined, DataType data_type = DataType::S32);
- /** Initialise the kernel's input, output and border mode.
- *
- * @param[in] compile_context The compile context to be used.
- * @param[in] input Source tensor. Data types supported: S16.
- * @param[out] output Destination tensor. Data types supported: U8, S16.
- * @param[in] conv Convolution matrix to apply to the input tensor.
- * @param[in] scale Scale of the convolution matrix.
- * @param[in] border_undefined True if the border mode is undefined. False if it's replicate or constant.
- * @param[in] data_type Data type to use for intermediate result. @sa data_type_for_convolution
- */
- void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, const int16_t *conv, uint32_t scale, bool border_undefined, DataType data_type = DataType::S32);
-
- // Inherited methods overridden:
- BorderSize border_size() const override;
-};
-
-/** Interface for the kernel which applies a vertical pass of 5x5 convolution to a tensor. */
-using CLSeparableConvolution5x5VertKernel = CLSeparableConvolutionVertKernel<5>;
-/** Interface for the kernel which applies a vertical pass of 7x7 convolution to a tensor. */
-using CLSeparableConvolution7x7VertKernel = CLSeparableConvolutionVertKernel<7>;
-/** Interface for the kernel which applies a vertical pass of 9x9 convolution to a tensor. */
-using CLSeparableConvolution9x9VertKernel = CLSeparableConvolutionVertKernel<9>;
-
-/****************************************************************************************\
- * Rectangle Convolution *
-\****************************************************************************************/
-
-/** Kernel for running a convolution on a rectangular matrix.
- *
- * @note Supports dimension combinations of 3, 5, 7 and 9.
- */
-class CLConvolutionRectangleKernel : public ICLKernel
-{
-public:
- /** Default constructor */
- CLConvolutionRectangleKernel();
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- CLConvolutionRectangleKernel(const CLConvolutionRectangleKernel &) = delete;
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- CLConvolutionRectangleKernel &operator=(const CLConvolutionRectangleKernel &) = delete;
- /** Allow instances of this class to be moved */
- CLConvolutionRectangleKernel(CLConvolutionRectangleKernel &&) = default;
- /** Allow instances of this class to be moved */
- CLConvolutionRectangleKernel &operator=(CLConvolutionRectangleKernel &&) = default;
- /** Initialise the kernel's input, output and border mode.
- *
- * @param[in] input Source tensor. Data types supported: U8.
- * @param[out] output Destination tensor. Data types supported: U8, S16.
- * @param[in] conv Convolution matrix to apply to the input tensor.
- * @param[in] width Width of convolution matrix (Number of columns)
- * @param[in] height Height of convolution matrix (Number of rows)
- * @param[in] scale Scale of the convolution matrix. If 0 is passed, it will be set to the sum of the coefficients of the convolution or 1 if they add up to 0.
- * @param[in] border_undefined True if the border mode is undefined. False if it's replicate or constant.
- */
- void configure(const ICLTensor *input, ICLTensor *output, const int16_t *conv, uint32_t width, uint32_t height, uint32_t scale, bool border_undefined);
- /** Initialise the kernel's input, output and border mode.
- *
- * @param[in] compile_context The compile context to be used.
- * @param[in] input Source tensor. Data types supported: U8.
- * @param[out] output Destination tensor. Data types supported: U8, S16.
- * @param[in] conv Convolution matrix to apply to the input tensor.
- * @param[in] width Width of convolution matrix (Number of columns)
- * @param[in] height Height of convolution matrix (Number of rows)
- * @param[in] scale Scale of the convolution matrix. If 0 is passed, it will be set to the sum of the coefficients of the convolution or 1 if they add up to 0.
- * @param[in] border_undefined True if the border mode is undefined. False if it's replicate or constant.
- */
- void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, const int16_t *conv, uint32_t width, uint32_t height, uint32_t scale, bool border_undefined);
-
- // Inherited methods overridden:
- void run(const Window &window, cl::CommandQueue &queue) override;
- BorderSize border_size() const override;
-
-private:
- BorderSize _border_size;
- const ICLTensor *_input;
- ICLTensor *_output;
-};
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_CLCONVOLUTIONKERNEL_H */
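
To make the square-convolution interface concrete, a minimal configure-and-run sketch using the 3x3 alias, assuming an OpenCL-capable device and the pre-removal include path; the matrix, sizes and border handling are illustrative (a real pipeline would fill the border, e.g. with CLFillBorderKernel, when border_undefined is false):

#include "arm_compute/core/CL/kernels/CLConvolutionKernel.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
#include "arm_compute/runtime/CL/CLTensor.h"

using namespace arm_compute;

int main()
{
    CLScheduler::get().default_init(); // default CL context and queue

    CLTensor src, dst;
    src.allocator()->init(TensorInfo(TensorShape(64U, 64U), Format::U8));
    dst.allocator()->init(TensorInfo(TensorShape(64U, 64U), Format::U8));

    // 3x3 Gaussian-like coefficients; passing scale = 0 makes the kernel
    // divide by the coefficient sum (16 here), per the comment above.
    const int16_t conv[9] = { 1, 2, 1, 2, 4, 2, 1, 2, 1 };

    CLConvolution3x3Kernel conv3x3;
    conv3x3.configure(&src, &dst, conv, 0 /* scale */, true /* border_undefined */);

    src.allocator()->allocate(); // contents left unfilled for brevity
    dst.allocator()->allocate();

    CLScheduler::get().enqueue(conv3x3);
    CLScheduler::get().sync();
    return 0;
}
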
diff --git a/arm_compute/core/CL/kernels/CLCopyKernel.h b/arm_compute/core/CL/kernels/CLCopyKernel.h
deleted file mode 100644
index 05dff8ed0c..0000000000
--- a/arm_compute/core/CL/kernels/CLCopyKernel.h
+++ /dev/null
@@ -1,86 +0,0 @@
-/*
- * Copyright (c) 2018-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CLCOPYKERNEL_H
-#define ARM_COMPUTE_CLCOPYKERNEL_H
-
-#include "arm_compute/core/CL/ICLKernel.h"
-#include "arm_compute/core/Types.h"
-
-namespace arm_compute
-{
-class ICLTensor;
-
-/** OpenCL kernel to perform a copy between two tensors */
-class CLCopyKernel : public ICLKernel
-{
-public:
- /** Default constructor */
- CLCopyKernel();
- /** Prevent instances of this class from being copied (As this class contains pointers). */
- CLCopyKernel(const CLCopyKernel &) = delete;
- /** Prevent instances of this class from being copied (As this class contains pointers). */
- CLCopyKernel &operator=(const CLCopyKernel &) = delete;
- /** Allow instances of this class to be moved */
- CLCopyKernel(CLCopyKernel &&) = default;
- /** Allow instances of this class to be moved */
- CLCopyKernel &operator=(CLCopyKernel &&) = default;
- /** Initialize the kernel's input, output.
- *
- * @param[in] input Source tensor. Data types supported: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32.
- * @param[out] output Destination tensor. Data types supported: same as @p input.
- * @param[in] padding (Optional) Padding to be applied to the input tensor
- * @param[in] output_window (Optional) Window to be used when copying into only part of a tensor. Default is nullptr.
- */
- void configure(const ICLTensor *input, ICLTensor *output, const PaddingList &padding = PaddingList(), Window *output_window = nullptr);
- /** Initialize the kernel's input, output.
- *
- * @param[in] compile_context The compile context to be used.
- * @param[in] input Source tensor. Data types supported: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32.
- * @param[out] output Destination tensor. Data types supported: same as @p input.
- * @param[in] padding (Optional) Padding to be applied to the input tensor
- * @param[in] output_window (Optional) Window to be used when copying into only part of a tensor. Default is nullptr.
- */
- void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, const PaddingList &padding = PaddingList(), Window *output_window = nullptr);
- /** Static function to check if given info will lead to a valid configuration of @ref CLCopyKernel
- *
- * @param[in] input Source tensor info. Data types supported: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32.
- * @param[in] output Destination tensor info. Data types supported: same as @p input.
- * @param[in] padding (Optional) Padding to be applied to the input tensor
- * @param[in] output_window (Optional) Window to be used when copying into only part of a tensor. Default is nullptr.
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *input, const ITensorInfo *output, const PaddingList &padding = PaddingList(), Window *output_window = nullptr);
-
- // Inherited methods overridden:
- void run(const Window &window, cl::CommandQueue &queue) override;
-
-private:
- const ICLTensor *_input;
- ICLTensor *_output;
- Window _output_window;
- bool _has_output_window;
-};
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_CLCOPYKERNEL_H */
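
A minimal validate() sketch showing how the optional PaddingList interacts with the destination shape; values are illustrative and no OpenCL context is needed:

#include "arm_compute/core/CL/kernels/CLCopyKernel.h"
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Types.h"

using namespace arm_compute;

int main()
{
    const TensorInfo src(TensorShape(32U, 32U), 1, DataType::F16);

    // Pad one element on each side of dimensions 0 and 1, so the
    // destination grows from 32x32 to 34x34.
    const PaddingList padding = { { 1, 1 }, { 1, 1 } };
    const TensorInfo dst(TensorShape(34U, 34U), 1, DataType::F16);

    const Status status = CLCopyKernel::validate(&src, &dst, padding);
    return bool(status) ? 0 : 1;
}
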
diff --git a/arm_compute/core/CL/kernels/CLDeconvolutionLayerUpsampleKernel.h b/arm_compute/core/CL/kernels/CLDeconvolutionLayerUpsampleKernel.h
deleted file mode 100644
index 0c65f519cc..0000000000
--- a/arm_compute/core/CL/kernels/CLDeconvolutionLayerUpsampleKernel.h
+++ /dev/null
@@ -1,86 +0,0 @@
-/*
- * Copyright (c) 2017-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CLDECONVOLUTIONLAYERUPSAMPLEKERNEL_H
-#define ARM_COMPUTE_CLDECONVOLUTIONLAYERUPSAMPLEKERNEL_H
-
-#include "arm_compute/core/CL/ICLKernel.h"
-
-namespace arm_compute
-{
-class ICLTensor;
-
-/** Interface for the Deconvolution layer kernel on OpenCL.
- */
-class CLDeconvolutionLayerUpsampleKernel : public ICLKernel
-{
-public:
- /** Constructor */
- CLDeconvolutionLayerUpsampleKernel();
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- CLDeconvolutionLayerUpsampleKernel(const CLDeconvolutionLayerUpsampleKernel &) = delete;
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- CLDeconvolutionLayerUpsampleKernel &operator=(const CLDeconvolutionLayerUpsampleKernel &) = delete;
- /** Default Move Constructor. */
- CLDeconvolutionLayerUpsampleKernel(CLDeconvolutionLayerUpsampleKernel &&) = default;
- /** Default move assignment operator */
- CLDeconvolutionLayerUpsampleKernel &operator=(CLDeconvolutionLayerUpsampleKernel &&) = default;
- /** Default destructor */
- ~CLDeconvolutionLayerUpsampleKernel() = default;
-
- /** Initialise the kernel's input and output.
- *
- * @param[in] input Source tensor. Data types supported: All.
- * @param[out] output Destination tensor. Data types supported: same as @p input. All but the lowest two dimensions must be the same size as in the input tensor, i.e. scaling is only performed within the XY-plane.
- * @param[in] info Contains padding and stride information described in @ref PadStrideInfo.
- */
- void configure(const ICLTensor *input, ICLTensor *output, const PadStrideInfo &info);
- /** Initialise the kernel's input and output.
- *
- * @param[in] compile_context The compile context to be used.
- * @param[in] input Source tensor. Data types supported: All.
- * @param[out] output Destination tensor. Data types supported: same as @p input. All but the lowest two dimensions must be the same size as in the input tensor, i.e. scaling is only performed within the XY-plane.
- * @param[in] info Contains padding and stride information described in @ref PadStrideInfo.
- */
- void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, const PadStrideInfo &info);
- /** Static function to check if given info will lead to a valid configuration of @ref CLDeconvolutionLayerUpsample
- *
- * @param[in] input Source tensor info. Data types supported: All.
- * @param[in] output Destination tensor info. Data types supported: same as @p input. All but the lowest two dimensions must be the same size as in the input tensor, i.e. scaling is only performed within the XY-plane.
- * @param[in] info Contains padding and stride information described in @ref PadStrideInfo.
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *input, const ITensorInfo *output, const PadStrideInfo &info);
-
- // Inherited methods overridden:
- void run(const Window &window, cl::CommandQueue &queue) override;
-
-private:
- const ICLTensor *_input;
- ICLTensor *_output;
- PadStrideInfo _info;
- DataLayout _data_layout;
-};
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_CLDECONVOLUTIONLAYERUPSAMPLEKERNEL_H */
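
A minimal validate() sketch for the upsample interface above; the 15x15 output extent assumes the (in - 1) * stride + 1 scatter used by the surrounding deconvolution layer, which is an assumption, not something stated in this patch:

#include "arm_compute/core/CL/kernels/CLDeconvolutionLayerUpsampleKernel.h"
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Types.h"

using namespace arm_compute;

int main()
{
    // 8x8x3 input upsampled with stride 2 in X and Y: (8 - 1) * 2 + 1 = 15.
    const TensorInfo input(TensorShape(8U, 8U, 3U), 1, DataType::F32);
    const TensorInfo output(TensorShape(15U, 15U, 3U), 1, DataType::F32);
    const PadStrideInfo info(2, 2, 0, 0);

    const Status status = CLDeconvolutionLayerUpsampleKernel::validate(&input, &output, info);
    return bool(status) ? 0 : 1;
}
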
diff --git a/arm_compute/core/CL/kernels/CLDeconvolutionReshapeOutputKernel.h b/arm_compute/core/CL/kernels/CLDeconvolutionReshapeOutputKernel.h
deleted file mode 100644
index 292c561e46..0000000000
--- a/arm_compute/core/CL/kernels/CLDeconvolutionReshapeOutputKernel.h
+++ /dev/null
@@ -1,106 +0,0 @@
-/*
- * Copyright (c) 2019-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CLDECONVOLUTIONLAYERRESHAPEOUTPUTKERNEL_H
-#define ARM_COMPUTE_CLDECONVOLUTIONLAYERRESHAPEOUTPUTKERNEL_H
-
-#include "arm_compute/core/CL/ICLSimpleKernel.h"
-
-namespace arm_compute
-{
-class ICLTensor;
-
-/** Interface for the OpenCL kernel to be used for reshaping the tensor before returning the result of deconvolution.
- *
- * The input tensor to this OpenCL kernel is expected to be the result of a @ref CLGEMM operation between the Deconvolution input and the Deconvolution filter.
- *
- * The input tensor should have the following shape: [filter_width * filter_height * ofms, width, height, batch_size]
- *
- * The output tensor should have the following shape: [stride_x * (input_width - 1) + filter_width - 2 * padx, stride_y * (input_height - 1) + filter_height - 2 * pady, ofms, batch_size]
- *
- * For example, given a tensor with dimensions [4, 2, 2], this function returns a tensor with dimensions [1, 4, 4].
- *
- */
-class CLDeconvolutionReshapeOutputKernel : public ICLSimpleKernel
-{
-public:
- /** Default constructor */
- CLDeconvolutionReshapeOutputKernel();
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- CLDeconvolutionReshapeOutputKernel(const CLDeconvolutionReshapeOutputKernel &) = delete;
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- CLDeconvolutionReshapeOutputKernel &operator=(const CLDeconvolutionReshapeOutputKernel &) = delete;
- /** Allow instances of this class to be moved */
- CLDeconvolutionReshapeOutputKernel(CLDeconvolutionReshapeOutputKernel &&) = default;
- /** Allow instances of this class to be moved */
- CLDeconvolutionReshapeOutputKernel &operator=(CLDeconvolutionReshapeOutputKernel &&) = default;
- /** Default destructor */
- ~CLDeconvolutionReshapeOutputKernel() = default;
-
- /** Initialise the kernel's source and destination.
- *
- * @param[in] input Input tensor. Supported data types: QASYMM8/QASYMM8_SIGNED/S32/F16/F32.
- * @param[in] bias Bias tensor to be added directly during the reshape operation. Supported data types: same as @p input. Supported data layouts: same as @p input.
- * @param[out] output Output tensor with the following shape: [stride_x * (input_width - 1) + filter_width - 2 * padx, stride_y * (input_height - 1) + filter_height - 2 * pady, ofms, batch_size]
- * Supported data types: same as @p input. Supported data layouts: same as @p input.
- * @param[in] input_info Deconvolution input tensor info. Supported data types: same as @p input. Supported data layouts: same as @p input.
- * @param[in] weights_info Deconvolution weights tensor info. Supported data types: same as @p input. Supported data layouts: same as @p input.
- * @param[in] deconv_info Contains padding and policies to be used in the deconvolution; this is described in @ref PadStrideInfo. This kernel supports only stride_x = weights.width && stride_y = weights.height. Moreover, padding is not supported.
- */
- void configure(const ICLTensor *input, const ICLTensor *bias, ICLTensor *output, const ITensorInfo *input_info, const ITensorInfo *weights_info, const PadStrideInfo &deconv_info);
- /** Initialise the kernel's source and destination.
- *
- * @param[in] compile_context The compile context to be used.
- * @param[in] input Input tensor. Supported data types: QASYMM8/QASYMM8_SIGNED/S32/F16/F32.
- * @param[in] bias Bias tensor to be added directly during the reshape operation. Supported data types: same as @p input. Supported data layouts: same as @p input.
- * @param[out] output Output tensor with the following shape: [stride_x * (input_width - 1) + filter_width - 2 * padx, stride_y * (input_height - 1) + filter_height - 2 * pady, ofms, batch_size]
- * Supported data types: same as @p input. Supported data layouts: same as @p input.
- * @param[in] input_info Deconvolution input tensor info. Supported data types: same as @p input. Supported data layouts: same as @p input.
- * @param[in] weights_info Deconvolution weights tensor info. Supported data types: same as @p input. Supported data layouts: same as @p input.
- * @param[in] deconv_info Contains padding and policies to be used in the deconvolution; this is described in @ref PadStrideInfo. This kernel supports only stride_x = weights.width && stride_y = weights.height. Moreover, padding is not supported.
- */
- void configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *bias, ICLTensor *output, const ITensorInfo *input_info, const ITensorInfo *weights_info,
- const PadStrideInfo &deconv_info);
-
- /** Static function to check if given info will lead to a valid configuration of @ref CLDeconvolutionReshapeOutputKernel.
- *
- * @param[in] input GEMM output tensor info to be reshaped. Supported data types: QASYMM8/QASYMM8_SIGNED/S32/F16/F32.
- * @param[in] bias (Optional) Optional bias tensor info to be added directly during the reshape operation. Supported data types: same as @p input. Supported data layouts: same as @p input.
- * @param[in] output Reshaped output tensor info. Supported data types: same as @p input. Supported data layouts: same as @p input.
- * @param[in] input_info Original input tensor info. Supported data types: same as @p input. Supported data layouts: same as @p input.
- * @param[in] weights_info Original weights tensor info output. Supported data types: same as @p input. Supported data layouts: same as @p input.
- * @param[in] deconv_info Contains padding and policies to be used in the deconvolution; this is described in @ref PadStrideInfo. This kernel supports only stride_x = weights.width && stride_y = weights.height. Moreover, padding is not supported.
- *
- * @return a Status
- */
- static Status validate(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output, const ITensorInfo *input_info, const ITensorInfo *weights_info, const PadStrideInfo &deconv_info);
-
- // Inherited methods overridden:
- void run(const Window &window, cl::CommandQueue &queue) override;
-
-private:
- bool _add_bias;
- const ICLTensor *_bias;
-};
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_CLDECONVOLUTIONLAYERRESHAPEOUTPUTKERNEL_H */
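
Given the shape contract spelled out above, a minimal validate() sketch for a 2x2 input deconvolved with a 2x2 filter at stride 2 (the only configuration this kernel accepts: stride equal to filter size, no padding); all values are illustrative:

#include "arm_compute/core/CL/kernels/CLDeconvolutionReshapeOutputKernel.h"
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Types.h"

using namespace arm_compute;

int main()
{
    // Deconvolution of a 2x2x1 input with a 2x2 filter and a single OFM.
    const TensorInfo deconv_input(TensorShape(2U, 2U, 1U), 1, DataType::F32);
    const TensorInfo weights(TensorShape(2U, 2U, 1U, 1U), 1, DataType::F32);

    // GEMM result: [filter_w * filter_h * ofms, width, height, batches] = [4, 2, 2, 1].
    const TensorInfo gemm_out(TensorShape(4U, 2U, 2U, 1U), 1, DataType::F32);
    // Reshaped output: stride * (2 - 1) + 2 - 0 = 4 in both spatial dimensions.
    const TensorInfo output(TensorShape(4U, 4U, 1U, 1U), 1, DataType::F32);

    const PadStrideInfo deconv_info(2, 2, 0, 0); // stride_x/y match the filter size

    const Status status = CLDeconvolutionReshapeOutputKernel::validate(
        &gemm_out, nullptr /* no bias */, &output, &deconv_input, &weights, deconv_info);
    return bool(status) ? 0 : 1;
}
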
diff --git a/arm_compute/core/CL/kernels/CLDepthConcatenateLayerKernel.h b/arm_compute/core/CL/kernels/CLDepthConcatenateLayerKernel.h
deleted file mode 100644
index 5fe826d090..0000000000
--- a/arm_compute/core/CL/kernels/CLDepthConcatenateLayerKernel.h
+++ /dev/null
@@ -1,95 +0,0 @@
-/*
- * Copyright (c) 2017-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#ifndef ARM_COMPUTE_CLDEPTHCONCATENATEKERNEL_H
-#define ARM_COMPUTE_CLDEPTHCONCATENATEKERNEL_H
-
-#include "arm_compute/core/CL/ICLKernel.h"
-#include "arm_compute/core/Types.h"
-
-namespace arm_compute
-{
-class ICLTensor;
-
-/** Interface for the depth concatenate kernel.
- * The input tensor will be concatenated into the output tensor.
- */
-class CLDepthConcatenateLayerKernel : public ICLKernel
-{
-public:
- /** Default constructor */
- CLDepthConcatenateLayerKernel();
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- CLDepthConcatenateLayerKernel(const CLDepthConcatenateLayerKernel &) = delete;
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- CLDepthConcatenateLayerKernel &operator=(const CLDepthConcatenateLayerKernel &) = delete;
- /** Allow instances of this class to be moved */
- CLDepthConcatenateLayerKernel(CLDepthConcatenateLayerKernel &&) = default;
- /** Allow instances of this class to be moved */
- CLDepthConcatenateLayerKernel &operator=(CLDepthConcatenateLayerKernel &&) = default;
- /** Default destructor */
- ~CLDepthConcatenateLayerKernel() = default;
- /** Initialise the kernel's inputs and output
- *
- * @param[in] input Input tensor. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32.
- * @param[in] depth_offset The offset on the Z axis.
- * @param[in,out] output Output tensor. Data types supported: Same as @p input.
- *
- * @note The output tensor's two lowest dimensions cannot be smaller than the input tensor's.
- * @note The gaps between the two lowest dimensions of input and output must be divisible by 2.
- *
- */
- void configure(const ICLTensor *input, unsigned int depth_offset, ICLTensor *output);
- /** Initialise the kernel's inputs and output
- *
- * @param[in] compile_context The compile context to be used.
- * @param[in] input Input tensor. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32.
- * @param[in] depth_offset The offset on the Z axis.
- * @param[in,out] output Output tensor. Data types supported: Same as @p input.
- *
- * @note The output tensor's two lowest dimensions cannot be smaller than the input tensor's.
- * @note The gaps between the two lowest dimensions of input and output must be divisible by 2.
- *
- */
- void configure(const CLCompileContext &compile_context, const ICLTensor *input, unsigned int depth_offset, ICLTensor *output);
- /** Static function to check if given info will lead to a valid configuration of @ref CLDepthConcatenateLayerKernel
- *
- * @param[in] input Input tensor info. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32
- * @param[in] depth_offset The offset on the Z axis.
- * @param[in] output Output tensor info. Data types supported: Same as @p input.
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *input, unsigned int depth_offset, const ITensorInfo *output);
-
- // Inherited methods overridden:
- void run(const Window &window, cl::CommandQueue &queue) override;
-
-private:
- const ICLTensor *_input;
- ICLTensor *_output;
- unsigned int _depth_offset;
-};
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_CLDEPTHCONCATENATEKERNEL_H */
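
The depth_offset parameter is easiest to see with two inputs writing into one output: a minimal sketch assuming an OpenCL-capable device and the pre-removal include path, with illustrative shapes:

#include "arm_compute/core/CL/kernels/CLDepthConcatenateLayerKernel.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
#include "arm_compute/runtime/CL/CLTensor.h"

using namespace arm_compute;

int main()
{
    CLScheduler::get().default_init();

    // Two 8x8x3 inputs concatenated along Z into one 8x8x6 output:
    // one kernel instance per input, each writing at its own depth offset.
    CLTensor in0, in1, out;
    in0.allocator()->init(TensorInfo(TensorShape(8U, 8U, 3U), 1, DataType::F32));
    in1.allocator()->init(TensorInfo(TensorShape(8U, 8U, 3U), 1, DataType::F32));
    out.allocator()->init(TensorInfo(TensorShape(8U, 8U, 6U), 1, DataType::F32));

    CLDepthConcatenateLayerKernel concat0, concat1;
    concat0.configure(&in0, 0U, &out);
    concat1.configure(&in1, 3U, &out);

    in0.allocator()->allocate(); // contents left unfilled for brevity
    in1.allocator()->allocate();
    out.allocator()->allocate();

    CLScheduler::get().enqueue(concat0);
    CLScheduler::get().enqueue(concat1);
    CLScheduler::get().sync();
    return 0;
}
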
diff --git a/arm_compute/core/CL/kernels/CLDepthConvertLayerKernel.h b/arm_compute/core/CL/kernels/CLDepthConvertLayerKernel.h
deleted file mode 100644
index 66eb6222b2..0000000000
--- a/arm_compute/core/CL/kernels/CLDepthConvertLayerKernel.h
+++ /dev/null
@@ -1,91 +0,0 @@
-/*
- * Copyright (c) 2016-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CLDEPTHCONVERTKERNEL_H
-#define ARM_COMPUTE_CLDEPTHCONVERTKERNEL_H
-
-#include "arm_compute/core/CL/ICLSimple3DKernel.h"
-#include "arm_compute/core/Types.h"
-
-#include <cstdint>
-
-namespace arm_compute
-{
-class ICLTensor;
-
-/** Interface for the depth conversion kernel. */
-class CLDepthConvertLayerKernel : public ICLSimple3DKernel
-{
-public:
- /** Set the input and output of the kernel.
- *
- * Valid conversions Input -> Output :
- *
- * - QSYMM8_PER_CHANNEL -> QASYMM8 (ATTENTION: it is the user's responsibility to keep track of the quantization info in the TensorInfo meta-data)
- * - U8 -> S8, U16, S16, U32, S32, F16, F32
- * - U16 -> U8, S8, S16, U32, S32, F16, F32
- * - S16 -> U8, S8, U16, U32, S32, F16, F32
- * - U32 -> U8, S8, U16, S16, S32, F16, F32
- * - S32 -> U8, S8, U16, S16, U32, F16, F32
- * - F16 -> U8, S8, U16, S16, U32, F32
- * - F32 -> U8, S8, U16, S16, U32, F16
- *
- * @param[in] input The input tensor to convert. Data types supported: U8/S8/QSYMM8_PER_CHANNEL/U16/S16/U32/S32/F16/F32.
- * @param[out] output The output tensor. Data types supported: U8/S8/QASYMM8/U16/S16/U32/S32/F16/F32.
- * @param[in] policy Conversion policy
- * @param[in] shift Value for down/up conversions. Must be 0 <= shift < 8.
- */
- void configure(const ICLTensor *input, ICLTensor *output, ConvertPolicy policy, uint32_t shift);
- /** Set the input and output of the kernel.
- *
- * Valid conversions Input -> Output :
- *
- * - QSYMM8_PER_CHANNEL -> QASYMM8 (ATTENTION: it is the user's responsibility to keep track of the quantization info in the TensorInfo meta-data)
- * - U8 -> S8, U16, S16, U32, S32, F16, F32
- * - U16 -> U8, S8, S16, U32, S32, F16, F32
- * - S16 -> U8, S8, U16, U32, S32, F16, F32
- * - U32 -> U8, S8, U16, S16, S32, F16, F32
- * - S32 -> U8, S8, U16, S16, U32, F16, F32
- * - F16 -> U8, S8, U16, S16, U32, F32
- * - F32 -> U8, S8, U16, S16, U32, F16
- *
- * @param[in] compile_context The compile context to be used.
- * @param[in] input The input tensor to convert. Data types supported: U8/S8/QSYMM8_PER_CHANNEL/U16/S16/U32/S32/F16/F32.
- * @param[out] output The output tensor. Data types supported: U8/S8/QASYMM8/U16/S16/U32/S32/F16/F32.
- * @param[in] policy Conversion policy
- * @param[in] shift Value for down/up conversions. Must be 0 <= shift < 8.
- */
- void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, ConvertPolicy policy, uint32_t shift);
- /** Static function to check if given info will lead to a valid configuration of @ref CLDepthConvertLayerKernel
- *
- * @param[in] input Source tensor info. Data types supported: U8/S8/QSYMM8_PER_CHANNEL/U16/S16/U32/S32/F16/F32.
- * @param[in] output Destination tensor info. Data type supported: U8/S8/QASYMM8/U16/S16/U32/S32/F16/F32.
- * @param[in] policy Conversion policy
- * @param[in] shift Value for down/up conversions. Must be 0 <= shift < 8.
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *input, const ITensorInfo *output, ConvertPolicy policy, uint32_t shift);
-};
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_CLDEPTHCONVERTKERNEL_H */
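
A minimal configure-and-run sketch for a U8 -> S16 up-conversion from the table above, assuming an OpenCL-capable device and the pre-removal include path:

#include "arm_compute/core/CL/kernels/CLDepthConvertLayerKernel.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
#include "arm_compute/runtime/CL/CLTensor.h"

using namespace arm_compute;

int main()
{
    CLScheduler::get().default_init();

    CLTensor src, dst;
    src.allocator()->init(TensorInfo(TensorShape(32U, 32U), 1, DataType::U8));
    dst.allocator()->init(TensorInfo(TensorShape(32U, 32U), 1, DataType::S16));

    CLDepthConvertLayerKernel convert;
    // U8 -> S16 is an up-conversion; values are shifted left by `shift` bits.
    convert.configure(&src, &dst, ConvertPolicy::SATURATE, 0 /* shift */);

    src.allocator()->allocate(); // contents left unfilled for brevity
    dst.allocator()->allocate();

    CLScheduler::get().enqueue(convert);
    CLScheduler::get().sync();
    return 0;
}
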
diff --git a/arm_compute/core/CL/kernels/CLDepthToSpaceLayerKernel.h b/arm_compute/core/CL/kernels/CLDepthToSpaceLayerKernel.h
deleted file mode 100644
index 87ac3c1ec1..0000000000
--- a/arm_compute/core/CL/kernels/CLDepthToSpaceLayerKernel.h
+++ /dev/null
@@ -1,84 +0,0 @@
-/*
- * Copyright (c) 2019-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CLDEPTHTOSPACELAYERKERNEL_H
-#define ARM_COMPUTE_CLDEPTHTOSPACELAYERKERNEL_H
-
-#include "arm_compute/core/CL/ICLKernel.h"
-#include "arm_compute/core/Types.h"
-
-namespace arm_compute
-{
-class ICLTensor;
-
-/** Interface for the depth to space kernel */
-class CLDepthToSpaceLayerKernel : public ICLKernel
-{
-public:
- /** Default constructor */
- CLDepthToSpaceLayerKernel();
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- CLDepthToSpaceLayerKernel(const CLDepthToSpaceLayerKernel &) = delete;
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- CLDepthToSpaceLayerKernel &operator=(const CLDepthToSpaceLayerKernel &) = delete;
- /** Allow instances of this class to be moved */
- CLDepthToSpaceLayerKernel(CLDepthToSpaceLayerKernel &&) = default;
- /** Allow instances of this class to be moved */
- CLDepthToSpaceLayerKernel &operator=(CLDepthToSpaceLayerKernel &&) = default;
- /** Default destructor */
- ~CLDepthToSpaceLayerKernel() = default;
- /** Initialise the kernel's inputs and output.
- *
- * @param[in] input Tensor input. Supported tensor rank: 4. Data types supported: All.
- * @param[out] output Tensor output. Data types supported: same as @p input
- * @param[in] block_shape Block shape value.
- */
- void configure(const ICLTensor *input, ICLTensor *output, int32_t block_shape);
- /** Initialise the kernel's inputs and output.
- *
- * @param[in] compile_context The compile context to be used.
- * @param[in] input Tensor input. Supported tensor rank: 4. Data types supported: All.
- * @param[out] output Tensor output. Data types supported: same as @p input
- * @param[in] block_shape Block shape value.
- */
- void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, int32_t block_shape);
- /** Static function to check if given info will lead to a valid configuration of @ref CLDepthToSpaceLayerKernel.
- *
- * @param[in] input Tensor input info. Supported tensor rank: 4. Data types supported: All.
- * @param[in] output Tensor output info. Data types supported: same as @p input
- * @param[in] block_shape Block shape value.
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *input, const ITensorInfo *output, int32_t block_shape);
-
- // Inherited methods overridden:
- void run(const Window &window, cl::CommandQueue &queue) override;
-
-private:
- const ICLTensor *_input; /**< Source tensor */
- ICLTensor *_output; /**< Destination tensor */
- int32_t _block_shape; /**< Block shape */
-};
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_CLDEPTHTOSPACELAYERKERNEL_H */
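
A minimal validate() sketch for the depth-to-space contract above; the shapes follow the standard rearrangement (channels divided by block_shape squared, spatial dimensions multiplied by block_shape) and are illustrative:

#include "arm_compute/core/CL/kernels/CLDepthToSpaceLayerKernel.h"
#include "arm_compute/core/TensorInfo.h"

using namespace arm_compute;

int main()
{
    // block_shape = 2 moves 2x2 groups of channels into space:
    // [4, 4, 8, 1] (W, H, C, N) becomes [8, 8, 2, 1].
    const TensorInfo input(TensorShape(4U, 4U, 8U, 1U), 1, DataType::F32);
    const TensorInfo output(TensorShape(8U, 8U, 2U, 1U), 1, DataType::F32);

    const Status status = CLDepthToSpaceLayerKernel::validate(&input, &output, 2);
    return bool(status) ? 0 : 1;
}
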
diff --git a/arm_compute/core/CL/kernels/CLDepthwiseConvolutionLayer3x3NCHWKernel.h b/arm_compute/core/CL/kernels/CLDepthwiseConvolutionLayer3x3NCHWKernel.h
deleted file mode 100644
index 6cf0326467..0000000000
--- a/arm_compute/core/CL/kernels/CLDepthwiseConvolutionLayer3x3NCHWKernel.h
+++ /dev/null
@@ -1,114 +0,0 @@
-/*
- * Copyright (c) 2018-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CLDEPTHWISECONVOLUTIONNCHWKERNEL3x3_H
-#define ARM_COMPUTE_CLDEPTHWISECONVOLUTIONNCHWKERNEL3x3_H
-
-#include "arm_compute/core/CL/kernels/ICLDepthwiseConvolutionLayer3x3Kernel.h"
-
-namespace arm_compute
-{
-class ICLTensor;
-
-/** Interface for the kernel to run a 3x3 depthwise convolution on a tensor when the data layout is NCHW.
- */
-class CLDepthwiseConvolutionLayer3x3NCHWKernel : public ICLDepthwiseConvolutionLayer3x3Kernel
-{
-public:
- /** Default constructor */
- CLDepthwiseConvolutionLayer3x3NCHWKernel();
- /** Initialize the function's source, destination, conv and border_size.
- *
- * @param[in] input Source tensor. DataType supported: QASYMM8/QASYMM8_SIGNED/F16/F32.
- * @param[in] weights Weights tensor. A 3D tensor with dimensions [3, 3, IFM].
- * Data type supported: Same as @p input or QASYMM8/QASYMM8_SIGNED/QSYMM8_PER_CHANNEL when @p input is QASYMM8/QASYMM8_SIGNED.
- * @param[in] biases Biases tensor. A 1D tensor with dimensions [IFM]. Must be nullptr if not needed.
- * Data type supported: Same as @p input, S32 when input is QASYMM8/QASYMM8_SIGNED.
- * @param[out] output Destination tensor. Data type supported: Same as @p input.
- * @param[in] conv_info Padding and stride information to use for the convolution.
- * @param[in] depth_multiplier (Optional) Multiplier to apply to the input's depth in order to retrieve the output's depth. Defaults to 1.
- * @param[in] act_info (Optional) Activation layer information in case of a fused activation. Only RELU, BOUNDED_RELU and LU_BOUNDED_RELU are supported for QASYMM8.
- * @param[in] dilation (Optional) Dilation, in elements, across x and y. Defaults to (1, 1).
- * @param[in] output_multipliers (Optional) Output multipliers tensor for quantized computations. In case of per-channel quantization,
- * the number of multipliers must be equal to the number of filters (IFM). Supported data types: S32
- * @param[in] output_shifts (Optional) Output shifts tensor for quantized computations. In case of per-channel quantization,
- * the number of multipliers must be equal to the number of filters (IFM). Supported data types: S32
- */
- void configure(const ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const PadStrideInfo &conv_info,
- unsigned int depth_multiplier = 1, ActivationLayerInfo act_info = ActivationLayerInfo(), const Size2D &dilation = Size2D(1U, 1U),
- const ICLTensor *output_multipliers = nullptr, const ICLTensor *output_shifts = nullptr) override;
- /** Initialize the function's source, destination, conv and border_size.
- *
- * @param[in] compile_context The compile context to be used.
- * @param[in] input Source tensor. DataType supported: QASYMM8/QASYMM8_SIGNED/F16/F32.
- * @param[in] weights Weights tensor. A 3D tensor with dimensions [3, 3, IFM].
- * Data type supported: Same as @p input or QASYMM8/QASYMM8_SIGNED/QSYMM8_PER_CHANNEL when @p input is QASYMM8/QASYMM8_SIGNED.
- * @param[in] biases Biases tensor. A 1D tensor with dimensions [IFM]. Must be nullptr if not needed.
- * Data type supported: Same as @p input, S32 when input is QASYMM8/QASYMM8_SIGNED.
- * @param[out] output Destination tensor. Data type supported: Same as @p input.
- * @param[in] conv_info Padding and stride information to use for the convolution.
- * @param[in] depth_multiplier (Optional) Multiplier to apply to the input's depth in order to retrieve the output's depth. Defaults to 1.
- * @param[in] act_info (Optional) Activation layer information in case of a fused activation. Only RELU, BOUNDED_RELU and LU_BOUNDED_RELU are supported for QASYMM8.
- * @param[in] dilation (Optional) Dilation, in elements, across x and y. Defaults to (1, 1).
- * @param[in] output_multipliers (Optional) Output multipliers tensor for quantized computations. In case of per-channel quantization,
- * the number of multipliers must be equal to the number of filters (IFM). Supported data types: S32
- * @param[in] output_shifts (Optional) Output shifts tensor for quantized computations. In case of per-channel quantization,
- * the number of multipliers must be equal to the number of filters (IFM). Supported data types: S32
- */
- void configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const PadStrideInfo &conv_info,
- unsigned int depth_multiplier = 1, ActivationLayerInfo act_info = ActivationLayerInfo(), const Size2D &dilation = Size2D(1U, 1U),
- const ICLTensor *output_multipliers = nullptr, const ICLTensor *output_shifts = nullptr) override;
- /** Static function to check if given info will lead to a valid configuration of @ref CLDepthwiseConvolutionLayer3x3NCHWKernel
- *
- * @param[in] input Source tensor info. DataType supported: QASYMM8/QASYMM8_SIGNED/F16/F32.
- * @param[in] weights Weights tensor info. A 3D tensor with dimensions [3, 3, IFM].
- * Data type supported: Same as @p input or QASYMM8/QASYMM8_SIGNED/QSYMM8_PER_CHANNEL when @p input is QASYMM8/QASYMM8_SIGNED.
- * @param[in] biases Biases tensor info. A 1D tensor with dimensions [IFM]. Must be nullptr if not needed.
- * Data type supported: Same as @p input, S32 when input is QASYMM8/QASYMM8_SIGNED.
- * @param[in] output Destination tensor. Data type supported: Same as @p input.
- * @param[in] conv_info Padding and stride information to use for the convolution.
- * @param[in] depth_multiplier (Optional) Multiplier to apply to the input's depth in order to retrieve the output's depth. Defaults to 1.
- * @param[in] act_info (Optional) Activation layer information in case of a fused activation. Only RELU, BOUNDED_RELU and LU_BOUNDED_RELU are supported.
- * @param[in] gpu_target (Optional) GPU target to validate the kernel for. Defaults to midgard.
- * @param[in] dilation (Optional) Dilation, in elements, across x and y. Defaults to (1, 1).
- * @param[in] output_multipliers (Optional) Output multipliers tensor info for quantized computations. In case of per-channel quantization,
- * the number of multipliers must be equal to the number of filters (IFM). Supported data types: S32
- * @param[in] output_shifts (Optional) Output shifts tensor for quantized computations. In case of per-channel quantization,
- * the number of multipliers must be equal to the number of filters (IFM). Supported data types: S32
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info,
- unsigned int depth_multiplier = 1, ActivationLayerInfo act_info = ActivationLayerInfo(), GPUTarget gpu_target = GPUTarget::MIDGARD,
- const Size2D &dilation = Size2D(1U, 1U), const ITensorInfo *output_multipliers = nullptr, const ITensorInfo *output_shifts = nullptr);
-
- void run(const Window &window, cl::CommandQueue &queue) override;
- BorderSize border_size() const override;
-
-private:
- unsigned int _conv_stride_x;
- unsigned int _conv_pad_top;
- unsigned int _conv_pad_left;
-};
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_CLDEPTHWISECONVOLUTIONNCHWKERNEL3x3_H */
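(A hedged configuration sketch for the interface above, assuming F32 data and the default depth multiplier; the CLDepthwiseConvolutionLayer function wrapper normally also schedules border filling, which is omitted here.)

    #include "arm_compute/core/CL/kernels/CLDepthwiseConvolutionLayer3x3NCHWKernel.h"
    #include "arm_compute/core/Error.h"
    #include "arm_compute/runtime/CL/CLScheduler.h"
    #include "arm_compute/runtime/CL/CLTensor.h"

    using namespace arm_compute;

    // NCHW: src is (W, H, IFM), weights (3, 3, IFM), biases (IFM), dst (W, H, IFM)
    void configure_dwc_nchw(CLDepthwiseConvolutionLayer3x3NCHWKernel &dwc,
                            CLTensor &src, CLTensor &weights, CLTensor &biases, CLTensor &dst)
    {
        const PadStrideInfo conv_info(1, 1, 1, 1); // stride 1, pad 1: same spatial size for 3x3

        ARM_COMPUTE_ERROR_THROW_ON(CLDepthwiseConvolutionLayer3x3NCHWKernel::validate(
            src.info(), weights.info(), biases.info(), dst.info(), conv_info,
            1 /* depth_multiplier */, ActivationLayerInfo(), CLScheduler::get().target()));

        dwc.configure(&src, &weights, &biases, &dst, conv_info); // defaults: no fused activation, dilation (1, 1)
    }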
diff --git a/arm_compute/core/CL/kernels/CLDepthwiseConvolutionLayer3x3NHWCKernel.h b/arm_compute/core/CL/kernels/CLDepthwiseConvolutionLayer3x3NHWCKernel.h
deleted file mode 100644
index e564cf6fe0..0000000000
--- a/arm_compute/core/CL/kernels/CLDepthwiseConvolutionLayer3x3NHWCKernel.h
+++ /dev/null
@@ -1,113 +0,0 @@
-/*
- * Copyright (c) 2018-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CLDEPTHWISECONVOLUTIONNHWCKERNEL3x3_H
-#define ARM_COMPUTE_CLDEPTHWISECONVOLUTIONNHWCKERNEL3x3_H
-
-#include "arm_compute/core/CL/kernels/ICLDepthwiseConvolutionLayer3x3Kernel.h"
-
-namespace arm_compute
-{
-class ICLTensor;
-
-/** Interface for the kernel to run a 3x3 depthwise convolution on a tensor when the data layout is NHWC.
- */
-class CLDepthwiseConvolutionLayer3x3NHWCKernel : public ICLDepthwiseConvolutionLayer3x3Kernel
-{
-public:
- /** Default constructor */
- CLDepthwiseConvolutionLayer3x3NHWCKernel();
- /** Initialize the function's source, destination, conv and border_size.
- *
- * @param[in] input Source tensor. DataType supported: QASYMM8/QASYMM8_SIGNED.
- * @param[in] weights Weights tensor. A 3D tensor with dimensions [IFM, 3, 3].
- * Data type supported: Same as @p input or QASYMM8/QASYMM8_SIGNED/QSYMM8_PER_CHANNEL when @p input is QASYMM8/QASYMM8_SIGNED.
- * @param[in] biases Biases tensor. A 1D tensor with dimensions [IFM]. Must be nullptr if not needed.
- * Data type supported: Same as @p input, S32 when input is QASYMM8/QASYMM8_SIGNED.
- * @param[out] output Destination tensor. Data type supported: Same as @p input.
- * @param[in] conv_info Padding and stride information to use for the convolution.
- * @param[in] depth_multiplier (Optional) Multiplier to apply to the input's depth in order to retrieve the output's depth. Defaults to 1.
- * @param[in] act_info (Optional) Activation layer information in case of a fused activation. Only RELU, BOUNDED_RELU and LU_BOUNDED_RELU are supported.
- * @param[in] dilation (Optional) Dilation, in elements, across x and y. Defaults to (1, 1).
- * @param[in] output_multipliers (Optional) Output multipliers tensor for quantized computations. In case of per-channel quantization,
- * the number of multipliers must be equal to the number of filters (IFM). Supported data types: S32
- * @param[in] output_shifts (Optional) Output shifts tensor for quantized computations. In case of per-channel quantization,
- * the number of multipliers must be equal to the number of filters (IFM). Supported data types: S32
- */
- void configure(const ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const PadStrideInfo &conv_info,
- unsigned int depth_multiplier = 1, ActivationLayerInfo act_info = ActivationLayerInfo(), const Size2D &dilation = Size2D(1U, 1U),
- const ICLTensor *output_multipliers = nullptr, const ICLTensor *output_shifts = nullptr) override;
- /** Initialize the function's source, destination, conv and border_size.
- *
- * @param[in] compile_context The compile context to be used.
- * @param[in] input Source tensor. DataType supported: QASYMM8/QASYMM8_SIGNED.
- * @param[in] weights Weights tensor. A 3D tensor with dimensions [IFM, 3, 3].
- * Data type supported: Same as @p input or QASYMM8/QASYMM8_SIGNED/QSYMM8_PER_CHANNEL when @p input is QASYMM8/QASYMM8_SIGNED.
- * @param[in] biases Biases tensor. A 1D tensor with dimensions [IFM]. Must be nullptr if not needed.
- * Data type supported: Same as @p input, S32 when input is QASYMM8/QASYMM8_SIGNED.
- * @param[out] output Destination tensor. Data type supported: Same as @p input.
- * @param[in] conv_info Padding and stride information to use for the convolution.
- * @param[in] depth_multiplier (Optional) Multiplier to apply to the input's depth in order to retrieve the output's depth. Defaults to 1.
- * @param[in] act_info (Optional) Activation layer information in case of a fused activation. Only RELU, BOUNDED_RELU and LU_BOUNDED_RELU are supported.
- * @param[in] dilation (Optional) Dilation, in elements, across x and y. Defaults to (1, 1).
- * @param[in] output_multipliers (Optional) Output multipliers tensor for quantized computations. In case of per-channel quantization,
- * the number of multipliers must be equal to the number of filters (IFM). Supported data types: S32
- * @param[in] output_shifts (Optional) Output shifts tensor for quantized computations. In case of per-channel quantization,
- * the number of multipliers must be equal to the number of filters (IFM). Supported data types: S32
- */
- void configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const PadStrideInfo &conv_info,
- unsigned int depth_multiplier = 1, ActivationLayerInfo act_info = ActivationLayerInfo(), const Size2D &dilation = Size2D(1U, 1U),
- const ICLTensor *output_multipliers = nullptr, const ICLTensor *output_shifts = nullptr) override;
- /** Static function to check if given info will lead to a valid configuration of @ref CLDepthwiseConvolutionLayer3x3NHWCKernel
- *
- * @param[in] input Source tensor info. DataType supported: QASYMM8/QASYMM8_SIGNED.
- * @param[in] weights Weights tensor info. A 3D tensor with dimensions [IFM, 3, 3].
- * Data type supported: Same as @p input or QASYMM8/QASYMM8_SIGNED/QSYMM8_PER_CHANNEL when @p input is QASYMM8/QASYMM8_SIGNED.
- * @param[in] biases Biases tensor info. A 1D tensor with dimensions [IFM]. Must be nullptr if not needed.
- * Data type supported: Same as @p input, S32 when input is QASYMM8/QASYMM8_SIGNED.
- * @param[in] output Destination tensor info. Data type supported: Same as @p input.
- * @param[in] conv_info Padding and stride information to use for the convolution.
- * @param[in] depth_multiplier (Optional) Multiplier to apply to the input's depth in order to retrieve the output's depth. Defaults to 1.
- * @param[in] act_info (Optional) Activation layer information in case of a fused activation. Only RELU, BOUNDED_RELU and LU_BOUNDED_RELU are supported.
- * @param[in] dilation (Optional) Dilation, in elements, across x and y. Defaults to (1, 1).
- * @param[in] output_multipliers (Optional) Output multipliers tensor info for quantized computations. In case of per-channel quantization,
- * the number of multipliers must be equal to the number of filters (IFM). Supported data types: S32
- * @param[in] output_shifts (Optional) Output shifts tensor for quantized computations. In case of per-channel quantization,
- * the number of multipliers must be equal to the number of filters (IFM). Supported data types: S32
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info,
- unsigned int depth_multiplier = 1, ActivationLayerInfo act_info = ActivationLayerInfo(), const Size2D &dilation = Size2D(1U, 1U),
- const ITensorInfo *output_multipliers = nullptr, const ITensorInfo *output_shifts = nullptr);
-
- // Inherited methods overridden:
- void run(const Window &window, cl::CommandQueue &queue) override;
- BorderSize border_size() const override;
-
-private:
- unsigned int _num_rows_processed_per_iteration;
- unsigned int _num_planes_processed_per_iteration;
-};
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_CLDEPTHWISECONVOLUTIONNHWCKERNEL3x3_H */
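(The NHWC variant transposes the weights layout to [IFM, 3, 3]; a validate-only sketch of the quantized path follows, with arbitrary illustrative quantization scales/offsets. Per-tensor quantization is assumed, so output_multipliers/output_shifts stay at their nullptr defaults.)

    #include "arm_compute/core/CL/kernels/CLDepthwiseConvolutionLayer3x3NHWCKernel.h"
    #include "arm_compute/core/TensorInfo.h"

    using namespace arm_compute;

    Status check_dwc_nhwc()
    {
        // NHWC tensor shapes are ordered (C, W, H); weights are (IFM, 3, 3) here,
        // unlike the NCHW kernel's (3, 3, IFM).
        TensorInfo src(TensorShape(16U, 8U, 8U), 1, DataType::QASYMM8);
        TensorInfo weights(TensorShape(16U, 3U, 3U), 1, DataType::QASYMM8);
        TensorInfo biases(TensorShape(16U), 1, DataType::S32);
        TensorInfo dst(TensorShape(16U, 8U, 8U), 1, DataType::QASYMM8);

        src.set_data_layout(DataLayout::NHWC).set_quantization_info(QuantizationInfo(0.0125f, 10));
        weights.set_data_layout(DataLayout::NHWC).set_quantization_info(QuantizationInfo(0.25f, 128));
        dst.set_data_layout(DataLayout::NHWC).set_quantization_info(QuantizationInfo(0.125f, 10));

        return CLDepthwiseConvolutionLayer3x3NHWCKernel::validate(
            &src, &weights, &biases, &dst, PadStrideInfo(1, 1, 1, 1));
    }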
diff --git a/arm_compute/core/CL/kernels/CLDepthwiseConvolutionLayerNativeKernel.h b/arm_compute/core/CL/kernels/CLDepthwiseConvolutionLayerNativeKernel.h
deleted file mode 100644
index 8847cf9c46..0000000000
--- a/arm_compute/core/CL/kernels/CLDepthwiseConvolutionLayerNativeKernel.h
+++ /dev/null
@@ -1,131 +0,0 @@
-/*
- * Copyright (c) 2019-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CLDEPTHWISECONVOLUTIONLAYERNATIVEKERNEL_H
-#define ARM_COMPUTE_CLDEPTHWISECONVOLUTIONLAYERNATIVEKERNEL_H
-
-#include "arm_compute/core/CL/ICLKernel.h"
-
-#include "arm_compute/core/KernelDescriptors.h"
-
-namespace arm_compute
-{
-class ICLTensor;
-
-/** Interface for the kernel to run an MxN depthwise convolution. M and N are respectively the rows and columns of the filter.
- This kernel assumes that the weights tensor is NOT reshaped (Native version) */
-class CLDepthwiseConvolutionLayerNativeKernel : public ICLKernel
-{
-public:
- /** Default Constructor */
- CLDepthwiseConvolutionLayerNativeKernel();
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- CLDepthwiseConvolutionLayerNativeKernel(const CLDepthwiseConvolutionLayerNativeKernel &) = delete;
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- CLDepthwiseConvolutionLayerNativeKernel &operator=(const CLDepthwiseConvolutionLayerNativeKernel &) = delete;
- /** Allow instances of this class to be moved */
- CLDepthwiseConvolutionLayerNativeKernel(CLDepthwiseConvolutionLayerNativeKernel &&) = default;
- /** Allow instances of this class to be moved */
- CLDepthwiseConvolutionLayerNativeKernel &operator=(CLDepthwiseConvolutionLayerNativeKernel &&) = default;
- /** Initialize the function's source, destination and parameters
- *
- * @param[in] input Source tensor. Data type supported: QASYMM8/QASYMM8_SIGNED/F16/F32. Data layout supported: NHWC
- * @param[in] weights Weights tensor. A 3D tensor with dimensions [IFM, N, M].
- * Data type supported: Same as @p input or QASYMM8/QASYMM8_SIGNED/QSYMM8_PER_CHANNEL when @p input is QASYMM8/QASYMM8_SIGNED.
- * @param[in] biases Biases tensor. A 1D tensor with dimensions [IFM]. Must be nullptr if not needed.
- * Data type supported: Same as @p input, S32 when input is QASYMM8/QASYMM8_SIGNED.
- * @param[out] output Destination tensor. Data type supported: Same as @p input.
- * @param[in] dwc_weights_info Depthwise convolution layer weights info to retrieve the number of output elements processed by each thread
- * @param[in] dwc_info Depthwise convolution layer info
- * @param[in] conv_info Padding and stride information to use for the convolution.
- * @param[in] depth_multiplier (Optional) Multiplier to apply to the input's depth in order to retrieve the output's depth. Defaults to 1.
- * @param[in] dilation (Optional) Dilation, in elements, across x and y. Defaults to (1, 1).
- * @param[in] output_multipliers (Optional) Output multipliers tensor for quantized computations. In case of per-channel quantization,
- * the number of multipliers must be equal to the number of filters (IFM). Supported data types: S32
- * @param[in] output_shifts (Optional) Output shifts tensor for quantized computations. In case of per-channel quantization,
- * the number of multipliers must be equal to the number of filters (IFM). Supported data types: S32
- */
- void configure(const ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const DWCWeightsKernelInfo &dwc_weights_info,
- const DWCKernelInfo &dwc_info, const PadStrideInfo &conv_info, unsigned int depth_multiplier = 1, const Size2D &dilation = Size2D(1U, 1U),
- const ICLTensor *output_multipliers = nullptr, const ICLTensor *output_shifts = nullptr);
- /** Initialize the function's source, destination and parameters
- *
- * @param[in] compile_context The compile context to be used.
- * @param[in] input Source tensor. Data type supported: QASYMM8/QASYMM8_SIGNED/F16/F32. Data layout supported: NHWC
- * @param[in] weights Weights tensor. A 3D tensor with dimensions [IFM, N, M].
- * Data type supported: Same as @p input or QASYMM8/QASYMM8_SIGNED/QSYMM8_PER_CHANNEL when @p input is QASYMM8/QASYMM8_SIGNED.
- * @param[in] biases Biases tensor. A 1D tensor with dimensions [IFM]. Must be nullptr if not needed.
- * Data type supported: Same as @p input, S32 when input is QASYMM8/QASYMM8_SIGNED.
- * @param[out] output Destination tensor. Data type supported: Same as @p input.
- * @param[in] dwc_weights_info Depthwise convolution layer weights info to retrieve the number of output elements processed by each thread
- * @param[in] dwc_info Depthwise convolution layer info
- * @param[in] conv_info Padding and stride information to use for the convolution.
- * @param[in] depth_multiplier (Optional) Multiplier to apply to the input's depth in order to retrieve the output's depth. Defaults to 1.
- * @param[in] dilation (Optional) Dilation, in elements, across x and y. Defaults to (1, 1).
- * @param[in] output_multipliers (Optional) Output multipliers tensor for quantized computations. In case of per-channel quantization,
- * the number of multipliers must be equal to the number of filters (IFM). Supported data types: S32
- * @param[in] output_shifts (Optional) Output shifts tensor for quantized computations. In case of per-channel quantization,
- * the number of multipliers must be equal to the number of filters (IFM). Supported data types: S32
- */
- void configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const DWCWeightsKernelInfo &dwc_weights_info,
- const DWCKernelInfo &dwc_info, const PadStrideInfo &conv_info, unsigned int depth_multiplier = 1, const Size2D &dilation = Size2D(1U, 1U),
- const ICLTensor *output_multipliers = nullptr, const ICLTensor *output_shifts = nullptr);
- /** Static function to check if given info will lead to a valid configuration of @ref CLDepthwiseConvolutionLayerNativeKernel
- *
- * @param[in] input Source tensor info. Data type supported: QASYMM8/QASYMM8_SIGNED/F16/F32. Data layout supported: NHWC
- * @param[in] weights Weights tensor info. A 3D tensor with dimensions [IFM, N, M].
- * Data type supported: Same as @p input or QASYMM8/QASYMM8_SIGNED/QSYMM8_PER_CHANNEL when @p input is QASYMM8/QASYMM8_SIGNED.
- * @param[in] biases Biases tensor info. A 1D tensor with dimensions [IFM]. Must be nullptr if not needed.
- * Data type supported: Same as @p input, S32 when input is QASYMM8/QASYMM8_SIGNED.
- * @param[in] output Destination tensor info. Data type supported: Same as @p input.
- * @param[in] dwc_weights_info Depthwise convolution layer weights info to retrieve the number of output elements processed by each thread
- * @param[in] dwc_info Depthwise convolution layer info
- * @param[in] conv_info Padding and stride information to use for the convolution.
- * @param[in] depth_multiplier (Optional) Multiplier to apply to the input's depth in order to retrieve the output's depth. Defaults to 1.
- * @param[in] dilation (Optional) Dilation, in elements, across x and y. Defaults to (1, 1).
- * @param[in] output_multipliers (Optional) Output multipliers tensor for quantized computations. In case of per-channel quantization,
- * the number of multipliers must be equal to the number of filters (IFM). Supported data types: S32
- * @param[in] output_shifts (Optional) Output shifts tensor for quantized computations. In case of per-channel quantization,
- * the number of multipliers must be equal to the number of filters (IFM). Supported data types: S32
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const DWCWeightsKernelInfo &dwc_weights_info,
- const DWCKernelInfo &dwc_info, const PadStrideInfo &conv_info, unsigned int depth_multiplier = 1, const Size2D &dilation = Size2D(1U, 1U),
- const ITensorInfo *output_multipliers = nullptr, const ITensorInfo *output_shifts = nullptr);
-
- // Inherited methods overridden:
- void run(const Window &window, cl::CommandQueue &queue) override;
-
-private:
- const ICLTensor *_input;
- const ICLTensor *_weights;
- const ICLTensor *_biases;
- ICLTensor *_output;
- unsigned int _depth_multiplier;
- const ICLTensor *_output_multipliers;
- const ICLTensor *_output_shifts;
- bool _is_quantized;
-};
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_CLDEPTHWISECONVOLUTIONLAYERNATIVEKERNEL_H */
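(A configuration sketch for the native MxN kernel; the n0 and activation_info members used below are assumed from the same release's KernelDescriptors.h, so treat the field names as unverified.)

    #include "arm_compute/core/CL/kernels/CLDepthwiseConvolutionLayerNativeKernel.h"
    #include "arm_compute/core/KernelDescriptors.h"
    #include "arm_compute/runtime/CL/CLTensor.h"

    using namespace arm_compute;

    // NHWC, F32: src (IFM, W, H), weights (IFM, N, M), biases (IFM); 5x5 filter assumed
    void configure_dwc_native(CLDepthwiseConvolutionLayerNativeKernel &dwc,
                              CLTensor &src, CLTensor &weights, CLTensor &biases, CLTensor &dst)
    {
        DWCWeightsKernelInfo dwc_weights_info{};
        dwc_weights_info.n0 = 4; // output elements processed per thread (assumed field name)

        DWCKernelInfo dwc_info{};
        dwc_info.activation_info =
            ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU); // fused activation (assumed field name)

        // The weights stay un-reshaped ("native"), unlike the reshaped 3x3 paths.
        dwc.configure(&src, &weights, &biases, &dst, dwc_weights_info, dwc_info,
                      PadStrideInfo(1, 1, 2, 2) /* stride 1, pad 2 for a 5x5 filter */);
    }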
diff --git a/arm_compute/core/CL/kernels/CLDepthwiseConvolutionLayerReshapeWeightsKernel.h b/arm_compute/core/CL/kernels/CLDepthwiseConvolutionLayerReshapeWeightsKernel.h
deleted file mode 100644
index 8dc5d32e4f..0000000000
--- a/arm_compute/core/CL/kernels/CLDepthwiseConvolutionLayerReshapeWeightsKernel.h
+++ /dev/null
@@ -1,85 +0,0 @@
-/*
- * Copyright (c) 2019-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CLDEPTHWISECONVOLUTIONLAYERRESHAPEWEIGHTSKERNEL_H
-#define ARM_COMPUTE_CLDEPTHWISECONVOLUTIONLAYERRESHAPEWEIGHTSKERNEL_H
-
-#include "arm_compute/core/CL/ICLKernel.h"
-
-namespace arm_compute
-{
-class ICLTensor;
-
-/** Interface for the kernel to reshape the weights of depthwise convolution. */
-class CLDepthwiseConvolutionLayerReshapeWeightsKernel : public ICLKernel
-{
-public:
- /** Default constructor */
- CLDepthwiseConvolutionLayerReshapeWeightsKernel();
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- CLDepthwiseConvolutionLayerReshapeWeightsKernel(const CLDepthwiseConvolutionLayerReshapeWeightsKernel &) = delete;
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- CLDepthwiseConvolutionLayerReshapeWeightsKernel &operator=(const CLDepthwiseConvolutionLayerReshapeWeightsKernel &) = delete;
- /** Default Move Constructor. */
- CLDepthwiseConvolutionLayerReshapeWeightsKernel(CLDepthwiseConvolutionLayerReshapeWeightsKernel &&) = default;
- /** Default move assignment operator */
- CLDepthwiseConvolutionLayerReshapeWeightsKernel &operator=(CLDepthwiseConvolutionLayerReshapeWeightsKernel &&) = default;
-
- /** Initialize the function's source and destination.
- *
- * @param[in] input The input tensor of dimension [IFM, W, H]. Data types supported: All. Data layouts supported: NHWC
- * @param[out] output The output tensor of dimension [W*H*C0, ceil(IFM/C0)]. C0 is the number of channels read by each thread. Data types supported: same as @p input.
- * @param[in] info Depthwise convolution information to reshape the input tensor.
- */
- void configure(const ICLTensor *input, ICLTensor *output, const DepthwiseConvolutionReshapeInfo &info);
- /** Initialize the function's source and destination.
- *
- * @param[in] compile_context The compile context to be used.
- * @param[in] input The input tensor of dimension [IFM, W, H]. Data types supported: All. Data layouts supported: NHWC
- * @param[out] output The output tensor of dimension [W*H*C0, ceil(IFM/C0)]. C0 is the number of channels read by each thread. Data types supported: same as @p input.
- * @param[in] info Depthwise convolution information to reshape the input tensor.
- */
- void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, const DepthwiseConvolutionReshapeInfo &info);
-
- /** Static function to check if given info will lead to a valid configuration of @ref CLDepthwiseConvolutionLayerReshapeWeightsKernel
- *
- * @param[in] input The input tensor info of dimension [IFM, W, H]. Data types supported: All. Data layouts supported: NHWC
- * @param[in] output The output tensor info of dimension [W*H*C0, ceil(IFM/C0)]. C0 is the number of channels read by each thread. Data types supported: same as @p input.
- * @param[in] info Depthwise convolution information to reshape the input tensor.
- *
- * @return a Status
- */
- static Status validate(const ITensorInfo *input, const ITensorInfo *output, const DepthwiseConvolutionReshapeInfo &info);
-
- // Inherited methods overridden:
- void run(const Window &window, cl::CommandQueue &queue) override;
-
-private:
- const ICLTensor *_input;
- ICLTensor *_output;
-
- void configure_dot_product(const DepthwiseConvolutionReshapeInfo &info);
- void configure_generic(const DepthwiseConvolutionReshapeInfo &info);
-};
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_CLDEPTHWISECONVOLUTIONLAYERRESHAPEWEIGHTSKERNEL_H */
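(A validate-only sketch of the weights reshape; the c0/transpose members of DepthwiseConvolutionReshapeInfo are assumed field names from the same release's Types.h.)

    #include "arm_compute/core/CL/kernels/CLDepthwiseConvolutionLayerReshapeWeightsKernel.h"
    #include "arm_compute/core/TensorInfo.h"

    using namespace arm_compute;

    Status check_reshape_weights()
    {
        DepthwiseConvolutionReshapeInfo info{};
        info.c0        = 4;     // channels read by each thread (assumed field name)
        info.transpose = false; // (assumed field name)

        // Input (IFM, W, H) = (16, 3, 3); output (W*H*C0, ceil(IFM/C0)) = (3*3*4, 16/4) = (36, 4)
        TensorInfo src(TensorShape(16U, 3U, 3U), 1, DataType::QASYMM8);
        src.set_data_layout(DataLayout::NHWC);
        TensorInfo dst(TensorShape(36U, 4U), 1, DataType::QASYMM8);

        return CLDepthwiseConvolutionLayerReshapeWeightsKernel::validate(&src, &dst, info);
    }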
diff --git a/arm_compute/core/CL/kernels/CLDerivativeKernel.h b/arm_compute/core/CL/kernels/CLDerivativeKernel.h
deleted file mode 100644
index cd8ae90c2d..0000000000
--- a/arm_compute/core/CL/kernels/CLDerivativeKernel.h
+++ /dev/null
@@ -1,83 +0,0 @@
-/*
- * Copyright (c) 2016-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CLDERIVATIVEKERNEL_H
-#define ARM_COMPUTE_CLDERIVATIVEKERNEL_H
-
-#include "arm_compute/core/CL/ICLKernel.h"
-
-namespace arm_compute
-{
-class ICLTensor;
-
-/** Interface for the derivative kernel. */
-class CLDerivativeKernel : public ICLKernel
-{
-public:
- /** Default constructor */
- CLDerivativeKernel();
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- CLDerivativeKernel(const CLDerivativeKernel &) = delete;
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- CLDerivativeKernel &operator=(const CLDerivativeKernel &) = delete;
- /** Allow instances of this class to be moved */
- CLDerivativeKernel(CLDerivativeKernel &&) = default;
- /** Allow instances of this class to be moved */
- CLDerivativeKernel &operator=(CLDerivativeKernel &&) = default;
- /** Default destructor */
- ~CLDerivativeKernel() = default;
- /** Initialise the kernel's sources, destination and border
- *
- * @note At least one of output_x or output_y must be set
- *
- * @param[in] input Source tensor. Data types supported: U8.
- * @param[out] output_x (Optional) Destination tensor for the X gradient. Data types supported: S16.
- * @param[out] output_y (Optional) Destination tensor for the Y gradient. Data types supported: S16.
- * @param[in] border_undefined True if the border mode is undefined. False if it's replicate or constant.
- */
- void configure(const ICLTensor *input, ICLTensor *output_x, ICLTensor *output_y, bool border_undefined);
- /** Initialise the kernel's sources, destination and border
- *
- * @note At least one of output_x or output_y must be set
- *
- * @param[in] compile_context The compile context to be used.
- * @param[in] input Source tensor. Data types supported: U8.
- * @param[out] output_x (Optional) Destination tensor for the X gradient. Data types supported: S16.
- * @param[out] output_y (Optional) Destination tensor for the Y gradient. Data types supported: S16.
- * @param[in] border_undefined True if the border mode is undefined. False if it's replicate or constant.
- */
- void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output_x, ICLTensor *output_y, bool border_undefined);
-
- // Inherited methods overridden:
- void run(const Window &window, cl::CommandQueue &queue) override;
- BorderSize border_size() const override;
-
-private:
- const ICLTensor *_input; /**< Input tensor */
- ICLTensor *_output_x; /**< Output tensor - Derivative along the X direction */
- ICLTensor *_output_y; /**< Output tensor - Derivative along the Y direction */
- bool _run_derivative_x; /**< Do we need to run Derivative X? */
- bool _run_derivative_y; /**< Do we need to run Derivative Y? */
-};
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_CLDERIVATIVEKERNEL_H */
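(A standalone sketch; note the doc above explicitly allows passing nullptr for one of the two gradient outputs.)

    #include "arm_compute/core/CL/kernels/CLDerivativeKernel.h"
    #include "arm_compute/runtime/CL/CLScheduler.h"
    #include "arm_compute/runtime/CL/CLTensor.h"

    using namespace arm_compute;

    int main()
    {
        CLScheduler::get().default_init();

        CLTensor src, grad_x;
        src.allocator()->init(TensorInfo(TensorShape(64U, 64U), 1, DataType::U8));
        grad_x.allocator()->init(TensorInfo(TensorShape(64U, 64U), 1, DataType::S16));

        // Only the X gradient is requested; output_y is left as nullptr.
        CLDerivativeKernel derivative;
        derivative.configure(&src, &grad_x, nullptr, true /* border_undefined */);

        src.allocator()->allocate();
        grad_x.allocator()->allocate();
        // ... fill src ...
        CLScheduler::get().enqueue(derivative); // border pixels stay undefined here
        return 0;
    }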
diff --git a/arm_compute/core/CL/kernels/CLDilateKernel.h b/arm_compute/core/CL/kernels/CLDilateKernel.h
deleted file mode 100644
index 45f5fe0764..0000000000
--- a/arm_compute/core/CL/kernels/CLDilateKernel.h
+++ /dev/null
@@ -1,59 +0,0 @@
-/*
- * Copyright (c) 2016-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CLDILATEKERNEL_H
-#define ARM_COMPUTE_CLDILATEKERNEL_H
-
-#include "arm_compute/core/CL/ICLSimple2DKernel.h"
-
-namespace arm_compute
-{
-class ICLTensor;
-
-/** Interface for the dilate kernel.
- *
- */
-class CLDilateKernel : public ICLSimple2DKernel
-{
-public:
- /** Initialise the kernel's input and output.
- *
- * @param[in] input An input tensor. Data types supported: U8
- * @param[out] output The output tensor. Data types supported: U8.
- * @param[in] border_undefined True if the border mode is undefined. False if it's replicate or constant.
- */
- void configure(const ICLTensor *input, ICLTensor *output, bool border_undefined);
- /** Initialise the kernel's input and output.
- *
- * @param[in] compile_context The compile context to be used.
- * @param[in] input An input tensor. Data types supported: U8
- * @param[out] output The output tensor. Data types supported: U8.
- * @param[in] border_undefined True if the border mode is undefined. False if it's replicate or constant.
- */
- void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, bool border_undefined);
-
- // Inherited methods overridden:
- BorderSize border_size() const override;
-};
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_CLDILATEKERNEL_H */
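(Being an ICLSimple2DKernel, usage reduces to a single configure call on already-initialised U8 tensors; a sketch, with the border fill a caller would normally schedule first omitted.)

    #include "arm_compute/core/CL/kernels/CLDilateKernel.h"
    #include "arm_compute/runtime/CL/CLScheduler.h"
    #include "arm_compute/runtime/CL/CLTensor.h"

    using namespace arm_compute;

    void dilate_u8(CLTensor &src, CLTensor &dst)
    {
        CLDilateKernel dilate;
        // border_undefined = false: a replicate/constant border is assumed to have
        // been filled beforehand (e.g. by CLFillBorderKernel).
        dilate.configure(&src, &dst, false);
        CLScheduler::get().enqueue(dilate);
    }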
diff --git a/arm_compute/core/CL/kernels/CLDirectConvolutionLayerKernel.h b/arm_compute/core/CL/kernels/CLDirectConvolutionLayerKernel.h
deleted file mode 100644
index 489d7c27c5..0000000000
--- a/arm_compute/core/CL/kernels/CLDirectConvolutionLayerKernel.h
+++ /dev/null
@@ -1,125 +0,0 @@
-/*
- * Copyright (c) 2017-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CLDIRECTCONVOLUTIONLAYERKERNEL_H
-#define ARM_COMPUTE_CLDIRECTCONVOLUTIONLAYERKERNEL_H
-
-#include "arm_compute/core/CL/ICLKernel.h"
-#include "arm_compute/core/Types.h"
-
-namespace arm_compute
-{
-class ICLTensor;
-
-/** Interface for the direct convolution kernel.
- */
-class CLDirectConvolutionLayerKernel : public ICLKernel
-{
-public:
- /** Default constructor */
- CLDirectConvolutionLayerKernel();
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- CLDirectConvolutionLayerKernel(const CLDirectConvolutionLayerKernel &) = delete;
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- CLDirectConvolutionLayerKernel &operator=(const CLDirectConvolutionLayerKernel &) = delete;
- /** Allow instances of this class to be moved */
- CLDirectConvolutionLayerKernel(CLDirectConvolutionLayerKernel &&) = default;
- /** Allow instances of this class to be moved */
- CLDirectConvolutionLayerKernel &operator=(CLDirectConvolutionLayerKernel &&) = default;
- /** Default destructor */
- ~CLDirectConvolutionLayerKernel() = default;
- /** Set the input, weights, biases and output tensors.
- *
- * @note: DirectConvolution only works in the following configurations:
- * 1x1 convolution with stride_x = 1/2/3, stride_y = 1/2/3
- * 3x3 convolution with stride_x = 1/2, stride_y = 1/2
- * 5x5 convolution with stride_x = 1/2, stride_y = 1/2
- * 9x9 convolution with stride_x = 1/2, stride_y = 1/2, data_layout=NHWC
- *
- * @param[in] input The input tensor to convolve. 3 lower dimensions represent a single input [width, height, IFM],
- * while every optional dimension from 4 and above represent a batch of inputs. Data types supported: QASYMM8_SIGNED/QASYMM8/F16/F32.
- * @param[in] weights Weights tensor. Weights are 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM].
- * The 3rd dimension must be the same as the input's volume 3rd dimension.
- * Data type supported: Same as @p input.
- * @param[in] biases Biases tensor. Biases are 1D tensor with dimension [OFM].
- * Data type supported: Should match @p input data type, except for input of QASYMM8 and QASYMM8_SIGNED type where biases should be of S32 type
- * @param[out] output Output tensor.
- * The 3rd dimension must be equal to the 4th dimension of the @p weights tensor. Data types supported: Same as @p input.
- * @param[in] conv_info Contains padding and stride information described in @ref PadStrideInfo.
- */
- void configure(const ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const PadStrideInfo &conv_info);
- /** Set the input, weights, biases and output tensors.
- *
- * @note: DirectConvolution only works in the following configurations:
- * 1x1 convolution with stride_x = 1/2/3, stride_y = 1/2/3
- * 3x3 convolution with stride_x = 1/2, stride_y = 1/2
- * 5x5 convolution with stride_x = 1/2, stride_y = 1/2
- * 9x9 convolution with stride_x = 1/2, stride_y = 1/2, data_layout=NHWC
- *
- * @param[in] compile_context The compile context to be used.
- * @param[in] input The input tensor to convolve. 3 lower dimensions represent a single input [width, height, IFM],
- * while every optional dimension from 4 and above represent a batch of inputs. Data types supported: QASYMM8_SIGNED/QASYMM8/F16/F32.
- * @param[in] weights Weights tensor. Weights are 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM].
- * The 3rd dimension must be the same as the input's volume 3rd dimension.
- * Data type supported: Same as @p input.
- * @param[in] biases Biases tensor. Biases are 1D tensor with dimension [OFM].
- * Data type supported: Should match @p input data type, except for input of QASYMM8 and QASYMM8_SIGNED type where biases should be of S32 type
- * @param[out] output Output tensor.
- * The 3rd dimension must be equal to the 4th dimension of the @p weights tensor. Data types supported: Same as @p input.
- * @param[in] conv_info Contains padding and stride information described in @ref PadStrideInfo.
- */
- void configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const PadStrideInfo &conv_info);
- /** Static function to check if given info will lead to a valid configuration of @ref CLDirectConvolutionLayerKernel
- *
- * @param[in] input The input tensor to convolve. 3 lower dimensions represent a single input [width, height, IFM],
- * while every optional dimension from 4 and above represent a batch of inputs. Data types supported: QASYMM8_SIGNED/QASYMM8/F16/F32.
- * @param[in] weights Weights tensor. Weights are 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM].
- * The 3rd dimension must be the same as the input's volume 3rd dimension.
- * Data type supported: Same as @p input.
- * @param[in] biases Biases tensor. Biases are 1D tensor with dimension [OFM].
- * Data type supported: Should match @p input data type, except for input of QASYMM8 and QASYMM8_SIGNED type where biases should be of S32 type.
- * @param[in] output Output tensor.
- * The 3rd dimension must be equal to the 4th dimension of the @p weights tensor. Data types supported: Same as @p input.
- * @param[in] conv_info Contains padding and stride information described in @ref PadStrideInfo.
- * @param[in] target Target GPU architecture.
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info, const GPUTarget target);
-
- // Inherited methods overridden:
- void run(const Window &window, cl::CommandQueue &queue) override;
- BorderSize border_size() const override;
-
-public:
- const ICLTensor *_input;
- const ICLTensor *_biases;
- const ICLTensor *_weights;
- ICLTensor *_output;
- DataLayout _data_layout;
- BorderSize _border_size;
- int _conv_stride_x;
- int _conv_stride_y;
-};
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_CLDIRECTCONVOLUTIONLAYERKERNEL_H */
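(A validate sketch for one of the supported configurations listed above (3x3, stride 1), checked against the scheduler's detected GPU target; shapes are illustrative.)

    #include "arm_compute/core/CL/kernels/CLDirectConvolutionLayerKernel.h"
    #include "arm_compute/core/TensorInfo.h"
    #include "arm_compute/runtime/CL/CLScheduler.h"

    using namespace arm_compute;

    Status check_direct_conv()
    {
        TensorInfo src(TensorShape(8U, 8U, 16U), 1, DataType::F32);          // (W, H, IFM)
        TensorInfo weights(TensorShape(3U, 3U, 16U, 32U), 1, DataType::F32); // (kx, ky, IFM, OFM)
        TensorInfo biases(TensorShape(32U), 1, DataType::F32);               // (OFM)
        TensorInfo dst(TensorShape(8U, 8U, 32U), 1, DataType::F32);          // 3rd dim = OFM

        const PadStrideInfo conv_info(1, 1, 1, 1); // 3x3, stride 1, pad 1
        return CLDirectConvolutionLayerKernel::validate(&src, &weights, &biases, &dst,
                                                        conv_info, CLScheduler::get().target());
    }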
diff --git a/arm_compute/core/CL/kernels/CLElementWiseUnaryLayerKernel.h b/arm_compute/core/CL/kernels/CLElementWiseUnaryLayerKernel.h
deleted file mode 100644
index e190bdebbe..0000000000
--- a/arm_compute/core/CL/kernels/CLElementWiseUnaryLayerKernel.h
+++ /dev/null
@@ -1,68 +0,0 @@
-/*
- * Copyright (c) 2018-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CLELEMENTWISEUNARYLAYERKERNEL_H
-#define ARM_COMPUTE_CLELEMENTWISEUNARYLAYERKERNEL_H
-
-#include "arm_compute/core/CL/ICLKernel.h"
-#include "arm_compute/core/CL/ICLSimpleKernel.h"
-#include "arm_compute/core/Types.h"
-
-namespace arm_compute
-{
-class ICLTensor;
-
-/** Interface for the elementwise unary operator */
-class CLElementWiseUnaryLayerKernel : public ICLSimpleKernel
-{
-public:
- /** Initialise the kernel's inputs, output.
- *
- * @param[in] input Tensor input. Data types supported: F16/F32.
- * @param[out] output Output tensor. Data types supported: Same as @p input.
- * @param[in] op Element wise unary operation to perform.
- */
- void configure(const ICLTensor *input, ICLTensor *output, const ElementWiseUnary &op);
- /** Initialise the kernel's inputs, output.
- *
- * @param[in] compile_context The compile context to be used.
- * @param[in] input Tensor input. Data types supported: F16/F32.
- * @param[out] output Output tensor. Data types supported: Same as @p input.
- * @param[in] op Element wise unary operation to perform.
- */
- void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, const ElementWiseUnary &op);
- /** Static function to check if given info will lead to a valid configuration of @ref CLElementWiseUnaryLayerKernel
- *
- * @param[in] input Tensor input info. Data types supported: F16/F32.
- * @param[in] output Output tensor info. Data types supported: Same as @p input.
- * @param[in] op Element wise unary operation to perform.
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *input, const ITensorInfo *output, const ElementWiseUnary &op);
-
- // Inherited methods overridden:
- void run(const Window &window, cl::CommandQueue &queue) override;
-};
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_CLELEMENTWISEUNARYLAYERKERNEL_H */
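(A standalone sketch applying RSQRT; ElementWiseUnary is the operation enumeration from Types.h, and the 1D shape is illustrative.)

    #include "arm_compute/core/CL/kernels/CLElementWiseUnaryLayerKernel.h"
    #include "arm_compute/core/Error.h"
    #include "arm_compute/runtime/CL/CLScheduler.h"
    #include "arm_compute/runtime/CL/CLTensor.h"

    using namespace arm_compute;

    int main()
    {
        CLScheduler::get().default_init();

        CLTensor src, dst;
        src.allocator()->init(TensorInfo(TensorShape(256U), 1, DataType::F32));
        dst.allocator()->init(TensorInfo(TensorShape(256U), 1, DataType::F32));

        ARM_COMPUTE_ERROR_THROW_ON(
            CLElementWiseUnaryLayerKernel::validate(src.info(), dst.info(), ElementWiseUnary::RSQRT));

        CLElementWiseUnaryLayerKernel rsqrt;
        rsqrt.configure(&src, &dst, ElementWiseUnary::RSQRT); // out(x) = 1 / sqrt(in(x))

        src.allocator()->allocate();
        dst.allocator()->allocate();
        CLScheduler::get().enqueue(rsqrt);
        return 0;
    }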
diff --git a/arm_compute/core/CL/kernels/CLElementwiseOperationKernel.h b/arm_compute/core/CL/kernels/CLElementwiseOperationKernel.h
deleted file mode 100644
index 4d3d4bc834..0000000000
--- a/arm_compute/core/CL/kernels/CLElementwiseOperationKernel.h
+++ /dev/null
@@ -1,224 +0,0 @@
-/*
- * Copyright (c) 2018-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CLELEMENTWISEOPERATIONKERNEL_H
-#define ARM_COMPUTE_CLELEMENTWISEOPERATIONKERNEL_H
-
-#include "arm_compute/core/CL/ICLKernel.h"
-#include "arm_compute/core/Types.h"
-
-namespace arm_compute
-{
-class ICLTensor;
-
-/** Interface for an element-wise operation kernel
- *
- * Element-wise operation is computed by:
- * @f[ output(x,y) = OP(input1(x,y), input2(x,y))@f]
- *
- */
-class CLElementwiseOperationKernel : public ICLKernel
-{
-public:
- /** Default constructor */
- CLElementwiseOperationKernel();
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- CLElementwiseOperationKernel(const CLElementwiseOperationKernel &) = delete;
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- CLElementwiseOperationKernel &operator=(const CLElementwiseOperationKernel &) = delete;
- /** Allow instances of this class to be moved */
- CLElementwiseOperationKernel(CLElementwiseOperationKernel &&) = default;
- /** Allow instances of this class to be moved */
- CLElementwiseOperationKernel &operator=(CLElementwiseOperationKernel &&) = default;
- /** Default destructor */
- ~CLElementwiseOperationKernel() = default;
-
- // Inherited methods overridden:
- void run(const Window &window, cl::CommandQueue &queue) override;
-
- BorderSize border_size() const override;
-
-protected:
- /** The name of the operation */
- virtual std::string name() = 0;
-
- /** Validate the inputs and configure the kernel's execution window.
- *
- * @param[in] input1 First tensor input. Data types supported: U8/S8/QASYMM8/QASYMM8_SIGNED/U16/S16/F16/U32/S32/F32.
- * @param[in] input2 Second tensor input. Data types supported: Same as @p input1.
- * @param[in] output Output tensor. Data types supported: Same as @p input1.
- *
- * @return a pair of Status and Window
- */
- virtual std::pair<Status, Window> validate_and_configure_window(ITensorInfo &input1, ITensorInfo &input2, ITensorInfo &output) = 0;
-
- /** Validate the arguments passed to the kernel
- *
- * @param[in] input1 First tensor input. Data types supported: U8/S8/QASYMM8/QASYMM8_SIGNED/U16/S16/F16/U32/S32/F32.
- * @param[in] input2 Second tensor input. Data types supported: Same as @p input1.
- * @param[in] output Output tensor. Data types supported: Same as @p input1.
- */
- virtual Status validate_arguments(const ITensorInfo &input1, const ITensorInfo &input2, const ITensorInfo &output) = 0;
-
- /** Generate the build options for the specific kernel
- *
- * @return a CLBuildOptions struct
- */
- virtual CLBuildOptions generate_build_options(const ITensorInfo &input1, const ITensorInfo &input2, const ITensorInfo &output) = 0;
-
- /** Generate the identifier for tuning
- *
- * @return a string
- */
- virtual std::string generate_id_for_tuning(const std::string &kernel_name, const ITensorInfo &input1, const ITensorInfo &output) = 0;
-
- /** Common configure function for element-wise operators with no additional options (e.g., Div, Min, Max, SquaredDiff)
- *
- */
- void configure_common(const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output);
- /** Common configure function for element-wise operators with no additional options (e.g., Div, Min, Max, SquaredDiff)
- *
- */
- void configure_common(const CLCompileContext &compile_context, const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output);
-
- ActivationLayerInfo _act_info;
-
-private:
- const ICLTensor *_input1; /**< Source tensor 1 */
- const ICLTensor *_input2; /**< Source tensor 2 */
- ICLTensor *_output; /**< Destination tensor */
-};
-
-/** Kernel for saturated arithmetic operations (e.g. addition and subtraction with an overflow policy) */
-class CLSaturatedArithmeticOperationKernel : public CLElementwiseOperationKernel
-{
-public:
- CLSaturatedArithmeticOperationKernel()
- : CLElementwiseOperationKernel(), _policy(), _op()
- {
- }
-
- /** Initialise the kernel's inputs and output.
- *
- * @param[in] op Arithmetic operation to be executed.
- * @param[in] input1 First tensor input. Data types supported: U8/S8/QASYMM8/QASYMM8_SIGNED/U16/S16/QSYMM16/F16/U32/S32/F32.
- * @param[in] input2 Second tensor input. Data types supported: Same as @p input1.
- * @param[out] output Output tensor. Data types supported: Same as @p input1.
- * @param[in] policy Policy to use to handle overflow.
- * @param[in] act_info (Optional) Activation layer information in case of a fused activation.
- */
- void configure(ArithmeticOperation op, const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output, const ConvertPolicy &policy, const ActivationLayerInfo &act_info = ActivationLayerInfo());
- /** Initialise the kernel's inputs and output.
- *
- * @param[in] compile_context The compile context to be used.
- * @param[in] op Arithmetic operation to be executed.
- * @param[in] input1 First tensor input. Data types supported: U8/S8/QASYMM8/QASYMM8_SIGNED/U16/S16/QSYMM16/F16/U32/S32/F32.
- * @param[in] input2 Second tensor input. Data types supported: Same as @p input1.
- * @param[out] output Output tensor. Data types supported: Same as @p input1.
- * @param[in] policy Policy to use to handle overflow.
- * @param[in] act_info (Optional) Activation layer information in case of a fused activation.
- */
- void configure(const CLCompileContext &compile_context, ArithmeticOperation op, const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output, const ConvertPolicy &policy,
- const ActivationLayerInfo &act_info = ActivationLayerInfo());
-
- /** Static function to check if given info will lead to a valid configuration of @ref CLSaturatedArithmeticOperationKernel
- *
- * @param[in] op Arithmetic operation to be executed.
- * @param[in] input1 First tensor input info. Data types supported: U8/S8/QASYMM8/QASYMM8_SIGNED/U16/S16/QSYMM16/F16/U32/S32/F32.
- * @param[in] input2 Second tensor input info. Data types supported: Same as @p input1.
- * @param[in] output Output tensor info. Data types supported: Same as @p input1.
- * @param[in] policy Policy to use to handle overflow.
- * @param[in] act_info (Optional) Activation layer information in case of a fused activation.
- *
- * @return a Status
- */
- static Status validate(ArithmeticOperation op, const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, const ConvertPolicy &policy,
- const ActivationLayerInfo &act_info = ActivationLayerInfo());
-
-protected:
- // Inherited methods overridden:
- std::string name() override;
- std::pair<Status, Window> validate_and_configure_window(ITensorInfo &input1, ITensorInfo &input2, ITensorInfo &output) override;
- Status validate_arguments(const ITensorInfo &input1, const ITensorInfo &input2, const ITensorInfo &output) override;
- CLBuildOptions generate_build_options(const ITensorInfo &input1, const ITensorInfo &input2, const ITensorInfo &output) override;
- std::string generate_id_for_tuning(const std::string &kernel_name, const ITensorInfo &input1, const ITensorInfo &output) override;
-
-private:
- ConvertPolicy _policy;
- ArithmeticOperation _op;
-};
-
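-/** Kernel for element-wise arithmetic operations that take no conversion policy (e.g., Div, Min, Max, SquaredDiff) */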
-class CLArithmeticOperationKernel : public CLElementwiseOperationKernel
-{
-public:
- CLArithmeticOperationKernel()
- : CLElementwiseOperationKernel(), _op()
- {
- }
-
- /** Initialise the kernel's inputs and output.
- *
- * @param[in] op Arithmetic operation to be executed.
- * @param[in] input1 First tensor input. Data types supported: U8/S8/QASYMM8/QASYMM8_SIGNED/U16/S16/QSYMM16/F16/U32/S32/F32.
- * @param[in] input2 Second tensor input. Data types supported: Same as @p input1.
- * @param[in] output Output tensor. Data types supported: Same as @p input1.
- * @param[in] act_info (Optional) Activation layer information in case of a fused activation.
- */
- void configure(ArithmeticOperation op, const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output, const ActivationLayerInfo &act_info = ActivationLayerInfo());
- /** Initialise the kernel's inputs and output.
- *
- * @param[in] compile_context The compile context to be used.
- * @param[in] op Arithmetic operation to be executed.
- * @param[in] input1 First tensor input. Data types supported: U8/S8/QASYMM8/QASYMM8_SIGNED/U16/S16/QSYMM16/F16/U32/S32/F32.
- * @param[in] input2 Second tensor input. Data types supported: Same as @p input1.
- * @param[in] output Output tensor. Data types supported: Same as @p input1.
- * @param[in] act_info (Optional) Activation layer information in case of a fused activation.
- */
- void configure(const CLCompileContext &compile_context, ArithmeticOperation op, const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output,
- const ActivationLayerInfo &act_info = ActivationLayerInfo());
-
- /** Static function to check if given info will lead to a valid configuration of @ref CLArithmeticOperationKernel
- *
- * @param[in] op Arithmetic operation to be executed.
- * @param[in] input1 First tensor input info. Data types supported: U8/S8/QASYMM8/QASYMM8_SIGNED/U16/S16/QSYMM16/F16/U32/S32/F32.
- * @param[in] input2 Second tensor input info. Data types supported: Same as @p input1.
- * @param[in] output Output tensor info. Data types supported: Same as @p input1.
- * @param[in] act_info (Optional) Activation layer information in case of a fused activation.
- *
- * @return a Status
- */
- static Status validate(ArithmeticOperation op, const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, const ActivationLayerInfo &act_info = ActivationLayerInfo());
-
-protected:
- // Inherited methods overridden:
- std::string name() override;
- std::pair<Status, Window> validate_and_configure_window(ITensorInfo &input1, ITensorInfo &input2, ITensorInfo &output) override;
- Status validate_arguments(const ITensorInfo &input1, const ITensorInfo &input2, const ITensorInfo &output) override;
- CLBuildOptions generate_build_options(const ITensorInfo &input1, const ITensorInfo &input2, const ITensorInfo &output) override;
- std::string generate_id_for_tuning(const std::string &kernel_name, const ITensorInfo &input1, const ITensorInfo &output) override;
-
-private:
- ArithmeticOperation _op;
-};
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_CLELEMENTWISEOPERATIONKERNEL_H */
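-
-// Illustrative usage sketch (hedged, not library code): shows the intended
-// validate()/configure() pairing for a saturated addition. The helper name
-// run_saturated_add and the pre-allocated CLTensor arguments are assumptions.
-#include "arm_compute/core/CL/kernels/CLElementwiseOperationKernel.h"
-#include "arm_compute/core/Error.h"
-#include "arm_compute/runtime/CL/CLScheduler.h"
-#include "arm_compute/runtime/CL/CLTensor.h"
-
-void run_saturated_add(arm_compute::CLTensor &a, arm_compute::CLTensor &b, arm_compute::CLTensor &out)
-{
- using namespace arm_compute;
- // Check the tensor metadata before configuring; validate() is static and returns a Status.
- ARM_COMPUTE_ERROR_THROW_ON(CLSaturatedArithmeticOperationKernel::validate(ArithmeticOperation::ADD, a.info(), b.info(), out.info(), ConvertPolicy::SATURATE));
-
- CLSaturatedArithmeticOperationKernel add_kernel;
- add_kernel.configure(ArithmeticOperation::ADD, &a, &b, &out, ConvertPolicy::SATURATE);
- CLScheduler::get().enqueue(add_kernel); // submit to the CL command queue
-}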
diff --git a/arm_compute/core/CL/kernels/CLErodeKernel.h b/arm_compute/core/CL/kernels/CLErodeKernel.h
deleted file mode 100644
index cbc748194c..0000000000
--- a/arm_compute/core/CL/kernels/CLErodeKernel.h
+++ /dev/null
@@ -1,59 +0,0 @@
-/*
- * Copyright (c) 2016-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CLERODEKERNEL_H
-#define ARM_COMPUTE_CLERODEKERNEL_H
-
-#include "arm_compute/core/CL/ICLSimple2DKernel.h"
-
-namespace arm_compute
-{
-class ICLTensor;
-
-/** Interface for the erode kernel. */
-class CLErodeKernel : public ICLSimple2DKernel
-{
-public:
- /** Initialise the kernel's input and output.
- *
- * @param[in] input An input tensor. Data types supported: U8
- * @param[out] output The output tensor. Data types supported: U8.
- * @param[in] border_undefined True if the border mode is undefined. False if it's replicate or constant.
- */
- void configure(const ICLTensor *input, ICLTensor *output, bool border_undefined);
- /** Initialise the kernel's input and output.
- *
- * @param[in] compile_context The compile context to be used.
- * @param[in] input An input tensor. Data types supported: U8
- * @param[out] output The output tensor. Data types supported: U8.
- * @param[in] border_undefined True if the border mode is undefined. False if it's replicate or constant.
- */
- void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, bool border_undefined);
-
- // Inherited methods overridden:
- BorderSize border_size() const override;
-};
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_CLERODEKERNEL_H */
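-
-// Illustrative usage sketch (assumption-laden, not library code): configures the
-// erode kernel on pre-allocated U8 CLTensor images; the helper name is hypothetical.
-#include "arm_compute/core/CL/kernels/CLErodeKernel.h"
-#include "arm_compute/runtime/CL/CLTensor.h"
-
-void configure_erode(arm_compute::CLTensor &src, arm_compute::CLTensor &dst)
-{
- arm_compute::CLErodeKernel erode;
- erode.configure(&src, &dst, /* border_undefined = */ false);
- // border_size() reports the extra pixels the 3x3 neighbourhood reads around each output.
- const arm_compute::BorderSize border = erode.border_size();
- static_cast<void>(border);
-}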
diff --git a/arm_compute/core/CL/kernels/CLFFTDigitReverseKernel.h b/arm_compute/core/CL/kernels/CLFFTDigitReverseKernel.h
deleted file mode 100644
index a8da1246bb..0000000000
--- a/arm_compute/core/CL/kernels/CLFFTDigitReverseKernel.h
+++ /dev/null
@@ -1,89 +0,0 @@
-/*
- * Copyright (c) 2019-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CLFFTDIGITREVERSEKERNEL_H
-#define ARM_COMPUTE_CLFFTDIGITREVERSEKERNEL_H
-
-#include "arm_compute/core/CL/ICLKernel.h"
-
-#include "arm_compute/core/KernelDescriptors.h"
-
-namespace arm_compute
-{
-// Forward declarations
-class ICLTensor;
-
-/** Interface for the digit reverse operation kernel. */
-class CLFFTDigitReverseKernel : public ICLKernel
-{
-public:
- /** Constructor */
- CLFFTDigitReverseKernel();
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- CLFFTDigitReverseKernel(const CLFFTDigitReverseKernel &) = delete;
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- CLFFTDigitReverseKernel &operator=(const CLFFTDigitReverseKernel &) = delete;
- /** Default move constructor */
- CLFFTDigitReverseKernel(CLFFTDigitReverseKernel &&) = default;
- /** Default move assignment operator */
- CLFFTDigitReverseKernel &operator=(CLFFTDigitReverseKernel &&) = default;
- /** Default destructor */
- ~CLFFTDigitReverseKernel() = default;
- /** Set the input and output tensors.
- *
- * @param[in] input Source tensor. Data types supported: F32.
- * @param[out] output Destination tensor. Data type supported: same as @p input
- * @param[in] idx Digit reverse index tensor. Data type supported: U32
- * @param[in] config Kernel configuration.
- */
- void configure(const ICLTensor *input, ICLTensor *output, const ICLTensor *idx, const FFTDigitReverseKernelInfo &config);
- /** Set the input and output tensors.
- *
- * @param[in] compile_context The compile context to be used.
- * @param[in] input Source tensor. Data types supported: F32.
- * @param[out] output Destination tensor. Data type supported: same as @p input
- * @param[in] idx Digit reverse index tensor. Data type supported: U32
- * @param[in] config Kernel configuration.
- */
- void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, const ICLTensor *idx, const FFTDigitReverseKernelInfo &config);
- /** Static function to check if given info will lead to a valid configuration of @ref CLFFTDigitReverseKernel
- *
- * @param[in] input Source tensor info. Data types supported: F32.
- * @param[in] output Destination tensor info. Data type supported: same as @p input
- * @param[in] idx Digit reverse index tensor info. Data type supported: U32
- * @param[in] config Kernel configuration.
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *input, const ITensorInfo *output, const ITensorInfo *idx, const FFTDigitReverseKernelInfo &config);
-
- // Inherited methods overridden:
- void run(const Window &window, cl::CommandQueue &queue) override;
-
-private:
- const ICLTensor *_input;
- ICLTensor *_output;
- const ICLTensor *_idx;
-};
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_CLFFTDIGITREVERSEKERNEL_H */
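-
-// Illustrative usage sketch (hedged): validates then configures the digit-reverse
-// stage. The CLTensor arguments are assumed allocated (src/dst: F32, idx: U32), and
-// the `axis` field of FFTDigitReverseKernelInfo is an assumption based on KernelDescriptors.h.
-#include "arm_compute/core/CL/kernels/CLFFTDigitReverseKernel.h"
-#include "arm_compute/core/Error.h"
-#include "arm_compute/runtime/CL/CLTensor.h"
-
-void configure_digit_reverse(arm_compute::CLTensor &src, arm_compute::CLTensor &dst, arm_compute::CLTensor &idx)
-{
- using namespace arm_compute;
- FFTDigitReverseKernelInfo config{};
- config.axis = 0; // reorder along the innermost dimension
-
- ARM_COMPUTE_ERROR_THROW_ON(CLFFTDigitReverseKernel::validate(src.info(), dst.info(), idx.info(), config));
- CLFFTDigitReverseKernel kernel;
- kernel.configure(&src, &dst, &idx, config);
-}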
diff --git a/arm_compute/core/CL/kernels/CLFFTRadixStageKernel.h b/arm_compute/core/CL/kernels/CLFFTRadixStageKernel.h
deleted file mode 100644
index e3f53462d9..0000000000
--- a/arm_compute/core/CL/kernels/CLFFTRadixStageKernel.h
+++ /dev/null
@@ -1,97 +0,0 @@
-/*
- * Copyright (c) 2019-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CLFFTRADIXSTAGEKERNEL_H
-#define ARM_COMPUTE_CLFFTRADIXSTAGEKERNEL_H
-
-#include "arm_compute/core/CL/ICLKernel.h"
-
-#include "arm_compute/core/KernelDescriptors.h"
-
-#include <set>
-
-namespace arm_compute
-{
-// Forward declarations
-class ICLTensor;
-
-/** Interface for the FFT radix stage kernel. */
-class CLFFTRadixStageKernel : public ICLKernel
-{
-public:
- /** Constructor */
- CLFFTRadixStageKernel();
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- CLFFTRadixStageKernel(const CLFFTRadixStageKernel &) = delete;
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- CLFFTRadixStageKernel &operator=(const CLFFTRadixStageKernel &) = delete;
- /** Default move constructor */
- CLFFTRadixStageKernel(CLFFTRadixStageKernel &&) = default;
- /** Default move assignment operator */
- CLFFTRadixStageKernel &operator=(CLFFTRadixStageKernel &&) = default;
- /** Default destructor */
- ~CLFFTRadixStageKernel() = default;
- /** Set the input and output tensors.
- *
- * @note If the output tensor is nullptr, the FFT will be performed in-place
- *
- * @param[in,out] input Source tensor. Data types supported: F32.
- * @param[out] output Destination tensor. Can be nullptr. Data type supported: same as @p input
- * @param[in] config FFT descriptor metadata.
- */
- void configure(ICLTensor *input, ICLTensor *output, const FFTRadixStageKernelInfo &config);
- /** Set the input and output tensors.
- *
- * @note If the output tensor is nullptr, the FFT will be performed in-place
- *
- * @param[in] compile_context The compile context to be used.
- * @param[in,out] input Source tensor. Data types supported: F32.
- * @param[out] output Destination tensor. Can be nullptr. Data type supported: same as @p input
- * @param[in] config FFT descriptor metadata.
- */
- void configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, const FFTRadixStageKernelInfo &config);
- /** Static function to check if given info will lead to a valid configuration of @ref CLFFTRadixStageKernel
- *
- * @param[in] input Source tensor info. Data types supported: F32.
- * @param[in] output Destination tensor info. Can be nullptr. Data type supported: same as @p input
- * @param[in] config FFT descriptor metadata.
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *input, const ITensorInfo *output, const FFTRadixStageKernelInfo &config);
- /** Returns the radixes supported by the FFT kernel
- *
- * @return A set of supported radixes
- */
- static std::set<unsigned int> supported_radix();
-
- // Inherited methods overridden:
- void run(const Window &window, cl::CommandQueue &queue) override;
-
-private:
- ICLTensor *_input;
- ICLTensor *_output;
- bool _run_in_place;
-};
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_CLFFTRADIXSTAGEKERNEL_H */
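-
-// Illustrative usage sketch (hedged): queries the supported radixes, then configures
-// one in-place stage. The field names on FFTRadixStageKernelInfo (axis, radix, Nx,
-// is_first_stage) are assumptions taken from KernelDescriptors.h.
-#include "arm_compute/core/CL/kernels/CLFFTRadixStageKernel.h"
-#include "arm_compute/runtime/CL/CLTensor.h"
-
-void configure_radix_stage(arm_compute::CLTensor &buffer)
-{
- using namespace arm_compute;
- const std::set<unsigned int> radixes = CLFFTRadixStageKernel::supported_radix();
-
- FFTRadixStageKernelInfo config{};
- config.axis = 0;
- config.radix = *radixes.begin(); // pick the smallest supported radix
- config.Nx = 1; // butterfly span of the first stage
- config.is_first_stage = true;
-
- CLFFTRadixStageKernel stage;
- stage.configure(&buffer, nullptr, config); // nullptr output -> run in-place
-}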
diff --git a/arm_compute/core/CL/kernels/CLFFTScaleKernel.h b/arm_compute/core/CL/kernels/CLFFTScaleKernel.h
deleted file mode 100644
index d0d2b7613c..0000000000
--- a/arm_compute/core/CL/kernels/CLFFTScaleKernel.h
+++ /dev/null
@@ -1,86 +0,0 @@
-/*
- * Copyright (c) 2019-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CLFFTSCALEKERNEL_H
-#define ARM_COMPUTE_CLFFTSCALEKERNEL_H
-
-#include "arm_compute/core/CL/ICLKernel.h"
-
-#include "arm_compute/core/KernelDescriptors.h"
-
-namespace arm_compute
-{
-// Forward declarations
-class ICLTensor;
-
-/** Interface for the inverse fft scale kernel. */
-class CLFFTScaleKernel : public ICLKernel
-{
-public:
- /** Constructor */
- CLFFTScaleKernel();
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- CLFFTScaleKernel(const CLFFTScaleKernel &) = delete;
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- CLFFTScaleKernel &operator=(const CLFFTScaleKernel &) = delete;
- /** Default move constructor */
- CLFFTScaleKernel(CLFFTScaleKernel &&) = default;
- /** Default move assignment operator */
- CLFFTScaleKernel &operator=(CLFFTScaleKernel &&) = default;
- /** Default destructor */
- ~CLFFTScaleKernel() = default;
- /** Set the input and output tensors.
- *
- * @param[in,out] input Source tensor. Data types supported: F32.
- * @param[out] output Destination tensor. Data type supported: same as @p input
- * @param[in] config Kernel configuration
- */
- void configure(ICLTensor *input, ICLTensor *output, const FFTScaleKernelInfo &config);
- /** Set the input and output tensors.
- *
- * @param[in] compile_context The compile context to be used.
- * @param[in,out] input Source tensor. Data types supported: F32.
- * @param[out] output Destination tensor. Data type supported: same as @p input
- * @param[in] config Kernel configuration
- */
- void configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, const FFTScaleKernelInfo &config);
- /** Static function to check if given info will lead to a valid configuration of @ref CLFFTScaleKernel
- *
- * @param[in] input Source tensor info. Data types supported: F32.
- * @param[in] output Destination tensor info. Data type supported: same as @p input
- * @param[in] config Kernel configuration
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *input, const ITensorInfo *output, const FFTScaleKernelInfo &config);
-
- // Inherited methods overridden:
- void run(const Window &window, cl::CommandQueue &queue) override;
-
-private:
- ICLTensor *_input;
- ICLTensor *_output;
- bool _run_in_place;
-};
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_CLFFTSCALEKERNEL_H */
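-
-// Illustrative usage sketch (hedged): applies the 1/N scaling of an inverse FFT
-// in-place. The `scale` and `conjugate` fields of FFTScaleKernelInfo are assumptions
-// based on KernelDescriptors.h; fft_length is a hypothetical parameter.
-#include "arm_compute/core/CL/kernels/CLFFTScaleKernel.h"
-#include "arm_compute/runtime/CL/CLTensor.h"
-
-void configure_ifft_scale(arm_compute::CLTensor &buffer, unsigned int fft_length)
-{
- using namespace arm_compute;
- FFTScaleKernelInfo config{};
- config.scale = 1.0f / static_cast<float>(fft_length);
- config.conjugate = false;
-
- CLFFTScaleKernel scale;
- scale.configure(&buffer, nullptr, config); // nullptr output -> scale in-place
-}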
diff --git a/arm_compute/core/CL/kernels/CLFastCornersKernel.h b/arm_compute/core/CL/kernels/CLFastCornersKernel.h
deleted file mode 100644
index 1a0d4e36a5..0000000000
--- a/arm_compute/core/CL/kernels/CLFastCornersKernel.h
+++ /dev/null
@@ -1,133 +0,0 @@
-/*
- * Copyright (c) 2016-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CLFASTCORNERSKERNEL_H
-#define ARM_COMPUTE_CLFASTCORNERSKERNEL_H
-
-#include "arm_compute/core/CL/ICLArray.h"
-#include "arm_compute/core/CL/ICLKernel.h"
-#include "arm_compute/core/Types.h"
-
-#include <cstdint>
-
-namespace cl
-{
-class Buffer;
-}
-
-namespace arm_compute
-{
-class ICLTensor;
-using ICLImage = ICLTensor;
-
-/** CL kernel to perform FAST corner detection */
-class CLFastCornersKernel : public ICLKernel
-{
-public:
- /** Default constructor */
- CLFastCornersKernel();
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- CLFastCornersKernel(const CLFastCornersKernel &) = delete;
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- CLFastCornersKernel &operator=(const CLFastCornersKernel &) = delete;
- /** Allow instances of this class to be moved */
- CLFastCornersKernel(CLFastCornersKernel &&) = default;
- /** Allow instances of this class to be moved */
- CLFastCornersKernel &operator=(CLFastCornersKernel &&) = default;
- /** Default destructor */
- ~CLFastCornersKernel() = default;
-
- /** Initialise the kernel.
- *
- * @param[in] input Source image. Data types supported: U8.
- * @param[out] output Output image. Data types supported: U8.
- * @param[in] threshold Threshold on difference between intensity of the central pixel and pixels on Bresenham's circle of radius 3.
- * @param[in] non_max_suppression True if non-maxima suppression is applied, false otherwise.
- * @param[in] border_mode Strategy to use for borders.
- */
- void configure(const ICLImage *input, ICLImage *output, float threshold, bool non_max_suppression, BorderMode border_mode);
- /** Initialise the kernel.
- *
- * @param[in] compile_context The compile context to be used.
- * @param[in] input Source image. Data types supported: U8.
- * @param[out] output Output image. Data types supported: U8.
- * @param[in] threshold Threshold on difference between intensity of the central pixel and pixels on Bresenham's circle of radius 3.
- * @param[in] non_max_suppression True if non-maxima suppression is applied, false otherwise.
- * @param[in] border_mode Strategy to use for borders.
- */
- void configure(const CLCompileContext &compile_context, const ICLImage *input, ICLImage *output, float threshold, bool non_max_suppression, BorderMode border_mode);
-
- // Inherited methods overridden
- void run(const Window &window, cl::CommandQueue &queue) override;
- BorderSize border_size() const override;
-
-private:
- const ICLImage *_input;
- ICLImage *_output;
-};
-
-/** CL kernel to copy keypoint information to an ICLKeyPointArray and count the number of key points */
-class CLCopyToArrayKernel : public ICLKernel
-{
-public:
- /** Default constructor */
- CLCopyToArrayKernel();
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- CLCopyToArrayKernel(const CLCopyToArrayKernel &) = delete;
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- CLCopyToArrayKernel &operator=(const CLCopyToArrayKernel &) = delete;
- /** Allow instances of this class to be moved */
- CLCopyToArrayKernel(CLCopyToArrayKernel &&) = default;
- /** Allow instances of this class to be moved */
- CLCopyToArrayKernel &operator=(CLCopyToArrayKernel &&) = default;
- /** Default destructor */
- ~CLCopyToArrayKernel() = default;
-
- /** Initialise the kernel.
- *
- * @param[in] input Source image. Data types supported: U8.
- * @param[in] update_number Flag to indicate whether we need to update the number of corners
- * @param[out] corners Array of keypoints to store the results.
- * @param[out] num_buffers CL buffer that receives the number of detected key points.
- */
- void configure(const ICLImage *input, bool update_number, ICLKeyPointArray *corners, cl::Buffer *num_buffers);
- /** Initialise the kernel.
- *
- * @param[in] compile_context The compile context to be used.
- * @param[in] input Source image. Data types supported: U8.
- * @param[in] update_number Flag to indicate whether we need to update the number of corners
- * @param[out] corners Array of keypoints to store the results.
- * @param[out] num_buffers CL buffer that receives the number of detected key points.
- */
- void configure(const CLCompileContext &compile_context, const ICLImage *input, bool update_number, ICLKeyPointArray *corners, cl::Buffer *num_buffers);
-
- // Inherited methods overridden:
- void run(const Window &window, cl::CommandQueue &queue) override;
-
-private:
- const ICLImage *_input; /**< source image */
- ICLKeyPointArray *_corners; /**< destination array */
- cl::Buffer *_num_buffer; /**< CL memory to record number of key points in the array */
-};
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_CLFASTCORNERSKERNEL_H */
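-
-// Illustrative usage sketch (hedged): configures FAST corner detection on a U8
-// image. The threshold value and tensor arguments are placeholder assumptions.
-#include "arm_compute/core/CL/kernels/CLFastCornersKernel.h"
-#include "arm_compute/runtime/CL/CLTensor.h"
-
-void configure_fast_corners(arm_compute::CLTensor &image, arm_compute::CLTensor &response)
-{
- using namespace arm_compute;
- CLFastCornersKernel fast;
- // threshold = 20: minimum intensity difference against the Bresenham circle of radius 3
- fast.configure(&image, &response, 20.0f, /* non_max_suppression = */ true, BorderMode::UNDEFINED);
-}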
diff --git a/arm_compute/core/CL/kernels/CLFillBorderKernel.h b/arm_compute/core/CL/kernels/CLFillBorderKernel.h
deleted file mode 100644
index d00ea55a83..0000000000
--- a/arm_compute/core/CL/kernels/CLFillBorderKernel.h
+++ /dev/null
@@ -1,86 +0,0 @@
-/*
- * Copyright (c) 2016-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CLFILLBORDERKERNEL_H
-#define ARM_COMPUTE_CLFILLBORDERKERNEL_H
-
-#include "arm_compute/core/CL/ICLKernel.h"
-#include "arm_compute/core/PixelValue.h"
-#include "arm_compute/core/Types.h"
-
-namespace arm_compute
-{
-class ICLTensor;
-
-/** Interface for the kernel that fills the borders of a tensor */
-class CLFillBorderKernel : public ICLKernel
-{
-public:
- /** Default constructor */
- CLFillBorderKernel();
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- CLFillBorderKernel(const CLFillBorderKernel &) = delete;
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- CLFillBorderKernel &operator=(const CLFillBorderKernel &) = delete;
- /** Allow instances of this class to be moved */
- CLFillBorderKernel(CLFillBorderKernel &&) = default;
- /** Allow instances of this class to be moved */
- CLFillBorderKernel &operator=(CLFillBorderKernel &&) = default;
- /** Default destructor */
- ~CLFillBorderKernel() = default;
-
- /** Initialise the kernel's input, output and border mode.
- *
- * @param[in,out] tensor Tensor to process. Data types supported: U8/QASYMM8/S8/QASYMM8_SIGNED/U16/S16/U32/S32/F16/F32.
- * @param[in] border_size Size of the border to fill in elements.
- * @param[in] border_mode Border mode to use when filling the border.
- * @param[in] constant_border_value (Optional) Constant value to use for borders if border_mode is set to CONSTANT.
- */
- void configure(ICLTensor *tensor, BorderSize border_size, BorderMode border_mode, const PixelValue &constant_border_value = PixelValue());
- /** Initialise the kernel's input, output and border mode.
- *
- * @param[in] compile_context The compile context to be used.
- * @param[in,out] tensor Tensor to process. Data types supported: U8/QASYMM8/S8/QASYMM8_SIGNED/U16/S16/U32/S32/F16/F32.
- * @param[in] border_size Size of the border to fill in elements.
- * @param[in] border_mode Border mode to use when filling the border.
- * @param[in] constant_border_value (Optional) Constant value to use for borders if border_mode is set to CONSTANT.
- */
- void configure(const CLCompileContext &compile_context, ICLTensor *tensor, BorderSize border_size, BorderMode border_mode, const PixelValue &constant_border_value = PixelValue());
-
- /** Function to set the constant value on fill border kernel depending on type.
- *
- * @param[in] idx Index of the kernel argument to set.
- * @param[in] constant_border_value Constant value to use for borders if border_mode is set to CONSTANT.
- */
- template <class T>
- void set_constant_border(unsigned int idx, const PixelValue &constant_border_value);
-
- // Inherited methods overridden:
- void run(const Window &window, cl::CommandQueue &queue) override;
- bool is_parallelisable() const override;
-
-private:
- ICLTensor *_tensor;
-};
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_CLFILLBORDERKERNEL_H */
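-
-// Illustrative usage sketch (hedged): fills a one-element border with the default
-// (zero) constant so that neighbourhood kernels can read outside the valid region.
-#include "arm_compute/core/CL/kernels/CLFillBorderKernel.h"
-#include "arm_compute/runtime/CL/CLTensor.h"
-
-void configure_fill_border(arm_compute::CLTensor &tensor)
-{
- using namespace arm_compute;
- CLFillBorderKernel fill;
- fill.configure(&tensor, BorderSize(1), BorderMode::CONSTANT, PixelValue());
-}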
diff --git a/arm_compute/core/CL/kernels/CLFlattenLayerKernel.h b/arm_compute/core/CL/kernels/CLFlattenLayerKernel.h
deleted file mode 100644
index ab009e1aa8..0000000000
--- a/arm_compute/core/CL/kernels/CLFlattenLayerKernel.h
+++ /dev/null
@@ -1,83 +0,0 @@
-/*
- * Copyright (c) 2018-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CLFLATTENLAYERKERNEL_H
-#define ARM_COMPUTE_CLFLATTENLAYERKERNEL_H
-
-#include "arm_compute/core/CL/ICLKernel.h"
-
-namespace arm_compute
-{
-class ICLTensor;
-
-/** OpenCL interface for the flatten kernel. */
-class CLFlattenLayerKernel : public ICLKernel
-{
-public:
- /** Default constructor */
- CLFlattenLayerKernel();
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- CLFlattenLayerKernel(const CLFlattenLayerKernel &) = delete;
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- CLFlattenLayerKernel &operator=(const CLFlattenLayerKernel &) = delete;
- /** Allow instances of this class to be moved */
- CLFlattenLayerKernel(CLFlattenLayerKernel &&) = default;
- /** Allow instances of this class to be moved */
- CLFlattenLayerKernel &operator=(CLFlattenLayerKernel &&) = default;
- /** Set the input and output of the kernel.
- *
- * @param[in] input Input tensor to flatten with at least 3 dimensions.
- * The dimensions above the third will be interpreted as batches. Data types supported: All.
- * @param[out] output Output tensor with shape [w*h*d, input_batches] where:
- * w = width input tensor, h = height input tensor and d = depth input tensor. Data type supported: same as @p input
- */
- void configure(const ICLTensor *input, ICLTensor *output);
- /** Set the input and output of the kernel.
- *
- * @param[in] compile_context The compile context to be used.
- * @param[in] input Input tensor to flatten with at least 3 dimensions.
- * The dimensions above the third will be interpreted as batches. Data types supported: All.
- * @param[out] output Output tensor with shape [w*h*d, input_batches] where:
- * w = width input tensor, h = height input tensor and d = depth input tensor. Data type supported: same as @p input
- */
- void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output);
- /** Static function to check if given info will lead to a valid configuration of @ref CLFlattenLayerKernel
- *
- * @param[in] input Input tensor info to flatten with at least 3 dimensions.
- * The dimensions above the third will be interpreted as batches. Data types supported: All.
- * @param[in] output Output tensor info with shape [w*h*d, input_batches] where:
- * w = width input tensor, h = height input tensor and d = depth input tensor. Data type supported: same as @p input
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *input, const ITensorInfo *output);
-
- // Inherited methods overridden:
- void run(const Window &window, cl::CommandQueue &queue) override;
-
-private:
- const ICLTensor *_input;
- ICLTensor *_output;
-};
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_CLFLATTENLAYERKERNEL_H */
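-
-// Illustrative usage sketch (hedged): the destination is expected to carry shape
-// [w*h*d, batches] relative to the source; tensor creation and allocation are elided.
-#include "arm_compute/core/CL/kernels/CLFlattenLayerKernel.h"
-#include "arm_compute/core/Error.h"
-#include "arm_compute/runtime/CL/CLTensor.h"
-
-void configure_flatten(arm_compute::CLTensor &src, arm_compute::CLTensor &dst)
-{
- using namespace arm_compute;
- ARM_COMPUTE_ERROR_THROW_ON(CLFlattenLayerKernel::validate(src.info(), dst.info()));
- CLFlattenLayerKernel flatten;
- flatten.configure(&src, &dst);
-}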
diff --git a/arm_compute/core/CL/kernels/CLFloorKernel.h b/arm_compute/core/CL/kernels/CLFloorKernel.h
deleted file mode 100644
index 4d1ed789db..0000000000
--- a/arm_compute/core/CL/kernels/CLFloorKernel.h
+++ /dev/null
@@ -1,81 +0,0 @@
-/*
- * Copyright (c) 2017-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CLFLOORKERNEL_H
-#define ARM_COMPUTE_CLFLOORKERNEL_H
-
-#include "arm_compute/core/CL/ICLKernel.h"
-
-namespace arm_compute
-{
-class ICLTensor;
-
-/** OpenCL kernel to perform a floor operation */
-class CLFloorKernel : public ICLKernel
-{
-public:
- /** Default constructor */
- CLFloorKernel();
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- CLFloorKernel(const CLFloorKernel &) = delete;
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- CLFloorKernel &operator=(const CLFloorKernel &) = delete;
- /** Allow instances of this class to be moved */
- CLFloorKernel(CLFloorKernel &&) = default;
- /** Allow instances of this class to be moved */
- CLFloorKernel &operator=(CLFloorKernel &&) = default;
- /** Default destructor */
- ~CLFloorKernel() = default;
- /** Set the source and destination of the kernel
- *
- * @param[in] input Source tensor. Data type supported: F16/F32.
- * @param[out] output Destination tensor. Same as @p input
- */
- void configure(const ICLTensor *input, ICLTensor *output);
-
- /** Set the source and destination of the kernel
- *
- * @param[in] compile_context The compile context to be used.
- * @param[in] input Source tensor. Data type supported: F16/F32.
- * @param[out] output Destination tensor. Same as @p input
- */
- void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output);
-
- /** Static function to check if given info will lead to a valid configuration of @ref CLFloorKernel
- *
- * @param[in] input Source tensor info. Data type supported: F16/F32.
- * @param[in] output Destination tensor info. Same as @p input
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *input, const ITensorInfo *output);
-
- // Inherited methods overridden:
- void run(const Window &window, cl::CommandQueue &queue) override;
-
-private:
- const ICLTensor *_input;
- ICLTensor *_output;
-};
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_CLFLOORKERNEL_H */
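-
-// Illustrative usage sketch (hedged): element-wise floor on an F32 tensor, enqueued
-// through the global CLScheduler; the helper and tensor arguments are assumptions.
-#include "arm_compute/core/CL/kernels/CLFloorKernel.h"
-#include "arm_compute/runtime/CL/CLScheduler.h"
-#include "arm_compute/runtime/CL/CLTensor.h"
-
-void run_floor(arm_compute::CLTensor &src, arm_compute::CLTensor &dst)
-{
- using namespace arm_compute;
- CLFloorKernel floor_kernel;
- floor_kernel.configure(&src, &dst);
- CLScheduler::get().enqueue(floor_kernel);
-}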
diff --git a/arm_compute/core/CL/kernels/CLFuseBatchNormalizationKernel.h b/arm_compute/core/CL/kernels/CLFuseBatchNormalizationKernel.h
deleted file mode 100644
index 2fe6b223ca..0000000000
--- a/arm_compute/core/CL/kernels/CLFuseBatchNormalizationKernel.h
+++ /dev/null
@@ -1,126 +0,0 @@
-/*
- * Copyright (c) 2018-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CLFUSEBATCHNORMALIZATIONKERNEL_H
-#define ARM_COMPUTE_CLFUSEBATCHNORMALIZATIONKERNEL_H
-
-#include "arm_compute/core/CL/ICLKernel.h"
-
-namespace arm_compute
-{
-// Forward declarations
-class ICLTensor;
-
-/** OpenCL kernel to fuse the batch normalization node to a preceding convolution node */
-class CLFuseBatchNormalizationKernel : public ICLKernel
-{
-public:
- /** Default constructor */
- CLFuseBatchNormalizationKernel();
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- CLFuseBatchNormalizationKernel(const CLFuseBatchNormalizationKernel &) = delete;
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- CLFuseBatchNormalizationKernel &operator=(const CLFuseBatchNormalizationKernel &) = delete;
- /** Allow instances of this class to be moved */
- CLFuseBatchNormalizationKernel(CLFuseBatchNormalizationKernel &&) = default;
- /** Allow instances of this class to be moved */
- CLFuseBatchNormalizationKernel &operator=(CLFuseBatchNormalizationKernel &&) = default;
- /** Default destructor */
- ~CLFuseBatchNormalizationKernel() = default;
- /** Set the source and destination of the kernel
- *
- * @param[in] input_weights Input weights tensor for convolution or depthwise convolution layer. Data type supported: F16/F32. Data layout supported: NCHW, NHWC
- * @param[in] bn_mean Batch normalization layer mean tensor. Same as @p input_weights
- * @param[in] bn_var Batch normalization layer variance tensor. Same as @p input_weights
- * @param[out] fused_weights Output fused weights tensor. It can be a nullptr in case of in-place computation. Same as @p input_weights
- * @param[out] fused_bias Output fused bias tensor. It can be a nullptr in case of in-place computation and input_bias != nullptr. Same as @p input_weights
- * @param[in] input_bias (Optional) Input bias tensor for convolution or depthwise convolution layer. It can be a nullptr in case the bias tensor is not required. Same as @p input_weights
- * @param[in] bn_beta (Optional) Batch normalization layer beta tensor. It can be a nullptr in case the beta tensor is not required. Same as @p input_weights
- * @note if nullptr, bn_beta is set to 0.0
- * @param[in] bn_gamma (Optional) Batch normalization layer gamma tensor. It can be a nullptr in case the gamma tensor is not required. Same as @p input_weights
- * @note if nullptr, bn_gamma is set to 1.0
- * @param[in] epsilon (Optional) Batch normalization layer epsilon parameter. Defaults to 0.001f.
- * @param[in] fbn_type (Optional) Fused batch normalization type. Defaults to CONVOLUTION.
- */
- void configure(const ICLTensor *input_weights, const ICLTensor *bn_mean, const ICLTensor *bn_var, ICLTensor *fused_weights, ICLTensor *fused_bias,
- const ICLTensor *input_bias = nullptr, const ICLTensor *bn_beta = nullptr, const ICLTensor *bn_gamma = nullptr,
- float epsilon = 0.001f, FuseBatchNormalizationType fbn_type = FuseBatchNormalizationType::CONVOLUTION);
- /** Set the source and destination of the kernel
- *
- * @param[in] compile_context The compile context to be used.
- * @param[in] input_weights Input weights tensor for convolution or depthwise convolution layer. Data type supported: F16/F32. Data layout supported: NCHW, NHWC
- * @param[in] bn_mean Batch normalization layer mean tensor. Same as @p input_weights
- * @param[in] bn_var Batch normalization layer variance tensor. Same as @p input_weights
- * @param[out] fused_weights Output fused weights tensor. It can be a nullptr in case of in-place computation. Same as @p input_weights
- * @param[out] fused_bias Output fused bias tensor. It can be a nullptr in case of in-place computation and input_bias != nullptr. Same as @p input_weights
- * @param[in] input_bias (Optional) Input bias tensor for convolution or depthwise convolution layer. It can be a nullptr in case the bias tensor is not required. Same as @p input_weights
- * @param[in] bn_beta (Optional) Batch normalization layer beta tensor. It can be a nullptr in case the beta tensor is not required. Same as @p input_weights
- * @note if nullptr, bn_beta is set to 0.0
- * @param[in] bn_gamma (Optional) Batch normalization layer gamma tensor. It can be a nullptr in case the gamma tensor is not required. Same as @p input_weights
- * @note if nullptr, bn_gamma is set to 1.0
- * @param[in] epsilon (Optional) Batch normalization layer epsilon parameter. Defaults to 0.001f.
- * @param[in] fbn_type (Optional) Fused batch normalization type. Defaults to CONVOLUTION.
- */
- void configure(const CLCompileContext &compile_context, const ICLTensor *input_weights, const ICLTensor *bn_mean, const ICLTensor *bn_var, ICLTensor *fused_weights, ICLTensor *fused_bias,
- const ICLTensor *input_bias = nullptr, const ICLTensor *bn_beta = nullptr, const ICLTensor *bn_gamma = nullptr,
- float epsilon = 0.001f, FuseBatchNormalizationType fbn_type = FuseBatchNormalizationType::CONVOLUTION);
- /** Static function to check if given info will lead to a valid configuration of @ref CLFuseBatchNormalizationKernel
- *
- * @param[in] input_weights Input weights tensor info for convolution or depthwise convolution layer. Data type supported: F16/F32. Data layout supported: NCHW, NHWC
- * @param[in] bn_mean Batch normalization layer mean tensor info. Same as @p input_weights
- * @param[in] bn_var Batch normalization layer variance tensor info. Same as @p input_weights
- * @param[in] fused_weights Output fused weights tensor info. It can be a nullptr in case of in-place computation. Same as @p input_weights
- * @param[in] fused_bias Output fused bias tensor info. It can be a nullptr in case of in-place computation and input_bias != nullptr. Same as @p input_weights
- * @param[in] input_bias (Optional) Input bias tensor info for convolution or depthwise convolution layer. It can be a nullptr in case the bias tensor is not required. Same as @p input_weights
- * @param[in] bn_beta (Optional) Batch normalization layer beta tensor info. It can be a nullptr in case the beta tensor is not required. Same as @p input_weights
- * @note if nullptr, bn_beta is set to 0.0
- * @param[in] bn_gamma (Optional) Batch normalization layer gamma tensor info. It can be a nullptr in case the gamma tensor is not required. Same as @p input_weights
- * @note if nullptr, bn_gamma is set to 1.0
- * @param[in] epsilon (Optional) Batch normalization layer epsilon parameter. Defaults to 0.001f.
- * @param[in] fbn_type (Optional) Fused batch normalization type. Defaults to CONVOLUTION.
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *input_weights, const ITensorInfo *bn_mean, const ITensorInfo *bn_var,
- const ITensorInfo *fused_weights, const ITensorInfo *fused_bias,
- const ITensorInfo *input_bias = nullptr, const ITensorInfo *bn_beta = nullptr, const ITensorInfo *bn_gamma = nullptr,
- float epsilon = 0.001f, FuseBatchNormalizationType fbn_type = FuseBatchNormalizationType::CONVOLUTION);
-
- // Inherited methods overridden:
- void run(const Window &window, cl::CommandQueue &queue) override;
-
-private:
- const ICLTensor *_input_weights;
- const ICLTensor *_input_bias;
- const ICLTensor *_bn_mean;
- const ICLTensor *_bn_var;
- const ICLTensor *_bn_gamma;
- const ICLTensor *_bn_beta;
- ICLTensor *_fused_weights;
- ICLTensor *_fused_bias;
- float _epsilon;
- bool _run_in_place_weights;
- bool _run_in_place_bias;
-};
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_CLFUSEBATCHNORMALIZATIONKERNEL_H */
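-
-// Illustrative usage sketch (hedged): folds batch-normalization statistics into
-// convolution weights. With no input bias, beta or gamma supplied, the kernel
-// substitutes the documented defaults (bias 0, beta 0, gamma 1).
-#include "arm_compute/core/CL/kernels/CLFuseBatchNormalizationKernel.h"
-#include "arm_compute/runtime/CL/CLTensor.h"
-
-void fuse_bn_into_conv(arm_compute::CLTensor &weights, arm_compute::CLTensor &mean, arm_compute::CLTensor &var,
- arm_compute::CLTensor &fused_weights, arm_compute::CLTensor &fused_bias)
-{
- using namespace arm_compute;
- CLFuseBatchNormalizationKernel fuse;
- fuse.configure(&weights, &mean, &var, &fused_weights, &fused_bias);
-}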
diff --git a/arm_compute/core/CL/kernels/CLGEMMLowpMatrixMultiplyNativeKernel.h b/arm_compute/core/CL/kernels/CLGEMMLowpMatrixMultiplyNativeKernel.h
deleted file mode 100644
index 15fd20842e..0000000000
--- a/arm_compute/core/CL/kernels/CLGEMMLowpMatrixMultiplyNativeKernel.h
+++ /dev/null
@@ -1,108 +0,0 @@
-/*
- * Copyright (c) 2019-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CLGEMMLOWPMATRIXMULTIPLYNATIVEKERNEL_H
-#define ARM_COMPUTE_CLGEMMLOWPMATRIXMULTIPLYNATIVEKERNEL_H
-
-#include "arm_compute/core/CL/ICLKernel.h"
-
-namespace arm_compute
-{
-class ICLTensor;
-
-/** OpenCL kernel to multiply matrices with QASYMM8/QASYMM8_SIGNED data type */
-class CLGEMMLowpMatrixMultiplyNativeKernel : public ICLKernel
-{
-public:
- /** Default Constructor */
- CLGEMMLowpMatrixMultiplyNativeKernel();
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- CLGEMMLowpMatrixMultiplyNativeKernel(const CLGEMMLowpMatrixMultiplyNativeKernel &) = delete;
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- CLGEMMLowpMatrixMultiplyNativeKernel &operator=(const CLGEMMLowpMatrixMultiplyNativeKernel &) = delete;
- /** Allow instances of this class to be moved */
- CLGEMMLowpMatrixMultiplyNativeKernel(CLGEMMLowpMatrixMultiplyNativeKernel &&) = default;
- /** Allow instances of this class to be moved */
- CLGEMMLowpMatrixMultiplyNativeKernel &operator=(CLGEMMLowpMatrixMultiplyNativeKernel &&) = default;
- /** Initialise the kernel's input and output.
- *
- * @param[in] input0 Input tensor containing the LHS matrix. Data type supported: QASYMM8/QASYMM8_SIGNED
- * @param[in] input1 Input tensor containing the RHS matrix. Data type supported: same as @p input0
- * @param[out] output Output tensor to store the result of matrix multiplication. Data type supported: S32
- * @param[in] lhs_info LHS matrix information used to retrieve the number of rows to be processed by each thread
- * lhs_info.m0: 2,3,4,5,6,7,8
- * lhs_info.k0: 2,3,4,8,16
- * @param[in] rhs_info RHS matrix information used to retrieve the number of columns to be processed by each thread
- * rhs_info.n0: 2,3,4,8,16
- * rhs_info.k0: same as lhs_info.k0
- * @param[in] gemm_info GEMM information used to retrieve the original dimensions of the input matrices
- */
- void configure(const ICLTensor *input0, const ICLTensor *input1, ICLTensor *output, const GEMMLHSMatrixInfo &lhs_info, const GEMMRHSMatrixInfo &rhs_info, const GEMMReshapeInfo &gemm_info);
- /** Initialise the kernel's input and output.
- *
- * @param[in] compile_context The compile context to be used.
- * @param[in] input0 Input tensor containing the LHS matrix. Data type supported: QASYMM8/QASYMM8_SIGNED
- * @param[in] input1 Input tensor containing the RHS matrix. Data type supported: same as @p input0
- * @param[out] output Output tensor to store the result of matrix multiplication. Data type supported: S32
- * @param[in] lhs_info LHS matrix information used to retrieve the number of rows to be processed by each thread
- * lhs_info.m0: 2,3,4,5,6,7,8
- * lhs_info.k0: 2,3,4,8,16
- * @param[in] rhs_info RHS matrix information used to retrieve the number of columns to be processed by each thread
- * rhs_info.n0: 2,3,4,8,16
- * rhs_info.k0: same as lhs_info.k0
- * @param[in] gemm_info GEMM information used to retrieve the original dimensions of the input matrices
- */
- void configure(const CLCompileContext &compile_context, const ICLTensor *input0, const ICLTensor *input1, ICLTensor *output, const GEMMLHSMatrixInfo &lhs_info, const GEMMRHSMatrixInfo &rhs_info,
- const GEMMReshapeInfo &gemm_info);
- /** Static function to check if given info will lead to a valid configuration of @ref CLGEMMLowpMatrixMultiplyNativeKernel
- *
- * @param[in] input0 Input tensor info for the LHS matrix. Data type supported: QASYMM8/QASYMM8_SIGNED
- * @param[in] input1 Input tensor info for the RHS matrix. Data type supported: same as @p input0
- * @param[in] output Output tensor info. Data type supported: S32
- * @param[in] lhs_info LHS matrix information used to retrieve the number of rows to be processed by each thread
- * lhs_info.m0: 2,3,4,5,6,7,8
- * lhs_info.k0: 2,3,4,8,16
- * @param[in] rhs_info RHS matrix information used to retrieve the number of columns to be processed by each thread
- * rhs_info.n0: 2,3,4,8,16
- * rhs_info.k0: same as lhs_info.k0
- * @param[in] gemm_info GEMM information used to retrieve the original dimensions of the input matrices
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *input0, const ITensorInfo *input1, const ITensorInfo *output, const GEMMLHSMatrixInfo &lhs_info, const GEMMRHSMatrixInfo &rhs_info,
- const GEMMReshapeInfo &gemm_info);
-
- // Inherited methods overridden:
- void run(const Window &window, cl::CommandQueue &queue) override;
-
-private:
- const ICLTensor *_input0;
- const ICLTensor *_input1;
- ICLTensor *_output;
- bool _slide_matrix_b;
- bool _reinterpret_input_as_3d;
- bool _reinterpret_output_as_3d;
- bool _use_dummy_work_items;
-};
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_CLGEMMLOWPMATRIXMULTIPLYNATIVEKERNEL_H*/
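-
-// Illustrative usage sketch (hedged): configures the native QASYMM8 GEMM with block
-// sizes from the documented ranges. The GEMMLHSMatrixInfo/GEMMRHSMatrixInfo field
-// names follow Types.h; m, n, k are the original (unreshaped) matrix dimensions.
-#include "arm_compute/core/CL/kernels/CLGEMMLowpMatrixMultiplyNativeKernel.h"
-#include "arm_compute/core/Error.h"
-#include "arm_compute/core/Types.h"
-#include "arm_compute/runtime/CL/CLTensor.h"
-
-void configure_lowp_gemm(arm_compute::CLTensor &lhs, arm_compute::CLTensor &rhs, arm_compute::CLTensor &dst, int m, int n, int k)
-{
- using namespace arm_compute;
- GEMMLHSMatrixInfo lhs_info{};
- lhs_info.m0 = 4; // rows per thread, allowed: 2..8
- lhs_info.k0 = 4; // accumulation depth, allowed: 2,3,4,8,16
- GEMMRHSMatrixInfo rhs_info{};
- rhs_info.n0 = 4; // columns per thread, allowed: 2,3,4,8,16
- rhs_info.k0 = lhs_info.k0; // must match lhs_info.k0
- const GEMMReshapeInfo gemm_info(m, n, k);
-
- ARM_COMPUTE_ERROR_THROW_ON(CLGEMMLowpMatrixMultiplyNativeKernel::validate(lhs.info(), rhs.info(), dst.info(), lhs_info, rhs_info, gemm_info));
- CLGEMMLowpMatrixMultiplyNativeKernel gemm;
- gemm.configure(&lhs, &rhs, &dst, lhs_info, rhs_info, gemm_info);
-}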
diff --git a/arm_compute/core/CL/kernels/CLGEMMLowpMatrixMultiplyReshapedKernel.h b/arm_compute/core/CL/kernels/CLGEMMLowpMatrixMultiplyReshapedKernel.h
deleted file mode 100644
index 43526b7c41..0000000000
--- a/arm_compute/core/CL/kernels/CLGEMMLowpMatrixMultiplyReshapedKernel.h
+++ /dev/null
@@ -1,123 +0,0 @@
-/*
- * Copyright (c) 2019-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CLGEMMLOWPMATRIXMULTIPLYRESHAPEDKERNEL_H
-#define ARM_COMPUTE_CLGEMMLOWPMATRIXMULTIPLYRESHAPEDKERNEL_H
-
-#include "arm_compute/core/CL/ICLKernel.h"
-
-namespace arm_compute
-{
-class ICLTensor;
-
-/** OpenCL kernel to multiply matrices when both the input matrices LHS (input0) and RHS (input1) have been reshaped
- *
- * @note The input matrices @p input0 and @p input1 must be reshaped through @ref CLGEMMReshapeLHSMatrixKernel and @ref CLGEMMReshapeRHSMatrixKernel
- */
-class CLGEMMLowpMatrixMultiplyReshapedKernel : public ICLKernel
-{
-public:
- /** Default Constructor */
- CLGEMMLowpMatrixMultiplyReshapedKernel();
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- CLGEMMLowpMatrixMultiplyReshapedKernel(const CLGEMMLowpMatrixMultiplyReshapedKernel &) = delete;
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- CLGEMMLowpMatrixMultiplyReshapedKernel &operator=(const CLGEMMLowpMatrixMultiplyReshapedKernel &) = delete;
- /** Allow instances of this class to be moved */
- CLGEMMLowpMatrixMultiplyReshapedKernel(CLGEMMLowpMatrixMultiplyReshapedKernel &&) = default;
- /** Allow instances of this class to be moved */
- CLGEMMLowpMatrixMultiplyReshapedKernel &operator=(CLGEMMLowpMatrixMultiplyReshapedKernel &&) = default;
- /** Initialise the kernel's input and output.
- *
- * @param[in] input0 Input tensor containing the LHS reshaped matrix. Data type supported: QASYMM8/QASYMM8_SIGNED. The number of dimensions for the LHS matrix must be less than or equal to 4.
- * @param[in] input1 Input tensor containing the RHS reshaped matrix. Data type supported: same as @p input0. The number of dimensions for the RHS matrix must be less than or equal to 3.
- * @param[out] output Output tensor to store the result of matrix multiplication. Data type supported: S32
- * @param[in] lhs_info LHS matrix information used for reshaping the input0 tensor. Only the following values are supported:
- * lhs_info.m0: 2,3,4,5,6,7,8
- * lhs_info.k0: 2,3,4,8,16
- * lhs_info.transpose: false
- * @param[in] rhs_info RHS matrix information used for reshaping the input1 tensor. Only the following values are supported:
- * rhs_info.n0: 2,3,4,8,16
- * rhs_info.k0: same as lhs_info.k0
- * rhs_info.transpose: true
- * @param[in] gemm_info GEMM information used to retrieve the original dimensions of the input matrices
- *
- * @note lhs_info.k0 must be equal to rhs_info.k0
- */
- void configure(const ICLTensor *input0, const ICLTensor *input1, ICLTensor *output, const GEMMLHSMatrixInfo &lhs_info, const GEMMRHSMatrixInfo &rhs_info, const GEMMReshapeInfo &gemm_info);
- /** Initialise the kernel's input and output.
- *
- * @param[in] compile_context The compile context to be used.
- * @param[in]  input0          Input tensor containing the LHS reshaped matrix. Data type supported: QASYMM8/QASYMM8_SIGNED. The number of dimensions for the LHS matrix must be less than or equal to 4.
- * @param[in]  input1          Input tensor containing the RHS reshaped matrix. Data type supported: same as @p input0. The number of dimensions for the RHS matrix must be less than or equal to 3.
- * @param[out] output Output tensor to store the result of matrix multiplication. Data type supported: S32
- * @param[in] lhs_info LHS matrix information used for reshaping the input0 tensor. Only the following values are supported:
- * lhs_info.m0: 2,3,4,5,6,7,8
- * lhs_info.k0: 2,3,4,8,16
- * lhs_info.transpose: false
- * @param[in] rhs_info RHS matrix information used for reshaping the input1 tensor. Only the following values are supported:
- * rhs_info.n0: 2,3,4,8,16
- * rhs_info.k0: same as lhs_info.k0
- * rhs_info.transpose: true
- * @param[in] gemm_info GEMM information used to retrieve the original dimensions of the input matrices
- *
- * @note lhs_info.k0 must be equal to rhs_info.k0
- */
- void configure(const CLCompileContext &compile_context, const ICLTensor *input0, const ICLTensor *input1, ICLTensor *output, const GEMMLHSMatrixInfo &lhs_info, const GEMMRHSMatrixInfo &rhs_info,
- const GEMMReshapeInfo &gemm_info);
- /** Static function to check if given info will lead to a valid configuration of @ref CLGEMMLowpMatrixMultiplyReshapedKernel
- *
- * @param[in]  input0    Input tensor info containing the LHS reshaped matrix. Data type supported: QASYMM8/QASYMM8_SIGNED. The number of dimensions for the LHS matrix must be less than or equal to 4.
- * @param[in]  input1    Input tensor info containing the RHS reshaped matrix. Data type supported: same as @p input0. The number of dimensions for the RHS matrix must be less than or equal to 3.
- * @param[in] output Output tensor info. Data type supported: S32
- * @param[in] lhs_info LHS matrix information used for reshaping the input0 tensor. Only the following values are supported:
- * lhs_info.m0: 2,3,4,5,6,7,8
- * lhs_info.k0: 2,3,4,8,16
- * lhs_info.transpose: false
- * @param[in] rhs_info RHS matrix information used for reshaping the input1 tensor. Only the following values are supported:
- * rhs_info.n0: 2,3,4,8,16
- * rhs_info.k0: 2,3,4,8,16
- * rhs_info.transpose: true
- * @param[in] gemm_info GEMM information used to retrieve the original dimensions of the input matrices
- *
- * @note lhs_info.k0 must be equal to rhs_info.k0
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *input0, const ITensorInfo *input1, const ITensorInfo *output, const GEMMLHSMatrixInfo &lhs_info, const GEMMRHSMatrixInfo &rhs_info,
- const GEMMReshapeInfo &gemm_info);
-
- // Inherited methods overridden:
- void run(const Window &window, cl::CommandQueue &queue) override;
-
-private:
- const ICLTensor *_input0;
- const ICLTensor *_input1;
- ICLTensor *_output;
- bool _slide_matrix_b;
- bool _reinterpret_output_as_3d;
- unsigned int _k;
- bool _use_dummy_work_items;
-};
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_CLGEMMLOWPMATRIXMULTIPLYRESHAPEDKERNEL_H*/
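For reference, a minimal sketch of how the kernel removed above was configured under its documented constraints. The tensors lhs_reshaped, rhs_reshaped and dst and the sizes m, n, k are assumptions for illustration; only the lhs_info/rhs_info restrictions come from the header itself.

// Assumes the arm_compute headers removed by this patch and `using namespace arm_compute;`.
// lhs_reshaped, rhs_reshaped, dst are CLTensor objects; m, n, k are the original GEMM sizes.
GEMMLHSMatrixInfo lhs_info;
lhs_info.m0        = 4;           // supported: 2,3,4,5,6,7,8
lhs_info.k0        = 16;          // supported: 2,3,4,8,16
lhs_info.transpose = false;       // this kernel requires false

GEMMRHSMatrixInfo rhs_info;
rhs_info.n0        = 4;           // supported: 2,3,4,8,16
rhs_info.k0        = lhs_info.k0; // must equal lhs_info.k0
rhs_info.transpose = true;        // this kernel requires true

const GEMMReshapeInfo gemm_info(m, n, k); // original (unreshaped) dimensions

CLGEMMLowpMatrixMultiplyReshapedKernel mm_kernel;
if(CLGEMMLowpMatrixMultiplyReshapedKernel::validate(lhs_reshaped.info(), rhs_reshaped.info(), dst.info(),
                                                    lhs_info, rhs_info, gemm_info)
       .error_code() == ErrorCode::OK)
{
    mm_kernel.configure(&lhs_reshaped, &rhs_reshaped, &dst, lhs_info, rhs_info, gemm_info);
}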
diff --git a/arm_compute/core/CL/kernels/CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel.h b/arm_compute/core/CL/kernels/CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel.h
deleted file mode 100644
index 1aba6c0398..0000000000
--- a/arm_compute/core/CL/kernels/CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel.h
+++ /dev/null
@@ -1,155 +0,0 @@
-/*
- * Copyright (c) 2019-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CLGEMMLOWPMATRIXMULTIPLYRESHAPEDONLYRHSKERNEL_H
-#define ARM_COMPUTE_CLGEMMLOWPMATRIXMULTIPLYRESHAPEDONLYRHSKERNEL_H
-
-#include "arm_compute/core/CL/ICLKernel.h"
-#include "arm_compute/core/KernelDescriptors.h"
-
-namespace arm_compute
-{
-class ICLTensor;
-
-/** OpenCL kernel to multiply matrices with QASYMM8/QASYMM8_SIGNED data types when only the input matrix RHS (input1) has been reshaped
- *
- * @note The input matrix input1 must be reshaped through @ref CLGEMMReshapeRHSMatrixKernel
- * @note For fused output stage, only GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT type is supported
- */
-class CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel : public ICLKernel
-{
-public:
- /** Default Constructor */
- CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel();
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel(const CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel &) = delete;
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel &operator=(const CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel &) = delete;
- /** Allow instances of this class to be moved */
- CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel(CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel &&) = default;
- /** Allow instances of this class to be moved */
- CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel &operator=(CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel &&) = default;
- /** Initialise the kernel's input and output.
- *
- * @param[in] input0 Input tensor containing the LHS matrix. Data type supported: QASYMM8/QASYMM8_SIGNED
- * @param[in] input1 Input tensor containing the RHS reshaped matrix. Data type supported: QASYMM8/QASYMM8_SIGNED/QSYMM8/QSYMM8_PER_CHANNEL
- * @param[out] output Output tensor. Data type supported: QASYMM8/QASYMM8_SIGNED/S32.
- * @param[in] gemm_info GEMM information used to retrieve the original dimensions of the input matrices, output stage information and RHS/LHS info.
- * Only the following values are supported for LHS info:
- * lhs_info.m0: 2,3,4,5,6,7,8
- * lhs_info.k0: 2,3,4,8,16
- * Only the following values are supported for RHS info:
- * rhs_info.n0: 2,3,4,8,16
- * rhs_info.k0: same as lhs_info.k0
- * rhs_info.transpose: true
- * @param[in] vector_sum_col (Optional) Input row-vector of sums of all the entries in each column of matrix B.
- * Note: vector_sum_col can be a nullptr in case a_offset = 0. Data type supported: S32
- * @param[in] vector_sum_row (Optional) Input row-vector of sums of all the entries in each row of matrix A.
- * Note: vector_sum_row can be a nullptr in case b_offset = 0. Data type supported: S32
- * @param[in] bias (Optional) Biases tensor. Only shared biases supported and it can be a nullptr if the addition of biases is not required.
- * Biases are 1D tensor with dimensions [OFM]. Data type supported: S32.
- * @param[in] output_multipliers (Optional) Output multipliers tensor. In case of per-channel quantization, the number of multipliers must be equal to the number of filters (OFM).
- * Supported data types: S32.
- * @param[in] output_shifts (Optional) Output shifts tensor. In case of per-channel quantization, the number of multipliers must be equal to the number of filters (OFM).
- * Supported data types: S32.
- */
- void configure(const ICLTensor *input0, const ICLTensor *input1, ICLTensor *output, const GEMMKernelInfo &gemm_info, const ICLTensor *vector_sum_col = nullptr,
- const ICLTensor *vector_sum_row = nullptr, const ICLTensor *bias = nullptr, const ICLTensor *output_multipliers = nullptr, const ICLTensor *output_shifts = nullptr);
- /** Initialise the kernel's input and output.
- *
- * @param[in] compile_context The compile context to be used.
- * @param[in] input0 Input tensor containing the LHS matrix. Data type supported: QASYMM8/QASYMM8_SIGNED
- * @param[in]  input1             Input tensor containing the RHS reshaped matrix. Data type supported: QASYMM8/QASYMM8_SIGNED/QSYMM8/QSYMM8_PER_CHANNEL
- * @param[out] output Output tensor. Data type supported: QASYMM8/QASYMM8_SIGNED/S32.
- * @param[in] gemm_info GEMM information used to retrieve the original dimensions of the input matrices, output stage information and RHS/LHS info.
- * Only the following values are supported for LHS info:
- * lhs_info.m0: 2,3,4,5,6,7,8
- * lhs_info.k0: 2,3,4,8,16
- * Only the following values are supported for RHS info:
- * rhs_info.n0: 2,3,4,8,16
- * rhs_info.k0: same as lhs_info.k0
- * rhs_info.transpose: true
- * @param[in] vector_sum_col (Optional) Input row-vector of sums of all the entries in each column of matrix B.
- * Note: vector_sum_col can be a nullptr in case a_offset = 0. Data type supported: S32
- * @param[in] vector_sum_row (Optional) Input row-vector of sums of all the entries in each row of matrix A.
- * Note: vector_sum_row can be a nullptr in case b_offset = 0. Data type supported: S32
- * @param[in] bias (Optional) Biases tensor. Only shared biases supported and it can be a nullptr if the addition of biases is not required.
- * Biases are 1D tensor with dimensions [OFM]. Data type supported: S32.
- * @param[in] output_multipliers (Optional) Output multipliers tensor. In case of per-channel quantization, the number of multipliers must be equal to the number of filters (OFM).
- * Supported data types: S32.
- * @param[in] output_shifts (Optional) Output shifts tensor. In case of per-channel quantization, the number of multipliers must be equal to the number of filters (OFM).
- * Supported data types: S32.
- */
- void configure(const CLCompileContext &compile_context, const ICLTensor *input0, const ICLTensor *input1, ICLTensor *output, const GEMMKernelInfo &gemm_info, const ICLTensor *vector_sum_col = nullptr,
- const ICLTensor *vector_sum_row = nullptr, const ICLTensor *bias = nullptr, const ICLTensor *output_multipliers = nullptr, const ICLTensor *output_shifts = nullptr);
- /** Static function to check if given info will lead to a valid configuration of @ref CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel
- *
- * @param[in] input0 Input tensor info for the LHS matrix. Data type supported: QASYMM8/QASYMM8_SIGNED
- * @param[in] input1 Input tensor info for the RHS reshaped matrix. Data type supported: QASYMM8/QASYMM8_SIGNED/QSYMM8/QSYMM8_PER_CHANNEL
- * @param[in] output Output tensor info. Data type supported: QASYMM8/QASYMM8_SIGNED/S32.
- * @param[in] gemm_info GEMM information used to retrieve the original dimensions of the input matrices, output stage information and RHS/LHS info.
- * Only the following values are supported for LHS info:
- * lhs_info.m0: 2,3,4,5,6,7,8
- * lhs_info.k0: 2,3,4,8,16
- * Only the following values are supported for RHS info:
- * rhs_info.n0: 2,3,4,8,16
- * rhs_info.k0: same as lhs_info.k0
- * rhs_info.transpose: true
- * @param[in] vector_sum_col (Optional) Input row-vector info of sums of all the entries in each column of matrix B.
- * Note: vector_sum_col can be a nullptr in case a_offset = 0. Data type supported: S32
- * @param[in] vector_sum_row (Optional) Input row-vector info of sums of all the entries in each row of matrix A.
- * Note: vector_sum_row can be a nullptr in case b_offset = 0. Data type supported: S32
- * @param[in] bias (Optional) Biases tensor info. Only shared biases supported and it can be a nullptr if the addition of biases is not required.
- * Biases are 1D tensor with dimensions [OFM]. Data type supported: S32.
- * @param[in] output_multipliers (Optional) Output multipliers tensor info. In case of per-channel quantization, the number of multipliers must be equal to the number of filters (OFM).
- * Supported data types: S32.
- * @param[in] output_shifts (Optional) Output shifts tensor info. In case of per-channel quantization, the number of multipliers must be equal to the number of filters (OFM).
- * Supported data types: S32.
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *input0, const ITensorInfo *input1, const ITensorInfo *output, const GEMMKernelInfo &gemm_info, const ITensorInfo *vector_sum_col = nullptr,
- const ITensorInfo *vector_sum_row = nullptr, const ITensorInfo *bias = nullptr, const ITensorInfo *output_multipliers = nullptr,
- const ITensorInfo *output_shifts = nullptr);
-
- // Inherited methods overridden:
- void run(const Window &window, cl::CommandQueue &queue) override;
-
-private:
- const ICLTensor *_input0;
- const ICLTensor *_input1;
- ICLTensor *_output;
- const ICLTensor *_vector_sum_col;
- const ICLTensor *_vector_sum_row;
- const ICLTensor *_bias;
- const ICLTensor *_output_multipliers;
- const ICLTensor *_output_shifts;
- bool _slide_matrix_b;
- bool _reinterpret_input_as_3d;
- bool _reinterpret_output_as_3d;
- bool _use_dummy_work_items;
- bool _is_quantized_per_channel;
- bool _fuse_output_stage;
-};
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_CLGEMMLOWPMATRIXMULTIPLYRESHAPEDONLYRHSKERNEL_H */ \ No newline at end of file
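The optional output_multipliers/output_shifts pair above encodes one float requantization scale per output channel (OFM) as a Q0.31 fixed point multiplier plus a shift. A hedged sketch of that encoding follows; the helper name and the sign convention of the shift are ours, and the library ships its own utility for this.

#include <cmath>
#include <cstdint>

// Encode `scale` so that scale == (multiplier / 2^31) * 2^(-shift).
// A positive `shift` therefore means a right shift after the fixed point multiply.
void quantize_scale(float scale, int32_t &multiplier, int32_t &shift)
{
    int exponent = 0;
    const double q = std::frexp(scale, &exponent); // scale = q * 2^exponent, q in [0.5, 1)
    int64_t q_fixed = static_cast<int64_t>(std::llround(q * (1ll << 31)));
    if(q_fixed == (1ll << 31)) // q rounded up to 1.0: renormalize
    {
        q_fixed /= 2;
        ++exponent;
    }
    multiplier = static_cast<int32_t>(q_fixed); // Q0.31 fixed point multiplier
    shift      = -exponent;
}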
diff --git a/arm_compute/core/CL/kernels/CLGEMMLowpOffsetContributionKernel.h b/arm_compute/core/CL/kernels/CLGEMMLowpOffsetContributionKernel.h
deleted file mode 100644
index bc982c6120..0000000000
--- a/arm_compute/core/CL/kernels/CLGEMMLowpOffsetContributionKernel.h
+++ /dev/null
@@ -1,116 +0,0 @@
-/*
- * Copyright (c) 2017-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CLGEMMLOWPOFFSETCONTRIBUTIONKERNEL_H
-#define ARM_COMPUTE_CLGEMMLOWPOFFSETCONTRIBUTIONKERNEL_H
-
-#include "arm_compute/core/CL/ICLKernel.h"
-
-namespace arm_compute
-{
-class ICLTensor;
-
-/** OpenCL kernel used to add the offset contribution after the matrix multiplication. The computation is performed in-place
- *
- * This kernel takes a final int32 accumulator value (the output of the matrix multiplication),
- * and adds to it the offset contribution of matrix A and matrix B in-place.
- *
- * The final result is:
- *
- * mm_result[i][k] = mm_result[i][k] +
- * (vector_sum_col[k] * a_offset) +
- * (vector_sum_row[i] * b_offset) +
- * (a_offset * b_offset * k)
- *
- */
-class CLGEMMLowpOffsetContributionKernel : public ICLKernel
-{
-public:
- /** Constructor */
- CLGEMMLowpOffsetContributionKernel();
- /** Prevent instances of this class from being copied (As this class contains pointers)*/
- CLGEMMLowpOffsetContributionKernel(const CLGEMMLowpOffsetContributionKernel &) = delete;
- /** Prevent instances of this class from being copied (As this class contains pointers)*/
- CLGEMMLowpOffsetContributionKernel &operator=(const CLGEMMLowpOffsetContributionKernel &) = delete;
- /** Allow instances of this class to be moved */
- CLGEMMLowpOffsetContributionKernel(CLGEMMLowpOffsetContributionKernel &&) = default;
- /** Allow instances of this class to be moved */
- CLGEMMLowpOffsetContributionKernel &operator=(CLGEMMLowpOffsetContributionKernel &&) = default;
- /** Initialise the kernel's input and output.
- *
- * @param[in, out] mm_result Input tensor containing the result of the matrix multiplication. Data type supported: S32
- * @param[in] vector_sum_col Input row-vector of sums of all the entries in each column of matrix B.
- * Note: vector_sum_col can be a nullptr in case a_offset = 0. Data type supported: same as @p mm_result
- * @param[in] vector_sum_row Input row-vector of sums of all the entries in each row of matrix A.
- * Note: vector_sum_row can be a nullptr in case b_offset = 0. Data type supported: same as @p mm_result
- * @param[in] bias Biases tensor. Only shared biases supported and it can be a nullptr if the addition of biases is not required.
- *                            Biases are 1D tensor with dimensions [OFM]. Data type supported: Same as @p mm_result.
- * @param[in]      k              Number of matrix A columns or matrix B rows
- * @param[in] a_offset Offset to be added to each element of the matrix A.
- * @param[in] b_offset Offset to be added to each element of the matrix B.
- */
- void configure(ICLTensor *mm_result, const ICLTensor *vector_sum_col, const ICLTensor *vector_sum_row, const ICLTensor *bias, int32_t k, int32_t a_offset, int32_t b_offset);
- /** Initialise the kernel's input and output.
- *
- * @param[in] compile_context The compile context to be used.
- * @param[in, out] mm_result Input tensor containing the result of the matrix multiplication. Data type supported: S32
- * @param[in] vector_sum_col Input row-vector of sums of all the entries in each column of matrix B.
- * Note: vector_sum_col can be a nullptr in case a_offset = 0. Data type supported: same as @p mm_result
- * @param[in] vector_sum_row Input row-vector of sums of all the entries in each row of matrix A.
- * Note: vector_sum_row can be a nullptr in case b_offset = 0. Data type supported: same as @p mm_result
- * @param[in] bias Biases tensor. Only shared biases supported and it can be a nullptr if the addition of biases is not required.
- *                            Biases are 1D tensor with dimensions [OFM]. Data type supported: Same as @p mm_result.
- * @param[in]      k              Number of matrix A columns or matrix B rows
- * @param[in] a_offset Offset to be added to each element of the matrix A.
- * @param[in] b_offset Offset to be added to each element of the matrix B.
- */
- void configure(const CLCompileContext &compile_context, ICLTensor *mm_result, const ICLTensor *vector_sum_col, const ICLTensor *vector_sum_row, const ICLTensor *bias, int32_t k, int32_t a_offset,
- int32_t b_offset);
- /** Static function to check if given info will lead to a valid configuration of @ref CLGEMMLowpOffsetContributionKernel
- *
- * @param[in] mm_result      Input tensor containing the result of the matrix multiplication. Data type supported: S32
- * @param[in] vector_sum_col Input row-vector of sums of all the entries in each column of matrix B.
- * Note: vector_sum_col can be a nullptr in case a_offset = 0. Data type supported: same as @p mm_result
- * @param[in] vector_sum_row Input row-vector of sums of all the entries in each row of matrix A.
- * Note: vector_sum_row can be a nullptr in case b_offset = 0. Data type supported: same as @p mm_result
- * @param[in] bias Biases tensor. Only shared biases supported and it can be a nullptr if the addition of biases is not required.
- *                           Biases are 1D tensor with dimensions [OFM]. Data type supported: Same as @p mm_result.
- * @param[in] a_offset Offset to be added to each element of the matrix A.
- * @param[in] b_offset Offset to be added to each element of the matrix B.
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *mm_result, const ITensorInfo *vector_sum_col, const ITensorInfo *vector_sum_row, const ITensorInfo *bias, int32_t a_offset, int32_t b_offset);
-
- // Inherited methods overridden:
- void run(const Window &window, cl::CommandQueue &queue) override;
-
-private:
- const ICLTensor *_vector_sum_col;
- const ICLTensor *_vector_sum_row;
- ICLTensor *_mm_result;
- const ICLTensor *_bias;
-};
-} // namespace arm_compute
-
-#endif /* ARM_COMPUTE_CLGEMMLOWPOFFSETCONTRIBUTIONKERNEL_H */
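A scalar reference of the in-place update documented in the deleted header, assuming row-major M x N accumulators; all names are illustrative. The optional shared bias, which the kernel also folds in, is included for completeness.

#include <cstdint>
#include <vector>

void offset_contribution(std::vector<int32_t> &mm_result,            // M*N accumulators, updated in place
                         const std::vector<int32_t> &vector_sum_col, // N column sums of B (used when a_offset != 0)
                         const std::vector<int32_t> &vector_sum_row, // M row sums of A (used when b_offset != 0)
                         const int32_t *bias,                        // optional shared bias, length N, may be nullptr
                         int M, int N, int32_t k, int32_t a_offset, int32_t b_offset)
{
    for(int i = 0; i < M; ++i)
    {
        for(int j = 0; j < N; ++j)
        {
            int32_t acc = mm_result[i * N + j];
            acc += vector_sum_col[j] * a_offset;
            acc += vector_sum_row[i] * b_offset;
            acc += a_offset * b_offset * k;
            if(bias != nullptr)
            {
                acc += bias[j];
            }
            mm_result[i * N + j] = acc;
        }
    }
}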
diff --git a/arm_compute/core/CL/kernels/CLGEMMLowpOffsetContributionOutputStageKernel.h b/arm_compute/core/CL/kernels/CLGEMMLowpOffsetContributionOutputStageKernel.h
deleted file mode 100644
index 583b388d45..0000000000
--- a/arm_compute/core/CL/kernels/CLGEMMLowpOffsetContributionOutputStageKernel.h
+++ /dev/null
@@ -1,135 +0,0 @@
-/*
- * Copyright (c) 2018-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CLGEMMLOWPOFFSETCONTRIBUTIONOUTPUTSTAGEKERNEL_H
-#define ARM_COMPUTE_CLGEMMLOWPOFFSETCONTRIBUTIONOUTPUTSTAGEKERNEL_H
-
-#include "arm_compute/core/CL/ICLKernel.h"
-
-namespace arm_compute
-{
-class ICLTensor;
-
-/** OpenCL kernel used to add the offset contribution after the matrix multiplication and perform the output stage.
- *
- * This kernel takes a final int32 accumulator value (the output of the matrix multiplication), adds to it the offset contribution
- * of matrix A and matrix B and performs the output stage defined by the output_stage argument
- *
- * @note For quantized computations the output data type for auto-initialization must be passed as part of the @ref GEMMLowpOutputStageInfo.
- */
-class CLGEMMLowpOffsetContributionOutputStageKernel : public ICLKernel
-{
-public:
- /** Constructor */
- CLGEMMLowpOffsetContributionOutputStageKernel();
- /** Prevent instances of this class from being copied (As this class contains pointers)*/
- CLGEMMLowpOffsetContributionOutputStageKernel(const CLGEMMLowpOffsetContributionOutputStageKernel &) = delete;
- /** Prevent instances of this class from being copied (As this class contains pointers)*/
- CLGEMMLowpOffsetContributionOutputStageKernel &operator=(const CLGEMMLowpOffsetContributionOutputStageKernel &) = delete;
- /** Allow instances of this class to be moved */
- CLGEMMLowpOffsetContributionOutputStageKernel(CLGEMMLowpOffsetContributionOutputStageKernel &&) = default;
- /** Allow instances of this class to be moved */
- CLGEMMLowpOffsetContributionOutputStageKernel &operator=(CLGEMMLowpOffsetContributionOutputStageKernel &&) = default;
- /** Initialise the kernel's input and output.
- *
- * @param[in] mm_result Input tensor containing the result of the matrix multiplication. Data type supported: S32
- * @param[in] vector_sum_col Input row-vector of sums of all the entries in each column of matrix B.
- * Note: vector_sum_col can be a nullptr in case a_offset = 0. Data type supported: same as @p mm_result
- * @param[in] vector_sum_row Input row-vector of sums of all the entries in each row of matrix A.
- * Note: vector_sum_row can be a nullptr in case b_offset = 0. Data type supported: same as @p mm_result
- * @param[in] bias Biases tensor. Only shared biases supported and it can be a nullptr if the addition of biases is not required.
- *                           Biases are 1D tensor with dimensions [OFM]. Data type supported: Same as @p mm_result.
- * @param[out] output         Output tensor. Data type supported: QASYMM8/QASYMM8_SIGNED.
- * @param[in]  k              Number of matrix A columns or matrix B rows
- * @param[in] a_offset Offset to be added to each element of the matrix A.
- * @param[in] b_offset Offset to be added to each element of the matrix B.
- * @param[in] output_stage GEMMLowp output stage info
- * @param[in] output_multipliers Output multipliers tensor. In case of per-channel quantization, the number of multipliers must be equal to the number of filters (OFM).
- * Supported data types: S32
- * @param[in] output_shifts Output shifts tensor. In case of per-channel quantization, the number of multipliers must be equal to the number of filters (OFM).
- * Supported data types: S32
- */
- void configure(const ICLTensor *mm_result, const ICLTensor *vector_sum_col, const ICLTensor *vector_sum_row, const ICLTensor *bias, ICLTensor *output, int32_t k, int32_t a_offset, int32_t b_offset,
- const GEMMLowpOutputStageInfo &output_stage, const ICLTensor *output_multipliers, const ICLTensor *output_shifts);
- /** Initialise the kernel's input and output.
- *
- * @param[in] compile_context The compile context to be used.
- * @param[in] mm_result Input tensor containing the result of the matrix multiplication. Data type supported: S32
- * @param[in] vector_sum_col Input row-vector of sums of all the entries in each column of matrix B.
- * Note: vector_sum_col can be a nullptr in case a_offset = 0. Data type supported: same as @p mm_result
- * @param[in] vector_sum_row Input row-vector of sums of all the entries in each row of matrix A.
- * Note: vector_sum_row can be a nullptr in case b_offset = 0. Data type supported: same as @p mm_result
- * @param[in] bias Biases tensor. Only shared biases supported and it can be a nullptr if the addition of biases is not required.
- *                           Biases are 1D tensor with dimensions [OFM]. Data type supported: Same as @p mm_result.
- * @param[out] output         Output tensor. Data type supported: QASYMM8/QASYMM8_SIGNED.
- * @param[in]  k              Number of matrix A columns or matrix B rows
- * @param[in] a_offset Offset to be added to each element of the matrix A.
- * @param[in] b_offset Offset to be added to each element of the matrix B.
- * @param[in] output_stage GEMMLowp output stage info
- * @param[in] output_multipliers Output multipliers tensor. In case of per-channel quantization, the number of multipliers must be equal to the number of filters (OFM).
- * Supported data types: S32
- * @param[in] output_shifts Output shifts tensor. In case of per-channel quantization, the number of multipliers must be equal to the number of filters (OFM).
- * Supported data types: S32
- */
- void configure(const CLCompileContext &compile_context, const ICLTensor *mm_result, const ICLTensor *vector_sum_col, const ICLTensor *vector_sum_row, const ICLTensor *bias, ICLTensor *output, int32_t k,
- int32_t a_offset, int32_t b_offset,
- const GEMMLowpOutputStageInfo &output_stage, const ICLTensor *output_multipliers, const ICLTensor *output_shifts);
- /** Static function to check if given info will lead to a valid configuration of @ref CLGEMMLowpOffsetContributionOutputStageKernel
- *
- * @param[in] mm_result          Input tensor containing the result of the matrix multiplication. Data type supported: S32
- * @param[in] vector_sum_col Input row-vector of sums of all the entries in each column of matrix B.
- * Note: vector_sum_col can be a nullptr in case a_offset = 0. Data type supported: same as @p mm_result
- * @param[in] vector_sum_row Input row-vector of sums of all the entries in each row of matrix A.
- * Note: vector_sum_row can be a nullptr in case b_offset = 0. Data type supported: same as @p mm_result
- * @param[in] bias Biases tensor. Only shared biases supported and it can be a nullptr if the addition of biases is not required.
- *                           Biases are 1D tensor with dimensions [OFM]. Data type supported: Same as @p mm_result.
- * @param[in] output Output tensor. Data type supported: QASYMM8/QASYMM8_SIGNED.
- * @param[in] a_offset Offset to be added to each element of the matrix A.
- * @param[in] b_offset Offset to be added to each element of the matrix B.
- * @param[in] output_stage GEMMLowp output stage info
- * @param[in] output_multipliers Output multipliers tensor info. In case of per-channel quantization, the number of multipliers must be equal to the number of filters (OFM).
- * Supported data types: S32
- * @param[in] output_shifts Output shifts tensor info. In case of per-channel quantization, the number of multipliers must be equal to the number of filters (OFM).
- * Supported data types: S32
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *mm_result, const ITensorInfo *vector_sum_col, const ITensorInfo *vector_sum_row, const ITensorInfo *bias, const ITensorInfo *output, int32_t a_offset,
- int32_t b_offset, const GEMMLowpOutputStageInfo &output_stage, const ITensorInfo *output_multipliers, const ITensorInfo *output_shifts);
-
- // Inherited methods overridden:
- void run(const Window &window, cl::CommandQueue &queue) override;
-
-private:
- const ICLTensor *_mm_result;
- const ICLTensor *_vector_sum_col;
- const ICLTensor *_vector_sum_row;
- const ICLTensor *_bias;
- ICLTensor *_output;
- const ICLTensor *_output_multipliers;
- const ICLTensor *_output_shifts;
- bool _is_quantized_per_channel;
-};
-} // namespace arm_compute
-
-#endif /* ARM_COMPUTE_CLGEMMLOWPOFFSETCONTRIBUTIONOUTPUTSTAGEKERNEL_H */
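Conceptually, the deleted kernel fuses the offset contribution above with one of the quantize-down output stages below. A compressed scalar view for a single accumulator, assuming a float-scale output stage purely for brevity (output_stage selects the actual behaviour; all names are ours):

#include <algorithm>
#include <cmath>
#include <cstdint>

int8_t fused_element(int32_t acc, int32_t col_sum, int32_t row_sum, int32_t bias,
                     int32_t k, int32_t a_offset, int32_t b_offset,
                     float result_scale, int32_t result_offset)
{
    // Offset contribution, as in the kernel documented above this one
    acc += col_sum * a_offset + row_sum * b_offset + a_offset * b_offset * k + bias;
    // Output stage: scale, add the destination offset, clamp to the 8-bit range
    const int32_t q = static_cast<int32_t>(std::lround(acc * result_scale)) + result_offset;
    return static_cast<int8_t>(std::max(-128, std::min(127, q)));
}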
diff --git a/arm_compute/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ScaleByFloatKernel.h b/arm_compute/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ScaleByFloatKernel.h
deleted file mode 100644
index 1e9fde8376..0000000000
--- a/arm_compute/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ScaleByFloatKernel.h
+++ /dev/null
@@ -1,101 +0,0 @@
-/*
- * Copyright (c) 2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CLGEMMLOWPQUANTIZEDOWNINT32SCALEBYFLOATKERNEL_H
-#define ARM_COMPUTE_CLGEMMLOWPQUANTIZEDOWNINT32SCALEBYFLOATKERNEL_H
-
-#include "arm_compute/core/CL/ICLKernel.h"
-
-namespace arm_compute
-{
-// Forward declarations
-class ICLTensor;
-
-/** OpenCL kernel used to quantize down the int32 accumulator values of GEMMLowp to QASYMM8/QASYMM8_SIGNED
- *
- * This kernel takes a final int32 accumulator value (the output of the matrix multiplication), and processes it to obtain the final QASYMM8/QASYMM8_SIGNED value.
- * The following computations will be performed by the kernel:
- *
- * -# Compute fixed point multiplication between each entry of input by result_fixedpoint_multiplier
- * -# Add bias to final result if bias tensor is not a nullptr
- * -# Requantize
- * -# Add offset to each result
- * -# Clamp the value between the specified min and max bounds
- * -# Clamp the resulting int32 values:
- * - to the [0..255] range and cast to QASYMM8.
- * - to the [-128..127] range and cast to QASYMM8_SIGNED.
- */
-class CLGEMMLowpQuantizeDownInt32ScaleByFloatKernel : public ICLKernel
-{
-public:
- /** Constructor */
- CLGEMMLowpQuantizeDownInt32ScaleByFloatKernel();
- /** Prevent instances of this class from being copied (As this class contains pointers)*/
- CLGEMMLowpQuantizeDownInt32ScaleByFloatKernel(const CLGEMMLowpQuantizeDownInt32ScaleByFloatKernel &) = delete;
- /** Prevent instances of this class from being copied (As this class contains pointers)*/
- CLGEMMLowpQuantizeDownInt32ScaleByFloatKernel &operator=(const CLGEMMLowpQuantizeDownInt32ScaleByFloatKernel &) = delete;
- /** Allow instances of this class to be moved */
- CLGEMMLowpQuantizeDownInt32ScaleByFloatKernel(CLGEMMLowpQuantizeDownInt32ScaleByFloatKernel &&) = default;
- /** Allow instances of this class to be moved */
- CLGEMMLowpQuantizeDownInt32ScaleByFloatKernel &operator=(CLGEMMLowpQuantizeDownInt32ScaleByFloatKernel &&) = default;
- /** Initialise the kernel's input and output.
- *
- * @param[in] input Input tensor. Data type supported: S32
- * @param[in] bias Biases tensor. Only shared biases supported and it can be a nullptr if the biases addition is not required.
- * Biases are 1D tensor with dimensions [OFM]. Data type supported: Same as @p input.
- * @param[out] output Output tensor. Data type supported: QASYMM8/QASYMM8_SIGNED
- * @param[in] info Output stage info. Used to pass the quantized output data type
- */
- void configure(const ICLTensor *input, const ICLTensor *bias, ICLTensor *output, const GEMMLowpOutputStageInfo *info);
- /** Initialise the kernel's input and output.
- *
- * @param[in] compile_context The compile context to be used.
- * @param[in] input Input tensor. Data type supported: S32
- * @param[in] bias Biases tensor. Only shared biases supported and it can be a nullptr if the biases addition is not required.
- * Biases are 1D tensor with dimensions [OFM]. Data type supported: Same as @p input.
- * @param[out] output Output tensor. Data type supported: QASYMM8/QASYMM8_SIGNED
- * @param[in] info Output stage info. Used to pass the quantized output data type
- */
- void configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *bias, ICLTensor *output, const GEMMLowpOutputStageInfo *info);
- /** Static function to check if given info will lead to a valid configuration of @ref CLGEMMLowpQuantizeDownInt32ScaleByFloatKernel
- *
- * @param[in] input Input tensor. Data type supported: S32
- * @param[in] bias Biases tensor. Only shared biases supported and it can be a nullptr if the biases addition is not required.
- * Biases are 1D tensor with dimensions [OFM]. Data type supported: Same as @p input.
- * @param[in]  output Output tensor. Data type supported: QASYMM8/QASYMM8_SIGNED
- * @param[in] info Output stage info. Used to pass the quantized output data type
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output, const GEMMLowpOutputStageInfo *info);
-
- // Inherited methods overridden:
- void run(const Window &window, cl::CommandQueue &queue) override;
-
-private:
- const ICLTensor *_input;
- const ICLTensor *_bias;
- ICLTensor *_output;
-};
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_CLGEMMLOWPQUANTIZEDOWNINT32SCALEBYFLOATKERNEL_H */
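A scalar sketch of the QASYMM8 path of this output stage, assuming the stage multiplies each accumulator by the float scale carried in GEMMLowpOutputStageInfo; the helper name and parameter spelling are ours.

#include <algorithm>
#include <cmath>
#include <cstdint>

uint8_t quantize_down_float(int32_t acc, const int32_t *bias, int ofm,
                            float scale, int32_t offset, int32_t min_bound, int32_t max_bound)
{
    if(bias != nullptr)
    {
        acc += bias[ofm]; // shared bias, one value per output channel
    }
    int32_t res = static_cast<int32_t>(std::lround(acc * scale)) + offset;
    res = std::max(min_bound, std::min(max_bound, res));          // user-provided bounds
    return static_cast<uint8_t>(std::max(0, std::min(255, res))); // QASYMM8 range
}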
diff --git a/arm_compute/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ScaleKernel.h b/arm_compute/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ScaleKernel.h
deleted file mode 100644
index 766ef9a820..0000000000
--- a/arm_compute/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ScaleKernel.h
+++ /dev/null
@@ -1,103 +0,0 @@
-/*
- * Copyright (c) 2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CLGEMMLOWPQUANTIZEDOWNINT32SCALEKERNEL_H
-#define ARM_COMPUTE_CLGEMMLOWPQUANTIZEDOWNINT32SCALEKERNEL_H
-
-#include "arm_compute/core/CL/ICLKernel.h"
-
-namespace arm_compute
-{
-class ICLTensor;
-
-/** OpenCL kernel used to quantize down the int32 accumulator values of GEMMLowp to QASYMM8/QASYMM8_SIGNED
- *
- * This kernel takes a final int32 accumulator value (the output of the matrix multiplication), and processes it to obtain the final QASYMM8/QASYMM8_SIGNED value.
- * The following computations will be performed by the kernel:
- *
- * -# Add offset terms to final result
- * -# Multiply each entry of result by result_mult_int
- * -# Add bias to final result if bias tensor is not a nullptr
- * -# Shift the int32 accumulator by result_shift
- * -# Clamp the value between the specified min and max bounds
- * -# Clamp the resulting int32 values:
- *       - to the [0..255] range and cast to QASYMM8.
- *       - to the [-128..127] range and cast to QASYMM8_SIGNED.
- *
- */
-class CLGEMMLowpQuantizeDownInt32ScaleKernel : public ICLKernel
-{
-public:
- /** Constructor */
- CLGEMMLowpQuantizeDownInt32ScaleKernel();
- /** Prevent instances of this class from being copied (As this class contains pointers)*/
- CLGEMMLowpQuantizeDownInt32ScaleKernel(const CLGEMMLowpQuantizeDownInt32ScaleKernel &) = delete;
- /** Prevent instances of this class from being copied (As this class contains pointers)*/
- CLGEMMLowpQuantizeDownInt32ScaleKernel &operator=(const CLGEMMLowpQuantizeDownInt32ScaleKernel &) = delete;
- /** Allow instances of this class to be moved */
- CLGEMMLowpQuantizeDownInt32ScaleKernel(CLGEMMLowpQuantizeDownInt32ScaleKernel &&) = default;
- /** Allow instances of this class to be moved */
- CLGEMMLowpQuantizeDownInt32ScaleKernel &operator=(CLGEMMLowpQuantizeDownInt32ScaleKernel &&) = default;
- /** Initialise the kernel's input and output.
- *
- * @param[in] input Input tensor. Data type supported: S32
- * @param[in] bias Biases tensor. Only shared biases supported and it can be a nullptr if the biases addition is not required.
- * Biases are 1D tensor with dimensions [OFM]. Data type supported: Same as @p input.
- * @param[out] output       Output tensor. Data type supported: QASYMM8/QASYMM8_SIGNED
- * @param[in] output_stage GEMMLowp output stage metadata.
- */
- void configure(const ICLTensor *input, const ICLTensor *bias, ICLTensor *output, const GEMMLowpOutputStageInfo *output_stage);
- /** Initialise the kernel's input and output.
- *
- * @param[in] compile_context The compile context to be used.
- * @param[in] input Input tensor. Data type supported: S32
- * @param[in] bias Biases tensor. Only shared biases supported and it can be a nullptr if the biases addition is not required.
- * Biases are 1D tensor with dimensions [OFM]. Data type supported: Same as @p input.
- * @param[out] output       Output tensor. Data type supported: QASYMM8/QASYMM8_SIGNED
- * @param[in] output_stage GEMMLowp output stage metadata.
- */
- void configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *bias, ICLTensor *output, const GEMMLowpOutputStageInfo *output_stage);
- /** Static function to check if given info will lead to a valid configuration of @ref CLGEMMLowpQuantizeDownInt32ScaleKernel
- *
- * @param[in] input Input tensor. Data type supported: S32
- * @param[in] bias Biases tensor. Only shared biases supported and it can be a nullptr if the biases addition is not required.
- * Biases are 1D tensor with dimensions [OFM]. Data type supported: Same as @p input.
- * @param[in]  output       Output tensor. Data type supported: QASYMM8/QASYMM8_SIGNED
- * @param[in] output_stage GEMMLowp output stage metadata.
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output, const GEMMLowpOutputStageInfo *output_stage);
-
- // Inherited methods overridden:
- void run(const Window &window, cl::CommandQueue &queue) override;
-
-private:
- const ICLTensor *_input;
- const ICLTensor *_bias;
- ICLTensor *_output;
- const GEMMLowpOutputStageInfo *_output_stage;
-};
-} // namespace arm_compute
-
-#endif /* ARM_COMPUTE_CLGEMMLOWPQUANTIZEDOWNINT32SCALEKERNEL_H */ \ No newline at end of file
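A scalar sketch of the integer-scale stage, following the step order listed in the deleted class comment; offset, result_mult_int, result_shift and the bounds would come from GEMMLowpOutputStageInfo, and the helper name is ours.

#include <algorithm>
#include <cstdint>

uint8_t quantize_down_scale(int32_t acc, const int32_t *bias, int ofm,
                            int32_t offset, int32_t result_mult_int, int32_t result_shift,
                            int32_t min_bound, int32_t max_bound)
{
    int32_t res = (acc + offset) * result_mult_int; // add offset terms, then scale
    if(bias != nullptr)
    {
        res += bias[ofm];                           // optional shared bias
    }
    res >>= result_shift;                           // shift the accumulator down
    res = std::max(min_bound, std::min(max_bound, res));          // user-provided bounds
    return static_cast<uint8_t>(std::max(0, std::min(255, res))); // QASYMM8 range
}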
diff --git a/arm_compute/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel.h b/arm_compute/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel.h
deleted file mode 100644
index 6f58150037..0000000000
--- a/arm_compute/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel.h
+++ /dev/null
@@ -1,108 +0,0 @@
-/*
- * Copyright (c) 2019-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CLGEMMLOWPQUANTIZEDOWNINT32TOINT16SCALEBYFIXEDPOINTKERNEL_H
-#define ARM_COMPUTE_CLGEMMLOWPQUANTIZEDOWNINT32TOINT16SCALEBYFIXEDPOINTKERNEL_H
-
-#include "arm_compute/core/CL/ICLKernel.h"
-
-namespace arm_compute
-{
-class ICLTensor;
-
-/** CL kernel used to quantize down the int32 accumulator values of GEMMLowp to QSYMM16
- *
- * This kernel takes a final int32 accumulator value (the output of the matrix multiplication), and processes it to obtain the final QSYMM16 value.
- * The following computations will be performed by the kernel:
- *
- * -# Compute fixed point multiplication between each entry of input by result_fixedpoint_multiplier
- * -# Add bias to final result if bias tensor is not a nullptr
- * -# Round to nearest division by a power-of-two using result_shift
- * -# Clamp the value between the specified min and max bounds
- * -# Clamp the resulting int32 values to the [-32768, 32767] range and cast to QSYMM16.
- *
- */
-class CLGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel : public ICLKernel
-{
-public:
- /** Constructor */
- CLGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel();
- /** Prevent instances of this class from being copied (As this class contains pointers)*/
- CLGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel(const CLGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel &) = delete;
- /** Prevent instances of this class from being copied (As this class contains pointers)*/
- CLGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel &operator=(const CLGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel &) = delete;
- /** Allow instances of this class to be moved */
- CLGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel(CLGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel &&) = default;
- /** Allow instances of this class to be moved */
- CLGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel &operator=(CLGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel &&) = default;
- /** Initialise the kernel's input and output.
- *
- * @param[in] input Input tensor. Data type supported: S32
- * @param[in] bias Biases tensor. Only shared biases supported and it can be a nullptr if the biases addition is not required.
- * Biases are 1D tensor with dimensions [OFM]. Data type supported: Same as @p input.
- * @param[out] output                       Output tensor. Data type supported: QSYMM16
- * @param[in]  result_fixedpoint_multiplier Fixed point value to be multiplied with each element of the input matrix once the result_offset has been added
- * @param[in]  result_shift                 Integer shift used to perform a round-to-nearest division by a power of two after the fixed point multiplication
- * @param[in] min (Optional) Min value used to saturate down the output result before converting back to QSYMM16. Defaults to 0.
- * @param[in] max (Optional) Max value used to saturate up the output result before converting back to QSYMM16.
- * Along with @p min, this value can be used to implement "rectified linear unit" activation functions. Defaults to 0.
- */
- void configure(const ICLTensor *input, const ICLTensor *bias, ICLTensor *output, int result_fixedpoint_multiplier, int result_shift, int min = 0, int max = 0);
- /** Initialise the kernel's input and output.
- *
- * @param[in] compile_context The compile context to be used.
- * @param[in] input Input tensor. Data type supported: S32
- * @param[in] bias Biases tensor. Only shared biases supported and it can be a nullptr if the biases addition is not required.
- * Biases are 1D tensor with dimensions [OFM]. Data type supported: Same as @p input.
- * @param[out] output                       Output tensor. Data type supported: QSYMM16
- * @param[in]  result_fixedpoint_multiplier Fixed point value to be multiplied with each element of the input matrix once the result_offset has been added
- * @param[in]  result_shift                 Integer shift used to perform a round-to-nearest division by a power of two after the fixed point multiplication
- * @param[in] min (Optional) Min value used to saturate down the output result before converting back to QSYMM16. Defaults to 0.
- * @param[in] max (Optional) Max value used to saturate up the output result before converting back to QSYMM16.
- * Along with @p min, this value can be used to implement "rectified linear unit" activation functions. Defaults to 0.
- */
- void configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *bias, ICLTensor *output, int result_fixedpoint_multiplier, int result_shift, int min = 0, int max = 0);
- /** Static function to check if given info will lead to a valid configuration of @ref CLGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel
- *
- * @param[in] input Input tensor info. Data type supported: S32
- * @param[in] bias Biases tensor info. Only shared biases supported and it can be a nullptr if the biases addition is not required.
- * Biases are 1D tensor info with dimensions [OFM]. Data type supported: Same as @p input.
- * @param[in]  output Output tensor info. Data type supported: QSYMM16
- * @param[in] min (Optional) Min value used to saturate down the output result before converting back to QSYMM16. Defaults to 0.
- * @param[in]  max    (Optional) Max value used to saturate up the output result before converting back to QSYMM16.
- * Along with @p min, this value can be used to implement "rectified linear unit" activation functions. Defaults to 0.
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output, int min = 0, int max = 0);
-
- // Inherited methods overridden:
- void run(const Window &window, cl::CommandQueue &queue) override;
-
-private:
- const ICLTensor *_input;
- const ICLTensor *_bias;
- ICLTensor *_output;
-};
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_CLGEMMLOWPQUANTIZEDOWNINT32TOINT16SCALEBYFIXEDPOINTKERNEL_H */
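A scalar sketch of the fixed point requantization to QSYMM16 described above. The multiplier is a Q0.31 value and the shift a non-negative rounding right shift; for brevity this omits the sign-dependent rounding nudge that saturating-doubling-high-multiply implementations usually apply to negative products.

#include <algorithm>
#include <cstdint>

int16_t requantize_qsymm16(int32_t acc, const int32_t *bias, int ofm,
                           int32_t multiplier, int shift)
{
    if(bias != nullptr)
    {
        acc += bias[ofm];
    }
    // Fixed point multiply: take the rounded top 32 bits of acc * multiplier (Q0.31)
    const int64_t prod = static_cast<int64_t>(acc) * static_cast<int64_t>(multiplier);
    const int32_t high = static_cast<int32_t>((prod + (1ll << 30)) >> 31);
    // Round-to-nearest division by 2^shift
    const int32_t round = (shift > 0) ? (1 << (shift - 1)) : 0;
    const int32_t res   = (high + round) >> shift;
    // Saturate to the QSYMM16 range
    return static_cast<int16_t>(std::max(-32768, std::min(32767, res)));
}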
diff --git a/arm_compute/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel.h b/arm_compute/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel.h
deleted file mode 100644
index 0c237be34c..0000000000
--- a/arm_compute/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel.h
+++ /dev/null
@@ -1,112 +0,0 @@
-/*
- * Copyright (c) 2019-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CLGEMMLOWPQUANTIZEDOWNINT32TOINT8SCALEBYFIXEDPOINTKERNEL_H
-#define ARM_COMPUTE_CLGEMMLOWPQUANTIZEDOWNINT32TOINT8SCALEBYFIXEDPOINTKERNEL_H
-
-#include "arm_compute/core/CL/ICLKernel.h"
-
-namespace arm_compute
-{
-class ICLTensor;
-
-/** OpenCL kernel used to quantize down the int32 accumulator values of GEMMLowp to QASYMM8_SIGNED
- *
- * This kernel takes a final int32 accumulator value (the output of the matrix multiplication), and processes it to obtain the final QASYMM8_SIGNED value.
- * The following computations will be performed by the kernel:
- *
- * -# Compute fixed point multiplication between each entry of input by result_fixedpoint_multiplier
- * -# Add bias to final result if bias tensor is not a nullptr
- * -# Round to nearest division by a power-of-two using result_shift
- * -# Add offset to each result
- * -# Clamp the value between the specified min and max bounds
- * -# Clamp the resulting int32 values to the [-128..127] range and cast to QASYMM8_SIGNED.
- */
-class CLGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel : public ICLKernel
-{
-public:
- /** Constructor */
- CLGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel();
- /** Prevent instances of this class from being copied (As this class contains pointers)*/
- CLGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel(const CLGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel &) = delete;
- /** Prevent instances of this class from being copied (As this class contains pointers)*/
- CLGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel &operator=(const CLGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel &) = delete;
- /** Allow instances of this class to be moved */
- CLGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel(CLGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel &&) = default;
- /** Allow instances of this class to be moved */
- CLGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel &operator=(CLGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel &&) = default;
- /** Initialise the kernel's input and output.
- *
- * @param[in] input Input tensor. Data type supported: S32
- * @param[in] bias Biases tensor. Only shared biases supported and it can be a nullptr if the biases addition is not required.
- * Biases are 1D tensor with dimensions [OFM]. Data type supported: Same as @p input.
- * @param[out] output                       Output tensor. Data type supported: QASYMM8_SIGNED
- * @param[in]  result_fixedpoint_multiplier Fixed point value to be multiplied with each element of the input matrix once the result_offset has been added
- * @param[in]  result_shift                 Integer shift used to perform a round-to-nearest division by a power of two after the fixed point multiplication
- * @param[in] result_offset_after_shift Offset to be applied to result before converting it back to QASYMM8_SIGNED
- * @param[in] min (Optional) Min value used to saturate down the output result before converting back to QASYMM8_SIGNED. Defaults to 0
- * @param[in] max (Optional) Max value used to saturate up the output result before converting back to QASYMM8_SIGNED. Defaults to 0
- * Along with @p min, this value can be used to implement "rectified linear unit" activation functions
- */
- void configure(const ICLTensor *input, const ICLTensor *bias, ICLTensor *output, int result_fixedpoint_multiplier, int result_shift, int result_offset_after_shift,
- int min = 0, int max = 0);
- /** Initialise the kernel's input and output.
- *
- * @param[in] compile_context The compile context to be used.
- * @param[in] input Input tensor. Data type supported: S32
- * @param[in] bias Biases tensor. Only shared biases supported and it can be a nullptr if the biases addition is not required.
- * Biases are 1D tensor with dimensions [OFM]. Data type supported: Same as @p input.
- * @param[out] output                       Output tensor. Data type supported: QASYMM8_SIGNED
- * @param[in]  result_fixedpoint_multiplier Fixed point value to be multiplied with each element of the input matrix once the result_offset has been added
- * @param[in]  result_shift                 Integer shift used to perform a round-to-nearest division by a power of two after the fixed point multiplication
- * @param[in] result_offset_after_shift Offset to be applied to result before converting it back to QASYMM8_SIGNED
- * @param[in] min (Optional) Min value used to saturate down the output result before converting back to QASYMM8_SIGNED. Defaults to 0
- * @param[in] max (Optional) Max value used to saturate up the output result before converting back to QASYMM8_SIGNED. Defaults to 0
- * Along with @p min, this value can be used to implement "rectified linear unit" activation functions
- */
- void configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *bias, ICLTensor *output, int result_fixedpoint_multiplier, int result_shift, int result_offset_after_shift,
- int min = 0, int max = 0);
- /** Static function to check if given info will lead to a valid configuration of @ref CLGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel
- *
- * @param[in] input Input tensor. Data type supported: S32
- * @param[in] bias Biases tensor. Only shared biases supported and it can be a nullptr if the biases addition is not required.
- * Biases are 1D tensor with dimensions [OFM]. Data type supported: Same as @p input.
- * @param[in]  output Output tensor. Data type supported: QASYMM8_SIGNED
- * @param[in] min (Optional) Min value used to saturate down the output result before converting back to QASYMM8_SIGNED
- * @param[in] max (Optional) Max value used to saturate up the output result before converting back to QASYMM8_SIGNED,
- * Along with @p min, this value can be used to implement "rectified linear unit" activation functions
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output, int min = 0, int max = 0);
-
- // Inherited methods overridden:
- void run(const Window &window, cl::CommandQueue &queue) override;
-
-private:
- const ICLTensor *_input;
- const ICLTensor *_bias;
- ICLTensor *_output;
-};
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_CLGEMMLOWPQUANTIZEDOWNINT32TOINT8SCALEBYFIXEDPOINTKERNEL_H */
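For orientation: the parameters above describe a short per-element pipeline, sketched below in scalar C++. The helper name is hypothetical and the fixed-point multiply is simplified (the kernel's real multiply saturates and applies a different rounding nudge for negative values), so treat this as an illustration of the documented steps rather than the kernel's OpenCL source.

#include <algorithm>
#include <cstdint>

// Scalar model of the documented S32 -> QASYMM8_SIGNED quantize-down steps (illustrative).
int8_t quantize_down_int8(int32_t acc, int32_t bias,
                          int32_t result_fixedpoint_multiplier, int result_shift,
                          int32_t result_offset_after_shift, int32_t min = 0, int32_t max = 0)
{
    // 1) Fixed-point multiplication: high 32 bits of the doubled 64-bit product
    //    (simplified: no saturation, no negative-value rounding nudge).
    const int64_t prod = static_cast<int64_t>(acc) * result_fixedpoint_multiplier;
    int32_t       res  = static_cast<int32_t>((prod + (INT64_C(1) << 30)) >> 31);
    // 2) Bias addition (skipped when the bias tensor is nullptr).
    res += bias;
    // 3) Round-to-nearest division by a power of two.
    if(result_shift > 0)
    {
        res = (res + (1 << (result_shift - 1))) >> result_shift;
    }
    // 4) Add the offset.
    res += result_offset_after_shift;
    // 5) Optional saturation bounds (min == max == 0 means no extra clamping).
    if(min != max)
    {
        res = std::max(min, std::min(max, res));
    }
    // 6) Clamp to the QASYMM8_SIGNED range and cast.
    return static_cast<int8_t>(std::max<int32_t>(-128, std::min<int32_t>(127, res)));
}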
diff --git a/arm_compute/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel.h b/arm_compute/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel.h
deleted file mode 100644
index cb3e12e34d..0000000000
--- a/arm_compute/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel.h
+++ /dev/null
@@ -1,112 +0,0 @@
-/*
- * Copyright (c) 2017-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CLGEMMLOWPQUANTIZEDOWNINT32TOUINT8SCALEBYFIXEDPOINTKERNEL_H
-#define ARM_COMPUTE_CLGEMMLOWPQUANTIZEDOWNINT32TOUINT8SCALEBYFIXEDPOINTKERNEL_H
-
-#include "arm_compute/core/CL/ICLKernel.h"
-
-namespace arm_compute
-{
-class ICLTensor;
-
-/** OpenCL kernel used to quantize down the int32 accumulator values of GEMMLowp to QASYMM8
- *
- * This kernel takes a final int32 accumulator value (the output of the matrix multiplication), and processes it to obtain the final QASYMM8 value.
- * The following computations will be performed by the kernel:
- *
- * -# Compute fixed point multiplication between each entry of input by result_fixedpoint_multiplier
- * -# Add bias to final result if bias tensor is not a nullptr
- * -# Round to nearest division by a power-of-two using result_shift
- * -# Add offset to each result
- * -# Clamp the value between the specified min and max bounds
- * -# Clamp the resulting int32 values to the [0..255] range and cast to QASYMM8.
- */
-class CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel : public ICLKernel
-{
-public:
- /** Constructor */
- CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel();
- /** Prevent instances of this class from being copied (As this class contains pointers)*/
- CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel(const CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel &) = delete;
- /** Prevent instances of this class from being copied (As this class contains pointers)*/
- CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel &operator=(const CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel &) = delete;
- /** Allow instances of this class to be moved */
- CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel(CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel &&) = default;
- /** Allow instances of this class to be moved */
- CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel &operator=(CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel &&) = default;
- /** Initialise the kernel's input and output.
- *
- * @param[in] input Input tensor. Data type supported: S32
- * @param[in] bias Biases tensor. Only shared biases are supported; it can be nullptr if bias addition is not required.
- * Biases are a 1D tensor with dimensions [OFM]. Data type supported: Same as @p input.
- * @param[out] output Output tensor. Data type supported: QASYMM8
- * @param[in] result_fixedpoint_multiplier Fixed point multiplier applied to each element of the input matrix once the result_offset has been added
- * @param[in] result_shift Integer value used to round to nearest division by a power-of-two the result after the fixed point multiplication
- * @param[in] result_offset_after_shift Offset to be applied to result before converting it back to QASYMM8
- * @param[in] min (Optional) Min value used to saturate down the output result before converting back to QASYMM8
- * @param[in] max (Optional) Max value used to saturate up the output result before converting back to QASYMM8.
- * Along with @p min, this value can be used to implement "rectified linear unit" activation functions
- */
- void configure(const ICLTensor *input, const ICLTensor *bias, ICLTensor *output, int result_fixedpoint_multiplier, int result_shift, int result_offset_after_shift,
- int min = 0, int max = 0);
- /** Initialise the kernel's input and output.
- *
- * @param[in] compile_context The compile context to be used.
- * @param[in] input Input tensor. Data type supported: S32
- * @param[in] bias Biases tensor. Only shared biases are supported; it can be nullptr if bias addition is not required.
- * Biases are a 1D tensor with dimensions [OFM]. Data type supported: Same as @p input.
- * @param[out] output Output tensor. Data type supported: QASYMM8
- * @param[in] result_fixedpoint_multiplier Fixed point multiplier applied to each element of the input matrix once the result_offset has been added
- * @param[in] result_shift Integer value used to round to nearest division by a power-of-two the result after the fixed point multiplication
- * @param[in] result_offset_after_shift Offset to be applied to result before converting it back to QASYMM8
- * @param[in] min (Optional) Min value used to saturate down the output result before converting back to QASYMM8
- * @param[in] max (Optional) Max value used to saturate up the output result before converting back to QASYMM8.
- * Along with @p min, this value can be used to implement "rectified linear unit" activation functions
- */
- void configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *bias, ICLTensor *output, int result_fixedpoint_multiplier, int result_shift, int result_offset_after_shift,
- int min = 0, int max = 0);
- /** Static function to check if given info will lead to a valid configuration of @ref CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel
- *
- * @param[in] input Input tensor. Data type supported: S32
- * @param[in] bias Biases tensor. Only shared biases are supported; it can be nullptr if bias addition is not required.
- * Biases are a 1D tensor with dimensions [OFM]. Data type supported: Same as @p input.
- * @param[in] output Output tensor. Data type supported: QASYMM8
- * @param[in] min (Optional) Min value used to saturate down the output result before converting back to QASYMM8
- * @param[in] max (Optional) Max value used to saturate up the output result before converting back to QASYMM8.
- * Along with @p min, this value can be used to implement "rectified linear unit" activation functions
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output, int min = 0, int max = 0);
-
- // Inherited methods overridden:
- void run(const Window &window, cl::CommandQueue &queue) override;
-
-private:
- const ICLTensor *_input;
- const ICLTensor *_bias;
- ICLTensor *_output;
-};
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_CLGEMMLOWPQUANTIZEDOWNINT32TOUINT8SCALEBYFIXEDPOINTKERNEL_H */
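In a surrounding pipeline, the result_fixedpoint_multiplier / result_shift pair handed to configure() is usually derived from a real-valued requantization scale. A gemmlowp-style derivation is sketched below; the function name is hypothetical and this is an assumption about typical callers, not part of the kernel's API.

#include <cassert>
#include <cmath>
#include <cstdint>

// Decompose a real scale in (0, 1) into a Q0.31 multiplier and a right shift,
// so that x * scale ~= fixed_point_mul(x, multiplier) >> right_shift.
void quantize_scale(double scale, int32_t &multiplier, int &right_shift)
{
    assert(scale > 0.0 && scale < 1.0);
    int          exponent = 0;
    const double q        = std::frexp(scale, &exponent); // scale = q * 2^exponent, q in [0.5, 1)
    int64_t      q_fixed  = static_cast<int64_t>(std::llround(q * (1ll << 31)));
    if(q_fixed == (1ll << 31)) // q rounded up to exactly 1.0
    {
        q_fixed /= 2;
        ++exponent;
    }
    multiplier  = static_cast<int32_t>(q_fixed);
    right_shift = -exponent; // expressed as a right shift, as the kernel expects
}

The resulting pair is what configure() receives as result_fixedpoint_multiplier and result_shift.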
diff --git a/arm_compute/core/CL/kernels/CLGEMMLowpReductionKernel.h b/arm_compute/core/CL/kernels/CLGEMMLowpReductionKernel.h
deleted file mode 100644
index 857b1c7952..0000000000
--- a/arm_compute/core/CL/kernels/CLGEMMLowpReductionKernel.h
+++ /dev/null
@@ -1,176 +0,0 @@
-/*
- * Copyright (c) 2017-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CLGEMMLOWREDUCTIONKERNEL_H
-#define ARM_COMPUTE_CLGEMMLOWREDUCTIONKERNEL_H
-
-#include "arm_compute/core/CL/ICLKernel.h"
-
-namespace arm_compute
-{
-class ICLTensor;
-struct GEMMLowpReductionKernelInfo;
-
-/** Common interface for all OpenCL reduction kernels */
-class ICLGEMMLowpReductionKernel : public ICLKernel
-{
-public:
- /** Constructor */
- ICLGEMMLowpReductionKernel();
- /** Prevent instances of this class from being copied (As this class contains pointers)*/
- ICLGEMMLowpReductionKernel(const ICLGEMMLowpReductionKernel &) = delete;
- /** Prevent instances of this class from being copied (As this class contains pointers)*/
- ICLGEMMLowpReductionKernel &operator=(const ICLGEMMLowpReductionKernel &) = delete;
- /** Allow instances of this class to be moved */
- ICLGEMMLowpReductionKernel(ICLGEMMLowpReductionKernel &&) = default;
- /** Allow instances of this class to be moved */
- ICLGEMMLowpReductionKernel &operator=(ICLGEMMLowpReductionKernel &&) = default;
-
- /** Initialise the kernel's input and output.
- *
- * @param[in] input Input tensor. Data type supported: S8
- * @param[out] output Output row-vector of sums of all the entries in each row/col of the input tensor. Data type supported: S32
- * @param[in] info Kernel metadata:
- * - k Number of matrix columns/rows depending on the type of reduction.
- * - is_reshaped True if the matrix has been reshaped.
- * - scalar Scalar value to multiply each reduced column/row by.
- * - mul_byscalar True if each reduced column/row must be multiplied by a scalar value.
- */
- virtual void configure(const ICLTensor *input, ICLTensor *output, const GEMMLowpReductionKernelInfo &info) = 0;
- /** Initialise the kernel's input and output.
- *
- * @param[in] compile_context The compile context to be used.
- * @param[in] input Input tensor. Data type supported: S8
- * @param[out] output Output row-vector of sums of all the entries in each row/col of the input tensor. Data type supported: S32
- * @param[in] info Kernel metadata:
- * - k Number of matrix columns/rows depending on the type of reduction.
- * - is_reshaped True if the matrix has been reshaped.
- * - scalar Scalar value to multiply each reduced column/row by.
- * - mul_byscalar True if each reduced column/row must be multiplied by a scalar value.
- */
- virtual void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, const GEMMLowpReductionKernelInfo &info) = 0;
-
-protected:
- const ICLTensor *_input;
- ICLTensor *_output;
-};
-
-/** OpenCL kernel used to compute the row-vectors of sums of all the entries in each row of Matrix A.
- *
- * @note This stage is needed to handle the offset of the matrix product
- * https://github.com/google/gemmlowp/blob/master/doc/low-precision.md
- */
-class CLGEMMLowpMatrixAReductionKernel : public ICLGEMMLowpReductionKernel
-{
-public:
- /** Initialise the kernel's input and output.
- *
- * @param[in] mtx_a Input tensor. Data type supported: QASYMM8/QASYMM8_SIGNED
- * @param[out] vector_sum_row Output row-vector of sums of all the entries in each row of mtx_a. Data type supported: S32
- * @param[in] info Kernel metadata:
- * - k Number of matrix columns/rows depending on the type of reduction.
- * - is_reshaped True if the matrix has been reshaped.
- * - scalar Scalar value to multiply each reduced column/row by.
- * - mul_byscalar True if each reduced column/row must be multiplied by a scalar value.
- */
- void configure(const ICLTensor *mtx_a, ICLTensor *vector_sum_row, const GEMMLowpReductionKernelInfo &info) override;
- /** Initialise the kernel's input and output.
- *
- * @param[in] compile_context The compile context to be used.
- * @param[in] mtx_a Input tensor. Data type supported: QASYMM8/QASYMM8_SIGNED
- * @param[out] vector_sum_row Output row-vector of sums of all the entries in each row of mtx_a. Data type supported: S32
- * @param[in] info Kernel metadata:
- * - k Number of matrix columns/rows depending on the type of reduction.
- * - is_reshaped True if the matrix has been reshaped.
- * - scalar Scalar value to multiply each reduced column/row by.
- * - mul_byscalar True if each reduced column/row must be multiplied by a scalar value.
- */
- void configure(const CLCompileContext &compile_context, const ICLTensor *mtx_a, ICLTensor *vector_sum_row, const GEMMLowpReductionKernelInfo &info) override;
- /** Static function to check if given info will lead to a valid configuration of @ref CLGEMMLowpMatrixAReductionKernel
- *
- * @param[in] mtx_a Input tensor. Data type supported: QASYMM8/QASYMM8_SIGNED
- * @param[in] vector_sum_row Output row-vector of sums of all the entries in each row of mtx_a. Data type supported: S32
- * @param[in] info Kernel metadata:
- * - k Number of matrix columns/rows depending on the type of reduction.
- * - is_reshaped True if the matrix has been reshaped.
- * - scalar Scalar value to multiply each reduced column/row by.
- * - mul_byscalar True if each reduced column/row must be multiplied by a scalar value.
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *mtx_a, const ITensorInfo *vector_sum_row, const GEMMLowpReductionKernelInfo &info);
-
- // Inherited methods overridden:
- void run(const Window &window, cl::CommandQueue &queue) override;
-};
-
-/** OpenCL kernel used to compute the row-vectors of sums of all the entries in each column of Matrix B.
- *
- * @note This stage is needed to handle the offset of the matrix product
- * https://github.com/google/gemmlowp/blob/master/doc/low-precision.md
- */
-class CLGEMMLowpMatrixBReductionKernel : public ICLGEMMLowpReductionKernel
-{
-public:
- /** Initialise the kernel's input and output.
- *
- * @param[in] mtx_b Input tensor. Data type supported: QASYMM8/QASYMM8_SIGNED
- * @param[out] vector_sum_col Output row-vector of sums of all the entries in each column of mtx_b. Data type supported: S32
- * @param[in] info Kernel metadata:
- * - k Number of matrix columns/rows depending on the type of reduction.
- * - is_reshaped True if the matrix has been reshaped.
- * - scalar Scalar value to multiply each reduced column/row by.
- * - mul_byscalar True if each reduced column/row must be multiplied by a scalar value.
- */
- void configure(const ICLTensor *mtx_b, ICLTensor *vector_sum_col, const GEMMLowpReductionKernelInfo &info) override;
- /** Initialise the kernel's input and output.
- *
- * @param[in] compile_context The compile context to be used.
- * @param[in] mtx_b Input tensor. Data type supported: QASYMM8/QASYMM8_SIGNED
- * @param[out] vector_sum_col Output row-vector of sums of all the entries in each column of mtx_b. Data type supported: S32
- * @param[in] info Kernel metadata:
- * - k Number of matrix columns/rows depending on the type of reduction.
- * - is_reshaped True if the matrix has been reshaped.
- * - scalar Scalar value to multiply each reduced column/row by.
- * - mul_byscalar True if each reduced column/row must be multiplied by a scalar value.
- */
- void configure(const CLCompileContext &compile_context, const ICLTensor *mtx_b, ICLTensor *vector_sum_col, const GEMMLowpReductionKernelInfo &info) override;
- /** Static function to check if given info will lead to a valid configuration of @ref CLGEMMLowpMatrixBReductionKernel
- *
- * @param[in] mtx_b Input tensor. Data type supported: QASYMM8/QASYMM8_SIGNED
- * @param[in] vector_sum_col Output row-vector of sums of all the entries in each column of mtx_b. Data type supported: S32
- * @param[in] info Kernel metadata:
- * - k Number of matrix columns/rows depending on the type of reduction.
- * - is_reshaped True if the matrix has been reshaped.
- * - scalar Scalar value to multiply each reduced column/row by.
- * - mul_byscalar True if each reduced column/row must be multiplied by a scalar value.
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *mtx_b, const ITensorInfo *vector_sum_col, const GEMMLowpReductionKernelInfo &info);
-
- // Inherited methods overridden:
- void run(const Window &window, cl::CommandQueue &queue) override;
-};
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_CLGEMMLOWREDUCTIONKERNEL_H */
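Both reductions above are plain sums along one axis, optionally scaled. A scalar model of the Matrix A row-sum case is sketched below; reduce_rows is a hypothetical helper whose arguments mirror the GEMMLowpReductionKernelInfo fields documented above (the QASYMM8_SIGNED variant is analogous with int8_t input).

#include <cstdint>
#include <vector>

// Reference row-sum reduction for an M x k QASYMM8 matrix A, widened to S32.
std::vector<int32_t> reduce_rows(const std::vector<uint8_t> &mtx_a, int M, int k,
                                 int32_t scalar, bool mul_by_scalar)
{
    std::vector<int32_t> vector_sum_row(M, 0);
    for(int m = 0; m < M; ++m)
    {
        for(int x = 0; x < k; ++x)
        {
            vector_sum_row[m] += static_cast<int32_t>(mtx_a[m * k + x]);
        }
        if(mul_by_scalar) // the documented mul_byscalar/scalar pair
        {
            vector_sum_row[m] *= scalar;
        }
    }
    return vector_sum_row;
}

These per-row (and, for Matrix B, per-column) sums feed the offset-contribution stage of the low-precision GEMM described in the gemmlowp document linked above.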
diff --git a/arm_compute/core/CL/kernels/CLGEMMMatrixAccumulateBiasesKernel.h b/arm_compute/core/CL/kernels/CLGEMMMatrixAccumulateBiasesKernel.h
deleted file mode 100644
index df2f6f4ad1..0000000000
--- a/arm_compute/core/CL/kernels/CLGEMMMatrixAccumulateBiasesKernel.h
+++ /dev/null
@@ -1,78 +0,0 @@
-/*
- * Copyright (c) 2017-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CLGEMMMATRIXACCUMULATEBIASESKERNEL_H
-#define ARM_COMPUTE_CLGEMMMATRIXACCUMULATEBIASESKERNEL_H
-
-#include "arm_compute/core/CL/ICLKernel.h"
-
-namespace arm_compute
-{
-/** Interface to add a bias to each row of the input tensor
- *
- */
-class CLGEMMMatrixAccumulateBiasesKernel : public ICLKernel
-{
-public:
- /** Default constructor */
- CLGEMMMatrixAccumulateBiasesKernel();
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- CLGEMMMatrixAccumulateBiasesKernel(const CLGEMMMatrixAccumulateBiasesKernel &) = delete;
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- CLGEMMMatrixAccumulateBiasesKernel &operator=(const CLGEMMMatrixAccumulateBiasesKernel &) = delete;
- /** Allow instances of this class to be moved */
- CLGEMMMatrixAccumulateBiasesKernel(CLGEMMMatrixAccumulateBiasesKernel &&) = default;
- /** Allow instances of this class to be moved */
- CLGEMMMatrixAccumulateBiasesKernel &operator=(CLGEMMMatrixAccumulateBiasesKernel &&) = default;
- /** Set the accumulate buffer and the biases of the kernel.
- *
- * @param[in, out] accum The accumulate tensor to add the biases to. Data types supported: F16/F32
- * @param[in] biases The shared biases tensor to append. It must be a 1D tensor. Data types supported: Same as @p accum
- */
- void configure(ICLTensor *accum, const ICLTensor *biases);
- /** Set the accumulate buffer and the biases of the kernel.
- *
- * @param[in] compile_context The compile context to be used.
- * @param[in, out] accum The accumulate tensor to add the biases to. Data types supported: F16/F32
- * @param[in] biases The shared biases tensor to append. It must be a 1D tensor. Data types supported: Same as @p accum
- */
- void configure(const CLCompileContext &compile_context, ICLTensor *accum, const ICLTensor *biases);
- /** Static function to check if given info will lead to a valid configuration of @ref CLGEMMMatrixAccumulateBiasesKernel
- *
- * @param[in] accum The accumulate tensor to add the biases to. Data types supported: F16/F32
- * @param[in] biases The shared biases tensor to append. It must be a 1D tensor. Data types supported: Same as @p accum
- * @param[in] gpu_target GPU target
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *accum, const ITensorInfo *biases, GPUTarget gpu_target);
-
- // Inherited methods overridden:
- void run(const Window &window, cl::CommandQueue &queue) override;
-
-private:
- ICLTensor *_accum;
- const ICLTensor *_biases;
-};
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_CLGEMMMATRIXACCUMULATEBIASESKERNEL_H */
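Functionally the kernel is a broadcast addition of the shared 1D bias over every row of the accumulator. A minimal host-side model, assuming a dense row-major M x N accumulator (the helper is illustrative, not the OpenCL source):

#include <vector>

// Add a shared 1D bias (length N) to each row of an M x N accumulator, in place.
void accumulate_biases(std::vector<float> &accum, const std::vector<float> &biases,
                       int M, int N)
{
    for(int m = 0; m < M; ++m)
    {
        for(int n = 0; n < N; ++n)
        {
            accum[m * N + n] += biases[n];
        }
    }
}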
diff --git a/arm_compute/core/CL/kernels/CLGEMMMatrixMultiplyKernel.h b/arm_compute/core/CL/kernels/CLGEMMMatrixMultiplyKernel.h
deleted file mode 100644
index 6085b34bcb..0000000000
--- a/arm_compute/core/CL/kernels/CLGEMMMatrixMultiplyKernel.h
+++ /dev/null
@@ -1,122 +0,0 @@
-/*
- * Copyright (c) 2017-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CLGEMMMATRIXMULTIPLYKERNEL_H
-#define ARM_COMPUTE_CLGEMMMATRIXMULTIPLYKERNEL_H
-
-#include "arm_compute/core/CL/ICLKernel.h"
-
-namespace arm_compute
-{
-class ICLTensor;
-
-/** OpenCL kernel to multiply two input matrices "A" and "B" and add a matrix "C" if provided. All elements of the output matrix are multiplied by alpha. If matrix C is passed, it is added to the previous result.
- * For the matrix C, the broadcast addition is supported if the flag "broadcast_bias" is set in the GEMMReshapeInfo object
- *
- * @note If the input tensors @p input0 and @p input1 have been reshaped respectively with @ref CLGEMMReshapeLHSMatrixKernel and @ref CLGEMMReshapeRHSMatrixKernel,
- * the flag @p is_interleaved_transposed must be set to true
- *
- * @attention @p input1 tensor must have at least 2 dimensions (matrix)
- *
- */
-class CLGEMMMatrixMultiplyKernel : public ICLKernel
-{
-public:
- /** Default constructor */
- CLGEMMMatrixMultiplyKernel();
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- CLGEMMMatrixMultiplyKernel(const CLGEMMMatrixMultiplyKernel &) = delete;
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- CLGEMMMatrixMultiplyKernel &operator=(const CLGEMMMatrixMultiplyKernel &) = delete;
- /** Allow instances of this class to be moved */
- CLGEMMMatrixMultiplyKernel(CLGEMMMatrixMultiplyKernel &&) = default;
- /** Allow instances of this class to be moved */
- CLGEMMMatrixMultiplyKernel &operator=(CLGEMMMatrixMultiplyKernel &&) = default;
- /** Initialise the kernel's input, output and alpha
- *
- * @param[in] input0 Input tensor containing the Matrix A. Data types supported: F16/F32
- * @param[in] input1 Input tensor containing the Matrix B. Data type supported: same as @p input0
- * @param[in] input2 Input tensor containing the Matrix C (bias). Can be nullptr. Data type supported: same as @p input0
- * @param[out] output Output tensor to store the result of matrix multiplication. Data type supported: same as @p input0
- * @param[in] alpha Weight of the matrix product
- * @param[in] beta (Optional) Weight of matrix C. Default value is 0; when a matrix C is provided, only beta = 1 is currently supported.
- * @param[in] is_interleaved_transposed (Optional) True if input0 and input1 have been reshaped respectively using @ref CLGEMMReshapeLHSMatrixKernel and @ref CLGEMMReshapeRHSMatrixKernel
- * @param[in] reshape_info (Optional) GEMM reshape info. If is_interleaved_transposed = true, this object must contain the information to understand how the matrix A and matrix B have been reshaped
- * @param[in] fp_mixed_precision (Optional) Use wider accumulators (32 bit instead of 16 for FP16) to improve accuracy
- * @param[in] activation_info (Optional) Activation to apply after the matrix multiplication
- *
- */
- void configure(const ICLTensor *input0, const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output, float alpha, float beta = 0.f,
- bool is_interleaved_transposed = true, const GEMMReshapeInfo &reshape_info = GEMMReshapeInfo(), bool fp_mixed_precision = false, const ActivationLayerInfo &activation_info = ActivationLayerInfo());
- /** Initialise the kernel's input, output and alpha
- *
- * @param[in] compile_context The compile context to be used.
- * @param[in] input0 Input tensor containing the Matrix A. Data types supported: F16/F32
- * @param[in] input1 Input tensor containing the Matrix B. Data type supported: same as @p input0
- * @param[in] input2 Input tensor containing the Matrix C (bias). Can be nullptr. Data type supported: same as @p input0
- * @param[out] output Output tensor to store the result of matrix multiplication. Data type supported: same as @p input0
- * @param[in] alpha Weight of the matrix product
- * @param[in] beta (Optional) Weight of matrix C. Default value is 0; when a matrix C is provided, only beta = 1 is currently supported.
- * @param[in] is_interleaved_transposed (Optional) True if input0 and input1 have been reshaped respectively using @ref CLGEMMReshapeLHSMatrixKernel and @ref CLGEMMReshapeRHSMatrixKernel
- * @param[in] reshape_info (Optional) GEMM reshape info. If is_interleaved_transposed = true, this object must contain the information to understand how the matrix A and matrix B have been reshaped
- * @param[in] fp_mixed_precision (Optional) Use wider accumulators (32 bit instead of 16 for FP16) to improve accuracy
- * @param[in] activation_info (Optional) Activation to apply after the matrix multiplication
- *
- */
- void configure(const CLCompileContext &compile_context, const ICLTensor *input0, const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output, float alpha, float beta = 0.f,
- bool is_interleaved_transposed = true, const GEMMReshapeInfo &reshape_info = GEMMReshapeInfo(), bool fp_mixed_precision = false, const ActivationLayerInfo &activation_info = ActivationLayerInfo());
- /** Static function to check if given info will lead to a valid configuration of @ref CLGEMMMatrixMultiplyKernel
- *
- * @param[in] input0 Input tensor containing the Matrix A info. Data types supported: F16/F32
- * @param[in] input1 Input tensor containing the Matrix B info. Data type supported: same as @p input0
- * @param[in] input2 Input tensor containing the Matrix C (bias) info. Can be nullptr. Data type supported: same as @p input0
- * @param[in] output Output tensor to store the result of matrix multiplication. Data type supported: same as @p input0
- * @param[in] alpha Weight of the matrix product
- * @param[in] beta Weight of matrix C. Only beta = 1 is currently supported when a matrix C is provided.
- * @param[in] is_interleaved_transposed True if input0 and input1 have been reshaped respectively using @ref CLGEMMReshapeLHSMatrixKernel and @ref CLGEMMReshapeRHSMatrixKernel
- * @param[in] reshape_info GEMM reshape info. If is_interleaved_transposed = true, this object must contain the information to understand how the matrix A and matrix B have been reshaped
- * @param[in] gpu_target GPU Target
- * @param[in] fp_mixed_precision (Optional) Use wider accumulators (32 bit instead of 16 for FP16) to improve accuracy
- * @param[in] activation_info (Optional) Activation to apply after the matrix multiplication
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *input0, const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, float alpha, float beta,
- bool is_interleaved_transposed, const GEMMReshapeInfo &reshape_info, GPUTarget gpu_target, bool fp_mixed_precision = false, const ActivationLayerInfo &activation_info = ActivationLayerInfo());
-
- // Inherited methods overridden:
- void run(const Window &window, cl::CommandQueue &queue) override;
-
-public:
- const ICLTensor *_input0;
- const ICLTensor *_input1;
- const ICLTensor *_input2;
- ICLTensor *_output;
- bool _slide_matrix_b;
- bool _reinterpret_input_as_3d;
- bool _reinterpret_output_as_3d;
- bool _add_bias;
- bool _broadcast_bias;
-};
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_CLGEMMMATRIXMULTIPLYKERNEL_H */
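Stripped of reshaping and device details, the computation documented above is the classic D = alpha * (A * B) + beta * C. The scalar reference below models broadcast_bias as C being a single 1 x N row added to every output row, per the GEMMReshapeInfo note; it is a sketch, not the kernel's implementation.

#include <vector>

// Reference GEMM: D(MxN) = alpha * A(MxK) * B(KxN) + beta * C.
void gemm_reference(const std::vector<float> &A, const std::vector<float> &B,
                    const std::vector<float> *C, std::vector<float> &D,
                    int M, int N, int K, float alpha, float beta, bool broadcast_bias)
{
    for(int m = 0; m < M; ++m)
    {
        for(int n = 0; n < N; ++n)
        {
            float acc = 0.f;
            for(int k = 0; k < K; ++k)
            {
                acc += A[m * K + k] * B[k * N + n];
            }
            acc *= alpha;
            if(C != nullptr) // bias is optional, mirroring the nullable input2
            {
                acc += beta * (*C)[broadcast_bias ? n : m * N + n];
            }
            D[m * N + n] = acc;
        }
    }
}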
diff --git a/arm_compute/core/CL/kernels/CLGEMMMatrixMultiplyNativeKernel.h b/arm_compute/core/CL/kernels/CLGEMMMatrixMultiplyNativeKernel.h
deleted file mode 100644
index c711a3d1f9..0000000000
--- a/arm_compute/core/CL/kernels/CLGEMMMatrixMultiplyNativeKernel.h
+++ /dev/null
@@ -1,127 +0,0 @@
-/*
- * Copyright (c) 2019-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CLGEMMMATRIXMULTIPLYNATIVEKERNEL_H
-#define ARM_COMPUTE_CLGEMMMATRIXMULTIPLYNATIVEKERNEL_H
-
-#include "arm_compute/core/CL/ICLKernel.h"
-
-#include "arm_compute/core/KernelDescriptors.h"
-
-namespace arm_compute
-{
-class ICLTensor;
-
-/** OpenCL kernel to multiply matrices when neither of the input matrices has been reshaped */
-class CLGEMMMatrixMultiplyNativeKernel : public ICLKernel
-{
-public:
- /** Default Constructor */
- CLGEMMMatrixMultiplyNativeKernel();
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- CLGEMMMatrixMultiplyNativeKernel(const CLGEMMMatrixMultiplyNativeKernel &) = delete;
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- CLGEMMMatrixMultiplyNativeKernel &operator=(const CLGEMMMatrixMultiplyNativeKernel &) = delete;
- /** Allow instances of this class to be moved */
- CLGEMMMatrixMultiplyNativeKernel(CLGEMMMatrixMultiplyNativeKernel &&) = default;
- /** Allow instances of this class to be moved */
- CLGEMMMatrixMultiplyNativeKernel &operator=(CLGEMMMatrixMultiplyNativeKernel &&) = default;
- /** Initialise the kernel's input and output.
- *
- * @param[in] input0 Input tensor for the LHS matrix. Data type supported: F32. The number of dimensions for the LHS matrix must be less than or equal to 4.
- * @param[in] input1 Input tensor for the RHS matrix. Data type supported: same as @p input0. The number of dimensions for the RHS matrix must be less than or equal to 3.
- * @param[in] input2 Input tensor containing the bias matrix. Data type supported: same as @p input0.
- * @param[out] output Output tensor info. Data type supported: same as @p input0
- * @param[in] alpha Weight of the matrix product
- * @param[in] beta Weight of the matrix bias
- * @param[in] lhs_info LHS matrix information used to retrieve the number of rows and accumulations to be processed by each thread. Only the following values are supported:
- * lhs_info.m0: 1,2,3,4,5,6,7,8
- * lhs_info.k0: 2,3,4,8,16
- * @param[in] rhs_info RHS matrix information used to retrieve the number of columns and accumulations to be processed by each thread. Only the following values are supported:
- * rhs_info.n0: 2,3,4,8,16
- * rhs_info.k0: same as lhs_info.k0
- * @param[in] gemm_info GEMM information used to retrieve the original dimensions of the input matrices
- */
- void configure(const ICLTensor *input0, const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output, float alpha, float beta, const GEMMLHSMatrixInfo &lhs_info,
- const GEMMRHSMatrixInfo &rhs_info,
- const GEMMKernelInfo &gemm_info);
- /** Initialise the kernel's input and output.
- *
- * @param[in] compile_context The compile context to be used.
- * @param[in] input0 Input tensor for the LHS matrix. Data type supported: F32. The number of dimensions for the LHS matrix must be less than or equal to 4.
- * @param[in] input1 Input tensor for the RHS matrix. Data type supported: same as @p input0. The number of dimensions for the RHS matrix must be less than or equal to 3.
- * @param[in] input2 Input tensor containing the bias matrix. Data type supported: same as @p input0.
- * @param[out] output Output tensor info. Data type supported: same as @p input0
- * @param[in] alpha Weight of the matrix product
- * @param[in] beta Weight of the matrix bias
- * @param[in] lhs_info LHS matrix information used to retrieve the number of rows and accumulations to be processed by each thread. Only the following values are supported:
- * lhs_info.m0: 1,2,3,4,5,6,7,8
- * lhs_info.k0: 2,3,4,8,16
- * @param[in] rhs_info RHS matrix information used to retrieve the number of columns and accumulations to be processed by each thread. Only the following values are supported:
- * rhs_info.n0: 2,3,4,8,16
- * rhs_info.k0: same as lhs_info.k0
- * @param[in] gemm_info GEMM information used to retrieve the original dimensions of the input matrices
- */
- void configure(const CLCompileContext &compile_context, const ICLTensor *input0, const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output, float alpha, float beta,
- const GEMMLHSMatrixInfo &lhs_info,
- const GEMMRHSMatrixInfo &rhs_info,
- const GEMMKernelInfo &gemm_info);
- /** Static function to check if given info will lead to a valid configuration of @ref CLGEMMMatrixMultiplyNativeKernel
- *
- * @param[in] input0 Input tensor info for the LHS matrix. Data type supported: F32. The number of dimensions for the LHS matrix must be less than or equal to 4.
- * @param[in] input1 Input tensor info for the RHS matrix. Data type supported: same as @p input0. The number of dimensions for the RHS matrix must be less than or equal to 3.
- * @param[in] input2 Input tensor info containing the bias matrix. Data type supported: same as @p input0.
- * @param[in] output Output tensor info. Data type supported: same as @p input0
- * @param[in] alpha Weight of the matrix product
- * @param[in] beta Weight of the matrix bias
- * @param[in] lhs_info LHS matrix information used to retrieve the number of rows and accumulations to be processed by each thread. Only the following values are supported:
- * lhs_info.m0: 1,2,3,4,5,6,7,8
- * lhs_info.k0: 2,3,4,8,16
- * @param[in] rhs_info RHS matrix information used to retrieve the number of columns and accumulations to be processed by each thread. Only the following values are supported:
- * rhs_info.n0: 2,3,4,8,16
- * rhs_info.k0: same as lhs_info.k0
- * @param[in] gemm_info GEMM information used to retrieve the original dimensions of the input matrices
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *input0, const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, float alpha, float beta, const GEMMLHSMatrixInfo &lhs_info,
- const GEMMRHSMatrixInfo &rhs_info,
- const GEMMKernelInfo &gemm_info);
-
- // Inherited methods overridden:
- void run(const Window &window, cl::CommandQueue &queue) override;
-
-private:
- const ICLTensor *_input0;
- const ICLTensor *_input1;
- const ICLTensor *_input2;
- ICLTensor *_output;
- bool _slide_matrix_b;
- bool _reinterpret_input_as_3d;
- bool _reinterpret_output_as_3d;
- bool _use_dummy_work_items;
- bool _add_bias;
- bool _broadcast_bias;
-};
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_CLGEMMMATRIXMULTIPLYNATIVEKERNEL_H*/
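The supported m0/k0/n0 values above form closed sets. The helper below mirrors those documented constraints as plain C++; it is a sketch of the checks validate() must perform, not its actual implementation.

#include <algorithm>
#include <initializer_list>

// Documented block-size constraints for the native kernel:
// m0 in 1..8, k0 and n0 in {2,3,4,8,16}, lhs_info.k0 == rhs_info.k0.
bool native_block_sizes_valid(int m0, int lhs_k0, int n0, int rhs_k0)
{
    const auto contains = [](int v, std::initializer_list<int> set)
    {
        return std::find(set.begin(), set.end(), v) != set.end();
    };
    return m0 >= 1 && m0 <= 8
        && contains(lhs_k0, {2, 3, 4, 8, 16})
        && contains(n0, {2, 3, 4, 8, 16})
        && lhs_k0 == rhs_k0;
}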
diff --git a/arm_compute/core/CL/kernels/CLGEMMMatrixMultiplyReshapedKernel.h b/arm_compute/core/CL/kernels/CLGEMMMatrixMultiplyReshapedKernel.h
deleted file mode 100644
index ee8e57fa8c..0000000000
--- a/arm_compute/core/CL/kernels/CLGEMMMatrixMultiplyReshapedKernel.h
+++ /dev/null
@@ -1,150 +0,0 @@
-/*
- * Copyright (c) 2018-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CLGEMMMATRIXMULTIPLYRESHAPEDKERNEL_H
-#define ARM_COMPUTE_CLGEMMMATRIXMULTIPLYRESHAPEDKERNEL_H
-
-#include "arm_compute/core/CL/ICLKernel.h"
-
-#include "arm_compute/core/KernelDescriptors.h"
-
-namespace arm_compute
-{
-class ICLTensor;
-
-/** OpenCL kernel to multiply matrices when both the input matrices LHS (input0) and RHS (input1) have been reshaped
- *
- * @note The input matrices @p input0 and @p input1 must be reshaped through @ref CLGEMMReshapeLHSMatrixKernel and @ref CLGEMMReshapeRHSMatrixKernel
- */
-class CLGEMMMatrixMultiplyReshapedKernel : public ICLKernel
-{
-public:
- /** Default Constructor */
- CLGEMMMatrixMultiplyReshapedKernel();
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- CLGEMMMatrixMultiplyReshapedKernel(const CLGEMMMatrixMultiplyReshapedKernel &) = delete;
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- CLGEMMMatrixMultiplyReshapedKernel &operator=(const CLGEMMMatrixMultiplyReshapedKernel &) = delete;
- /** Allow instances of this class to be moved */
- CLGEMMMatrixMultiplyReshapedKernel(CLGEMMMatrixMultiplyReshapedKernel &&) = default;
- /** Allow instances of this class to be moved */
- CLGEMMMatrixMultiplyReshapedKernel &operator=(CLGEMMMatrixMultiplyReshapedKernel &&) = default;
- /** Initialise the kernel's input and output.
- *
- * @note The F16 computation also supports mixed precision through the gemm_info.fp_mixed_precision flag.
- * Mixed precision combines different floating precisions during the computation, in particular, F32 for the accumulations and F16 for the
- * multiplications, i.e. float c = (half)a * (half)b
- *
- * @param[in] input0 Input tensor containing the LHS reshaped matrix. Data type supported: F16/F32. The number of dimensions for the LHS matrix must be less than or equal to 4
- * @param[in] input1 Input tensor containing the RHS reshaped matrix. Data type supported: same as @p input0. The number of dimensions for the RHS matrix must be less than or equal to 3
- * @param[in] input2 Input tensor containing the bias matrix. Data type supported: same as @p input0.
- * @param[out] output Output tensor to store the result of matrix multiplication. Data type supported: same as @p input0
- * @param[in] alpha Weight of the matrix product
- * @param[in] beta Weight of the matrix bias
- * @param[in] lhs_info LHS matrix information used for reshaping the input0 tensor. Only the following values are supported:
- * lhs_info.m0: 2,3,4,5,6,7,8
- * lhs_info.k0: 2,3,4,8,16
- * lhs_info.transpose: false
- * @param[in] rhs_info RHS matrix information used for reshaping the input1 tensor. Only the following values are supported:
- * rhs_info.n0: 2,3,4,8,16
- * rhs_info.k0: 2,3,4,8,16
- * rhs_info.transpose: true
- * @param[in] gemm_info GEMM information used to retrieve the original dimensions of the input matrices
- *
- * @note lhs_info.k0 must be equal to rhs_info.k0
- */
- void configure(const ICLTensor *input0, const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output, float alpha, float beta, const GEMMLHSMatrixInfo &lhs_info,
- const GEMMRHSMatrixInfo &rhs_info,
- const GEMMKernelInfo &gemm_info);
- /** Initialise the kernel's input and output.
- *
- * @note The F16 computation also supports mixed precision through the gemm_info.fp_mixed_precision flag.
- * Mixed precision combines different floating precisions during the computation, in particular, F32 for the accumulations and F16 for the
- * multiplications, i.e. float c = (half)a * (half)b
- *
- * @param[in] compile_context The compile context to be used.
- * @param[in] input0 Input tensor containing the LHS reshaped matrix. Data type supported: F16/F32. The number of dimensions for the LHS matrix must be less than or equal to 4
- * @param[in] input1 Input tensor containing the RHS reshaped matrix. Data type supported: same as @p input0. The number of dimensions for the RHS matrix must be less than or equal to 3
- * @param[in] input2 Input tensor containing the bias matrix. Data type supported: same as @p input0.
- * @param[out] output Output tensor to store the result of matrix multiplication. Data type supported: same as @p input0
- * @param[in] alpha Weight of the matrix product
- * @param[in] beta Weight of the matrix bias
- * @param[in] lhs_info LHS matrix information used for reshaping the input0 tensor. Only the following values are supported:
- * lhs_info.m0: 2,3,4,5,6,7,8
- * lhs_info.k0: 2,3,4,8,16
- * lhs_info.transpose: false
- * @param[in] rhs_info RHS matrix information used for reshaping the input1 tensor. Only the following values are supported:
- * rhs_info.n0: 2,3,4,8,16
- * rhs_info.k0: 2,3,4,8,16
- * rhs_info.transpose: true
- * @param[in] gemm_info GEMM information used to retrieve the original dimensions of the input matrices
- *
- * @note lhs_info.k0 must be equal to rhs_info.k0
- */
- void configure(const CLCompileContext &compile_context, const ICLTensor *input0, const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output, float alpha, float beta,
- const GEMMLHSMatrixInfo &lhs_info,
- const GEMMRHSMatrixInfo &rhs_info,
- const GEMMKernelInfo &gemm_info);
- /** Static function to check if given info will lead to a valid configuration of @ref CLGEMMMatrixMultiplyReshapedKernel
- *
- * @param[in] input0 Input tensor info containing the LHS reshaped matrix. Data type supported: F16/F32. The number of dimensions for the LHS matrix must be less than or equal to 4
- * @param[in] input1 Input tensor info containing the RHS reshaped matrix. Data type supported: same as @p input0. The number of dimensions for the RHS matrix must be less than or equal to 3
- * @param[in] input2 Input tensor info containing the bias matrix. Data type supported: same as @p input0.
- * @param[in] output Output tensor to store the result of matrix multiplication. Data type supported: same as @p input0
- * @param[in] alpha Weight of the matrix product
- * @param[in] beta Weight of the matrix bias
- * @param[in] lhs_info LHS matrix information used for reshaping the input0 tensor. Only the following values are supported:
- * lhs_info.m0: 2,3,4,5,6,7,8
- * lhs_info.k0: 2,3,4,8,16
- * lhs_info.transpose: false
- * @param[in] rhs_info RHS matrix information used for reshaping the input1 tensor. Only the following values are supported:
- * rhs_info.n0: 2,3,4,8,16
- * rhs_info.k0: 2,3,4,8,16
- * rhs_info.transpose: true
- * @param[in] gemm_info GEMM information used to retrieve the original dimensions of the input matrices
- *
- * @note lhs_info.k0 must be equal to rhs_info.k0
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *input0, const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, float alpha, float beta, const GEMMLHSMatrixInfo &lhs_info,
- const GEMMRHSMatrixInfo &rhs_info,
- const GEMMKernelInfo &gemm_info);
-
- // Inherited methods overridden:
- void run(const Window &window, cl::CommandQueue &queue) override;
-
-private:
- const ICLTensor *_input0;
- const ICLTensor *_input1;
- const ICLTensor *_input2;
- ICLTensor *_output;
- bool _slide_matrix_b;
- bool _reinterpret_output_as_3d;
- unsigned int _k;
- bool _use_dummy_work_items;
- bool _add_bias;
- bool _broadcast_bias;
-};
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_CLGEMMMATRIXMULTIPLYRESHAPEDKERNEL_H*/ \ No newline at end of file
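The fp_mixed_precision note (F16 multiplications, F32 accumulations) corresponds to the host-side pattern below. __fp16 is assumed to be available as a compiler extension (e.g. GCC/Clang targeting AArch64); both that toolchain assumption and the helper name are illustrative.

// Mixed-precision dot product: F16 operands, F32 accumulator, i.e. the
// "float c = (half)a * (half)b" pattern from the note above.
float dot_mixed_precision(const __fp16 *a, const __fp16 *b, int k)
{
    float acc = 0.f; // accumulating in F32 limits rounding error across k terms
    for(int i = 0; i < k; ++i)
    {
        acc += static_cast<float>(a[i]) * static_cast<float>(b[i]);
    }
    return acc;
}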
diff --git a/arm_compute/core/CL/kernels/CLGEMMMatrixMultiplyReshapedOnlyRHSKernel.h b/arm_compute/core/CL/kernels/CLGEMMMatrixMultiplyReshapedOnlyRHSKernel.h
deleted file mode 100644
index f7d314a039..0000000000
--- a/arm_compute/core/CL/kernels/CLGEMMMatrixMultiplyReshapedOnlyRHSKernel.h
+++ /dev/null
@@ -1,130 +0,0 @@
-/*
- * Copyright (c) 2019-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CLGEMMMATRIXMULTIPLYRESHAPEDONLYRHSKERNEL_H
-#define ARM_COMPUTE_CLGEMMMATRIXMULTIPLYRESHAPEDONLYRHSKERNEL_H
-
-#include "arm_compute/core/CL/ICLKernel.h"
-
-#include "arm_compute/core/KernelDescriptors.h"
-
-namespace arm_compute
-{
-class ICLTensor;
-
-/** OpenCL kernel to multiply matrices when only the input matrix RHS (input1) has been reshaped
- *
- * @note The input matrix input1 must be reshaped through @ref CLGEMMReshapeRHSMatrixKernel
- */
-class CLGEMMMatrixMultiplyReshapedOnlyRHSKernel : public ICLKernel
-{
-public:
- /** Default Constructor */
- CLGEMMMatrixMultiplyReshapedOnlyRHSKernel();
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- CLGEMMMatrixMultiplyReshapedOnlyRHSKernel(const CLGEMMMatrixMultiplyReshapedOnlyRHSKernel &) = delete;
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- CLGEMMMatrixMultiplyReshapedOnlyRHSKernel &operator=(const CLGEMMMatrixMultiplyReshapedOnlyRHSKernel &) = delete;
- /** Allow instances of this class to be moved */
- CLGEMMMatrixMultiplyReshapedOnlyRHSKernel(CLGEMMMatrixMultiplyReshapedOnlyRHSKernel &&) = default;
- /** Allow instances of this class to be moved */
- CLGEMMMatrixMultiplyReshapedOnlyRHSKernel &operator=(CLGEMMMatrixMultiplyReshapedOnlyRHSKernel &&) = default;
- /** Initialise the kernel's input and output.
- *
- * @param[in] input0 Input tensor containing the LHS matrix. Data type supported: F16/F32. The number of dimensions for the LHS matrix must be less than or equal to 4.
- * @param[in] input1 Input tensor containing the RHS reshaped matrix. Data type supported: same as @p input0. The number of dimensions for the RHS matrix must be less than or equal to 3.
- * @param[in] input2 Input tensor containing the bias matrix. Data type supported: same as @p input0.
- * @param[out] output Output tensor to store the result of matrix multiplication. Data type supported: same as @p input0
- * @param[in] alpha Weight of the matrix product
- * @param[in] beta Weight of the matrix bias
- * @param[in] lhs_info LHS matrix information used to retrieve the number of rows to be processed by each thread. Only the following values are supported:
- * lhs_info.m0: 1,2,3,4,5,6,7,8
- * @param[in] rhs_info RHS matrix information used for reshaping the input1 tensor. Only the following values are supported:
- * rhs_info.k0: 2,3,4,8,16
- * rhs_info.n0: 2,3,4,8,16
- * rhs_info.transpose: true,false
- * @param[in] gemm_info GEMM information used to retrieve the original dimensions of the input matrices
- */
- void configure(const ICLTensor *input0, const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output, float alpha, float beta, const GEMMLHSMatrixInfo &lhs_info,
- const GEMMRHSMatrixInfo &rhs_info,
- const GEMMKernelInfo &gemm_info);
- /** Initialise the kernel's input and output.
- *
- * @param[in] compile_context The compile context to be used.
- * @param[in] input0 Input tensor containing the LHS matrix. Data type supported: F16/F32. The number of dimensions for the LHS matrix must be less than or equal to 4.
- * @param[in] input1 Input tensor containing the RHS reshaped matrix. Data type supported: same as @p input0. The number of dimensions for the RHS matrix must be less than or equal to 3.
- * @param[in] input2 Input tensor containing the bias matrix. Data type supported: same as @p input0.
- * @param[out] output Output tensor to store the result of matrix multiplication. Data type supported: same as @p input0
- * @param[in] alpha Weight of the matrix product
- * @param[in] beta Weight of the matrix bias
- * @param[in] lhs_info LHS matrix information used to retrieve the number of rows to be processed by each thread. Only the following values are supported:
- * lhs_info.m0: 1,2,3,4,5,6,7,8
- * @param[in] rhs_info RHS matrix information used for reshaping the input1 tensor. Only the following values are supported:
- * rhs_info.k0: 2,3,4,8,16
- * rhs_info.n0: 2,3,4,8,16
- * rhs_info.transpose: true,false
- * @param[in] gemm_info GEMM information used to retrieve the original dimensions of the input matrices
- */
- void configure(const CLCompileContext &compile_context, const ICLTensor *input0, const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output, float alpha, float beta,
- const GEMMLHSMatrixInfo &lhs_info,
- const GEMMRHSMatrixInfo &rhs_info,
- const GEMMKernelInfo &gemm_info);
- /** Static function to check if given info will lead to a valid configuration of @ref CLGEMMMatrixMultiplyReshapedOnlyRHSKernel
- *
- * @param[in] input0 Input tensor info for the LHS matrix. Data type supported: F16/F32. The number of dimensions for the LHS matrix must be less than or equal to 4.
- * @param[in] input1 Input tensor info for the RHS reshaped matrix. Data type supported: same as @p input0. The number of dimensions for the RHS matrix must be less than or equal to 3.
- * @param[in] input2 Input tensor info containing the bias matrix. Data type supported: same as @p input0.
- * @param[in] output Output tensor info. Data type supported: same as @p input0
- * @param[in] alpha Weight of the matrix product
- * @param[in] beta Weight of the matrix bias
- * @param[in] lhs_info LHS matrix information used to retrieve the number of rows to be processed by each thread. Only the following values are supported:
- * lhs_info.m0: 1,2,3,4,5,6,7,8
- * @param[in] rhs_info RHS matrix information used for reshaping the input1 tensor. Only the following values are supported:
- * rhs_info.k0: 2,3,4,8,16
- * rhs_info.n0: 2,3,4,8,16
- * rhs_info.transpose: true,false
- * @param[in] gemm_info GEMM information used to retrieve the original dimensions of the input matrices
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *input0, const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, float alpha, float beta, const GEMMLHSMatrixInfo &lhs_info,
- const GEMMRHSMatrixInfo &rhs_info,
- const GEMMKernelInfo &gemm_info);
-
- // Inherited methods overridden:
- void run(const Window &window, cl::CommandQueue &queue) override;
-
-private:
- const ICLTensor *_input0;
- const ICLTensor *_input1;
- const ICLTensor *_input2;
- ICLTensor *_output;
- bool _slide_matrix_b;
- bool _reinterpret_input_as_3d;
- bool _reinterpret_output_as_3d;
- bool _use_dummy_work_items;
- bool _add_bias;
- bool _broadcast_bias;
-};
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_CLGEMMMATRIXMULTIPLYRESHAPEDONLYRHSKERNEL_H*/
diff --git a/arm_compute/core/CL/kernels/CLGEMMMatrixVectorMultiplyKernel.h b/arm_compute/core/CL/kernels/CLGEMMMatrixVectorMultiplyKernel.h
deleted file mode 100644
index 6d70b4b0c2..0000000000
--- a/arm_compute/core/CL/kernels/CLGEMMMatrixVectorMultiplyKernel.h
+++ /dev/null
@@ -1,84 +0,0 @@
-/*
- * Copyright (c) 2017-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CLGEMMMATRIXVECTORMULTIPLYKERNEL_H
-#define ARM_COMPUTE_CLGEMMMATRIXVECTORMULTIPLYKERNEL_H
-
-#include "arm_compute/core/CL/ICLKernel.h"
-
-namespace arm_compute
-{
-class ICLTensor;
-
-/** Interface for the GEMM matrix vector multiply kernel. */
-class CLGEMMMatrixVectorMultiplyKernel : public ICLKernel
-{
-public:
- /** Default constructor */
- CLGEMMMatrixVectorMultiplyKernel();
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- CLGEMMMatrixVectorMultiplyKernel(const CLGEMMMatrixVectorMultiplyKernel &) = delete;
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- CLGEMMMatrixVectorMultiplyKernel &operator=(const CLGEMMMatrixVectorMultiplyKernel &) = delete;
- /** Allow instances of this class to be moved */
- CLGEMMMatrixVectorMultiplyKernel(CLGEMMMatrixVectorMultiplyKernel &&) = default;
- /** Allow instances of this class to be moved */
- CLGEMMMatrixVectorMultiplyKernel &operator=(CLGEMMMatrixVectorMultiplyKernel &&) = default;
- /** Set the input and output of the kernel.
- *
- * @param[in] input0 The reshaped input tensor. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32
- * @param[in]  input1 The 2D reshaped weights tensor. Data type supported: Same as @p input0.
- * @param[out] output The output 2D tensor. Data types supported: Same as @p input0; S32 when @p input0 is QASYMM8/QASYMM8_SIGNED.
- */
- void configure(const ICLTensor *input0, const ICLTensor *input1, ICLTensor *output);
- /** Set the input and output of the kernel.
- *
- * @param[in] compile_context The compile context to be used.
- * @param[in] input0 The reshaped input tensor. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32
- * @param[in]  input1          The 2D reshaped weights tensor. Data type supported: Same as @p input0.
- * @param[out] output          The output 2D tensor. Data types supported: Same as @p input0; S32 when @p input0 is QASYMM8/QASYMM8_SIGNED.
- */
- void configure(const CLCompileContext &compile_context, const ICLTensor *input0, const ICLTensor *input1, ICLTensor *output);
- /** Static function to check if given info will lead to a valid configuration of @ref CLGEMMMatrixVectorMultiplyKernel
- *
- * @param[in] input0 The reshaped input tensor info. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32
- * @param[in] input1 The 2D reshaped weights tensor info. Data type supported: Same as @p input0.
- * @param[in] output The output 2D tensor info. Data types supported: Same as @p input0; S32 when @p input0 is QASYMM8/QASYMM8_SIGNED.
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *input0, const ITensorInfo *input1, const ITensorInfo *output);
-
- // Inherited methods overridden:
- void run(const Window &window, cl::CommandQueue &queue) override;
- BorderSize border_size() const override;
-
-private:
- const ICLTensor *_input0;
- const ICLTensor *_input1;
- ICLTensor *_output;
- int _num_rows_read_per_iteration;
- BorderSize _border_size;
-};
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_CLGEMMMATRIXVECTORMULTIPLYKERNEL_H */
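
A minimal configure/enqueue sketch of the kernel removed above, assuming the usual CLScheduler/CLTensor runtime setup. The placeholder shapes below are not prescribed by this header (the kernel expects the reshaped layouts produced by its callers), so validate() should be consulted for real layouts.

#include "arm_compute/core/CL/kernels/CLGEMMMatrixVectorMultiplyKernel.h"
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
#include "arm_compute/runtime/CL/CLTensor.h"

using namespace arm_compute;

void run_matrix_vector_multiply()
{
    CLScheduler::get().default_init();

    // Placeholder shapes, for illustration only
    CLTensor input0, input1, output;
    input0.allocator()->init(TensorInfo(TensorShape(64U, 16U), 1, DataType::F32));
    input1.allocator()->init(TensorInfo(TensorShape(64U, 16U), 1, DataType::F32));
    output.allocator()->init(TensorInfo(TensorShape(16U, 1U), 1, DataType::F32));

    CLGEMMMatrixVectorMultiplyKernel mv;
    mv.configure(&input0, &input1, &output);

    input0.allocator()->allocate();
    input1.allocator()->allocate();
    output.allocator()->allocate();

    CLScheduler::get().enqueue(mv);
}
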
diff --git a/arm_compute/core/CL/kernels/CLGEMMReshapeLHSMatrixKernel.h b/arm_compute/core/CL/kernels/CLGEMMReshapeLHSMatrixKernel.h
deleted file mode 100644
index fe77fcb428..0000000000
--- a/arm_compute/core/CL/kernels/CLGEMMReshapeLHSMatrixKernel.h
+++ /dev/null
@@ -1,105 +0,0 @@
-/*
- * Copyright (c) 2018-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CLGEMMRESHAPELHSMATRIXKERNEL_H
-#define ARM_COMPUTE_CLGEMMRESHAPELHSMATRIXKERNEL_H
-
-#include "arm_compute/core/CL/ICLKernel.h"
-
-namespace arm_compute
-{
-class ICLTensor;
-
-/** OpenCL kernel to reshape the LHS matrix when performing the matrix multiplication.
- * In particular, this kernel splits the input matrix in blocks of size M0xK0 (defined through GEMMLHSMatrixInfo) and
- * stores each one in the output matrix unrolling the values
- */
-class CLGEMMReshapeLHSMatrixKernel : public ICLKernel
-{
-public:
- /** Default constructor */
- CLGEMMReshapeLHSMatrixKernel();
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- CLGEMMReshapeLHSMatrixKernel(const CLGEMMReshapeLHSMatrixKernel &) = delete;
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- CLGEMMReshapeLHSMatrixKernel &operator=(const CLGEMMReshapeLHSMatrixKernel &) = delete;
- /** Allow instances of this class to be moved */
- CLGEMMReshapeLHSMatrixKernel(CLGEMMReshapeLHSMatrixKernel &&) = default;
- /** Allow instances of this class to be moved */
- CLGEMMReshapeLHSMatrixKernel &operator=(CLGEMMReshapeLHSMatrixKernel &&) = default;
- /** Initialise the kernel's input and output.
- *
- * @param[in] input Input tensor. Data types supported: All
- * @param[out] output Output tensor. Data type supported: same as @p input
- * @param[in] lhs_info LHS matrix information to be used for reshaping. This object contains all the necessary
- * information to reshape the input tensor. Only the following values are supported:
- * lhs_info.m0: 2,3,4,5,6,7,8
- * lhs_info.k0: 2,3,4,8,16
- * lhs_info.v0: greater than 0
- * lhs_info.transpose: true, false
- * lhs_info.interleave: true, false
- * @param[in] reinterpret_input_as_3d (Optional) True if the input has to be reinterpreted as 3D tensor
- */
- void configure(const ICLTensor *input, ICLTensor *output, const GEMMLHSMatrixInfo &lhs_info, bool reinterpret_input_as_3d = false);
- /** Initialise the kernel's input and output.
- *
- * @param[in] compile_context The compile context to be used.
- * @param[in] input Input tensor. Data types supported: All
- * @param[out] output Output tensor. Data type supported: same as @p input
- * @param[in] lhs_info LHS matrix information to be used for reshaping. This object contains all the necessary
- * information to reshape the input tensor. Only the following values are supported:
- * lhs_info.m0: 2,3,4,5,6,7,8
- * lhs_info.k0: 2,3,4,8,16
- * lhs_info.v0: greater than 0
- * lhs_info.transpose: true, false
- * lhs_info.interleave: true, false
- * @param[in] reinterpret_input_as_3d (Optional) True if the input has to be reinterpreted as 3D tensor
- */
- void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, const GEMMLHSMatrixInfo &lhs_info, bool reinterpret_input_as_3d = false);
- /** Static function to check if given info will lead to a valid configuration of @ref CLGEMMReshapeLHSMatrixKernel
- *
- * @param[in] input Input tensor info. Data types supported: All
- * @param[in] output Output tensor info which stores the interleaved matrix. Data type supported: same as @p input.
- * @param[in] lhs_info LHS matrix information to be used for reshaping. This object contains all the necessary
- * information to reshape the input tensor. Only the following values are supported:
- * lhs_info.m0: 2,3,4,5,6,7,8
- * lhs_info.k0: 2,3,4,8,16
- * lhs_info.v0: greater than 0
- * lhs_info.transpose: true, false
- * lhs_info.interleave: true, false
- * @param[in] reinterpret_input_as_3d True if the input has to be reinterpreted as 3D tensor
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *input, const ITensorInfo *output, const GEMMLHSMatrixInfo &lhs_info, bool reinterpret_input_as_3d);
-
- // Inherited methods overridden
- void run(const Window &window, cl::CommandQueue &queue) override;
-
-private:
- const ICLTensor *_input;
- ICLTensor *_output;
- bool _reinterpret_input_as_3d;
-};
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_CLGEMMRESHAPELHSMATRIXKERNEL_H */
\ No newline at end of file
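
As a sketch of how the lhs_info constraints above were typically exercised; deriving the output shape via misc::shape_calculator::compute_lhs_reshaped_shape is an assumption about the companion helper, not part of this header.

#include "arm_compute/core/CL/kernels/CLGEMMReshapeLHSMatrixKernel.h"
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/utils/misc/ShapeCalculator.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
#include "arm_compute/runtime/CL/CLTensor.h"

using namespace arm_compute;

void reshape_lhs()
{
    CLScheduler::get().default_init();

    GEMMLHSMatrixInfo lhs_info{};
    lhs_info.m0         = 4;     // 2..8
    lhs_info.k0         = 4;     // 2,3,4,8,16
    lhs_info.v0         = 2;     // greater than 0
    lhs_info.transpose  = false;
    lhs_info.interleave = false;

    CLTensor src, dst;
    src.allocator()->init(TensorInfo(TensorShape(8U, 16U), 1, DataType::F32)); // K=8, M=16

    // Derive the reshaped output shape from lhs_info rather than hand-computing it
    dst.allocator()->init(TensorInfo(misc::shape_calculator::compute_lhs_reshaped_shape(*src.info(), lhs_info), 1, DataType::F32));

    CLGEMMReshapeLHSMatrixKernel reshape;
    reshape.configure(&src, &dst, lhs_info, /*reinterpret_input_as_3d=*/false);

    src.allocator()->allocate();
    dst.allocator()->allocate();
    CLScheduler::get().enqueue(reshape);
}
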
diff --git a/arm_compute/core/CL/kernels/CLGEMMReshapeRHSMatrixKernel.h b/arm_compute/core/CL/kernels/CLGEMMReshapeRHSMatrixKernel.h
deleted file mode 100644
index 0e6352bdbb..0000000000
--- a/arm_compute/core/CL/kernels/CLGEMMReshapeRHSMatrixKernel.h
+++ /dev/null
@@ -1,100 +0,0 @@
-/*
- * Copyright (c) 2018-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CLGEMMRESHAPERHSMATRIXKERNEL_H
-#define ARM_COMPUTE_CLGEMMRESHAPERHSMATRIXKERNEL_H
-
-#include "arm_compute/core/CL/ICLKernel.h"
-
-namespace arm_compute
-{
-class ICLTensor;
-
-/** OpenCL kernel to reshape the RHS matrix when performing the matrix multiplication
- * In particular, this kernel splits the input matrix in blocks of size K0xN0 and stores each one in
- * the output matrix unrolling the values */
-class CLGEMMReshapeRHSMatrixKernel : public ICLKernel
-{
-public:
- /** Default constructor */
- CLGEMMReshapeRHSMatrixKernel();
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- CLGEMMReshapeRHSMatrixKernel(const CLGEMMReshapeRHSMatrixKernel &) = delete;
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- CLGEMMReshapeRHSMatrixKernel &operator=(const CLGEMMReshapeRHSMatrixKernel &) = delete;
- /** Allow instances of this class to be moved */
- CLGEMMReshapeRHSMatrixKernel(CLGEMMReshapeRHSMatrixKernel &&) = default;
- /** Allow instances of this class to be moved */
- CLGEMMReshapeRHSMatrixKernel &operator=(CLGEMMReshapeRHSMatrixKernel &&) = default;
- /** Initialise the kernel's input and output.
- *
- * @param[in] input Input tensor. Data types supported: All
- * @param[out] output Output tensor. Data type supported: same as @p input
- * @param[in] rhs_info RHS matrix information to be used for reshaping. This object contains all the necessary
- * information to reshape the input tensor. Only the following values are supported:
- * rhs_info.n0: 2,3,4,8,16
- * rhs_info.k0: 1,2,3,4,8,16 (k0 = 1 only if rhs_info.transpose = false)
- * rhs_info.h0: greater than 0
- * rhs_info.transpose: true, false
- * rhs_info.interleave: true, false
- */
- void configure(const ICLTensor *input, ICLTensor *output, const GEMMRHSMatrixInfo &rhs_info);
- /** Initialise the kernel's input and output.
- *
- * @param[in] compile_context The compile context to be used.
- * @param[in] input Input tensor. Data types supported: All
- * @param[out] output Output tensor. Data type supported: same as @p input
- * @param[in] rhs_info RHS matrix information to be used for reshaping. This object contains all the necessary
- * information to reshape the input tensor. Only the following values are supported:
- * rhs_info.n0: 2,3,4,8,16
- * rhs_info.k0: 1,2,3,4,8,16 (k0 = 1 only if rhs_info.transpose = false)
- * rhs_info.h0: greater than 0
- * rhs_info.transpose: true, false
- * rhs_info.interleave: true, false
- */
- void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, const GEMMRHSMatrixInfo &rhs_info);
- /** Static function to check if given info will lead to a valid configuration of @ref CLGEMMReshapeRHSMatrixKernel
- *
- * @param[in] input Input tensor info. Data types supported: All
- * @param[in] output Output tensor info which stores the interleaved matrix. Data type supported: same as @p input.
- * @param[in] rhs_info RHS matrix information to be used for reshaping. This object contains all the necessary
- * information to reshape the input tensor. Only the following values are supported:
- * rhs_info.n0: 2,3,4,8,16
- * rhs_info.k0: 1,2,3,4,8,16 (k0 = 1 only if rhs_info.transpose = false)
- * rhs_info.h0: greater than 0
- * rhs_info.transpose: true, false
- * rhs_info.interleave: true, false
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *input, const ITensorInfo *output, const GEMMRHSMatrixInfo &rhs_info);
-
- // Inherited methods overridden
- void run(const Window &window, cl::CommandQueue &queue) override;
-
-private:
- const ICLTensor *_input;
- ICLTensor *_output;
-};
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_CLGEMMRESHAPERHSMATRIXKERNEL_H */
\ No newline at end of file
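
The RHS counterpart follows the same pattern; as above, the misc::shape_calculator helper is an assumption used to keep the sketch self-consistent (note k0 = 1 is only legal with transpose = false, per the constraint list).

#include "arm_compute/core/CL/kernels/CLGEMMReshapeRHSMatrixKernel.h"
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/utils/misc/ShapeCalculator.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
#include "arm_compute/runtime/CL/CLTensor.h"

using namespace arm_compute;

void reshape_rhs()
{
    CLScheduler::get().default_init();

    GEMMRHSMatrixInfo rhs_info{};
    rhs_info.n0         = 4;    // 2,3,4,8,16
    rhs_info.k0         = 4;    // 1,2,3,4,8,16
    rhs_info.h0         = 4;    // greater than 0
    rhs_info.transpose  = true;
    rhs_info.interleave = true;

    CLTensor src, dst;
    src.allocator()->init(TensorInfo(TensorShape(24U, 8U), 1, DataType::F32)); // N=24, K=8
    dst.allocator()->init(TensorInfo(misc::shape_calculator::compute_rhs_reshaped_shape(*src.info(), rhs_info), 1, DataType::F32));

    CLGEMMReshapeRHSMatrixKernel reshape;
    reshape.configure(&src, &dst, rhs_info);

    src.allocator()->allocate();
    dst.allocator()->allocate();
    CLScheduler::get().enqueue(reshape);
}
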
diff --git a/arm_compute/core/CL/kernels/CLGatherKernel.h b/arm_compute/core/CL/kernels/CLGatherKernel.h
deleted file mode 100644
index b7539536e9..0000000000
--- a/arm_compute/core/CL/kernels/CLGatherKernel.h
+++ /dev/null
@@ -1,89 +0,0 @@
-/*
- * Copyright (c) 2018-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CLGATHERKERNEL_H
-#define ARM_COMPUTE_CLGATHERKERNEL_H
-
-#include "arm_compute/core/CL/ICLKernel.h"
-#include "arm_compute/core/Types.h"
-
-namespace arm_compute
-{
-class ICLTensor;
-
-/** Interface for the gather kernel */
-class CLGatherKernel : public ICLKernel
-{
-public:
- /** Default constructor */
- CLGatherKernel();
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- CLGatherKernel(const CLGatherKernel &) = delete;
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- CLGatherKernel &operator=(const CLGatherKernel &) = delete;
- /** Allow instances of this class to be moved */
- CLGatherKernel(CLGatherKernel &&) = default;
- /** Allow instances of this class to be moved */
- CLGatherKernel &operator=(CLGatherKernel &&) = default;
- /** Default destructor */
- ~CLGatherKernel() = default;
- /** Initialise the kernel's inputs and outputs
- *
- * @param[in] input Source tensor. Supported tensor rank: up to 4. Data type supported: All.
- * @param[in] indices Indices tensor. Supported tensor rank: up to 1. Must be one of the following types: U32/S32. Each value must be in range [0, input.shape[@p axis])
- * @param[out] output Destination tensor. Data type supported: Same as @p input
- * @param[in] axis (Optional) The axis in @p input to gather @p indices from. Negative values wrap around. Defaults to 0
- */
- void configure(const ICLTensor *input, const ICLTensor *indices, ICLTensor *output, int axis = 0);
- /** Initialise the kernel's inputs and outputs
- *
- * @param[in] compile_context The compile context to be used.
- * @param[in] input Source tensor. Supported tensor rank: up to 4. Data type supported: All.
- * @param[in] indices Indices tensor. Supported tensor rank: up to 1. Must be one of the following types: U32/S32. Each value must be in range [0, input.shape[@p axis])
- * @param[out] output Destination tensor. Data type supported: Same as @p input
- * @param[in] axis (Optional) The axis in @p input to gather @p indices from. Negative values wrap around. Defaults to 0
- */
- void configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *indices, ICLTensor *output, int axis = 0);
-
- /** Static function to check if given info will lead to a valid configuration of @ref CLGatherKernel
- *
- * @param[in] input Source tensor info. Supported tensor rank: up to 4. Data type supported: All.
- * @param[in]  indices Indices tensor info. Supported tensor rank: up to 1. Must be one of the following types: U32/S32. Each value must be in range [0, input.shape[@p axis])
- * @param[in] output Destination tensor info. Data type supported: Same as @p input
- * @param[in] axis (Optional) The axis in @p input to gather @p indices from. Negative values wrap around. Defaults to 0
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *input, const ITensorInfo *indices, const ITensorInfo *output, int axis = 0);
-
- // Inherited methods overridden:
- void run(const Window &window, cl::CommandQueue &queue) override;
-
-private:
- const ICLTensor *_input; /**< Source tensor */
- const ICLTensor *_indices; /**< Indices tensor */
- ICLTensor *_output; /**< Destination tensor */
- int _axis; /**< Axis index */
-};
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_CLGATHERKERNEL_H */
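
A small sketch of the gather semantics documented above, assuming a 2D source gathered along axis 0: the gathered dimension is replaced by the number of indices.

#include "arm_compute/core/CL/kernels/CLGatherKernel.h"
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
#include "arm_compute/runtime/CL/CLTensor.h"

using namespace arm_compute;

void gather_along_x()
{
    CLScheduler::get().default_init();

    CLTensor src, indices, dst;
    src.allocator()->init(TensorInfo(TensorShape(5U, 3U), 1, DataType::F32));
    indices.allocator()->init(TensorInfo(TensorShape(2U), 1, DataType::U32)); // 1-D, values in [0, 5)
    // Axis 0 (size 5) is replaced by the number of indices (2)
    dst.allocator()->init(TensorInfo(TensorShape(2U, 3U), 1, DataType::F32));

    CLGatherKernel gather;
    gather.configure(&src, &indices, &dst, /*axis=*/0);

    src.allocator()->allocate();
    indices.allocator()->allocate();
    dst.allocator()->allocate();
    CLScheduler::get().enqueue(gather);
}
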
diff --git a/arm_compute/core/CL/kernels/CLGaussian3x3Kernel.h b/arm_compute/core/CL/kernels/CLGaussian3x3Kernel.h
deleted file mode 100644
index 6a9d3eaa4d..0000000000
--- a/arm_compute/core/CL/kernels/CLGaussian3x3Kernel.h
+++ /dev/null
@@ -1,59 +0,0 @@
-/*
- * Copyright (c) 2016-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CLGAUSSIAN3X3KERNEL_H
-#define ARM_COMPUTE_CLGAUSSIAN3X3KERNEL_H
-
-#include "arm_compute/core/CL/ICLSimple2DKernel.h"
-
-namespace arm_compute
-{
-class ICLTensor;
-
-/** Interface for the Gaussian 3x3 filter kernel.
- *
- */
-class CLGaussian3x3Kernel : public ICLSimple2DKernel
-{
-public:
- /** Initialise the kernel's input and output.
- *
- * @param[in] input An input tensor. Data types supported: U8
- * @param[out] output The output tensor. Data types supported: U8.
- * @param[in] border_undefined True if the border mode is undefined. False if it's replicate or constant.
- */
- void configure(const ICLTensor *input, ICLTensor *output, bool border_undefined);
- /** Initialise the kernel's input and output.
- *
- * @param[in] compile_context The compile context to be used.
- * @param[in] input An input tensor. Data types supported: U8
- * @param[out] output The output tensor. Data types supported: U8.
- * @param[in] border_undefined True if the border mode is undefined. False if it's replicate or constant.
- */
- void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, bool border_undefined);
-
- // Inherited methods overridden:
- BorderSize border_size() const override;
-};
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_CLGAUSSIAN3X3KERNEL_H */
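
A minimal sketch of driving this kernel directly with an undefined border mode; with replicate or constant borders the border region must be filled beforehand (the CLGaussian3x3 runtime function pairs this kernel with CLFillBorderKernel for that purpose).

#include "arm_compute/core/CL/kernels/CLGaussian3x3Kernel.h"
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
#include "arm_compute/runtime/CL/CLTensor.h"

using namespace arm_compute;

void gaussian3x3()
{
    CLScheduler::get().default_init();

    CLTensor src, dst;
    src.allocator()->init(TensorInfo(TensorShape(64U, 64U), 1, DataType::U8));
    dst.allocator()->init(TensorInfo(TensorShape(64U, 64U), 1, DataType::U8));

    CLGaussian3x3Kernel gauss;
    // border_undefined = true: the border reported by border_size() is left unwritten
    gauss.configure(&src, &dst, /*border_undefined=*/true);

    src.allocator()->allocate();
    dst.allocator()->allocate();
    CLScheduler::get().enqueue(gauss);
}
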
diff --git a/arm_compute/core/CL/kernels/CLGaussian5x5Kernel.h b/arm_compute/core/CL/kernels/CLGaussian5x5Kernel.h
deleted file mode 100644
index d8730e0c92..0000000000
--- a/arm_compute/core/CL/kernels/CLGaussian5x5Kernel.h
+++ /dev/null
@@ -1,83 +0,0 @@
-/*
- * Copyright (c) 2016-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CLGAUSSIAN5X5KERNEL_H
-#define ARM_COMPUTE_CLGAUSSIAN5X5KERNEL_H
-
-#include "arm_compute/core/CL/kernels/CLConvolutionKernel.h"
-
-namespace arm_compute
-{
-class ICLTensor;
-
-/** Interface for the kernel to run the horizontal pass of 5x5 Gaussian filter on a tensor. */
-class CLGaussian5x5HorKernel : public CLSeparableConvolution5x5HorKernel
-{
-public:
- /** Initialise the kernel's source, destination and border.
- *
- * @param[in] input Source tensor. Data types supported: U8.
- * @param[out] output Destination tensor. Data types supported: S16.
- * @param[in] border_undefined True if the border mode is undefined. False if it's replicate or constant.
- */
- void configure(const ICLTensor *input, ICLTensor *output, bool border_undefined);
- /** Initialise the kernel's source, destination and border.
- *
- * @param[in] compile_context The compile context to be used.
- * @param[in] input Source tensor. Data types supported: U8.
- * @param[out] output Destination tensor. Data types supported: S16.
- * @param[in] border_undefined True if the border mode is undefined. False if it's replicate or constant.
- */
- void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, bool border_undefined);
-
-private:
- //Make the configure method of the parent class private
- using CLSeparableConvolution5x5HorKernel::configure;
-};
-
-/** Interface for the kernel to run the vertical pass of 5x5 Gaussian filter on a tensor. */
-class CLGaussian5x5VertKernel : public CLSeparableConvolution5x5VertKernel
-{
-public:
- /** Initialise the kernel's source, destination and border.
- *
- * @param[in]  input  Input tensor (output of horizontal pass). Data types supported: S16.
- * @param[out] output Destination tensor. Data types supported: U8.
- * @param[in] border_undefined True if the border mode is undefined. False if it's replicate or constant.
- */
- void configure(const ICLTensor *input, ICLTensor *output, bool border_undefined);
- /** Initialise the kernel's source, destination and border.
- *
- * @param[in] compile_context The compile context to be used.
- * @param[in]  input           Input tensor (output of horizontal pass). Data types supported: S16.
- * @param[out] output Destination tensor. Data types supported: U8.
- * @param[in] border_undefined True if the border mode is undefined. False if it's replicate or constant.
- */
- void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, bool border_undefined);
-
-private:
- //Make the configure method of the parent class private
- using CLSeparableConvolution5x5VertKernel::configure;
-};
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_CLGAUSSIAN5X5KERNEL_H */
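
Because the 5x5 filter is separable, the two kernels above are chained through an S16 intermediate. A sketch under the same runtime assumptions as the previous examples:

#include "arm_compute/core/CL/kernels/CLGaussian5x5Kernel.h"
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
#include "arm_compute/runtime/CL/CLTensor.h"

using namespace arm_compute;

void gaussian5x5()
{
    CLScheduler::get().default_init();

    CLTensor src, tmp, dst;
    src.allocator()->init(TensorInfo(TensorShape(64U, 64U), 1, DataType::U8));
    tmp.allocator()->init(TensorInfo(TensorShape(64U, 64U), 1, DataType::S16)); // horizontal-pass output
    dst.allocator()->init(TensorInfo(TensorShape(64U, 64U), 1, DataType::U8));

    CLGaussian5x5HorKernel  hor;
    CLGaussian5x5VertKernel vert;
    hor.configure(&src, &tmp, /*border_undefined=*/true);
    vert.configure(&tmp, &dst, /*border_undefined=*/true);

    src.allocator()->allocate();
    tmp.allocator()->allocate();
    dst.allocator()->allocate();

    CLScheduler::get().enqueue(hor);  // U8 -> S16, horizontal pass
    CLScheduler::get().enqueue(vert); // S16 -> U8, vertical pass
}
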
diff --git a/arm_compute/core/CL/kernels/CLGaussianPyramidKernel.h b/arm_compute/core/CL/kernels/CLGaussianPyramidKernel.h
deleted file mode 100644
index 34cd062dae..0000000000
--- a/arm_compute/core/CL/kernels/CLGaussianPyramidKernel.h
+++ /dev/null
@@ -1,111 +0,0 @@
-/*
- * Copyright (c) 2017-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CLGAUSSIANPYRAMIDKERNEL_H
-#define ARM_COMPUTE_CLGAUSSIANPYRAMIDKERNEL_H
-
-#include "arm_compute/core/CL/ICLSimpleKernel.h"
-
-namespace arm_compute
-{
-class ICLTensor;
-
-/** OpenCL kernel to perform a Gaussian filter and half scaling across width (horizontal pass) */
-class CLGaussianPyramidHorKernel : public ICLSimpleKernel
-{
-public:
- /** Default constructor */
- CLGaussianPyramidHorKernel();
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- CLGaussianPyramidHorKernel(const CLGaussianPyramidHorKernel &) = delete;
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- CLGaussianPyramidHorKernel &operator=(const CLGaussianPyramidHorKernel &) = delete;
- /** Allow instances of this class to be moved */
- CLGaussianPyramidHorKernel(CLGaussianPyramidHorKernel &&) = default;
- /** Allow instances of this class to be moved */
- CLGaussianPyramidHorKernel &operator=(CLGaussianPyramidHorKernel &&) = default;
- /** Default destructor */
- ~CLGaussianPyramidHorKernel() = default;
-
- /** Initialise the kernel's source, destination and border mode.
- *
- * @param[in] input Source tensor. Data types supported: U8.
- * @param[out] output Destination tensor. Output should have half the input width. Data types supported: U16.
- */
- void configure(const ICLTensor *input, ICLTensor *output);
- /** Initialise the kernel's source, destination and border mode.
- *
- * @param[in] compile_context The compile context to be used.
- * @param[in] input Source tensor. Data types supported: U8.
- * @param[out] output Destination tensor. Output should have half the input width. Data types supported: U16.
- */
- void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output);
-
- // Inherited methods overridden:
- void run(const Window &window, cl::CommandQueue &queue) override;
- BorderSize border_size() const override;
-
-private:
- int _l2_load_offset;
-};
-
-/** OpenCL kernel to perform a Gaussian filter and half scaling across height (vertical pass) */
-class CLGaussianPyramidVertKernel : public ICLSimpleKernel
-{
-public:
- /** Default constructor */
- CLGaussianPyramidVertKernel();
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- CLGaussianPyramidVertKernel(const CLGaussianPyramidVertKernel &) = delete;
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- CLGaussianPyramidVertKernel &operator=(const CLGaussianPyramidVertKernel &) = delete;
- /** Allow instances of this class to be moved */
- CLGaussianPyramidVertKernel(CLGaussianPyramidVertKernel &&) = default;
- /** Allow instances of this class to be moved */
- CLGaussianPyramidVertKernel &operator=(CLGaussianPyramidVertKernel &&) = default;
- /** Default destructor */
- ~CLGaussianPyramidVertKernel() = default;
-
- /** Initialise the kernel's source, destination and border mode.
- *
- * @param[in] input Source tensor. Data types supported: U16.
- * @param[out] output Destination tensor. Output should have half the input height. Data types supported: U8.
- */
- void configure(const ICLTensor *input, ICLTensor *output);
- /** Initialise the kernel's source, destination and border mode.
- *
- * @param[in] compile_context The compile context to be used.
- * @param[in] input Source tensor. Data types supported: U16.
- * @param[out] output Destination tensor. Output should have half the input height. Data types supported: U8.
- */
- void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output);
-
- // Inherited methods overridden:
- void run(const Window &window, cl::CommandQueue &queue) override;
- BorderSize border_size() const override;
-
-private:
- int _t2_load_offset;
-};
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_CLGAUSSIANPYRAMIDKERNEL_H */
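
A sketch of producing one pyramid level with the two passes above: the horizontal pass halves the width into a U16 intermediate, then the vertical pass halves the height back to U8. Shapes are illustrative.

#include "arm_compute/core/CL/kernels/CLGaussianPyramidKernel.h"
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
#include "arm_compute/runtime/CL/CLTensor.h"

using namespace arm_compute;

void pyramid_level()
{
    CLScheduler::get().default_init();

    CLTensor level0, tmp, level1;
    level0.allocator()->init(TensorInfo(TensorShape(64U, 64U), 1, DataType::U8));
    tmp.allocator()->init(TensorInfo(TensorShape(32U, 64U), 1, DataType::U16));   // half width
    level1.allocator()->init(TensorInfo(TensorShape(32U, 32U), 1, DataType::U8)); // half width and height

    CLGaussianPyramidHorKernel  hor;
    CLGaussianPyramidVertKernel vert;
    hor.configure(&level0, &tmp);
    vert.configure(&tmp, &level1);

    level0.allocator()->allocate();
    tmp.allocator()->allocate();
    level1.allocator()->allocate();

    CLScheduler::get().enqueue(hor);
    CLScheduler::get().enqueue(vert);
}
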
diff --git a/arm_compute/core/CL/kernels/CLGenerateProposalsLayerKernel.h b/arm_compute/core/CL/kernels/CLGenerateProposalsLayerKernel.h
deleted file mode 100644
index 46dc16d6d5..0000000000
--- a/arm_compute/core/CL/kernels/CLGenerateProposalsLayerKernel.h
+++ /dev/null
@@ -1,85 +0,0 @@
-/*
- * Copyright (c) 2019-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CLGENERATEPROPOSALSLAYERKERNEL_H
-#define ARM_COMPUTE_CLGENERATEPROPOSALSLAYERKERNEL_H
-
-#include "arm_compute/core/CL/ICLKernel.h"
-namespace arm_compute
-{
-class ICLTensor;
-
-/** Interface for Compute All Anchors kernel */
-class CLComputeAllAnchorsKernel : public ICLKernel
-{
-public:
- /** Default constructor */
- CLComputeAllAnchorsKernel();
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- CLComputeAllAnchorsKernel(const CLComputeAllAnchorsKernel &) = delete;
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- CLComputeAllAnchorsKernel &operator=(const CLComputeAllAnchorsKernel &) = delete;
- /** Allow instances of this class to be moved */
- CLComputeAllAnchorsKernel(CLComputeAllAnchorsKernel &&) = default;
- /** Allow instances of this class to be moved */
- CLComputeAllAnchorsKernel &operator=(CLComputeAllAnchorsKernel &&) = default;
- /** Default destructor */
- ~CLComputeAllAnchorsKernel() = default;
-
- /** Set the input and output tensors.
- *
- * @param[in] anchors Source tensor. Original set of anchors of size (4, A), where A is the number of anchors. Data types supported: QSYMM16/F16/F32
- * @param[out] all_anchors Destination tensor. Destination anchors of size (4, H*W*A) where H and W are the height and width of the feature map and A is the number of anchors. Data types supported: Same as @p anchors
- * @param[in] info Contains Compute Anchors operation information described in @ref ComputeAnchorsInfo
- *
- */
- void configure(const ICLTensor *anchors, ICLTensor *all_anchors, const ComputeAnchorsInfo &info);
- /** Set the input and output tensors.
- *
- * @param[in] compile_context The compile context to be used.
- * @param[in] anchors Source tensor. Original set of anchors of size (4, A), where A is the number of anchors. Data types supported: QSYMM16/F16/F32
- * @param[out] all_anchors     Destination tensor. Destination anchors of size (4, H*W*A) where H and W are the height and width of the feature map and A is the number of anchors. Data types supported: Same as @p anchors
- * @param[in] info Contains Compute Anchors operation information described in @ref ComputeAnchorsInfo
- *
- */
- void configure(const CLCompileContext &compile_context, const ICLTensor *anchors, ICLTensor *all_anchors, const ComputeAnchorsInfo &info);
-
- /** Static function to check if given info will lead to a valid configuration of @ref CLComputeAllAnchorsKernel
- *
- * @param[in] anchors Source tensor info. Original set of anchors of size (4, A), where A is the number of anchors. Data types supported: QSYMM16/F16/F32
- * @param[in] all_anchors Destination tensor info. Destination anchors of size (4, H*W*A) where H and W are the height and width of the feature map and A is the number of anchors. Data types supported: Same as @p anchors
- * @param[in] info Contains Compute Anchors operation information described in @ref ComputeAnchorsInfo
- *
- * @return a Status
- */
- static Status validate(const ITensorInfo *anchors, const ITensorInfo *all_anchors, const ComputeAnchorsInfo &info);
-
- // Inherited methods overridden:
- void run(const Window &window, cl::CommandQueue &queue) override;
-
-private:
- const ICLTensor *_anchors;
- ICLTensor *_all_anchors;
-};
-} // namespace arm_compute
-#endif // ARM_COMPUTE_CLGENERATEPROPOSALSLAYERKERNEL_H
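
A sketch of populating all anchors for a feature map. The ComputeAnchorsInfo argument order (feature width, feature height, spatial scale) is an assumption, as flagged in the code.

#include "arm_compute/core/CL/kernels/CLGenerateProposalsLayerKernel.h"
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
#include "arm_compute/runtime/CL/CLTensor.h"

using namespace arm_compute;

void compute_all_anchors()
{
    CLScheduler::get().default_init();

    const unsigned int A = 9;          // anchors per feature-map location
    const unsigned int W = 50, H = 38; // feature map width and height

    CLTensor anchors, all_anchors;
    anchors.allocator()->init(TensorInfo(TensorShape(4U, A), 1, DataType::F32));
    all_anchors.allocator()->init(TensorInfo(TensorShape(4U, W * H * A), 1, DataType::F32));

    CLComputeAllAnchorsKernel k;
    // ComputeAnchorsInfo(feat_width, feat_height, spatial_scale); argument order assumed
    k.configure(&anchors, &all_anchors, ComputeAnchorsInfo(static_cast<float>(W), static_cast<float>(H), 1.0f / 16.0f));

    anchors.allocator()->allocate();
    all_anchors.allocator()->allocate();
    CLScheduler::get().enqueue(k);
}
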
diff --git a/arm_compute/core/CL/kernels/CLHOGDescriptorKernel.h b/arm_compute/core/CL/kernels/CLHOGDescriptorKernel.h
deleted file mode 100644
index 046950551d..0000000000
--- a/arm_compute/core/CL/kernels/CLHOGDescriptorKernel.h
+++ /dev/null
@@ -1,122 +0,0 @@
-/*
- * Copyright (c) 2017-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CLHOGDESCRIPTORKERNEL_H
-#define ARM_COMPUTE_CLHOGDESCRIPTORKERNEL_H
-
-#include "arm_compute/core/CL/ICLKernel.h"
-#include "arm_compute/core/IHOG.h"
-#include "arm_compute/core/Size2D.h"
-
-namespace arm_compute
-{
-class ICLTensor;
-
-/** OpenCL kernel to perform HOG Orientation Binning */
-class CLHOGOrientationBinningKernel : public ICLKernel
-{
-public:
- /** Default constructor */
- CLHOGOrientationBinningKernel();
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- CLHOGOrientationBinningKernel(const CLHOGOrientationBinningKernel &) = delete;
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- CLHOGOrientationBinningKernel &operator=(const CLHOGOrientationBinningKernel &) = delete;
- /** Allow instances of this class to be moved */
- CLHOGOrientationBinningKernel(CLHOGOrientationBinningKernel &&) = default;
- /** Allow instances of this class to be moved */
- CLHOGOrientationBinningKernel &operator=(CLHOGOrientationBinningKernel &&) = default;
- /** Default destructor */
- ~CLHOGOrientationBinningKernel() = default;
-
- /** Initialise the kernel's inputs, output and HOG's metadata
- *
- * @param[in] input_magnitude Input tensor which stores the magnitude of the gradient for each pixel. Data type supported: S16.
- * @param[in] input_phase Input tensor which stores the phase of the gradient for each pixel. Data type supported: U8
- * @param[out] output          Output tensor which stores the local HOG for each cell. Data type supported: F32. Number of channels supported: equal to the number of histogram bins per cell
- * @param[in] hog_info HOG's metadata
- */
- void configure(const ICLTensor *input_magnitude, const ICLTensor *input_phase, ICLTensor *output, const HOGInfo *hog_info);
- /** Initialise the kernel's inputs, output and HOG's metadata
- *
- * @param[in] compile_context The compile context to be used.
- * @param[in] input_magnitude Input tensor which stores the magnitude of the gradient for each pixel. Data type supported: S16.
- * @param[in] input_phase Input tensor which stores the phase of the gradient for each pixel. Data type supported: U8
- * @param[out] output          Output tensor which stores the local HOG for each cell. Data type supported: F32. Number of channels supported: equal to the number of histogram bins per cell
- * @param[in] hog_info HOG's metadata
- */
- void configure(const CLCompileContext &compile_context, const ICLTensor *input_magnitude, const ICLTensor *input_phase, ICLTensor *output, const HOGInfo *hog_info);
-
- // Inherited methods overridden:
- void run(const Window &window, cl::CommandQueue &queue) override;
-
-private:
- const ICLTensor *_input_magnitude;
- const ICLTensor *_input_phase;
- ICLTensor *_output;
- Size2D _cell_size;
-};
-
-/** OpenCL kernel to perform HOG block normalization */
-class CLHOGBlockNormalizationKernel : public ICLKernel
-{
-public:
- /** Default constructor */
- CLHOGBlockNormalizationKernel();
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- CLHOGBlockNormalizationKernel(const CLHOGBlockNormalizationKernel &) = delete;
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- CLHOGBlockNormalizationKernel &operator=(const CLHOGBlockNormalizationKernel &) = delete;
- /** Allow instances of this class to be moved */
- CLHOGBlockNormalizationKernel(CLHOGBlockNormalizationKernel &&) = default;
- /** Allow instances of this class to be moved */
- CLHOGBlockNormalizationKernel &operator=(CLHOGBlockNormalizationKernel &&) = default;
- /** Default destructor */
- ~CLHOGBlockNormalizationKernel() = default;
-
- /** Initialise the kernel's input, output and HOG's metadata
- *
- * @param[in] input Input tensor which stores the local HOG for each cell. Data type supported: F32. Number of channels supported: equal to the number of histogram bins per cell
- * @param[out] output Output tensor which stores the normalised blocks. Data type supported: F32. Number of channels supported: equal to the number of histogram bins per block
- * @param[in] hog_info HOG's metadata
- */
- void configure(const ICLTensor *input, ICLTensor *output, const HOGInfo *hog_info);
- /** Initialise the kernel's input, output and HOG's metadata
- *
- * @param[in] compile_context The compile context to be used.
- * @param[in] input Input tensor which stores the local HOG for each cell. Data type supported: F32. Number of channels supported: equal to the number of histogram bins per cell
- * @param[out] output Output tensor which stores the normalised blocks. Data type supported: F32. Number of channels supported: equal to the number of histogram bins per block
- * @param[in] hog_info HOG's metadata
- */
- void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, const HOGInfo *hog_info);
-
- // Inherited methods overridden:
- void run(const Window &window, cl::CommandQueue &queue) override;
-
-private:
- const ICLTensor *_input;
- ICLTensor *_output;
- Size2D _num_cells_per_block_stride;
-};
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_CLHOGDESCRIPTORKERNEL_H */
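
A sketch of the two-stage descriptor computation above (orientation binning per cell, then block normalization). The HOGInfo constructor argument order and all geometry below are assumptions for illustration: 8x8 cells, 16x16 blocks with an 8-pixel stride, and 9 bins give 9 channels per cell and 36 per block.

#include "arm_compute/core/CL/kernels/CLHOGDescriptorKernel.h"
#include "arm_compute/core/HOGInfo.h"
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
#include "arm_compute/runtime/CL/CLTensor.h"

using namespace arm_compute;

void hog_descriptor()
{
    CLScheduler::get().default_init();

    // Argument order assumed: cell size, block size, detection window size,
    // block stride, number of bins, normalization type
    const HOGInfo hog_info(Size2D(8, 8), Size2D(16, 16), Size2D(64, 128), Size2D(8, 8), 9, HOGNormType::L2HYS_NORM);

    CLTensor mag, phase, cells, blocks;
    mag.allocator()->init(TensorInfo(TensorShape(64U, 128U), 1, DataType::S16));
    phase.allocator()->init(TensorInfo(TensorShape(64U, 128U), 1, DataType::U8));
    cells.allocator()->init(TensorInfo(TensorShape(8U, 16U), 9, DataType::F32));   // 9 bins per cell
    blocks.allocator()->init(TensorInfo(TensorShape(7U, 15U), 36, DataType::F32)); // 9 bins x 4 cells per block

    CLHOGOrientationBinningKernel binning;
    CLHOGBlockNormalizationKernel block_norm;
    binning.configure(&mag, &phase, &cells, &hog_info);
    block_norm.configure(&cells, &blocks, &hog_info);

    mag.allocator()->allocate();
    phase.allocator()->allocate();
    cells.allocator()->allocate();
    blocks.allocator()->allocate();

    CLScheduler::get().enqueue(binning);
    CLScheduler::get().enqueue(block_norm);
}
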
diff --git a/arm_compute/core/CL/kernels/CLHOGDetectorKernel.h b/arm_compute/core/CL/kernels/CLHOGDetectorKernel.h
deleted file mode 100644
index 681c212cc5..0000000000
--- a/arm_compute/core/CL/kernels/CLHOGDetectorKernel.h
+++ /dev/null
@@ -1,96 +0,0 @@
-/*
- * Copyright (c) 2017-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CLHOGDETECTORKERNEL_H
-#define ARM_COMPUTE_CLHOGDETECTORKERNEL_H
-
-#include "arm_compute/core/CL/ICLArray.h"
-#include "arm_compute/core/CL/ICLHOG.h"
-#include "arm_compute/core/CL/ICLKernel.h"
-#include "arm_compute/core/CL/OpenCL.h"
-
-namespace cl
-{
-class Buffer;
-}
-
-namespace arm_compute
-{
-class ICLTensor;
-
-/** OpenCL kernel to perform HOG detector kernel using linear SVM */
-class CLHOGDetectorKernel : public ICLKernel
-{
-public:
- /** Default constructor */
- CLHOGDetectorKernel();
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- CLHOGDetectorKernel(const CLHOGDetectorKernel &) = delete;
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- CLHOGDetectorKernel &operator=(const CLHOGDetectorKernel &) = delete;
- /** Allow instances of this class to be moved */
- CLHOGDetectorKernel(CLHOGDetectorKernel &&) = default;
- /** Allow instances of this class to be moved */
- CLHOGDetectorKernel &operator=(CLHOGDetectorKernel &&) = default;
- /** Default destructor */
- ~CLHOGDetectorKernel() = default;
-
- /** Initialise the kernel's input, HOG data-object, detection window, the stride of the detection window, the threshold and index of the object to detect
- *
- * @param[in] input Input tensor which stores the HOG descriptor obtained with @ref CLHOGOrientationBinningKernel. Data type supported: F32. Number of channels supported: equal to the number of histogram bins per block
- * @param[in] hog HOG data object used by @ref CLHOGOrientationBinningKernel and @ref CLHOGBlockNormalizationKernel
- * @param[out] detection_windows Array of @ref DetectionWindow. This array stores all the detected objects
- * @param[in] num_detection_windows Number of detected objects
- * @param[in] detection_window_stride Distance in pixels between 2 consecutive detection windows in x and y directions.
- *                                      It must be a multiple of the hog->info()->block_stride()
- * @param[in] threshold (Optional) Threshold for the distance between features and SVM classifying plane
- * @param[in] idx_class (Optional) Index of the class used for evaluating which class the detection window belongs to
- */
- void configure(const ICLTensor *input, const ICLHOG *hog, ICLDetectionWindowArray *detection_windows, cl::Buffer *num_detection_windows, const Size2D &detection_window_stride, float threshold = 0.0f,
- uint16_t idx_class = 0);
- /** Initialise the kernel's input, HOG data-object, detection window, the stride of the detection window, the threshold and index of the object to detect
- *
- * @param[in] compile_context The compile context to be used.
- * @param[in] input Input tensor which stores the HOG descriptor obtained with @ref CLHOGOrientationBinningKernel. Data type supported: F32. Number of channels supported: equal to the number of histogram bins per block
- * @param[in] hog HOG data object used by @ref CLHOGOrientationBinningKernel and @ref CLHOGBlockNormalizationKernel
- * @param[out] detection_windows Array of @ref DetectionWindow. This array stores all the detected objects
- * @param[in] num_detection_windows Number of detected objects
- * @param[in] detection_window_stride Distance in pixels between 2 consecutive detection windows in x and y directions.
- *                                      It must be a multiple of the hog->info()->block_stride()
- * @param[in] threshold (Optional) Threshold for the distance between features and SVM classifying plane
- * @param[in] idx_class (Optional) Index of the class used for evaluating which class the detection window belongs to
- */
- void configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLHOG *hog, ICLDetectionWindowArray *detection_windows, cl::Buffer *num_detection_windows,
- const Size2D &detection_window_stride, float threshold = 0.0f,
- uint16_t idx_class = 0);
-
- // Inherited methods overridden:
- void run(const Window &window, cl::CommandQueue &queue) override;
-
-private:
- const ICLTensor *_input;
- ICLDetectionWindowArray *_detection_windows;
- cl::Buffer *_num_detection_windows;
-};
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_CLHOGDETECTORKERNEL_H */
diff --git a/arm_compute/core/CL/kernels/CLHarrisCornersKernel.h b/arm_compute/core/CL/kernels/CLHarrisCornersKernel.h
deleted file mode 100644
index a13119b82c..0000000000
--- a/arm_compute/core/CL/kernels/CLHarrisCornersKernel.h
+++ /dev/null
@@ -1,100 +0,0 @@
-/*
- * Copyright (c) 2016-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CLHARRISCORNERSKERNEL_H
-#define ARM_COMPUTE_CLHARRISCORNERSKERNEL_H
-
-#include "arm_compute/core/CL/ICLKernel.h"
-
-#include <cstdint>
-
-namespace arm_compute
-{
-class ICLTensor;
-using ICLImage = ICLTensor;
-
-/** Interface for the harris score kernel.
- *
- * @note The implementation supports 3, 5, and 7 for the block_size.
- */
-class CLHarrisScoreKernel : public ICLKernel
-{
-public:
- /** Default constructor */
- CLHarrisScoreKernel();
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- CLHarrisScoreKernel(const CLHarrisScoreKernel &) = delete;
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- CLHarrisScoreKernel &operator=(const CLHarrisScoreKernel &) = delete;
- /** Allow instances of this class to be moved */
- CLHarrisScoreKernel(CLHarrisScoreKernel &&) = default;
- /** Allow instances of this class to be moved */
- CLHarrisScoreKernel &operator=(CLHarrisScoreKernel &&) = default;
- /** Default destructor */
- ~CLHarrisScoreKernel() = default;
-
- /** Setup the kernel parameters
- *
- * @param[in]  input1          Source image (gradient X). Data types supported: S16, S32. (Must be the same as input2)
- * @param[in]  input2          Source image (gradient Y). Data types supported: S16, S32. (Must be the same as input1)
- * @param[out] output          Destination image (harris score). Data types supported: F32
- * @param[in] block_size The block window size used to compute the Harris Corner score. Supports: 3, 5 and 7
- * @param[in] norm_factor Normalization factor to use accordingly with the gradient size (Must be different from 0)
- * @param[in] strength_thresh Minimum threshold with which to eliminate Harris Corner scores (computed using the normalized Sobel kernel).
- * @param[in] sensitivity Sensitivity threshold k from the Harris-Stephens equation.
- * @param[in] border_undefined True if the border mode is undefined. False if it's replicate or constant.
- */
- void configure(const ICLImage *input1, const ICLImage *input2, ICLImage *output,
- int32_t block_size, float norm_factor, float strength_thresh, float sensitivity,
- bool border_undefined);
- /** Setup the kernel parameters
- *
- * @param[in] compile_context The compile context to be used.
- * @param[in]  input1          Source image (gradient X). Data types supported: S16, S32. (Must be the same as input2)
- * @param[in]  input2          Source image (gradient Y). Data types supported: S16, S32. (Must be the same as input1)
- * @param[out] output          Destination image (harris score). Data types supported: F32
- * @param[in] block_size The block window size used to compute the Harris Corner score. Supports: 3, 5 and 7
- * @param[in] norm_factor Normalization factor to use accordingly with the gradient size (Must be different from 0)
- * @param[in] strength_thresh Minimum threshold with which to eliminate Harris Corner scores (computed using the normalized Sobel kernel).
- * @param[in] sensitivity Sensitivity threshold k from the Harris-Stephens equation.
- * @param[in] border_undefined True if the border mode is undefined. False if it's replicate or constant.
- */
- void configure(const CLCompileContext &compile_context, const ICLImage *input1, const ICLImage *input2, ICLImage *output,
- int32_t block_size, float norm_factor, float strength_thresh, float sensitivity,
- bool border_undefined);
-
- // Inherited methods overridden:
- void run(const Window &window, cl::CommandQueue &queue) override;
- BorderSize border_size() const override;
-
-protected:
- const ICLImage *_input1; /**< Source image - Gx component */
- const ICLImage *_input2; /**< Source image - Gy component */
- ICLImage        *_output;          /**< Destination image - Harris score */
- float _sensitivity; /**< Sensitivity value */
- float _strength_thresh; /**< Threshold value */
- float _norm_factor; /**< Normalization factor */
- BorderSize _border_size; /**< Border size */
-};
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_CLHARRISCORNERSKERNEL_H */
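
A sketch of computing the Harris score from precomputed S16 gradient images. The norm_factor, strength_thresh and sensitivity values below are illustrative, not mandated by this header; norm_factor depends on the gradient operator and block size used upstream and must be non-zero.

#include "arm_compute/core/CL/kernels/CLHarrisCornersKernel.h"
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
#include "arm_compute/runtime/CL/CLTensor.h"

using namespace arm_compute;

void harris_score()
{
    CLScheduler::get().default_init();

    CLTensor gx, gy, score;
    gx.allocator()->init(TensorInfo(TensorShape(64U, 64U), 1, DataType::S16));
    gy.allocator()->init(TensorInfo(TensorShape(64U, 64U), 1, DataType::S16));
    score.allocator()->init(TensorInfo(TensorShape(64U, 64U), 1, DataType::F32));

    CLHarrisScoreKernel harris;
    harris.configure(&gx, &gy, &score,
                     /*block_size=*/3,                 // 3, 5 or 7
                     /*norm_factor=*/1.0f / 255.0f,    // illustrative, must be non-zero
                     /*strength_thresh=*/1e-4f,
                     /*sensitivity=*/0.04f,            // typical Harris-Stephens k
                     /*border_undefined=*/true);

    gx.allocator()->allocate();
    gy.allocator()->allocate();
    score.allocator()->allocate();
    CLScheduler::get().enqueue(harris);
}
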
diff --git a/arm_compute/core/CL/kernels/CLHeightConcatenateLayerKernel.h b/arm_compute/core/CL/kernels/CLHeightConcatenateLayerKernel.h
deleted file mode 100644
index 524e5ea997..0000000000
--- a/arm_compute/core/CL/kernels/CLHeightConcatenateLayerKernel.h
+++ /dev/null
@@ -1,90 +0,0 @@
-/*
- * Copyright (c) 2019-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#ifndef ARM_COMPUTE_CLHEIGHTCONCATENATELAYERKERNEL_H
-#define ARM_COMPUTE_CLHEIGHTCONCATENATELAYERKERNEL_H
-
-#include "arm_compute/core/CL/ICLKernel.h"
-#include "arm_compute/core/Types.h"
-
-namespace arm_compute
-{
-class ICLTensor;
-
-/** Interface for the height concatenate kernel.
- * The input tensor will be concatenated into the output tensor.
- */
-class CLHeightConcatenateLayerKernel : public ICLKernel
-{
-public:
- /** Default constructor */
- CLHeightConcatenateLayerKernel();
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- CLHeightConcatenateLayerKernel(const CLHeightConcatenateLayerKernel &) = delete;
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- CLHeightConcatenateLayerKernel &operator=(const CLHeightConcatenateLayerKernel &) = delete;
- /** Allow instances of this class to be moved */
- CLHeightConcatenateLayerKernel(CLHeightConcatenateLayerKernel &&) = default;
- /** Allow instances of this class to be moved */
- CLHeightConcatenateLayerKernel &operator=(CLHeightConcatenateLayerKernel &&) = default;
- /** Default destructor */
- ~CLHeightConcatenateLayerKernel() = default;
- /** Initialise the kernel's inputs and output
- *
- * @param[in] input Input tensor. Data types supported: All.
- * @param[in] height_offset The starting offset on the Y axis for the output tensor.
- * @param[out] output Output tensor. Data types supported: Same as @p input.
- *
- */
- void configure(const ICLTensor *input, unsigned int height_offset, ICLTensor *output);
- /** Initialise the kernel's inputs and output
- *
- * @param[in] compile_context The compile context to be used.
- * @param[in] input Input tensor. Data types supported: All.
- * @param[in] height_offset The starting offset on the Y axis for the output tensor.
- * @param[out] output Output tensor. Data types supported: Same as @p input.
- *
- */
- void configure(const CLCompileContext &compile_context, const ICLTensor *input, unsigned int height_offset, ICLTensor *output);
- /** Static function to check if given info will lead to a valid configuration of @ref CLHeightConcatenateLayerKernel
- *
- * @param[in] input Input tensor info. Data types supported: All.
- * @param[in] height_offset The starting offset on the Y axis for the output tensor.
- * @param[in] output Output tensor info. Data types supported: Same as @p input.
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *input, unsigned int height_offset, const ITensorInfo *output);
-
- // Inherited methods overridden:
- void run(const Window &window, cl::CommandQueue &queue) override;
-
-private:
- const ICLTensor *_input;
- ICLTensor *_output;
- unsigned int _height_offset;
- unsigned int _num_elems_processed_per_iteration;
-};
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_CLHEIGHTCONCATENATELAYERKERNEL_H */
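For reference, the copy this kernel performs on the GPU amounts to placing the input rows at a Y offset inside the output. A minimal CPU sketch, assuming row-major [width x height] buffers; the helper name and layout are illustrative, only `height_offset` mirrors the `configure()` parameter above:

```cpp
#include <cstddef>
#include <vector>

// Minimal CPU sketch of height concatenation: copy a row-major [width x height]
// input into the output starting at row `height_offset`. The output must be
// pre-sized to hold at least (height_offset + height) rows of `width` elements.
void height_concatenate(const std::vector<float> &input, std::size_t width, std::size_t height,
                        std::vector<float> &output, std::size_t height_offset)
{
    for(std::size_t y = 0; y < height; ++y)
    {
        for(std::size_t x = 0; x < width; ++x)
        {
            // Row y of the input lands on row (height_offset + y) of the output
            output[(height_offset + y) * width + x] = input[y * width + x];
        }
    }
}
```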
diff --git a/arm_compute/core/CL/kernels/CLHistogramKernel.h b/arm_compute/core/CL/kernels/CLHistogramKernel.h
deleted file mode 100644
index 9cd374711b..0000000000
--- a/arm_compute/core/CL/kernels/CLHistogramKernel.h
+++ /dev/null
@@ -1,111 +0,0 @@
-/*
- * Copyright (c) 2016-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CLHISTOGRAMKERNEL_H
-#define ARM_COMPUTE_CLHISTOGRAMKERNEL_H
-
-#include "arm_compute/core/CL/ICLKernel.h"
-
-namespace arm_compute
-{
-class ICLDistribution1D;
-class ICLTensor;
-using ICLImage = ICLTensor;
-
-/** Interface to run the histogram kernel. This kernel processes the part of the image whose width is a multiple of 16.
- * If the image width is not a multiple of 16, the remaining pixels have to be processed with the @ref CLHistogramBorderKernel
- */
-class CLHistogramKernel : public ICLKernel
-{
-public:
- /** Constructor */
- CLHistogramKernel();
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- CLHistogramKernel(const CLHistogramKernel &) = delete;
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- CLHistogramKernel &operator=(const CLHistogramKernel &) = delete;
- /** Allow instances of this class to be moved */
- CLHistogramKernel(CLHistogramKernel &&) = default;
- /** Allow instances of this class to be moved */
- CLHistogramKernel &operator=(CLHistogramKernel &&) = default;
- /** Initialise the kernel's input, output and border mode.
- *
- * @param[in] input Source image. Data types supported: U8.
- * @param[out] output Destination distribution.
- */
- void configure(const ICLImage *input, ICLDistribution1D *output);
- /** Initialise the kernel's input, output and border mode.
- *
- * @param[in] compile_context The compile context to be used.
- * @param[in] input Source image. Data types supported: U8.
- * @param[out] output Destination distribution.
- */
- void configure(const CLCompileContext &compile_context, const ICLImage *input, ICLDistribution1D *output);
-
- // Inherited methods overridden:
- void run(const Window &window, cl::CommandQueue &queue) override;
-
-private:
- const ICLImage *_input;
- ICLDistribution1D *_output;
-};
-
-/** Interface to run the histogram kernel that handles the leftover part of the image
- *
- */
-class CLHistogramBorderKernel : public ICLKernel
-{
-public:
- /** Constructor */
- CLHistogramBorderKernel();
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- CLHistogramBorderKernel(const CLHistogramBorderKernel &) = delete;
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- CLHistogramBorderKernel &operator=(const CLHistogramBorderKernel &) = delete;
- /** Allow instances of this class to be moved */
- CLHistogramBorderKernel(CLHistogramBorderKernel &&) = default;
- /** Allow instances of this class to be moved */
- CLHistogramBorderKernel &operator=(CLHistogramBorderKernel &&) = default;
- /** Initialise the kernel's input, output and border mode.
- *
- * @param[in] input Source image. Data types supported: U8.
- * @param[out] output Destination distribution.
- */
- void configure(const ICLImage *input, ICLDistribution1D *output);
- /** Initialise the kernel's input, output and border mode.
- *
- * @param[in] compile_context The compile context to be used.
- * @param[in] input Source image. Data types supported: U8.
- * @param[out] output Destination distribution.
- */
- void configure(const CLCompileContext &compile_context, const ICLImage *input, ICLDistribution1D *output);
-
- // Inherited methods overridden:
- void run(const Window &window, cl::CommandQueue &queue) override;
-
-private:
- const ICLImage *_input;
- ICLDistribution1D *_output;
-};
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_CLHISTOGRAMKERNEL_H*/
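The main/border split described in the class comments can be sketched on the CPU as follows; the helper below is illustrative, but the split at the largest multiple of 16 mirrors what the two kernels divide between them:

```cpp
#include <array>
#include <cstddef>
#include <cstdint>
#include <vector>

// CPU sketch of the main/border split: the main pass covers the widest prefix
// whose width is a multiple of 16 (the part the GPU processes 16 pixels at a
// time), the border pass accumulates the leftover columns into the same bins.
// `bins` must be zero-initialized by the caller.
void histogram_u8(const std::vector<uint8_t> &image, std::size_t width, std::size_t height,
                  std::array<uint32_t, 256> &bins)
{
    const std::size_t main_width = width - (width % 16);
    for(std::size_t y = 0; y < height; ++y)
    {
        for(std::size_t x = 0; x < main_width; ++x) // CLHistogramKernel's share
            ++bins[image[y * width + x]];
        for(std::size_t x = main_width; x < width; ++x) // CLHistogramBorderKernel's share
            ++bins[image[y * width + x]];
    }
}
```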
diff --git a/arm_compute/core/CL/kernels/CLIm2ColKernel.h b/arm_compute/core/CL/kernels/CLIm2ColKernel.h
deleted file mode 100644
index 61f2a3d489..0000000000
--- a/arm_compute/core/CL/kernels/CLIm2ColKernel.h
+++ /dev/null
@@ -1,136 +0,0 @@
-/*
- * Copyright (c) 2017-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CLIM2COLKERNEL_H
-#define ARM_COMPUTE_CLIM2COLKERNEL_H
-
-#include "arm_compute/core/CL/ICLKernel.h"
-#include "arm_compute/core/Size2D.h"
-
-namespace arm_compute
-{
-class ICLTensor;
-
-/** Interface for the im2col reshape kernel.
- *
- * Rearranges image blocks into columns: each convolution block is stripped out into a single column,
- * so that a convolution can be computed as a plain matrix multiplication.
- *
- * For example, given the image below and assuming 3x3 image blocks with a stride of 1, we have:
- * @f[
- * \left( \begin{array}{cccc}
- * a00 & a01 & a02 & a03 \\
- * a10 & a11 & a12 & a13 \\
- * a20 & a21 & a22 & a23 \\
- * a30 & a31 & a32 & a33 \\
- * \end{array} \right)
- * =
- * \left( \begin{array}{ccccccccc}
- * a00 & a01 & a02 & a10 & a11 & a12 & a20 & a21 & a22 \\
- * a01 & a02 & a03 & a11 & a12 & a13 & a21 & a22 & a23 \\
- * a10 & a11 & a12 & a20 & a21 & a22 & a30 & a31 & a32 \\
- * a11 & a12 & a13 & a21 & a22 & a23 & a31 & a32 & a33 \\
- * \end{array} \right)
- * @f]
- */
-class CLIm2ColKernel : public ICLKernel
-{
-public:
- /** Default constructor */
- CLIm2ColKernel();
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- CLIm2ColKernel(const CLIm2ColKernel &) = delete;
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- CLIm2ColKernel &operator=(const CLIm2ColKernel &) = delete;
- /** Allow instances of this class to be moved */
- CLIm2ColKernel(CLIm2ColKernel &&) = default;
- /** Allow instances of this class to be moved */
- CLIm2ColKernel &operator=(CLIm2ColKernel &&) = default;
- /** Set the input and output of the kernel.
- *
- * @param[in] input The input tensor to convert. 3 lower dimensions represent a single input [width, height, IFM],
- * while every optional dimension from 4 and above represent a batch of inputs. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32
- * @param[out] output The output tensor. First 2 lower dimensions represent a transform of each 3D input,
- * while every dimension above represents a batch. Data types supported: Same as @p input
- * @param[in] kernel_dims The kernel dimensions (width and height).
- * @param[in] conv_info Contains padding and stride information described in @ref PadStrideInfo.
- * @param[in]  has_bias    In case biases are provided, appends a 1 to each column so the bias can be folded into the matrix multiplication.
- *                         This is valid only for non-quantized inputs.
- * @param[in]  dilation    (Optional) Dilation, in elements, across x and y. Defaults to (1, 1).
- * @param[in]  num_groups  (Optional) Number of groups when performing a grouped convolution.
- *                         A number of groups other than 1 is only supported for the NCHW data layout.
- *                         The number of channels must be a multiple of the number of groups.
- */
- void configure(const ICLTensor *input, ICLTensor *output, const Size2D &kernel_dims, const PadStrideInfo &conv_info, bool has_bias, const Size2D &dilation = Size2D(1U, 1U),
- unsigned int num_groups = 1);
- /** Set the input and output of the kernel.
- *
- * @param[in] compile_context The compile context to be used.
- * @param[in] input The input tensor to convert. 3 lower dimensions represent a single input [width, height, IFM],
- * while every optional dimension from 4 and above represent a batch of inputs. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32
- * @param[out] output The output tensor. First 2 lower dimensions represent a transform of each 3D input,
- * while every dimension above represents a batch. Data types supported: Same as @p input
- * @param[in] kernel_dims The kernel dimensions (width and height).
- * @param[in] conv_info Contains padding and stride information described in @ref PadStrideInfo.
- * @param[in]  has_bias        In case biases are provided, appends a 1 to each column so the bias can be folded into the matrix multiplication.
- * @param[in] dilation (Optional) Dilation, in elements, across x and y. Defaults to (1, 1).
- * @param[in] num_groups (Optional) Number of groups when performing a grouped convolution. num_groups != 1 is only supported for NCHW data layout
- */
- void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, const Size2D &kernel_dims, const PadStrideInfo &conv_info, bool has_bias,
- const Size2D &dilation = Size2D(1U, 1U),
- unsigned int num_groups = 1);
- /** Static function to check if given info will lead to a valid configuration of @ref CLIm2ColKernel
- *
- * @param[in] input The input tensor to convert. 3 lower dimensions represent a single input [width, height, IFM],
- * while every optional dimension from 4 and above represent a batch of inputs. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32
- * @param[in] output The output tensor. First 2 lower dimensions represent a transform of each 3D input,
- * while every dimension above represents a batch. Data types supported: Same as @p input
- * @param[in] kernel_dims The kernel dimensions (width and height).
- * @param[in] conv_info Contains padding and stride information described in @ref PadStrideInfo.
- * @param[in] has_bias    In case biases are provided, appends a 1 to each column so the bias can be folded into the matrix multiplication.
- *                        This is valid only for non-quantized inputs.
- * @param[in] dilation    (Optional) Dilation, in elements, across x and y. Defaults to (1, 1).
- * @param[in] num_groups  (Optional) Number of groups when performing a grouped convolution.
- *                        A number of groups other than 1 is only supported for the NCHW data layout.
- *                        The number of channels must be a multiple of the number of groups.
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *input, const ITensorInfo *output, const Size2D &kernel_dims, const PadStrideInfo &conv_info, bool has_bias, const Size2D &dilation = Size2D(1U, 1U),
- unsigned int num_groups = 1);
-
- // Inherited methods overridden:
- void run(const Window &window, cl::CommandQueue &queue) override;
-
-public:
- const ICLTensor *_input;
- ICLTensor *_output;
- DataLayout _data_layout;
- std::pair<unsigned int, unsigned int> _convolved_dims;
- unsigned int _num_elems_processed_per_iteration;
- Size2D _kernel_dims;
- PadStrideInfo _conv_info;
- unsigned int _num_groups;
-};
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_CLIM2COLKERNEL_H */
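A reference im2col matching the 4x4 input / 3x3 kernel / stride-1 example above can be sketched as below; this is a single-channel illustration without padding, dilation, grouping or the bias column, so only the block-to-row rearrangement itself is shown:

```cpp
#include <cstddef>
#include <vector>

// Reference im2col for one channel, stride 1, no padding: each k x k block of
// the input becomes one row of the output, exactly as in the 4x4 / 3x3 example
// in the class documentation above.
std::vector<float> im2col(const std::vector<float> &in, int w, int h, int k)
{
    const int out_w = w - k + 1; // number of block positions per row
    const int out_h = h - k + 1; // number of block positions per column
    std::vector<float> out;
    out.reserve(static_cast<std::size_t>(out_w) * out_h * k * k);
    for(int y = 0; y < out_h; ++y)
    {
        for(int x = 0; x < out_w; ++x)
        {
            // Flatten the k x k block anchored at (x, y) into one output row
            for(int ky = 0; ky < k; ++ky)
            {
                for(int kx = 0; kx < k; ++kx)
                {
                    out.push_back(in[(y + ky) * w + (x + kx)]);
                }
            }
        }
    }
    return out;
}
```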
diff --git a/arm_compute/core/CL/kernels/CLInstanceNormalizationLayerKernel.h b/arm_compute/core/CL/kernels/CLInstanceNormalizationLayerKernel.h
deleted file mode 100644
index 014dce1759..0000000000
--- a/arm_compute/core/CL/kernels/CLInstanceNormalizationLayerKernel.h
+++ /dev/null
@@ -1,90 +0,0 @@
-/*
- * Copyright (c) 2019-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CLINSTANCENORMALIZATIONLAYERKERNEL_H
-#define ARM_COMPUTE_CLINSTANCENORMALIZATIONLAYERKERNEL_H
-
-#include "arm_compute/core/CL/ICLKernel.h"
-
-#include "arm_compute/core/KernelDescriptors.h"
-
-namespace arm_compute
-{
-// Forward declarations
-class ICLTensor;
-
-/** Interface for performing an instance normalization */
-class CLInstanceNormalizationLayerKernel : public ICLKernel
-{
-public:
- /** Constructor */
- CLInstanceNormalizationLayerKernel();
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- CLInstanceNormalizationLayerKernel(const CLInstanceNormalizationLayerKernel &) = delete;
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- CLInstanceNormalizationLayerKernel &operator=(const CLInstanceNormalizationLayerKernel &) = delete;
- /** Default Move Constructor. */
- CLInstanceNormalizationLayerKernel(CLInstanceNormalizationLayerKernel &&) = default;
- /** Default move assignment operator */
- CLInstanceNormalizationLayerKernel &operator=(CLInstanceNormalizationLayerKernel &&) = default;
- /** Default destructor */
- ~CLInstanceNormalizationLayerKernel() = default;
-
- /** Set the input and output tensors.
- *
- * @param[in, out] input Source tensor. Data types supported: F16/F32. Data layout supported: NCHW, NHWC
- * In case of @p output tensor = nullptr this tensor will store the result of the normalization.
- * @param[out] output Destination tensor. Data types and data layouts supported: same as @p input.
- * @param[in] info Kernel meta-data descriptor
- */
- void configure(ICLTensor *input, ICLTensor *output, const InstanceNormalizationLayerKernelInfo &info);
- /** Set the input and output tensors.
- *
- * @param[in] compile_context The compile context to be used.
- * @param[in, out] input Source tensor. Data types supported: F16/F32. Data layout supported: NCHW, NHWC
- * In case of @p output tensor = nullptr this tensor will store the result of the normalization.
- * @param[out] output Destination tensor. Data types and data layouts supported: same as @p input.
- * @param[in] info Kernel meta-data descriptor
- */
- void configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, const InstanceNormalizationLayerKernelInfo &info);
-
- /** Static function to check if given info will lead to a valid configuration of @ref CLInstanceNormalizationLayer.
- *
- * @param[in] input Source tensor info. Data types supported: F16/F32. Data layout supported: NHWC, NCHW
- * @param[in] output Destination tensor info. Data types and data layouts supported: same as @p input.
- * @param[in] info Kernel meta-data descriptor
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *input, const ITensorInfo *output, const InstanceNormalizationLayerKernelInfo &info);
-
- // Inherited methods overridden:
- void run(const Window &window, cl::CommandQueue &queue) override;
-
-private:
- ICLTensor *_input;
- ICLTensor *_output;
- bool _run_in_place;
-};
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_CLINSTANCENORMALIZATIONLAYERKERNEL_H */
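Per (batch, channel) plane, the kernel computes standard instance normalization. A CPU sketch, assuming `gamma`, `beta` and `epsilon` are the values carried by the @ref InstanceNormalizationLayerKernelInfo descriptor:

```cpp
#include <cmath>
#include <vector>

// CPU sketch of instance normalization for one plane belonging to a single
// (batch, channel) pair: normalize with the plane's own mean and variance,
// then apply the scale/offset carried by the kernel-info descriptor.
void instance_normalize(std::vector<float> &plane, float gamma, float beta, float epsilon)
{
    const float n = static_cast<float>(plane.size());
    float mean = 0.f;
    for(float v : plane)
    {
        mean += v;
    }
    mean /= n;
    float var = 0.f;
    for(float v : plane)
    {
        var += (v - mean) * (v - mean);
    }
    var /= n;
    const float inv_stddev = 1.f / std::sqrt(var + epsilon);
    for(float &v : plane)
    {
        v = (v - mean) * inv_stddev * gamma + beta; // normalize, then scale and shift
    }
}
```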
diff --git a/arm_compute/core/CL/kernels/CLIntegralImageKernel.h b/arm_compute/core/CL/kernels/CLIntegralImageKernel.h
deleted file mode 100644
index 6b6076a917..0000000000
--- a/arm_compute/core/CL/kernels/CLIntegralImageKernel.h
+++ /dev/null
@@ -1,86 +0,0 @@
-/*
- * Copyright (c) 2016-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CLINTEGRALIMAGEKERNEL_H
-#define ARM_COMPUTE_CLINTEGRALIMAGEKERNEL_H
-
-#include "arm_compute/core/CL/ICLKernel.h"
-#include "arm_compute/core/CL/ICLSimple2DKernel.h"
-
-namespace arm_compute
-{
-class ICLTensor;
-
-/** Interface to run the horizontal pass of the integral image kernel. */
-class CLIntegralImageHorKernel : public ICLSimple2DKernel
-{
-public:
- /** Initialise the kernel's input and output.
- *
- * @param[in] input An input tensor. Data types supported: U8
- * @param[out] output Destination tensor. Data types supported: U32.
- */
- void configure(const ICLTensor *input, ICLTensor *output);
- /** Initialise the kernel's input and output.
- *
- * @param[in] compile_context The compile context to be used.
- * @param[in] input An input tensor. Data types supported: U8
- * @param[out] output          Destination tensor. Data types supported: U32.
- */
- void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output);
-};
-
-/** Interface to run the vertical pass of the integral image kernel. */
-class CLIntegralImageVertKernel : public ICLKernel
-{
-public:
- /** Default constructor */
- CLIntegralImageVertKernel();
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- CLIntegralImageVertKernel(const CLIntegralImageVertKernel &) = delete;
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- CLIntegralImageVertKernel &operator=(const CLIntegralImageVertKernel &) = delete;
- /** Allow instances of this class to be moved */
- CLIntegralImageVertKernel(CLIntegralImageVertKernel &&) = default;
- /** Allow instances of this class to be moved */
- CLIntegralImageVertKernel &operator=(CLIntegralImageVertKernel &&) = default;
- /** Initialise the kernel's input and output.
- *
- * @param[in,out] in_out The input/output tensor. Data types supported: U32
- */
- void configure(ICLTensor *in_out);
- /** Initialise the kernel's input and output.
- *
- * @param[in] compile_context The compile context to be used.
- * @param[in,out] in_out The input/output tensor. Data types supported: U32
- */
- void configure(const CLCompileContext &compile_context, ICLTensor *in_out);
-
- // Inherited methods overridden:
- void run(const Window &window, cl::CommandQueue &queue) override;
-
-private:
- ICLTensor *_in_out;
-};
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_CLINTEGRALIMAGEKERNEL_H */
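The Hor/Vert kernel pair above implements the classic two-pass integral image; a CPU sketch of the same structure (illustrative helper, U8 input to U32 output as documented):

```cpp
#include <cstddef>
#include <cstdint>
#include <vector>

// CPU sketch of the two-pass integral image: a horizontal pass produces
// per-row prefix sums (U8 -> U32), then a vertical pass accumulates each
// column in place. `out` must be pre-sized to w * h elements.
void integral_image(const std::vector<uint8_t> &in, std::size_t w, std::size_t h,
                    std::vector<uint32_t> &out)
{
    // Horizontal pass (CLIntegralImageHorKernel): out(x, y) = sum of in(0..x, y)
    for(std::size_t y = 0; y < h; ++y)
    {
        uint32_t row_sum = 0;
        for(std::size_t x = 0; x < w; ++x)
        {
            row_sum += in[y * w + x];
            out[y * w + x] = row_sum;
        }
    }
    // Vertical pass (CLIntegralImageVertKernel), in place: out(x, y) += out(x, y - 1)
    for(std::size_t y = 1; y < h; ++y)
    {
        for(std::size_t x = 0; x < w; ++x)
        {
            out[y * w + x] += out[(y - 1) * w + x];
        }
    }
}
```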
diff --git a/arm_compute/core/CL/kernels/CLL2NormalizeLayerKernel.h b/arm_compute/core/CL/kernels/CLL2NormalizeLayerKernel.h
deleted file mode 100644
index 169910b70d..0000000000
--- a/arm_compute/core/CL/kernels/CLL2NormalizeLayerKernel.h
+++ /dev/null
@@ -1,100 +0,0 @@
-/*
- * Copyright (c) 2017-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CLL2NORMALIZELAYERKERNEL_H
-#define ARM_COMPUTE_CLL2NORMALIZELAYERKERNEL_H
-
-#include "arm_compute/core/CL/ICLKernel.h"
-#include "arm_compute/core/Types.h"
-
-namespace arm_compute
-{
-class ICLTensor;
-
-/** Interface for performing an L2 normalization on a given axis, given the precomputed square sum along that axis */
-class CLL2NormalizeLayerKernel : public ICLKernel
-{
-public:
- /** Default constructor */
- CLL2NormalizeLayerKernel();
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- CLL2NormalizeLayerKernel(const CLL2NormalizeLayerKernel &) = delete;
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- CLL2NormalizeLayerKernel &operator=(const CLL2NormalizeLayerKernel &) = delete;
- /** Allow instances of this class to be moved */
- CLL2NormalizeLayerKernel(CLL2NormalizeLayerKernel &&) = default;
- /** Allow instances of this class to be moved */
- CLL2NormalizeLayerKernel &operator=(CLL2NormalizeLayerKernel &&) = default;
- /** Default destructor */
- ~CLL2NormalizeLayerKernel() = default;
-
- /** Set the input and output tensors.
- *
- * @param[in] input Source tensor. Data types supported: F16/F32. Data layouts supported: NCHW/NHWC.
- * @param[in] sum Sum values tensor. Data types supported: same as @p input.
- * Sum will have the same number of dimensions as input.
- * @param[out] output Destination tensor. Data types and data layouts supported: Same as @p input.
- * Output will have the same number of dimensions as input.
- * @param[in] axis Axis along which to reduce. Negative values wrap around. Maximum supported actual reduction axis : 2
- * @param[in] epsilon Lower bound value for the normalization.
- */
- void configure(const ICLTensor *input, const ICLTensor *sum, ICLTensor *output, int axis, float epsilon);
- /** Set the input and output tensors.
- *
- * @param[in] compile_context The compile context to be used.
- * @param[in] input Source tensor. Data types supported: F16/F32. Data layouts supported: NCHW/NHWC.
- * @param[in] sum Sum values tensor. Data types supported: same as @p input.
- * Sum will have the same number of dimensions as input.
- * @param[out] output Destination tensor. Data types and data layouts supported: Same as @p input.
- * Output will have the same number of dimensions as input.
- * @param[in] axis Axis along which to reduce. Negative values wrap around. Maximum supported actual reduction axis : 2
- * @param[in] epsilon Lower bound value for the normalization.
- */
- void configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *sum, ICLTensor *output, int axis, float epsilon);
-
- /** Static function to check if given info will lead to a valid configuration of @ref CLL2NormalizeLayerKernel.
- *
- * @param[in] input Source tensor info. Data types supported: F16/F32. Data layouts supported: NCHW/NHWC.
- * @param[in] sum Sum values tensor info. Data types supported: same as @p input.
- * Sum will have the same number of dimensions as input.
- * @param[in] output Destination tensor info. Data types and data layouts supported: Same as @p input.
- * Output will have the same number of dimensions as input.
- * @param[in] axis Axis along which to reduce. Negative values wrap around. Maximum supported actual reduction axis : 2
- * @param[in] epsilon Lower bound value for the normalization.
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *input, const ITensorInfo *sum, const ITensorInfo *output, int axis, float epsilon);
-
- // Inherited methods overridden:
- void run(const Window &window, cl::CommandQueue &queue) override;
-
-private:
- const ICLTensor *_input;
- const ICLTensor *_sum;
- ICLTensor *_output;
- unsigned int _actual_axis;
- float _epsilon;
-};
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_CLL2NORMALIZELAYERKERNEL_H */
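Per slice along the reduction axis, the operation reduces to a division by the L2 norm derived from the precomputed square sum. A minimal sketch, assuming `epsilon` lower-bounds the square sum as the parameter documentation suggests:

```cpp
#include <algorithm>
#include <cmath>
#include <cstddef>
#include <vector>

// CPU sketch for one slice along the reduction axis: divide every element by
// the L2 norm derived from the precomputed square sum (the `sum` input above),
// with `epsilon` as a lower bound on the square sum. `out` is pre-sized.
void l2_normalize_slice(const std::vector<float> &in, float square_sum, float epsilon,
                        std::vector<float> &out)
{
    const float norm = std::sqrt(std::max(square_sum, epsilon));
    for(std::size_t i = 0; i < in.size(); ++i)
    {
        out[i] = in[i] / norm;
    }
}
```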
diff --git a/arm_compute/core/CL/kernels/CLLKTrackerKernel.h b/arm_compute/core/CL/kernels/CLLKTrackerKernel.h
deleted file mode 100644
index f94602c381..0000000000
--- a/arm_compute/core/CL/kernels/CLLKTrackerKernel.h
+++ /dev/null
@@ -1,240 +0,0 @@
-/*
- * Copyright (c) 2017-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CLLKTRACKERKERNEL_H
-#define ARM_COMPUTE_CLLKTRACKERKERNEL_H
-
-#include "arm_compute/core/CL/ICLArray.h"
-#include "arm_compute/core/CL/ICLKernel.h"
-#include "arm_compute/core/Types.h"
-
-#include <cstddef>
-#include <cstdint>
-
-namespace arm_compute
-{
-class ICLTensor;
-
-/** Internal keypoint structure for Lucas-Kanade Optical Flow */
-struct CLLKInternalKeypoint
-{
- float x{ 0.f }; /**< x coordinate of the keypoint */
- float y{ 0.f }; /**< y coordinate of the keypoint */
- float tracking_status{ 0.f }; /**< the tracking status of the keypoint */
- float dummy{ 0.f };           /**< Dummy field to make the data structure 128-bit aligned, so that the GPU can use vload4 */
-};
-
-/** Structure for storing Spatial Gradient Matrix and the minimum eigenvalue for each keypoint */
-struct CLCoefficientTable
-{
- float A11;     /**< iA11 * FLT_SCALE */
- float A12;     /**< iA12 * FLT_SCALE */
- float A22;     /**< iA22 * FLT_SCALE */
- float min_eig; /**< Minimum eigenvalue */
-};
-
-/** Structure for storing ival, ixval and iyval for each point inside the window */
-struct CLOldValue
-{
- int16_t ival;  /**< ival extracted from the old image */
- int16_t ixval; /**< ixval extracted from the Scharr Gx image */
- int16_t iyval; /**< iyval extracted from the Scharr Gy image */
- int16_t dummy; /**< Dummy field to make the data structure 128-bit aligned, so that the GPU can use vload4 */
-};
-
-/** Interface for OpenCL Array of Internal Key Points. */
-using ICLLKInternalKeypointArray = ICLArray<CLLKInternalKeypoint>;
-/** Interface for OpenCL Array of Coefficient Tables. */
-using ICLCoefficientTableArray = ICLArray<CLCoefficientTable>;
-/** Interface for OpenCL Array of Old Values. */
-using ICLOldValArray = ICLArray<CLOldValue>;
-
-/** Interface to run the initialization step of LKTracker */
-class CLLKTrackerInitKernel : public ICLKernel
-{
-public:
- /** Initialise the kernel input and output
- *
- * @param[in] old_points Pointer to the @ref ICLKeyPointArray storing old key points
- * @param[in] new_points_estimates Pointer to the @ref ICLKeyPointArray storing new estimates key points
- * @param[out] old_points_internal Pointer to the array of internal @ref CLLKInternalKeypoint old points
- * @param[out] new_points_internal Pointer to the array of internal @ref CLLKInternalKeypoint new points
- * @param[in] use_initial_estimate The flag to indicate whether the initial estimated position should be used
- * @param[in] level The pyramid level
- * @param[in] num_levels The number of pyramid levels
- * @param[in] pyramid_scale Scale factor used for generating the pyramid
- */
- void configure(const ICLKeyPointArray *old_points, const ICLKeyPointArray *new_points_estimates,
- ICLLKInternalKeypointArray *old_points_internal, ICLLKInternalKeypointArray *new_points_internal,
- bool use_initial_estimate, size_t level, size_t num_levels, float pyramid_scale);
- /** Initialise the kernel input and output
- *
- * @param[in] compile_context The compile context to be used.
- * @param[in] old_points Pointer to the @ref ICLKeyPointArray storing old key points
- * @param[in] new_points_estimates Pointer to the @ref ICLKeyPointArray storing new estimates key points
- * @param[out] old_points_internal Pointer to the array of internal @ref CLLKInternalKeypoint old points
- * @param[out] new_points_internal Pointer to the array of internal @ref CLLKInternalKeypoint new points
- * @param[in] use_initial_estimate The flag to indicate whether the initial estimated position should be used
- * @param[in] level The pyramid level
- * @param[in] num_levels The number of pyramid levels
- * @param[in] pyramid_scale Scale factor used for generating the pyramid
- */
- void configure(const CLCompileContext &compile_context, const ICLKeyPointArray *old_points, const ICLKeyPointArray *new_points_estimates,
- ICLLKInternalKeypointArray *old_points_internal, ICLLKInternalKeypointArray *new_points_internal,
- bool use_initial_estimate, size_t level, size_t num_levels, float pyramid_scale);
-
- // Inherited methods overridden:
- void run(const Window &window, cl::CommandQueue &queue) override;
-};
-
-/** Interface to run the finalize step of LKTracker, where it truncates the coordinates stored in new_points array */
-class CLLKTrackerFinalizeKernel : public ICLKernel
-{
-public:
- /** Initialise the kernel input and output
- *
- * @param[in] new_points_internal Pointer to the array of internal @ref CLLKInternalKeypoint new points
- * @param[out] new_points Pointer to the @ref ICLKeyPointArray storing new key points
- */
- void configure(ICLLKInternalKeypointArray *new_points_internal, ICLKeyPointArray *new_points);
- /** Initialise the kernel input and output
- *
- * @param[in] compile_context The compile context to be used.
- * @param[in] new_points_internal Pointer to the array of internal @ref CLLKInternalKeypoint new points
- * @param[out] new_points Pointer to the @ref ICLKeyPointArray storing new key points
- */
- void configure(const CLCompileContext &compile_context, ICLLKInternalKeypointArray *new_points_internal, ICLKeyPointArray *new_points);
-
- // Inherited methods overridden:
- void run(const Window &window, cl::CommandQueue &queue) override;
-};
-
-/** Interface to run the first stage of LKTracker, where A11, A12, A22, min_eig, ival, ixval and iyval are computed */
-class CLLKTrackerStage0Kernel : public ICLKernel
-{
-public:
- /** Default constructor */
- CLLKTrackerStage0Kernel();
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- CLLKTrackerStage0Kernel(const CLLKTrackerStage0Kernel &) = delete;
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- CLLKTrackerStage0Kernel &operator=(const CLLKTrackerStage0Kernel &) = delete;
- /** Allow instances of this class to be moved */
- CLLKTrackerStage0Kernel(CLLKTrackerStage0Kernel &&) = default;
- /** Allow instances of this class to be moved */
- CLLKTrackerStage0Kernel &operator=(CLLKTrackerStage0Kernel &&) = default;
- /** Initialise the kernel input and output
- *
- * @param[in] old_input Pointer to the input old tensor. Data types supported: U8
- * @param[in] old_scharr_gx Pointer to the input scharr X tensor. Data types supported: S16
- * @param[in] old_scharr_gy Pointer to the input scharr Y tensor. Data types supported: S16
- * @param[in] old_points_internal Pointer to the array of CLLKInternalKeypoint old points
- * @param[in, out] new_points_internal Pointer to the array of CLLKInternalKeypoint new points
- * @param[out] coeff_table Pointer to the array holding the Spatial Gradient coefficients
- * @param[out] old_ival Pointer to the array holding internal values
- * @param[in] window_dimension The size of the window on which to perform the algorithm
- * @param[in] level The pyramid level
- */
- void configure(const ICLTensor *old_input, const ICLTensor *old_scharr_gx, const ICLTensor *old_scharr_gy,
- ICLLKInternalKeypointArray *old_points_internal, ICLLKInternalKeypointArray *new_points_internal,
- ICLCoefficientTableArray *coeff_table, ICLOldValArray *old_ival,
- size_t window_dimension, size_t level);
- /** Initialise the kernel input and output
- *
- * @param[in] compile_context The compile context to be used.
- * @param[in] old_input Pointer to the input old tensor. Data types supported: U8
- * @param[in] old_scharr_gx Pointer to the input scharr X tensor. Data types supported: S16
- * @param[in] old_scharr_gy Pointer to the input scharr Y tensor. Data types supported: S16
- * @param[in] old_points_internal Pointer to the array of CLLKInternalKeypoint old points
- * @param[in, out] new_points_internal Pointer to the array of CLLKInternalKeypoint new points
- * @param[out] coeff_table Pointer to the array holding the Spatial Gradient coefficients
- * @param[out] old_ival Pointer to the array holding internal values
- * @param[in] window_dimension The size of the window on which to perform the algorithm
- * @param[in] level The pyramid level
- */
- void configure(const CLCompileContext &compile_context, const ICLTensor *old_input, const ICLTensor *old_scharr_gx, const ICLTensor *old_scharr_gy,
- ICLLKInternalKeypointArray *old_points_internal, ICLLKInternalKeypointArray *new_points_internal,
- ICLCoefficientTableArray *coeff_table, ICLOldValArray *old_ival,
- size_t window_dimension, size_t level);
-
- // Inherited methods overridden:
- void run(const Window &window, cl::CommandQueue &queue) override;
-
-private:
- const ICLTensor *_old_input;
- const ICLTensor *_old_scharr_gx;
- const ICLTensor *_old_scharr_gy;
-};
-
-/** Interface to run the second stage of LKTracker, where the motion vectors of the given points are computed */
-class CLLKTrackerStage1Kernel : public ICLKernel
-{
-public:
- /** Default constructor */
- CLLKTrackerStage1Kernel();
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- CLLKTrackerStage1Kernel(const CLLKTrackerStage1Kernel &) = delete;
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- CLLKTrackerStage1Kernel &operator=(const CLLKTrackerStage1Kernel &) = delete;
- /** Allow instances of this class to be moved */
- CLLKTrackerStage1Kernel(CLLKTrackerStage1Kernel &&) = default;
- /** Allow instances of this class to be moved */
- CLLKTrackerStage1Kernel &operator=(CLLKTrackerStage1Kernel &&) = default;
- /** Initialise the kernel input and output
- *
- * @param[in] new_input Pointer to the input new tensor. Data types supported: U8
- * @param[in, out] new_points_internal Pointer to the array of CLLKInternalKeypoint for new points
- * @param[in] coeff_table Pointer to the array holding the Spatial Gradient coefficients
- * @param[in] old_ival Pointer to the array holding internal values
- * @param[in] termination The criteria to terminate the search of each keypoint.
- * @param[in] epsilon The error for terminating the algorithm
- * @param[in] num_iterations The maximum number of iterations before terminating the algorithm
- * @param[in] window_dimension The size of the window on which to perform the algorithm
- * @param[in] level The pyramid level
- */
- void configure(const ICLTensor *new_input, ICLLKInternalKeypointArray *new_points_internal, ICLCoefficientTableArray *coeff_table, ICLOldValArray *old_ival,
- Termination termination, float epsilon, size_t num_iterations, size_t window_dimension, size_t level);
- /** Initialise the kernel input and output
- *
- * @param[in] compile_context The compile context to be used.
- * @param[in] new_input Pointer to the input new tensor. Data types supported: U8
- * @param[in, out] new_points_internal Pointer to the array of CLLKInternalKeypoint for new points
- * @param[in] coeff_table Pointer to the array holding the Spatial Gradient coefficients
- * @param[in] old_ival Pointer to the array holding internal values
- * @param[in] termination The criteria to terminate the search of each keypoint.
- * @param[in] epsilon The error for terminating the algorithm
- * @param[in] num_iterations The maximum number of iterations before terminating the algorithm
- * @param[in] window_dimension The size of the window on which to perform the algorithm
- * @param[in] level The pyramid level
- */
- void configure(const CLCompileContext &compile_context, const ICLTensor *new_input, ICLLKInternalKeypointArray *new_points_internal, ICLCoefficientTableArray *coeff_table, ICLOldValArray *old_ival,
- Termination termination, float epsilon, size_t num_iterations, size_t window_dimension, size_t level);
-
- // Inherited methods overridden:
- void run(const Window &window, cl::CommandQueue &queue) override;
-
-private:
- const ICLTensor *_new_input;
-};
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_CLLKTRACKERKERNEL_H */
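As the @ref CLCoefficientTable documentation indicates, stage 0 accumulates the Lucas-Kanade spatial gradient matrix and its minimum eigenvalue per keypoint. An illustrative CPU sketch of that accumulation; the fixed-point FLT_SCALE scaling is omitted and the helper names are hypothetical:

```cpp
#include <cmath>
#include <cstddef>
#include <cstdint>
#include <vector>

// Illustrative accumulation of the spatial gradient matrix [A11 A12; A12 A22]
// over a tracking window, plus its minimum eigenvalue (used to reject weak
// corners). `ix`/`iy` stand for the ixval/iyval Scharr samples stored in
// CLOldValue; both vectors are assumed to have the same length.
struct GradientMatrix
{
    float A11, A12, A22, min_eig;
};

GradientMatrix spatial_gradient(const std::vector<int16_t> &ix, const std::vector<int16_t> &iy)
{
    GradientMatrix m{ 0.f, 0.f, 0.f, 0.f };
    for(std::size_t i = 0; i < ix.size(); ++i)
    {
        m.A11 += static_cast<float>(ix[i]) * ix[i];
        m.A12 += static_cast<float>(ix[i]) * iy[i];
        m.A22 += static_cast<float>(iy[i]) * iy[i];
    }
    // Minimum eigenvalue of the symmetric 2x2 matrix: tr/2 - sqrt((tr/2)^2 - det)
    const float trace_half = (m.A11 + m.A22) * 0.5f;
    const float det        = m.A11 * m.A22 - m.A12 * m.A12;
    m.min_eig = trace_half - std::sqrt(trace_half * trace_half - det);
    return m;
}
```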
diff --git a/arm_compute/core/CL/kernels/CLLocallyConnectedMatrixMultiplyKernel.h b/arm_compute/core/CL/kernels/CLLocallyConnectedMatrixMultiplyKernel.h
deleted file mode 100644
index e68160f96d..0000000000
--- a/arm_compute/core/CL/kernels/CLLocallyConnectedMatrixMultiplyKernel.h
+++ /dev/null
@@ -1,85 +0,0 @@
-/*
- * Copyright (c) 2017-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CLLOCALLYCONNECTEDMATRIXMULTIPLYKERNEL_H
-#define ARM_COMPUTE_CLLOCALLYCONNECTEDMATRIXMULTIPLYKERNEL_H
-
-#include "arm_compute/core/CL/ICLKernel.h"
-
-namespace arm_compute
-{
-class ICLTensor;
-
-/** OpenCL kernel to multiply each row of the first tensor with the lowest 2 dimensions of the second tensor.
- *
- * @attention The second input tensor must have at least 2 dimensions (matrix)
- *
- */
-class CLLocallyConnectedMatrixMultiplyKernel : public ICLKernel
-{
-public:
- /** Default constructor */
- CLLocallyConnectedMatrixMultiplyKernel();
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- CLLocallyConnectedMatrixMultiplyKernel(const CLLocallyConnectedMatrixMultiplyKernel &) = delete;
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- CLLocallyConnectedMatrixMultiplyKernel &operator=(const CLLocallyConnectedMatrixMultiplyKernel &) = delete;
- /** Allow instances of this class to be moved */
- CLLocallyConnectedMatrixMultiplyKernel(CLLocallyConnectedMatrixMultiplyKernel &&) = default;
- /** Allow instances of this class to be moved */
- CLLocallyConnectedMatrixMultiplyKernel &operator=(CLLocallyConnectedMatrixMultiplyKernel &&) = default;
- /** Initialise the kernel's inputs and output
- *
- * @param[in] input0 First input tensor. Data types supported: F32
- * @param[in] input1 Second input tensor. Data type supported: same as @p input0
- * @param[out] output Output tensor to store the result. Data type supported: same as @p input0
- */
- void configure(const ICLTensor *input0, const ICLTensor *input1, ICLTensor *output);
- /** Initialise the kernel's inputs and output
- *
- * @param[in] compile_context The compile context to be used.
- * @param[in] input0 First input tensor. Data types supported: F32
- * @param[in] input1 Second input tensor. Data type supported: same as @p input0
- * @param[out] output Output tensor to store the result. Data type supported: same as @p input0
- */
- void configure(const CLCompileContext &compile_context, const ICLTensor *input0, const ICLTensor *input1, ICLTensor *output);
- /** Static function to check if given info will lead to a valid configuration of @ref CLLocallyConnectedMatrixMultiplyKernel
- *
- * @param[in] input0 First input tensor info. Data types supported: F32
- * @param[in] input1 Second input tensor info. Data type supported: same as @p input0
- * @param[in] output Output tensor info. Data type supported: same as @p input0
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *input0, const ITensorInfo *input1, const ITensorInfo *output);
-
- // Inherited methods overridden:
- void run(const Window &window, cl::CommandQueue &queue) override;
-
-private:
- const ICLTensor *_input0;
- const ICLTensor *_input1;
- ICLTensor *_output;
-};
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_CLLOCALLYCONNECTEDMATRIXMULTIPLYKERNEL_H */
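A CPU sketch of the product this kernel computes, under the reading that row m of input0 is multiplied by the m-th 2D matrix held in the lowest two dimensions of input1 (hence the @attention note above); shapes and the helper name are illustrative:

```cpp
#include <cstddef>
#include <vector>

// CPU sketch of the locally connected product: row m of input0 (1 x K) is
// multiplied by the m-th K x N matrix of input1, producing row m of the
// output (1 x N). All buffers are row-major; `out` is pre-sized to M * N.
void locally_connected_mm(const std::vector<float> &in0, // M x K
                          const std::vector<float> &in1, // M stacked K x N matrices
                          std::vector<float> &out,       // M x N
                          std::size_t M, std::size_t K, std::size_t N)
{
    for(std::size_t m = 0; m < M; ++m)
    {
        const float *w = &in1[m * K * N]; // weight matrix for this row/position
        for(std::size_t n = 0; n < N; ++n)
        {
            float acc = 0.f;
            for(std::size_t k = 0; k < K; ++k)
            {
                acc += in0[m * K + k] * w[k * N + n];
            }
            out[m * N + n] = acc;
        }
    }
}
```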
diff --git a/arm_compute/core/CL/kernels/CLMagnitudePhaseKernel.h b/arm_compute/core/CL/kernels/CLMagnitudePhaseKernel.h
deleted file mode 100644
index e0de3e7636..0000000000
--- a/arm_compute/core/CL/kernels/CLMagnitudePhaseKernel.h
+++ /dev/null
@@ -1,90 +0,0 @@
-/*
- * Copyright (c) 2016-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CLMAGNITUDEPHASEKERNEL_H
-#define ARM_COMPUTE_CLMAGNITUDEPHASEKERNEL_H
-
-#include "arm_compute/core/CL/ICLKernel.h"
-#include "arm_compute/core/Types.h"
-
-namespace arm_compute
-{
-class ICLTensor;
-
-/** Interface for the kernel to compute magnitude and phase.
- *
- */
-class CLMagnitudePhaseKernel : public ICLKernel
-{
-public:
- /** Default constructor. */
- CLMagnitudePhaseKernel();
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- CLMagnitudePhaseKernel(const CLMagnitudePhaseKernel &) = delete;
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- CLMagnitudePhaseKernel &operator=(const CLMagnitudePhaseKernel &) = delete;
- /** Allow instances of this class to be moved */
- CLMagnitudePhaseKernel(CLMagnitudePhaseKernel &&) = default;
- /** Allow instances of this class to be moved */
- CLMagnitudePhaseKernel &operator=(CLMagnitudePhaseKernel &&) = default;
- /** Initialise the kernel's input, output.
- *
- * @note At least one of magnitude or phase must be set.
- *
- * @param[in] gx The input gradient X tensor. Data types supported: S16.
- * @param[in] gy The input gradient Y tensor. Data types supported: S16.
- * @param[out] magnitude (Optional) The output tensor - Magnitude. Data types supported: S16.
- * @param[out] phase (Optional) The output tensor - Phase. Data types supported: U8.
- * @param[in] mag_type (Optional) Magnitude calculation type. Default: L2NORM.
- * @param[in] phase_type (Optional) Phase calculation type. Default: SIGNED.
- */
- void configure(const ICLTensor *gx, const ICLTensor *gy, ICLTensor *magnitude, ICLTensor *phase,
- MagnitudeType mag_type = MagnitudeType::L2NORM, PhaseType phase_type = PhaseType::SIGNED);
- /** Initialise the kernel's input, output.
- *
- * @note At least one of magnitude or phase must be set.
- *
- * @param[in] compile_context The compile context to be used.
- * @param[in] gx The input gradient X tensor. Data types supported: S16.
- * @param[in] gy The input gradient Y tensor. Data types supported: S16.
- * @param[out] magnitude (Optional) The output tensor - Magnitude. Data types supported: S16.
- * @param[out] phase (Optional) The output tensor - Phase. Data types supported: U8.
- * @param[in] mag_type (Optional) Magnitude calculation type. Default: L2NORM.
- * @param[in] phase_type (Optional) Phase calculation type. Default: SIGNED.
- */
- void configure(const CLCompileContext &compile_context, const ICLTensor *gx, const ICLTensor *gy, ICLTensor *magnitude, ICLTensor *phase,
- MagnitudeType mag_type = MagnitudeType::L2NORM, PhaseType phase_type = PhaseType::SIGNED);
-
- // Inherited methods overridden:
- void run(const Window &window, cl::CommandQueue &queue) override;
-
-private:
- const ICLTensor *_gx; /**< Input gradient X. */
- const ICLTensor *_gy; /**< Input gradient Y. */
- ICLTensor *_magnitude; /**< Output - Magnitude. */
- ICLTensor *_phase; /**< Output - Phase. */
- bool _run_mag; /**< Calculate magnitude ? */
- bool _run_phase; /**< Calculate phase ? */
-};
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_CLMAGNITUDEPHASEKERNEL_H */
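With the default MagnitudeType::L2NORM and PhaseType::SIGNED, the per-pixel math can be sketched as follows; the saturation and the exact angle-to-U8 quantization are assumptions made for illustration, not taken from the kernel:

```cpp
#include <algorithm>
#include <cmath>
#include <cstdint>

// Per-pixel sketch of the defaults above (MagnitudeType::L2NORM,
// PhaseType::SIGNED): magnitude = sqrt(gx^2 + gy^2) saturated to S16, and the
// signed angle mapped onto U8. The saturation and the [0, 360) -> [0, 256)
// quantization are illustrative assumptions.
void magnitude_phase(int16_t gx, int16_t gy, int16_t &magnitude, uint8_t &phase)
{
    const float fx  = static_cast<float>(gx);
    const float fy  = static_cast<float>(gy);
    const float mag = std::sqrt(fx * fx + fy * fy);         // L2NORM magnitude
    magnitude       = static_cast<int16_t>(std::min(mag, 32767.f));
    float angle = std::atan2(fy, fx) * 180.f / 3.14159265f; // signed angle in degrees
    if(angle < 0.f)
    {
        angle += 360.f; // wrap into [0, 360)
    }
    phase = static_cast<uint8_t>(angle * 256.f / 360.f);
}
```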
diff --git a/arm_compute/core/CL/kernels/CLMeanStdDevKernel.h b/arm_compute/core/CL/kernels/CLMeanStdDevKernel.h
deleted file mode 100644
index 96b4c4ea60..0000000000
--- a/arm_compute/core/CL/kernels/CLMeanStdDevKernel.h
+++ /dev/null
@@ -1,98 +0,0 @@
-/*
- * Copyright (c) 2016-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CLMEANSTDDEVKERNEL_H
-#define ARM_COMPUTE_CLMEANSTDDEVKERNEL_H
-
-#include "arm_compute/core/CL/ICLKernel.h"
-
-namespace cl
-{
-class Buffer;
-}
-
-namespace arm_compute
-{
-class ICLTensor;
-using ICLImage = ICLTensor;
-
-/** Interface for the kernel to calculate mean and standard deviation of input image pixels. */
-class CLMeanStdDevKernel : public ICLKernel
-{
-public:
- /** Default constructor */
- CLMeanStdDevKernel();
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- CLMeanStdDevKernel(const CLMeanStdDevKernel &) = delete;
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- CLMeanStdDevKernel &operator=(const CLMeanStdDevKernel &) = delete;
- /** Allow instances of this class to be moved */
- CLMeanStdDevKernel(CLMeanStdDevKernel &&) = default;
- /** Allow instances of this class to be moved */
- CLMeanStdDevKernel &operator=(CLMeanStdDevKernel &&) = default;
- /** Initialise the kernel's input and outputs.
- *
- * @param[in] input Input image. Data types supported: U8.
- * @param[out] mean               Output average pixel value.
- * @param[out] global_sum Keeps global sum of pixel values (Buffer size: 1 cl_ulong).
- * @param[out] stddev (Optional) Output standard deviation of pixel values.
- * @param[out] global_sum_squared (Optional if stddev is not set, required if stddev is set) Keeps global sum of squared pixel values (Buffer size: 1 cl_ulong).
- */
- void configure(const ICLImage *input, float *mean, cl::Buffer *global_sum, float *stddev = nullptr, cl::Buffer *global_sum_squared = nullptr);
- /** Initialise the kernel's input and outputs.
- *
- * @param[in] compile_context The compile context to be used.
- * @param[in] input Input image. Data types supported: U8.
- * @param[out] mean               Output average pixel value.
- * @param[out] global_sum Keeps global sum of pixel values (Buffer size: 1 cl_ulong).
- * @param[out] stddev (Optional) Output standard deviation of pixel values.
- * @param[out] global_sum_squared (Optional if stddev is not set, required if stddev is set) Keeps global sum of squared pixel values (Buffer size: 1 cl_ulong).
- */
- void configure(const CLCompileContext &compile_context, const ICLImage *input, float *mean, cl::Buffer *global_sum, float *stddev = nullptr, cl::Buffer *global_sum_squared = nullptr);
- /** Static function to check if given info will lead to a valid configuration of @ref CLMeanStdDevKernel.
- *
- * @param[in] input Input image info. Data types supported: U8.
- * @param[in]  mean               Output average pixel value.
- * @param[in] global_sum Keeps global sum of pixel values.
- * @param[in] stddev (Optional) Output standard deviation of pixel values.
- * @param[in] global_sum_squared (Optional if stddev is not set, required if stddev is set) Keeps global sum of squared pixel values.
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *input, float *mean, cl::Buffer *global_sum, float *stddev = nullptr, cl::Buffer *global_sum_squared = nullptr);
-
- // Inherited methods overridden:
- void run(const Window &window, cl::CommandQueue &queue) override;
-
- BorderSize border_size() const override;
-
-private:
- const ICLImage *_input;
- float *_mean;
- float *_stddev;
- cl::Buffer *_global_sum;
- cl::Buffer *_global_sum_squared;
- BorderSize _border_size;
-};
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_CLMEANSTDDEVKERNEL_H */
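The kernel's two cl_ulong accumulators feed the usual closed-form reduction, mean = sum / N and stddev = sqrt(sum_sq / N - mean^2); a CPU sketch of that reduction (illustrative helper, non-empty U8 image assumed):

```cpp
#include <cmath>
#include <cstdint>
#include <vector>

// CPU sketch of the reduction behind the kernel: accumulate the global sum of
// pixels (and of squared pixels when stddev is requested), then derive
// mean = sum / N and stddev = sqrt(sum_sq / N - mean^2).
void mean_stddev_u8(const std::vector<uint8_t> &image, float &mean, float &stddev)
{
    uint64_t sum = 0, sum_sq = 0; // the two cl_ulong global accumulators
    for(uint8_t px : image)
    {
        sum    += px;
        sum_sq += static_cast<uint64_t>(px) * px;
    }
    const float n = static_cast<float>(image.size());
    mean   = static_cast<float>(sum) / n;
    stddev = std::sqrt(static_cast<float>(sum_sq) / n - mean * mean);
}
```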
diff --git a/arm_compute/core/CL/kernels/CLMeanStdDevNormalizationKernel.h b/arm_compute/core/CL/kernels/CLMeanStdDevNormalizationKernel.h
deleted file mode 100644
index ff0c96e168..0000000000
--- a/arm_compute/core/CL/kernels/CLMeanStdDevNormalizationKernel.h
+++ /dev/null
@@ -1,90 +0,0 @@
-/*
- * Copyright (c) 2019-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CLMEANSTDDEVNORMALIZATIONKERNEL_H
-#define ARM_COMPUTE_CLMEANSTDDEVNORMALIZATIONKERNEL_H
-
-#include "arm_compute/core/CL/ICLKernel.h"
-
-namespace arm_compute
-{
-class ICLTensor;
-
-/** Interface for the kernel to normalize the input 2D tensor across the first dimension with respect to mean and standard deviation of the same dimension. */
-class CLMeanStdDevNormalizationKernel : public ICLKernel
-{
-public:
- /** Default constructor */
- CLMeanStdDevNormalizationKernel();
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- CLMeanStdDevNormalizationKernel(const CLMeanStdDevNormalizationKernel &) = delete;
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- CLMeanStdDevNormalizationKernel &operator=(const CLMeanStdDevNormalizationKernel &) = delete;
- /** Allow instances of this class to be moved */
- CLMeanStdDevNormalizationKernel(CLMeanStdDevNormalizationKernel &&) = default;
- /** Allow instances of this class to be moved */
- CLMeanStdDevNormalizationKernel &operator=(CLMeanStdDevNormalizationKernel &&) = default;
- /** Default destructor */
- ~CLMeanStdDevNormalizationKernel() = default;
- /** Initialise the kernel's input and outputs.
- *
- * @note If the output tensor is a nullptr, the normalization will be performed in-place.
- *
- * @param[in, out] input Source tensor with 2 dimensions. In case of @p output tensor = nullptr,
- * this tensor will store the result of the normalization. Data types supported: F16/F32.
- * @param[out] output (Optional) Destination tensor. It can be nullptr in case of in-place computation. Data type supported: same as @p input
- * @param[in] epsilon (Optional) Small float to avoid division by zero in case of zero standard deviation. Defaults to 1e-8.
- */
- void configure(ICLTensor *input, ICLTensor *output = nullptr, float epsilon = 1e-8f);
- /** Initialise the kernel's input and outputs.
- *
- * @note If the output tensor is a nullptr, the normalization will be performed in-place.
- *
- * @param[in] compile_context The compile context to be used.
- * @param[in, out] input Source tensor with 2 dimensions. In case of @p output tensor = nullptr,
- * this tensor will store the result of the normalization. Data types supported: F16/F32.
- * @param[out] output (Optional) Destination tensor. It can be nullptr in case of in-place computation. Data type supported: same as @p input
- * @param[in] epsilon (Optional) Small float to avoid division by zero in case of zero standard deviation. Defaults to 1e-8.
- */
- void configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output = nullptr, float epsilon = 1e-8f);
- /** Static function to check if given info will lead to a valid configuration of @ref CLMeanStdDevNormalizationKernel
- *
- * @param[in] input Source tensor info with 2 dimensions. In case of @p output tensor info = nullptr,
- * this tensor will store the result of the normalization. Data types supported: F16/F32.
- * @param[in] output (Optional) Destination tensor info. It can be nullptr in case of in-place computation. Data type supported: same as @p input
- * @param[in] epsilon (Optional) Small float to avoid division by zero in case of zero standard deviation. Defaults to 1e-8.
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *input, const ITensorInfo *output = nullptr, float epsilon = 1e-8f);
-
- // Inherited methods overridden:
- void run(const Window &window, cl::CommandQueue &queue) override;
-
-private:
- ICLTensor *_input;
- ICLTensor *_output;
- bool _run_in_place;
-};
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_CLMEANSTDDEVNORMALIZATIONKERNEL_H */
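The in-place path described in the notes above is the common use. A minimal sketch, assuming an initialised CL backend and an illustrative 128x8 F32 tensor:

#include "arm_compute/core/CL/kernels/CLMeanStdDevNormalizationKernel.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
#include "arm_compute/runtime/CL/CLTensor.h"

using namespace arm_compute;

int main()
{
    CLScheduler::get().default_init();

    CLTensor x; // 2D tensor: each row of 128 values is normalized independently
    x.allocator()->init(TensorInfo(TensorShape(128U, 8U), 1, DataType::F32));
    x.allocator()->allocate();

    CLMeanStdDevNormalizationKernel kernel;
    kernel.configure(&x); // output == nullptr -> in-place, epsilon defaults to 1e-8
    CLScheduler::get().enqueue(kernel);
    CLScheduler::get().sync();
    return 0;
}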
diff --git a/arm_compute/core/CL/kernels/CLMedian3x3Kernel.h b/arm_compute/core/CL/kernels/CLMedian3x3Kernel.h
deleted file mode 100644
index c68ab07781..0000000000
--- a/arm_compute/core/CL/kernels/CLMedian3x3Kernel.h
+++ /dev/null
@@ -1,59 +0,0 @@
-/*
- * Copyright (c) 2016-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CLMEDIAN3X3KERNEL_H
-#define ARM_COMPUTE_CLMEDIAN3X3KERNEL_H
-
-#include "arm_compute/core/CL/ICLSimple2DKernel.h"
-
-namespace arm_compute
-{
-class ICLTensor;
-
-/** Interface for the median 3x3 filter kernel. */
-class CLMedian3x3Kernel : public ICLSimple2DKernel
-{
-public:
- /** Initialise the kernel's input and output.
- *
- * @param[in] input An input tensor. Data types supported: U8
- * @param[out] output The output tensor. Data types supported: U8.
- * @param[in] border_undefined True if the border mode is undefined. False if it's replicate or constant.
- */
- void configure(const ICLTensor *input, ICLTensor *output, bool border_undefined);
- /** Initialise the kernel's input and output.
- *
- * @param[in] compile_context The compile context to be used.
- * @param[in] input An input tensor. Data types supported: U8
- * @param[out] output The output tensor. Data types supported: U8.
- * @param[in] border_undefined True if the border mode is undefined. False if it's replicate or constant.
- */
- void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, bool border_undefined);
-
- // Inherited methods overridden:
- BorderSize border_size() const override;
-};
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_CLMEDIAN3X3KERNEL_H */
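A minimal usage sketch; it opts into an undefined border so no separate border-fill pass is required (shapes are illustrative):

#include "arm_compute/core/CL/kernels/CLMedian3x3Kernel.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
#include "arm_compute/runtime/CL/CLTensor.h"

using namespace arm_compute;

int main()
{
    CLScheduler::get().default_init();

    CLTensor src, dst; // both U8, as documented above
    src.allocator()->init(TensorInfo(TensorShape(64U, 64U), 1, DataType::U8));
    dst.allocator()->init(TensorInfo(TensorShape(64U, 64U), 1, DataType::U8));
    src.allocator()->allocate();
    dst.allocator()->allocate();

    CLMedian3x3Kernel kernel;
    kernel.configure(&src, &dst, true); // border_undefined: skip the 1-pixel border
    CLScheduler::get().enqueue(kernel);
    return 0;
}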
diff --git a/arm_compute/core/CL/kernels/CLMemsetKernel.h b/arm_compute/core/CL/kernels/CLMemsetKernel.h
deleted file mode 100644
index 430bc1d4f2..0000000000
--- a/arm_compute/core/CL/kernels/CLMemsetKernel.h
+++ /dev/null
@@ -1,85 +0,0 @@
-/*
- * Copyright (c) 2018-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CLMEMSETKERNEL_H
-#define ARM_COMPUTE_CLMEMSETKERNEL_H
-
-#include "arm_compute/core/CL/ICLKernel.h"
-#include "arm_compute/core/PixelValue.h"
-#include "arm_compute/core/Types.h"
-
-namespace arm_compute
-{
-class ICLTensor;
-
-/** Interface for filling the planes of a tensor */
-class CLMemsetKernel : public ICLKernel
-{
-public:
- /** Default constructor */
- CLMemsetKernel();
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- CLMemsetKernel(const CLMemsetKernel &) = delete;
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- CLMemsetKernel &operator=(const CLMemsetKernel &) = delete;
- /** Allow instances of this class to be moved */
- CLMemsetKernel(CLMemsetKernel &&) = default;
- /** Allow instances of this class to be moved */
- CLMemsetKernel &operator=(CLMemsetKernel &&) = default;
- /** Default destructor */
- ~CLMemsetKernel() = default;
-
- /** Initialise the kernel's tensor and filling value
- *
- * @param[in,out] tensor Input tensor to fill. Supported data types: All.
- * @param[in] constant_value The value used to fill the planes of the tensor
- * @param[in]     window         Window to be used when setting only part of the tensor. Default is nullptr.
- */
- void configure(ICLTensor *tensor, const PixelValue &constant_value, Window *window = nullptr);
- /** Initialise the kernel's tensor and filling value
- *
- * @param[in] compile_context The compile context to be used.
- * @param[in,out] tensor Input tensor to fill. Supported data types: All.
- * @param[in] constant_value The value used to fill the planes of the tensor
- * @param[in]     window          Window to be used when setting only part of the tensor. Default is nullptr.
- */
- void configure(const CLCompileContext &compile_context, ICLTensor *tensor, const PixelValue &constant_value, Window *window = nullptr);
- /** Static function to check if given info will lead to a valid configuration of @ref CLMemsetKernel
- *
- * @param[in] tensor Source tensor info. Data types supported: All.
- * @param[in] constant_value The value used to fill the planes of the tensor
- * @param[in] window         Window to be used when setting only part of the tensor. Default is nullptr.
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *tensor, const PixelValue &constant_value, Window *window = nullptr);
-
- // Inherited methods overridden:
- void run(const Window &window, cl::CommandQueue &queue) override;
-
-private:
- ICLTensor *_tensor;
- Window _full_window;
-};
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_CLMEMSETKERNEL_H */
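A minimal sketch of filling a whole tensor; passing no window fills every plane, as documented above (shape and fill value are illustrative):

#include "arm_compute/core/CL/kernels/CLMemsetKernel.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
#include "arm_compute/runtime/CL/CLTensor.h"

using namespace arm_compute;

int main()
{
    CLScheduler::get().default_init();

    CLTensor t;
    t.allocator()->init(TensorInfo(TensorShape(16U, 16U), 1, DataType::F32));
    t.allocator()->allocate();

    CLMemsetKernel kernel;
    kernel.configure(&t, PixelValue(0.f)); // window == nullptr -> fill the whole tensor
    CLScheduler::get().enqueue(kernel);
    return 0;
}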
diff --git a/arm_compute/core/CL/kernels/CLMinMaxLayerKernel.h b/arm_compute/core/CL/kernels/CLMinMaxLayerKernel.h
deleted file mode 100644
index 5f9685f303..0000000000
--- a/arm_compute/core/CL/kernels/CLMinMaxLayerKernel.h
+++ /dev/null
@@ -1,87 +0,0 @@
-/*
- * Copyright (c) 2017-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CLMINMAXLAYERKERNEL_H
-#define ARM_COMPUTE_CLMINMAXLAYERKERNEL_H
-
-#include "arm_compute/core/CL/ICLKernel.h"
-
-namespace arm_compute
-{
-class ICLTensor;
-
-/** Interface for the kernel to perform min max search on a 3D tensor.
- */
-class CLMinMaxLayerKernel : public ICLKernel
-{
-public:
- /** Default constructor */
- CLMinMaxLayerKernel();
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- CLMinMaxLayerKernel(const CLMinMaxLayerKernel &) = delete;
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- CLMinMaxLayerKernel &operator=(const CLMinMaxLayerKernel &) = delete;
- /** Allow instances of this class to be moved */
- CLMinMaxLayerKernel(CLMinMaxLayerKernel &&) = default;
- /** Allow instances of this class to be moved */
- CLMinMaxLayerKernel &operator=(CLMinMaxLayerKernel &&) = default;
- /** Initialise the kernel's input and output.
- *
- * @param[in] input Input tensor with at least 3 dimensions. The dimensions over the third will be interpreted as batches. Data types supported: F32.
- * @param[out] output Output tensor with shape [2, batches, ...] which stores the minimum and maximum values for each 3D input tensor.
- * The dimensions over the second must match the batched dimensions of the input tensor. Data types supported: F32.
- */
- void configure(const ICLTensor *input, ICLTensor *output);
- /** Initialise the kernel's input and output.
- *
- * @param[in] compile_context The compile context to be used.
- * @param[in] input Input tensor with at least 3 dimensions. The dimensions over the third will be interpreted as batches. Data types supported: F32.
- * @param[out] output Output tensor with shape [2, batches, ...] which stores the minimum and maximum values for each 3D input tensor.
- * The dimensions over the second must match the batched dimensions of the input tensor. Data types supported: F32.
- */
- void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output);
- /** Static function to check if given info will lead to a valid configuration of @ref CLMinMaxLayerKernel
- *
- * @param[in] input Input tensor info. Data types supported: F32.
- * @param[in] output Output tensor info with shape [2, batches, ...] which stores the minimum and maximum values for each 3D input tensor.
- * The dimensions over the second must match the batched dimensions of the input tensor. Data types supported: F32.
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *input, const ITensorInfo *output);
-
- /** Resets global minimum and maximum
- *
- * @param[in,out] queue Command queue on which to map and unmap the min_max tensor
- */
- void reset(cl::CommandQueue &queue);
-
- // Inherited methods overridden:
- void run(const Window &window, cl::CommandQueue &queue) override;
-
-private:
- const ICLTensor *_input;
- ICLTensor *_output;
-};
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_CLMINMAXLAYERKERNEL_H */
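A minimal sketch for one batch; note the reset() call documented above, which must seed the min/max output before each run (shapes are illustrative):

#include "arm_compute/core/CL/kernels/CLMinMaxLayerKernel.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
#include "arm_compute/runtime/CL/CLTensor.h"

using namespace arm_compute;

int main()
{
    CLScheduler::get().default_init();

    CLTensor in, out;
    in.allocator()->init(TensorInfo(TensorShape(16U, 16U, 4U), 1, DataType::F32)); // one batch
    out.allocator()->init(TensorInfo(TensorShape(2U, 1U), 1, DataType::F32));      // [min, max] per batch
    in.allocator()->allocate();
    out.allocator()->allocate();

    CLMinMaxLayerKernel kernel;
    kernel.configure(&in, &out);
    kernel.reset(CLScheduler::get().queue()); // seed min/max before each run
    CLScheduler::get().enqueue(kernel);
    return 0;
}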
diff --git a/arm_compute/core/CL/kernels/CLMinMaxLocationKernel.h b/arm_compute/core/CL/kernels/CLMinMaxLocationKernel.h
deleted file mode 100644
index afb134fa59..0000000000
--- a/arm_compute/core/CL/kernels/CLMinMaxLocationKernel.h
+++ /dev/null
@@ -1,124 +0,0 @@
-/*
- * Copyright (c) 2017-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CLMINMAXLOCATIONKERNEL_H
-#define ARM_COMPUTE_CLMINMAXLOCATIONKERNEL_H
-
-#include "arm_compute/core/CL/ICLArray.h"
-#include "arm_compute/core/CL/ICLKernel.h"
-
-#include <array>
-
-namespace arm_compute
-{
-class ICLTensor;
-using ICLImage = ICLTensor;
-
-/** Interface for the kernel to perform min max search on an image.
- */
-class CLMinMaxKernel : public ICLKernel
-{
-public:
- /** Default constructor */
- CLMinMaxKernel();
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- CLMinMaxKernel(const CLMinMaxKernel &) = delete;
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- CLMinMaxKernel &operator=(const CLMinMaxKernel &) = delete;
- /** Allow instances of this class to be moved */
- CLMinMaxKernel(CLMinMaxKernel &&) = default;
- /** Allow instances of this class to be moved */
- CLMinMaxKernel &operator=(CLMinMaxKernel &&) = default;
- /** Initialise the kernel's input and output.
- *
- * @param[in] input Input Image. Data types supported: U8/S16/F32.
- * @param[out] min_max Buffer of 2 elements to store the min value at position 0 and the max value at position 1. Data type supported: S32 if input type is U8/S16, F32 if input type is F32.
- */
- void configure(const ICLImage *input, cl::Buffer *min_max);
- /** Initialise the kernel's input and output.
- *
- * @param[in] compile_context The compile context to be used.
- * @param[in] input Input Image. Data types supported: U8/S16/F32.
- * @param[out] min_max Buffer of 2 elements to store the min value at position 0 and the max value at position 1. Data type supported: S32 if input type is U8/S16, F32 if input type is F32.
- */
- void configure(const CLCompileContext &compile_context, const ICLImage *input, cl::Buffer *min_max);
-
- // Inherited methods overridden:
- void run(const Window &window, cl::CommandQueue &queue) override;
-
-private:
- const ICLTensor *_input; /**< Input image. */
- cl::Buffer *_min_max; /**< Minimum/maximum value. */
- std::array<int, 2> _data_type_max_min; /**< Maximum and minimum data type value respectively. */
-};
-
-/** Interface for the kernel to find min max locations of an image.
- */
-class CLMinMaxLocationKernel : public ICLKernel
-{
-public:
- /** Constructor */
- CLMinMaxLocationKernel();
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- CLMinMaxLocationKernel(const CLMinMaxLocationKernel &) = delete;
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- CLMinMaxLocationKernel &operator=(const CLMinMaxLocationKernel &) = delete;
- /** Allow instances of this class to be moved */
- CLMinMaxLocationKernel(CLMinMaxLocationKernel &&) = default;
- /** Allow instances of this class to be moved */
- CLMinMaxLocationKernel &operator=(CLMinMaxLocationKernel &&) = default;
- /** Initialise the kernel's input and outputs.
- *
- * @note When locations of min and max occurrences are requested, the reported number of locations is limited to the given array size.
- *
- * @param[in] input Input image. Data types supported: U8/S16/F32.
- * @param[out] min_max Buffer of 2 elements to store the min value at position 0 and the max value at position 1. Data type supported: S32 if input type is U8/S16, F32 if input type is F32.
- * @param[out] min_max_count Buffer of 2 elements to store the min value occurrences at position 0 and the max value occurrences at position 1. Data type supported: S32
- * @param[out] min_loc (Optional) Array of Coordinates2D used to store minimum value locations.
- * @param[out] max_loc (Optional) Array of Coordinates2D used to store maximum value locations.
- */
- void configure(const ICLImage *input, cl::Buffer *min_max, cl::Buffer *min_max_count,
- ICLCoordinates2DArray *min_loc = nullptr, ICLCoordinates2DArray *max_loc = nullptr);
- /** Initialise the kernel's input and outputs.
- *
- * @note When locations of min and max occurrences are requested, the reported number of locations is limited to the given array size.
- *
- * @param[in] compile_context The compile context to be used.
- * @param[in] input Input image. Data types supported: U8/S16/F32.
- * @param[out] min_max Buffer of 2 elements to store the min value at position 0 and the max value at position 1. Data type supported: S32 if input type is U8/S16, F32 if input type is F32.
- * @param[out] min_max_count Buffer of 2 elements to store the min value occurrences at position 0 and the max value occurrences at position 1. Data type supported: S32
- * @param[out] min_loc (Optional) Array of Coordinates2D used to store minimum value locations.
- * @param[out] max_loc (Optional) Array of Coordinates2D used to store maximum value locations.
- */
- void configure(const CLCompileContext &compile_context, const ICLImage *input, cl::Buffer *min_max, cl::Buffer *min_max_count,
- ICLCoordinates2DArray *min_loc = nullptr, ICLCoordinates2DArray *max_loc = nullptr);
-
- // Inherited methods overridden:
- void run(const Window &window, cl::CommandQueue &queue) override;
-
-private:
- const ICLImage *_input; /**< Input image. */
- cl::Buffer *_min_max_count; /**< Minimum/maximum value occurrences. */
-};
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_CLMINMAXLOCATIONKERNEL_H */
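The two kernels above are meant to run back to back: CLMinMaxKernel produces the min/max pair that CLMinMaxLocationKernel then consumes. A minimal sketch; the S32 buffer sizes follow the two-element layout documented above, and the array capacity of 64 is an illustrative cap on reported locations:

#include "arm_compute/core/CL/kernels/CLMinMaxLocationKernel.h"
#include "arm_compute/runtime/CL/CLArray.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
#include "arm_compute/runtime/CL/CLTensor.h"

using namespace arm_compute;

int main()
{
    CLScheduler::get().default_init();
    cl::Context &ctx = CLScheduler::get().context();

    CLTensor img;
    img.allocator()->init(TensorInfo(TensorShape(64U, 64U), 1, DataType::U8));
    img.allocator()->allocate();

    cl::Buffer min_max(ctx, CL_MEM_ALLOC_HOST_PTR, 2 * sizeof(cl_int));       // [min, max], S32 for U8 input
    cl::Buffer min_max_count(ctx, CL_MEM_ALLOC_HOST_PTR, 2 * sizeof(cl_int)); // [#min, #max]
    CLCoordinates2DArray min_loc(64), max_loc(64); // capacity limits the reported locations

    CLMinMaxKernel find_min_max;
    find_min_max.configure(&img, &min_max);
    CLMinMaxLocationKernel find_locations;
    find_locations.configure(&img, &min_max, &min_max_count, &min_loc, &max_loc);

    CLScheduler::get().enqueue(find_min_max);
    CLScheduler::get().enqueue(find_locations);
    return 0;
}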
diff --git a/arm_compute/core/CL/kernels/CLNonLinearFilterKernel.h b/arm_compute/core/CL/kernels/CLNonLinearFilterKernel.h
deleted file mode 100644
index 1f337356e9..0000000000
--- a/arm_compute/core/CL/kernels/CLNonLinearFilterKernel.h
+++ /dev/null
@@ -1,77 +0,0 @@
-/*
- * Copyright (c) 2017-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CLNONLINEARFILTERKERNEL_H
-#define ARM_COMPUTE_CLNONLINEARFILTERKERNEL_H
-
-#include "arm_compute/core/CL/ICLSimple2DKernel.h"
-#include "arm_compute/core/Types.h"
-
-#include <cstdint>
-
-namespace arm_compute
-{
-class ICLTensor;
-
-/** Interface for the kernel to apply a non-linear filter */
-class CLNonLinearFilterKernel : public ICLSimple2DKernel
-{
-public:
- /** Default constructor */
- CLNonLinearFilterKernel();
- /** Set the source, destination and border mode of the kernel
- *
- * @param[in] input Source tensor. Data types supported: U8
- * @param[out] output Destination tensor. Data types supported: U8
- * @param[in] function Non linear function to perform
- * @param[in] mask_size Mask size. Supported sizes: 3, 5
- * @param[in] pattern Mask pattern
- * @param[in] mask             The given mask. Will be used only if @p pattern is set to PATTERN_OTHER
- * @param[in] border_undefined True if the border mode is undefined. False if it's replicate or constant.
- */
- void configure(const ICLTensor *input, ICLTensor *output, NonLinearFilterFunction function,
- unsigned int mask_size, MatrixPattern pattern, const uint8_t *mask,
- bool border_undefined);
- /** Set the source, destination and border mode of the kernel
- *
- * @param[in] compile_context The compile context to be used.
- * @param[in] input Source tensor. Data types supported: U8
- * @param[out] output Destination tensor. Data types supported: U8
- * @param[in] function Non linear function to perform
- * @param[in] mask_size Mask size. Supported sizes: 3, 5
- * @param[in] pattern Mask pattern
- * @param[in] mask             The given mask. Will be used only if @p pattern is set to PATTERN_OTHER
- * @param[in] border_undefined True if the border mode is undefined. False if it's replicate or constant.
- */
- void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, NonLinearFilterFunction function,
- unsigned int mask_size, MatrixPattern pattern, const uint8_t *mask,
- bool border_undefined);
-
- // Inherited methods overridden:
- BorderSize border_size() const override;
-
-private:
- BorderSize _border_size; /**< Border size */
-};
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_CLNONLINEARFILTERKERNEL_H */
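A minimal sketch of a 3x3 median over a BOX pattern; the mask is only read for PATTERN_OTHER, so a zeroed placeholder is passed (shapes are illustrative):

#include "arm_compute/core/CL/kernels/CLNonLinearFilterKernel.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
#include "arm_compute/runtime/CL/CLTensor.h"

using namespace arm_compute;

int main()
{
    CLScheduler::get().default_init();

    CLTensor src, dst;
    src.allocator()->init(TensorInfo(TensorShape(64U, 64U), 1, DataType::U8));
    dst.allocator()->init(TensorInfo(TensorShape(64U, 64U), 1, DataType::U8));
    src.allocator()->allocate();
    dst.allocator()->allocate();

    const uint8_t mask[9] = { 0 }; // ignored for BOX; required only for PATTERN_OTHER

    CLNonLinearFilterKernel kernel;
    kernel.configure(&src, &dst, NonLinearFilterFunction::MEDIAN, 3, MatrixPattern::BOX,
                     mask, true /* border_undefined */);
    CLScheduler::get().enqueue(kernel);
    return 0;
}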
diff --git a/arm_compute/core/CL/kernels/CLNonMaximaSuppression3x3Kernel.h b/arm_compute/core/CL/kernels/CLNonMaximaSuppression3x3Kernel.h
deleted file mode 100644
index a256bc798d..0000000000
--- a/arm_compute/core/CL/kernels/CLNonMaximaSuppression3x3Kernel.h
+++ /dev/null
@@ -1,60 +0,0 @@
-/*
- * Copyright (c) 2016-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CLNONMAXIMASUPPRESSION3x3KERNEL_H
-#define ARM_COMPUTE_CLNONMAXIMASUPPRESSION3x3KERNEL_H
-
-#include "arm_compute/core/CL/ICLSimple2DKernel.h"
-
-namespace arm_compute
-{
-class ICLTensor;
-
-/** Interface to perform Non-Maxima suppression over a 3x3 window using OpenCL
- *
- * @note Used by @ref CLFastCorners and @ref CLHarrisCorners
- */
-class CLNonMaximaSuppression3x3Kernel : public ICLSimple2DKernel
-{
-public:
- /** Initialise the kernel's sources, destinations and border mode.
- *
- * @param[in] input Source tensor. Data types supported: U8, F32. (Must be the same as the output tensor)
- * @param[out] output Destination tensor. Data types supported: U8, F32. (Must be the same as the input tensor)
- * @param[in] border_undefined True if the border mode is undefined. False if it's replicate or constant.
- */
- void configure(const ICLTensor *input, ICLTensor *output, bool border_undefined);
- /** Initialise the kernel's sources, destinations and border mode.
- *
- * @param[in] compile_context The compile context to be used.
- * @param[in] input Source tensor. Data types supported: U8, F32. (Must be the same as the output tensor)
- * @param[out] output Destination tensor. Data types supported: U8, F32. (Must be the same as the input tensor)
- * @param[in] border_undefined True if the border mode is undefined. False if it's replicate or constant.
- */
- void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, bool border_undefined);
-
- // Inherited methods overridden:
- BorderSize border_size() const override;
-};
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_CLNONMAXIMASUPPRESSION3x3KERNEL_H */
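A minimal sketch, e.g. for suppressing an F32 corner-response map as in the Harris use case noted above (shapes are illustrative):

#include "arm_compute/core/CL/kernels/CLNonMaximaSuppression3x3Kernel.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
#include "arm_compute/runtime/CL/CLTensor.h"

using namespace arm_compute;

int main()
{
    CLScheduler::get().default_init();

    CLTensor score, suppressed; // input and output must share a data type
    score.allocator()->init(TensorInfo(TensorShape(64U, 64U), 1, DataType::F32));
    suppressed.allocator()->init(TensorInfo(TensorShape(64U, 64U), 1, DataType::F32));
    score.allocator()->allocate();
    suppressed.allocator()->allocate();

    CLNonMaximaSuppression3x3Kernel kernel;
    kernel.configure(&score, &suppressed, true); // border left undefined
    CLScheduler::get().enqueue(kernel);
    return 0;
}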
diff --git a/arm_compute/core/CL/kernels/CLNormalizationLayerKernel.h b/arm_compute/core/CL/kernels/CLNormalizationLayerKernel.h
deleted file mode 100644
index 2511818ef2..0000000000
--- a/arm_compute/core/CL/kernels/CLNormalizationLayerKernel.h
+++ /dev/null
@@ -1,90 +0,0 @@
-/*
- * Copyright (c) 2017-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CLNORMALIZATIONLAYERKERNEL_H
-#define ARM_COMPUTE_CLNORMALIZATIONLAYERKERNEL_H
-
-#include "arm_compute/core/CL/ICLKernel.h"
-
-namespace arm_compute
-{
-class ICLTensor;
-
-/** Interface for the normalization layer kernel.
- */
-class CLNormalizationLayerKernel : public ICLKernel
-{
-public:
- /** Constructor */
- CLNormalizationLayerKernel();
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- CLNormalizationLayerKernel(const CLNormalizationLayerKernel &) = delete;
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- CLNormalizationLayerKernel &operator=(const CLNormalizationLayerKernel &) = delete;
- /** Default Move Constructor. */
- CLNormalizationLayerKernel(CLNormalizationLayerKernel &&) = default;
- /** Default move assignment operator */
- CLNormalizationLayerKernel &operator=(CLNormalizationLayerKernel &&) = default;
- /** Set the input and output tensors.
- *
- * @param[in] input Source tensor. 3 lower dims represent a single input with dimensions [width, height, IFM],
- * and an optional 4th dimension for batch of inputs. Data types supported: F16/F32. Data layouts supported: NCHW/NHWC.
- * @param[out] output Destination tensor. Output will have the same number of dimensions as input. Data types supported: same as @p input.
- * Data layouts supported: same as @p input.
- * @param[in] norm_info Normalization layer information like the normalization type, normalization size and other parameters.
- */
- void configure(const ICLTensor *input, ICLTensor *output, NormalizationLayerInfo norm_info);
- /** Set the input and output tensors.
- *
- * @param[in] compile_context The compile context to be used.
- * @param[in] input Source tensor. 3 lower dims represent a single input with dimensions [width, height, IFM],
- * and an optional 4th dimension for batch of inputs. Data types supported: F16/F32. Data layouts supported: NCHW/NHWC.
- * @param[out] output Destination tensor. Output will have the same number of dimensions as input. Data types supported: same as @p input.
- * Data layouts supported: same as @p input.
- * @param[in] norm_info Normalization layer information like the normalization type, normalization size and other parameters.
- */
- void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, NormalizationLayerInfo norm_info);
- /** Static function to check if given info will lead to a valid configuration of @ref CLNormalizationLayerKernel
- *
- * @param[in] input Source tensor. 3 lower dims represent a single input with dimensions [width, height, IFM],
- * and an optional 4th dimension for batch of inputs. Data types supported: F16/F32. Data layouts supported: NCHW/NHWC.
- * @param[in] output Destination tensor. Output will have the same number of dimensions as input. Data types supported: same as @p input.
- * Data layouts supported: same as @p input.
- * @param[in] norm_info Normalization layer information like the normalization type, normalization size and other parameters.
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *input, const ITensorInfo *output, NormalizationLayerInfo norm_info);
-
- // Inherited methods overridden:
- void run(const Window &window, cl::CommandQueue &queue) override;
- BorderSize border_size() const override;
-
-private:
- const ICLTensor *_input;
- ICLTensor *_output;
- BorderSize _border_size;
- bool _is_norm_across_width;
-};
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_CLNORMALIZATIONLAYERKERNEL_H */
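A minimal sketch of cross-map (LRN-style) normalization with the default 5-wide window; shapes are illustrative:

#include "arm_compute/core/CL/kernels/CLNormalizationLayerKernel.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
#include "arm_compute/runtime/CL/CLTensor.h"

using namespace arm_compute;

int main()
{
    CLScheduler::get().default_init();

    CLTensor src, dst; // [width, height, IFM] in NCHW layout
    src.allocator()->init(TensorInfo(TensorShape(16U, 16U, 8U), 1, DataType::F32));
    dst.allocator()->init(TensorInfo(TensorShape(16U, 16U, 8U), 1, DataType::F32));
    src.allocator()->allocate();
    dst.allocator()->allocate();

    CLNormalizationLayerKernel kernel;
    kernel.configure(&src, &dst, NormalizationLayerInfo(NormType::CROSS_MAP, 5));
    CLScheduler::get().enqueue(kernel);
    return 0;
}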
diff --git a/arm_compute/core/CL/kernels/CLNormalizePlanarYUVLayerKernel.h b/arm_compute/core/CL/kernels/CLNormalizePlanarYUVLayerKernel.h
deleted file mode 100644
index d247e1fddc..0000000000
--- a/arm_compute/core/CL/kernels/CLNormalizePlanarYUVLayerKernel.h
+++ /dev/null
@@ -1,94 +0,0 @@
-/*
- * Copyright (c) 2018-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CLNORMALIZEPLANARYUVLAYERKERNEL_H
-#define ARM_COMPUTE_CLNORMALIZEPLANARYUVLAYERKERNEL_H
-
-#include "arm_compute/core/CL/ICLKernel.h"
-
-namespace arm_compute
-{
-class ICLTensor;
-
-/** Interface for the NormalizePlanarYUV layer kernel. */
-class CLNormalizePlanarYUVLayerKernel : public ICLKernel
-{
-public:
- /** Constructor */
- CLNormalizePlanarYUVLayerKernel();
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- CLNormalizePlanarYUVLayerKernel(const CLNormalizePlanarYUVLayerKernel &) = delete;
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- CLNormalizePlanarYUVLayerKernel &operator=(const CLNormalizePlanarYUVLayerKernel &) = delete;
- /** Default Move Constructor. */
- CLNormalizePlanarYUVLayerKernel(CLNormalizePlanarYUVLayerKernel &&) = default;
- /** Default move assignment operator */
- CLNormalizePlanarYUVLayerKernel &operator=(CLNormalizePlanarYUVLayerKernel &&) = default;
- /** Default destructor */
- ~CLNormalizePlanarYUVLayerKernel() = default;
-
- /** Set the input and output tensors.
- *
- * @param[in] input Source tensor. 3 lower dimensions represent a single input with dimensions [width, height, channels].
- * Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32.
- * @param[out] output Destination tensor. Data type supported: same as @p input
- * @param[in] mean Mean values tensor. 1 dimension with size equal to the number of input channels. Data types supported: same as @p input
- * @param[in] std Standard deviation values tensor. 1 dimension with size equal to the number of input channels.
- * Data types supported: same as @p input
- */
- void configure(const ICLTensor *input, ICLTensor *output, const ICLTensor *mean, const ICLTensor *std);
- /** Set the input and output tensors.
- *
- * @param[in] compile_context The compile context to be used.
- * @param[in] input Source tensor. 3 lower dimensions represent a single input with dimensions [width, height, channels].
- * Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32.
- * @param[out] output Destination tensor. Data type supported: same as @p input
- * @param[in] mean Mean values tensor. 1 dimension with size equal to the number of input channels. Data types supported: same as @p input
- * @param[in] std Standard deviation values tensor. 1 dimension with size equal to the number of input channels.
- * Data types supported: same as @p input
- */
- void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, const ICLTensor *mean, const ICLTensor *std);
- /** Static function to check if given info will lead to a valid configuration of @ref CLNormalizePlanarYUVLayerKernel
- *
- * @param[in] input Source tensor info. 3 lower dimensions represent a single input with dimensions [width, height, channels].
- * Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32.
- * @param[in]  output Destination tensor info. Data type supported: same as @p input
- * @param[in] mean Mean values tensor info. 1 dimension with size equal to the number of input channels. Data types supported: same as @p input
- * @param[in] std Standard deviation values tensor info. 1 dimension with size equal to the number of input channels.
- * Data types supported: same as @p input
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *input, const ITensorInfo *output, const ITensorInfo *mean, const ITensorInfo *std);
-
- // Inherited methods overridden:
- void run(const Window &window, cl::CommandQueue &queue) override;
-
-private:
- const ICLTensor *_input;
- ICLTensor *_output;
- const ICLTensor *_mean;
- const ICLTensor *_std;
-};
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_CLNORMALIZEPLANARYUVLAYERKERNEL_H */
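A minimal sketch with per-channel mean and standard deviation vectors sized to the channel count, as the parameter docs above require (shapes are illustrative):

#include "arm_compute/core/CL/kernels/CLNormalizePlanarYUVLayerKernel.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
#include "arm_compute/runtime/CL/CLTensor.h"

using namespace arm_compute;

int main()
{
    CLScheduler::get().default_init();

    constexpr unsigned int channels = 3;
    CLTensor src, dst, mean, std_dev;
    src.allocator()->init(TensorInfo(TensorShape(32U, 32U, channels), 1, DataType::F32));
    dst.allocator()->init(TensorInfo(TensorShape(32U, 32U, channels), 1, DataType::F32));
    mean.allocator()->init(TensorInfo(TensorShape(channels), 1, DataType::F32));    // one value per channel
    std_dev.allocator()->init(TensorInfo(TensorShape(channels), 1, DataType::F32)); // one value per channel
    for(auto *t : { &src, &dst, &mean, &std_dev })
    {
        t->allocator()->allocate();
    }

    CLNormalizePlanarYUVLayerKernel kernel;
    kernel.configure(&src, &dst, &mean, &std_dev);
    CLScheduler::get().enqueue(kernel);
    return 0;
}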
diff --git a/arm_compute/core/CL/kernels/CLPadLayerKernel.h b/arm_compute/core/CL/kernels/CLPadLayerKernel.h
deleted file mode 100644
index 166c202335..0000000000
--- a/arm_compute/core/CL/kernels/CLPadLayerKernel.h
+++ /dev/null
@@ -1,97 +0,0 @@
-/*
- * Copyright (c) 2019-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CLPADLAYERKERNEL_H
-#define ARM_COMPUTE_CLPADLAYERKERNEL_H
-
-#include "arm_compute/core/CL/ICLKernel.h"
-#include "arm_compute/core/CL/ICLTensor.h"
-
-namespace arm_compute
-{
-class ICLTensor;
-
-/** Interface for the PadLayer function. */
-class CLPadLayerKernel : public ICLKernel
-{
-public:
- /** Default constructor */
- CLPadLayerKernel();
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- CLPadLayerKernel(const CLPadLayerKernel &) = delete;
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- CLPadLayerKernel &operator=(const CLPadLayerKernel &) = delete;
- /** Allow instances of this class to be moved */
- CLPadLayerKernel(CLPadLayerKernel &&) = default;
- /** Allow instances of this class to be moved */
- CLPadLayerKernel &operator=(CLPadLayerKernel &&) = default;
- /** Default destructor */
- ~CLPadLayerKernel() = default;
- /** Set the input and output tensor.
- *
- * @param[in] input Source tensor. Data types supported: U8, S8, QASYMM8, QASYMM8_SIGNED, U16, S16, U32, S32, F16, F32.
- * @param[out] output Output tensor. Data type supported: same as @p input
- * @param[in] padding The padding for each spatial dimension of the input tensor. The pair padding[i]
- * specifies the front and the end padding in the i-th dimension.
- * @param[in] constant_value (Optional) Constant value to be used for the padding.
- * @param[in] mode (Optional) Controls whether the padding should be filled with @p constant_value using CONSTANT,
- * or reflect the input, either including the border values (SYMMETRIC) or not (REFLECT).
- */
- void configure(const ICLTensor *input, ICLTensor *output, const PaddingList &padding, PixelValue constant_value = PixelValue(), PaddingMode mode = PaddingMode::CONSTANT);
- /** Set the input and output tensor.
- *
- * @param[in] compile_context The compile context to be used.
- * @param[in] input Source tensor. Data types supported: All.
- * @param[out] output Output tensor. Data type supported: same as @p input
- * @param[in] padding The padding for each spatial dimension of the input tensor. The pair padding[i]
- * specifies the front and the end padding in the i-th dimension.
- * @param[in] constant_value (Optional) Constant value to be used for the padding.
- * @param[in] mode (Optional) Controls whether the padding should be filled with @p constant_value using CONSTANT,
- * or reflect the input, either including the border values (SYMMETRIC) or not (REFLECT).
- */
- void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, const PaddingList &padding, PixelValue constant_value = PixelValue(),
- PaddingMode mode = PaddingMode::CONSTANT);
- /** Static function to check if given info will lead to a valid configuration of @ref CLPadLayerKernel
- *
- * @param[in] input Source tensor info. Data types supported: U8, S8, QASYMM8, QASYMM8_SIGNED, U16, S16, U32, S32, F16, F32.
- * @param[in] output Output tensor info. Data type supported: same as @p input
- * @param[in] padding The padding for each spatial dimension of the input tensor. The pair padding[i]
- * specifies the front and the end padding in the i-th dimension.
- * @param[in] constant_value (Optional) Constant value to be used for the padding.
- * @param[in] mode (Optional) Controls whether the padding should be filled with @p constant_value using CONSTANT,
- * or reflect the input, either including the border values (SYMMETRIC) or not (REFLECT).
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *input, const ITensorInfo *output, const PaddingList &padding, PixelValue constant_value = PixelValue(), PaddingMode mode = PaddingMode::CONSTANT);
-
- // Inherited methods overridden:
- void run(const Window &window, cl::CommandQueue &queue) override;
-
-private:
- const ICLTensor *_input;
- ICLTensor *_output;
- int _input_start_x;
- int _input_start_y;
- bool _4d_enabled;
-};
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_CLPADLAYERKERNEL_H */
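A minimal sketch of constant zero-padding; the output shape must already account for the requested padding, which is an assumption made explicit below:

#include "arm_compute/core/CL/kernels/CLPadLayerKernel.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
#include "arm_compute/runtime/CL/CLTensor.h"

using namespace arm_compute;

int main()
{
    CLScheduler::get().default_init();

    CLTensor src, dst;
    src.allocator()->init(TensorInfo(TensorShape(16U, 16U), 1, DataType::F32));
    // One element of front/back padding in x, two in y -> 18x20 output.
    const PaddingList padding = { { 1, 1 }, { 2, 2 } };
    dst.allocator()->init(TensorInfo(TensorShape(18U, 20U), 1, DataType::F32));
    src.allocator()->allocate();
    dst.allocator()->allocate();

    CLPadLayerKernel kernel;
    kernel.configure(&src, &dst, padding); // CONSTANT mode, zero fill by default
    CLScheduler::get().enqueue(kernel);
    return 0;
}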
diff --git a/arm_compute/core/CL/kernels/CLPermuteKernel.h b/arm_compute/core/CL/kernels/CLPermuteKernel.h
deleted file mode 100644
index 1a9240ef6b..0000000000
--- a/arm_compute/core/CL/kernels/CLPermuteKernel.h
+++ /dev/null
@@ -1,90 +0,0 @@
-/*
- * Copyright (c) 2018-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CLPERMUTEKERNEL_H
-#define ARM_COMPUTE_CLPERMUTEKERNEL_H
-
-#include "arm_compute/core/CL/ICLKernel.h"
-
-namespace arm_compute
-{
-class ICLTensor;
-
-/** OpenCL kernel to perform tensor permutation.
- *
- * Permutes a tensor according to a given permutation vector.
- */
-class CLPermuteKernel : public ICLKernel
-{
-public:
- /** Default constructor */
- CLPermuteKernel();
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- CLPermuteKernel(const CLPermuteKernel &) = delete;
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- CLPermuteKernel &operator=(const CLPermuteKernel &) = delete;
- /** Allow instances of this class to be moved */
- CLPermuteKernel(CLPermuteKernel &&) = default;
- /** Allow instances of this class to be moved */
- CLPermuteKernel &operator=(CLPermuteKernel &&) = default;
- /** Set the input and output of the kernel.
- *
- * @note Arbitrary permutation vectors are supported with rank not greater than 4
- *
- * @param[in] input The input tensor to permute. Data types supported: All.
- * @param[in] output The output tensor. Data types supported: Same as @p input
- * @param[in] perm Permutation vector
- */
- void configure(const ICLTensor *input, ICLTensor *output, const PermutationVector &perm);
- /** Set the input and output of the kernel.
- *
- * @note Arbitrary permutation vectors are supported with rank not greater than 4
- *
- * @param[in] compile_context The compile context to be used.
- * @param[in] input The input tensor to permute. Data types supported: All.
- * @param[in] output The output tensor. Data types supported: Same as @p input
- * @param[in] perm Permutation vector
- */
- void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, const PermutationVector &perm);
- /** Static function to check if given info will lead to a valid configuration of @ref CLPermuteKernel
- *
- * @note Arbitrary permutation vectors are supported with rank not greater than 4
- *
- * @param[in] input First tensor input info. Data types supported: All.
- * @param[in] output Output tensor info. Data types supported: same as @p input.
- * @param[in] perm Permutation vector
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *input, const ITensorInfo *output, const PermutationVector &perm);
-
- // Inherited methods overridden:
- void run(const Window &window, cl::CommandQueue &queue) override;
-
-private:
- const ICLTensor *_input;
- ICLTensor *_output;
- PermutationVector _perm;
-};
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_CLPERMUTEKERNEL_H */
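A minimal sketch of the common NCHW-to-NHWC rearrangement via PermutationVector(2, 0, 1); shapes are illustrative:

#include "arm_compute/core/CL/kernels/CLPermuteKernel.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
#include "arm_compute/runtime/CL/CLTensor.h"

using namespace arm_compute;

int main()
{
    CLScheduler::get().default_init();

    CLTensor src, dst;
    src.allocator()->init(TensorInfo(TensorShape(4U, 3U, 2U), 1, DataType::F32)); // [W, H, C]
    dst.allocator()->init(TensorInfo(TensorShape(2U, 4U, 3U), 1, DataType::F32)); // [C, W, H]
    src.allocator()->allocate();
    dst.allocator()->allocate();

    // PermutationVector(2, 0, 1) moves the channel dimension to the front (NCHW -> NHWC).
    CLPermuteKernel kernel;
    kernel.configure(&src, &dst, PermutationVector(2U, 0U, 1U));
    CLScheduler::get().enqueue(kernel);
    return 0;
}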
diff --git a/arm_compute/core/CL/kernels/CLPixelWiseMultiplicationKernel.h b/arm_compute/core/CL/kernels/CLPixelWiseMultiplicationKernel.h
deleted file mode 100644
index 52a09d9a49..0000000000
--- a/arm_compute/core/CL/kernels/CLPixelWiseMultiplicationKernel.h
+++ /dev/null
@@ -1,170 +0,0 @@
-/*
- * Copyright (c) 2016-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CLPIXELWISEMULTIPLICATIONKERNEL_H
-#define ARM_COMPUTE_CLPIXELWISEMULTIPLICATIONKERNEL_H
-
-#include "arm_compute/core/CL/ICLKernel.h"
-#include "arm_compute/core/Types.h"
-
-namespace arm_compute
-{
-// Forward declarations
-class ICLTensor;
-
-/** Interface for the pixelwise multiplication kernel. */
-class CLPixelWiseMultiplicationKernel : public ICLKernel
-{
-public:
- /** Default constructor.*/
- CLPixelWiseMultiplicationKernel();
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- CLPixelWiseMultiplicationKernel(const CLPixelWiseMultiplicationKernel &) = delete;
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- CLPixelWiseMultiplicationKernel &operator=(const CLPixelWiseMultiplicationKernel &) = delete;
- /** Allow instances of this class to be moved */
- CLPixelWiseMultiplicationKernel(CLPixelWiseMultiplicationKernel &&) = default;
- /** Allow instances of this class to be moved */
- CLPixelWiseMultiplicationKernel &operator=(CLPixelWiseMultiplicationKernel &&) = default;
- /** Initialise the kernel's input, output and border mode.
- *
- * @param[in] input1 An input tensor. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/F16/F32.
- * @param[in] input2 An input tensor. Data types supported: same as @p input1.
- * @param[out] output          The output tensor. Data types supported:
- * - U8, only if both input are U8
- * - QASYMM8, only if both inputs are QASYMM8
- * - QASYMM8_SIGNED, only if both inputs are QASYMM8_SIGNED
- * - S16
- * - QSYMM16, only if both inputs are QSYMM16
- * - S32, only if both inputs are QSYMM16
- * - F16
- * - F32
- * @param[in] scale Scale to apply after multiplication.
- * Scale must be positive and its value must be either 1/255 or 1/2^n where n is between 0 and 15.
- * @param[in] overflow_policy Overflow policy. Supported overflow policies: Wrap, Saturate
- * @param[in] rounding_policy Rounding policy. Supported rounding modes: to zero, to nearest even.
- * @param[in] act_info (Optional) Activation layer information in case of a fused activation.
- */
- void configure(const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output, float scale,
- ConvertPolicy overflow_policy, RoundingPolicy rounding_policy, const ActivationLayerInfo &act_info = ActivationLayerInfo());
- /** Initialise the kernel's input, output and border mode.
- *
- * @param[in] compile_context The compile context to be used.
- * @param[in] input1 An input tensor. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/F16/F32.
- * @param[in] input2 An input tensor. Data types supported: same as @p input1.
- * @param[out] output          The output tensor. Data types supported: same as @p input1. Note: U8 requires both inputs to be U8.
- * @param[in] scale Scale to apply after multiplication.
- * Scale must be positive and its value must be either 1/255 or 1/2^n where n is between 0 and 15.
- * @param[in] overflow_policy Overflow policy. Supported overflow policies: Wrap, Saturate
- * @param[in] rounding_policy Rounding policy. Supported rounding modes: to zero, to nearest even.
- * @param[in] act_info (Optional) Activation layer information in case of a fused activation.
- */
- void configure(const CLCompileContext &compile_context, const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output, float scale,
- ConvertPolicy overflow_policy, RoundingPolicy rounding_policy, const ActivationLayerInfo &act_info = ActivationLayerInfo());
- /** Static function to check if given info will lead to a valid configuration of @ref CLPixelWiseMultiplicationKernel
- *
- * @param[in] input1 An input tensor info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/F16/F32.
- * @param[in] input2 An input tensor info. Data types supported: same as @p input1.
- * @param[in] output          The output tensor info. Data types supported:
- * - U8, only if both input are U8
- * - QASYMM8, only if both inputs are QASYMM8
- * - QASYMM8_SIGNED, only if both inputs are QASYMM8_SIGNED
- * - S16
- * - QSYMM16, only if both inputs are QSYMM16
- * - S32, only if both inputs are QSYMM16
- * - F16
- * - F32
- * @param[in] scale Scale to apply after multiplication.
- * Scale must be positive and its value must be either 1/255 or 1/2^n where n is between 0 and 15.
- * @param[in] overflow_policy Overflow policy. Supported overflow policies: Wrap, Saturate
- * @param[in] rounding_policy Rounding policy. Supported rounding modes: to zero, to nearest even.
- * @param[in] act_info (Optional) Activation layer information in case of a fused activation.
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, float scale,
- ConvertPolicy overflow_policy, RoundingPolicy rounding_policy, const ActivationLayerInfo &act_info = ActivationLayerInfo());
-
- // Inherited methods overridden:
- void run(const Window &window, cl::CommandQueue &queue) override;
- BorderSize border_size() const override;
-
-private:
- const ICLTensor *_input1;
- const ICLTensor *_input2;
- ICLTensor *_output;
-};
-
-/** Interface for the complex pixelwise multiplication kernel. */
-class CLComplexPixelWiseMultiplicationKernel : public ICLKernel
-{
-public:
- /** Default constructor.*/
- CLComplexPixelWiseMultiplicationKernel();
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- CLComplexPixelWiseMultiplicationKernel(const CLComplexPixelWiseMultiplicationKernel &) = delete;
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- CLComplexPixelWiseMultiplicationKernel &operator=(const CLComplexPixelWiseMultiplicationKernel &) = delete;
- /** Allow instances of this class to be moved */
- CLComplexPixelWiseMultiplicationKernel(CLComplexPixelWiseMultiplicationKernel &&) = default;
- /** Allow instances of this class to be moved */
- CLComplexPixelWiseMultiplicationKernel &operator=(CLComplexPixelWiseMultiplicationKernel &&) = default;
- /** Initialise the kernel's input, output and border mode.
- *
- * @param[in] input1 An input tensor. Data types supported: F32. Number of channels supported: 2.
- * @param[in] input2 An input tensor. Data types supported: same as @p input1. Number of channels supported: same as @p input1.
- * @param[out] output   The output tensor. Data types supported: same as @p input1. Number of channels supported: same as @p input1.
- * @param[in] act_info (Optional) Activation layer information in case of a fused activation.
- */
- void configure(const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output, const ActivationLayerInfo &act_info = ActivationLayerInfo());
- /** Initialise the kernel's input, output and border mode.
- *
- * @param[in] compile_context The compile context to be used.
- * @param[in] input1 An input tensor. Data types supported: F32. Number of channels supported: 2.
- * @param[in] input2 An input tensor. Data types supported: same as @p input1. Number of channels supported: same as @p input1.
- * @param[out] output          The output tensor. Data types supported: same as @p input1. Number of channels supported: same as @p input1.
- * @param[in] act_info (Optional) Activation layer information in case of a fused activation.
- */
- void configure(const CLCompileContext &compile_context, const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output, const ActivationLayerInfo &act_info = ActivationLayerInfo());
- /** Static function to check if given info will lead to a valid configuration of @ref CLComplexPixelWiseMultiplicationKernel
- *
- * @param[in] input1 An input tensor info. Data types supported: F32. Number of channels supported: 2.
- * @param[in] input2 An input tensor info. Data types supported: same as @p input1. Number of channels supported: same as @p input1.
- * @param[in] output The output tensor info. Data types supported: same as @p input1. Number of channels supported: same as @p input1.
- * @param[in] act_info (Optional) Activation layer information in case of a fused activation.
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, const ActivationLayerInfo &act_info = ActivationLayerInfo());
-
- // Inherited methods overridden:
- void run(const Window &window, cl::CommandQueue &queue) override;
- BorderSize border_size() const override;
-
-private:
- const ICLTensor *_input1;
- const ICLTensor *_input2;
- ICLTensor *_output;
-};
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_CLPIXELWISEMULTIPLICATIONKERNEL_H */
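For reference, the complex multiplication kernel removed above followed the usual low-level ICLKernel flow: initialise tensor metadata, configure, allocate, then enqueue through the scheduler. A minimal sketch of that pre-removal pattern, with illustrative shapes (the 2-channel F32 layout encodes the real and imaginary parts):

#include "arm_compute/core/CL/kernels/CLPixelWiseMultiplicationKernel.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
#include "arm_compute/runtime/CL/CLTensor.h"

using namespace arm_compute;

int main()
{
    CLScheduler::get().default_init();

    const TensorInfo info(TensorShape(8U, 8U), 2, DataType::F32); // 2 channels: (real, imaginary)
    CLTensor a, b, out;
    a.allocator()->init(info);
    b.allocator()->init(info);
    out.allocator()->init(info);

    CLComplexPixelWiseMultiplicationKernel mul;
    mul.configure(&a, &b, &out);

    a.allocator()->allocate();
    b.allocator()->allocate();
    out.allocator()->allocate();

    CLScheduler::get().enqueue(mul); // executes over the kernel's configured window
    CLScheduler::get().sync();
    return 0;
}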
diff --git a/arm_compute/core/CL/kernels/CLPoolingLayerKernel.h b/arm_compute/core/CL/kernels/CLPoolingLayerKernel.h
deleted file mode 100644
index 395750440c..0000000000
--- a/arm_compute/core/CL/kernels/CLPoolingLayerKernel.h
+++ /dev/null
@@ -1,96 +0,0 @@
-/*
- * Copyright (c) 2017-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CLPOOLINGLAYERKERNEL_H
-#define ARM_COMPUTE_CLPOOLINGLAYERKERNEL_H
-
-#include "arm_compute/core/CL/ICLKernel.h"
-
-#include "arm_compute/core/Error.h"
-
-namespace arm_compute
-{
-class ICLTensor;
-
-/** Interface for the pooling layer kernel */
-class CLPoolingLayerKernel : public ICLKernel
-{
-public:
- /** Default constructor */
- CLPoolingLayerKernel();
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- CLPoolingLayerKernel(const CLPoolingLayerKernel &) = delete;
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- CLPoolingLayerKernel &operator=(const CLPoolingLayerKernel &) = delete;
- /** Allow instances of this class to be moved */
- CLPoolingLayerKernel(CLPoolingLayerKernel &&) = default;
- /** Allow instances of this class to be moved */
- CLPoolingLayerKernel &operator=(CLPoolingLayerKernel &&) = default;
- /** Default destructor */
- ~CLPoolingLayerKernel() = default;
-
- /** Set the input and output tensors.
- *
- *
- * @param[in] input Source tensor. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32.
- * @param[out] output Destination tensor. Data types supported: Same as @p input.
- * @param[in] pool_info Contains pooling operation information described in @ref PoolingLayerInfo.
- * @param[out] indices (Optional) The indices of the maximal values. Data type supported: U32.
- */
- void configure(const ICLTensor *input, ICLTensor *output, const PoolingLayerInfo &pool_info, ICLTensor *indices = nullptr);
- /** Set the input and output tensors.
- *
- *
- * @param[in] compile_context The compile context to be used.
- * @param[in] input Source tensor. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32.
- * @param[out] output Destination tensor. Data types supported: Same as @p input.
- * @param[in] pool_info Contains pooling operation information described in @ref PoolingLayerInfo.
- * @param[out] indices (Optional) The indices of the maximal values. Data type supported: U32.
- */
- void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, const PoolingLayerInfo &pool_info, ICLTensor *indices = nullptr);
- /** Static function to check if given info will lead to a valid configuration of @ref CLPoolingLayerKernel
- *
- * @param[in] input Source tensor info. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32.
- * @param[in] output Destination tensor info. Data types supported: Same as @p input.
- * @param[in] pool_info Contains pooling operation information described in @ref PoolingLayerInfo.
- * @param[in] indices (Optional) The indices of the maximal values. Data type supported: U32.
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *input, const ITensorInfo *output, const PoolingLayerInfo &pool_info, const ITensorInfo *indices = nullptr);
-
- // Inherited methods overridden:
- void run(const Window &window, cl::CommandQueue &queue) override;
- BorderSize border_size() const override;
-
-public:
- const ICLTensor *_input;
- ICLTensor *_output;
- ICLTensor *_indices;
- PoolingLayerInfo _pool_info;
- DataLayout _data_layout;
- BorderSize _border_size;
- unsigned int _num_elems_processed_per_iteration;
-};
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_CLPOOLINGLAYERKERNEL_H */
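The deleted pooling kernel's validate() contract can be checked against plain TensorInfo objects before any allocation. A sketch under assumed shapes; the PoolingLayerInfo arguments shown (2x2 max pooling, NCHW, stride 2) are illustrative and the constructor signature may differ between library versions:

#include "arm_compute/core/CL/kernels/CLPoolingLayerKernel.h"
#include "arm_compute/runtime/CL/CLScheduler.h"

#include <iostream>

using namespace arm_compute;

int main()
{
    CLScheduler::get().default_init(); // validate() may query the OpenCL device

    const TensorInfo src(TensorShape(8U, 8U, 16U), 1, DataType::F32);
    const TensorInfo dst(TensorShape(4U, 4U, 16U), 1, DataType::F32);
    const PoolingLayerInfo pool_info(PoolingType::MAX, 2, DataLayout::NCHW, PadStrideInfo(2, 2, 0, 0));

    const Status s = CLPoolingLayerKernel::validate(&src, &dst, pool_info);
    std::cout << (bool(s) ? "valid" : s.error_description()) << "\n";
    return 0;
}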
diff --git a/arm_compute/core/CL/kernels/CLPriorBoxLayerKernel.h b/arm_compute/core/CL/kernels/CLPriorBoxLayerKernel.h
deleted file mode 100644
index 5fd27d9233..0000000000
--- a/arm_compute/core/CL/kernels/CLPriorBoxLayerKernel.h
+++ /dev/null
@@ -1,99 +0,0 @@
-/*
- * Copyright (c) 2018-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CLPRIORBOXLAYERKERNEL_H
-#define ARM_COMPUTE_CLPRIORBOXLAYERKERNEL_H
-
-#include "arm_compute/core/CL/ICLKernel.h"
-
-namespace arm_compute
-{
-class ICLTensor;
-
-/** Interface for the PriorBox layer kernel. */
-class CLPriorBoxLayerKernel : public ICLKernel
-{
-public:
- /** Constructor */
- CLPriorBoxLayerKernel();
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- CLPriorBoxLayerKernel(const CLPriorBoxLayerKernel &) = delete;
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- CLPriorBoxLayerKernel &operator=(const CLPriorBoxLayerKernel &) = delete;
- /** Default Move Constructor. */
- CLPriorBoxLayerKernel(CLPriorBoxLayerKernel &&) = default;
- /** Default move assignment operator */
- CLPriorBoxLayerKernel &operator=(CLPriorBoxLayerKernel &&) = default;
- /** Default destructor */
- ~CLPriorBoxLayerKernel() = default;
-
- /** Set the input and output tensors.
- *
- * @param[in] input1 First source tensor. Data types supported: F32. Data layouts supported: NCHW/NHWC.
- * @param[in] input2 Second source tensor. Data types and layouts supported: same as @p input1
- * @param[out] output Destination tensor. Output dimensions are [W * H * num_priors * 4, 2]. Data types and layouts supported: same as @p input1
- * @param[in] info Prior box layer info.
- * @param[in] min Minimum prior box values
- * @param[in] max Maximum prior box values
- * @param[in] aspect_ratios Aspect ratio values
- */
- void configure(const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output, const PriorBoxLayerInfo &info, cl::Buffer *min, cl::Buffer *max, cl::Buffer *aspect_ratios);
- /** Set the input and output tensors.
- *
- * @param[in] compile_context The compile context to be used.
- * @param[in] input1 First source tensor. Data types supported: F32. Data layouts supported: NCHW/NHWC.
- * @param[in] input2 Second source tensor. Data types and layouts supported: same as @p input1
- * @param[out] output Destination tensor. Output dimensions are [W * H * num_priors * 4, 2]. Data types and layouts supported: same as @p input1
- * @param[in] info Prior box layer info.
- * @param[in] min Minimum prior box values
- * @param[in] max Maximum prior box values
- * @param[in] aspect_ratios Aspect ratio values
- */
- void configure(const CLCompileContext &compile_context, const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output, const PriorBoxLayerInfo &info, cl::Buffer *min, cl::Buffer *max,
- cl::Buffer *aspect_ratios);
- /** Static function to check if given info will lead to a valid configuration of @ref CLPriorBoxLayerKernel
- *
- * @param[in] input1 First source tensor info. Data types supported: F32. Data layouts supported: NCHW/NHWC.
- * @param[in] input2 Second source tensor info. Data types and layouts supported: same as @p input1
- * @param[in] output Destination tensor info. Output dimensions are [W * H * num_priors * 4, 2]. Data type supported: same as @p input1
- * @param[in] info Prior box layer info.
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, const PriorBoxLayerInfo &info);
-
- // Inherited methods overridden:
- void run(const Window &window, cl::CommandQueue &queue) override;
-
-private:
- const ICLTensor *_input1;
- const ICLTensor *_input2;
- ICLTensor *_output;
- PriorBoxLayerInfo _info;
- int _num_priors;
- cl::Buffer *_min;
- cl::Buffer *_max;
- cl::Buffer *_aspect_ratios;
-};
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_CLPRIORBOXLAYERKERNEL_H */
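validate() for the PriorBox kernel removed above takes only tensor infos and the layer descriptor, so it can be exercised without creating the cl::Buffer arguments that configure() needs. A sketch; the PriorBoxLayerInfo construction and the num_priors value are hypothetical, so check the descriptor's definition for the exact signature:

#include "arm_compute/core/CL/kernels/CLPriorBoxLayerKernel.h"
#include "arm_compute/runtime/CL/CLScheduler.h"

#include <iostream>

using namespace arm_compute;

int main()
{
    CLScheduler::get().default_init();

    const TensorInfo feature(TensorShape(16U, 16U, 256U), 1, DataType::F32);
    const TensorInfo image(TensorShape(300U, 300U, 3U), 1, DataType::F32);
    const PriorBoxLayerInfo info({ 30.f }, { 0.1f, 0.1f, 0.2f, 0.2f }, 0.5f); // hypothetical arguments

    const unsigned int num_priors = 1; // hypothetical; in practice derived from info
    const TensorInfo output(TensorShape(16U * 16U * num_priors * 4U, 2U), 1, DataType::F32);

    const Status s = CLPriorBoxLayerKernel::validate(&feature, &image, &output, info);
    std::cout << (bool(s) ? "valid" : s.error_description()) << "\n";
    return 0;
}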
diff --git a/arm_compute/core/CL/kernels/CLQLSTMLayerNormalizationKernel.h b/arm_compute/core/CL/kernels/CLQLSTMLayerNormalizationKernel.h
deleted file mode 100644
index 2d4707245f..0000000000
--- a/arm_compute/core/CL/kernels/CLQLSTMLayerNormalizationKernel.h
+++ /dev/null
@@ -1,88 +0,0 @@
-/*
- * Copyright (c) 2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CLQLSTMLAYERNORMALIZATIONKERNEL_H
-#define ARM_COMPUTE_CLQLSTMLAYERNORMALIZATIONKERNEL_H
-
-#include "arm_compute/core/CL/ICLKernel.h"
-
-namespace arm_compute
-{
-class ICLTensor;
-
-/** Interface for the kernel to do layer normalization. */
-class CLQLSTMLayerNormalizationKernel : public ICLKernel
-{
-public:
- /** Default constructor */
- CLQLSTMLayerNormalizationKernel();
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- CLQLSTMLayerNormalizationKernel(const CLQLSTMLayerNormalizationKernel &) = delete;
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- CLQLSTMLayerNormalizationKernel &operator=(const CLQLSTMLayerNormalizationKernel &) = delete;
- /** Allow instances of this class to be moved */
- CLQLSTMLayerNormalizationKernel(CLQLSTMLayerNormalizationKernel &&) = default;
- /** Allow instances of this class to be moved */
- CLQLSTMLayerNormalizationKernel &operator=(CLQLSTMLayerNormalizationKernel &&) = default;
- /** Default destructor */
- ~CLQLSTMLayerNormalizationKernel() = default;
- /** Initialise the kernel's input and outputs.
- *
- * @param[in] input Source tensor with 2 dimensions. Data types supported: QSYMM16.
- * @param[out] output Destination tensor. Data type supported: same as @p input
- * @param[in] weight Weight tensor. Data types supported: Same as @p input.
- * @param[in] bias Bias tensor. Data types supported: S32.
- *
- */
- void configure(const ICLTensor *input, ICLTensor *output, const ICLTensor *weight, const ICLTensor *bias);
- /** Initialise the kernel's input and outputs.
- *
- * @param[in] compile_context The compile context to be used.
- * @param[in] input Source tensor with 2 dimensions. Data types supported: QSYMM16.
- * @param[out] output Destination tensor. Data type supported: same as @p input
- * @param[in] weight Weight tensor. Data types supported: Same as @p input.
- * @param[in] bias Bias tensor. Data types supported: S32.
- */
- void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, const ICLTensor *weight, const ICLTensor *bias);
- /** Static function to check if given info will lead to a valid configuration of @ref CLQLSTMLayerNormalizationKernel
- *
- * @param[in] input Source tensor info with 2 dimensions. Data types supported: QSYMM16.
- * @param[in] output Destination tensor info. Data type supported: same as @p input
- * @param[in] weight Weight tensor info. Data types supported: Same as @p input.
- * @param[in] bias Bias tensor info. Data types supported: S32.
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *input, const ITensorInfo *output, const ITensorInfo *weight, const ITensorInfo *bias);
-
- // Inherited methods overridden:
- void run(const Window &window, cl::CommandQueue &queue) override;
-
-private:
- const ICLTensor *_input;
- const ICLTensor *_weight;
- const ICLTensor *_bias;
- ICLTensor *_output;
-};
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_CLQLSTMLAYERNORMALIZATIONKERNEL_H */
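The layer-normalization kernel deleted above pairs a QSYMM16 input and weight with an S32 bias. A validate() sketch under assumed shapes and an assumed symmetric scale of 1/32768:

#include "arm_compute/core/CL/kernels/CLQLSTMLayerNormalizationKernel.h"
#include "arm_compute/runtime/CL/CLScheduler.h"

#include <iostream>

using namespace arm_compute;

int main()
{
    CLScheduler::get().default_init();

    const QuantizationInfo qinfo(1.f / 32768.f);
    const TensorInfo input(TensorShape(128U, 2U), 1, DataType::QSYMM16, qinfo); // 2D: [features, batches]
    const TensorInfo weight(TensorShape(128U), 1, DataType::QSYMM16, qinfo);
    const TensorInfo bias(TensorShape(128U), 1, DataType::S32);
    const TensorInfo output(input); // same shape and type as the input

    const Status s = CLQLSTMLayerNormalizationKernel::validate(&input, &output, &weight, &bias);
    std::cout << (bool(s) ? "valid" : s.error_description()) << "\n";
    return 0;
}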
diff --git a/arm_compute/core/CL/kernels/CLQuantizationLayerKernel.h b/arm_compute/core/CL/kernels/CLQuantizationLayerKernel.h
deleted file mode 100644
index de30447e17..0000000000
--- a/arm_compute/core/CL/kernels/CLQuantizationLayerKernel.h
+++ /dev/null
@@ -1,86 +0,0 @@
-/*
- * Copyright (c) 2017-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CLQUANTIZATIONLAYERKERNEL_H
-#define ARM_COMPUTE_CLQUANTIZATIONLAYERKERNEL_H
-
-#include "arm_compute/core/CL/ICLKernel.h"
-
-namespace arm_compute
-{
-class ICLTensor;
-
-/** Interface for the quantization layer kernel.
- *
- * @note The implementation supports only 3D input tensors.
- */
-class CLQuantizationLayerKernel : public ICLKernel
-{
-public:
- /** Default constructor */
- CLQuantizationLayerKernel();
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- CLQuantizationLayerKernel(const CLQuantizationLayerKernel &) = delete;
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- CLQuantizationLayerKernel &operator=(const CLQuantizationLayerKernel &) = delete;
- /** Default Move Constructor. */
- CLQuantizationLayerKernel(CLQuantizationLayerKernel &&) = default;
- /** Default move assignment operator */
- CLQuantizationLayerKernel &operator=(CLQuantizationLayerKernel &&) = default;
- /** Default destructor */
- ~CLQuantizationLayerKernel() = default;
- /** Set the input and output.
- *
- * @param[in] input Source tensor. Data types supported: QASYMM8/QASYMM8_SIGNED/F32/F16.
- * @param[out] output Destination tensor with the same dimensions as @p input. Data types supported: QASYMM8/QASYMM8_SIGNED/QASYMM16.
- *
- * @note Output auto initialization is not supported by this kernel
- */
- void configure(const ICLTensor *input, ICLTensor *output);
- /** Set the input and output.
- *
- * @param[in] compile_context The compile context to be used.
- * @param[in] input Source tensor. Data types supported: QASYMM8/QASYMM8_SIGNED/F32/F16.
- * @param[out] output Destination tensor with the same dimensions as @p input. Data types supported: QASYMM8/QASYMM8_SIGNED/QASYMM16.
- *
- * @note Output auto initialization is not supported by this kernel
- */
- void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output);
- /** Static function to check if given info will lead to a valid configuration of @ref CLQuantizationLayerKernel
- *
- * @param[in] input Input tensor info. Data types supported: QASYMM8/QASYMM8_SIGNED/F32/F16.
- * @param[in] output Destination tensor info with the same dimensions as @p input. Data types supported: QASYMM8/QASYMM8_SIGNED/QASYMM16.
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *input, const ITensorInfo *output);
-
- // Inherited methods overridden:
- void run(const Window &window, cl::CommandQueue &queue) override;
-
-private:
- const ICLTensor *_input;
- ICLTensor *_output;
-};
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_CLQUANTIZATIONLAYERKERNEL_H */
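Because the quantization kernel above does not auto-initialize its output, the destination info must carry the full shape, data type and target QuantizationInfo up front. A validate() sketch with an assumed scale and offset:

#include "arm_compute/core/CL/kernels/CLQuantizationLayerKernel.h"
#include "arm_compute/runtime/CL/CLScheduler.h"

#include <iostream>

using namespace arm_compute;

int main()
{
    CLScheduler::get().default_init();

    const TensorInfo src(TensorShape(32U, 32U, 3U), 1, DataType::F32);
    const TensorInfo dst(TensorShape(32U, 32U, 3U), 1, DataType::QASYMM8, QuantizationInfo(0.5f, 10));

    const Status s = CLQuantizationLayerKernel::validate(&src, &dst);
    std::cout << (bool(s) ? "valid" : s.error_description()) << "\n";
    return 0;
}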
diff --git a/arm_compute/core/CL/kernels/CLROIAlignLayerKernel.h b/arm_compute/core/CL/kernels/CLROIAlignLayerKernel.h
deleted file mode 100644
index 30bdbb1844..0000000000
--- a/arm_compute/core/CL/kernels/CLROIAlignLayerKernel.h
+++ /dev/null
@@ -1,110 +0,0 @@
-/*
- * Copyright (c) 2018-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CLROIALIGNLAYERKERNEL_H
-#define ARM_COMPUTE_CLROIALIGNLAYERKERNEL_H
-
-#include "arm_compute/core/CL/ICLArray.h"
-#include "arm_compute/core/CL/ICLKernel.h"
-
-namespace arm_compute
-{
-class ICLTensor;
-
-/** Interface for the RoIAlign kernel.
- */
-class CLROIAlignLayerKernel : public ICLKernel
-{
-public:
- /** Constructor */
- CLROIAlignLayerKernel();
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- CLROIAlignLayerKernel(const CLROIAlignLayerKernel &) = delete;
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- CLROIAlignLayerKernel &operator=(const CLROIAlignLayerKernel &) = delete;
- /** Default Move Constructor. */
- CLROIAlignLayerKernel(CLROIAlignLayerKernel &&) = default;
- /** Default move assignment operator. */
- CLROIAlignLayerKernel &operator=(CLROIAlignLayerKernel &&) = default;
- /** Default destructor */
- ~CLROIAlignLayerKernel() = default;
-
- /** Set the input and output tensors.
- *
- * @param[in] input Source tensor. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32.
- * @param[in] rois ROIs tensor. A 2D tensor of size [5, N] (where N is the number of ROIs) containing the batch id and the top-left
- * and bottom-right corner coordinates of each ROI: [ batch_id, x1, y1, x2, y2 ].
- * Data types supported: QASYMM16 with scale of 0.125 and 0 offset if @p input is QASYMM8/QASYMM8_SIGNED, otherwise same as @p input
- * @param[out] output Destination tensor. Data types supported: Same as @p input.
- * @param[in] pool_info Contains pooling operation information described in @ref ROIPoolingLayerInfo.
- *
- * @note The x and y dimensions of @p output tensor must be the same as @p pool_info 's pooled
- * width and pooled height.
- * @note The z dimensions of @p output tensor and @p input tensor must be the same.
- * @note The fourth dimension of @p output tensor must be the same as the number of elements in @p rois array.
- */
- void configure(const ICLTensor *input, const ICLTensor *rois, ICLTensor *output, const ROIPoolingLayerInfo &pool_info);
- /** Set the input and output tensors.
- *
- * @param[in] compile_context The compile context to be used.
- * @param[in] input Source tensor. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32.
- * @param[in] rois ROIs tensor. A 2D tensor of size [5, N] (where N is the number of ROIs) containing the batch id and the top-left
- * and bottom-right corner coordinates of each ROI: [ batch_id, x1, y1, x2, y2 ].
- * Data types supported: QASYMM16 with scale of 0.125 and 0 offset if @p input is QASYMM8/QASYMM8_SIGNED, otherwise same as @p input
- * @param[out] output Destination tensor. Data types supported: Same as @p input.
- * @param[in] pool_info Contains pooling operation information described in @ref ROIPoolingLayerInfo.
- *
- * @note The x and y dimensions of @p output tensor must be the same as @p pool_info 's pooled
- * width and pooled height.
- * @note The z dimensions of @p output tensor and @p input tensor must be the same.
- * @note The fourth dimension of @p output tensor must be the same as the number of elements in @p rois array.
- */
- void configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *rois, ICLTensor *output, const ROIPoolingLayerInfo &pool_info);
- /** Static function to check if given info will lead to a valid configuration of @ref CLROIAlignLayerKernel
- *
- * @param[in] input Source tensor info. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32.
- * @param[in] rois ROIs tensor info. Data types supported: QASYMM16 with scale of 0.125 and 0 offset if @p input is QASYMM8/QASYMM8_SIGNED,
- * otherwise same as @p input
- * @param[in] output Destination tensor info. Data types supported: Same as @p input.
- * @param[in] pool_info Contains pooling operation information described in @ref ROIPoolingLayerInfo.
- *
- * @note The x and y dimensions of @p output tensor must be the same as @p pool_info 's pooled
- * width and pooled height.
- * @note The z dimensions of @p output tensor and @p input tensor must be the same.
- * @note The fourth dimension of @p output tensor must be the same as the number of elements in @p rois array.
- *
- * @return a Status
- */
- static Status validate(const ITensorInfo *input, const ITensorInfo *rois, ITensorInfo *output, const ROIPoolingLayerInfo &pool_info);
-
- // Inherited methods overridden:
- void run(const Window &window, cl::CommandQueue &queue) override;
-
-private:
- const ICLTensor *_input;
- ICLTensor *_output;
- const ICLTensor *_rois;
- ROIPoolingLayerInfo _pool_info;
-};
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_CLROIALIGNLAYERKERNEL_H*/
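The shape constraints spelled out in the notes above (pooled W/H taken from pool_info, matching Z, N from the ROI count) can be checked with validate(). A sketch under assumed dimensions; the ROIPoolingLayerInfo arguments (7x7 pooled size, 1/16 spatial scale) are illustrative:

#include "arm_compute/core/CL/kernels/CLROIAlignLayerKernel.h"
#include "arm_compute/runtime/CL/CLScheduler.h"

#include <iostream>

using namespace arm_compute;

int main()
{
    CLScheduler::get().default_init();

    const TensorInfo input(TensorShape(16U, 16U, 64U), 1, DataType::F32);
    const TensorInfo rois(TensorShape(5U, 4U), 1, DataType::F32);      // 4 ROIs: [batch_id, x1, y1, x2, y2]
    TensorInfo output(TensorShape(7U, 7U, 64U, 4U), 1, DataType::F32); // [pooled_w, pooled_h, C, num_rois]
    const ROIPoolingLayerInfo pool_info(7U, 7U, 0.0625f);

    const Status s = CLROIAlignLayerKernel::validate(&input, &rois, &output, pool_info);
    std::cout << (bool(s) ? "valid" : s.error_description()) << "\n";
    return 0;
}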
diff --git a/arm_compute/core/CL/kernels/CLROIPoolingLayerKernel.h b/arm_compute/core/CL/kernels/CLROIPoolingLayerKernel.h
deleted file mode 100644
index ea70a58188..0000000000
--- a/arm_compute/core/CL/kernels/CLROIPoolingLayerKernel.h
+++ /dev/null
@@ -1,92 +0,0 @@
-/*
- * Copyright (c) 2017-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CLROIPOOLINGLAYERKERNEL_H
-#define ARM_COMPUTE_CLROIPOOLINGLAYERKERNEL_H
-
-#include "arm_compute/core/CL/ICLKernel.h"
-
-#include "arm_compute/core/CL/ICLArray.h"
-
-namespace arm_compute
-{
-class ICLTensor;
-
-/** Interface for the ROI pooling layer kernel */
-class CLROIPoolingLayerKernel : public ICLKernel
-{
-public:
- /** Default constructor */
- CLROIPoolingLayerKernel();
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- CLROIPoolingLayerKernel(const CLROIPoolingLayerKernel &) = delete;
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- CLROIPoolingLayerKernel &operator=(const CLROIPoolingLayerKernel &) = delete;
- /** Allow instances of this class to be moved */
- CLROIPoolingLayerKernel(CLROIPoolingLayerKernel &&) = default;
- /** Allow instances of this class to be moved */
- CLROIPoolingLayerKernel &operator=(CLROIPoolingLayerKernel &&) = default;
- /** Default destructor */
- ~CLROIPoolingLayerKernel() = default;
-
- /** Set the input and output tensors.
- *
- * @param[in] input Source tensor. Data types supported: F16/F32.
- * @param[in] rois ROIs tensor. A 2D tensor of size [5, N] (where N is the number of ROIs) containing the batch id and the top-left
- * and bottom-right corner coordinates of each ROI: [ batch_id, x1, y1, x2, y2 ]. Data types supported: U16
- * @param[out] output Destination tensor. Data types supported: Same as @p input.
- * @param[in] pool_info Contains pooling operation information described in @ref ROIPoolingLayerInfo.
- *
- * @note The x and y dimensions of @p output tensor must be the same as @p pool_info 's pooled
- * width and pooled height.
- * @note The z dimensions of @p output tensor and @p input tensor must be the same.
- * @note The fourth dimension of @p output tensor must be the same as the number of elements in @p rois array.
- */
- void configure(const ICLTensor *input, const ICLTensor *rois, ICLTensor *output, const ROIPoolingLayerInfo &pool_info);
- /** Set the input and output tensors.
- *
- * @param[in] compile_context The compile context to be used.
- * @param[in] input Source tensor. Data types supported: F16/F32.
- * @param[in] rois ROIs tensor. A 2D tensor of size [5, N] (where N is the number of ROIs) containing the batch id and the top-left
- * and bottom-right corner coordinates of each ROI: [ batch_id, x1, y1, x2, y2 ]. Data types supported: U16
- * @param[out] output Destination tensor. Data types supported: Same as @p input.
- * @param[in] pool_info Contains pooling operation information described in @ref ROIPoolingLayerInfo.
- *
- * @note The x and y dimensions of @p output tensor must be the same as @p pool_info 's pooled
- * width and pooled height.
- * @note The z dimensions of @p output tensor and @p input tensor must be the same.
- * @note The fourth dimension of @p output tensor must be the same as the number of elements in @p rois array.
- */
- void configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *rois, ICLTensor *output, const ROIPoolingLayerInfo &pool_info);
-
- // Inherited methods overridden:
- void run(const Window &window, cl::CommandQueue &queue) override;
-
-private:
- const ICLTensor *_input;
- const ICLTensor *_rois;
- ICLTensor *_output;
- ROIPoolingLayerInfo _pool_info;
-};
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_CLROIPOOLINGLAYERKERNEL_H */
diff --git a/arm_compute/core/CL/kernels/CLRangeKernel.h b/arm_compute/core/CL/kernels/CLRangeKernel.h
deleted file mode 100644
index fc8db98bf9..0000000000
--- a/arm_compute/core/CL/kernels/CLRangeKernel.h
+++ /dev/null
@@ -1,92 +0,0 @@
-/*
- * Copyright (c) 2018-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CLRANGEKERNEL_H
-#define ARM_COMPUTE_CLRANGEKERNEL_H
-
-#include "arm_compute/core/CL/ICLKernel.h"
-#include "arm_compute/core/Types.h"
-
-namespace arm_compute
-{
-class ICLTensor;
-
-/** Kernel class for Range
- *
- * Range generates a 1-D tensor containing a sequence of numbers that begins at 'start' and extends by increments
- * of 'step' up to but not including 'end'.
- */
-class CLRangeKernel : public ICLKernel
-{
-public:
- /** Default constructor */
- CLRangeKernel();
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- CLRangeKernel(const CLRangeKernel &) = delete;
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- CLRangeKernel &operator=(const CLRangeKernel &) = delete;
- /** Allow instances of this class to be moved */
- CLRangeKernel(CLRangeKernel &&) = default;
- /** Allow instances of this class to be moved */
- CLRangeKernel &operator=(CLRangeKernel &&) = default;
- /** Default destructor */
- ~CLRangeKernel() = default;
- /** Initialize the kernel's output tensor, start, end and step of the sequence.
- *
- * @param[out] output Output tensor. Data types supported: U8/S8/QASYMM8/U16/S16/U32/S32/F16/F32.
- * @param[in] start The starting value of the sequence.
- * @param[in] end The ending value of the sequence (exclusive).
- * @param[in] step The gap between each pair of values in the sequence.
- */
- void configure(ICLTensor *output, float start, float end, float step);
- /** Initialize the kernel's output tensor, start, end and step of the sequence.
- *
- * @param[in] compile_context The compile context to be used.
- * @param[out] output Output tensor. Data types supported: U8/S8/QASYMM8/U16/S16/U32/S32/F16/F32.
- * @param[in] start The starting value of the sequence.
- * @param[in] end The ending value of the sequence (exclusive).
- * @param[in] step The gap between each pair of values in the sequence.
- */
- void configure(const CLCompileContext &compile_context, ICLTensor *output, float start, float end, float step);
- /** Static function to check if given info will lead to a valid configuration of @ref CLRangeKernel
- *
- * @param[in] output Output tensor info. Data types supported: U8/S8/QASYMM8/U16/S16/U32/S32/F16/F32.
- * @param[in] start The starting value of the sequence.
- * @param[in] end The ending value of the sequence (exclusive).
- * @param[in] step The gap between each pair of values in the sequence.
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *output, float start, float end, float step);
-
- // Inherited methods overridden:
- void run(const Window &window, cl::CommandQueue &queue) override;
-
-private:
- float _start; /**< Start of sequence */
- float _end; /**< End of sequence */
- float _step; /**< Increment/step value */
- ICLTensor *_output; /**< Destination tensor */
-};
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_CLRANGEKERNEL_H */
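A sequence with start s, end e and step d holds (e - s) / d values, so for start 0, end 10 and step 1 the 1-D output below must hold 10 elements. A minimal runnable sketch of the deleted kernel's flow:

#include "arm_compute/core/CL/kernels/CLRangeKernel.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
#include "arm_compute/runtime/CL/CLTensor.h"

using namespace arm_compute;

int main()
{
    CLScheduler::get().default_init();

    // Output sized to hold the sequence: (10 - 0) / 1 = 10 values.
    CLTensor out;
    out.allocator()->init(TensorInfo(TensorShape(10U), 1, DataType::F32));

    CLRangeKernel range;
    range.configure(&out, 0.f, 10.f, 1.f); // produces 0, 1, ..., 9

    out.allocator()->allocate();
    CLScheduler::get().enqueue(range);
    CLScheduler::get().sync();
    return 0;
}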
diff --git a/arm_compute/core/CL/kernels/CLReductionOperationKernel.h b/arm_compute/core/CL/kernels/CLReductionOperationKernel.h
deleted file mode 100644
index 0b0b4ae9b0..0000000000
--- a/arm_compute/core/CL/kernels/CLReductionOperationKernel.h
+++ /dev/null
@@ -1,99 +0,0 @@
-/*
- * Copyright (c) 2017-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CLREDUCTIONOPERATIONKERNEL_H
-#define ARM_COMPUTE_CLREDUCTIONOPERATIONKERNEL_H
-
-#include "arm_compute/core/CL/ICLKernel.h"
-#include "arm_compute/core/Types.h"
-
-namespace arm_compute
-{
-class ICLTensor;
-
-/** Interface for the reduction operation kernel
- */
-class CLReductionOperationKernel : public ICLKernel
-{
-public:
- /** Default constructor */
- CLReductionOperationKernel();
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- CLReductionOperationKernel(const CLReductionOperationKernel &) = delete;
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- CLReductionOperationKernel &operator=(const CLReductionOperationKernel &) = delete;
- /** Allow instances of this class to be moved */
- CLReductionOperationKernel(CLReductionOperationKernel &&) = default;
- /** Allow instances of this class to be moved */
- CLReductionOperationKernel &operator=(CLReductionOperationKernel &&) = default;
- /** Default destructor */
- ~CLReductionOperationKernel() = default;
-
- /** Set the input and output tensors.
- *
- * @param[in] input Source tensor. Data types supported: QASYMM8/QASYMM8_SIGNED/S32/F16/F32.
- * @param[out] output Destination tensor. Data types and data layouts supported: Same as @p input.
- * Output will have the same number of dimensions as input.
- * @param[in] axis Axis along which to reduce. Supported reduction axes: 0, 1, 2, 3
- * @param[in] op Reduction operation to perform. Operations supported: MEAN_SUM, PROD, SUM_SQUARE, SUM, MIN, MAX
- * @param[in] width (Optional) The width of the input image, required when reducing along the x-axis.
- */
- void configure(const ICLTensor *input, ICLTensor *output, unsigned int axis, ReductionOperation op, unsigned int width = 0);
- /** Set the input and output tensors.
- *
- * @param[in] compile_context The compile context to be used.
- * @param[in] input Source tensor. Data types supported: QASYMM8/QASYMM8_SIGNED/S32/F16/F32.
- * @param[out] output Destination tensor. Data types and data layouts supported: Same as @p input.
- * Output will have the same number of dimensions as input.
- * @param[in] axis Axis along which to reduce. Supported reduction axes: 0, 1, 2, 3
- * @param[in] op Reduction operation to perform. Operations supported: MEAN_SUM, PROD, SUM_SQUARE, SUM, MIN, MAX
- * @param[in] width (Optional) The width of the input image, required when reducing along the x-axis.
- */
- void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, unsigned int axis, ReductionOperation op, unsigned int width = 0);
-
- /** Static function to check if given info will lead to a valid configuration of @ref CLReductionOperationKernel.
- *
- * @param[in] input Source tensor info. Data types supported: QASYMM8/QASYMM8_SIGNED/S32/F16/F32.
- * @param[in] output Destination tensor info. Data types and data layouts supported: Same as @p input.
- * Output will have the same number of dimensions as input.
- * @param[in] axis Axis along which to reduce. Supported reduction axes: 0, 1, 2, 3
- * @param[in] op Reduction operation to perform. Operations supported: MEAN_SUM, PROD, SUM_SQUARE, SUM, MIN, MAX
- * @param[in] width (Optional) The width of the input image, required when reducing along the x-axis.
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *input, const ITensorInfo *output, unsigned int axis, ReductionOperation op, unsigned int width = 0);
-
- // Inherited methods overridden:
- void run(const Window &window, cl::CommandQueue &queue) override;
- BorderSize border_size() const override;
-
-private:
- const ICLTensor *_input;
- ICLTensor *_output;
- unsigned int _reduction_axis;
- ReductionOperation _op;
- BorderSize _border_size;
-};
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_CLREDUCTIONOPERATIONKERNEL_H */
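Reducing along an axis collapses that dimension to 1 while keeping the tensor's rank, as the notes above state. A validate() sketch summing the x-axis of an assumed [64, 16] F32 tensor:

#include "arm_compute/core/CL/kernels/CLReductionOperationKernel.h"
#include "arm_compute/runtime/CL/CLScheduler.h"

#include <iostream>

using namespace arm_compute;

int main()
{
    CLScheduler::get().default_init();

    const TensorInfo src(TensorShape(64U, 16U), 1, DataType::F32);
    const TensorInfo dst(TensorShape(1U, 16U), 1, DataType::F32); // axis 0 collapsed, rank preserved

    const Status s = CLReductionOperationKernel::validate(&src, &dst, 0, ReductionOperation::SUM);
    std::cout << (bool(s) ? "valid" : s.error_description()) << "\n";
    return 0;
}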
diff --git a/arm_compute/core/CL/kernels/CLRemapKernel.h b/arm_compute/core/CL/kernels/CLRemapKernel.h
deleted file mode 100644
index f3d1511905..0000000000
--- a/arm_compute/core/CL/kernels/CLRemapKernel.h
+++ /dev/null
@@ -1,81 +0,0 @@
-/*
- * Copyright (c) 2017-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CLREMAPKERNEL_H
-#define ARM_COMPUTE_CLREMAPKERNEL_H
-
-#include "arm_compute/core/CL/ICLKernel.h"
-#include "arm_compute/core/Types.h"
-
-namespace arm_compute
-{
-class ICLTensor;
-
-/** OpenCL kernel to perform a remap on a tensor */
-class CLRemapKernel : public ICLKernel
-{
-public:
- /** Default constructor */
- CLRemapKernel();
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- CLRemapKernel(const CLRemapKernel &) = delete;
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- CLRemapKernel &operator=(const CLRemapKernel &) = delete;
- /** Allow instances of this class to be moved */
- CLRemapKernel(CLRemapKernel &&) = default;
- /** Allow instances of this class to be moved */
- CLRemapKernel &operator=(CLRemapKernel &&) = default;
- /** Initialize the kernel's input, output and border mode.
- *
- * @param[in] input Source tensor. Data types supported: U8.
- * @param[in] map_x Map for X coordinates. Data types supported: F32.
- * @param[in] map_y Map for Y coordinates. Data types supported: F32.
- * @param[out] output Destination tensor. Data types supported: U8. All but the lowest two dimensions must be the same size as in the input tensor, i.e. remapping is only performed within the XY-plane.
- * @param[in] policy The interpolation type.
- * @param[in] border_undefined True if the border mode is undefined. False if it's replicate or constant.
- */
- void configure(const ICLTensor *input, const ICLTensor *map_x, const ICLTensor *map_y, ICLTensor *output, InterpolationPolicy policy, bool border_undefined);
- /** Initialize the kernel's input, output and border mode.
- *
- * @param[in] compile_context The compile context to be used.
- * @param[in] input Source tensor. Data types supported: U8.
- * @param[in] map_x Map for X coordinates. Data types supported: F32.
- * @param[in] map_y Map for Y coordinates. Data types supported: F32.
- * @param[out] output Destination tensor. Data types supported: U8. All but the lowest two dimensions must be the same size as in the input tensor, i.e. remapping is only performed within the XY-plane.
- * @param[in] policy The interpolation type.
- * @param[in] border_undefined True if the border mode is undefined. False if it's replicate or constant.
- */
- void configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *map_x, const ICLTensor *map_y, ICLTensor *output, InterpolationPolicy policy, bool border_undefined);
-
- // Inherited methods overridden:
- void run(const Window &window, cl::CommandQueue &queue) override;
- BorderSize border_size() const override;
-
-private:
- const ICLTensor *_input;
- ICLTensor *_output;
- const ICLTensor *_map_x;
- const ICLTensor *_map_y;
-};
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_CLREMAPKERNEL_H */
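The remap kernel pairs a U8 source with two F32 coordinate maps of the same XY extent. A configure() sketch under assumed shapes; filling the map tensors is omitted, and border_undefined is set to true so no explicit border fill is sketched here:

#include "arm_compute/core/CL/kernels/CLRemapKernel.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
#include "arm_compute/runtime/CL/CLTensor.h"

using namespace arm_compute;

int main()
{
    CLScheduler::get().default_init();

    CLTensor src, map_x, map_y, dst;
    src.allocator()->init(TensorInfo(TensorShape(64U, 64U), 1, DataType::U8));
    dst.allocator()->init(TensorInfo(TensorShape(64U, 64U), 1, DataType::U8));
    map_x.allocator()->init(TensorInfo(TensorShape(64U, 64U), 1, DataType::F32));
    map_y.allocator()->init(TensorInfo(TensorShape(64U, 64U), 1, DataType::F32));

    CLRemapKernel remap;
    remap.configure(&src, &map_x, &map_y, &dst, InterpolationPolicy::NEAREST_NEIGHBOR, true);

    src.allocator()->allocate();
    dst.allocator()->allocate();
    map_x.allocator()->allocate();
    map_y.allocator()->allocate();

    CLScheduler::get().enqueue(remap);
    CLScheduler::get().sync();
    return 0;
}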
diff --git a/arm_compute/core/CL/kernels/CLReorgLayerKernel.h b/arm_compute/core/CL/kernels/CLReorgLayerKernel.h
deleted file mode 100644
index 9c064858af..0000000000
--- a/arm_compute/core/CL/kernels/CLReorgLayerKernel.h
+++ /dev/null
@@ -1,90 +0,0 @@
-/*
- * Copyright (c) 2018-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CLREORGLAYERKERNEL_H
-#define ARM_COMPUTE_CLREORGLAYERKERNEL_H
-
-#include "arm_compute/core/CL/ICLKernel.h"
-#include "arm_compute/core/Types.h"
-
-namespace arm_compute
-{
-class ICLTensor;
-
-/** OpenCL kernel to perform a reorg layer */
-class CLReorgLayerKernel : public ICLKernel
-{
-public:
- /** Default constructor */
- CLReorgLayerKernel();
- /** Prevent instances of this class from being copied (As this class contains pointers). */
- CLReorgLayerKernel(const CLReorgLayerKernel &) = delete;
- /** Prevent instances of this class from being copied (As this class contains pointers). */
- CLReorgLayerKernel &operator=(const CLReorgLayerKernel &) = delete;
- /** Allow instances of this class to be moved */
- CLReorgLayerKernel(CLReorgLayerKernel &&) = default;
- /** Allow instances of this class to be moved */
- CLReorgLayerKernel &operator=(CLReorgLayerKernel &&) = default;
- /** Initialize the kernel's input and output.
- *
- * @param[in] input Source tensor. Data types supported: U8/S8/QASYMM8/QASYMM8_SIGNED/U16/S16/F16/U32/S32/F32.
- * @param[out] output Destination tensor with tensor shape:
- * [width_input / stride, height_input / stride, channels_input * stride * stride, batch_size]. This means the output has
- * the same number of input elements. Data types supported: same as @p input.
- * @param[in] stride Stride value to use for reorganizing the values in the output tensor.
- * It defines the spatial distance between 2 consecutive pixels in the x and y direction
- */
- void configure(const ICLTensor *input, ICLTensor *output, int32_t stride);
- /** Initialize the kernel's input and output.
- *
- * @param[in] compile_context The compile context to be used.
- * @param[in] input Source tensor. Data types supported: U8/S8/QASYMM8/QASYMM8_SIGNED/U16/S16/F16/U32/S32/F32.
- * @param[out] output Destination tensor with tensor shape:
- * [width_input / stride, height_input / stride, channels_input * stride * stride, batch_size]. This means the output has
- * the same number of input elements. Data types supported: same as @p input.
- * @param[in] stride Stride value to use for reorganizing the values in the output tensor.
- * It defines the spatial distance between 2 consecutive pixels in the x and y direction
- */
- void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, int32_t stride);
- /** Static function to check if given info will lead to a valid configuration of @ref CLReorgLayerKernel
- *
- * @param[in] input Source tensor. Data types supported: All.
- * @param[in] output Destination tensor with tensor shape:
- * [width_input / stride, height_input / stride, channels_input * stride * stride, batch_size]. This means the output has
- * the same number of input elements. Data types supported: same as @p input.
- * @param[in] stride Stride value to use for reorganizing the values in the output tensor.
- * It defines the spatial distance between 2 consecutive pixels in the x and y direction
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *input, const ITensorInfo *output, int32_t stride);
-
- // Inherited methods overridden:
- void run(const Window &window, cl::CommandQueue &queue) override;
-
-private:
- const ICLTensor *_input;
- ICLTensor *_output;
-};
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_CLREORGLAYERKERNEL_H */
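For stride s the reorg output shape is [W/s, H/s, C*s*s, N], which preserves the element count: an 8x8x4 input (256 elements) maps to 4x4x16 with stride 2. A validate() sketch of exactly that case:

#include "arm_compute/core/CL/kernels/CLReorgLayerKernel.h"
#include "arm_compute/runtime/CL/CLScheduler.h"

#include <iostream>

using namespace arm_compute;

int main()
{
    CLScheduler::get().default_init();

    const TensorInfo src(TensorShape(8U, 8U, 4U), 1, DataType::F32);  // 256 elements
    const TensorInfo dst(TensorShape(4U, 4U, 16U), 1, DataType::F32); // still 256 elements

    const Status s = CLReorgLayerKernel::validate(&src, &dst, 2);
    std::cout << (bool(s) ? "valid" : s.error_description()) << "\n";
    return 0;
}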
diff --git a/arm_compute/core/CL/kernels/CLReshapeLayerKernel.h b/arm_compute/core/CL/kernels/CLReshapeLayerKernel.h
deleted file mode 100644
index 3ea74114d0..0000000000
--- a/arm_compute/core/CL/kernels/CLReshapeLayerKernel.h
+++ /dev/null
@@ -1,81 +0,0 @@
-/*
- * Copyright (c) 2017-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CLRESHAPELAYERKERNEL_H
-#define ARM_COMPUTE_CLRESHAPELAYERKERNEL_H
-
-#include "arm_compute/core/CL/ICLKernel.h"
-#include "arm_compute/core/Types.h"
-
-namespace arm_compute
-{
-class ICLTensor;
-
-/** Interface for the kernel to perform tensor reshaping */
-class CLReshapeLayerKernel : public ICLKernel
-{
-public:
- /** Default constructor */
- CLReshapeLayerKernel();
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- CLReshapeLayerKernel(const CLReshapeLayerKernel &) = delete;
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- CLReshapeLayerKernel &operator=(const CLReshapeLayerKernel &) = delete;
- /** Allow instances of this class to be moved */
- CLReshapeLayerKernel(CLReshapeLayerKernel &&) = default;
- /** Allow instances of this class to be moved */
- CLReshapeLayerKernel &operator=(CLReshapeLayerKernel &&) = default;
- /** Default destructor */
- ~CLReshapeLayerKernel() = default;
- /** Set the input and output of the kernel
- *
- * @param[in] input Source tensor. Data type supported: All.
- * @param[out] output Destination tensor. Data type supported: Same as @p input
- */
- void configure(const ICLTensor *input, ICLTensor *output);
- /** Set the input and output of the kernel
- *
- * @param[in] compile_context The compile context to be used.
- * @param[in] input Source tensor. Data type supported: All.
- * @param[out] output Destination tensor. Data type supported: Same as @p input
- */
- void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output);
-
- /** Static function to check if given info will lead to a valid configuration of @ref CLReshapeLayerKernel
- *
- * @param[in] input Source tensor info. Data type supported: All
- * @param[in] output Destination tensor info. Data type supported: Same as @p input
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *input, const ITensorInfo *output);
-
- // Inherited methods overridden:
- void run(const Window &window, cl::CommandQueue &queue) override;
-
-private:
- const ICLTensor *_input; /**< Source tensor */
- ICLTensor *_output; /**< Destination tensor */
-};
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_CLRESHAPELAYERKERNEL_H */
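Reshape only re-indexes elements, so any output shape with the same element count is acceptable. A minimal runnable sketch collapsing an assumed 4x4 tensor to a flat 16-element one:

#include "arm_compute/core/CL/kernels/CLReshapeLayerKernel.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
#include "arm_compute/runtime/CL/CLTensor.h"

using namespace arm_compute;

int main()
{
    CLScheduler::get().default_init();

    CLTensor input, output;
    input.allocator()->init(TensorInfo(TensorShape(4U, 4U), 1, DataType::F32));
    output.allocator()->init(TensorInfo(TensorShape(16U), 1, DataType::F32)); // same element count

    CLReshapeLayerKernel reshape;
    reshape.configure(&input, &output);

    input.allocator()->allocate();
    output.allocator()->allocate();

    CLScheduler::get().enqueue(reshape);
    CLScheduler::get().sync();
    return 0;
}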
diff --git a/arm_compute/core/CL/kernels/CLReverseKernel.h b/arm_compute/core/CL/kernels/CLReverseKernel.h
deleted file mode 100644
index e8f4507969..0000000000
--- a/arm_compute/core/CL/kernels/CLReverseKernel.h
+++ /dev/null
@@ -1,84 +0,0 @@
-/*
- * Copyright (c) 2018-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CLREVERSEKERNEL_H
-#define ARM_COMPUTE_CLREVERSEKERNEL_H
-
-#include "arm_compute/core/CL/ICLKernel.h"
-
-namespace arm_compute
-{
-class ICLTensor;
-
-/** Interface for the reverse kernel */
-class CLReverseKernel : public ICLKernel
-{
-public:
- /** Default constructor */
- CLReverseKernel();
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- CLReverseKernel(const CLReverseKernel &) = delete;
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- CLReverseKernel &operator=(const CLReverseKernel &) = delete;
- /** Allow instances of this class to be moved */
- CLReverseKernel(CLReverseKernel &&) = default;
- /** Allow instances of this class to be moved */
- CLReverseKernel &operator=(CLReverseKernel &&) = default;
- /** Default destructor */
- ~CLReverseKernel() = default;
- /** Initialise the kernel's inputs and output
- *
- * @param[in] input Input tensor. Data types supported: All.
- * @param[out] output Output tensor. Data type supported: Same as @p input
- * @param[in] axis Axis tensor. Contains the indices of the dimensions to reverse. Data type supported: U32
- */
- void configure(const ICLTensor *input, ICLTensor *output, const ICLTensor *axis);
- /** Initialise the kernel's inputs and output
- *
- * @param[in] compile_context The compile context to be used.
- * @param[in] input Input tensor. Data types supported: All.
- * @param[out] output Output tensor. Data type supported: Same as @p input
- * @param[in] axis Axis tensor. Contains the indices of the dimensions to reverse. Data type supported: U32
- */
- void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, const ICLTensor *axis);
-
- /** Static function to check if given info will lead to a valid configuration of @ref CLReverseKernel
- *
- * @param[in] input Input tensor info. Data types supported: All.
- * @param[in] output Output tensor info. Data type supported: Same as @p input
- * @param[in] axis Axis tensor info. Contains the indices of the dimensions to reverse. Data type supported: U32
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *input, const ITensorInfo *output, const ITensorInfo *axis);
-
- // Inherited methods overridden:
- void run(const Window &window, cl::CommandQueue &queue) override;
-
-public:
- const ICLTensor *_input;
- ICLTensor *_output;
- const ICLTensor *_axis;
-};
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_CLREVERSEKERNEL_H */
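Unlike most kernels above, the reverse kernel takes its axes as a U32 tensor rather than a scalar, so the axis indices themselves live in device memory. A configure() sketch under assumed shapes; writing the axis values (e.g. via map()/unmap()) is omitted:

#include "arm_compute/core/CL/kernels/CLReverseKernel.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
#include "arm_compute/runtime/CL/CLTensor.h"

using namespace arm_compute;

int main()
{
    CLScheduler::get().default_init();

    CLTensor input, output, axis;
    input.allocator()->init(TensorInfo(TensorShape(8U, 8U), 1, DataType::F32));
    output.allocator()->init(TensorInfo(TensorShape(8U, 8U), 1, DataType::F32));
    axis.allocator()->init(TensorInfo(TensorShape(1U), 1, DataType::U32)); // one axis index to reverse

    CLReverseKernel reverse;
    reverse.configure(&input, &output, &axis);

    input.allocator()->allocate();
    output.allocator()->allocate();
    axis.allocator()->allocate();

    CLScheduler::get().enqueue(reverse);
    CLScheduler::get().sync();
    return 0;
}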
diff --git a/arm_compute/core/CL/kernels/CLScaleKernel.h b/arm_compute/core/CL/kernels/CLScaleKernel.h
deleted file mode 100644
index 328578d88c..0000000000
--- a/arm_compute/core/CL/kernels/CLScaleKernel.h
+++ /dev/null
@@ -1,98 +0,0 @@
-/*
- * Copyright (c) 2016-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CLSCALEKERNEL_H
-#define ARM_COMPUTE_CLSCALEKERNEL_H
-
-#include "arm_compute/core/CL/ICLSimple2DKernel.h"
-#include "arm_compute/core/Types.h"
-
-namespace arm_compute
-{
-class ICLTensor;
-
-/** Interface for the scale kernel */
-class CLScaleKernel : public ICLSimple2DKernel
-{
-public:
- /** Initialise the kernel's inputs, output and interpolation policy
- *
- * @param[in] input Source tensor. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/F16/F32
- * @param[out] output Destination tensor. Data types supported: Same as @p input
- * All but the lowest two dimensions must be the same size as in the input tensor, i.e. scaling is only performed within the XY-plane.
- * @param[in] policy Interpolation type to use
- * @param[in] border_mode Selected border mode.
- * @param[in] sampling_policy (Optional) Sampling policy used by the interpolation. Defaults to @ref SamplingPolicy::CENTER
- * @param[in] align_corners (Optional) Align corners of input and output, only affecting bilinear policy with TOP_LEFT sampling policy. Defaults to false.
- */
- void configure(const ICLTensor *input, ICLTensor *output, InterpolationPolicy policy, BorderMode border_mode, SamplingPolicy sampling_policy = SamplingPolicy::CENTER, bool align_corners = false);
- /** Initialise the kernel's inputs, output and interpolation policy
- *
- * @param[in] compile_context The compile context to be used.
- * @param[in] input Source tensor. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/F16/F32
- * @param[out] output Destination tensor. Data types supported: Same as @p input
- * All but the lowest two dimensions must be the same size as in the input tensor, i.e. scaling is only performed within the XY-plane.
- * @param[in] policy Interpolation type to use
- * @param[in] border_mode Selected border mode.
- * @param[in] sampling_policy (Optional) Sampling policy used by the interpolation. Defaults to @ref SamplingPolicy::CENTER
- * @param[in] align_corners (Optional) Align corners of input and output, only affecting bilinear policy with TOP_LEFT sampling policy. Defaults to false.
- */
- void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, InterpolationPolicy policy, BorderMode border_mode,
- SamplingPolicy sampling_policy = SamplingPolicy::CENTER, bool align_corners = false);
-
- /** Static function to check if given info will lead to a valid configuration of @ref CLScaleKernel
- *
- * @param[in] input Source tensor info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/F16/F32
- * @param[in] output Destination tensor info. Data types supported: Same as @p input
- * All but the lowest two dimensions must be the same size as in the input tensor, i.e. scaling is only performed within the XY-plane.
- * @param[in] policy Interpolation type to use
- * @param[in] border_mode Selected border mode.
- * @param[in] sampling_policy (Optional) Sampling policy used by the interpolation. Defaults to @ref SamplingPolicy::CENTER
- * @param[in] align_corners (Optional) Align corners of input and output, only affecting bilinear policy with TOP_LEFT sampling policy. Defaults to false.
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *input, const ITensorInfo *output, InterpolationPolicy policy, BorderMode border_mode, SamplingPolicy sampling_policy = SamplingPolicy::CENTER,
- bool align_corners = false);
- /** Input tensor accessor.
- *
- * @return Pointer to input tensor.
- */
- const ICLTensor *input() const;
- /** Output tensor accessor.
- *
- * @return Pointer to output tensor.
- */
- const ICLTensor *output() const;
-
- // Inherited methods overridden:
- BorderSize border_size() const override;
- void run(const Window &window, cl::CommandQueue &queue) override;
-
-public:
- InterpolationPolicy _interpolationPolicy = InterpolationPolicy::BILINEAR;
- DataLayout _data_layout = DataLayout::UNKNOWN;
- bool _align_corners = false;
-};
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_CLSCALEKERNEL_H */
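A sketch of how the scale kernel's configure() maps onto a bilinear resize; src/dst are assumed initialised and allocated elsewhere, and the destination shape encodes the scale factor:

    #include "arm_compute/core/CL/kernels/CLScaleKernel.h"
    #include "arm_compute/runtime/CL/CLScheduler.h"

    using namespace arm_compute;

    // Only the XY-plane of dst may differ in size from src, per the contract
    // documented above.
    void scale_bilinear(ICLTensor *src, ICLTensor *dst)
    {
        CLScaleKernel scale;
        scale.configure(src, dst, InterpolationPolicy::BILINEAR, BorderMode::REPLICATE,
                        SamplingPolicy::CENTER, /* align_corners */ false);
        CLScheduler::get().enqueue(scale);
    }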
diff --git a/arm_compute/core/CL/kernels/CLScharr3x3Kernel.h b/arm_compute/core/CL/kernels/CLScharr3x3Kernel.h
deleted file mode 100644
index 209a150a67..0000000000
--- a/arm_compute/core/CL/kernels/CLScharr3x3Kernel.h
+++ /dev/null
@@ -1,97 +0,0 @@
-/*
- * Copyright (c) 2016-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CLSCHARR3X3KERNEL_H
-#define ARM_COMPUTE_CLSCHARR3X3KERNEL_H
-
-#include "arm_compute/core/CL/ICLKernel.h"
-
-namespace arm_compute
-{
-class ICLTensor;
-
-/** Interface for the kernel to run a 3x3 Scharr filter on a tensor.
- *
- * @f[
- * \mathbf{G}_x=\begin{vmatrix}
- * -3 & 0 & +3\\
- * -10& 0 & +10\\
- * -3 & 0 & +3
- * \end{vmatrix}
- * @f]
- * @f[
- * \mathbf{G}_y=\begin{vmatrix}
- * -3 & -10 & -3\\
- * 0 & 0 & 0\\
- * +3 & +10 & +3
- * \end{vmatrix}
- * @f]
- */
-class CLScharr3x3Kernel : public ICLKernel
-{
-public:
- /** Default constructor: initialize all the pointers to nullptr and parameters to zero. */
- CLScharr3x3Kernel();
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- CLScharr3x3Kernel(const CLScharr3x3Kernel &) = delete;
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- CLScharr3x3Kernel &operator=(const CLScharr3x3Kernel &) = delete;
- /** Allow instances of this class to be moved */
- CLScharr3x3Kernel(CLScharr3x3Kernel &&) = default;
- /** Allow instances of this class to be moved */
- CLScharr3x3Kernel &operator=(CLScharr3x3Kernel &&) = default;
- /** Initialise the kernel's source, destination and border.
- *
- * @note At least one of output_x or output_y must be set.
- *
- * @param[in] input Source tensor. Data types supported: U8.
- * @param[out] output_x (Optional) Destination tensor for the X gradient, Data types supported: S16.
- * @param[out] output_y (Optional) Destination tensor for the Y gradient, Data types supported: S16.
- * @param[in] border_undefined True if the border mode is undefined. False if it's replicate or constant.
- */
- void configure(const ICLTensor *input, ICLTensor *output_x, ICLTensor *output_y, bool border_undefined);
- /** Initialise the kernel's source, destination and border.
- *
- * @note At least one of output_x or output_y must be set.
- *
- * @param[in] compile_context The compile context to be used.
- * @param[in] input Source tensor. Data types supported: U8.
- * @param[out] output_x (Optional) Destination tensor for the X gradient, Data types supported: S16.
- * @param[out] output_y (Optional) Destination tensor for the Y gradient, Data types supported: S16.
- * @param[in] border_undefined True if the border mode is undefined. False if it's replicate or constant.
- */
- void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output_x, ICLTensor *output_y, bool border_undefined);
-
- // Inherited methods overridden:
- void run(const Window &window, cl::CommandQueue &queue) override;
- BorderSize border_size() const override;
-
-private:
- bool _run_scharr_x; /**< Do we need to run Scharr X ? */
- bool _run_scharr_y; /**< Do we need to run Scharr Y ? */
- const ICLTensor *_input; /**< Input image */
- ICLTensor *_output_x; /**< Output image for scharr X */
- ICLTensor *_output_y; /**< Output image for scharr Y */
-};
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_CLSCHARR3X3KERNEL_H */
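A hedged sketch of driving the Scharr kernel for both gradients; tensor setup is assumed elsewhere and names are illustrative:

    #include "arm_compute/core/CL/kernels/CLScharr3x3Kernel.h"
    #include "arm_compute/runtime/CL/CLScheduler.h"

    using namespace arm_compute;

    // u8_src is the U8 source; gx/gy are S16 destinations, both requested here.
    // With border_undefined = true the border pixels are left unwritten, so no
    // separate border-fill pass is sketched.
    void scharr_both(ICLTensor *u8_src, ICLTensor *gx, ICLTensor *gy)
    {
        CLScharr3x3Kernel scharr;
        scharr.configure(u8_src, gx, gy, /* border_undefined */ true);
        CLScheduler::get().enqueue(scharr);
    }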
diff --git a/arm_compute/core/CL/kernels/CLSelectKernel.h b/arm_compute/core/CL/kernels/CLSelectKernel.h
deleted file mode 100644
index 5cbd985cda..0000000000
--- a/arm_compute/core/CL/kernels/CLSelectKernel.h
+++ /dev/null
@@ -1,94 +0,0 @@
-/*
- * Copyright (c) 2018-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CLSELECTKERNEL_H
-#define ARM_COMPUTE_CLSELECTKERNEL_H
-
-#include "arm_compute/core/CL/ICLKernel.h"
-#include "arm_compute/core/Types.h"
-
-namespace arm_compute
-{
-// Forward declarations
-class ICLTensor;
-
-/** OpenCL interface for executing the select kernel
- *
- * Select is computed by:
- * @f[ output(i) = condition(i) ? x(i) : y(i) @f]
- **/
-class CLSelectKernel : public ICLKernel
-{
-public:
- /** Default constructor */
- CLSelectKernel();
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- CLSelectKernel(const CLSelectKernel &) = delete;
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- CLSelectKernel &operator=(const CLSelectKernel &) = delete;
- /** Allow instances of this class to be moved */
- CLSelectKernel(CLSelectKernel &&) = default;
- /** Allow instances of this class to be moved */
- CLSelectKernel &operator=(CLSelectKernel &&) = default;
- /** Default destructor */
- ~CLSelectKernel() = default;
- /** Initialise the kernel's inputs and output.
- *
- * @param[in] c Condition input tensor. Data types supported: U8.
- * @param[in] x First input tensor. Data types supported: All.
- * @param[in] y Second input tensor. Data types supported: Same as @p x
- * @param[out] output Output tensor. Data types supported: Same as @p x.
- */
- void configure(const ICLTensor *c, const ICLTensor *x, const ICLTensor *y, ICLTensor *output);
- /** Initialise the kernel's inputs and output.
- *
- * @param[in] compile_context The compile context to be used.
- * @param[in] c Condition input tensor. Data types supported: U8.
- * @param[in] x First input tensor. Data types supported: All.
- * @param[in] y Second input tensor. Data types supported: Same as @p x
- * @param[out] output Output tensor. Data types supported: Same as @p x.
- */
- void configure(const CLCompileContext &compile_context, const ICLTensor *c, const ICLTensor *x, const ICLTensor *y, ICLTensor *output);
- /** Static function to check if given info will lead to a valid configuration of @ref CLSelectKernel
- *
- * @param[in] c Condition input tensor. Data types supported: U8.
- * @param[in] x First input tensor. Data types supported: All.
- * @param[in] y Second input tensor. Data types supported: Same as @p x
- * @param[in] output Output tensor. Data types supported: Same as @p x.
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *c, const ITensorInfo *x, const ITensorInfo *y, const ITensorInfo *output);
-
- // Inherited methods overridden:
- void run(const Window &window, cl::CommandQueue &queue) override;
-
-private:
- const ICLTensor *_c; /**< Condition tensor */
- const ICLTensor *_x; /**< Source tensor 1 */
- const ICLTensor *_y; /**< Source tensor 2 */
- ICLTensor *_output; /**< Destination tensor */
- bool _has_same_rank; /**< Flag that indicates if condition tensor and other inputs have the same rank */
-};
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_CLSELECTKERNEL_H */
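A sketch of the validate-then-configure pattern the static validate() above supports; the helper name and tensor setup are illustrative:

    #include "arm_compute/core/CL/kernels/CLSelectKernel.h"
    #include "arm_compute/core/Error.h"
    #include "arm_compute/runtime/CL/CLScheduler.h"

    using namespace arm_compute;

    // c is the U8 condition mask; out receives x(i) where c(i) != 0, else y(i).
    void select_run(const ICLTensor *c, const ICLTensor *x, const ICLTensor *y, ICLTensor *out)
    {
        ARM_COMPUTE_ERROR_THROW_ON(
            CLSelectKernel::validate(c->info(), x->info(), y->info(), out->info()));
        CLSelectKernel select;
        select.configure(c, x, y, out);
        CLScheduler::get().enqueue(select);
    }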
diff --git a/arm_compute/core/CL/kernels/CLSobel3x3Kernel.h b/arm_compute/core/CL/kernels/CLSobel3x3Kernel.h
deleted file mode 100644
index 4240fe80b3..0000000000
--- a/arm_compute/core/CL/kernels/CLSobel3x3Kernel.h
+++ /dev/null
@@ -1,83 +0,0 @@
-/*
- * Copyright (c) 2016-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CLSOBEL3X3KERNEL_H
-#define ARM_COMPUTE_CLSOBEL3X3KERNEL_H
-
-#include "arm_compute/core/CL/ICLKernel.h"
-
-namespace arm_compute
-{
-class ICLTensor;
-
-/** Interface for the kernel to run a 3x3 Sobel filter on a tensor. */
-class CLSobel3x3Kernel : public ICLKernel
-{
-public:
- /** Default constructor: initialize all the pointers to nullptr and parameters to zero. */
- CLSobel3x3Kernel();
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- CLSobel3x3Kernel(const CLSobel3x3Kernel &) = delete;
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- CLSobel3x3Kernel &operator=(const CLSobel3x3Kernel &) = delete;
- /** Allow instances of this class to be moved */
- CLSobel3x3Kernel(CLSobel3x3Kernel &&) = default;
- /** Allow instances of this class to be moved */
- CLSobel3x3Kernel &operator=(CLSobel3x3Kernel &&) = default;
- /** Default destructor */
- ~CLSobel3x3Kernel() = default;
- /** Initialise the kernel's source, destination and border.
- *
- * @note At least one of output_x or output_y must be set.
- *
- * @param[in] input Source tensor. Data types supported: U8.
- * @param[out] output_x (Optional) Destination tensor for the X gradient, Data types supported: S16.
- * @param[out] output_y (Optional) Destination tensor for the Y gradient, Data types supported: S16.
- * @param[in] border_undefined True if the border mode is undefined. False if it's replicate or constant.
- */
- void configure(const ICLTensor *input, ICLTensor *output_x, ICLTensor *output_y, bool border_undefined);
- /** Initialise the kernel's source, destination and border.
- *
- * @note At least one of output_x or output_y must be set.
- *
- * @param[in] compile_context The compile context to be used.
- * @param[in] input Source tensor. Data types supported: U8.
- * @param[out] output_x (Optional) Destination tensor for the X gradient, Data types supported: S16.
- * @param[out] output_y (Optional) Destination tensor for the Y gradient, Data types supported: S16.
- * @param[in] border_undefined True if the border mode is undefined. False if it's replicate or constant.
- */
- void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output_x, ICLTensor *output_y, bool border_undefined);
-
- // Inherited methods overridden:
- void run(const Window &window, cl::CommandQueue &queue) override;
- BorderSize border_size() const override;
-
-private:
- const ICLTensor *_input; /**< Input tensor */
- ICLTensor *_output_x; /**< Output tensor for Sobel X */
- ICLTensor *_output_y; /**< Output tensor for Sobel Y */
- bool _run_sobel_x; /**< Do we need to run Sobel X ? */
- bool _run_sobel_y; /**< Do we need to run Sobel Y ? */
-};
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_CLSOBEL3X3KERNEL_H */
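CLSobel3x3Kernel shares the Scharr configure() signature; a sketch (illustrative names, setup assumed elsewhere) of producing only the X gradient, which the "at least one of output_x or output_y" contract above permits:

    #include "arm_compute/core/CL/kernels/CLSobel3x3Kernel.h"
    #include "arm_compute/runtime/CL/CLScheduler.h"

    using namespace arm_compute;

    // output_y stays nullptr, so only the X gradient kernel path is compiled in.
    void sobel3x3_x_only(ICLTensor *u8_src, ICLTensor *gx)
    {
        CLSobel3x3Kernel sobel;
        sobel.configure(u8_src, gx, nullptr, /* border_undefined */ true);
        CLScheduler::get().enqueue(sobel);
    }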
diff --git a/arm_compute/core/CL/kernels/CLSobel5x5Kernel.h b/arm_compute/core/CL/kernels/CLSobel5x5Kernel.h
deleted file mode 100644
index ef30f0ec93..0000000000
--- a/arm_compute/core/CL/kernels/CLSobel5x5Kernel.h
+++ /dev/null
@@ -1,139 +0,0 @@
-/*
- * Copyright (c) 2016-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CLSOBEL5X5KERNEL_H
-#define ARM_COMPUTE_CLSOBEL5X5KERNEL_H
-
-#include "arm_compute/core/CL/ICLKernel.h"
-
-namespace arm_compute
-{
-class ICLTensor;
-
-/** Interface for the kernel to run the horizontal pass of 5x5 Sobel filter on a tensor. */
-class CLSobel5x5HorKernel : public ICLKernel
-{
-public:
- /** Default constructor: initialize all the pointers to nullptr and parameters to zero. */
- CLSobel5x5HorKernel();
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- CLSobel5x5HorKernel(const CLSobel5x5HorKernel &) = delete;
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- CLSobel5x5HorKernel &operator=(const CLSobel5x5HorKernel &) = delete;
- /** Allow instances of this class to be moved */
- CLSobel5x5HorKernel(CLSobel5x5HorKernel &&) = default;
- /** Allow instances of this class to be moved */
- CLSobel5x5HorKernel &operator=(CLSobel5x5HorKernel &&) = default;
- /** Default destructor */
- ~CLSobel5x5HorKernel() = default;
-
- /** Initialise the kernel's source, destination and border.
- *
- * @note At least one of output_x or output_y must be set.
- *
- * @param[in] input Source tensor. Data types supported: U8.
- * @param[out] output_x (Optional) Destination tensor for the X gradient, Data types supported: S16.
- * @param[out] output_y (Optional) Destination tensor for the Y gradient, Data types supported: S16.
- * @param[in] border_undefined True if the border mode is undefined. False if it's replicate or constant.
- */
- void configure(const ICLTensor *input, ICLTensor *output_x, ICLTensor *output_y, bool border_undefined);
- /** Initialise the kernel's source, destination and border.
- *
- * @note At least one of output_x or output_y must be set.
- *
- * @param[in] compile_context The compile context to be used.
- * @param[in] input Source tensor. Data types supported: U8.
- * @param[out] output_x (Optional) Destination tensor for the X gradient, Data types supported: S16.
- * @param[out] output_y (Optional) Destination tensor for the Y gradient, Data types supported: S16.
- * @param[in] border_undefined True if the border mode is undefined. False if it's replicate or constant.
- */
- void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output_x, ICLTensor *output_y, bool border_undefined);
-
- // Inherited methods overridden:
- void run(const Window &window, cl::CommandQueue &queue) override;
- BorderSize border_size() const override;
-
-private:
- const ICLTensor *_input; /**< Input tensor */
- ICLTensor *_output_x; /**< X output of horizontal pass */
- ICLTensor *_output_y; /**< Y output of horizontal pass */
- bool _run_sobel_x; /**< Do we need to run Sobel X ? */
- bool _run_sobel_y; /**< Do we need to run Sobel Y ? */
- BorderSize _border_size; /**< Border size */
-};
-
-/** Interface for the kernel to run the vertical pass of 5x5 Sobel filter on a tensor. */
-class CLSobel5x5VertKernel : public ICLKernel
-{
-public:
- /** Default constructor: initialize all the pointers to nullptr and parameters to zero. */
- CLSobel5x5VertKernel();
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- CLSobel5x5VertKernel(const CLSobel5x5VertKernel &) = delete;
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- CLSobel5x5VertKernel &operator=(const CLSobel5x5VertKernel &) = delete;
- /** Allow instances of this class to be moved */
- CLSobel5x5VertKernel(CLSobel5x5VertKernel &&) = default;
- /** Allow instances of this class to be moved */
- CLSobel5x5VertKernel &operator=(CLSobel5x5VertKernel &&) = default;
- /** Default destructor */
- ~CLSobel5x5VertKernel() = default;
-
- /** Initialise the kernel's source, destination and border.
- *
- * @note At least one of output_x or output_y must be set, along with the corresponding input.
- *
- * @param[in] input_x (Optional) Input for X (X output of horizontal pass). Data types supported: S16.
- * @param[in] input_y (Optional) Input for Y (Y output of horizontal pass). Data types supported: S16.
- * @param[out] output_x (Optional) Destination tensor for the X gradient, Data types supported: S16.
- * @param[out] output_y (Optional) Destination tensor for the Y gradient, Data types supported: S16.
- * @param[in] border_undefined True if the border mode is undefined. False if it's replicate or constant.
- */
- void configure(const ICLTensor *input_x, const ICLTensor *input_y, ICLTensor *output_x, ICLTensor *output_y, bool border_undefined);
- /** Initialise the kernel's source, destination and border.
- *
- * @note At least one of output_x or output_y must be set, along with the corresponding input.
- *
- * @param[in] compile_context The compile context to be used.
- * @param[in] input_x (Optional) Input for X (X output of horizontal pass). Data types supported: S16.
- * @param[in] input_y (Optional) Input for Y (Y output of horizontal pass). Data types supported: S16.
- * @param[out] output_x (Optional) Destination tensor for the X gradient, Data types supported: S16.
- * @param[out] output_y (Optional) Destination tensor for the Y gradient, Data types supported: S16.
- * @param[in] border_undefined True if the border mode is undefined. False if it's replicate or constant.
- */
- void configure(const CLCompileContext &compile_context, const ICLTensor *input_x, const ICLTensor *input_y, ICLTensor *output_x, ICLTensor *output_y, bool border_undefined);
-
- // Inherited methods overridden:
- void run(const Window &window, cl::CommandQueue &queue) override;
- BorderSize border_size() const override;
-
-private:
- const ICLTensor *_input_x; /**< X input (X output of the horizontal pass) */
- const ICLTensor *_input_y; /**< Y input (Y output of the horizontal pass) */
- ICLTensor *_output_x; /**< X output of sobel */
- ICLTensor *_output_y; /**< Y output of sobel */
- bool _run_sobel_x; /**< Do we need to run sobel X? */
- bool _run_sobel_y; /**< Do we need to run sobel Y? */
-};
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_CLSOBEL5X5KERNEL_H */
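A sketch of the separable two-pass 5x5 Sobel: the horizontal kernel writes S16 intermediates that feed the vertical kernel. Allocation and border handling are elided, and names are illustrative:

    #include "arm_compute/core/CL/kernels/CLSobel5x5Kernel.h"
    #include "arm_compute/runtime/CL/CLScheduler.h"

    using namespace arm_compute;

    // tmp_x/tmp_y are S16 intermediates (assumed allocated with the source shape)
    // that carry the horizontal pass into the vertical pass.
    void sobel5x5(ICLTensor *u8_src, ICLTensor *tmp_x, ICLTensor *tmp_y,
                  ICLTensor *gx, ICLTensor *gy)
    {
        CLSobel5x5HorKernel  hor;
        CLSobel5x5VertKernel vert;
        hor.configure(u8_src, tmp_x, tmp_y, /* border_undefined */ true);
        vert.configure(tmp_x, tmp_y, gx, gy, /* border_undefined */ true);
        CLScheduler::get().enqueue(hor, /* flush */ false); // batch both enqueues
        CLScheduler::get().enqueue(vert);
    }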
diff --git a/arm_compute/core/CL/kernels/CLSobel7x7Kernel.h b/arm_compute/core/CL/kernels/CLSobel7x7Kernel.h
deleted file mode 100644
index 4eda5a40d4..0000000000
--- a/arm_compute/core/CL/kernels/CLSobel7x7Kernel.h
+++ /dev/null
@@ -1,139 +0,0 @@
-/*
- * Copyright (c) 2016-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CLSOBEL7X7KERNEL_H
-#define ARM_COMPUTE_CLSOBEL7X7KERNEL_H
-
-#include "arm_compute/core/CL/ICLKernel.h"
-
-namespace arm_compute
-{
-class ICLTensor;
-
-/** Interface for the kernel to run the horizontal pass of 7x7 Sobel filter on a tensor. */
-class CLSobel7x7HorKernel : public ICLKernel
-{
-public:
- /** Default constructor: initialize all the pointers to nullptr and parameters to zero. */
- CLSobel7x7HorKernel();
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- CLSobel7x7HorKernel(const CLSobel7x7HorKernel &) = delete;
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- CLSobel7x7HorKernel &operator=(const CLSobel7x7HorKernel &) = delete;
- /** Allow instances of this class to be moved */
- CLSobel7x7HorKernel(CLSobel7x7HorKernel &&) = default;
- /** Allow instances of this class to be moved */
- CLSobel7x7HorKernel &operator=(CLSobel7x7HorKernel &&) = default;
- /** Default destructor */
- ~CLSobel7x7HorKernel() = default;
-
- /** Initialise the kernel's source, destination and border.
- *
- * @note At least one of output_x or output_y must be set.
- *
- * @param[in] input Source tensor. Data types supported: U8.
- * @param[out] output_x (Optional) Destination tensor for the X gradient, Data types supported: S32.
- * @param[out] output_y (Optional) Destination tensor for the Y gradient, Data types supported: S32.
- * @param[in] border_undefined True if the border mode is undefined. False if it's replicate or constant.
- */
- void configure(const ICLTensor *input, ICLTensor *output_x, ICLTensor *output_y, bool border_undefined);
- /** Initialise the kernel's source, destination and border.
- *
- * @note At least one of output_x or output_y must be set.
- *
- * @param[in] compile_context The compile context to be used.
- * @param[in] input Source tensor. Data types supported: U8.
- * @param[out] output_x (Optional) Destination tensor for the X gradient, Data types supported: S32.
- * @param[out] output_y (Optional) Destination tensor for the Y gradient, Data types supported: S32.
- * @param[in] border_undefined True if the border mode is undefined. False if it's replicate or constant.
- */
- void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output_x, ICLTensor *output_y, bool border_undefined);
-
- // Inherited methods overridden:
- void run(const Window &window, cl::CommandQueue &queue) override;
- BorderSize border_size() const override;
-
-private:
- const ICLTensor *_input; /**< Input tensor */
- ICLTensor *_output_x; /**< X output of horizontal pass */
- ICLTensor *_output_y; /**< Y output of horizontal pass */
- bool _run_sobel_x; /**< Do we need to run Sobel X ? */
- bool _run_sobel_y; /**< Do we need to run Sobel Y ? */
- BorderSize _border_size; /**< Border size */
-};
-
-/** Interface for the kernel to run the vertical pass of 7x7 Sobel filter on a tensor. */
-class CLSobel7x7VertKernel : public ICLKernel
-{
-public:
- /** Default constructor: initialize all the pointers to nullptr and parameters to zero. */
- CLSobel7x7VertKernel();
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- CLSobel7x7VertKernel(const CLSobel7x7VertKernel &) = delete;
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- CLSobel7x7VertKernel &operator=(const CLSobel7x7VertKernel &) = delete;
- /** Allow instances of this class to be moved */
- CLSobel7x7VertKernel(CLSobel7x7VertKernel &&) = default;
- /** Allow instances of this class to be moved */
- CLSobel7x7VertKernel &operator=(CLSobel7x7VertKernel &&) = default;
- /** Default destructor */
- ~CLSobel7x7VertKernel() = default;
-
- /** Initialise the kernel's source, destination and border.
- *
- * @note At least one of output_x or output_y must be set, along with the corresponding input.
- *
- * @param[in] input_x (Optional) Input for X (X output of horizontal pass). Data types supported: S32.
- * @param[in] input_y (Optional) Input for Y (Y output of horizontal pass). Data types supported: S32.
- * @param[out] output_x (Optional) Destination tensor for the X gradient, Data types supported: S32.
- * @param[out] output_y (Optional) Destination tensor for the Y gradient, Data types supported: S32.
- * @param[in] border_undefined True if the border mode is undefined. False if it's replicate or constant.
- */
- void configure(const ICLTensor *input_x, const ICLTensor *input_y, ICLTensor *output_x, ICLTensor *output_y, bool border_undefined);
- /** Initialise the kernel's source, destination and border.
- *
- * @note At least one of output_x or output_y must be set, along with the corresponding input.
- *
- * @param[in] compile_context The compile context to be used.
- * @param[in] input_x (Optional) Input for X (X output of horizontal pass). Data types supported: S32.
- * @param[in] input_y (Optional) Input for Y (Y output of horizontal pass). Data types supported: S32.
- * @param[out] output_x (Optional) Destination tensor for the X gradient, Data types supported: S32.
- * @param[out] output_y (Optional) Destination tensor for the Y gradient, Data types supported: S32.
- * @param[in] border_undefined True if the border mode is undefined. False if it's replicate or constant.
- */
- void configure(const CLCompileContext &compile_context, const ICLTensor *input_x, const ICLTensor *input_y, ICLTensor *output_x, ICLTensor *output_y, bool border_undefined);
-
- // Inherited methods overridden:
- void run(const Window &window, cl::CommandQueue &queue) override;
- BorderSize border_size() const override;
-
-private:
- const ICLTensor *_input_x; /**< X input (X output of the horizontal pass) */
- const ICLTensor *_input_y; /**< Y input (Y output of the horizontal pass) */
- ICLTensor *_output_x; /**< X output of sobel */
- ICLTensor *_output_y; /**< Y output of sobel */
- bool _run_sobel_x; /**< Do we need to run sobel X? */
- bool _run_sobel_y; /**< Do we need to run sobel Y? */
-};
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_CLSOBEL7X7KERNEL_H */
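The 7x7 variant follows the same two-pass structure but with S32 intermediates; a sketch (illustrative, setup assumed elsewhere) producing only the Y gradient, leaving every X-side pointer as nullptr per the optional-parameter docs above:

    #include "arm_compute/core/CL/kernels/CLSobel7x7Kernel.h"
    #include "arm_compute/runtime/CL/CLScheduler.h"

    using namespace arm_compute;

    // tmp_y_s32 and gy_s32 are S32 tensors; the X path is skipped entirely.
    void sobel7x7_y_only(ICLTensor *u8_src, ICLTensor *tmp_y_s32, ICLTensor *gy_s32)
    {
        CLSobel7x7HorKernel  hor;
        CLSobel7x7VertKernel vert;
        hor.configure(u8_src, nullptr, tmp_y_s32, /* border_undefined */ true);
        vert.configure(nullptr, tmp_y_s32, nullptr, gy_s32, /* border_undefined */ true);
        CLScheduler::get().enqueue(hor, /* flush */ false);
        CLScheduler::get().enqueue(vert);
    }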
diff --git a/arm_compute/core/CL/kernels/CLSoftmaxLayerKernel.h b/arm_compute/core/CL/kernels/CLSoftmaxLayerKernel.h
deleted file mode 100644
index b174f493b5..0000000000
--- a/arm_compute/core/CL/kernels/CLSoftmaxLayerKernel.h
+++ /dev/null
@@ -1,241 +0,0 @@
-/*
- * Copyright (c) 2017-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CLSOFTMAXLAYERKERNEL_H
-#define ARM_COMPUTE_CLSOFTMAXLAYERKERNEL_H
-
-#include "arm_compute/core/CL/ICLSimple3DKernel.h"
-#include "arm_compute/core/KernelDescriptors.h"
-
-#include <tuple>
-
-namespace arm_compute
-{
-class ICLTensor;
-
-/** Interface for identifying the max value of 1D logits */
-class CLLogits1DMaxKernel : public ICLSimple3DKernel
-{
-public:
- /** Set the input and output tensors.
- *
- * @param[in] input Source tensor. Data types supported: QASYMM8/F16/F32
- * @param[out] output Destination tensor. Data types supported: same as @p input
- */
- void configure(const ICLTensor *input, ICLTensor *output);
- /** Set the input and output tensors.
- *
- * @param[in] compile_context The compile context to be used.
- * @param[in] input Source tensor. Data types supported: QASYMM8/F16/F32
- * @param[out] output Destination tensor. Data types supported: same as @p input
- */
- void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output);
- /** Static function to check if given info will lead to a valid configuration of @ref CLLogits1DMaxKernel
- *
- * @param[in] input Source tensor. Data types supported: QASYMM8/F16/F32
- * @param[in] output Destination tensor. Data types supported: same as @p input
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *input, const ITensorInfo *output);
-};
-
-/** Interface for shifting, exponentiating and summing the logits */
-class CLLogits1DShiftExpSumKernel : public ICLKernel
-{
-public:
- /** Default constructor */
- CLLogits1DShiftExpSumKernel();
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- CLLogits1DShiftExpSumKernel(const CLLogits1DShiftExpSumKernel &) = delete;
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- CLLogits1DShiftExpSumKernel &operator=(const CLLogits1DShiftExpSumKernel &) = delete;
- /** Allow instances of this class to be moved */
- CLLogits1DShiftExpSumKernel(CLLogits1DShiftExpSumKernel &&) = default;
- /** Allow instances of this class to be moved */
- CLLogits1DShiftExpSumKernel &operator=(CLLogits1DShiftExpSumKernel &&) = default;
- /** Set the input and output tensors.
- *
- * @param[in] input Source tensor. Data types supported: QASYMM8/F16/F32
- * @param[in] max Max values tensor. Data types supported: same as @p input
- * @param[out] output Destination tensor. Data types supported: S32 for QASYMM8 @p input, or same as @p input
- * @param[out] sum Sum of 1D logits tensor. Data types supported: S32 for QASYMM8 @p input, or same as @p input
- * @param[in] beta (Optional) A scaling factor for the exponent. Defaults to 1.0
- */
- void configure(const ICLTensor *input, const ICLTensor *max, ICLTensor *output, ICLTensor *sum, float beta = 1.0f);
- /** Set the input and output tensors.
- *
- * @param[in] compile_context The compile context to be used.
- * @param[in] input Source tensor. Data types supported: QASYMM8/F16/F32
- * @param[in] max Max values tensor. Data types supported: same as @p input
- * @param[out] output Destination tensor. Data types supported: S32 for QASYMM8 @p input, or same as @p input
- * @param[out] sum Sum of 1D logits tensor. Data types supported: S32 for QASYMM8 @p input, or same as @p input
- * @param[in] beta (Optional) A scaling factor for the exponent. Defaults to 1.0
- */
- void configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *max, ICLTensor *output, ICLTensor *sum, float beta = 1.0f);
- /** Static function to check if given info will lead to a valid configuration of @ref CLLogits1DShiftExpSumKernel
- *
- * @param[in] input Source tensor. Data types supported: QASYMM8/F16/F32
- * @param[in] max Max values tensor. Data types supported: same as @p input
- * @param[in] output Destination tensor. Data types supported: S32 for QASYMM8 @p input, or same as @p input
- * @param[in] sum Sum of 1D logits tensor. Data types supported: S32 for QASYMM8 @p input, or same as @p input
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *input, const ITensorInfo *max, const ITensorInfo *output, const ITensorInfo *sum);
-
- // Inherited methods overridden:
- void run(const Window &window, cl::CommandQueue &queue) override;
-
-private:
- const ICLTensor *_input;
- const ICLTensor *_max;
- ICLTensor *_output;
- ICLTensor *_sum;
-};
-
-/** Interface for max, shifting, exponentiating and summing the logits */
-class CLLogits1DMaxShiftExpSumKernel : public ICLKernel
-{
-public:
- /** Info for whether a parallel reduction will be run and the vector size of the execution. */
- using ParallelReductionInfo = std::tuple<bool, unsigned int>;
-
-public:
- /** Default constructor */
- CLLogits1DMaxShiftExpSumKernel();
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- CLLogits1DMaxShiftExpSumKernel(const CLLogits1DMaxShiftExpSumKernel &) = delete;
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- CLLogits1DMaxShiftExpSumKernel &operator=(const CLLogits1DMaxShiftExpSumKernel &) = delete;
- /** Allow instances of this class to be moved */
- CLLogits1DMaxShiftExpSumKernel(CLLogits1DMaxShiftExpSumKernel &&) = default;
- /** Allow instances of this class to be moved */
- CLLogits1DMaxShiftExpSumKernel &operator=(CLLogits1DMaxShiftExpSumKernel &&) = default;
- /** Set the input and output tensors.
- *
- * @param[in] input Source tensor. Data types supported: F16/F32
- * @param[in,out] max Max values tensor. Data types supported: same as @p input
- * @param[out] output Destination tensor. Data types supported: same as @p input
- * @param[out] sum Sum of 1D logits tensor. Data types supported: same as @p input
- * @param[in] info Contains information consumed by kernels for softmax described in @ref SoftmaxKernelInfo.
- */
- void configure(const ICLTensor *input, ICLTensor *max, ICLTensor *output, ICLTensor *sum, const SoftmaxKernelInfo &info);
- /** Set the input and output tensors.
- *
- * @param[in] compile_context The compile context to be used.
- * @param[in] input Source tensor. Data types supported: F16/F32
- * @param[in,out] max Max values tensor. Data types supported: same as @p input
- * @param[out] output Destination tensor. Data types supported: same as @p input
- * @param[out] sum Sum of 1D logits tensor. Data types supported: same as @p input
- * @param[in] info Contains information consumed by kernels for softmax described in @ref SoftmaxKernelInfo.
- */
- void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *max, ICLTensor *output, ICLTensor *sum, const SoftmaxKernelInfo &info);
- /** Static function to check if given info will lead to a valid configuration of @ref CLLogits1DMaxShiftExpSumKernel
- *
- * @param[in] input Source tensor. Data types supported: F16/F32
- * @param[in] max Max values tensor. Data types supported: same as @p input
- * @param[in] output Destination tensor. Data types supported: same as @p input
- * @param[in] sum Sum of 1D logits tensor. Data types supported: same as @p input
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *input, const ITensorInfo *max, const ITensorInfo *output, const ITensorInfo *sum);
- /** Checks if the given size is eligible for parallel reduction
- *
- * @note Serial reduction is launched for width < (_grid_size * _serial_vector_size).
- * @note Parallel reduction is launched for width >= (_grid_size * _serial_vector_size) and vector_size is forced to 4.
- *
- * @param[in] size Size to check
- *
- * @return A two-element tuple where the first element is a boolean specifying if a parallel reduction will be run,
- * while the second element is the vector size of the execution.
- */
- static ParallelReductionInfo is_parallel_reduction(size_t size);
-
- // Inherited methods overridden:
- void run(const Window &window, cl::CommandQueue &queue) override;
-
-private:
- const ICLTensor *_input;
- ICLTensor *_max;
- ICLTensor *_output;
- ICLTensor *_sum;
-
-private:
- static const unsigned int _grid_size;
- static const unsigned int _serial_vector_size;
- static const unsigned int _parallel_vector_size;
-};
-/** Interface for calculating the final step of the Softmax Layer where each logit value is multiplied by the inverse of the sum of the logits. */
-class CLLogits1DNormKernel : public ICLKernel
-{
-public:
- /** Default constructor */
- CLLogits1DNormKernel();
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- CLLogits1DNormKernel(const CLLogits1DNormKernel &) = delete;
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- CLLogits1DNormKernel &operator=(const CLLogits1DNormKernel &) = delete;
- /** Allow instances of this class to be moved */
- CLLogits1DNormKernel(CLLogits1DNormKernel &&) = default;
- /** Allow instances of this class to be moved */
- CLLogits1DNormKernel &operator=(CLLogits1DNormKernel &&) = default;
- /** Set the input and output tensors.
- *
- * @param[in] input Source tensor. Data types supported: S32/F16/F32
- * @param[in] sum Sum tensor. Dimensions should be dim(input)-1. Data types supported: same as @p input
- * @param[out] output Destination tensor. Data types supported: QASYMM8 for S32 @p input, or same as @p input
- * @param[in] info Contains information consumed by kernels for softmax described in @ref SoftmaxKernelInfo.
- */
- void configure(const ICLTensor *input, const ICLTensor *sum, ICLTensor *output, const SoftmaxKernelInfo &info);
- /** Set the input and output tensors.
- *
- * @param[in] compile_context The compile context to be used.
- * @param[in] input Source tensor. Data types supported: S32/F16/F32
- * @param[in] sum Sum tensor. Dimensions should be dim(input)-1. Data types supported: same as @p input
- * @param[out] output Destination tensor. Data types supported: QASYMM8 for S32 @p input, or same as @p input
- * @param[in] info Contains information consumed by kernels for softmax described in @ref SoftmaxKernelInfo.
- */
- void configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *sum, ICLTensor *output, const SoftmaxKernelInfo &info);
- /** Static function to check if given info will lead to a valid configuration of @ref CLLogits1DNormKernel
- *
- * @param[in] input Source tensor. Data types supported: S32/F16/F32
- * @param[in] sum Sum tensor. Dimensions should be dim(input)-1. Data types supported: same as @p input
- * @param[in] output Destination tensor. Data types supported: QASYMM8 for S32 @p input, or same as @p input
- * @param[in] info Contains information consumed by kernels for softmax described in @ref SoftmaxKernelInfo.
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *input, const ITensorInfo *sum, const ITensorInfo *output, const SoftmaxKernelInfo &info);
-
- // Inherited methods overridden:
- void run(const Window &window, cl::CommandQueue &queue) override;
-
-private:
- const ICLTensor *_input;
- const ICLTensor *_sum;
- ICLTensor *_output;
-};
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_CLSOFTMAXLAYERKERNEL_H */
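A sketch of how the deleted kernels compose into the two-stage softmax pipeline (max/shift/exp/sum, then normalisation by the reciprocal sum); the SoftmaxKernelInfo field names used here (beta, input_data_type) are assumptions about this version of KernelDescriptors.h, and the scratch tensors are assumed shaped per the docs above:

    #include "arm_compute/core/CL/kernels/CLSoftmaxLayerKernel.h"
    #include "arm_compute/core/KernelDescriptors.h"
    #include "arm_compute/runtime/CL/CLScheduler.h"

    using namespace arm_compute;

    void softmax(ICLTensor *src, ICLTensor *max, ICLTensor *tmp,
                 ICLTensor *sum, ICLTensor *dst)
    {
        SoftmaxKernelInfo info{};
        info.beta            = 1.0f;                      // exponent scale (assumed field)
        info.input_data_type = src->info()->data_type();  // assumed field

        CLLogits1DMaxShiftExpSumKernel stage1; // max + shift + exp + sum
        CLLogits1DNormKernel           stage2; // multiply by 1/sum
        stage1.configure(src, max, tmp, sum, info);
        stage2.configure(tmp, sum, dst, info);

        CLScheduler::get().enqueue(stage1, /* flush */ false);
        CLScheduler::get().enqueue(stage2);
    }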
diff --git a/arm_compute/core/CL/kernels/CLSpaceToBatchLayerKernel.h b/arm_compute/core/CL/kernels/CLSpaceToBatchLayerKernel.h
deleted file mode 100644
index 799b7b16c3..0000000000
--- a/arm_compute/core/CL/kernels/CLSpaceToBatchLayerKernel.h
+++ /dev/null
@@ -1,121 +0,0 @@
-/*
- * Copyright (c) 2018-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CLSPACETOBATCHLAYERKERNEL_H
-#define ARM_COMPUTE_CLSPACETOBATCHLAYERKERNEL_H
-
-#include "arm_compute/core/CL/ICLKernel.h"
-#include "arm_compute/core/Types.h"
-
-namespace arm_compute
-{
-class ICLTensor;
-
-/** Interface for the space to batch kernel */
-class CLSpaceToBatchLayerKernel : public ICLKernel
-{
-public:
- /** Default constructor */
- CLSpaceToBatchLayerKernel();
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- CLSpaceToBatchLayerKernel(const CLSpaceToBatchLayerKernel &) = delete;
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- CLSpaceToBatchLayerKernel &operator=(const CLSpaceToBatchLayerKernel &) = delete;
- /** Allow instances of this class to be moved */
- CLSpaceToBatchLayerKernel(CLSpaceToBatchLayerKernel &&) = default;
- /** Allow instances of this class to be moved */
- CLSpaceToBatchLayerKernel &operator=(CLSpaceToBatchLayerKernel &&) = default;
- /** Default destructor */
- ~CLSpaceToBatchLayerKernel() = default;
- /** Initialise the kernel's inputs and output.
- *
- * @param[in] input Tensor input. Supported tensor rank: 4. Data types supported: All.
- * @param[in] block_shape 1-D tensor with shape [M]. Data types supported: S32
- * @param[in] paddings 2-D tensor with shape [2, M]. Data types supported: S32
- * @param[out] output Tensor output. Data types supported: same as @p input
- */
- void configure(const ICLTensor *input, const ICLTensor *block_shape, const ICLTensor *paddings, ICLTensor *output);
- /** Initialise the kernel's inputs and output.
- *
- * @param[in] compile_context The compile context to be used.
- * @param[in] input Tensor input. Supported tensor rank: 4. Data types supported: All.
- * @param[in] block_shape 1-D tensor with shape [M]. Data types supported: S32
- * @param[in] paddings 2-D tensor with shape [2, M]. Data types supported: S32
- * @param[out] output Tensor output. Data types supported: same as @p input
- */
- void configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *block_shape, const ICLTensor *paddings, ICLTensor *output);
- /** Initialise the kernel's input and output. (Static block shape and paddings)
- *
- * @param[in] input Tensor input. Supported tensor rank: 4. Data types supported: All.
- * @param[in] block_shape_x Block shape x value.
- * @param[in] block_shape_y Block shape y value.
- * @param[in] padding_left The left padding of the output tensor.
- * @param[in] padding_right The right padding of the output tensor.
- * @param[out] output Tensor output. Data types supported: same as @p input
- */
- void configure(const ICLTensor *input, const int block_shape_x, const int block_shape_y, const Size2D &padding_left, const Size2D &padding_right, ICLTensor *output);
- /** Initialise the kernel's input and output. (Static block shape and paddings)
- *
- * @param[in] compile_context The compile context to be used.
- * @param[in] input Tensor input. Supported tensor rank: 4. Data types supported: All.
- * @param[in] block_shape_x Block shape x value.
- * @param[in] block_shape_y Block shape y value.
- * @param[in] padding_left The left padding of the output tensor.
- * @param[in] padding_right The right padding of the output tensor.
- * @param[out] output Tensor output. Data types supported: same as @p input
- */
- void configure(const CLCompileContext &compile_context, const ICLTensor *input, const int block_shape_x, const int block_shape_y, const Size2D &padding_left, const Size2D &padding_right, ICLTensor *output);
- /** Static function to check if given info will lead to a valid configuration of @ref CLSpaceToBatchLayerKernel
- *
- * @param[in] input Tensor input. Supported tensor rank: 4. Data types supported: All.
- * @param[in] block_shape 1-D tensor with shape [M]. Data types supported: S32
- * @param[in] paddings 2-D tensor with shape [2, M]. Data types supported: S32
- * @param[in] output Tensor output. Data types supported: same as @p input
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *input, const ITensorInfo *block_shape, const ITensorInfo *paddings, const ITensorInfo *output);
- /** Static function to check if given info will lead to a valid configuration of @ref CLSpaceToBatchLayerKernel (Static block shape and paddings)
- *
- * @param[in] input Tensor input. Supported tensor rank: 4. Data types supported: All.
- * @param[in] block_shape_x Block shape x value.
- * @param[in] block_shape_y Block shape y value.
- * @param[in] padding_left The left padding of the output tensor.
- * @param[in] padding_right The right padding of the output tensor.
- * @param[in] output Tensor output. Data types supported: same as @p input
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *input, const int block_shape_x, const int block_shape_y, const Size2D &padding_left, const Size2D &padding_right, const ITensorInfo *output);
-
- // Inherited methods overridden:
- void run(const Window &window, cl::CommandQueue &queue) override;
-
-private:
- const ICLTensor *_input; /**< Source tensor */
- const ICLTensor *_block_shape; /**< Block shape tensor */
- const ICLTensor *_paddings; /**< Paddings tensor */
- ICLTensor *_output; /**< Destination tensor */
-};
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_CLSPACETOBATCHLAYERKERNEL_H */
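A sketch of the static-shape overload documented above: 2x2 blocks move from the XY-plane into the batch dimension, with one pixel of padding on each side of the plane. Tensor setup is assumed elsewhere:

    #include "arm_compute/core/CL/kernels/CLSpaceToBatchLayerKernel.h"
    #include "arm_compute/runtime/CL/CLScheduler.h"

    using namespace arm_compute;

    void space_to_batch_2x2(ICLTensor *src, ICLTensor *dst)
    {
        CLSpaceToBatchLayerKernel s2b;
        s2b.configure(src, /* block_shape_x */ 2, /* block_shape_y */ 2,
                      Size2D(1, 1), Size2D(1, 1), dst);
        CLScheduler::get().enqueue(s2b);
    }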
diff --git a/arm_compute/core/CL/kernels/CLSpaceToDepthLayerKernel.h b/arm_compute/core/CL/kernels/CLSpaceToDepthLayerKernel.h
deleted file mode 100644
index f2371e7d87..0000000000
--- a/arm_compute/core/CL/kernels/CLSpaceToDepthLayerKernel.h
+++ /dev/null
@@ -1,84 +0,0 @@
-/*
- * Copyright (c) 2019-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CLSPACETODEPTHLAYERKERNEL_H
-#define ARM_COMPUTE_CLSPACETODEPTHLAYERKERNEL_H
-
-#include "arm_compute/core/CL/ICLKernel.h"
-#include "arm_compute/core/Types.h"
-
-namespace arm_compute
-{
-class ICLTensor;
-
-/** Interface for the space to depth kernel */
-class CLSpaceToDepthLayerKernel : public ICLKernel
-{
-public:
- /** Default constructor */
- CLSpaceToDepthLayerKernel();
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- CLSpaceToDepthLayerKernel(const CLSpaceToDepthLayerKernel &) = delete;
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- CLSpaceToDepthLayerKernel &operator=(const CLSpaceToDepthLayerKernel &) = delete;
- /** Allow instances of this class to be moved */
- CLSpaceToDepthLayerKernel(CLSpaceToDepthLayerKernel &&) = default;
- /** Allow instances of this class to be moved */
- CLSpaceToDepthLayerKernel &operator=(CLSpaceToDepthLayerKernel &&) = default;
- /** Default destructor */
- ~CLSpaceToDepthLayerKernel() = default;
- /** Initialise the kernel's inputs and output.
- *
- * @param[in] input Tensor input. Supported tensor rank: 4. Data types supported: All.
- * @param[out] output Tensor output. Data types supported: same as @p input
- * @param[in] block_shape Block shape value.
- */
- void configure(const ICLTensor *input, ICLTensor *output, int32_t block_shape);
- /** Initialise the kernel's inputs and output.
- *
- * @param[in] compile_context The compile context to be used.
- * @param[in] input Tensor input. Supported tensor rank: 4. Data types supported: All.
- * @param[out] output Tensor output. Data types supported: same as @p input
- * @param[in] block_shape Block shape value.
- */
- void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, int32_t block_shape);
- /** Static function to check if given info will lead to a valid configuration of @ref CLSpaceToDepthLayerKernel.
- *
- * @param[in] input Tensor input info. Supported tensor rank: 4. Data types supported: All.
- * @param[in] output Tensor output info. Data types supported: same as @p input
- * @param[in] block_shape Block shape value.
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *input, const ITensorInfo *output, int32_t block_shape);
-
- // Inherited methods overridden:
- void run(const Window &window, cl::CommandQueue &queue) override;
-
-private:
- const ICLTensor *_input; /**< Source tensor */
- ICLTensor *_output; /**< Destination tensor */
- int32_t _block_shape; /**< Block shape */
-};
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_CLSPACETODEPTHLAYERKERNEL_H */
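
For context, a minimal sketch of how the interface deleted above was typically driven. Names such as src/dst are illustrative placeholders, and an initialised CL context/scheduler is assumed; this is not code from the library itself.

    #include "arm_compute/core/CL/kernels/CLSpaceToDepthLayerKernel.h"
    #include "arm_compute/runtime/CL/CLScheduler.h"

    using namespace arm_compute;

    void run_space_to_depth(ICLTensor *src, ICLTensor *dst)
    {
        const int32_t block_shape = 2; // each 2x2 spatial block moves into the channel dimension

        // validate() inspects tensor metadata only, so it is cheap to call first.
        if(CLSpaceToDepthLayerKernel::validate(src->info(), dst->info(), block_shape).error_code() != ErrorCode::OK)
        {
            return;
        }

        CLSpaceToDepthLayerKernel kernel;
        kernel.configure(src, dst, block_shape);
        CLScheduler::get().enqueue(kernel); // run() is normally driven through the scheduler
    }
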
diff --git a/arm_compute/core/CL/kernels/CLStackLayerKernel.h b/arm_compute/core/CL/kernels/CLStackLayerKernel.h
deleted file mode 100644
index e11c0a30d6..0000000000
--- a/arm_compute/core/CL/kernels/CLStackLayerKernel.h
+++ /dev/null
@@ -1,101 +0,0 @@
-/*
- * Copyright (c) 2018-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#ifndef ARM_COMPUTE_CLSTACKLAYERKERNEL_H
-#define ARM_COMPUTE_CLSTACKLAYERKERNEL_H
-
-#include "arm_compute/core/CL/ICLKernel.h"
-#include "arm_compute/core/Types.h"
-
-namespace arm_compute
-{
-class ICLTensor;
-
-/** OpenCL kernel to stack a rank-R tensor into one with rank-(R+1) along the axis dimension. */
-class CLStackLayerKernel : public ICLKernel
-{
-public:
- /** Default constructor */
- CLStackLayerKernel();
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- CLStackLayerKernel(const CLStackLayerKernel &) = delete;
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- CLStackLayerKernel &operator=(const CLStackLayerKernel &) = delete;
- /** Allow instances of this class to be moved */
- CLStackLayerKernel(CLStackLayerKernel &&) = default;
- /** Allow instances of this class to be moved */
- CLStackLayerKernel &operator=(CLStackLayerKernel &&) = default;
- /** Default destructor */
- ~CLStackLayerKernel() = default;
- /** Initialise the kernel's inputs and output
- *
- * @note Supported input tensor rank: up to 4
- *
- * @param[in] input Input tensor. Data types supported: All.
- * @param[in] axis The dimension to stack the tensors along. It must be smaller than the number of input dimensions.
- * @param[in] idx_input Index of the input tensor in the list of tensors to stack.
- * All tensors in the list must have the same shape
- * @param[in] num_tensors Number of tensors to stack
- * @param[out] output Output tensor. Data types supported: Same as @p input.
- *
- */
- void configure(const ICLTensor *input, unsigned int axis, unsigned int idx_input, unsigned int num_tensors, ICLTensor *output);
- /** Initialise the kernel's inputs and output
- *
- * @note Supported input tensor rank: up to 4
- *
- * @param[in] compile_context The compile context to be used.
- * @param[in] input Input tensor. Data types supported: All.
- * @param[in] axis The dimension to stack the tensors along. It must be smaller than the number of input dimensions.
- * @param[in] idx_input Index of the input tensor in the list of tensors to stack.
- * All tensors in the list must have the same shape
- * @param[in] num_tensors Number of tensors to stack
- * @param[out] output Output tensor. Data types supported: Same as @p input.
- *
- */
- void configure(const CLCompileContext &compile_context, const ICLTensor *input, unsigned int axis, unsigned int idx_input, unsigned int num_tensors, ICLTensor *output);
- /** Static function to check if given info will lead to a valid configuration of @ref CLStackLayerKernel
- *
- * @note Supported input tensor rank: up to 4
- *
- * @param[in] input Input tensor info. Data types supported: All.
- * @param[in] axis The dimension to stack the tensors along. It must be smaller than the number of input dimensions.
- * @param[in] idx_input Index of the input tensor in the list of tensors to stack
- * All tensors in the list must have the same shape
- * @param[in] num_tensors Number of tensors to stack
- * @param[in] output Output tensor info. Data types supported: Same as @p input.
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *input, unsigned int axis, unsigned int idx_input, unsigned int num_tensors, const ITensorInfo *output);
-
- // Inherited methods overridden:
- void run(const Window &window, cl::CommandQueue &queue) override;
-
-private:
- const ICLTensor *_input;
- ICLTensor *_output;
-};
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_CLSTACKLAYERKERNEL_H */
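
The idx_input/num_tensors pair implies one kernel instance per input slice. A hypothetical configuration loop, with inputs/kernels as placeholder names and tensors assumed to be allocated elsewhere:

    #include "arm_compute/core/CL/kernels/CLStackLayerKernel.h"

    #include <vector>

    using namespace arm_compute;

    void configure_stack(const std::vector<ICLTensor *> &inputs, unsigned int axis,
                         ICLTensor *output, std::vector<CLStackLayerKernel> &kernels)
    {
        // Each kernel writes slice idx_input of the rank-(R+1) output.
        kernels.resize(inputs.size());
        for(unsigned int i = 0; i < inputs.size(); ++i)
        {
            kernels[i].configure(inputs[i], axis, i, static_cast<unsigned int>(inputs.size()), output);
        }
    }
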
diff --git a/arm_compute/core/CL/kernels/CLStridedSliceKernel.h b/arm_compute/core/CL/kernels/CLStridedSliceKernel.h
deleted file mode 100644
index ebe1b38878..0000000000
--- a/arm_compute/core/CL/kernels/CLStridedSliceKernel.h
+++ /dev/null
@@ -1,115 +0,0 @@
-/*
- * Copyright (c) 2018-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CL_STRIDED_SLICE_KERNEL_H
-#define ARM_COMPUTE_CL_STRIDED_SLICE_KERNEL_H
-
-#include "arm_compute/core/CL/ICLKernel.h"
-#include "arm_compute/core/Types.h"
-
-#include <cstdint>
-
-namespace arm_compute
-{
-// Forward declarations
-class ICLTensor;
-
-/** Interface for the kernel to perform tensor strided slicing */
-class CLStridedSliceKernel : public ICLKernel
-{
-public:
- /** Default constructor */
- CLStridedSliceKernel();
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- CLStridedSliceKernel(const CLStridedSliceKernel &) = delete;
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- CLStridedSliceKernel &operator=(const CLStridedSliceKernel &) = delete;
- /** Allow instances of this class to be moved */
- CLStridedSliceKernel(CLStridedSliceKernel &&) = default;
- /** Allow instances of this class to be moved */
- CLStridedSliceKernel &operator=(CLStridedSliceKernel &&) = default;
- /** Default destructor */
- ~CLStridedSliceKernel() = default;
- /** Configure kernel
- *
- * @note Supported tensor rank: up to 4
- *
- * @param[in] input Source tensor. Data type supported: All.
- * @param[out] output Destination tensor. Data type supported: Same as @p input
- * @param[in] starts The starts of the dimensions of the input tensor to be sliced. The length must be equal to rank(input).
- * @param[in] ends The ends of the dimensions of the input tensor to be sliced. The length must be equal to rank(input).
- * @param[in] strides The strides of the dimensions of the input tensor to be sliced. The length must be equal to rank(input).
- * @param[in] begin_mask If the ith bit of begin_mask is set, starts[i] is ignored and the fullest possible range in that dimension is used instead.
- * @param[in] end_mask If the ith bit of end_mask is set, ends[i] is ignored and the fullest possible range in that dimension is used instead.
- * @param[in] shrink_axis_mask If the ith bit of shrink_axis_mask is set, it implies that the ith specification shrinks the dimensionality by 1.
- * A slice of size 1 starting from starts[i] in the dimension must be preserved.
- */
- void configure(const ICLTensor *input, ICLTensor *output,
- const Coordinates &starts, const Coordinates &ends, const BiStrides &strides,
- int32_t begin_mask, int32_t end_mask, int32_t shrink_axis_mask);
- /** Configure kernel
- *
- * @note Supported tensor rank: up to 4
- *
- * @param[in] compile_context The compile context to be used.
- * @param[in] input Source tensor. Data type supported: All.
- * @param[out] output Destination tensor. Data type supported: Same as @p input
- * @param[in] starts The starts of the dimensions of the input tensor to be sliced. The length must be equal to rank(input).
- * @param[in] ends The ends of the dimensions of the input tensor to be sliced. The length must be equal to rank(input).
- * @param[in] strides The strides of the dimensions of the input tensor to be sliced. The length must be equal to rank(input).
- * @param[in] begin_mask If the ith bit of begin_mask is set, starts[i] is ignored and the fullest possible range in that dimension is used instead.
- * @param[in] end_mask If the ith bit of end_mask is set, ends[i] is ignored and the fullest possible range in that dimension is used instead.
- * @param[in] shrink_axis_mask If the ith bit of shrink_axis_mask is set, it implies that the ith specification shrinks the dimensionality by 1.
- * A slice of size 1 starting from starts[i] in the dimension must be preserved.
- */
- void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output,
- const Coordinates &starts, const Coordinates &ends, const BiStrides &strides,
- int32_t begin_mask, int32_t end_mask, int32_t shrink_axis_mask);
-
- /** Static function to check if given info will lead to a valid configuration of @ref CLStridedSliceKernel
- *
- * @note Supported tensor rank: up to 4
- *
- * @param[in] input Source tensor. Data type supported: All.
- * @param[in] output Destination tensor. Data type supported: Same as @p input
- * @param[in] starts The starts of the dimensions of the input tensor to be sliced. The length must be equal to rank(input).
- * @param[in] ends The ends of the dimensions of the input tensor to be sliced. The length must be equal to rank(input).
- * @param[in] strides The strides of the dimensions of the input tensor to be sliced. The length must be equal to rank(input).
- * @param[in] begin_mask If the ith bit of begin_mask is set, starts[i] is ignored and the fullest possible range in that dimension is used instead.
- * @param[in] end_mask If the ith bit of end_mask is set, ends[i] is ignored and the fullest possible range in that dimension is used instead.
- * @param[in] shrink_axis_mask If the ith bit of shrink_axis_mask is set, it implies that the ith specification shrinks the dimensionality by 1.
- * A slice of size 1 starting from starts[i] in the dimension must be preserved.
- */
- static Status validate(const ITensorInfo *input, const ITensorInfo *output,
- const Coordinates &starts, const Coordinates &ends, const BiStrides &strides,
- int32_t begin_mask, int32_t end_mask, int32_t shrink_axis_mask);
-
- // Inherited methods overridden:
- void run(const Window &window, cl::CommandQueue &queue) override;
-
-private:
- const ICLTensor *_input; /**< Source tensor */
- ICLTensor *_output; /**< Destination tensor */
-};
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_CL_STRIDED_SLICE_KERNEL_H */
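
The three masks are plain bit fields, so a configuration can be written by OR-ing shifted bits. A sketch under assumed placeholder tensors src/dst:

    #include "arm_compute/core/CL/kernels/CLStridedSliceKernel.h"

    using namespace arm_compute;

    void configure_strided_slice(const ICLTensor *src, ICLTensor *dst)
    {
        // Full range on dim 0 (bit 0 of both masks set), rows 1..6 with stride 2
        // on dim 1, and dim 2 collapsed to the single element at index 3
        // (bit 2 of shrink_axis_mask set).
        const Coordinates starts(0, 1, 3);
        const Coordinates ends(0, 7, 4);
        const BiStrides   strides(1, 2, 1);
        const int32_t     begin_mask       = 1 << 0;
        const int32_t     end_mask         = 1 << 0;
        const int32_t     shrink_axis_mask = 1 << 2;

        CLStridedSliceKernel kernel;
        kernel.configure(src, dst, starts, ends, strides, begin_mask, end_mask, shrink_axis_mask);
    }
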
diff --git a/arm_compute/core/CL/kernels/CLTableLookupKernel.h b/arm_compute/core/CL/kernels/CLTableLookupKernel.h
deleted file mode 100644
index 24e333f164..0000000000
--- a/arm_compute/core/CL/kernels/CLTableLookupKernel.h
+++ /dev/null
@@ -1,55 +0,0 @@
-/*
- * Copyright (c) 2016-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CLTABLELOOKUPKERNEL_H
-#define ARM_COMPUTE_CLTABLELOOKUPKERNEL_H
-
-#include "arm_compute/core/CL/ICLSimple2DKernel.h"
-
-namespace arm_compute
-{
-class ICLTensor;
-class ICLLut;
-
-/** Interface for the kernel to perform table lookup calculations. */
-class CLTableLookupKernel : public ICLSimple2DKernel
-{
-public:
- /** Initialise the kernel's input, lut and output.
- *
- * @param[in] input An input tensor. Data types supported: U8, S16.
- * @param[in] lut The input LUT. Data types supported: U8, S16.
- * @param[out] output The output tensor. Data types supported: U8, S16.
- */
- void configure(const ICLTensor *input, const ICLLut *lut, ICLTensor *output);
- /** Initialise the kernel's input, lut and output.
- *
- * @param[in] compile_context The compile context to be used.
- * @param[in] input An input tensor. Data types supported: U8, S16.
- * @param[in] lut The input LUT. Data types supported: U8, S16.
- * @param[out] output The output tensor. Data types supported: U8, S16.
- */
- void configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLLut *lut, ICLTensor *output);
-};
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_CLTABLELOOKUPKERNEL_H */
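
A minimal configuration sketch for the deleted interface; src, lut and dst are placeholders assumed to be created and allocated elsewhere:

    #include "arm_compute/core/CL/ICLLut.h"
    #include "arm_compute/core/CL/kernels/CLTableLookupKernel.h"

    using namespace arm_compute;

    // out[i] = lut[in[i]] for every element; input, LUT and output must all
    // use the same data type (U8 or S16).
    void configure_table_lookup(const ICLTensor *src, const ICLLut *lut, ICLTensor *dst)
    {
        CLTableLookupKernel kernel;
        kernel.configure(src, lut, dst);
    }
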
diff --git a/arm_compute/core/CL/kernels/CLThresholdKernel.h b/arm_compute/core/CL/kernels/CLThresholdKernel.h
deleted file mode 100644
index 3db48706a3..0000000000
--- a/arm_compute/core/CL/kernels/CLThresholdKernel.h
+++ /dev/null
@@ -1,69 +0,0 @@
-/*
- * Copyright (c) 2016-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CLTHRESHOLDKERNEL_H
-#define ARM_COMPUTE_CLTHRESHOLDKERNEL_H
-
-#include "arm_compute/core/CL/ICLSimple2DKernel.h"
-#include "arm_compute/core/Types.h"
-
-#include <cstdint>
-
-namespace arm_compute
-{
-class ICLTensor;
-
-/** Interface for the thresholding kernel.
- *
- */
-class CLThresholdKernel : public ICLSimple2DKernel
-{
-public:
- /** Initialise the kernel's input, output and threshold parameters.
- *
- * @param[in] input An input tensor. Data types supported: U8
- * @param[out] output The output tensor. Data types supported: U8.
- * @param[in] threshold Threshold. When the threshold type is RANGE, this is used as the lower threshold.
- * @param[in] false_value value to set when the condition is not respected.
- * @param[in] true_value value to set when the condition is respected.
- * @param[in] type Thresholding type. Either RANGE or BINARY.
- * @param[in] upper Upper threshold. Only used when the thresholding type is RANGE.
- */
- void configure(const ICLTensor *input, ICLTensor *output, uint8_t threshold,
- uint8_t false_value, uint8_t true_value, ThresholdType type, uint8_t upper);
- /** Initialise the kernel's input, output and threshold parameters.
- *
- * @param[in] compile_context The compile context to be used.
- * @param[in] input An input tensor. Data types supported: U8
- * @param[out] output The output tensor. Data types supported: U8.
- * @param[in] threshold Threshold. When the threshold type is RANGE, this is used as the lower threshold.
- * @param[in] false_value value to set when the condition is not respected.
- * @param[in] true_value value to set when the condition is respected.
- * @param[in] type Thresholding type. Either RANGE or BINARY.
- * @param[in] upper Upper threshold. Only used when the thresholding type is RANGE.
- */
- void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, uint8_t threshold,
- uint8_t false_value, uint8_t true_value, ThresholdType type, uint8_t upper);
-};
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_CLTHRESHOLDKERNEL_H */
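
The parameters read most clearly next to the two thresholding modes. A sketch with illustrative values, assuming placeholder tensors src/dst:

    #include "arm_compute/core/CL/kernels/CLThresholdKernel.h"

    using namespace arm_compute;

    void configure_threshold(const ICLTensor *src, ICLTensor *dst)
    {
        CLThresholdKernel kernel;
        // BINARY: out = in > threshold           ? true_value : false_value
        // RANGE : out = threshold <= in <= upper ? true_value : false_value
        kernel.configure(src, dst,
                         /* threshold   */ 100,
                         /* false_value */ 0,
                         /* true_value  */ 255,
                         ThresholdType::BINARY,
                         /* upper       */ 0); // ignored for BINARY
    }
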
diff --git a/arm_compute/core/CL/kernels/CLTileKernel.h b/arm_compute/core/CL/kernels/CLTileKernel.h
deleted file mode 100644
index 68f3c929a6..0000000000
--- a/arm_compute/core/CL/kernels/CLTileKernel.h
+++ /dev/null
@@ -1,88 +0,0 @@
-/*
- * Copyright (c) 2018-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CLTILEKERNEL_H
-#define ARM_COMPUTE_CLTILEKERNEL_H
-
-#include "arm_compute/core/CL/ICLKernel.h"
-
-namespace arm_compute
-{
-class ICLTensor;
-
-/** OpenCL kernel to perform a Tile operation */
-class CLTileKernel : public ICLKernel
-{
-public:
- /** Default constructor */
- CLTileKernel();
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- CLTileKernel(const CLTileKernel &) = delete;
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- CLTileKernel &operator=(const CLTileKernel &) = delete;
- /** Allow instances of this class to be moved */
- CLTileKernel(CLTileKernel &&) = default;
- /** Allow instances of this class to be moved */
- CLTileKernel &operator=(CLTileKernel &&) = default;
- /** Default destructor */
- ~CLTileKernel() = default;
- /** Set the source, destination of the kernel
- *
- * @param[in] input Source tensor. Data type supported: All.
- * @param[in] multiples Contains the number of times the input tensor should be replicated on the given dimension.
- * Cannot have more than 4 elements (tiling in dimensions greater than 4 is not supported).
- * @param[out] output Destination tensor. Data type supported: Same as @p input
- *
- */
- void configure(const ICLTensor *input, ICLTensor *output, const Multiples &multiples);
- /** Set the source, destination of the kernel
- *
- * @param[in] compile_context The compile context to be used.
- * @param[in] input Source tensor. Data type supported: All.
- * @param[in] multiples Contains the number of times the input tensor should be replicated on the given dimension.
- * Cannot have more than 4 elements (tiling in dimensions greater than 4 is not supported).
- * @param[out] output Destination tensor. Data type supported: Same as @p input
- *
- */
- void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, const Multiples &multiples);
- /** Static function to check if given info will lead to a valid configuration of @ref CLTileKernel
- *
- * @param[in] input Source tensor info. Data type supported: All.
- * @param[in] multiples Contains the number of times the input tensor should be replicated on the given dimension.
- * Cannot have more than 4 elements (tiling in dimensions greater than 4 is not supported).
- * @param[in] output Destination tensor info. Data type supported: Same as @p input
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *input, const ITensorInfo *output, const Multiples &multiples);
-
- // Inherited methods overridden:
- void run(const Window &window, cl::CommandQueue &queue) override;
-
-private:
- const ICLTensor *_input;
- ICLTensor *_output;
-};
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_CLTILEKERNEL_H */
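
A sketch of the validate-then-configure pattern for the deleted interface; the multiples value and tensor names are illustrative:

    #include "arm_compute/core/CL/kernels/CLTileKernel.h"

    using namespace arm_compute;

    void configure_tile(const ICLTensor *src, ICLTensor *dst)
    {
        // multiples[i] is the replication count for dimension i; { 2, 1 }
        // doubles the tensor along x and leaves y unchanged, so dst must be
        // twice as wide as src.
        const Multiples multiples = { 2, 1 };

        if(CLTileKernel::validate(src->info(), dst->info(), multiples).error_code() == ErrorCode::OK)
        {
            CLTileKernel kernel;
            kernel.configure(src, dst, multiples);
        }
    }
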
diff --git a/arm_compute/core/CL/kernels/CLTransposeKernel.h b/arm_compute/core/CL/kernels/CLTransposeKernel.h
deleted file mode 100644
index 09c9e3babf..0000000000
--- a/arm_compute/core/CL/kernels/CLTransposeKernel.h
+++ /dev/null
@@ -1,64 +0,0 @@
-/*
- * Copyright (c) 2017-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CLTRANSPOSEKERNEL_H
-#define ARM_COMPUTE_CLTRANSPOSEKERNEL_H
-
-#include "arm_compute/core/CL/ICLSimple2DKernel.h"
-
-namespace arm_compute
-{
-class ICLTensor;
-
-/** OpenCL kernel which transposes the elements of a matrix.
- *
- * [width, height, batch] -> [height, width, batch]
- *
- */
-class CLTransposeKernel : public ICLSimple2DKernel
-{
-public:
- /** Initialise the kernel's input and output.
- *
- * @param[in] input Input tensor. Data types supported: All.
- * @param[out] output Output tensor. Data type supported: Same as @p input
- */
- void configure(const ICLTensor *input, ICLTensor *output);
- /** Initialise the kernel's input and output.
- *
- * @param[in] compile_context The compile context to be used.
- * @param[in] input Input tensor. Data types supported: All.
- * @param[out] output Output tensor. Data type supported: Same as @p input
- */
- void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output);
- /** Static function to check if given info will lead to a valid configuration of @ref CLTransposeKernel
- *
- * @param[in] input Input tensor. Data types supported: All.
- * @param[in] output Output tensor. Data type supported: Same as @p input
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *input, const ITensorInfo *output);
-};
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_CLTRANSPOSEKERNEL_H */
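
A minimal sketch of enqueuing the deleted kernel through the scheduler, assuming the runtime CLScheduler has been initialised and src/dst are allocated placeholders:

    #include "arm_compute/core/CL/kernels/CLTransposeKernel.h"
    #include "arm_compute/runtime/CL/CLScheduler.h"

    using namespace arm_compute;

    // A [width, height, batch] input becomes a [height, width, batch] output:
    // each WxH plane is transposed independently per batch.
    void run_transpose(const ICLTensor *src, ICLTensor *dst)
    {
        CLTransposeKernel kernel;
        kernel.configure(src, dst);
        CLScheduler::get().enqueue(kernel);
    }
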
diff --git a/arm_compute/core/CL/kernels/CLUpsampleLayerKernel.h b/arm_compute/core/CL/kernels/CLUpsampleLayerKernel.h
deleted file mode 100644
index e6b4209501..0000000000
--- a/arm_compute/core/CL/kernels/CLUpsampleLayerKernel.h
+++ /dev/null
@@ -1,89 +0,0 @@
-/*
- * Copyright (c) 2018-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CLUPSAMPLELAYERKERNEL_H
-#define ARM_COMPUTE_CLUPSAMPLELAYERKERNEL_H
-
-#include "arm_compute/core/CL/ICLKernel.h"
-
-namespace arm_compute
-{
-class ICLTensor;
-
-/** Interface for the UpsampleLayer kernel on OpenCL. */
-class CLUpsampleLayerKernel : public ICLKernel
-{
-public:
- /** Constructor */
- CLUpsampleLayerKernel();
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- CLUpsampleLayerKernel(const CLUpsampleLayerKernel &) = delete;
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- CLUpsampleLayerKernel &operator=(const CLUpsampleLayerKernel &) = delete;
- /** Default Move Constructor. */
- CLUpsampleLayerKernel(CLUpsampleLayerKernel &&) = default;
- /** Default move assignment operator */
- CLUpsampleLayerKernel &operator=(CLUpsampleLayerKernel &&) = default;
- /** Default destructor */
- ~CLUpsampleLayerKernel() = default;
-
- /** Initialise the kernel's input and output.
- *
- * @param[in] input Source tensor. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32.
- * @param[out] output Destination tensor. Data types supported: same as @p input.
- * @param[in] info Contains stride information described in @ref Size2D.
- * @param[in] upsampling_policy Defines the policy to fill the intermediate pixels.
- */
- void configure(const ICLTensor *input, ICLTensor *output, const Size2D &info, const InterpolationPolicy upsampling_policy);
- /** Initialise the kernel's input and output.
- *
- * @param[in] compile_context The compile context to be used.
- * @param[in] input Source tensor. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32.
- * @param[out] output Destination tensor. Data types supported: same as @p input.
- * @param[in] info Contains stride information described in @ref Size2D.
- * @param[in] upsampling_policy Defines the policy to fill the intermediate pixels.
- */
- void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, const Size2D &info, const InterpolationPolicy upsampling_policy);
- /** Static function to check if given info will lead to a valid configuration of @ref CLUpsampleLayerKernel
- *
- * @param[in] input Source tensor info. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32.
- * @param[in] output Destination tensor info. Data types supported: same as @p input.
- * @param[in] info Contains stride information described in @ref Size2D.
- * @param[in] upsampling_policy Defines the policy to fill the intermediate pixels.
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *input, const ITensorInfo *output, const Size2D &info, const InterpolationPolicy upsampling_policy);
-
- // Inherited methods overridden:
- void run(const Window &window, cl::CommandQueue &queue) override;
-
-private:
- const ICLTensor *_input;
- ICLTensor *_output;
- Size2D _info;
- DataLayout _data_layout;
- unsigned int _num_elems_processed_per_iteration_input_x;
-};
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_CLUPSAMPLELAYERKERNEL_H */
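
A configuration sketch; the Size2D(2, 2) stride information and the policy value are illustrative, and src/dst are assumed placeholders:

    #include "arm_compute/core/CL/kernels/CLUpsampleLayerKernel.h"

    using namespace arm_compute;

    void configure_upsample(const ICLTensor *src, ICLTensor *dst)
    {
        CLUpsampleLayerKernel kernel;
        // Size2D(2, 2) doubles width and height; the policy controls how the
        // newly created intermediate pixels are filled.
        kernel.configure(src, dst, Size2D(2, 2), InterpolationPolicy::NEAREST_NEIGHBOR);
    }
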
diff --git a/arm_compute/core/CL/kernels/CLWarpAffineKernel.h b/arm_compute/core/CL/kernels/CLWarpAffineKernel.h
deleted file mode 100644
index a21325e1c4..0000000000
--- a/arm_compute/core/CL/kernels/CLWarpAffineKernel.h
+++ /dev/null
@@ -1,62 +0,0 @@
-/*
- * Copyright (c) 2016-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CLWARPAFFINEKERNEL_H
-#define ARM_COMPUTE_CLWARPAFFINEKERNEL_H
-
-#include "arm_compute/core/CL/ICLSimple2DKernel.h"
-#include "arm_compute/core/Types.h"
-
-namespace arm_compute
-{
-class ICLTensor;
-
-/** Interface for the warp affine kernel.*/
-class CLWarpAffineKernel : public ICLSimple2DKernel
-{
-public:
- /** Initialize the function's source, destination, interpolation policy and border_mode.
- *
- * @param[in] input Source tensor. Data types supported: U8.
- * @param[out] output Destination tensor. Data types supported: U8.
- * @param[in] matrix The affine matrix. Must be 2x3 of type float.
- * The matrix argument requires 9 values; the last 3 values are ignored.
- * @param[in] policy The interpolation type.
- */
- void configure(const ICLTensor *input, ICLTensor *output, const std::array<float, 9> &matrix, InterpolationPolicy policy);
- /** Initialize the function's source, destination, interpolation policy and border_mode.
- *
- * @param[in] compile_context The compile context to be used.
- * @param[in] input Source tensor. Data types supported: U8.
- * @param[out] output Destination tensor. Data types supported: U8.
- * @param[in] matrix The affine matrix. Must be 2x3 of type float.
- * The matrix argument requires 9 values; the last 3 values are ignored.
- * @param[in] policy The interpolation type.
- */
- void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, const std::array<float, 9> &matrix, InterpolationPolicy policy);
-
- // Inherited methods overridden:
- BorderSize border_size() const override;
-};
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_CLWARPAFFINEKERNEL_H */
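
A sketch of the 9-element packing described above, assuming the OpenVX-style column-major layout for the 2x3 matrix; the identity transform keeps the example simple, and src/dst are placeholders:

    #include "arm_compute/core/CL/kernels/CLWarpAffineKernel.h"

    #include <array>

    using namespace arm_compute;

    void configure_warp_affine(const ICLTensor *src, ICLTensor *dst)
    {
        // Identity 2x3 affine matrix, packed column-major (assumed OpenVX
        // convention); the trailing three values are ignored by the kernel.
        const std::array<float, 9> matrix = { 1.f, 0.f,        // column 0
                                              0.f, 1.f,        // column 1
                                              0.f, 0.f,        // column 2 (translation)
                                              0.f, 0.f, 0.f }; // ignored

        CLWarpAffineKernel kernel;
        kernel.configure(src, dst, matrix, InterpolationPolicy::NEAREST_NEIGHBOR);
    }
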
diff --git a/arm_compute/core/CL/kernels/CLWarpPerspectiveKernel.h b/arm_compute/core/CL/kernels/CLWarpPerspectiveKernel.h
deleted file mode 100644
index bb1a018a2b..0000000000
--- a/arm_compute/core/CL/kernels/CLWarpPerspectiveKernel.h
+++ /dev/null
@@ -1,59 +0,0 @@
-/*
- * Copyright (c) 2016-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CLWARPPERSPECTIVEKERNEL_H
-#define ARM_COMPUTE_CLWARPPERSPECTIVEKERNEL_H
-
-#include "arm_compute/core/CL/ICLSimple2DKernel.h"
-#include "arm_compute/core/Types.h"
-
-namespace arm_compute
-{
-class ICLTensor;
-/** Interface for the warp perspective kernel.*/
-class CLWarpPerspectiveKernel : public ICLSimple2DKernel
-{
-public:
- /** Initialize the function's source, destination, interpolation policy and border_mode.
- *
- * @param[in] input Source tensor. Data types supported: U8.
- * @param[out] output Destination tensor. Data types supported: U8.
- * @param[in] matrix The perspective matrix. Must be 3x3 of type float.
- * @param[in] policy The interpolation type.
- */
- void configure(const ICLTensor *input, ICLTensor *output, const std::array<float, 9> &matrix, InterpolationPolicy policy);
- /** Initialize the function's source, destination, interpolation policy and border_mode.
- *
- * @param[in] compile_context The compile context to be used.
- * @param[in] input Source tensor. Data types supported: U8.
- * @param[out] output Destination tensor. Data types supported: U8.
- * @param[in] matrix The perspective matrix. Must be 3x3 of type float.
- * @param[in] policy The interpolation type.
- */
- void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, const std::array<float, 9> &matrix, InterpolationPolicy policy);
-
- // Inherited methods overridden:
- BorderSize border_size() const override;
-};
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_CLWARPPERSPECTIVEKERNEL_H */
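
The perspective variant consumes all nine matrix values. A sketch with the identity matrix, which is symmetric and therefore independent of packing order; src/dst are placeholders:

    #include "arm_compute/core/CL/kernels/CLWarpPerspectiveKernel.h"

    #include <array>

    using namespace arm_compute;

    void configure_warp_perspective(const ICLTensor *src, ICLTensor *dst)
    {
        const std::array<float, 9> matrix = { 1.f, 0.f, 0.f,
                                              0.f, 1.f, 0.f,
                                              0.f, 0.f, 1.f };

        CLWarpPerspectiveKernel kernel;
        kernel.configure(src, dst, matrix, InterpolationPolicy::BILINEAR);
    }
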
diff --git a/arm_compute/core/CL/kernels/CLWeightsReshapeKernel.h b/arm_compute/core/CL/kernels/CLWeightsReshapeKernel.h
deleted file mode 100644
index 47e987b09b..0000000000
--- a/arm_compute/core/CL/kernels/CLWeightsReshapeKernel.h
+++ /dev/null
@@ -1,121 +0,0 @@
-/*
- * Copyright (c) 2017-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CLWEIGHTSRESHAPEKERNEL_H
-#define ARM_COMPUTE_CLWEIGHTSRESHAPEKERNEL_H
-
-#include "arm_compute/core/CL/ICLKernel.h"
-
-namespace arm_compute
-{
-/** OpenCL kernel to perform reshaping on the weights used by convolution and locally connected layer
- *
- * Rearranges each 3-dimensional kernel to a single row leading to a matrix with linearized kernels.
- * In combination with the @ref CLIm2ColKernel can transform a convolution to a matrix multiplication.
- *
- * For example assuming a 3D weight kernel of 3x3 dimensions and depth of 2 we have:
- * @f[
- * \left( \begin{array}{ccc}
- * a000 & a001 & a002 \\
- * a010 & a011 & a012 \\
- * a020 & a021 & a022 \\
- * \end{array} \right)
- * \left( \begin{array}{ccc}
- * a100 & a101 & a102 \\
- * a110 & a111 & a112 \\
- * a120 & a121 & a122 \\
- * \end{array} \right)
- * \rightarrow
- * \left( \begin{array}{cccccccccccccccccc}
- * a000 & a001 & a002 & a010 & a011 & a012 & a020 & a021 & a022 & a100 & a101 & a102 & a110 & a111 & a112 & a120 & a121 & a122 \\
- * \end{array} \right)
- * @f]
- */
-class CLWeightsReshapeKernel : public ICLKernel
-{
-public:
- /** Constructor.*/
- CLWeightsReshapeKernel();
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- CLWeightsReshapeKernel(const CLWeightsReshapeKernel &) = delete;
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- CLWeightsReshapeKernel &operator=(const CLWeightsReshapeKernel &) = delete;
- /** Allow instances of this class to be moved */
- CLWeightsReshapeKernel(CLWeightsReshapeKernel &&) = default;
- /** Allow instances of this class to be moved */
- CLWeightsReshapeKernel &operator=(CLWeightsReshapeKernel &&) = default;
- /** Default destructor */
- ~CLWeightsReshapeKernel() = default;
- /** Set the input and output of the kernel.
- *
- * @param[in] input The input tensor to convert. Weights are 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM] if shared,
- * and 5D tensor with dimensions [kernel_x, kernel_y, IFM, OFM, num_patches] if unshared. Data types supported: All
- * @param[in] biases The shared biases tensor to append. Bias is 1D tensor with dimensions [OFM] if shared and 2D tensor with
- * dimensions [OFM, num_patches] if unshared. Data types supported: F16/F32, for quantized types this must be nullptr.
- * @warning Appending biases to weights reshaped matrix is not supported for quantized asymmetric types.
- * @param[out] output The output tensor. Should be a 2D Tensor if there are no groups and the weights are not shared; a 3D Tensor otherwise.
- * Data types supported: Same as @p input
- * @param[in] num_groups (Optional) Number of groups when performing a grouped convolution. num_groups != 1 is only supported for the NCHW data layout,
- * and the number of weights must be a multiple of it.
- */
- void configure(const ICLTensor *input, const ICLTensor *biases, ICLTensor *output, unsigned int num_groups = 1);
- /** Set the input and output of the kernel.
- *
- * @param[in] compile_context The compile context to be used.
- * @param[in] input The input tensor to convert. Weights are 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM] if shared,
- * and 5D tensor with dimensions [kernel_x, kernel_y, IFM, OFM, num_patches] if unshared. Data types supported: All
- * @param[in] biases The shared biases tensor to append. Bias is 1D tensor with dimensions [OFM] if shared and 2D tensor with
- * dimensions [OFM, num_patches] if unshared. Data types supported: F16/F32, for quantized types this must be nullptr.
- * @warning Appending biases to weights reshaped matrix is not supported for quantized asymmetric types.
- * @param[out] output The output tensor. Should be a 2D Tensor if there are no groups and the weights are not shared; a 3D Tensor otherwise.
- * Data types supported: Same as @p input
- * @param[in] num_groups (Optional) Number of groups when performing a grouped convolution. num_groups != 1 is only supported for the NCHW data layout,
- * and the number of weights must be a multiple of it.
- */
- void configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *biases, ICLTensor *output, unsigned int num_groups = 1);
- /** Static function to check if given info will lead to a valid configuration of @ref CLWeightsReshapeKernel
- *
- * @param[in] input The input tensor to convert. Weights are 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM] if shared,
- * and 5D tensor with dimensions [kernel_x, kernel_y, IFM, OFM, num_patches] if unshared. Data types supported: All
- * @param[in] biases The shared biases tensor to append. Bias is 1D tensor with dimensions [OFM] if shared and 2D tensor with
- * dimensions [OFM, num_patches] if unshared. Data types supported: F16/F32, for quantized types this must be nullptr.
- * @warning Appending biases to weights reshaped matrix is not supported for quantized asymmetric types.
- * @param[in] output The output tensor. Should be a 2D Tensor if there are no groups and the weights are not shared; a 3D Tensor otherwise.
- * Data types supported: Same as @p input
- * @param[in] num_groups (Optional) Number of groups when performing a grouped convolution. num_groups != 1 is only supported for the NCHW data layout,
- * and the number of weights must be a multiple of it.
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *input, const ITensorInfo *biases, const ITensorInfo *output, unsigned int num_groups = 1);
-
- // Inherited methods overridden:
- void run(const Window &window, cl::CommandQueue &queue) override;
-
-private:
- const ICLTensor *_input;
- const ICLTensor *_biases;
- ICLTensor *_output;
-};
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_CLWEIGHTSRESHAPEKERNEL_H */
\ No newline at end of file
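
A shape sketch for the reshape described above, with illustrative dimensions and placeholder tensor names:

    #include "arm_compute/core/CL/kernels/CLWeightsReshapeKernel.h"

    using namespace arm_compute;

    // With 3x3 kernels, IFM = 2 and OFM = 4 (weights shape [3, 3, 2, 4]),
    // each 3D kernel flattens to a row of 3 * 3 * 2 = 18 values (19 if a
    // bias is appended), and the reshaped output holds one such row per
    // output feature map.
    void configure_weights_reshape(const ICLTensor *weights, const ICLTensor *biases, ICLTensor *reshaped)
    {
        CLWeightsReshapeKernel kernel;
        kernel.configure(weights, biases /* must be nullptr for quantized types */, reshaped, /* num_groups */ 1);
    }
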
diff --git a/arm_compute/core/CL/kernels/CLWidthConcatenate2TensorsKernel.h b/arm_compute/core/CL/kernels/CLWidthConcatenate2TensorsKernel.h
deleted file mode 100644
index a39ccc2869..0000000000
--- a/arm_compute/core/CL/kernels/CLWidthConcatenate2TensorsKernel.h
+++ /dev/null
@@ -1,87 +0,0 @@
-/*
- * Copyright (c) 2018-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#ifndef ARM_COMPUTE_CLWIDTHCONCATENATE_2TENSORS_KERNEL_H
-#define ARM_COMPUTE_CLWIDTHCONCATENATE_2TENSORS_KERNEL_H
-
-#include "arm_compute/core/CL/ICLKernel.h"
-#include "arm_compute/core/Types.h"
-
-namespace arm_compute
-{
-class ICLTensor;
-
-/** Interface for the width concatenate kernel of 2 tensors.
- * The input1 and input2 tensors will be concatenated into the output tensor.
- */
-class CLWidthConcatenate2TensorsKernel : public ICLKernel
-{
-public:
- /** Default constructor */
- CLWidthConcatenate2TensorsKernel();
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- CLWidthConcatenate2TensorsKernel(const CLWidthConcatenate2TensorsKernel &) = delete;
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- CLWidthConcatenate2TensorsKernel &operator=(const CLWidthConcatenate2TensorsKernel &) = delete;
- /** Allow instances of this class to be moved */
- CLWidthConcatenate2TensorsKernel(CLWidthConcatenate2TensorsKernel &&) = default;
- /** Allow instances of this class to be moved */
- CLWidthConcatenate2TensorsKernel &operator=(CLWidthConcatenate2TensorsKernel &&) = default;
- /** Default destructor */
- ~CLWidthConcatenate2TensorsKernel() = default;
- /** Initialise the kernel's inputs and output
- *
- * @param[in] input1 First input tensor. Data types supported: All.
- * @param[in] input2 Second input tensor. Data types supported: same as @p input1
- * @param[out] output Output tensor. Data types supported: Same as @p input1.
- */
- void configure(const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output);
- /** Initialise the kernel's inputs and output
- *
- * @param[in] compile_context The compile context to be used.
- * @param[in] input1 First input tensor. Data types supported: All.
- * @param[in] input2 Second input tensor. Data types supported: same as @p input1
- * @param[out] output Output tensor. Data types supported: Same as @p input1.
- */
- void configure(const CLCompileContext &compile_context, const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output);
- /** Static function to check if given info will lead to a valid configuration of @ref CLWidthConcatenate2TensorsKernel
- *
- * @param[in] input1 First tensor info. Data types supported: All.
- * @param[in] input2 Second tensor info. Data types supported: same as @p input1
- * @param[in] output Output tensor info. Data types supported: Same as @p input1.
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output);
-
- // Inherited methods overridden:
- void run(const Window &window, cl::CommandQueue &queue) override;
-
-private:
- const ICLTensor *_input1;
- const ICLTensor *_input2;
- ICLTensor *_output;
-};
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_CLWIDTHCONCATENATE_2TENSORS_KERNEL_H */
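
A sketch of the fused two-tensor concatenation; the four-tensor kernel below follows the same pattern with four inputs. Tensor names are placeholders:

    #include "arm_compute/core/CL/kernels/CLWidthConcatenate2TensorsKernel.h"

    using namespace arm_compute;

    // Fuses both copies into a single kernel launch; the output width must
    // equal the sum of the two input widths.
    void configure_concat2(const ICLTensor *lhs, const ICLTensor *rhs, ICLTensor *dst)
    {
        if(CLWidthConcatenate2TensorsKernel::validate(lhs->info(), rhs->info(), dst->info()).error_code() == ErrorCode::OK)
        {
            CLWidthConcatenate2TensorsKernel kernel;
            kernel.configure(lhs, rhs, dst);
        }
    }
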
diff --git a/arm_compute/core/CL/kernels/CLWidthConcatenate4TensorsKernel.h b/arm_compute/core/CL/kernels/CLWidthConcatenate4TensorsKernel.h
deleted file mode 100644
index 0e0eae6e85..0000000000
--- a/arm_compute/core/CL/kernels/CLWidthConcatenate4TensorsKernel.h
+++ /dev/null
@@ -1,95 +0,0 @@
-/*
- * Copyright (c) 2018-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#ifndef ARM_COMPUTE_CLWIDTHCONCATENATE_4TENSORS_KERNEL_H
-#define ARM_COMPUTE_CLWIDTHCONCATENATE_4TENSORS_KERNEL_H
-
-#include "arm_compute/core/CL/ICLKernel.h"
-#include "arm_compute/core/Types.h"
-
-namespace arm_compute
-{
-class ICLTensor;
-
-/** Interface for the width concatenate kernel of 4 tensors.
- * All input tensors will be concatenated into the output tensor.
- */
-class CLWidthConcatenate4TensorsKernel : public ICLKernel
-{
-public:
- /** Default constructor */
- CLWidthConcatenate4TensorsKernel();
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- CLWidthConcatenate4TensorsKernel(const CLWidthConcatenate4TensorsKernel &) = delete;
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- CLWidthConcatenate4TensorsKernel &operator=(const CLWidthConcatenate4TensorsKernel &) = delete;
- /** Allow instances of this class to be moved */
- CLWidthConcatenate4TensorsKernel(CLWidthConcatenate4TensorsKernel &&) = default;
- /** Allow instances of this class to be moved */
- CLWidthConcatenate4TensorsKernel &operator=(CLWidthConcatenate4TensorsKernel &&) = default;
- /** Default destructor */
- ~CLWidthConcatenate4TensorsKernel() = default;
- /** Initialise the kernel's inputs and output
- *
- * @param[in] input1 First input tensor. Data types supported: All.
- * @param[in] input2 Second input tensor. Data types supported: same as @p input1
- * @param[in] input3 Third input tensor. Data types supported: same as @p input1
- * @param[in] input4 Fourth input tensor. Data types supported: same as @p input1
- * @param[out] output Output tensor. Data types supported: Same as @p input1.
- */
- void configure(const ICLTensor *input1, const ICLTensor *input2, const ICLTensor *input3, const ICLTensor *input4, ICLTensor *output);
- /** Initialise the kernel's inputs and output
- *
- * @param[in] compile_context The compile context to be used.
- * @param[in] input1 First input tensor. Data types supported: All.
- * @param[in] input2 Second input tensor. Data types supported: same as @p input1
- * @param[in] input3 Third input tensor. Data types supported: same as @p input1
- * @param[in] input4 Fourth input tensor. Data types supported: same as @p input1
- * @param[out] output Output tensor. Data types supported: Same as @p input1.
- */
- void configure(const CLCompileContext &compile_context, const ICLTensor *input1, const ICLTensor *input2, const ICLTensor *input3, const ICLTensor *input4, ICLTensor *output);
- /** Static function to check if given info will lead to a valid configuration of @ref CLWidthConcatenate4TensorsKernel
- *
- * @param[in] input1 First tensor info. Data types supported: All.
- * @param[in] input2 Second tensor info. Data types supported: same as @p input1
- * @param[in] input3 Third tensor info. Data types supported: same as @p input1
- * @param[in] input4 Fourth tensor info. Data types supported: same as @p input1
- * @param[in] output Output tensor info. Data types supported: Same as @p input1.
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *input3, const ITensorInfo *input4, const ITensorInfo *output);
-
- // Inherited methods overridden:
- void run(const Window &window, cl::CommandQueue &queue) override;
-
-private:
- const ICLTensor *_input1;
- const ICLTensor *_input2;
- const ICLTensor *_input3;
- const ICLTensor *_input4;
- ICLTensor *_output;
-};
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_CLWIDTHCONCATENATE_4TENSORS_KERNEL_H */
diff --git a/arm_compute/core/CL/kernels/CLWidthConcatenateLayerKernel.h b/arm_compute/core/CL/kernels/CLWidthConcatenateLayerKernel.h
deleted file mode 100644
index ef5851fa9a..0000000000
--- a/arm_compute/core/CL/kernels/CLWidthConcatenateLayerKernel.h
+++ /dev/null
@@ -1,89 +0,0 @@
-/*
- * Copyright (c) 2018-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#ifndef ARM_COMPUTE_CLWIDTHCONCATENATELAYERKERNEL_H
-#define ARM_COMPUTE_CLWIDTHCONCATENATELAYERKERNEL_H
-
-#include "arm_compute/core/CL/ICLKernel.h"
-#include "arm_compute/core/Types.h"
-
-namespace arm_compute
-{
-class ICLTensor;
-
-/** Interface for the width concatenate kernel.
- * The input tensor will be concatenated into the output tensor.
- */
-class CLWidthConcatenateLayerKernel : public ICLKernel
-{
-public:
- /** Default constructor */
- CLWidthConcatenateLayerKernel();
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- CLWidthConcatenateLayerKernel(const CLWidthConcatenateLayerKernel &) = delete;
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- CLWidthConcatenateLayerKernel &operator=(const CLWidthConcatenateLayerKernel &) = delete;
- /** Allow instances of this class to be moved */
- CLWidthConcatenateLayerKernel(CLWidthConcatenateLayerKernel &&) = default;
- /** Allow instances of this class to be moved */
- CLWidthConcatenateLayerKernel &operator=(CLWidthConcatenateLayerKernel &&) = default;
- /** Default destructor */
- ~CLWidthConcatenateLayerKernel() = default;
- /** Initialise the kernel's inputs and output
- *
- * @param[in] input Input tensor. Data types supported: All.
- * @param[in] width_offset The offset on the X axis.
- * @param[in,out] output Output tensor. Data types supported: Same as @p input.
- *
- */
- void configure(const ICLTensor *input, unsigned int width_offset, ICLTensor *output);
- /** Initialise the kernel's inputs and output
- *
- * @param[in] compile_context The compile context to be used.
- * @param[in] input Input tensor. Data types supported: All.
- * @param[in] width_offset The offset on the X axis.
- * @param[in,out] output Output tensor. Data types supported: Same as @p input.
- *
- */
- void configure(const CLCompileContext &compile_context, const ICLTensor *input, unsigned int width_offset, ICLTensor *output);
- /** Static function to check if given info will lead to a valid configuration of @ref CLWidthConcatenateLayerKernel
- *
- * @param[in] input Input tensor info. Data types supported: All.
- * @param[in] width_offset The offset on the X axis.
- * @param[in] output Output tensor info. Data types supported: Same as @p input.
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *input, unsigned int width_offset, const ITensorInfo *output);
-
- // Inherited methods overridden:
- void run(const Window &window, cl::CommandQueue &queue) override;
-
-private:
- const ICLTensor *_input;
- ICLTensor *_output;
- unsigned int _width_offset;
-};
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_CLWIDTHCONCATENATELAYERKERNEL_H */
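
For reference on the interface removed above: concatenating several tensors along the width axis was expressed by configuring one kernel per input, each writing at a different X offset into the shared output. A minimal sketch, assuming input0, input1 and output are pre-allocated CLTensor objects with compatible shapes and that dispatch goes through CLScheduler (both assumptions, neither is shown in this diff):

    // Sketch: concatenate input0 and input1 along X into output.
    CLWidthConcatenateLayerKernel concat0;
    CLWidthConcatenateLayerKernel concat1;

    const unsigned int width0 = input0.info()->dimension(0); // X axis is dimension 0

    // validate() mirrors configure() and reports problems as a Status.
    ARM_COMPUTE_ERROR_THROW_ON(CLWidthConcatenateLayerKernel::validate(input0.info(), 0, output.info()));
    ARM_COMPUTE_ERROR_THROW_ON(CLWidthConcatenateLayerKernel::validate(input1.info(), width0, output.info()));

    concat0.configure(&input0, /* width_offset */ 0, &output);
    concat1.configure(&input1, /* width_offset */ width0, &output);

    CLScheduler::get().enqueue(concat0);
    CLScheduler::get().enqueue(concat1);
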
diff --git a/arm_compute/core/CL/kernels/CLWinogradFilterTransformKernel.h b/arm_compute/core/CL/kernels/CLWinogradFilterTransformKernel.h
deleted file mode 100644
index 5b2dc8cfc9..0000000000
--- a/arm_compute/core/CL/kernels/CLWinogradFilterTransformKernel.h
+++ /dev/null
@@ -1,115 +0,0 @@
-/*
- * Copyright (c) 2018-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CLWINOGRADFILTERTRANSFORMKERNEL_H
-#define ARM_COMPUTE_CLWINOGRADFILTERTRANSFORMKERNEL_H
-
-#include "arm_compute/core/CL/ICLKernel.h"
-
-namespace arm_compute
-{
-class ICLTensor;
-
-/** Interface for the Winograd filter transform kernel. */
-class CLWinogradFilterTransformKernel : public ICLKernel
-{
-public:
- /** Default constructor */
- CLWinogradFilterTransformKernel();
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- CLWinogradFilterTransformKernel(const CLWinogradFilterTransformKernel &) = delete;
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- CLWinogradFilterTransformKernel &operator=(const CLWinogradFilterTransformKernel &) = delete;
- /** Allow instances of this class to be moved */
- CLWinogradFilterTransformKernel(CLWinogradFilterTransformKernel &&) = default;
- /** Allow instances of this class to be moved */
- CLWinogradFilterTransformKernel &operator=(CLWinogradFilterTransformKernel &&) = default;
- /** Default destructor */
- ~CLWinogradFilterTransformKernel() = default;
- /** Set the input and output tensor.
- *
- * @note Winograd filter transform supports the following configurations for NCHW data layout
- * F(output tile, kernel size):F(2x2, 3x3), F(2x1, 3x1), F(1x2, 1x3),
- * F(4x4, 3x3), F(4x1, 3x1), F(1x4, 1x3),
- * F(4x4, 5x5), F(4x1, 5x1), F(1x4, 1x5)
- *
- * @note Winograd filter transform supports the following configurations for NHWC data layout
- * F(output tile, kernel size):F(4x4, 3x3), F(4x1, 3x1), F(1x4, 1x3),
- * F(4x4, 5x5), F(4x1, 5x1), F(1x4, 1x5)
- *
- * Strides: only unit strides
- *
- * @param[in] input Source tensor. The input is a 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM] (NCHW data layout) or [IFM, kernel_x, kernel_y, OFM] (NHWC data layout). Data types supported: F16/F32.
- * @param[out] output The output tensor. The shape for this tensor can be calculated using the utility function @p compute_winograd_filter_transform_shape. Data types supported: Same as @p input
- * @param[in] winograd_info Contains Winograd's information described in @ref WinogradInfo
- */
- void configure(const ICLTensor *input, ICLTensor *output, const WinogradInfo &winograd_info);
- /** Set the input and output tensor.
- *
- * @note Winograd filter transform supports the following configurations for NCHW data layout
- * F(output tile, kernel size):F(2x2, 3x3), F(2x1, 3x1), F(1x2, 1x3),
- * F(4x4, 3x3), F(4x1, 3x1), F(1x4, 1x3),
- * F(4x4, 5x5), F(4x1, 5x1), F(1x4, 1x5)
- *
- * @note Winograd filter transform supports the following configurations for NHWC data layout
- * F(output tile, kernel size):F(4x4, 3x3), F(4x1, 3x1), F(1x4, 1x3),
- * F(4x4, 5x5), F(4x1, 5x1), F(1x4, 1x5)
- *
- * Strides: only unit strides
- *
- * @param[in] compile_context The compile context to be used.
- * @param[in] input Source tensor. The input is a 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM] (NCHW data layout) or [IFM, kernel_x, kernel_y, OFM] (NHWC data layout). Data types supported: F16/F32.
- * @param[out] output The output tensor. The shape for this tensor can be calculated using the utility function @p compute_winograd_filter_transform_shape. Data types supported: Same as @p input
- * @param[in] winograd_info Contains Winograd's information described in @ref WinogradInfo
- */
- void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, const WinogradInfo &winograd_info);
- /** Static function to check if given info will lead to a valid configuration of @ref CLWinogradFilterTransformKernel
- *
- * @note Winograd filter transform supports the following configurations for NCHW data layout
- * F(output tile, kernel size):F(2x2, 3x3), F(2x1, 3x1), F(1x2, 1x3),
- * F(4x4, 3x3), F(4x1, 3x1), F(1x4, 1x3),
- * F(4x4, 5x5), F(4x1, 5x1), F(1x4, 1x5)
- *
- * @note Winograd filter transform supports the following configurations for NHWC data layout
- * F(output tile, kernel size):F(4x4, 3x3), F(4x1, 3x1), F(1x4, 1x3),
- * F(4x4, 5x5), F(4x1, 5x1), F(1x4, 1x5)
- *
- * Strides: only unit strides
- *
- * @param[in] input Source tensor. The input is a 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM] (NCHW data layout) or [IFM, kernel_x, kernel_y, OFM] (NHWC data layout). Data types supported: F16/F32.
- * @param[out] output The output tensor. The shape for this tensor can be calculated using the utility function @p compute_winograd_filter_transform_shape. Data types supported: Same as @p input
- * @param[in] winograd_info Contains Winograd's information described in @ref WinogradInfo
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *input, const ITensorInfo *output, const WinogradInfo &winograd_info);
-
- // Inherited methods overridden:
- void run(const Window &window, cl::CommandQueue &queue) override;
-
-private:
- const ICLTensor *_input;
- ICLTensor *_output;
-};
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_CLWINOGRADFILTERTRANSFORMKERNEL_H */
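
All three Winograd kernels in this group are parameterised by a WinogradInfo descriptor. As a sketch of how one could be built for the F(4x4, 3x3) NHWC case listed in the notes above (the constructor argument order shown here is an assumption, it is not part of this diff):

    // Sketch: descriptor for F(4x4, 3x3), stride 1, 1-pixel padding, NHWC.
    const Size2D        output_tile(4U, 4U);
    const Size2D        kernel_size(3U, 3U);
    const Size2D        input_dims(56U, 56U);  // input width x height
    const PadStrideInfo conv_info(1, 1, 1, 1); // unit strides only, as required above
    const WinogradInfo  winograd_info(output_tile, kernel_size, input_dims, conv_info, DataLayout::NHWC);
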
diff --git a/arm_compute/core/CL/kernels/CLWinogradInputTransformKernel.h b/arm_compute/core/CL/kernels/CLWinogradInputTransformKernel.h
deleted file mode 100644
index a305126f2d..0000000000
--- a/arm_compute/core/CL/kernels/CLWinogradInputTransformKernel.h
+++ /dev/null
@@ -1,121 +0,0 @@
-/*
- * Copyright (c) 2018-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CLWINOGRADINPUTTRANSFORMKERNEL_H
-#define ARM_COMPUTE_CLWINOGRADINPUTTRANSFORMKERNEL_H
-
-#include "arm_compute/core/CL/ICLKernel.h"
-
-namespace arm_compute
-{
-class ICLTensor;
-
-/** OpenCL kernel to perform Winograd input transform.*/
-class CLWinogradInputTransformKernel : public ICLKernel
-{
-public:
- /** Default constructor */
- CLWinogradInputTransformKernel();
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- CLWinogradInputTransformKernel(const CLWinogradInputTransformKernel &) = delete;
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- CLWinogradInputTransformKernel &operator=(const CLWinogradInputTransformKernel &) = delete;
- /** Allow instances of this class to be moved */
- CLWinogradInputTransformKernel(CLWinogradInputTransformKernel &&) = default;
- /** Allow instances of this class to be moved */
- CLWinogradInputTransformKernel &operator=(CLWinogradInputTransformKernel &&) = default;
- /** Set the input and output of the kernel.
- *
- * @note Winograd input transform supports the following configurations for NCHW data layout
- * F(output tile, kernel size):F(2x2, 3x3), F(2x1, 3x1), F(1x2, 1x3),
- * F(4x4, 3x3), F(4x1, 3x1), F(1x4, 1x3),
- * F(4x4, 5x5), F(4x1, 5x1), F(1x4, 1x5)
- *
- * @note Winograd input transform supports the following configurations for NHWC data layout
- * F(output tile, kernel size):F(4x4, 3x3), F(4x1, 3x1), F(1x4, 1x3),
- * F(4x4, 5x5), F(4x1, 5x1), F(1x4, 1x5)
- *
- * Strides: only unit strides
- *
- * @param[in] input The input tensor to transform. Data types supported: F16/F32
- * @param[in] output The output tensor. The shape for this tensor can be calculated using the utility function @p compute_winograd_input_transform_shape. Data types supported: Same as @p input
- * @param[in] winograd_info Contains Winograd's information described in @ref WinogradInfo.
- */
- void configure(const ICLTensor *input, ICLTensor *output, const WinogradInfo &winograd_info);
- /** Set the input and output of the kernel.
- *
- * @note Winograd input transform supports the following configurations for NCHW data layout
- * F(output tile, kernel size):F(2x2, 3x3), F(2x1, 3x1), F(1x2, 1x3),
- * F(4x4, 3x3), F(4x1, 3x1), F(1x4, 1x3),
- * F(4x4, 5x5), F(4x1, 5x1), F(1x4, 1x5)
- *
- * @note Winograd input transform supports the following configurations for NHWC data layout
- * F(output tile, kernel size):F(4x4, 3x3), F(4x1, 3x1), F(1x4, 1x3),
- * F(4x4, 5x5), F(4x1, 5x1), F(1x4, 1x5)
- *
- * Strides: only unit strides
- *
- * @param[in] compile_context The compile context to be used.
- * @param[in] input The input tensor to transform. Data types supported: F16/F32
- * @param[in] output The output tensor. The shape for this tensor can be calculated using the utility function @p compute_winograd_input_transform_shape. Data types supported: Same as @p input
- * @param[in] winograd_info Contains Winograd's information described in @ref WinogradInfo.
- */
- void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, const WinogradInfo &winograd_info);
- /** Static function to check if given info will lead to a valid configuration of @ref CLWinogradInputTransformKernel
- *
- * @note Winograd input transform supports the following configurations for NCHW data layout
- * F(output tile, kernel size):F(2x2, 3x3), F(2x1, 3x1), F(1x2, 1x3),
- * F(4x4, 3x3), F(4x1, 3x1), F(1x4, 1x3),
- * F(4x4, 5x5), F(4x1, 5x1), F(1x4, 1x5)
- *
- * @note Winograd input transform supports the following configurations for NHWC data layout
- * F(output tile, kernel size):F(4x4, 3x3), F(4x1, 3x1), F(1x4, 1x3),
- * F(4x4, 5x5), F(4x1, 5x1), F(1x4, 1x5)
- *
- * Strides: only unit strides
- *
- * @param[in] input The input tensor to transform. Data types supported: F16/F32
- * @param[in] output The output tensor. The shape for this tensor can be calculated using the utility function @p compute_winograd_input_transform_shape. Data types supported: Same as @p input
- * @param[in] winograd_info Contains Winograd's information described in @ref WinogradInfo.
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *input, const ITensorInfo *output, const WinogradInfo &winograd_info);
-
- // Inherited methods overridden:
- void run(const Window &window, cl::CommandQueue &queue) override;
- BorderSize border_size() const override;
-
-private:
- using WinogradKey = std::pair<std::pair<int, int>, std::pair<int, int>>;
-
- BorderSize _border_size;
- const ICLTensor *_input;
- ICLTensor *_output;
- DataLayout _data_layout;
- int _num_tiles_x;
- int _num_tiles_y;
- unsigned int _step_z;
-};
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_CLWINOGRADINPUTTRANSFORMKERNEL_H */
diff --git a/arm_compute/core/CL/kernels/CLWinogradOutputTransformKernel.h b/arm_compute/core/CL/kernels/CLWinogradOutputTransformKernel.h
deleted file mode 100644
index 512b352637..0000000000
--- a/arm_compute/core/CL/kernels/CLWinogradOutputTransformKernel.h
+++ /dev/null
@@ -1,127 +0,0 @@
-/*
- * Copyright (c) 2018-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CLWINOGRADOUTPUTTRANSFORMKERNEL_H
-#define ARM_COMPUTE_CLWINOGRADOUTPUTTRANSFORMKERNEL_H
-
-#include "arm_compute/core/CL/ICLKernel.h"
-
-namespace arm_compute
-{
-class ICLTensor;
-
-/** Interface for the Winograd output transform kernel. */
-class CLWinogradOutputTransformKernel : public ICLKernel
-{
-public:
- /** Default constructor */
- CLWinogradOutputTransformKernel();
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- CLWinogradOutputTransformKernel(const CLWinogradOutputTransformKernel &) = delete;
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- CLWinogradOutputTransformKernel &operator=(const CLWinogradOutputTransformKernel &) = delete;
- /** Allow instances of this class to be moved */
- CLWinogradOutputTransformKernel(CLWinogradOutputTransformKernel &&) = default;
- /** Allow instances of this class to be moved */
- CLWinogradOutputTransformKernel &operator=(CLWinogradOutputTransformKernel &&) = default;
- /** Default destructor */
- ~CLWinogradOutputTransformKernel() = default;
- /** Set the input and output tensor.
- *
- * @note Winograd output transform supports the following configurations for NCHW data layout
- * F(output tile, kernel size):F(2x2, 3x3), F(2x1, 3x1), F(1x2, 1x3),
- * F(4x4, 3x3), F(4x1, 3x1), F(1x4, 1x3),
- * F(4x4, 5x5), F(4x1, 5x1), F(1x4, 1x5)
- *
- * @note Winograd output transform supports the following configurations for NHWC data layout
- * F(output tile, kernel size):F(4x4, 3x3), F(4x1, 3x1), F(1x4, 1x3),
- * F(4x4, 5x5), F(4x1, 5x1), F(1x4, 1x5)
- *
- * Strides: only unit strides
- *
- * @param[in] input Source tensor with shape [C, N, K, batches]. Data types supported: F16/F32.
- * @param[in] bias Biases tensor. Shared biases supported. Biases are 1D tensor with dimensions [OFM]. It can be a nullptr. Data type supported: as @p input
- * @param[out] output The output tensor. The shape for this tensor can be calculated using the utility function @p compute_winograd_output_transform_shape. Data types supported: Same as @p input
- * @param[in] winograd_info Contains Winograd's information described in @ref WinogradInfo
- * @param[in] act_info (Optional) Activation layer information in case of a fused activation.
- */
- void configure(const ICLTensor *input, const ICLTensor *bias, ICLTensor *output, const WinogradInfo &winograd_info, const ActivationLayerInfo &act_info = ActivationLayerInfo());
- /** Set the input and output tensor.
- *
- * @note Winograd output transform supports the following configurations for NCHW data layout
- * F(output tile, kernel size):F(2x2, 3x3), F(2x1, 3x1), F(1x2, 1x3),
- * F(4x4, 3x3), F(4x1, 3x1), F(1x4, 1x3),
- * F(4x4, 5x5), F(4x1, 5x1), F(1x4, 1x5)
- *
- * @note Winograd output transform supports the following configurations for NHWC data layout
- * F(output tile, kernel size):F(4x4, 3x3), F(4x1, 3x1), F(1x4, 1x3),
- * F(4x4, 5x5), F(4x1, 5x1), F(1x4, 1x5)
- *
- * Strides: only unit strides
- *
- * @param[in] compile_context The compile context to be used.
- * @param[in] input Source tensor with shape [C, N, K, batches]. Data types supported: F16/F32.
- * @param[in] bias Biases tensor. Shared biases supported. Biases are 1D tensor with dimensions [OFM]. It can be a nullptr. Data type supported: as @p input
- * @param[out] output The output tensor. The shape for this tensor can be calculated using the utility function @p compute_winograd_output_transform_shape. Data types supported: Same as @p input
- * @param[in] winograd_info Contains Winograd's information described in @ref WinogradInfo
- * @param[in] act_info (Optional) Activation layer information in case of a fused activation.
- */
- void configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *bias, ICLTensor *output, const WinogradInfo &winograd_info,
- const ActivationLayerInfo &act_info = ActivationLayerInfo());
-
- /** Static function to check if given info will lead to a valid configuration of @ref CLWinogradOutputTransformKernel
- *
- * @note Winograd output transform supports the following configurations for NCHW data layout
- * F(output tile, kernel size):F(2x2, 3x3), F(2x1, 3x1), F(1x2, 1x3),
- * F(4x4, 3x3), F(4x1, 3x1), F(1x4, 1x3),
- * F(4x4, 5x5), F(4x1, 5x1), F(1x4, 1x5)
- *
- * @note Winograd output transform supports the following configurations for NHWC data layout
- * F(output tile, kernel size):F(4x4, 3x3), F(4x1, 3x1), F(1x4, 1x3),
- * F(4x4, 5x5), F(4x1, 5x1), F(1x4, 1x5)
- *
- * Strides: only unit strides
- *
- * @param[in] input Source tensor with shape [C, N, K, batches]. Data types supported: F16/F32.
- * @param[in] bias Biases tensor. Shared biases supported. Biases are 1D tensor with dimensions [OFM]. It can be a nullptr. Data type supported: as @p input
- * @param[out] output The output tensor. The shape for this tensor can be calculated using the utility function @p compute_winograd_output_transform_shape. Data types supported: Same as @p input
- * @param[in] winograd_info Contains Winograd's information described in @ref WinogradInfo
- * @param[in] act_info (Optional) Activation layer information in case of a fused activation @ref ActivationLayerInfo. Only RELU, BOUNDED_RELU, LU_BOUNDED_RELU, LEAKY_RELU and SOFT_RELU supported.
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output, const WinogradInfo &winograd_info, const ActivationLayerInfo &act_info = ActivationLayerInfo());
-
- // Inherited methods overridden:
- void run(const Window &window, cl::CommandQueue &queue) override;
-
-private:
- using WinogradKey = std::pair<std::pair<int, int>, std::pair<int, int>>;
-
- const ICLTensor *_input;
- const ICLTensor *_bias;
- ICLTensor *_output;
- bool _is_nhwc;
-};
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_CLWINOGRADOUTPUTTRANSFORMKERNEL_H */
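
Taken together, the three kernels deleted above formed a three-stage Winograd convolution: transform the filters once, transform each input, multiply the two in the Winograd domain (a batched GEMM that lives outside these headers), then transform back with an optional fused activation. A sketch of the wiring, assuming pre-allocated tensors shaped with the compute_winograd_*_shape helpers referenced in the docs and a winograd_info descriptor as in the earlier note:

    CLWinogradFilterTransformKernel filter_transform;
    CLWinogradInputTransformKernel  input_transform;
    CLWinogradOutputTransformKernel output_transform;

    filter_transform.configure(&weights, &transformed_weights, winograd_info);
    input_transform.configure(&input, &transformed_input, winograd_info);
    // ... a batched GEMM multiplies transformed_input by transformed_weights into gemm_out ...
    output_transform.configure(&gemm_out, &biases, &output, winograd_info,
                               ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU));
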
diff --git a/arm_compute/core/CL/kernels/CLYOLOLayerKernel.h b/arm_compute/core/CL/kernels/CLYOLOLayerKernel.h
deleted file mode 100644
index d0c4a9e417..0000000000
--- a/arm_compute/core/CL/kernels/CLYOLOLayerKernel.h
+++ /dev/null
@@ -1,98 +0,0 @@
-/*
- * Copyright (c) 2018-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CLYOLOLAYERKERNEL_H
-#define ARM_COMPUTE_CLYOLOLAYERKERNEL_H
-
-#include "arm_compute/core/CL/ICLKernel.h"
-
-namespace arm_compute
-{
-class ICLTensor;
-
-/** Interface for the YOLO layer kernel that performs partial activation.
- * For each box, activate only:
- * - x and y position (channel 0 and 1 of each box)
- *      - objectness (channel 4 of each box)
- *      - classes (channels 5 to (num_classes + 4) of each box)
- */
-class CLYOLOLayerKernel : public ICLKernel
-{
-public:
- /** Default constructor */
- CLYOLOLayerKernel();
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- CLYOLOLayerKernel(const CLYOLOLayerKernel &) = delete;
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- CLYOLOLayerKernel &operator=(const CLYOLOLayerKernel &) = delete;
- /** Allow instances of this class to be moved */
- CLYOLOLayerKernel(CLYOLOLayerKernel &&) = default;
- /** Allow instances of this class to be moved */
- CLYOLOLayerKernel &operator=(CLYOLOLayerKernel &&) = default;
- /** Default destructor */
- ~CLYOLOLayerKernel() = default;
- /** Set the input and output tensor.
- *
- * @note If the output tensor is a nullptr, the activation function will be performed in-place
- *
- * @param[in, out] input Source tensor. In case of @p output tensor = nullptr, this tensor will store the result
- * of the activation function. Data types supported: F16/F32.
- * @param[out] output Destination tensor. Data type supported: same as @p input
- * @param[in] act_info Activation layer information.
- * @param[in] num_classes Number of classes to activate (must be a submultiple of @p input channels)
- */
- void configure(ICLTensor *input, ICLTensor *output, const ActivationLayerInfo &act_info, int32_t num_classes);
- /** Set the input and output tensor.
- *
- * @note If the output tensor is a nullptr, the activation function will be performed in-place
- *
- * @param[in] compile_context The compile context to be used.
- * @param[in, out] input Source tensor. In case of @p output tensor = nullptr, this tensor will store the result
- * of the activation function. Data types supported: F16/F32.
- * @param[out] output Destination tensor. Data type supported: same as @p input
- * @param[in] act_info Activation layer information.
- * @param[in] num_classes Number of classes to activate (must be a submultiple of @p input channels)
- */
- void configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, const ActivationLayerInfo &act_info, int32_t num_classes);
- /** Static function to check if given info will lead to a valid configuration of @ref CLYOLOLayerKernel
- *
- * @param[in] input Source tensor info. In case of @p output tensor info = nullptr, this tensor will store the result
- * of the activation function. Data types supported: F16/F32.
- * @param[in] output Destination tensor info. Data type supported: same as @p input
- * @param[in] act_info Activation layer information.
- * @param[in] num_classes Number of classes to activate (must be a submultiple of @p input channels)
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *input, const ITensorInfo *output, const ActivationLayerInfo &act_info, int32_t num_classes);
-
- // Inherited methods overridden:
- void run(const Window &window, cl::CommandQueue &queue) override;
-
-private:
- ICLTensor *_input;
- ICLTensor *_output;
- bool _run_in_place;
-};
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_CLYOLOLAYERKERNEL_H */
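
Since the partial activation above is driven entirely through configure(), a minimal sketch of the in-place path for a hypothetical 80-class detector follows; boxes is assumed to be a valid F16/F32 CLTensor:

    CLYOLOLayerKernel yolo;
    const ActivationLayerInfo act(ActivationLayerInfo::ActivationFunction::LOGISTIC);

    // A nullptr output selects the in-place behaviour documented above.
    ARM_COMPUTE_ERROR_THROW_ON(CLYOLOLayerKernel::validate(boxes.info(), nullptr, act, 80));
    yolo.configure(&boxes, nullptr, act, /* num_classes */ 80);
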
diff --git a/arm_compute/core/CL/kernels/ICLDepthwiseConvolutionLayer3x3Kernel.h b/arm_compute/core/CL/kernels/ICLDepthwiseConvolutionLayer3x3Kernel.h
deleted file mode 100644
index f0f7754960..0000000000
--- a/arm_compute/core/CL/kernels/ICLDepthwiseConvolutionLayer3x3Kernel.h
+++ /dev/null
@@ -1,105 +0,0 @@
-/*
- * Copyright (c) 2017-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_ICLDEPTHWISECONVOLUTIONKERNEL3x3_H
-#define ARM_COMPUTE_ICLDEPTHWISECONVOLUTIONKERNEL3x3_H
-
-#include "arm_compute/core/CL/ICLKernel.h"
-
-namespace arm_compute
-{
-class ICLTensor;
-
-/** Interface for the kernel to run a 3x3 depthwise convolution on a tensor.
- */
-class ICLDepthwiseConvolutionLayer3x3Kernel : public ICLKernel
-{
-public:
- /** Default constructor */
- ICLDepthwiseConvolutionLayer3x3Kernel()
- : _border_size(0), _input(), _output(), _weights(), _biases(), _conv_stride_y(1), _output_multipliers(), _output_shifts(), _is_quantized(false)
- {
- }
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- ICLDepthwiseConvolutionLayer3x3Kernel(const ICLDepthwiseConvolutionLayer3x3Kernel &) = delete;
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- ICLDepthwiseConvolutionLayer3x3Kernel &operator=(const ICLDepthwiseConvolutionLayer3x3Kernel &) = delete;
- /** Default Move Constructor. */
- ICLDepthwiseConvolutionLayer3x3Kernel(ICLDepthwiseConvolutionLayer3x3Kernel &&) = default;
- /** Default move assignment operator */
- ICLDepthwiseConvolutionLayer3x3Kernel &operator=(ICLDepthwiseConvolutionLayer3x3Kernel &&) = default;
- /** Initialize the function's source, destination, conv and border_size.
- *
- * @param[in] input Source tensor. DataType supported: QASYMM8/F16/F32.
- * @param[in] weights Weights tensor. A 3D tensor with dimensions [3, 3, IFM].
- * Data type supported: Same as @p input, QASYMM8/QSYMM8_PER_CHANNEL when input is QASYMM8.
- * @param[in] biases Biases tensor. A 1D tensor with dimensions [IFM]. Must be nullptr if not needed.
- * Data type supported: Same as @p input, S32 when input is QASYMM8.
- * @param[out] output Destination tensor. Data type supported: Same as @p input.
- * @param[in] conv_info Padding and stride information to use for the convolution.
- * @param[in] depth_multiplier (Optional) Multiplier to apply to the input's depth in order to retrieve the output's depth. Defaults to 1.
- * @param[in] act_info (Optional) Activation layer information in case of a fused activation. Only RELU, BOUNDED_RELU and LU_BOUNDED_RELU are supported for QASYMM8.
- * @param[in] dilation (Optional) Dilation, in elements, across x and y. Defaults to (1, 1).
- * @param[in] output_multipliers (Optional) Output multipliers tensor for quantized computations. In case of per-channel quantization,
- * the number of multipliers must be equal to the number of filters (IFM). Supported data types: S32
- * @param[in] output_shifts (Optional) Output shifts tensor for quantized computations. In case of per-channel quantization,
- * the number of shifts must be equal to the number of filters (IFM). Supported data types: S32
- */
- virtual void configure(const ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const PadStrideInfo &conv_info,
- unsigned int depth_multiplier = 1, ActivationLayerInfo act_info = ActivationLayerInfo(), const Size2D &dilation = Size2D(1U, 1U),
- const ICLTensor *output_multipliers = nullptr, const ICLTensor *output_shifts = nullptr) = 0;
- /** Initialize the function's source, destination, conv and border_size.
- *
- * @param[in] compile_context The compile context to be used.
- * @param[in] input Source tensor. DataType supported: QASYMM8/F16/F32.
- * @param[in] weights Weights tensor. A 3D tensor with dimensions [3, 3, IFM].
- * Data type supported: Same as @p input, QASYMM8/QSYMM8_PER_CHANNEL when input is QASYMM8.
- * @param[in] biases Biases tensor. A 1D tensor with dimensions [IFM]. Must be nullptr if not needed.
- * Data type supported: Same as @p input, S32 when input is QASYMM8.
- * @param[out] output Destination tensor. Data type supported: Same as @p input.
- * @param[in] conv_info Padding and stride information to use for the convolution.
- * @param[in] depth_multiplier (Optional) Multiplier to apply to the input's depth in order to retrieve the output's depth. Defaults to 1.
- * @param[in] act_info (Optional) Activation layer information in case of a fused activation. Only RELU, BOUNDED_RELU and LU_BOUNDED_RELU are supported for QASYMM8.
- * @param[in] dilation (Optional) Dilation, in elements, across x and y. Defaults to (1, 1).
- * @param[in] output_multipliers (Optional) Output multipliers tensor for quantized computations. In case of per-channel quantization,
- * the number of multipliers must be equal to the number of filters (IFM). Supported data types: S32
- * @param[in] output_shifts (Optional) Output shifts tensor for quantized computations. In case of per-channel quantization,
- * the number of shifts must be equal to the number of filters (IFM). Supported data types: S32
- */
- virtual void configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const PadStrideInfo &conv_info,
- unsigned int depth_multiplier = 1, ActivationLayerInfo act_info = ActivationLayerInfo(), const Size2D &dilation = Size2D(1U, 1U),
- const ICLTensor *output_multipliers = nullptr, const ICLTensor *output_shifts = nullptr) = 0;
-
-protected:
- BorderSize _border_size;
- const ICLTensor *_input;
- ICLTensor *_output;
- const ICLTensor *_weights;
- const ICLTensor *_biases;
- unsigned int _conv_stride_y;
- const ICLTensor *_output_multipliers;
- const ICLTensor *_output_shifts;
- bool _is_quantized;
-};
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_ICLDEPTHWISECONVOLUTIONKERNEL3x3_H */
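
Because configure() is pure virtual here, callers programmed a concrete 3x3 kernel through this interface. A sketch of the call pattern; make_depthwise_3x3_kernel() is a hypothetical factory standing in for the concrete NCHW/NHWC kernels elsewhere in the tree, and includes such as <memory> are elided:

    // Hypothetical factory returning some concrete implementation of the interface.
    std::unique_ptr<ICLDepthwiseConvolutionLayer3x3Kernel> kernel = make_depthwise_3x3_kernel();

    // Stride 1 with 1-pixel padding; depth_multiplier 1 keeps OFM equal to IFM.
    kernel->configure(&input, &weights, &biases, &output,
                      PadStrideInfo(1, 1, 1, 1),
                      /* depth_multiplier */ 1,
                      ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU));
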
diff --git a/arm_compute/core/CPP/CPPKernels.h b/arm_compute/core/CPP/CPPKernels.h
index c7b40baf22..f6f36596c4 100644
--- a/arm_compute/core/CPP/CPPKernels.h
+++ b/arm_compute/core/CPP/CPPKernels.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2017-2021 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -26,11 +26,8 @@
/* Header regrouping all the CPP kernels */
#include "arm_compute/core/CPP/kernels/CPPBoxWithNonMaximaSuppressionLimitKernel.h"
-#include "arm_compute/core/CPP/kernels/CPPCornerCandidatesKernel.h"
-#include "arm_compute/core/CPP/kernels/CPPDetectionWindowNonMaximaSuppressionKernel.h"
#include "arm_compute/core/CPP/kernels/CPPNonMaximumSuppressionKernel.h"
#include "arm_compute/core/CPP/kernels/CPPPermuteKernel.h"
-#include "arm_compute/core/CPP/kernels/CPPSortEuclideanDistanceKernel.h"
#include "arm_compute/core/CPP/kernels/CPPTopKVKernel.h"
#include "arm_compute/core/CPP/kernels/CPPUpsampleKernel.h"
diff --git a/arm_compute/core/CPP/CPPTypes.h b/arm_compute/core/CPP/CPPTypes.h
index d3f6fc944d..e5322bdcb1 100644
--- a/arm_compute/core/CPP/CPPTypes.h
+++ b/arm_compute/core/CPP/CPPTypes.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2022, 2024 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -21,111 +21,123 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
-#ifndef ARM_COMPUTE_CPP_TYPES_H
-#define ARM_COMPUTE_CPP_TYPES_H
+#ifndef ACL_ARM_COMPUTE_CORE_CPP_CPPTYPES_H
+#define ACL_ARM_COMPUTE_CORE_CPP_CPPTYPES_H
#include "arm_compute/core/Error.h"
-#include <array>
-#include <string>
-#include <vector>
+#include <memory>
namespace arm_compute
{
-/** CPU models - we only need to detect CPUs we have
- * microarchitecture-specific code for.
- *
- * Architecture features are detected via HWCAPs.
- */
-enum class CPUModel
+namespace cpuinfo
{
- GENERIC,
- GENERIC_FP16,
- GENERIC_FP16_DOT,
- A53,
- A55r0,
- A55r1
-};
-
-/** Global memory policy.
- * The functions in the runtime will use different strategies based on the policy currently set.
- *
- * MINIMIZE will try to reduce the amount allocated by the functions at the expense of performance normally.
- * NORMAL won't try to save any memory and will favor speed over memory consumption
+struct CpuIsaInfo;
+} // namespace cpuinfo
+
+#define ARM_COMPUTE_CPU_MODEL_LIST \
+ X(GENERIC) \
+ X(GENERIC_FP16) \
+ X(GENERIC_FP16_DOT) \
+ X(A53) \
+ X(A55r0) \
+ X(A55r1) \
+ X(A35) \
+ X(A73) \
+ X(A76) \
+ X(A510) \
+ X(X1) \
+ X(V1) \
+ X(A64FX) \
+ X(N1)
+
+/** CPU model types
*
+ * @note We only need to detect CPUs we have microarchitecture-specific code for.
+ * @note Architecture features are detected via HWCAPs.
*/
-enum class MemoryPolicy
+enum class CPUModel
{
- MINIMIZE,
- NORMAL
+#define X(model) model,
+ ARM_COMPUTE_CPU_MODEL_LIST
+#undef X
};
-/** Convert a cpumodel value to a string
- *
- * @param val CPUModel value to be converted
- *
- * @return String representing the corresponding CPUModel.
- */
-inline std::string cpu_model_to_string(CPUModel val)
-{
- switch(val)
- {
- case CPUModel::GENERIC:
- {
- return std::string("GENERIC");
- }
- case CPUModel::GENERIC_FP16:
- {
- return std::string("GENERIC_FP16");
- }
- case CPUModel::GENERIC_FP16_DOT:
- {
- return std::string("GENERIC_FP16_DOT");
- }
- case CPUModel::A53:
- {
- return std::string("A53");
- }
- case CPUModel::A55r0:
- {
- return std::string("A55r0");
- }
- case CPUModel::A55r1:
- {
- return std::string("A55r1");
- }
- default:
- {
- ARM_COMPUTE_ERROR("Invalid CPUModel.");
- return std::string("GENERIC");
- }
- }
-}
-
class CPUInfo final
{
-public:
- /** Constructor */
+protected:
CPUInfo();
+ ~CPUInfo();
- /** Disable copy constructor and assignment operator to avoid copying the vector of CPUs each time
- * CPUInfo is initialized once in the IScheduler and ThreadInfo will get a pointer to it.
+public:
+    /** Access the CPUInfo singleton.
+     * This method has been deprecated and will be removed in future releases
+     * @return The CPUInfo instance.
*/
- CPUInfo &operator=(const CPUInfo &cpuinfo) = delete;
- CPUInfo(const CPUInfo &cpuinfo) = delete;
- CPUInfo &operator=(CPUInfo &&cpuinfo) = default;
- CPUInfo(CPUInfo &&cpuinfo) = default;
+ static CPUInfo &get();
+
+    /* Delete move and copy constructors and assignment operators */
+ CPUInfo(CPUInfo const &) = delete; // Copy construct
+ CPUInfo(CPUInfo &&) = delete; // Move construct
+ CPUInfo &operator=(CPUInfo const &) = delete; // Copy assign
+ CPUInfo &operator=(CPUInfo &&) = delete; // Move assign
/** Checks if the cpu model supports fp16.
*
- * @return true of the cpu supports fp16, false otherwise
+ * @return true if the cpu supports fp16, false otherwise
*/
bool has_fp16() const;
+ /** Checks if the cpu model supports bf16.
+ *
+ * @return true if the cpu supports bf16, false otherwise
+ */
+ bool has_bf16() const;
+    /** Checks if the cpu model supports bf16 with sve.
+     *
+     * @return true if the cpu supports bf16 with sve, false otherwise
+ */
+ bool has_svebf16() const;
/** Checks if the cpu model supports dot product.
*
- * @return true of the cpu supports dot product, false otherwise
+ * @return true if the cpu supports dot product, false otherwise
*/
bool has_dotprod() const;
+    /** Checks if the cpu model supports floating-point matrix multiplication with sve.
+     *
+     * @return true if the cpu supports floating-point matrix multiplication with sve, false otherwise
+ */
+ bool has_svef32mm() const;
+ /** Checks if the cpu model supports integer matrix multiplication.
+ *
+ * @return true if the cpu supports integer matrix multiplication, false otherwise
+ */
+ bool has_i8mm() const;
+    /** Checks if the cpu model supports integer matrix multiplication with sve.
+     *
+     * @return true if the cpu supports integer matrix multiplication with sve, false otherwise
+ */
+ bool has_svei8mm() const;
+ /** Checks if the cpu model supports sve.
+ *
+ * @return true if the cpu supports sve, false otherwise
+ */
+ bool has_sve() const;
+ /** Checks if the cpu model supports sve2.
+ *
+ * @return true if the cpu supports sve2, false otherwise
+ */
+ bool has_sve2() const;
+ /** Checks if the cpu model supports sme.
+ *
+ * @return true if the cpu supports sme, false otherwise
+ */
+ bool has_sme() const;
+ /** Checks if the cpu model supports sme2.
+ *
+ * @return true if the cpu supports sme2, false otherwise
+ */
+ bool has_sme2() const;
/** Gets the cpu model for a given cpuid.
*
* @param[in] cpuid the id of the cpu core to be retrieved,
@@ -138,6 +150,11 @@ public:
* @return Current thread's @ref CPUModel
*/
CPUModel get_cpu_model() const;
+ /** Gets the current cpu's ISA information
+ *
+ * @return Current cpu's ISA information
+ */
+ cpuinfo::CpuIsaInfo get_isa() const;
/** Gets the L1 cache size
*
* @return the size of the L1 cache
@@ -148,85 +165,41 @@ public:
     * @return the size of the L2 cache
*/
unsigned int get_L2_cache_size() const;
- /** Set the L1 cache size
- *
- * @param[in] size the new size to be set.
- */
- void set_L1_cache_size(unsigned int size);
- /** Set the L2 cache size
- *
- * @param[in] size the new size to be set.
- */
- void set_L2_cache_size(unsigned int size);
- /** Set fp16 support
- *
- * @param[in] fp16 whether the cpu supports fp16.
- */
- void set_fp16(const bool fp16);
- /** Set dot product support
- *
- * @param[in] dotprod whether the cpu supports dot product.
- */
- void set_dotprod(const bool dotprod);
- /** Set the cpumodel for a given cpu core
- *
- * @param[in] cpuid the id of the core to be set.
- * @param[in] model the @ref CPUModel to be set.
- */
- void set_cpu_model(unsigned int cpuid, CPUModel model);
- /** Set max number of cpus
- *
- * @param[in] cpu_count the number of CPUs in the system.
- */
- void set_cpu_num(unsigned int cpu_count);
-
/** Return the maximum number of CPUs present
*
* @return Number of CPUs
*/
unsigned int get_cpu_num() const;
-
-private:
- std::vector<CPUModel> _percpu = {};
- bool _fp16 = false;
- bool _dotprod = false;
- unsigned int _L1_cache_size = 32768;
- unsigned int _L2_cache_size = 262144;
-};
-
-class MEMInfo final
-{
-public:
- MEMInfo();
-
- /** Return the total amount of RAM memory in the system expressed in KB.
+ /** Return the maximum number of CPUs present excluding the little cores
+ * in case of an Android device
*
- * @return Total memory
+ * @return Number of CPUs excluding little
*/
- size_t get_total_in_kb() const;
-
- static void set_policy(MemoryPolicy policy);
- static MemoryPolicy get_policy();
+ unsigned int get_cpu_num_excluding_little() const;
+    /** Return whether the device has little, medium and big CPUs in case
+     * of an Android device; returns false otherwise
+ *
+ * @return Whether the device has little, medium and big CPUs
+ */
+ bool cpu_has_little_mid_big() const;
- /** Common memory sizes expressed in Kb to avoid having them
- * duplicated throughout the code.
+ /** Return the vector length in bytes for sme2
+ *
+ * @return Vector length if sme2 is enabled, otherwise returns 0.
*/
- static const size_t ONE_GB_IN_KB = { 1035842 };
- static const size_t TWO_GB_IN_KB = { ONE_GB_IN_KB * 2 };
+ unsigned long get_sme2_vector_length() const;
private:
- size_t _total;
- size_t _free;
- size_t _buffer;
- static MemoryPolicy _policy;
+ struct Impl;
+ std::unique_ptr<Impl> _impl;
};
/** Information about executing thread and CPU. */
struct ThreadInfo
{
- int thread_id{ 0 };
- int num_threads{ 1 };
- const CPUInfo *cpu_info{ nullptr };
+ int thread_id{0};
+ int num_threads{1};
+ const CPUInfo *cpu_info{nullptr};
};
} // namespace arm_compute
-#endif /* ARM_COMPUTE_CPP_TYPES_H */
+#endif // ACL_ARM_COMPUTE_CORE_CPP_CPPTYPES_H
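
Two usage notes on the rewritten header. The X-macro keeps the model list in one place, so the string helper deleted above can be regenerated from it, and feature queries now go through the read-only singleton instead of per-instance setters. A sketch of both, assuming only what this header declares plus <string>:

    // Regenerate the deleted cpu_model_to_string() from the X-macro.
    inline std::string cpu_model_to_string(CPUModel val)
    {
        switch(val)
        {
    #define X(model)          \
        case CPUModel::model: \
            return #model;
            ARM_COMPUTE_CPU_MODEL_LIST
    #undef X
            default:
                ARM_COMPUTE_ERROR("Invalid CPUModel.");
                return "GENERIC";
        }
    }

    // Feature checks via the singleton; no setters are needed (or available) any more.
    bool prefer_sve2_path()
    {
        return CPUInfo::get().has_sve2();
    }
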
diff --git a/arm_compute/core/CPP/ICPPKernel.h b/arm_compute/core/CPP/ICPPKernel.h
index ec05af20bd..03967a536d 100644
--- a/arm_compute/core/CPP/ICPPKernel.h
+++ b/arm_compute/core/CPP/ICPPKernel.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2016-2020 ARM Limited.
+ * Copyright (c) 2016-2022 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -25,16 +25,21 @@
#define ARM_COMPUTE_ICPPKERNEL_H
#include "arm_compute/core/CPP/CPPTypes.h"
+#include "arm_compute/core/experimental/Types.h"
#include "arm_compute/core/IKernel.h"
+#include "arm_compute/core/Types.h"
namespace arm_compute
{
class Window;
+class ITensor;
/** Common interface for all kernels implemented in C++ */
class ICPPKernel : public IKernel
{
public:
+ static constexpr size_t default_mws = 1; /* Default minimum workload size value - no impact */
+
/** Default destructor */
virtual ~ICPPKernel() = default;
@@ -51,8 +56,7 @@ public:
*/
virtual void run(const Window &window, const ThreadInfo &info)
{
- ARM_COMPUTE_UNUSED(window);
- ARM_COMPUTE_UNUSED(info);
+ ARM_COMPUTE_UNUSED(window, info);
ARM_COMPUTE_ERROR("default implementation of legacy run() virtual member function invoked");
}
@@ -69,6 +73,37 @@ public:
run(window, info);
}
+ /** Execute the kernel on the passed window
+ *
+ * @warning If is_parallelisable() returns false then the passed window must be equal to window()
+ *
+ * @note The window has to be a region within the window returned by the window() method
+ *
+ * @note The width of the window has to be a multiple of num_elems_processed_per_iteration().
+ *
+ * @param[in] tensors A vector containing the tensors to operate on.
+ * @param[in] window Region on which to execute the kernel. (Must be a region of the window returned by window())
+ * @param[in] info Info about executing thread and CPU.
+ */
+ virtual void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info)
+ {
+ ARM_COMPUTE_UNUSED(tensors, window, info);
+ }
+
+ /** Return minimum workload size of the relevant kernel
+ *
+ * @param[in] platform The CPU platform used to create the context.
+ * @param[in] thread_count Number of threads in the execution.
+ *
+ * @return Minimum workload size for requested configuration.
+ */
+ virtual size_t get_mws(const CPUInfo &platform, size_t thread_count) const
+ {
+ ARM_COMPUTE_UNUSED(platform, thread_count);
+
+ return default_mws;
+ }
+
/** Name of the kernel
*
* @return Kernel name
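
The new run_op()/get_mws() pair shifts kernels from tensors stored at configure() time to caller-owned tensor packs, and lets a kernel advertise a minimum per-thread workload to the scheduler. A sketch of a kernel opting into both; the ACL_SRC/ACL_DST ids come from the experimental Types header included above, and the element copy itself is elided:

    class MyCopyKernel : public ICPPKernel
    {
    public:
        const char *name() const override
        {
            return "MyCopyKernel";
        }
        // Stateless execution: the tensors to operate on arrive in the pack per call.
        void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override
        {
            ARM_COMPUTE_UNUSED(info);
            const ITensor *src = tensors.get_const_tensor(TensorType::ACL_SRC);
            ITensor       *dst = tensors.get_tensor(TensorType::ACL_DST);
            // ... copy the elements covered by 'window' from src to dst ...
            ARM_COMPUTE_UNUSED(src, dst, window);
        }
        // Illustrative value only: ask the scheduler not to split work below
        // 1024 elements per thread.
        size_t get_mws(const CPUInfo &platform, size_t thread_count) const override
        {
            ARM_COMPUTE_UNUSED(platform, thread_count);
            return 1024;
        }
    };
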
diff --git a/arm_compute/core/CPP/ICPPSimpleKernel.h b/arm_compute/core/CPP/ICPPSimpleKernel.h
deleted file mode 100644
index acdd054c0e..0000000000
--- a/arm_compute/core/CPP/ICPPSimpleKernel.h
+++ /dev/null
@@ -1,76 +0,0 @@
-/*
- * Copyright (c) 2017-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_ICPPSIMPLEKERNEL_H
-#define ARM_COMPUTE_ICPPSIMPLEKERNEL_H
-
-#include "arm_compute/core/CPP/ICPPKernel.h"
-
-namespace arm_compute
-{
-class ITensor;
-
-/** Interface for simple C++ kernels having 1 tensor input and 1 tensor output */
-class ICPPSimpleKernel : public ICPPKernel
-{
-public:
- /** Constructor */
- ICPPSimpleKernel();
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- ICPPSimpleKernel(const ICPPSimpleKernel &) = delete;
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- ICPPSimpleKernel &operator=(const ICPPSimpleKernel &) = delete;
- /** Allow instances of this class to be moved */
- ICPPSimpleKernel(ICPPSimpleKernel &&) = default;
- /** Allow instances of this class to be moved */
- ICPPSimpleKernel &operator=(ICPPSimpleKernel &&) = default;
- /** Default destructor */
- ~ICPPSimpleKernel() = default;
-
-protected:
- /** Configure the kernel
- *
- * @param[in] input Source tensor.
- * @param[out] output Destination tensor.
- * @param[in] num_elems_processed_per_iteration Number of processed elements per iteration.
- * @param[in] border_undefined (Optional) True if the border mode is undefined. False if it's replicate or constant.
- * @param[in] border_size (Optional) Size of the border.
- */
- void configure(const ITensor *input, ITensor *output, unsigned int num_elems_processed_per_iteration, bool border_undefined = false, const BorderSize &border_size = BorderSize());
- /** Static function to check if given info will lead to a valid configuration of @ref ICPPSimpleKernel.
- *
- * @param[in] input Source tensor info.
- * @param[in] output Destination tensor info.
- * @param[in] num_elems_processed_per_iteration Number of processed elements per iteration.
- * @param[in] border_undefined (Optional) True if the border mode is undefined. False if it's replicate or constant.
- * @param[in] border_size (Optional) Size of the border.
- */
- static Status validate(const ITensorInfo *input, const ITensorInfo *output, unsigned int num_elems_processed_per_iteration,
- bool border_undefined = false, const BorderSize &border_size = BorderSize());
-
-protected:
- const ITensor *_input;
- ITensor *_output;
-};
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_ICPPSIMPLEKERNEL_H */
diff --git a/arm_compute/core/CPP/Validate.h b/arm_compute/core/CPP/Validate.h
deleted file mode 100644
index dfee9de86e..0000000000
--- a/arm_compute/core/CPP/Validate.h
+++ /dev/null
@@ -1,117 +0,0 @@
-/*
- * Copyright (c) 2018-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CPP_VALIDATE_H
-#define ARM_COMPUTE_CPP_VALIDATE_H
-
-#include "arm_compute/core/Validate.h"
-
-namespace arm_compute
-{
-/** Return an error if the data type of the passed tensor info is FP16 and FP16 support is not compiled in.
- *
- * @param[in] function Function in which the error occurred.
- * @param[in] file Name of the file where the error occurred.
- * @param[in] line Line on which the error occurred.
- * @param[in] tensor_info Tensor info to validate.
- *
- * @return Status
- */
-inline Status error_on_unsupported_cpu_fp16(const char *function, const char *file, const int line,
- const ITensorInfo *tensor_info)
-{
- ARM_COMPUTE_RETURN_ERROR_ON_LOC(tensor_info == nullptr, function, file, line);
-#ifndef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
- ARM_COMPUTE_RETURN_ERROR_ON_LOC_MSG(tensor_info->data_type() == DataType::F16,
- function, file, line, "This CPU architecture does not support F16 data type, you need v8.2 or above");
-#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
- return Status {};
-}
-
-/** Return an error if the data type of the passed tensor info is BFLOAT16 and BFLOAT16 support is not compiled in.
- *
- * @param[in] function Function in which the error occurred.
- * @param[in] file Name of the file where the error occurred.
- * @param[in] line Line on which the error occurred.
- * @param[in] tensor_info Tensor info to validate.
- *
- * @return Status
- */
-inline Status error_on_unsupported_cpu_bf16(const char *function, const char *file, const int line,
- const ITensorInfo *tensor_info)
-{
- ARM_COMPUTE_RETURN_ERROR_ON_LOC(tensor_info == nullptr, function, file, line);
-#if !(defined(__ARM_FEATURE_BF16_VECTOR_ARITHMETIC) || defined(ARM_COMPUTE_FORCE_BF16))
- ARM_COMPUTE_RETURN_ERROR_ON_LOC_MSG(tensor_info->data_type() == DataType::BFLOAT16,
- function, file, line, "This CPU architecture does not support BFloat16 data type, you need v8.6 or above");
-#endif /* !(defined(__ARM_FEATURE_BF16_VECTOR_ARITHMETIC) || defined(ARM_COMPUTE_FORCE_BF16)) */
- return Status {};
-}
-
-/** Return an error if the data type of the passed tensor is FP16 and FP16 support is not compiled in.
- *
- * @param[in] function Function in which the error occurred.
- * @param[in] file Name of the file where the error occurred.
- * @param[in] line Line on which the error occurred.
- * @param[in] tensor Tensor to validate.
- *
- * @return Status
- */
-inline Status error_on_unsupported_cpu_fp16(const char *function, const char *file, const int line,
- const ITensor *tensor)
-{
- ARM_COMPUTE_RETURN_ERROR_ON_LOC(tensor == nullptr, function, file, line);
- ARM_COMPUTE_RETURN_ON_ERROR(::arm_compute::error_on_unsupported_cpu_fp16(function, file, line, tensor->info()));
- return Status{};
-}
-
-/** Return an error if the data type of the passed tensor is BFLOAT16 and BFLOAT16 support is not compiled in.
- *
- * @param[in] function Function in which the error occurred.
- * @param[in] file Name of the file where the error occurred.
- * @param[in] line Line on which the error occurred.
- * @param[in] tensor Tensor to validate.
- *
- * @return Status
- */
-inline Status error_on_unsupported_cpu_bf16(const char *function, const char *file, const int line,
- const ITensor *tensor)
-{
- ARM_COMPUTE_RETURN_ERROR_ON_LOC(tensor == nullptr, function, file, line);
- ARM_COMPUTE_RETURN_ON_ERROR(::arm_compute::error_on_unsupported_cpu_bf16(function, file, line, tensor->info()));
- return Status{};
-}
-
-#define ARM_COMPUTE_ERROR_ON_CPU_F16_UNSUPPORTED(tensor) \
- ARM_COMPUTE_ERROR_THROW_ON(::arm_compute::error_on_unsupported_cpu_fp16(__func__, __FILE__, __LINE__, tensor))
-
-#define ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(tensor) \
- ARM_COMPUTE_RETURN_ON_ERROR(::arm_compute::error_on_unsupported_cpu_fp16(__func__, __FILE__, __LINE__, tensor))
-
-#define ARM_COMPUTE_ERROR_ON_CPU_BF16_UNSUPPORTED(tensor) \
- ARM_COMPUTE_ERROR_THROW_ON(::arm_compute::error_on_unsupported_cpu_bf16(__func__, __FILE__, __LINE__, tensor))
-
-#define ARM_COMPUTE_RETURN_ERROR_ON_CPU_BF16_UNSUPPORTED(tensor) \
- ARM_COMPUTE_RETURN_ON_ERROR(::arm_compute::error_on_unsupported_cpu_bf16(__func__, __FILE__, __LINE__, tensor))
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_CPP_VALIDATE_H */
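
For reference, the macros deleted above were used at the top of a CPU kernel's validate() to fail fast on builds without the relevant arithmetic support. A minimal sketch of the call pattern (the surrounding function is hypothetical):

    Status validate_my_f16_kernel(const ITensorInfo *input, const ITensorInfo *output)
    {
        // Returns a descriptive error Status when FP16 support was not compiled in.
        ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(input);
        ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(output);
        return Status{};
    }
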
diff --git a/arm_compute/core/CPP/kernels/CPPBoxWithNonMaximaSuppressionLimitKernel.h b/arm_compute/core/CPP/kernels/CPPBoxWithNonMaximaSuppressionLimitKernel.h
index 3fa83a6d6d..dd91595ea6 100644
--- a/arm_compute/core/CPP/kernels/CPPBoxWithNonMaximaSuppressionLimitKernel.h
+++ b/arm_compute/core/CPP/kernels/CPPBoxWithNonMaximaSuppressionLimitKernel.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018-2019 ARM Limited.
+ * Copyright (c) 2018-2020 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -24,9 +24,7 @@
#ifndef ARM_COMPUTE_CPPBOXWITHNONMAXIMASUPPRESSIONLIMITKERNEL_H
#define ARM_COMPUTE_CPPBOXWITHNONMAXIMASUPPRESSIONLIMITKERNEL_H
-#include "arm_compute/core/IArray.h"
-#include "arm_compute/core/IHOG.h"
-#include "arm_compute/core/NEON/INEKernel.h"
+#include "arm_compute/core/CPP/ICPPKernel.h"
#include "arm_compute/core/Types.h"
namespace arm_compute
@@ -62,11 +60,19 @@ public:
* @param[out] classes The classes output tensor of size [N]. Data types supported: Same as @p scores_in
* @param[out] batch_splits_out (Optional) The batch splits output tensor [batch_size]. Data types supported: Same as @p scores_in
 * @param[out] keeps (Optional) The keeps output tensor of size [N]. Data types supported: Same as @p scores_in
- * @param[out] keeps_size (Optional) Number of filtered indices per class tensor of size [num_classes]. Data types supported: Same as @p scores_in
+ * @param[out] keeps_size (Optional) Number of filtered indices per class tensor of size [num_classes]. Data types supported: U32
* @param[in] info (Optional) BoxNMSLimitInfo information.
*/
- void configure(const ITensor *scores_in, const ITensor *boxes_in, const ITensor *batch_splits_in, ITensor *scores_out, ITensor *boxes_out, ITensor *classes,
- ITensor *batch_splits_out = nullptr, ITensor *keeps = nullptr, ITensor *keeps_size = nullptr, const BoxNMSLimitInfo info = BoxNMSLimitInfo());
+ void configure(const ITensor *scores_in,
+ const ITensor *boxes_in,
+ const ITensor *batch_splits_in,
+ ITensor *scores_out,
+ ITensor *boxes_out,
+ ITensor *classes,
+ ITensor *batch_splits_out = nullptr,
+ ITensor *keeps = nullptr,
+ ITensor *keeps_size = nullptr,
+ const BoxNMSLimitInfo info = BoxNMSLimitInfo());
// Inherited methods overridden:
void run(const Window &window, const ThreadInfo &info) override;
@@ -76,9 +82,9 @@ public:
void run_nmslimit();
private:
- const ITensor *_scores_in;
- const ITensor *_boxes_in;
- const ITensor *_batch_splits_in;
+ const ITensor *_scores_in;
+ const ITensor *_boxes_in;
+ const ITensor *_batch_splits_in;
ITensor *_scores_out;
ITensor *_boxes_out;
ITensor *_classes;
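
With the signature reflowed above, a minimal call needs only the six mandatory tensors; the trailing outputs default to nullptr and BoxNMSLimitInfo() supplies the default thresholds. A sketch, assuming pre-allocated tensors of the documented shapes:

    CPPBoxWithNonMaximaSuppressionLimitKernel nms_limit;
    nms_limit.configure(&scores_in, &boxes_in, &batch_splits_in,
                        &scores_out, &boxes_out, &classes);
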
diff --git a/arm_compute/core/CPP/kernels/CPPCornerCandidatesKernel.h b/arm_compute/core/CPP/kernels/CPPCornerCandidatesKernel.h
deleted file mode 100644
index eeb6a65525..0000000000
--- a/arm_compute/core/CPP/kernels/CPPCornerCandidatesKernel.h
+++ /dev/null
@@ -1,80 +0,0 @@
-/*
- * Copyright (c) 2017-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CPPCORNERCANDIDATESKERNEL_H
-#define ARM_COMPUTE_CPPCORNERCANDIDATESKERNEL_H
-
-#include "arm_compute/core/IArray.h"
-#include "arm_compute/core/NEON/INEKernel.h"
-
-#include "support/Mutex.h"
-
-#include <cstdint>
-
-namespace arm_compute
-{
-class ITensor;
-/** Interface for CPP Images. */
-using IImage = ITensor;
-
-/** CPP kernel to perform corner candidates
- */
-class CPPCornerCandidatesKernel : public INEKernel
-{
-public:
- const char *name() const override
- {
- return "CPPCornerCandidatesKernel";
- }
- /** Default constructor */
- CPPCornerCandidatesKernel();
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- CPPCornerCandidatesKernel(const CPPCornerCandidatesKernel &) = delete;
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- CPPCornerCandidatesKernel &operator=(const CPPCornerCandidatesKernel &) = delete;
- /** Prevent instances of this class from being moved (As this class contains non movable objects) */
- CPPCornerCandidatesKernel(CPPCornerCandidatesKernel &&) = delete;
- /** Prevent instances of this class from being moved (As this class contains non movable objects) */
- CPPCornerCandidatesKernel &operator=(CPPCornerCandidatesKernel &&) = delete;
- /** Default destructor */
- ~CPPCornerCandidatesKernel() = default;
-
- /** Setup the kernel parameters
- *
- * @param[in] input Source image (harris score). Format supported F32
- * @param[out] output Destination array of InternalKeypoint
- * @param[out] num_corner_candidates Number of corner candidates
- */
- void configure(const IImage *input, InternalKeypoint *output, int32_t *num_corner_candidates);
-
- // Inherited methods overridden:
- void run(const Window &window, const ThreadInfo &info) override;
-
-private:
- int32_t *_num_corner_candidates; /**< Number of corner candidates */
- arm_compute::Mutex _corner_candidates_mutex; /**< Mutex to preventing race conditions */
- const IImage *_input; /**< Source image - Harris score */
- InternalKeypoint *_output; /**< Array of NEInternalKeypoint */
-};
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_CPPCORNERCANDIDATESKERNEL_H */
diff --git a/arm_compute/core/CPP/kernels/CPPDetectionWindowNonMaximaSuppressionKernel.h b/arm_compute/core/CPP/kernels/CPPDetectionWindowNonMaximaSuppressionKernel.h
deleted file mode 100644
index cf8e4f00b9..0000000000
--- a/arm_compute/core/CPP/kernels/CPPDetectionWindowNonMaximaSuppressionKernel.h
+++ /dev/null
@@ -1,75 +0,0 @@
-/*
- * Copyright (c) 2017-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CPPDETECTIONWINDOWNONMAXIMASUPPRESSIONKERNEL_H
-#define ARM_COMPUTE_CPPDETECTIONWINDOWNONMAXIMASUPPRESSIONKERNEL_H
-
-#include "arm_compute/core/IArray.h"
-#include "arm_compute/core/IHOG.h"
-#include "arm_compute/core/NEON/INEKernel.h"
-#include "arm_compute/core/Types.h"
-
-namespace arm_compute
-{
-/** CPP kernel to perform in-place computation of euclidean distance on IDetectionWindowArray
- *
- * @note This kernel is meant to be used alongside HOG or other object detection algorithms to perform a non-maxima suppression on a
- * IDetectionWindowArray
- */
-class CPPDetectionWindowNonMaximaSuppressionKernel : public ICPPKernel
-{
-public:
- const char *name() const override
- {
- return "CPPDetectionWindowNonMaximaSuppressionKernel";
- }
- /** Default constructor */
- CPPDetectionWindowNonMaximaSuppressionKernel();
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- CPPDetectionWindowNonMaximaSuppressionKernel(const CPPDetectionWindowNonMaximaSuppressionKernel &) = delete;
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- CPPDetectionWindowNonMaximaSuppressionKernel &operator=(const CPPDetectionWindowNonMaximaSuppressionKernel &) = delete;
- /** Allow instances of this class to be moved */
- CPPDetectionWindowNonMaximaSuppressionKernel(CPPDetectionWindowNonMaximaSuppressionKernel &&) = default;
- /** Allow instances of this class to be moved */
- CPPDetectionWindowNonMaximaSuppressionKernel &operator=(CPPDetectionWindowNonMaximaSuppressionKernel &&) = default;
- /** Initialise the kernel's input, output and the euclidean minimum distance
- *
- * @attention: If @ref IDetectionWindowArray is passed to the kernel, the map() and unmap() methods @ref IDetectionWindowArray must be called respectively before and after
- * the run() method of @ref CPPDetectionWindowNonMaximaSuppressionKernel
- *
- * @param[in, out] input_output Input/Output array of @ref DetectionWindow
- * @param[in] min_distance Radial Euclidean distance for non-maxima suppression
- */
- void configure(IDetectionWindowArray *input_output, float min_distance);
-
- // Inherited methods overridden:
- void run(const Window &window, const ThreadInfo &info) override;
- bool is_parallelisable() const override;
-
-private:
- IDetectionWindowArray *_input_output;
- float _min_distance;
-};
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_CPPDETECTIONWINDOWNONMAXIMASUPPRESSIONKERNEL_H */
diff --git a/arm_compute/core/CPP/kernels/CPPNonMaximumSuppressionKernel.h b/arm_compute/core/CPP/kernels/CPPNonMaximumSuppressionKernel.h
index cb416af070..d1f7f8670f 100644
--- a/arm_compute/core/CPP/kernels/CPPNonMaximumSuppressionKernel.h
+++ b/arm_compute/core/CPP/kernels/CPPNonMaximumSuppressionKernel.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019 ARM Limited.
+ * Copyright (c) 2019 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -24,9 +24,8 @@
#ifndef ARM_COMPUTE_CPP_NONMAXIMUMSUPPRESSIONKERNEL_LAYER_H
#define ARM_COMPUTE_CPP_NONMAXIMUMSUPPRESSIONKERNEL_LAYER_H
-#include "arm_compute/runtime/CPP/ICPPSimpleFunction.h"
-
#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/CPP/ICPPSimpleFunction.h"
namespace arm_compute
{
@@ -65,7 +64,12 @@ public:
* @param[in] iou_threshold The threshold used in non maximum suppression.
*
*/
- void configure(const ITensor *input_bboxes, const ITensor *input_scores, ITensor *output_indices, unsigned int max_output_size, const float score_threshold, const float iou_threshold);
+ void configure(const ITensor *input_bboxes,
+ const ITensor *input_scores,
+ ITensor *output_indices,
+ unsigned int max_output_size,
+ const float score_threshold,
+ const float iou_threshold);
/** Static function to check if given arguments will lead to a valid configuration of @ref CPPNonMaximumSuppressionKernel
*
@@ -77,8 +81,12 @@ public:
* @param[in] iou_threshold The threshold used in non maximum suppression.
*
*/
- static Status validate(const ITensorInfo *input_bboxes, const ITensorInfo *input_scores, const ITensorInfo *output_indices, unsigned int max_output_size,
- const float score_threshold, const float iou_threshold);
+ static Status validate(const ITensorInfo *input_bboxes,
+ const ITensorInfo *input_scores,
+ const ITensorInfo *output_indices,
+ unsigned int max_output_size,
+ const float score_threshold,
+ const float iou_threshold);
// Inherited methods overridden:
void run(const Window &window, const ThreadInfo &info) override;
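Since validate() deliberately mirrors configure() but takes ITensorInfo pointers, a configuration can be vetted before any buffer is allocated. A hedged sketch with illustrative values, not taken from this patch:

    #include "arm_compute/core/CPP/kernels/CPPNonMaximumSuppressionKernel.h"
    #include "arm_compute/runtime/Tensor.h"

    using namespace arm_compute;

    void configure_nms(Tensor &bboxes, Tensor &scores, Tensor &indices)
    {
        const unsigned int max_output_size = 100;  // illustrative thresholds
        const float        score_threshold = 0.5f;
        const float        iou_threshold   = 0.8f;

        // Shape/type checks run on metadata only, before allocation.
        ARM_COMPUTE_ERROR_THROW_ON(CPPNonMaximumSuppressionKernel::validate(
            bboxes.info(), scores.info(), indices.info(), max_output_size, score_threshold, iou_threshold));

        CPPNonMaximumSuppressionKernel nms;
        nms.configure(&bboxes, &scores, &indices, max_output_size, score_threshold, iou_threshold);
    }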
diff --git a/arm_compute/core/CPP/kernels/CPPPermuteKernel.h b/arm_compute/core/CPP/kernels/CPPPermuteKernel.h
index e75152f4ea..d141c2fb70 100644
--- a/arm_compute/core/CPP/kernels/CPPPermuteKernel.h
+++ b/arm_compute/core/CPP/kernels/CPPPermuteKernel.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -56,15 +56,15 @@ public:
/** Set the input and output of the kernel.
*
- * @param[in] input The input tensor to permute. Data types supported: U8/S8/QASYMM8/QASYMM8_SIGNED/U16/S16/F16/U32/S32/F32
- * @param[out] output The output tensor. Data types supported: Same as @p input
+ * @param[in] input The input tensor to permute. Data types supported: All.
+ * @param[out] output The output tensor. Data types supported: same as @p input
* @param[in] perm Permutation vector
*/
void configure(const ITensor *input, ITensor *output, const PermutationVector &perm);
/** Static function to check if given info will lead to a valid configuration of @ref CPPPermuteKernel
*
- * @param[in] input The input tensor to permute. Data types supported: U8/S8/QASYMM8/QASYMM8_SIGNED/U16/S16/F16/U32/S32/F32
- * @param[in] output The output tensor. Data types supported: Same as @p input
+ * @param[in] input The input tensor to permute. Data types supported: All.
+ * @param[in] output The output tensor. Data types supported: same as @p input
* @param[in] perm Permutation vector
*
* @return a status
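The permutation vector follows the library-wide convention that output dimension i takes its size from input dimension perm[i]. A brief sketch of the now type-agnostic kernel, assuming the (2, 0, 1) NCHW-to-NHWC vector used elsewhere in the library (names illustrative):

    #include "arm_compute/core/CPP/kernels/CPPPermuteKernel.h"
    #include "arm_compute/runtime/Tensor.h"

    using namespace arm_compute;

    void configure_permute_nchw_to_nhwc(Tensor &src, Tensor &dst)
    {
        const PermutationVector perm{2U, 0U, 1U}; // dst dim i <- src dim perm[i]
        CPPPermuteKernel permute;
        permute.configure(&src, &dst, perm);
    }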
diff --git a/arm_compute/core/CPP/kernels/CPPSortEuclideanDistanceKernel.h b/arm_compute/core/CPP/kernels/CPPSortEuclideanDistanceKernel.h
deleted file mode 100644
index d127ef8d8a..0000000000
--- a/arm_compute/core/CPP/kernels/CPPSortEuclideanDistanceKernel.h
+++ /dev/null
@@ -1,73 +0,0 @@
-/*
- * Copyright (c) 2017-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CPPSORTEUCLIDEANDISTANCEKERNEL_H
-#define ARM_COMPUTE_CPPSORTEUCLIDEANDISTANCEKERNEL_H
-
-#include "arm_compute/core/CPP/ICPPKernel.h"
-#include "arm_compute/core/IArray.h"
-
-#include <cstdint>
-#include <mutex>
-
-namespace arm_compute
-{
-/** CPP kernel to perform sorting and euclidean distance */
-class CPPSortEuclideanDistanceKernel : public ICPPKernel
-{
-public:
- const char *name() const override
- {
- return "CPPSortEuclideanDistanceKernel";
- }
- /** Default constructor */
- CPPSortEuclideanDistanceKernel();
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- CPPSortEuclideanDistanceKernel(const CPPSortEuclideanDistanceKernel &) = delete;
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- CPPSortEuclideanDistanceKernel &operator=(const CPPSortEuclideanDistanceKernel &) = delete;
- /** Allow instances of this class to be moved */
- CPPSortEuclideanDistanceKernel(CPPSortEuclideanDistanceKernel &&) = default;
- /** Allow instances of this class to be moved */
- CPPSortEuclideanDistanceKernel &operator=(CPPSortEuclideanDistanceKernel &&) = default;
- /** Initialise the kernel's source, destination and border mode.
- *
- * @param[in,out] in_out Input internal keypoints. Marked as out as the kernel writes 0 in the strength member.
- * @param[out] output Output keypoints.
- * @param[in] num_corner_candidates Pointer to the number of corner candidates in the input array
- * @param[in] min_distance Radial Euclidean distance to use
- */
- void configure(InternalKeypoint *in_out, IKeyPointArray *output, const int32_t *num_corner_candidates, float min_distance);
-
- // Inherited methods overridden:
- void run(const Window &window, const ThreadInfo &info) override;
- bool is_parallelisable() const override;
-
-private:
- const int32_t *_num_corner_candidates; /**< Number of corner candidates */
- float _min_distance; /**< Radial Euclidean distance */
- InternalKeypoint *_in_out; /**< Source array of InternalKeypoint */
- IKeyPointArray *_output; /**< Destination array of IKeyPointArray */
-};
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_CPPSORTEUCLIDEANDISTANCEKERNEL_H */
diff --git a/arm_compute/core/CPP/kernels/CPPTopKVKernel.h b/arm_compute/core/CPP/kernels/CPPTopKVKernel.h
index 4b9bfdd3c9..7326a10e2f 100644
--- a/arm_compute/core/CPP/kernels/CPPTopKVKernel.h
+++ b/arm_compute/core/CPP/kernels/CPPTopKVKernel.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019-2020 ARM Limited.
+ * Copyright (c) 2019-2020 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -54,7 +54,7 @@ public:
/** Set the input and output of the kernel.
*
* @param[in] predictions A batch_size x classes tensor. Data types supported: F16/S32/F32/QASYMM8/QASYMM8_SIGNED
- * @param[in] targets A batch_size 1D tensor of class ids. Data types supported: S32
+ * @param[in] targets A batch_size 1D tensor of class ids. Data types supported: U32
* @param[out] output Computed precision at @p k as a bool 1D tensor. Data types supported: U8
* @param[in] k Number of top elements to look at for computing precision.
*/
@@ -63,13 +63,14 @@ public:
/** Static function to check if given info will lead to a valid configuration of @ref CPPTopKVKernel
*
* @param[in] predictions A batch_size x classes tensor info. Data types supported: F16/S32/F32/QASYMM8/QASYMM8_SIGNED
- * @param[in] targets A batch_size 1D tensor info of class ids. Data types supported: S32
+ * @param[in] targets A batch_size 1D tensor info of class ids. Data types supported: U32
* @param[in] output Computed precision at @p k as a bool 1D tensor info. Data types supported: U8
* @param[in] k Number of top elements to look at for computing precision.
*
* @return a status
*/
- static Status validate(const ITensorInfo *predictions, const ITensorInfo *targets, ITensorInfo *output, const unsigned int k);
+ static Status
+ validate(const ITensorInfo *predictions, const ITensorInfo *targets, ITensorInfo *output, const unsigned int k);
// Inherited methods overridden:
void run(const Window &window, const ThreadInfo &info) override;
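A sketch of the validate-then-configure flow, keeping the corrected U32 targets documentation above in mind (values illustrative):

    #include "arm_compute/core/CPP/kernels/CPPTopKVKernel.h"
    #include "arm_compute/runtime/Tensor.h"

    using namespace arm_compute;

    void configure_topkv(Tensor &predictions, Tensor &targets, Tensor &output)
    {
        const unsigned int k = 5; // top-k depth, illustrative
        // Per the documentation fix above, targets holds U32 class ids.
        ARM_COMPUTE_ERROR_THROW_ON(
            CPPTopKVKernel::validate(predictions.info(), targets.info(), output.info(), k));

        CPPTopKVKernel topkv;
        topkv.configure(&predictions, &targets, &output, k);
    }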
diff --git a/arm_compute/core/CPP/kernels/CPPUpsampleKernel.h b/arm_compute/core/CPP/kernels/CPPUpsampleKernel.h
index 9fbc9b697c..dd7e07c390 100644
--- a/arm_compute/core/CPP/kernels/CPPUpsampleKernel.h
+++ b/arm_compute/core/CPP/kernels/CPPUpsampleKernel.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -55,8 +55,8 @@ public:
/** Set the input and output of the kernel.
*
- * @param[in] input The input tensor to upsample. Data types supported: F32/F16/QASYMM8/QASYMM8_SIGNED
- * @param[out] output The output tensor. Data types supported: Same as @p input
+ * @param[in] input The input tensor to upsample. Data types supported: All.
+ * @param[out] output The output tensor. Data types supported: same as @p input.
* @param[in] info Padding info.
*/
void configure(const ITensor *input, ITensor *output, const PadStrideInfo &info);
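For completeness, a minimal sketch of the now type-agnostic upsample kernel; the stride-2, zero-padding PadStrideInfo is the usual deconvolution-style upsampling choice, not something mandated by this patch:

    #include "arm_compute/core/CPP/kernels/CPPUpsampleKernel.h"
    #include "arm_compute/runtime/Tensor.h"

    using namespace arm_compute;

    void configure_upsample(Tensor &src, Tensor &dst)
    {
        const PadStrideInfo info(2, 2, 0, 0); // stride x/y = 2, pad x/y = 0
        CPPUpsampleKernel upsample;
        upsample.configure(&src, &dst, info);
    }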
diff --git a/arm_compute/core/Coordinates.h b/arm_compute/core/Coordinates.h
index 78ca5250ab..d1240bb10a 100644
--- a/arm_compute/core/Coordinates.h
+++ b/arm_compute/core/Coordinates.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2016-2019 ARM Limited.
+ * Copyright (c) 2016-2019 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -42,8 +42,7 @@ public:
* @param[in] coords Values to initialize the dimensions.
*/
template <typename... Ts>
- constexpr Coordinates(Ts... coords)
- : Dimensions{ coords... }
+ constexpr Coordinates(Ts... coords) : Dimensions{coords...}
{
}
/** Allow instances of this class to be copy constructed */
@@ -57,5 +56,5 @@ public:
/** Default destructor */
~Coordinates() = default;
};
-}
+} // namespace arm_compute
#endif /*ARM_COMPUTE_COORDINATES_H*/
diff --git a/arm_compute/core/CoreTypes.h b/arm_compute/core/CoreTypes.h
new file mode 100644
index 0000000000..1a9db1937c
--- /dev/null
+++ b/arm_compute/core/CoreTypes.h
@@ -0,0 +1,352 @@
+/*
+ * Copyright (c) 2016-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ACL_ARM_COMPUTE_CORE_CORETYPES
+#define ACL_ARM_COMPUTE_CORE_CORETYPES
+
+#include "arm_compute/core/Strides.h"
+
+#include "support/Half.h"
+
+/** CoreTypes.h groups together essential small types that are used across functions */
+
+namespace arm_compute
+{
+/** 16-bit floating point type */
+using half = half_float::half;
+/** Permutation vector */
+using PermutationVector = Strides;
+
+/** Available channels */
+enum class Channel
+{
+ UNKNOWN, /**< Unknown channel format */
+ C0, /**< First channel (used by formats with unknown channel types). */
+ C1, /**< Second channel (used by formats with unknown channel types). */
+ C2, /**< Third channel (used by formats with unknown channel types). */
+ C3, /**< Fourth channel (used by formats with unknown channel types). */
+ R, /**< Red channel. */
+ G, /**< Green channel. */
+ B, /**< Blue channel. */
+ A, /**< Alpha channel. */
+ Y, /**< Luma channel. */
+ U, /**< Cb/U channel. */
+ V /**< Cr/V/Value channel. */
+};
+
+/** Image colour formats */
+enum class Format
+{
+ UNKNOWN, /**< Unknown image format */
+ U8, /**< 1 channel, 1 U8 per channel */
+ S16, /**< 1 channel, 1 S16 per channel */
+ U16, /**< 1 channel, 1 U16 per channel */
+ S32, /**< 1 channel, 1 S32 per channel */
+ U32, /**< 1 channel, 1 U32 per channel */
+ S64, /**< 1 channel, 1 S64 per channel */
+ U64, /**< 1 channel, 1 U64 per channel */
+ BFLOAT16, /**< 16-bit brain floating-point number */
+ F16, /**< 1 channel, 1 F16 per channel */
+ F32, /**< 1 channel, 1 F32 per channel */
+ UV88, /**< 2 channel, 1 U8 per channel */
+ RGB888, /**< 3 channels, 1 U8 per channel */
+ RGBA8888, /**< 4 channels, 1 U8 per channel */
+ YUV444, /**< A 3 plane of 8-bit 4:4:4 sampled Y, U, V planes */
+ YUYV422, /**< A single plane of 32-bit macro pixel of Y0, U0, Y1, V0 bytes */
+ NV12, /**< A 2 plane YUV format of Luma (Y) and interleaved UV data at 4:2:0 sampling */
+ NV21, /**< A 2 plane YUV format of Luma (Y) and interleaved VU data at 4:2:0 sampling */
+ IYUV, /**< A 3 plane of 8-bit 4:2:0 sampled Y, U, V planes */
+ UYVY422 /**< A single plane of 32-bit macro pixel of U0, Y0, V0, Y1 byte */
+};
+
+/** Available data types */
+enum class DataType
+{
+ UNKNOWN, /**< Unknown data type */
+ U8, /**< unsigned 8-bit number */
+ S8, /**< signed 8-bit number */
+ QSYMM8, /**< quantized, symmetric fixed-point 8-bit number */
+ QASYMM8, /**< quantized, asymmetric fixed-point 8-bit number unsigned */
+ QASYMM8_SIGNED, /**< quantized, asymmetric fixed-point 8-bit number signed */
+ QSYMM8_PER_CHANNEL, /**< quantized, symmetric per channel fixed-point 8-bit number */
+ U16, /**< unsigned 16-bit number */
+ S16, /**< signed 16-bit number */
+ QSYMM16, /**< quantized, symmetric fixed-point 16-bit number */
+ QASYMM16, /**< quantized, asymmetric fixed-point 16-bit number */
+ U32, /**< unsigned 32-bit number */
+ S32, /**< signed 32-bit number */
+ U64, /**< unsigned 64-bit number */
+ S64, /**< signed 64-bit number */
+ BFLOAT16, /**< 16-bit brain floating-point number */
+ F16, /**< 16-bit floating-point number */
+ F32, /**< 32-bit floating-point number */
+ F64, /**< 64-bit floating-point number */
+ SIZET /**< size_t */
+};
+
+/** [DataLayout enum definition] **/
+
+/** Supported tensor data layouts */
+enum class DataLayout
+{
+ UNKNOWN, /**< Unknown data layout */
+ NCHW, /**< Num samples, channels, height, width */
+ NHWC, /**< Num samples, height, width, channels */
+ NCDHW, /**< Num samples, channels, depth, height, width */
+ NDHWC /**< Num samples, depth, height, width, channels */
+};
+/** [DataLayout enum definition] **/
+
+/** Supported tensor data layout dimensions */
+enum class DataLayoutDimension
+{
+ CHANNEL, /**< channel */
+ HEIGHT, /**< height */
+ WIDTH, /**< width */
+ DEPTH, /**< depth */
+ BATCHES /**< batches */
+};
+
+/** Dimension rounding type when down-scaling on CNNs
+ * @note Used in pooling and convolution layer
+ */
+enum class DimensionRoundingType
+{
+ FLOOR, /**< Floor rounding */
+ CEIL /**< Ceil rounding */
+};
+
+class PadStrideInfo
+{
+public:
+ /** Constructor
+ *
+ * @param[in] stride_x (Optional) Stride, in elements, across x. Defaults to 1.
+ * @param[in] stride_y (Optional) Stride, in elements, across y. Defaults to 1.
+ * @param[in] pad_x (Optional) Padding, in elements, across x. Defaults to 0.
+ * @param[in] pad_y (Optional) Padding, in elements, across y. Defaults to 0.
+ * @param[in] round (Optional) Dimensions rounding. Defaults to @ref DimensionRoundingType::FLOOR.
+ */
+ PadStrideInfo(unsigned int stride_x = 1,
+ unsigned int stride_y = 1,
+ unsigned int pad_x = 0,
+ unsigned int pad_y = 0,
+ DimensionRoundingType round = DimensionRoundingType::FLOOR)
+ : _stride(std::make_pair(stride_x, stride_y)),
+ _pad_left(pad_x),
+ _pad_top(pad_y),
+ _pad_right(pad_x),
+ _pad_bottom(pad_y),
+ _round_type(round)
+ {
+ }
+ /** Constructor
+ *
+ * @param[in] stride_x Stride, in elements, across x.
+ * @param[in] stride_y Stride, in elements, across y.
+ * @param[in] pad_left Padding across x on the left, in elements.
+ * @param[in] pad_right Padding across x on the right, in elements.
+ * @param[in] pad_top Padding across y on the top, in elements.
+ * @param[in] pad_bottom Padding across y on the bottom, in elements.
+ * @param[in] round Dimensions rounding.
+ */
+ PadStrideInfo(unsigned int stride_x,
+ unsigned int stride_y,
+ unsigned int pad_left,
+ unsigned int pad_right,
+ unsigned int pad_top,
+ unsigned int pad_bottom,
+ DimensionRoundingType round)
+ : _stride(std::make_pair(stride_x, stride_y)),
+ _pad_left(pad_left),
+ _pad_top(pad_top),
+ _pad_right(pad_right),
+ _pad_bottom(pad_bottom),
+ _round_type(round)
+ {
+ }
+ /** Get the stride.
+ *
+ * @return a pair: stride x, stride y.
+ */
+ std::pair<unsigned int, unsigned int> stride() const
+ {
+ return _stride;
+ }
+ /** Check whether the padding is symmetric.
+ *
+ * @return True if the padding is symmetric.
+ */
+ bool padding_is_symmetric() const
+ {
+ return (_pad_left == _pad_right) && (_pad_top == _pad_bottom);
+ }
+ /** Get the padding.
+ *
+ * @note This should only be used when the padding is symmetric.
+ *
+ * @return a pair: padding left/right, padding top/bottom
+ */
+ std::pair<unsigned int, unsigned int> pad() const
+ {
+ //this accessor should be used only when padding is symmetric
+ ARM_COMPUTE_ERROR_ON(!padding_is_symmetric());
+ return std::make_pair(_pad_left, _pad_top);
+ }
+
+ /** Get the left padding */
+ unsigned int pad_left() const
+ {
+ return _pad_left;
+ }
+ /** Get the right padding */
+ unsigned int pad_right() const
+ {
+ return _pad_right;
+ }
+ /** Get the top padding */
+ unsigned int pad_top() const
+ {
+ return _pad_top;
+ }
+ /** Get the bottom padding */
+ unsigned int pad_bottom() const
+ {
+ return _pad_bottom;
+ }
+
+ /** Get the rounding type */
+ DimensionRoundingType round() const
+ {
+ return _round_type;
+ }
+
+ /** Check whether this has any padding */
+ bool has_padding() const
+ {
+ return (_pad_left != 0 || _pad_top != 0 || _pad_right != 0 || _pad_bottom != 0);
+ }
+
+private:
+ std::pair<unsigned int, unsigned int> _stride;
+ unsigned int _pad_left;
+ unsigned int _pad_top;
+ unsigned int _pad_right;
+ unsigned int _pad_bottom;
+
+ DimensionRoundingType _round_type;
+};
+
+/** Memory layouts for the weights tensor.
+ *
+ * * UNSPECIFIED is used to select kernels that do not run in
+ * variable weights mode.
+ *
+ * * ANY is used to query the kernel database to retrieve any of the
+ * kernels that runs in variable weights mode. Once a kernel is
+ * found, the specific format expected by the kernel can be
+ * retrieved by the user for reordering the weights tensor
+ * accordingly.
+ *
+ * The other values OHWIo{interleave_by}i{block_by} describe the
+ * memory layout of a 4D tensor with layout OHWI that has been
+ * transformed into a 4D tensor with dimensions O'HWI' where:
+ *
+ * O' = first multiple of {interleave_by} s.t. O<=O'
+ * I' = first multiple of {block_by} s.t. I<=I'
+ *
+ * The total size of the dst tensor is O' x H x W x I'
+ *
+ * The access function of the tensor with layout
+ * OHWIo{interleave_by}i{block_by} and size O'HWI' is a 6-parameter
+ * access function, where the 6 parameters are computed as follows:
+ *
+ * x5 = floor(o/{interleave_by}) RANGE [0, O'/{interleave_by} -1] SIZE: O'/{interleave_by}
+ *
+ * x4 = h RANGE [0, H-1] SIZE: H
+ * x3 = w RANGE [0, W-1] SIZE: W
+ * x2 = floor(i/{block_by}) RANGE [0, I'/{block_by} -1] SIZE: I'/{block_by}
+ * x1 = o%{interleave_by} RANGE [0, {interleave_by} -1] SIZE: {interleave_by}
+ * x0 = i%{block_by} RANGE [0, {block_by} -1] SIZE: {block_by}
+ * TOTAL SIZE: O' * H * W * I'
+ *
+ * 4D 6D
+ * ----------------- -----------------------------------
+ * value(o, h, w, i) = x5 * H * W * I' * {interleave_by}
+ * + x4 * W * I' * {interleave_by}
+ * + x3 * I' * {interleave_by}
+ * + x2 * {interleave_by} * {block_by}
+ * + x1 * {block_by}
+ * + x0
+ *
+ * Notice that in arm_gemm the 4D tensor of dimension O'HWI' created
+ * for the OHWIo{interleave_by}i{block_by} format is in reality seen
+ * as a 2D tensor, where the number of rows is O'/{interleave_by}
+ * and the number of columns is {interleave_by} * H * W * I'.
+ *
+ * The postfix *_bf16 is for the memory layout needed for the
+ * fast-mode kernels, in which the weights are passed in bfloat16
+ * format.
+ */
+enum class WeightFormat
+{
+ UNSPECIFIED = 0x1,
+ ANY = 0x2,
+ OHWI = 0x100100,
+ OHWIo2 = 0x100200,
+ OHWIo4 = 0x100400,
+ OHWIo8 = 0x100800,
+ OHWIo16 = 0x101000,
+ OHWIo32 = 0x102000,
+ OHWIo64 = 0x104000,
+ OHWIo128 = 0x108000,
+ OHWIo4i2 = 0x200400,
+ OHWIo4i2_bf16 = 0x200410,
+ OHWIo8i2 = 0x200800,
+ OHWIo8i2_bf16 = 0x200810,
+ OHWIo16i2 = 0x201000,
+ OHWIo16i2_bf16 = 0x201010,
+ OHWIo32i2 = 0x202000,
+ OHWIo32i2_bf16 = 0x202010,
+ OHWIo64i2 = 0x204000,
+ OHWIo64i2_bf16 = 0x204010,
+ OHWIo4i4 = 0x400400,
+ OHWIo4i4_bf16 = 0x400410,
+ OHWIo8i4 = 0x400800,
+ OHWIo8i4_bf16 = 0x400810,
+ OHWIo16i4 = 0x401000,
+ OHWIo16i4_bf16 = 0x401010,
+ OHWIo32i4 = 0x402000,
+ OHWIo32i4_bf16 = 0x402010,
+ OHWIo64i4 = 0x404000,
+ OHWIo64i4_bf16 = 0x404010,
+ OHWIo2i8 = 0x800200,
+ OHWIo4i8 = 0x800400,
+ OHWIo8i8 = 0x800800,
+ OHWIo16i8 = 0x801000,
+ OHWIo32i8 = 0x802000,
+ OHWIo64i8 = 0x804000
+};
+
+} // namespace arm_compute
+#endif /* ACL_ARM_COMPUTE_CORE_CORETYPES */
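The WeightFormat comment above fully determines the linear offset of an element, so it can be transcribed directly. Below is a sketch of the documented 6-parameter access function; the helper name and free-standing form are editorial assumptions, not library API:

    #include <cstddef>

    // Offset of element (o, h, w, i) in an OHWIo{interleave_by}i{block_by}
    // tensor of padded size O' x H x W x I', following the formulas in the
    // WeightFormat documentation verbatim.
    std::size_t ohwi_interleaved_offset(std::size_t o, std::size_t h, std::size_t w, std::size_t i,
                                        std::size_t H, std::size_t W, std::size_t I_prime,
                                        std::size_t interleave_by, std::size_t block_by)
    {
        const std::size_t x5 = o / interleave_by; // outer output-channel block
        const std::size_t x4 = h;
        const std::size_t x3 = w;
        const std::size_t x2 = i / block_by;      // outer input-channel block
        const std::size_t x1 = o % interleave_by;
        const std::size_t x0 = i % block_by;
        return x5 * H * W * I_prime * interleave_by
             + x4 * W * I_prime * interleave_by
             + x3 * I_prime * interleave_by
             + x2 * interleave_by * block_by
             + x1 * block_by
             + x0;
    }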
diff --git a/arm_compute/core/Dimensions.h b/arm_compute/core/Dimensions.h
index fbaef3a8f0..bb8692d70a 100644
--- a/arm_compute/core/Dimensions.h
+++ b/arm_compute/core/Dimensions.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2021 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -29,6 +29,7 @@
#include <algorithm>
#include <array>
#include <functional>
+#include <limits>
#include <numeric>
namespace arm_compute
@@ -49,8 +50,7 @@ public:
* @param[in] dims Values to initialize the dimensions.
*/
template <typename... Ts>
- explicit Dimensions(Ts... dims)
- : _id{ { static_cast<T>(dims)... } }, _num_dimensions{ sizeof...(dims) }
+ explicit Dimensions(Ts... dims) : _id{{static_cast<T>(dims)...}}, _num_dimensions{sizeof...(dims)}
{
}
@@ -68,14 +68,19 @@ public:
/** Accessor to set the value of one of the dimensions.
*
- * @param[in] dimension Dimension for which the value is set.
- * @param[in] value Value to be set for the dimension.
+ * @param[in] dimension Dimension for which the value is set.
+ * @param[in] value Value to be set for the dimension.
+ * @param[in] increase_dim_unit (Optional) Set to true if new unit dimensions increase the number of dimensions (e.g. for Coordinates), false otherwise (e.g. for TensorShapes)
*/
- void set(size_t dimension, T value)
+ void set(size_t dimension, T value, bool increase_dim_unit = true)
{
ARM_COMPUTE_ERROR_ON(dimension >= num_max_dimensions);
- _id[dimension] = value;
- _num_dimensions = std::max(_num_dimensions, dimension + 1);
+ _id[dimension] = value;
+ // Don't increase the number of dimensions if the new dimension is 1
+ if (increase_dim_unit || value != 1)
+ {
+ _num_dimensions = std::max(_num_dimensions, dimension + 1);
+ }
}
/** Alias to access the size of the first dimension */
T x() const
@@ -92,6 +97,21 @@ public:
{
return _id[2];
}
+ /** Increments the given dimension by a step size, avoiding overflows
+ *
+ * @note Precondition: dim < _num_dimensions
+ *
+ * @param[in] dim Dimension to increment.
+ * @param[in] step Step to increment @p dim by.
+ */
+ void increment(size_t dim, T step = 1)
+ {
+ ARM_COMPUTE_ERROR_ON(dim >= _num_dimensions);
+ if ((std::numeric_limits<T>::max() - _id[dim]) >= step)
+ {
+ _id[dim] += step;
+ }
+ }
/** Generic accessor to get the size of any dimension
*
* @note Precondition: dimension < Dimensions::num_max_dimensions
@@ -141,7 +161,7 @@ public:
const size_t last = std::min(_num_dimensions, first + n);
- if(last > (first + 1))
+ if (last > (first + 1))
{
// Collapse dimensions into the first
_id[first] = std::accumulate(&_id[first], &_id[last], 1, std::multiplies<T>());
@@ -175,7 +195,7 @@ public:
void remove(size_t idx)
{
ARM_COMPUTE_ERROR_ON(_num_dimensions < 1);
- if(idx >= _num_dimensions)
+ if (idx >= _num_dimensions)
{
return;
}
@@ -241,7 +261,7 @@ protected:
~Dimensions() = default;
std::array<T, num_max_dimensions> _id;
- size_t _num_dimensions{ 0 };
+ size_t _num_dimensions{0};
};
/** Check that given dimensions are equal.
@@ -268,5 +288,5 @@ inline bool operator!=(const Dimensions<T> &lhs, const Dimensions<T> &rhs)
{
return !(lhs == rhs);
}
-}
+} // namespace arm_compute
#endif /*ARM_COMPUTE_DIMENSIONS_H*/
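The two behavioural additions in this hunk, the increase_dim_unit flag on set() and the overflow-guarded increment(), are easiest to see side by side. A sketch using Coordinates, which inherits these methods (values illustrative):

    #include "arm_compute/core/Coordinates.h"

    using namespace arm_compute;

    void dimensions_semantics_sketch()
    {
        Coordinates c(0, 0);     // num_dimensions() == 2
        c.set(2, 1, false);      // trailing unit dimension ignored: still 2 dims
        c.set(2, 1);             // default increase_dim_unit=true: now 3 dims
        c.increment(0, 4);       // c.x() == 4; the step is applied only when
                                 // it cannot overflow the underlying type
    }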
diff --git a/arm_compute/core/Error.h b/arm_compute/core/Error.h
index dd3e8889bc..7a7033805a 100644
--- a/arm_compute/core/Error.h
+++ b/arm_compute/core/Error.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2016-2019 ARM Limited.
+ * Copyright (c) 2016-2019, 2021, 2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -53,8 +53,7 @@ class Status
{
public:
/** Default Constructor **/
- Status()
- : _code(ErrorCode::OK), _error_description(" ")
+ Status() : _code(ErrorCode::OK), _error_description(" ")
{
}
/** Default Constructor
@@ -101,7 +100,7 @@ public:
/** Throws a runtime exception in case it contains a valid error status */
void throw_if_error() const
{
- if(!bool(*this))
+ if (!bool(*this))
{
internal_throw_on_error();
}
@@ -119,7 +118,7 @@ private:
/** Creates an error containing the error message
*
* @param[in] error_code Error code
- * @param[in] msg Message to display before aborting.
+ * @param[in] msg Message to display before abandoning.
*
* @return status containing the error
*/
@@ -131,7 +130,7 @@ Status create_error(ErrorCode error_code, std::string msg);
* @param[in] func Function in which the error occurred.
* @param[in] file File in which the error occurred.
* @param[in] line Line in which the error occurred.
- * @param[in] msg Message to display before aborting.
+ * @param[in] msg Message to display before abandoning.
*
* @return status containing the error
*/
@@ -141,7 +140,7 @@ Status create_error_msg(ErrorCode error_code, const char *func, const char *file
* @param[in] err Error status
*/
[[noreturn]] void throw_error(Status err);
-}
+} // namespace arm_compute
/** To avoid unused variables warnings
*
* This is useful if for example a variable is only used
@@ -156,7 +155,8 @@ Status create_error_msg(ErrorCode error_code, const char *func, const char *file
* @param[in] error_code Error code.
* @param[in] msg Message to encapsulate.
*/
-#define ARM_COMPUTE_CREATE_ERROR(error_code, msg) arm_compute::create_error_msg(error_code, __func__, __FILE__, __LINE__, msg)
+#define ARM_COMPUTE_CREATE_ERROR(error_code, msg) \
+ arm_compute::create_error_msg(error_code, __func__, __FILE__, __LINE__, msg)
/** Creates an error on location with a given message
*
@@ -164,9 +164,10 @@ Status create_error_msg(ErrorCode error_code, const char *func, const char *file
* @param[in] func Function in which the error occurred.
* @param[in] file File in which the error occurred.
* @param[in] line Line in which the error occurred.
- * @param[in] msg Message to display before aborting.
+ * @param[in] msg Message to display before abandoning.
*/
-#define ARM_COMPUTE_CREATE_ERROR_LOC(error_code, func, file, line, msg) arm_compute::create_error_msg(error_code, func, file, line, msg)
+#define ARM_COMPUTE_CREATE_ERROR_LOC(error_code, func, file, line, msg) \
+ arm_compute::create_error_msg(error_code, func, file, line, msg)
/** Creates an error on location with a given message. Accepts a message format
* and a variable list of arguments matching the format description.
@@ -178,14 +179,14 @@ Status create_error_msg(ErrorCode error_code, const char *func, const char *file
* @param[in] msg Error description message format.
* @param[in] ... List of arguments matching the format description.
*/
-#define ARM_COMPUTE_CREATE_ERROR_LOC_VAR(error_code, func, file, line, msg, ...) \
- do \
- { \
- std::array<char, 512> out{ 0 }; \
- int offset = snprintf(out.data(), out.size(), "in %s %s:%d: ", func, file, line); \
- snprintf(out.data() + offset, out.size() - offset, msg, __VA_ARGS__); \
- arm_compute::create_error(error_code, std::string(out.data())); \
- } while(false)
+#define ARM_COMPUTE_CREATE_ERROR_LOC_VAR(error_code, func, file, line, msg, ...) \
+ do \
+ { \
+ std::array<char, 512> out{0}; \
+ int offset = snprintf(out.data(), out.size(), "in %s %s:%d: ", func, file, line); \
+ snprintf(out.data() + offset, out.size() - offset, msg, __VA_ARGS__); \
+ arm_compute::create_error(error_code, std::string(out.data())); \
+ } while (false)
/** An error is returned with the given description.
*
@@ -195,7 +196,7 @@ Status create_error_msg(ErrorCode error_code, const char *func, const char *file
do \
{ \
return ARM_COMPUTE_CREATE_ERROR(arm_compute::ErrorCode::RUNTIME_ERROR, __VA_ARGS__); \
- } while(false)
+ } while (false)
/** Checks if a status contains an error and returns it
*
@@ -204,18 +205,18 @@ Status create_error_msg(ErrorCode error_code, const char *func, const char *file
#define ARM_COMPUTE_RETURN_ON_ERROR(status) \
do \
{ \
- if(!bool(status)) \
+ const auto s = status; \
+ if (!bool(s)) \
{ \
- return status; \
+ return s; \
} \
- } while(false)
+ } while (false)
/** Checks if an error value is valid if not throws an exception with the error
*
* @param[in] error Error value to check.
*/
-#define ARM_COMPUTE_THROW_ON_ERROR(error) \
- error.throw_if_error();
+#define ARM_COMPUTE_THROW_ON_ERROR(error) error.throw_if_error();
/** If the condition is true, an error is returned. Accepts a message format
* and a variable list of arguments matching the format description.
@@ -227,28 +228,29 @@ Status create_error_msg(ErrorCode error_code, const char *func, const char *file
#define ARM_COMPUTE_RETURN_ERROR_ON_MSG_VAR(cond, msg, ...) \
do \
{ \
- if(cond) \
+ if (cond) \
{ \
- std::array<char, 512> out{ 0 }; \
+ std::array<char, 512> out{0}; \
int offset = snprintf(out.data(), out.size(), "in %s %s:%d: ", __func__, __FILE__, __LINE__); \
snprintf(out.data() + offset, out.size() - offset, msg, __VA_ARGS__); \
return arm_compute::create_error(arm_compute::ErrorCode::RUNTIME_ERROR, std::string(out.data())); \
} \
- } while(false)
+ } while (false)
/** If the condition is true, an error is returned
*
* @param[in] cond Condition to evaluate.
* @param[in] msg Error description message
*/
-#define ARM_COMPUTE_RETURN_ERROR_ON_MSG(cond, msg) \
- do \
- { \
- if(cond) \
- { \
- return arm_compute::create_error_msg(arm_compute::ErrorCode::RUNTIME_ERROR, __func__, __FILE__, __LINE__, msg); \
- } \
- } while(false)
+#define ARM_COMPUTE_RETURN_ERROR_ON_MSG(cond, msg) \
+ do \
+ { \
+ if (cond) \
+ { \
+ return arm_compute::create_error_msg(arm_compute::ErrorCode::RUNTIME_ERROR, __func__, __FILE__, __LINE__, \
+ msg); \
+ } \
+ } while (false)
/** If the condition is true, an error is thrown. Accepts a message format
* and a variable list of arguments matching the format description.
@@ -260,17 +262,17 @@ Status create_error_msg(ErrorCode error_code, const char *func, const char *file
* @param[in] msg Error description message format.
* @param[in] ... List of arguments matching the format description.
*/
-#define ARM_COMPUTE_RETURN_ERROR_ON_LOC_MSG_VAR(cond, func, file, line, msg, ...) \
- do \
- { \
- if(cond) \
- { \
- std::array<char, 512> out{ 0 }; \
- int offset = snprintf(out.data(), out.size(), "in %s %s:%d: ", func, file, line); \
- snprintf(out.data() + offset, out.size() - offset, msg, __VA_ARGS__); \
- return arm_compute::create_error(ErrorCode::RUNTIME_ERROR, std::string(out.data())); \
- } \
- } while(false)
+#define ARM_COMPUTE_RETURN_ERROR_ON_LOC_MSG_VAR(cond, func, file, line, msg, ...) \
+ do \
+ { \
+ if (cond) \
+ { \
+ std::array<char, 512> out{0}; \
+ int offset = snprintf(out.data(), out.size(), "in %s %s:%d: ", func, file, line); \
+ snprintf(out.data() + offset, out.size() - offset, msg, __VA_ARGS__); \
+ return arm_compute::create_error(ErrorCode::RUNTIME_ERROR, std::string(out.data())); \
+ } \
+ } while (false)
/** If the condition is true, an error is thrown.
*
@@ -283,18 +285,17 @@ Status create_error_msg(ErrorCode error_code, const char *func, const char *file
#define ARM_COMPUTE_RETURN_ERROR_ON_LOC_MSG(cond, func, file, line, msg) \
do \
{ \
- if(cond) \
+ if (cond) \
{ \
return arm_compute::create_error_msg(ErrorCode::RUNTIME_ERROR, func, file, line, msg); \
} \
- } while(false)
+ } while (false)
/** If the condition is true, an error is returned
*
* @param[in] cond Condition to evaluate
*/
-#define ARM_COMPUTE_RETURN_ERROR_ON(cond) \
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(cond, #cond)
+#define ARM_COMPUTE_RETURN_ERROR_ON(cond) ARM_COMPUTE_RETURN_ERROR_ON_MSG(cond, #cond)
/** If the condition is true, an error is returned
*
@@ -313,11 +314,12 @@ Status create_error_msg(ErrorCode error_code, const char *func, const char *file
* @param[in] line Line in which the error occurred.
* @param[in] msg Message to display.
*/
-#define ARM_COMPUTE_THROW_ERROR(func, file, line, msg) \
- do \
- { \
- arm_compute::throw_error(arm_compute::create_error_msg(arm_compute::ErrorCode::RUNTIME_ERROR, func, file, line, msg)); \
- } while(false)
+#define ARM_COMPUTE_THROW_ERROR(func, file, line, msg) \
+ do \
+ { \
+ arm_compute::throw_error( \
+ arm_compute::create_error_msg(arm_compute::ErrorCode::RUNTIME_ERROR, func, file, line, msg)); \
+ } while (false)
/** Print the given message then throw an std::runtime_error. Accepts a message format
* and a variable list of arguments matching the format description.
@@ -331,11 +333,11 @@ Status create_error_msg(ErrorCode error_code, const char *func, const char *file
#define ARM_COMPUTE_THROW_ERROR_VAR(func, file, line, msg, ...) \
do \
{ \
- std::array<char, 512> out{ 0 }; \
- int offset = snprintf(out.data(), out.size(), "in %s %s:%d: ", func, file, line); \
+ std::array<char, 512> out{0}; \
+ int offset = snprintf(out.data(), out.size(), "in %s %s:%d: ", func, file, line); \
snprintf(out.data() + offset, out.size() - offset, msg, __VA_ARGS__); \
arm_compute::throw_error(arm_compute::Status(arm_compute::ErrorCode::RUNTIME_ERROR, std::string(out.data()))); \
- } while(false)
+ } while (false)
/** Print the given message then throw an std::runtime_error. Accepts a message format
* and a variable list of arguments matching the format description.
@@ -360,7 +362,8 @@ Status create_error_msg(ErrorCode error_code, const char *func, const char *file
* @param[in] msg Error description message format.
* @param[in] ... List of arguments matching the format description.
*/
-#define ARM_COMPUTE_ERROR_LOC_VAR(func, file, line, msg, ...) ARM_COMPUTE_THROW_ERROR_VAR(func, file, line, msg, __VA_ARGS__) // NOLINT
+#define ARM_COMPUTE_ERROR_LOC_VAR(func, file, line, msg, ...) \
+ ARM_COMPUTE_THROW_ERROR_VAR(func, file, line, msg, __VA_ARGS__) // NOLINT
/** Print the given message then throw an std::runtime_error.
*
@@ -379,11 +382,11 @@ Status create_error_msg(ErrorCode error_code, const char *func, const char *file
#define ARM_COMPUTE_EXIT_ON_MSG(cond, msg) \
do \
{ \
- if(cond) \
+ if (cond) \
{ \
ARM_COMPUTE_ERROR(msg); \
} \
- } while(false)
+ } while (false)
/** If the condition is true, the given message is printed and program exits. Accepts a message format
* and a variable list of arguments matching the format description.
@@ -395,27 +398,25 @@ Status create_error_msg(ErrorCode error_code, const char *func, const char *file
#define ARM_COMPUTE_EXIT_ON_MSG_VAR(cond, msg, ...) \
do \
{ \
- if(cond) \
+ if (cond) \
{ \
ARM_COMPUTE_ERROR_VAR(msg, __VA_ARGS__); \
} \
- } while(false)
+ } while (false)
#ifdef ARM_COMPUTE_ASSERTS_ENABLED
/** Checks if a status value is valid if not throws an exception with the error
*
* @param[in] status Status value to check.
*/
-#define ARM_COMPUTE_ERROR_THROW_ON(status) \
- status.throw_if_error()
+#define ARM_COMPUTE_ERROR_THROW_ON(status) status.throw_if_error()
/** If the condition is true, the given message is printed and an exception is thrown
*
* @param[in] cond Condition to evaluate.
* @param[in] msg Message to display.
*/
-#define ARM_COMPUTE_ERROR_ON_MSG(cond, msg) \
- ARM_COMPUTE_EXIT_ON_MSG(cond, msg)
+#define ARM_COMPUTE_ERROR_ON_MSG(cond, msg) ARM_COMPUTE_EXIT_ON_MSG(cond, msg)
/** If the condition is true, the given message is printed and an exception is thrown. Accepts a message format
* and a variable list of arguments matching the format description.
@@ -424,8 +425,7 @@ Status create_error_msg(ErrorCode error_code, const char *func, const char *file
* @param[in] msg Error description message format.
* @param[in] ... List of arguments matching the format description.
*/
-#define ARM_COMPUTE_ERROR_ON_MSG_VAR(cond, msg, ...) \
- ARM_COMPUTE_EXIT_ON_MSG_VAR(cond, msg, __VA_ARGS__)
+#define ARM_COMPUTE_ERROR_ON_MSG_VAR(cond, msg, ...) ARM_COMPUTE_EXIT_ON_MSG_VAR(cond, msg, __VA_ARGS__)
/** If the condition is true, the given message is printed and an exception is thrown.
*
@@ -438,11 +438,11 @@ Status create_error_msg(ErrorCode error_code, const char *func, const char *file
#define ARM_COMPUTE_ERROR_ON_LOC_MSG(cond, func, file, line, ...) \
do \
{ \
- if(cond) \
+ if (cond) \
{ \
ARM_COMPUTE_ERROR_LOC_VAR(func, file, line, __VA_ARGS__); \
} \
- } while(false)
+ } while (false)
/** If the condition is true, the given message is printed and an exception is thrown, otherwise value is returned
*
@@ -463,8 +463,7 @@ Status create_error_msg(ErrorCode error_code, const char *func, const char *file
*
* @param[in] cond Condition to evaluate.
*/
-#define ARM_COMPUTE_ERROR_ON(cond) \
- ARM_COMPUTE_ERROR_ON_MSG(cond, #cond)
+#define ARM_COMPUTE_ERROR_ON(cond) ARM_COMPUTE_ERROR_ON_MSG(cond, #cond)
/** If the condition is true then an error message is printed and an exception thrown
*
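The most consequential change in this file is ARM_COMPUTE_RETURN_ON_ERROR binding its argument to a local before testing it, so the expression is evaluated exactly once. A sketch of why that matters (the checking function is hypothetical):

    #include "arm_compute/core/Error.h"

    using namespace arm_compute;

    Status expensive_check(); // hypothetical: costly, or has side effects

    Status caller()
    {
        // Before this patch, the macro expanded its argument into both the
        // condition and the return statement, so expensive_check() could run
        // twice; the `const auto s = status;` capture above guarantees one call.
        ARM_COMPUTE_RETURN_ON_ERROR(expensive_check());
        return Status{};
    }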
diff --git a/arm_compute/core/GLES_COMPUTE/GCCoreRuntimeContext.h b/arm_compute/core/GLES_COMPUTE/GCCoreRuntimeContext.h
deleted file mode 100644
index 9706c9b3a6..0000000000
--- a/arm_compute/core/GLES_COMPUTE/GCCoreRuntimeContext.h
+++ /dev/null
@@ -1,63 +0,0 @@
-/*
- * Copyright (c) 2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_GCCORERUNTIME_CONTEXT_H
-#define ARM_COMPUTE_GCCORERUNTIME_CONTEXT_H
-
-#include "arm_compute/core/GLES_COMPUTE/OpenGLES.h"
-
-namespace arm_compute
-{
-// Forward declarations
-class GCKernelLibrary;
-
-/** Core runtime context for OpenGL ES */
-class GCCoreRuntimeContext final
-{
-public:
- /** Legacy constructor */
- GCCoreRuntimeContext();
-
- /** Constructor */
- GCCoreRuntimeContext(GCKernelLibrary *kernel_lib);
- /** Destructor */
- ~GCCoreRuntimeContext() = default;
- /** Default copy constructor */
- GCCoreRuntimeContext(const GCCoreRuntimeContext &) = default;
- /** Default move constructor */
- GCCoreRuntimeContext(GCCoreRuntimeContext &&) = default;
- /** Default copy assignment */
- GCCoreRuntimeContext &operator=(const GCCoreRuntimeContext &) = default;
- /** Default move assignment operator */
- GCCoreRuntimeContext &operator=(GCCoreRuntimeContext &&) = default;
- /** Kernel Library accessor
- *
- * @return The kernel library instance used by the core context
- */
- GCKernelLibrary *kernel_library() const;
-
-private:
- GCKernelLibrary *_kernel_lib{ nullptr };
-};
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_GCCORERUNTIME_CONTEXT_H */
diff --git a/arm_compute/core/GLES_COMPUTE/GCKernelLibrary.h b/arm_compute/core/GLES_COMPUTE/GCKernelLibrary.h
deleted file mode 100644
index 0f6daf786b..0000000000
--- a/arm_compute/core/GLES_COMPUTE/GCKernelLibrary.h
+++ /dev/null
@@ -1,271 +0,0 @@
-/*
- * Copyright (c) 2017-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_GCKERNELLIBRARY_H
-#define ARM_COMPUTE_GCKERNELLIBRARY_H
-
-#include "arm_compute/core/GLES_COMPUTE/OpenGLES.h"
-#include "arm_compute/core/Utils.h"
-
-#include <map>
-#include <set>
-#include <string>
-#include <utility>
-#include <vector>
-
-namespace arm_compute
-{
-/** GCProgram class */
-class GCProgram final
-{
-public:
- /** Default constructor. */
- GCProgram();
- /** Construct program from source file.
- *
- * @param[in] name Program name.
- * @param[in] source Program source.
- */
- GCProgram(std::string name, std::string source);
- /** Default Copy Constructor. */
- GCProgram(const GCProgram &) = default;
- /** Default Move Constructor. */
- GCProgram(GCProgram &&) = default;
- /** Default copy assignment operator */
- GCProgram &operator=(const GCProgram &) = default;
- /** Default move assignment operator */
- GCProgram &operator=(GCProgram &&) = default;
- /** Returns program name.
- *
- * @return Program's name.
- */
- std::string name() const
- {
- return _name;
- }
- /** Link program.
- *
- * @param[in] shader Shader used to link program.
- *
- * @return linked program id .
- */
- GLuint link_program(GLuint shader);
- /** Compile shader.
- *
- * @param[in] build_options Shader build options.
- *
- * @return GLES shader object.
- */
- GLuint compile_shader(const std::string &build_options);
-
-private:
- std::string _name; /**< Program name. */
- std::string _source; /**< Source code for the program. */
-};
-
-/** GCKernel class */
-class GCKernel final
-{
-public:
- /** Default Constructor. */
- GCKernel();
- /** Default Copy Constructor. */
- GCKernel(const GCKernel &) = default;
- /** Default Move Constructor. */
- GCKernel(GCKernel &&) = default;
- /** Default copy assignment operator */
- GCKernel &operator=(const GCKernel &) = default;
- /** Default move assignment operator */
- GCKernel &operator=(GCKernel &&) = default;
- /** Constructor.
- *
- * @param[in] name Kernel name.
- * @param[in] program Built program.
- */
- GCKernel(std::string name, GLuint program);
- /** Destructor.
- */
- ~GCKernel();
- /** Returns kernel name.
- *
- * @return Kernel's name.
- */
- std::string name() const
- {
- return _name;
- }
- /** Get program id.
- *
- * @return program id.
- */
- GLuint get_program() const
- {
- return _program;
- }
- /** Use current program.
- *
- * @return program id.
- */
- void use();
- /** Unuse current program.
- *
- * @return program id.
- */
- void unuse();
- /** Set argument value at index of shader params.
- *
- * @param[in] idx Index in shader params.
- * @param[in] value Argument value to be set.
- */
- template <class T>
- void set_argument(unsigned int idx, T value)
- {
- if(idx >= _shader_arguments.size())
- {
- _shader_arguments.resize(idx + 1, 0);
- }
-
- unsigned int *p = reinterpret_cast<unsigned int *>(&value);
- _shader_arguments[idx] = *p;
- }
- /** Clear shader arguments.
- *
- */
- void clear_arguments()
- {
- _shader_arguments.clear();
- }
- /** Set shader params binding point.
- *
- * @param[in] binding Shader params binding point.
- */
- void set_shader_params_binding_point(unsigned int binding)
- {
- _shader_params_binding_point = binding;
- }
- /** Update shader params.
- *
- */
- void update_shader_params();
- /** Clean up program and ubo.
- *
- */
- void cleanup();
-
-private:
- std::string _name; /**< Kernel name */
- GLuint _program; /**< Linked program id */
- std::vector<unsigned int> _shader_arguments; /**< Store all the values of the shader arguments */
- GLuint _shader_params_ubo_name; /**< Uniform buffer object name for shader parameters */
- GLuint _shader_params_binding_point; /**< The binding point of the uniform block for shader parameters */
- GLuint _shader_params_index; /**< The index of the uniform block */
- GLint _shader_params_size; /**< The uniform block data size in the shader */
- static constexpr const char *_shader_params_name = "shader_params"; /**< The uniform block name in the shader */
-};
-
-/** GCKernelLibrary class */
-class GCKernelLibrary final
-{
- using StringSet = std::set<std::string>;
-
-public:
- /** Default Constructor. */
- GCKernelLibrary();
- /** Default Destructor */
- ~GCKernelLibrary();
- /** Prevent instances of this class from being copied */
- GCKernelLibrary(const GCKernelLibrary &) = delete;
- /** Prevent instances of this class from being copied */
- const GCKernelLibrary &operator=(const GCKernelLibrary &) = delete;
- /** Get the static instance of @ref GCKernelLibrary.
- * This method has been deprecated and will be removed in the next release.
- * @return The static instance of GCKernelLibrary.
- */
- static GCKernelLibrary &get();
- /** Initialises the kernel library.
- *
- * @param[in] shader_path (Optional) Path of the directory from which shader sources are loaded.
- * @param[in] dpy (Optional) EGLdisplay set by external application.
- * @param[in] ctx (Optional) EGLContext set by external application.
- */
- void init(std::string shader_path = "./", EGLDisplay dpy = EGL_NO_DISPLAY, EGLContext ctx = EGL_NO_CONTEXT);
- /** Sets the path that the shaders reside in.
- *
- * @param[in] shader_path Path of the shader.
- */
- void set_shader_path(const std::string &shader_path);
- /** Sets display and context to create kernel.
- *
- * @param[in] dpy EGLdisplay set by external application.
- * @param[in] ctx EGLContext set by external application.
- */
- void set_context(EGLDisplay dpy, EGLContext ctx);
- /** Creates a kernel from the kernel library.
- *
- * @param[in] shader_name Shader name.
- * @param[in] build_options_set Shader build options as a set.
- *
- * @return The created kernel.
- */
- GCKernel create_kernel(const std::string &shader_name, const StringSet &build_options_set = {}) const;
- /** Serializes and saves programs to a binary. */
- void save_binary();
- /** Load serialized binary with all the programs. */
- void load_binary();
- /** Setup a dummy fbo to workaround an issue on Galaxy S8. */
- void setup_dummy_fbo();
-
-private:
- /** Preprocess GLES shader
- *
- * @param[in] shader_source Source code of the shader to preprocess.
- *
- * @return Preprocessed GLES shader object.
- */
- std::string preprocess_shader(const std::string &shader_source) const;
- /** Load program and its dependencies.
- *
- * @param[in] program_name Name of the program to load.
- */
- const GCProgram &load_program(const std::string &program_name) const;
- /** Concatenates contents of a set into a single string.
- *
- * @param[in] s Input set to concatenate.
- *
- * @return Concatenated string.
- */
- std::string stringify_set(const StringSet &s) const;
-
- EGLDisplay _display; /**< Underlying EGL Display. */
- EGLContext _context; /**< Underlying EGL Context. */
- GLuint _frame_buffer; /**< Dummy fbo */
- GLuint _tex_rt; /**< Dummy texture for render target */
- std::string _shader_path; /**< Path to the shaders folder. */
- mutable std::map<std::string, const GCProgram> _programs_map; /**< Map with all already loaded program data. */
- mutable std::map<std::string, const GCKernel> _built_programs_map; /**< Map with all already built program data. */
- static const std::map<std::string, std::string> _shader_program_map; /**< Map that associates kernel names with programs. */
- static const std::map<std::string, std::string> _program_source_map; /**< Contains sources for all programs.
- Used for compile-time shader inclusion. */
-};
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_GCKERNELLIBRARY_H */
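For reference while reviewing this removal, a minimal sketch of how a client might have driven GCKernelLibrary; the shader folder, build option and kernel name below are illustrative assumptions, not values taken from this header.

    #include "arm_compute/core/GLES_COMPUTE/GCKernelLibrary.h"

    #include <set>
    #include <string>

    using namespace arm_compute;

    void create_gc_kernel_example()
    {
        // Deprecated singleton accessor, per the note above.
        GCKernelLibrary &library = GCKernelLibrary::get();
        // Shader folder plus the documented EGL defaults.
        library.init("./cs_shaders/", EGL_NO_DISPLAY, EGL_NO_CONTEXT);
        // Build options as a StringSet; this define is a hypothetical example.
        std::set<std::string> build_opts{ "#define DATA_TYPE_FP16" };
        // "absdiff" is an assumed shader name, used purely for illustration.
        GCKernel kernel = library.create_kernel("absdiff", build_opts);
    }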
diff --git a/arm_compute/core/GLES_COMPUTE/GCKernels.h b/arm_compute/core/GLES_COMPUTE/GCKernels.h
deleted file mode 100644
index a1537ec152..0000000000
--- a/arm_compute/core/GLES_COMPUTE/GCKernels.h
+++ /dev/null
@@ -1,54 +0,0 @@
-/*
- * Copyright (c) 2017-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_GCKERNELS_H
-#define ARM_COMPUTE_GCKERNELS_H
-
-/* Header regrouping all the GLES compute kernels */
-#include "arm_compute/core/GLES_COMPUTE/kernels/GCAbsoluteDifferenceKernel.h"
-#include "arm_compute/core/GLES_COMPUTE/kernels/GCActivationLayerKernel.h"
-#include "arm_compute/core/GLES_COMPUTE/kernels/GCArithmeticAdditionKernel.h"
-#include "arm_compute/core/GLES_COMPUTE/kernels/GCBatchNormalizationLayerKernel.h"
-#include "arm_compute/core/GLES_COMPUTE/kernels/GCCol2ImKernel.h"
-#include "arm_compute/core/GLES_COMPUTE/kernels/GCDepthConcatenateLayerKernel.h"
-#include "arm_compute/core/GLES_COMPUTE/kernels/GCDepthwiseConvolutionLayer3x3Kernel.h"
-#include "arm_compute/core/GLES_COMPUTE/kernels/GCDirectConvolutionLayerKernel.h"
-#include "arm_compute/core/GLES_COMPUTE/kernels/GCDropoutLayerKernel.h"
-#include "arm_compute/core/GLES_COMPUTE/kernels/GCFillBorderKernel.h"
-#include "arm_compute/core/GLES_COMPUTE/kernels/GCGEMMInterleave4x4Kernel.h"
-#include "arm_compute/core/GLES_COMPUTE/kernels/GCGEMMMatrixAccumulateBiasesKernel.h"
-#include "arm_compute/core/GLES_COMPUTE/kernels/GCGEMMMatrixAdditionKernel.h"
-#include "arm_compute/core/GLES_COMPUTE/kernels/GCGEMMMatrixMultiplyKernel.h"
-#include "arm_compute/core/GLES_COMPUTE/kernels/GCGEMMTranspose1xWKernel.h"
-#include "arm_compute/core/GLES_COMPUTE/kernels/GCIm2ColKernel.h"
-#include "arm_compute/core/GLES_COMPUTE/kernels/GCNormalizationLayerKernel.h"
-#include "arm_compute/core/GLES_COMPUTE/kernels/GCNormalizePlanarYUVLayerKernel.h"
-#include "arm_compute/core/GLES_COMPUTE/kernels/GCPixelWiseMultiplicationKernel.h"
-#include "arm_compute/core/GLES_COMPUTE/kernels/GCPoolingLayerKernel.h"
-#include "arm_compute/core/GLES_COMPUTE/kernels/GCScaleKernel.h"
-#include "arm_compute/core/GLES_COMPUTE/kernels/GCSoftmaxLayerKernel.h"
-#include "arm_compute/core/GLES_COMPUTE/kernels/GCTensorShiftKernel.h"
-#include "arm_compute/core/GLES_COMPUTE/kernels/GCTransposeKernel.h"
-#include "arm_compute/core/GLES_COMPUTE/kernels/GCWeightsReshapeKernel.h"
-
-#endif /* ARM_COMPUTE_GCKERNELS_H */
diff --git a/arm_compute/core/GLES_COMPUTE/IGCKernel.h b/arm_compute/core/GLES_COMPUTE/IGCKernel.h
deleted file mode 100644
index 7b2aad7cec..0000000000
--- a/arm_compute/core/GLES_COMPUTE/IGCKernel.h
+++ /dev/null
@@ -1,165 +0,0 @@
-/*
- * Copyright (c) 2017-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_IGCKERNEL_H
-#define ARM_COMPUTE_IGCKERNEL_H
-
-#include "arm_compute/core/GLES_COMPUTE/GCKernelLibrary.h"
-#include "arm_compute/core/GLES_COMPUTE/OpenGLES.h"
-#include "arm_compute/core/GPUTarget.h"
-
-#include "arm_compute/core/IKernel.h"
-
-namespace arm_compute
-{
-class IGCTensor;
-class Window;
-
-/** Common interface for all the GLES kernels */
-class IGCKernel : public IKernel
-{
-public:
- /** Constructor */
- IGCKernel();
- /** Returns a reference to the GLES kernel of this object.
- *
- * @return A reference to the GLES kernel of this object.
- */
- GCKernel &kernel();
-
- /** Add the passed 1D tensor's parameters to the object's kernel's arguments starting from the index idx.
- *
- * @param[in] idx           Index at which to start adding the tensor's arguments. Input and output tensors have separate indices: with multiple indices, numbering starts from 1; with a single index, it has to be set to 0.
- * @param[in] tensor Tensor to set as an argument of the object's kernel.
- * @param[in] binding_point Tensor's binding point in this kernel.
- * @param[in] window Window the kernel will be executed on.
- */
- void add_1D_tensor_argument(unsigned int &idx, const IGCTensor *tensor, const unsigned int binding_point, const Window &window);
-
- /** Add the passed 2D tensor's parameters to the object's kernel's arguments starting from the index idx.
- *
- * @param[in] idx           Index at which to start adding the tensor's arguments. Input and output tensors have separate indices: with multiple indices, numbering starts from 1; with a single index, it has to be set to 0.
- * @param[in] tensor Tensor to set as an argument of the object's kernel.
- * @param[in] binding_point Tensor's binding point in this kernel.
- * @param[in] window Window the kernel will be executed on.
- */
- void add_2D_tensor_argument(unsigned int &idx, const IGCTensor *tensor, const unsigned int binding_point, const Window &window);
-
- /** Add the passed 3D tensor's parameters to the object's kernel's arguments starting from the index idx.
- *
- * @param[in] idx           Index at which to start adding the tensor's arguments. Input and output tensors have separate indices: with multiple indices, numbering starts from 1; with a single index, it has to be set to 0.
- * @param[in] tensor Tensor to set as an argument of the object's kernel.
- * @param[in] binding_point Tensor's binding point in this kernel.
- * @param[in] window Window the kernel will be executed on.
- */
- void add_3D_tensor_argument(unsigned int &idx, const IGCTensor *tensor, const unsigned int binding_point, const Window &window);
-
- /** Returns the number of arguments enqueued per 1D tensor object.
- *
- * @return The number of arguments enqueued per 1D tensor object.
- */
- unsigned int num_arguments_per_1D_tensor() const;
- /** Returns the number of arguments enqueued per 2D tensor object.
- *
- * @return The number of arguments enqueued per 2D tensor object.
- */
- unsigned int num_arguments_per_2D_tensor() const;
- /** Returns the number of arguments enqueued per 3D tensor object.
- *
- * @return The number of arguments enqueued per 3D tensor object.
- */
- unsigned int num_arguments_per_3D_tensor() const;
- /** Enqueue the OpenGL ES shader to process the given window
- *
- * @param[in] window Region on which to execute the kernel. (Must be a valid region of the window returned by window()).
- */
- virtual void run(const Window &window) = 0;
-
- /** Set the Local-Workgroup-Size hint
- *
- * @note This method should be called after the configuration of the kernel
- *
- * @param[in] lws_hint Local-Workgroup-Size to use
- */
- void set_lws_hint(gles::NDRange &lws_hint)
- {
- _lws_hint = lws_hint;
- }
-
- /** Set the targeted GPU architecture
- *
- * @param[in] target The targeted GPU architecture
- */
- void set_target(GPUTarget target)
- {
- _target = target;
- }
-
- /** Get the targeted GPU architecture
- *
- * @return The targeted GPU architecture.
- */
- GPUTarget get_target() const
- {
- return _target;
- }
-
-private:
- /** Add the passed tensor's parameters to the object's kernel's arguments starting from the index idx.
- *
- * @param[in] idx           Index at which to start adding the tensor's arguments. Input and output tensors have separate indices: with multiple indices, numbering starts from 1; with a single index, it has to be set to 0.
- * @param[in] tensor Tensor to set as an argument of the object's kernel.
- * @param[in] binding_point Tensor's binding point in this kernel.
- * @param[in] window Window the kernel will be executed on.
- */
- template <unsigned int dimension_size>
- void add_tensor_argument(unsigned int &idx, const IGCTensor *tensor, const unsigned int binding_point, const Window &window);
-
- /** Returns the number of arguments enqueued per tensor object.
- *
- * @return The number of arguments enqueued per tensor object.
- */
- template <unsigned int dimension_size>
- unsigned int num_arguments_per_tensor() const;
-
-protected:
- GCKernel _kernel; /**< GLES kernel to run */
- gles::NDRange _lws_hint; /**< Local workgroup size hint for the GLES kernel */
- GPUTarget _target; /**< The targeted GPU */
-};
-
-/** Add the kernel to the command queue with the given window.
- *
- * @note Depending on the size of the window, this might translate into several jobs being enqueued.
- *
- * @note If kernel.kernel() is empty, the function returns without adding anything to the queue.
- *
- * @param[in] kernel Kernel to enqueue
- * @param[in] window Window the kernel has to process.
- * @param[in] lws Local workgroup size requested, by default (1, 1, 1)
- *
- * @note If any dimension of the lws is greater than the global workgroup size then no lws will be passed.
- */
-void enqueue(IGCKernel &kernel, const Window &window, const gles::NDRange &lws = gles::NDRange(1U, 1U, 1U));
-}
-#endif /*ARM_COMPUTE_IGCKERNEL_H */
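These helpers were typically combined in a kernel's run() override: bind each tensor with an add_*_tensor_argument() call, then hand every window slice to enqueue(). A sketch of that pattern, assuming a hypothetical subclass with its own _input/_output members; the binding points 1 and 2 are illustrative:

    #include "arm_compute/core/GLES_COMPUTE/IGCKernel.h"

    // Hypothetical kernel showing the documented index convention: with both an
    // input and an output bound, the argument indices start from 1.
    class MyGCKernel : public arm_compute::IGCKernel
    {
    public:
        void run(const arm_compute::Window &window) override
        {
            arm_compute::Window slice = window.first_slice_window_3D();
            do
            {
                unsigned int idx = 0;
                add_3D_tensor_argument(idx, _input, 1 /* binding point */, slice);
                add_3D_tensor_argument(idx, _output, 2 /* binding point */, slice);
                arm_compute::enqueue(*this, slice, _lws_hint); // may split into several jobs
            }
            while(window.slide_window_slice_3D(slice));
        }

    private:
        const arm_compute::IGCTensor *_input{ nullptr };
        arm_compute::IGCTensor       *_output{ nullptr };
    };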
diff --git a/arm_compute/core/GLES_COMPUTE/IGCSimple2DKernel.h b/arm_compute/core/GLES_COMPUTE/IGCSimple2DKernel.h
deleted file mode 100644
index ae8fd40888..0000000000
--- a/arm_compute/core/GLES_COMPUTE/IGCSimple2DKernel.h
+++ /dev/null
@@ -1,41 +0,0 @@
-/*
- * Copyright (c) 2017-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_IGCSIMPLE2DKERNEL_H
-#define ARM_COMPUTE_IGCSIMPLE2DKERNEL_H
-
-#include "arm_compute/core/GLES_COMPUTE/IGCSimpleKernel.h"
-
-namespace arm_compute
-{
-class IGCTensor;
-
-/** Interface for simple OpenGL ES kernels having 1 tensor input and 1 tensor output. This interface can be used when the work-item processes a 2D tile */
-class IGCSimple2DKernel : public IGCSimpleKernel
-{
-public:
- // Inherited methods overridden:
- void run(const Window &window) override;
-};
-}
-#endif /*ARM_COMPUTE_IGCSIMPLE2DKERNEL_H */
diff --git a/arm_compute/core/GLES_COMPUTE/IGCSimple3DKernel.h b/arm_compute/core/GLES_COMPUTE/IGCSimple3DKernel.h
deleted file mode 100644
index 40a21ee147..0000000000
--- a/arm_compute/core/GLES_COMPUTE/IGCSimple3DKernel.h
+++ /dev/null
@@ -1,43 +0,0 @@
-/*
- * Copyright (c) 2017-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_IGCSIMPLE3DKERNEL_H
-#define ARM_COMPUTE_IGCSIMPLE3DKERNEL_H
-
-#include "arm_compute/core/GLES_COMPUTE/IGCSimple2DKernel.h"
-
-namespace arm_compute
-{
-class IGCTensor;
-
-/** Interface for simple GLES kernels having 1 tensor input and 1 tensor output.
- * Both input tensor and output tensor must have at least 3 dimensions.
- */
-class IGCSimple3DKernel : public IGCSimple2DKernel
-{
-public:
- // Inherited methods overridden:
- void run(const Window &window) override;
-};
-}
-#endif /*ARM_COMPUTE_IGCSIMPLE3DKERNEL_H */
diff --git a/arm_compute/core/GLES_COMPUTE/IGCSimpleKernel.h b/arm_compute/core/GLES_COMPUTE/IGCSimpleKernel.h
deleted file mode 100644
index c0f561ab5d..0000000000
--- a/arm_compute/core/GLES_COMPUTE/IGCSimpleKernel.h
+++ /dev/null
@@ -1,66 +0,0 @@
-/*
- * Copyright (c) 2016-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_IGCSIMPLEKERNEL_H
-#define ARM_COMPUTE_IGCSIMPLEKERNEL_H
-
-#include "arm_compute/core/GLES_COMPUTE/IGCKernel.h"
-#include "arm_compute/core/GLES_COMPUTE/IGCTensor.h"
-#include "arm_compute/core/Helpers.h"
-
-namespace arm_compute
-{
-/** Interface for simple OpenGL ES kernels having 1 tensor input and 1 tensor output */
-class IGCSimpleKernel : public IGCKernel
-{
-public:
- /** Constructor. */
- IGCSimpleKernel();
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- IGCSimpleKernel(const IGCSimpleKernel &) = delete;
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- IGCSimpleKernel &operator=(const IGCSimpleKernel &) = delete;
- /** Allow instances of this class to be moved */
- IGCSimpleKernel(IGCSimpleKernel &&) = default;
- /** Allow instances of this class to be moved */
- IGCSimpleKernel &operator=(IGCSimpleKernel &&) = default;
- /** Default destructor */
- ~IGCSimpleKernel() = default;
-
- /** Configure the kernel
- *
- * @param[in] input Source tensor.
- * @param[out] output Destination tensor.
- * @param[in] num_elems_processed_per_iteration Number of processed elements per iteration.
- * @param[in] border_undefined (Optional) True if the border mode is undefined. False if it's replicate or constant.
- * @param[in] border_size (Optional) Size of the border.
- */
- void configure(const IGCTensor *input, IGCTensor *output, unsigned int num_elems_processed_per_iteration, bool border_undefined = false, const BorderSize &border_size = BorderSize());
-
-protected:
- const IGCTensor *_input;
- IGCTensor *_output;
-};
-}
-
-#endif /*ARM_COMPUTE_IGCSIMPLEKERNEL_H */
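A derived kernel would forward to this configure() with its own element count; a minimal sketch, where the 4 elements per iteration and the undefined-border flag are assumed values:

    #include "arm_compute/core/GLES_COMPUTE/IGCSimpleKernel.h"

    // Hypothetical one-input/one-output kernel built on IGCSimpleKernel.
    class MyGCSimpleKernel : public arm_compute::IGCSimpleKernel
    {
    public:
        void configure(const arm_compute::IGCTensor *input, arm_compute::IGCTensor *output)
        {
            constexpr unsigned int num_elems_processed_per_iteration = 4; // illustrative
            IGCSimpleKernel::configure(input, output, num_elems_processed_per_iteration,
                                       true /* border_undefined */);
        }
        void run(const arm_compute::Window &window) override
        {
            (void)window; // the shader dispatch would go here in a real kernel
        }
    };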
diff --git a/arm_compute/core/GLES_COMPUTE/IGCTensor.h b/arm_compute/core/GLES_COMPUTE/IGCTensor.h
deleted file mode 100644
index c382095846..0000000000
--- a/arm_compute/core/GLES_COMPUTE/IGCTensor.h
+++ /dev/null
@@ -1,113 +0,0 @@
-/*
- * Copyright (c) 2017-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_IGCTENSOR_H
-#define ARM_COMPUTE_IGCTENSOR_H
-
-#include "arm_compute/core/GLES_COMPUTE/OpenGLES.h"
-#include "arm_compute/core/ITensor.h"
-
-#include <cstdint>
-
-namespace arm_compute
-{
-/** Interface for GLES Compute tensor */
-class IGCTensor : public ITensor
-{
-public:
- /** Default constructor. */
- IGCTensor();
-
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- IGCTensor(const IGCTensor &) = delete;
-
- /** Prevent instances of this class from being copy assigned (As this class contains pointers) */
- IGCTensor &operator=(const IGCTensor &) = delete;
-
- /** Allow instances of this class to be moved */
- IGCTensor(IGCTensor &&) = default;
-
- /** Allow instances of this class to be moved */
- IGCTensor &operator=(IGCTensor &&) = default;
-
- /** Virtual destructor */
- virtual ~IGCTensor() = default;
-
- /** Map on an allocated buffer.
- *
- * @param[in] blocking (Optional) If true, then the mapping will be ready to use by the time
- * this method returns, else it is the caller's responsibility
- * to flush the queue and wait for the mapping operation to have completed before using the returned mapping pointer.
- */
- void map(bool blocking = true);
- /** Unmap an allocated and mapped buffer.
- */
- void unmap();
- /** Clear the contents of the tensor synchronously.
- */
- void clear();
-
- // Inherited methods overridden:
- uint8_t *buffer() const override;
- /** Interface to be implemented by the child class to return the tensor's GLES compute buffer id.
- *
- * @return An SSBO buffer id.
- */
- virtual GLuint gc_buffer() const = 0;
-
- /** Flag indicating whether the tensor has been left aligned by a kernel and therefore needs shifting.
- *
- * @return True if the tensor is left aligned.
- */
- bool needs_shifting() const;
- /** Set the flag indicating whether or not a tensor needs shifting.
- *
- * @param[in] needs_shifting Indicates if the tensor is left aligned or not.
- *
- */
- void set_needs_shifting(bool needs_shifting);
-
-protected:
- /** Method to be implemented by the child class to map the SSBO.
- *
- * @param[in] blocking If true, then the mapping will be ready to use by the time
- * this method returns, else it is the caller's responsibility
- * to flush the queue and wait for the mapping operation to have completed before using the returned mapping pointer.
- */
- virtual uint8_t *do_map(bool blocking) = 0;
- /** Method to be implemented by the child class to unmap the SSBO.
- *
- * @note This method simply enqueues the unmap operation, it is the caller's responsibility to flush the queue and make sure the unmap is finished before
- * the memory is accessed by the device.
- */
- virtual void do_unmap() = 0;
-
-private:
- uint8_t *_mapping;
- bool _needs_shifting;
-};
-
-/** Interface for GLES Compute image */
-using IGCImage = IGCTensor;
-}
-#endif /*ARM_COMPUTE_IGCTENSOR_H */
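The map()/unmap() pair brackets host access to the SSBO behind an IGCTensor. A usage sketch; the zero-fill is an arbitrary example operation:

    #include <cstddef>
    #include <cstring>

    // Map blocking, touch the host-visible buffer, then unmap. Per the notes
    // above, unmap() only enqueues the operation; the caller flushes the queue.
    void fill_with_zero(arm_compute::IGCTensor &tensor, std::size_t nbytes)
    {
        tensor.map(true); // blocking: the mapping is ready when this returns
        std::memset(tensor.buffer(), 0, nbytes);
        tensor.unmap();
    }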
diff --git a/arm_compute/core/GLES_COMPUTE/OpenGLES.h b/arm_compute/core/GLES_COMPUTE/OpenGLES.h
deleted file mode 100644
index 445443602d..0000000000
--- a/arm_compute/core/GLES_COMPUTE/OpenGLES.h
+++ /dev/null
@@ -1,165 +0,0 @@
-/*
- * Copyright (c) 2017-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_OPENGLES_H
-#define ARM_COMPUTE_OPENGLES_H
-
-#include "arm_compute/core/Log.h"
-
-#include <EGL/egl.h>
-#include <EGL/eglext.h>
-#include <EGL/eglplatform.h>
-#include <GLES3/gl31.h>
-#include <GLES3/gl3ext.h>
-#include <cstddef>
-
-#ifdef ARM_COMPUTE_DEBUG_ENABLED
-#define ARM_COMPUTE_GL_CHECK(x) \
- x; \
- { \
- GLenum error = glGetError(); \
- if(error != GL_NO_ERROR) \
- ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("glGetError() = %i (0x%.8x)\n", error, error); \
- }
-#else /* ARM_COMPUTE_DEBUG_ENABLED */
-#define ARM_COMPUTE_GL_CHECK(x) x
-#endif /* ARM_COMPUTE_DEBUG_ENABLED */
-
-namespace arm_compute
-{
-namespace gles
-{
-/** Class interface for specifying NDRange values. */
-class NDRange
-{
-private:
- size_t _sizes[3];
- size_t _dimensions;
-
-public:
- /** Default constructor - resulting range has zero dimensions. */
- NDRange()
- : _dimensions(0)
- {
- _sizes[0] = 0;
- _sizes[1] = 0;
- _sizes[2] = 0;
- }
-
- /** Constructs one-dimensional range.
- *
- * @param[in] size0 Size of the first dimension.
- */
- NDRange(size_t size0)
- : _dimensions(1)
- {
- _sizes[0] = size0;
- _sizes[1] = 1;
- _sizes[2] = 1;
- }
-
- /** Constructs two-dimensional range.
- *
- * @param[in] size0 Size of the first dimension.
- * @param[in] size1 Size of the second dimension.
- */
- NDRange(size_t size0, size_t size1)
- : _dimensions(2)
- {
- _sizes[0] = size0;
- _sizes[1] = size1;
- _sizes[2] = 1;
- }
-
- /** Constructs three-dimensional range.
- *
- * @param[in] size0 Size of the first dimension.
- * @param[in] size1 Size of the second dimension.
- * @param[in] size2 Size of the third dimension.
- */
- NDRange(size_t size0, size_t size1, size_t size2)
- : _dimensions(3)
- {
- _sizes[0] = size0;
- _sizes[1] = size1;
- _sizes[2] = size2;
- }
-
- /** Conversion operator to const size_t *.
- *
- * @returns A pointer to the size of the first dimension.
- */
- operator const size_t *() const
- {
- return _sizes;
- }
-
- /** Queries the number of dimensions in the range.
- *
- * @returns The number of dimensions.
- */
- size_t dimensions() const
- {
- return _dimensions;
- }
-
- /** Returns the size of the object in bytes based on the runtime number of dimensions
- *
- * @returns The size of the object in bytes.
- */
- size_t size() const
- {
- return _dimensions * sizeof(size_t);
- }
-
- /** Returns the sizes array for all dimensions.
- *
- * @returns The sizes array
- */
- size_t *get()
- {
- return _sizes;
- }
-
- /** Returns the sizes array for all dimensions.
- *
- * @returns The sizes array
- */
- const size_t *get() const
- {
- return _sizes;
- }
-};
-
-static const NDRange NullRange;
-static const NDRange Range_128_1 = NDRange(128, 1);
-} // namespace gles
-
-/** Check if the OpenGL ES 3.1 API is available at runtime.
- *
- * @returns true if the OpenGL ES 3.1 API is available.
- */
-bool opengles31_is_available();
-} // namespace arm_compute
-
-#endif /* ARM_COMPUTE_OPENGLES_H */
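Putting the pieces together, a caller would gate any GLES path on opengles31_is_available() and express workgroup sizes as an NDRange. A short sketch; the 8x8x1 local size is an arbitrary example:

    #include "arm_compute/core/GLES_COMPUTE/OpenGLES.h"

    bool pick_lws(arm_compute::gles::NDRange &out_lws)
    {
        if(!arm_compute::opengles31_is_available())
        {
            return false; // caller falls back to another backend
        }
        out_lws = arm_compute::gles::NDRange(8, 8, 1); // three-dimensional range
        return true;
    }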
diff --git a/arm_compute/core/GLES_COMPUTE/kernels/GCAbsoluteDifferenceKernel.h b/arm_compute/core/GLES_COMPUTE/kernels/GCAbsoluteDifferenceKernel.h
deleted file mode 100644
index d55f98fa66..0000000000
--- a/arm_compute/core/GLES_COMPUTE/kernels/GCAbsoluteDifferenceKernel.h
+++ /dev/null
@@ -1,71 +0,0 @@
-/*
- * Copyright (c) 2017-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_GCABSOLUTEDIFFERENCEKERNEL_H
-#define ARM_COMPUTE_GCABSOLUTEDIFFERENCEKERNEL_H
-
-#include "arm_compute/core/GLES_COMPUTE/IGCKernel.h"
-
-namespace arm_compute
-{
-class IGCTensor;
-
-/** Interface for the absolute difference kernel.
- *
- * Absolute difference is computed by:
- * @f[ output(x,y) = | input1(x,y) - input2(x,y) | @f]
- */
-class GCAbsoluteDifferenceKernel : public IGCKernel
-{
-public:
- /** Default constructor. */
- GCAbsoluteDifferenceKernel();
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- GCAbsoluteDifferenceKernel(const GCAbsoluteDifferenceKernel &) = delete;
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- GCAbsoluteDifferenceKernel &operator=(const GCAbsoluteDifferenceKernel &) = delete;
- /** Allow instances of this class to be moved */
- GCAbsoluteDifferenceKernel(GCAbsoluteDifferenceKernel &&) = default;
- /** Allow instances of this class to be moved */
- GCAbsoluteDifferenceKernel &operator=(GCAbsoluteDifferenceKernel &&) = default;
- /** Default destructor */
- ~GCAbsoluteDifferenceKernel() = default;
-
- /** Set the inputs and output images.
- *
- * @param[in] input1 Source tensor. Data types supported: U8
- * @param[in] input2 Source tensor. Data types supported: U8
- * @param[out] output Destination tensor. Data types supported: U8
- */
- void configure(const IGCTensor *input1, const IGCTensor *input2, IGCTensor *output);
-
- // Inherited methods overridden:
- void run(const Window &window) override;
-
-private:
- const IGCTensor *_input1; /**< Source tensor 1. */
- const IGCTensor *_input2; /**< Source tensor 2. */
- IGCTensor *_output; /**< Destination tensor. */
-};
-}
-#endif /* ARM_COMPUTE_GCABSOLUTEDIFFERENCEKERNEL_H */
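As a host-side reference for the formula above (not the shader itself), the element-wise computation reduces to:

    #include <cstddef>
    #include <cstdint>
    #include <cstdlib>

    // output(x,y) = |input1(x,y) - input2(x,y)| over U8 data, flattened to 1D.
    void absdiff_reference(const std::uint8_t *in1, const std::uint8_t *in2,
                           std::uint8_t *out, std::size_t count)
    {
        for(std::size_t i = 0; i < count; ++i)
        {
            out[i] = static_cast<std::uint8_t>(
                std::abs(static_cast<int>(in1[i]) - static_cast<int>(in2[i])));
        }
    }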
diff --git a/arm_compute/core/GLES_COMPUTE/kernels/GCActivationLayerKernel.h b/arm_compute/core/GLES_COMPUTE/kernels/GCActivationLayerKernel.h
deleted file mode 100644
index 65e018a50a..0000000000
--- a/arm_compute/core/GLES_COMPUTE/kernels/GCActivationLayerKernel.h
+++ /dev/null
@@ -1,74 +0,0 @@
-/*
- * Copyright (c) 2017-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_GCACTIVATIONLAYERKERNEL_H
-#define ARM_COMPUTE_GCACTIVATIONLAYERKERNEL_H
-
-#include "arm_compute/core/GLES_COMPUTE/IGCKernel.h"
-
-namespace arm_compute
-{
-// Forward declarations
-class IGCTensor;
-class GCCoreRuntimeContext;
-
-/** Interface for the activation layer kernel. */
-class GCActivationLayerKernel : public IGCKernel
-{
-public:
- /** Default constructor
- *
- * @param[in, out] ctx Core context to use
- */
- explicit GCActivationLayerKernel(GCCoreRuntimeContext *ctx = nullptr);
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- GCActivationLayerKernel(const GCActivationLayerKernel &) = delete;
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- GCActivationLayerKernel &operator=(const GCActivationLayerKernel &) = delete;
- /** Allow instances of this class to be moved */
- GCActivationLayerKernel(GCActivationLayerKernel &&) = default;
- /** Allow instances of this class to be moved */
- GCActivationLayerKernel &operator=(GCActivationLayerKernel &&) = default;
- /** Default destructor */
- ~GCActivationLayerKernel() = default;
- /** Set the input and output tensor.
- *
- * @note If the output tensor is nullptr, the activation function is performed in-place
- *
- * @param[in, out] input    Source tensor. If @p output is nullptr, this tensor will store the result
- * of the activation function. Data types supported: F16/F32.
- * @param[out] output Destination tensor. Data type should match the input data type.
- * @param[in] act_info Activation layer information.
- */
- void configure(IGCTensor *input, IGCTensor *output, ActivationLayerInfo act_info);
-
- // Inherited methods overridden:
- void run(const Window &window) override;
-
-private:
- IGCTensor *_input;
- IGCTensor *_output;
- GCCoreRuntimeContext *_ctx;
-};
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_GCACTIVATIONLAYERKERNEL_H */
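A sketch of the documented in-place path: pass nullptr as the output so the input stores the result. The choice of RELU is illustrative:

    #include "arm_compute/core/GLES_COMPUTE/kernels/GCActivationLayerKernel.h"

    void configure_inplace_relu(arm_compute::GCActivationLayerKernel &kernel,
                                arm_compute::IGCTensor *tensor)
    {
        const arm_compute::ActivationLayerInfo relu(
            arm_compute::ActivationLayerInfo::ActivationFunction::RELU);
        kernel.configure(tensor, nullptr, relu); // nullptr output => in-place
    }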
diff --git a/arm_compute/core/GLES_COMPUTE/kernels/GCArithmeticAdditionKernel.h b/arm_compute/core/GLES_COMPUTE/kernels/GCArithmeticAdditionKernel.h
deleted file mode 100644
index 7e8159c638..0000000000
--- a/arm_compute/core/GLES_COMPUTE/kernels/GCArithmeticAdditionKernel.h
+++ /dev/null
@@ -1,82 +0,0 @@
-/*
- * Copyright (c) 2016-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_GCARITHMETICADDITIONKERNEL_H
-#define ARM_COMPUTE_GCARITHMETICADDITIONKERNEL_H
-
-#include "arm_compute/core/GLES_COMPUTE/IGCKernel.h"
-#include "arm_compute/core/Types.h"
-
-namespace arm_compute
-{
-class IGCTensor;
-
-/** Interface for the arithmetic addition kernel
- *
- * Arithmetic addition is computed by:
- * @f[ output(x,y) = input1(x,y) + input2(x,y) @f]
- */
-class GCArithmeticAdditionKernel : public IGCKernel
-{
-public:
- /** Default constructor */
- GCArithmeticAdditionKernel();
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- GCArithmeticAdditionKernel(const GCArithmeticAdditionKernel &) = delete;
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- GCArithmeticAdditionKernel &operator=(const GCArithmeticAdditionKernel &) = delete;
- /** Allow instances of this class to be moved */
- GCArithmeticAdditionKernel(GCArithmeticAdditionKernel &&) = default;
- /** Allow instances of this class to be moved */
- GCArithmeticAdditionKernel &operator=(GCArithmeticAdditionKernel &&) = default;
- /** Default destructor */
- ~GCArithmeticAdditionKernel() = default;
- /** Initialise the kernel's inputs, output and conversion policy.
- *
- * @param[in] input1 First tensor input. Data types supported: F16.
- * @param[in] input2 Second tensor input. Data types supported: F16.
- * @param[out] output Output tensor. Data types supported: F16.
- * @param[in] policy Policy to use to handle overflow.
- */
- void configure(const IGCTensor *input1, const IGCTensor *input2, IGCTensor *output, ConvertPolicy policy);
- /** Static function to check if given info will lead to a valid configuration of @ref GCArithmeticAdditionKernel
- *
- * @param[in] input1 First tensor input info. Data types supported: F16.
- * @param[in] input2 Second tensor input info. Data types supported: F16.
- * @param[in] output Output tensor info. Data types supported: F16.
- * @param[in] policy Policy to use to handle overflow.
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, ConvertPolicy policy);
-
- // Inherited methods overridden:
- void run(const Window &window) override;
-
-private:
- const IGCTensor *_input1; /**< Source tensor 1 */
- const IGCTensor *_input2; /**< Source tensor 2 */
- IGCTensor *_output; /**< Destination tensor */
-};
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_GCARITHMETICADDITIONKERNEL_H */
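The static validate() was intended to be run on the tensors' info objects before committing to a configuration. A sketch of that guard; SATURATE is an illustrative policy choice:

    #include "arm_compute/core/GLES_COMPUTE/kernels/GCArithmeticAdditionKernel.h"

    arm_compute::Status try_configure_add(arm_compute::GCArithmeticAdditionKernel &kernel,
                                          const arm_compute::IGCTensor *a,
                                          const arm_compute::IGCTensor *b,
                                          arm_compute::IGCTensor *out)
    {
        const auto policy = arm_compute::ConvertPolicy::SATURATE;
        const arm_compute::Status s =
            arm_compute::GCArithmeticAdditionKernel::validate(a->info(), b->info(),
                                                              out->info(), policy);
        if(bool(s)) // Status converts to true on success
        {
            kernel.configure(a, b, out, policy);
        }
        return s;
    }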
diff --git a/arm_compute/core/GLES_COMPUTE/kernels/GCBatchNormalizationLayerKernel.h b/arm_compute/core/GLES_COMPUTE/kernels/GCBatchNormalizationLayerKernel.h
deleted file mode 100644
index eb7a99c59e..0000000000
--- a/arm_compute/core/GLES_COMPUTE/kernels/GCBatchNormalizationLayerKernel.h
+++ /dev/null
@@ -1,98 +0,0 @@
-/*
- * Copyright (c) 2017-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_GCBATCHNORMALIZATIONLAYERKERNEL_H
-#define ARM_COMPUTE_GCBATCHNORMALIZATIONLAYERKERNEL_H
-
-#include "arm_compute/core/GLES_COMPUTE/IGCKernel.h"
-
-namespace arm_compute
-{
-class IGCTensor;
-
-/** Interface for the BatchNormalization layer kernel.
- */
-class GCBatchNormalizationLayerKernel : public IGCKernel
-{
-public:
- /** Constructor */
- GCBatchNormalizationLayerKernel();
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- GCBatchNormalizationLayerKernel(const GCBatchNormalizationLayerKernel &) = delete;
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- GCBatchNormalizationLayerKernel &operator=(const GCBatchNormalizationLayerKernel &) = delete;
- /** Default Move Constructor. */
- GCBatchNormalizationLayerKernel(GCBatchNormalizationLayerKernel &&) = default;
- /** Default move assignment operator */
- GCBatchNormalizationLayerKernel &operator=(GCBatchNormalizationLayerKernel &&) = default;
- /** Default destructor */
- ~GCBatchNormalizationLayerKernel() = default;
-
- /** Set the input and output tensors.
- *
- * @param[in] input Source tensor. 3 lower dimensions represent a single input with dimensions [width, height, FM].
- * The rest are optional and used for representing batches. Data types supported: F16/F32.
- * @param[out] output Destination tensor. Output will have the same number of dimensions as input. Data type supported: same as @p input
- * @param[in] mean Mean values tensor. 1 dimension with size equal to the feature maps [FM]. Data types supported: Same as @p input
- * @param[in] var Variance values tensor. 1 dimension with size equal to the feature maps [FM]. Data types supported: Same as @p input
- * @param[in] beta (Optional) Beta values tensor info. 1 dimension with size equal to the feature maps [FM]. If not provided, default value for beta is 0. Data types supported: Same as @p input
- * @param[in] gamma (Optional) Gamma values tensor info. 1 dimension with size equal to the feature maps [FM]. If not provided, default value for gamma is 1. Data types supported: Same as @p input
- * @param[in] epsilon (Optional) Small value to avoid division by zero.
- * @param[in] act_info (Optional) Activation layer information in case of a fused activation. Only RELU, BOUNDED_RELU and LU_BOUNDED_RELU supported.
- */
- void configure(const IGCTensor *input, IGCTensor *output, const IGCTensor *mean, const IGCTensor *var, const IGCTensor *beta = nullptr, const IGCTensor *gamma = nullptr, float epsilon = 0.001f,
- ActivationLayerInfo act_info = ActivationLayerInfo());
- /** Static function to check if given info will lead to a valid configuration of @ref GCBatchNormalizationLayerKernel
- *
- * @param[in] input Source tensor info. If @p output is nullptr, this tensor will store the result.
- * 3 lower dimensions represent a single input with dimensions [width, height, FM].
- * The rest are optional and used for representing batches. Data types supported: F16/F32.
- * @param[in] output Destination tensor info. Output will have the same number of dimensions as input. Data type supported: same as @p input
- * @param[in] mean Mean values tensor info. 1 dimension with size equal to the feature maps [FM]. Data types supported: Same as @p input
- * @param[in] var Variance values tensor info. 1 dimension with size equal to the feature maps [FM]. Data types supported: Same as @p input
- * @param[in] beta (Optional) Beta values tensor info. 1 dimension with size equal to the feature maps [FM]. If not provided, default value for beta is 0. Data types supported: Same as @p input
- * @param[in] gamma (Optional) Gamma values tensor info. 1 dimension with size equal to the feature maps [FM]. If not provided, default value for gamma is 1. Data types supported: Same as @p input
- * @param[in] epsilon (Optional) Small value to avoid division by zero. Default value is 0.001f.
- * @param[in] act_info (Optional) Activation layer information in case of a fused activation. Only RELU, BOUNDED_RELU and LU_BOUNDED_RELU supported.
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *input, const ITensorInfo *output,
- const ITensorInfo *mean, const ITensorInfo *var,
- const ITensorInfo *beta = nullptr, const ITensorInfo *gamma = nullptr,
- float epsilon = 0.001f, ActivationLayerInfo act_info = ActivationLayerInfo());
-
- // Inherited methods overridden:
- void run(const Window &window) override;
-
-private:
- const IGCTensor *_input;
- IGCTensor *_output;
- const IGCTensor *_mean;
- const IGCTensor *_var;
- const IGCTensor *_beta;
- const IGCTensor *_gamma;
- float _epsilon;
-};
-}
-#endif /*ARM_COMPUTE_GCBATCHNORMALIZATIONLAYERKERNEL_H */
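For reference, the per-element transform the kernel applies (with the documented defaults beta = 0 and gamma = 1 when those tensors are absent) is the usual batch-normalization formula; a scalar sketch:

    #include <cmath>

    // x_bar = (x - mean) / sqrt(var + epsilon); output = gamma * x_bar + beta
    float batch_norm_reference(float x, float mean, float var,
                               float beta, float gamma, float epsilon)
    {
        const float x_bar = (x - mean) / std::sqrt(var + epsilon);
        return gamma * x_bar + beta;
    }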
diff --git a/arm_compute/core/GLES_COMPUTE/kernels/GCCol2ImKernel.h b/arm_compute/core/GLES_COMPUTE/kernels/GCCol2ImKernel.h
deleted file mode 100644
index d96fb56771..0000000000
--- a/arm_compute/core/GLES_COMPUTE/kernels/GCCol2ImKernel.h
+++ /dev/null
@@ -1,92 +0,0 @@
-/*
- * Copyright (c) 2017-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#ifndef ARM_COMPUTE_GCCOL2IMKERNEL_H
-#define ARM_COMPUTE_GCCOL2IMKERNEL_H
-
-#include "arm_compute/core/GLES_COMPUTE/IGCKernel.h"
-
-namespace arm_compute
-{
-class IGCTensor;
-
-/** Interface for the col2im reshaping kernel.
- *
- * Rearranges each matrix column into image blocks. It's the inverse operation of @ref GCIm2ColKernel.
- *
- * For example, a vector of 9 elements can be reshaped to a block (image) of 3x3:
- *
- * @f[
- * \left( \begin{array}{ccccccccc}
- * a0 & a1 & a2 & a3 & a4 & a5 & a6 & a7 & a8 \\
- * \end{array} \right)
- * \rightarrow
- * \left( \begin{array}{ccc}
- * a0 & a1 & a2 \\
- * a3 & a4 & a5 \\
- * a6 & a7 & a8 \\
- * \end{array} \right)
- * @f]
- */
-class GCCol2ImKernel : public IGCKernel
-{
-public:
- /** Default constructor */
- GCCol2ImKernel();
-
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- GCCol2ImKernel(const GCCol2ImKernel &) = delete;
-
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- GCCol2ImKernel &operator=(const GCCol2ImKernel &) = delete;
-
- /** Allow instances of this class to be moved */
- GCCol2ImKernel(GCCol2ImKernel &&) = default;
-
- /** Allow instances of this class to be moved */
- GCCol2ImKernel &operator=(GCCol2ImKernel &&) = default;
-
- /** Default destructor */
- ~GCCol2ImKernel() = default;
-
- /** Set the input and output of the kernel.
- *
- * @param[in] input The input tensor to convert. Data types supported: F16/F32
- * @param[out] output The output tensor. 3 lower dimensions represent a single output [width, height, OFM],
- * while the rest represent batch of outputs. Data types supported: Same as @p input
- * @param[in] convolved_dims Output convolved dimensions.
- */
- void configure(const IGCTensor *input, IGCTensor *output, std::pair<unsigned int, unsigned int> convolved_dims);
-
- // Inherited methods overridden:
- void run(const Window &window) override;
-
-private:
- const IGCTensor *_input;
- IGCTensor *_output;
- std::pair<unsigned int, unsigned int> _convolved_dims;
-};
-}
-
-#endif /*ARM_COMPUTE_GCCOL2IMKERNEL_H */
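A host-side sketch of the rearrangement in the example above: element i of the column lands at (i % width, i / width) of the convolved block:

    // Copy one column into a row-major [height][width] block; for a 9-element
    // column and width = height = 3 this reproduces the 3x3 example above.
    void col2im_block_reference(const float *column, float *block,
                                unsigned int width, unsigned int height)
    {
        for(unsigned int i = 0; i < width * height; ++i)
        {
            const unsigned int x = i % width;
            const unsigned int y = i / width;
            block[y * width + x] = column[i];
        }
    }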
diff --git a/arm_compute/core/GLES_COMPUTE/kernels/GCDepthConcatenateLayerKernel.h b/arm_compute/core/GLES_COMPUTE/kernels/GCDepthConcatenateLayerKernel.h
deleted file mode 100644
index 9c7754947a..0000000000
--- a/arm_compute/core/GLES_COMPUTE/kernels/GCDepthConcatenateLayerKernel.h
+++ /dev/null
@@ -1,74 +0,0 @@
-/*
- * Copyright (c) 2017-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#ifndef ARM_COMPUTE_GCDEPTHCONCATENATEKERNEL_H
-#define ARM_COMPUTE_GCDEPTHCONCATENATEKERNEL_H
-
-#include "arm_compute/core/GLES_COMPUTE/IGCKernel.h"
-#include "arm_compute/core/Types.h"
-
-namespace arm_compute
-{
-class IGCTensor;
-
-/** Interface for the depth concatenate kernel.
- * The input tensor will be concatenated into the output tensor.
- */
-class GCDepthConcatenateLayerKernel : public IGCKernel
-{
-public:
- /** Default constructor */
- GCDepthConcatenateLayerKernel();
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- GCDepthConcatenateLayerKernel(const GCDepthConcatenateLayerKernel &) = delete;
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- GCDepthConcatenateLayerKernel &operator=(const GCDepthConcatenateLayerKernel &) = delete;
- /** Allow instances of this class to be moved */
- GCDepthConcatenateLayerKernel(GCDepthConcatenateLayerKernel &&) = default;
- /** Allow instances of this class to be moved */
- GCDepthConcatenateLayerKernel &operator=(GCDepthConcatenateLayerKernel &&) = default;
- /** Default destructor */
- ~GCDepthConcatenateLayerKernel() = default;
- /** Initialise the kernel's inputs and output
- *
- * @param[in] input Input tensor. Data types supported: F16/F32.
- * @param[in] depth_offset The offset on the Z axis.
- * @param[in,out] output Output tensor. Data types supported: Same as @p input.
- *
- * @note The output tensor's two lowest dimensions can't be smaller than the input's.
- * @note The differences between the two lowest dimensions of the input and output must be divisible by 2.
- *
- */
- void configure(const IGCTensor *input, unsigned int depth_offset, IGCTensor *output);
-
- // Inherited methods overridden:
- void run(const Window &window) override;
-
-private:
- const IGCTensor *_input;
- IGCTensor *_output;
- int _depth_offset;
-};
-}
-#endif /* ARM_COMPUTE_GCDEPTHCONCATENATEKERNEL_H */
diff --git a/arm_compute/core/GLES_COMPUTE/kernels/GCDepthwiseConvolutionLayer3x3Kernel.h b/arm_compute/core/GLES_COMPUTE/kernels/GCDepthwiseConvolutionLayer3x3Kernel.h
deleted file mode 100644
index 8faa54a205..0000000000
--- a/arm_compute/core/GLES_COMPUTE/kernels/GCDepthwiseConvolutionLayer3x3Kernel.h
+++ /dev/null
@@ -1,77 +0,0 @@
-/*
- * Copyright (c) 2017-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_GCDEPTHWISECONVOLUTIONKERNEL3x3_H
-#define ARM_COMPUTE_GCDEPTHWISECONVOLUTIONKERNEL3x3_H
-
-#include "arm_compute/core/GLES_COMPUTE/IGCKernel.h"
-
-namespace arm_compute
-{
-class IGCTensor;
-
-/** Interface for the kernel to run a 3x3 depthwise convolution on a tensor.
- */
-class GCDepthwiseConvolutionLayer3x3Kernel : public IGCKernel
-{
-public:
- /** Default constructor */
- GCDepthwiseConvolutionLayer3x3Kernel();
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- GCDepthwiseConvolutionLayer3x3Kernel(const GCDepthwiseConvolutionLayer3x3Kernel &) = delete;
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- GCDepthwiseConvolutionLayer3x3Kernel &operator=(const GCDepthwiseConvolutionLayer3x3Kernel &) = delete;
- /** Default Move Constructor. */
- GCDepthwiseConvolutionLayer3x3Kernel(GCDepthwiseConvolutionLayer3x3Kernel &&) = default;
- /** Default move assignment operator */
- GCDepthwiseConvolutionLayer3x3Kernel &operator=(GCDepthwiseConvolutionLayer3x3Kernel &&) = default;
- /** Initialize the function's source, destination, conv and border_size.
- *
- * @param[in] input            Source tensor. Data type supported: F16.
- * @param[in] weights Weights tensor. A 3D tensor with dimensions [3, 3, IFM]. Data type supported: Same as @p input.
- * @param[in] biases (Optional) Biases tensor. A 1D tensor with dimensions [IFM]. Must be nullptr if not needed.
- * Data type supported: Same as @p input.
- * @param[out] output Destination tensor. Data type supported: Same as @p input.
- * @param[in] conv_info Padding and stride information to use for the convolution.
- * @param[in] depth_multiplier (Optional) Multiplier to apply to the input's depth in order to retrieve the output's depth. Defaults to 1.
- */
- void configure(const IGCTensor *input, const IGCTensor *weights, const IGCTensor *biases, IGCTensor *output, const PadStrideInfo &conv_info, unsigned int depth_multiplier = 1);
-
- // Inherited methods overridden:
- void run(const Window &window) override;
- BorderSize border_size() const override;
-
-private:
- BorderSize _border_size;
- const IGCTensor *_input;
- IGCTensor *_output;
- const IGCTensor *_weights;
- const IGCTensor *_biases;
- unsigned int _conv_stride_x;
- unsigned int _conv_stride_y;
- unsigned int _conv_pad_left;
- unsigned int _conv_pad_top;
- gles::NDRange _lws;
-};
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_GCDEPTHWISECONVOLUTIONKERNEL3x3_H */
diff --git a/arm_compute/core/GLES_COMPUTE/kernels/GCDirectConvolutionLayerKernel.h b/arm_compute/core/GLES_COMPUTE/kernels/GCDirectConvolutionLayerKernel.h
deleted file mode 100644
index 43f94f8662..0000000000
--- a/arm_compute/core/GLES_COMPUTE/kernels/GCDirectConvolutionLayerKernel.h
+++ /dev/null
@@ -1,92 +0,0 @@
-/*
- * Copyright (c) 2017-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_GCDIRECTCONVOLUTIONLAYERKERNEL_H
-#define ARM_COMPUTE_GCDIRECTCONVOLUTIONLAYERKERNEL_H
-
-#include "arm_compute/core/GLES_COMPUTE/IGCKernel.h"
-#include "arm_compute/core/GLES_COMPUTE/OpenGLES.h"
-
-namespace arm_compute
-{
-class IGCTensor;
-
-/** Interface for the direct convolution kernel.
- */
-template <unsigned int kernel_size>
-class GCDirectConvolutionLayerKernel : public IGCKernel
-{
-public:
- /** Default constructor */
- GCDirectConvolutionLayerKernel();
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- GCDirectConvolutionLayerKernel(const GCDirectConvolutionLayerKernel &) = delete;
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- GCDirectConvolutionLayerKernel &operator=(const GCDirectConvolutionLayerKernel &) = delete;
- /** Allow instances of this class to be moved */
- GCDirectConvolutionLayerKernel(GCDirectConvolutionLayerKernel &&) = default;
- /** Allow instances of this class to be moved */
- GCDirectConvolutionLayerKernel &operator=(GCDirectConvolutionLayerKernel &&) = default;
- /** Default destructor */
- ~GCDirectConvolutionLayerKernel() = default;
- /** Set the input and output of the kernel.
- *
- * @param[in] input The input tensor to convert. 3 lower dimensions represent a single input [width, height, IFM],
- *                      while every optional dimension from 4 and above represents a batch of inputs. Data types supported: F16/F32
- * @param[in]  weights   Weights tensor. Weights are a 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM]. Data type supported: Same as @p input.
- * @param[in]  bias      Biases tensor. Shared bias supported. Biases are a 1D tensor with dimensions [OFM]. Data type supported: Same as @p input.
- * @param[out] output The output tensor. First 2 lower dimensions represent a transform of each 3D input,
- * while every dimension above represents a batch. Data types supported: Same as @p input
- * @param[in] conv_info Contains padding and stride information described in @ref PadStrideInfo.
- * @param[in] act_info (Optional) Activation layer information in case of a fused activation.
- */
- void configure(const IGCTensor *input, const IGCTensor *weights, const IGCTensor *bias, IGCTensor *output,
- const PadStrideInfo &conv_info, const ActivationLayerInfo &act_info = ActivationLayerInfo());
-
- // Inherited methods overridden:
- BorderSize border_size() const override;
-
- // Inherited methods overridden:
- void run(const Window &window) override;
-
-private:
- const IGCTensor *_input;
- const IGCTensor *_bias;
- const IGCTensor *_weights;
- IGCTensor *_output;
- BorderSize _border_size;
- int _conv_stride_x;
- int _conv_stride_y;
- int _conv_pad_x;
- int _conv_pad_y;
- gles::NDRange _lws;
-};
-
-/** Interface for the 1x1 direct convolution kernel */
-using GCDirectConvolutionLayer1x1Kernel = GCDirectConvolutionLayerKernel<1>;
-/** Interface for the 3x3 direct convolution kernel */
-using GCDirectConvolutionLayer3x3Kernel = GCDirectConvolutionLayerKernel<3>;
-/** Interface for the 5x5 direct convolution kernel */
-using GCDirectConvolutionLayer5x5Kernel = GCDirectConvolutionLayerKernel<5>;
-}
-#endif /*ARM_COMPUTE_GCDIRECTCONVOLUTIONLAYERKERNEL_H */
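
A hedged CPU reference of the direct convolution interface above, under assumed unit stride and zero padding, with bias and fused activation omitted for brevity. The [kernel_x, kernel_y, IFM, OFM] weight layout follows the configure() documentation; everything else is illustrative.

```cpp
#include <cstddef>
#include <vector>

// Reference sketch, not the GLES shader: direct convolution summing over all
// input feature maps, square k x k kernel, unit stride, zero "same" padding.
std::vector<float> direct_conv(const std::vector<float> &in, const std::vector<float> &w4d,
                               int w, int h, int ifm, int ofm, int k)
{
    const int pad = k / 2;
    std::vector<float> out(static_cast<size_t>(w) * h * ofm, 0.0f);
    for(int oc = 0; oc < ofm; ++oc)
    {
        for(int y = 0; y < h; ++y)
        {
            for(int x = 0; x < w; ++x)
            {
                float acc = 0.0f;
                for(int ic = 0; ic < ifm; ++ic) // sum over all input feature maps
                {
                    for(int ky = 0; ky < k; ++ky)
                    {
                        for(int kx = 0; kx < k; ++kx)
                        {
                            const int xx = x + kx - pad, yy = y + ky - pad;
                            if(xx >= 0 && xx < w && yy >= 0 && yy < h)
                            {
                                // [kernel_x, kernel_y, IFM, OFM] layout: kx is the fastest index
                                acc += in[(ic * h + yy) * w + xx] * w4d[((oc * ifm + ic) * k + ky) * k + kx];
                            }
                        }
                    }
                }
                out[(oc * h + y) * w + x] = acc;
            }
        }
    }
    return out;
}
```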
diff --git a/arm_compute/core/GLES_COMPUTE/kernels/GCDropoutLayerKernel.h b/arm_compute/core/GLES_COMPUTE/kernels/GCDropoutLayerKernel.h
deleted file mode 100644
index e3dda67a8a..0000000000
--- a/arm_compute/core/GLES_COMPUTE/kernels/GCDropoutLayerKernel.h
+++ /dev/null
@@ -1,79 +0,0 @@
-/*
- * Copyright (c) 2017-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#ifndef ARM_COMPUTE_GCDROPOUTLAYERKERNEL_H
-#define ARM_COMPUTE_GCDROPOUTLAYERKERNEL_H
-
-#include "arm_compute/core/GLES_COMPUTE/IGCKernel.h"
-
-namespace arm_compute
-{
-class IGCTensor;
-
-/** Interface for the dropout layer kernel.
- *
- * Dropout is used to reduce over-fitting in neural networks.
- *
- */
-class GCDropoutLayerKernel : public IGCKernel
-{
-public:
- /** Default constructor */
- GCDropoutLayerKernel();
-
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- GCDropoutLayerKernel(const GCDropoutLayerKernel &) = delete;
-
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- GCDropoutLayerKernel &operator=(const GCDropoutLayerKernel &) = delete;
-
- /** Allow instances of this class to be moved */
- GCDropoutLayerKernel(GCDropoutLayerKernel &&) = default;
-
- /** Allow instances of this class to be moved */
- GCDropoutLayerKernel &operator=(GCDropoutLayerKernel &&) = default;
-
- /** Set the input and output of the kernel.
- *
- * @param[in] input The input tensor for this op. Data types supported: F16/F32
- * @param[out] mask The mask tensor. Data types supported: Same as @p input
- * @param[out] output The output tensor. Data types supported: Same as @p input
- * @param[in] ratio Dropout ratio
- * @param[in] forward Forward or backward propagation
- *
- */
- void configure(const IGCTensor *input, IGCTensor *mask, IGCTensor *output, float ratio, bool forward);
-
- // Inherited methods overridden:
- void run(const Window &window) override;
-
-private:
- const IGCTensor *_input;
- IGCTensor *_mask;
- IGCTensor *_output;
- unsigned int _num_elems_processed_per_iteration;
-};
-}
-
-#endif /*ARM_COMPUTE_GCDROPOUTLAYERKERNEL_H */
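
A minimal sketch of the dropout semantics documented above: in the forward pass a random binary mask (kept with probability 1 - ratio) is generated and applied; in the backward pass the stored mask is re-applied. Whether the real kernel uses inverted-dropout scaling is not visible in this header, so the plain unscaled variant is assumed; all names are illustrative.

```cpp
#include <cstddef>
#include <random>
#include <vector>

// Reference sketch (assumption: no 1/(1-ratio) rescaling). `mask` and `output`
// are assumed pre-sized to match `input`.
void dropout(const std::vector<float> &input, std::vector<float> &mask,
             std::vector<float> &output, float ratio, bool forward)
{
    if(forward)
    {
        std::mt19937 rng{std::random_device{}()};
        std::bernoulli_distribution keep(1.0f - ratio); // keep probability
        for(size_t i = 0; i < input.size(); ++i)
        {
            mask[i] = keep(rng) ? 1.0f : 0.0f;
        }
    }
    for(size_t i = 0; i < input.size(); ++i)
    {
        output[i] = input[i] * mask[i]; // backward pass re-uses the stored mask
    }
}
```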
diff --git a/arm_compute/core/GLES_COMPUTE/kernels/GCFillBorderKernel.h b/arm_compute/core/GLES_COMPUTE/kernels/GCFillBorderKernel.h
deleted file mode 100644
index 4dd7aa0ec1..0000000000
--- a/arm_compute/core/GLES_COMPUTE/kernels/GCFillBorderKernel.h
+++ /dev/null
@@ -1,77 +0,0 @@
-/*
- * Copyright (c) 2017-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_GCFILLBORDERKERNEL_H
-#define ARM_COMPUTE_GCFILLBORDERKERNEL_H
-
-#include "arm_compute/core/GLES_COMPUTE/IGCKernel.h"
-#include "arm_compute/core/PixelValue.h"
-#include "arm_compute/core/Types.h"
-
-namespace arm_compute
-{
-class IGCTensor;
-
-/** Interface for filling the border of a kernel */
-class GCFillBorderKernel : public IGCKernel
-{
-public:
- /** Default constructor */
- GCFillBorderKernel();
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- GCFillBorderKernel(const GCFillBorderKernel &) = delete;
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- GCFillBorderKernel &operator=(const GCFillBorderKernel &) = delete;
- /** Allow instances of this class to be moved */
- GCFillBorderKernel(GCFillBorderKernel &&) = default;
- /** Allow instances of this class to be moved */
- GCFillBorderKernel &operator=(GCFillBorderKernel &&) = default;
- /** Default destructor */
- ~GCFillBorderKernel() = default;
-
- /** Initialise the kernel's input, output and border mode.
- *
- * @param[in,out] tensor                Tensor to process. Data types supported: F16/F32.
- * @param[in] border_size Size of the border to fill in elements.
- * @param[in] border_mode Border mode to use for the convolution.
- * @param[in] constant_border_value (Optional) Constant value to use for borders if border_mode is set to CONSTANT.
- */
- void configure(const IGCTensor *tensor, BorderSize border_size, BorderMode border_mode, const PixelValue &constant_border_value = PixelValue());
-
- /** Function to set the constant value on fill border kernel depending on type.
- *
- * @param[in] idx Index of the kernel argument to set.
- * @param[in] constant_border_value Constant value to use for borders if border_mode is set to CONSTANT.
- */
- template <class T>
- void set_constant_border(unsigned int idx, const PixelValue &constant_border_value);
-
- // Inherited methods overridden:
- void run(const Window &window) override;
- bool is_parallelisable() const override;
-
-private:
- const IGCTensor *_tensor;
-};
-}
-#endif /*ARM_COMPUTE_GCFILLBORDERKERNEL_H */
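
A sketch of what a CONSTANT border fill does, assuming a single plane stored inside a padded buffer; REPLICATE mode would copy the nearest edge pixel instead. Buffer layout and names are illustrative, not the kernel's actual implementation.

```cpp
#include <cstddef>
#include <vector>

// Reference sketch: a (w x h) image inside a (w + 2*border) x (h + 2*border)
// buffer has every element outside the valid region set to a constant value.
void fill_border_constant(std::vector<float> &buf, size_t w, size_t h,
                          size_t border, float value)
{
    const size_t stride = w + 2 * border;
    const size_t rows   = h + 2 * border;
    for(size_t y = 0; y < rows; ++y)
    {
        for(size_t x = 0; x < stride; ++x)
        {
            const bool inside = x >= border && x < border + w &&
                                y >= border && y < border + h;
            if(!inside)
            {
                buf[y * stride + x] = value;
            }
        }
    }
}
```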
diff --git a/arm_compute/core/GLES_COMPUTE/kernels/GCGEMMInterleave4x4Kernel.h b/arm_compute/core/GLES_COMPUTE/kernels/GCGEMMInterleave4x4Kernel.h
deleted file mode 100644
index cbc60da443..0000000000
--- a/arm_compute/core/GLES_COMPUTE/kernels/GCGEMMInterleave4x4Kernel.h
+++ /dev/null
@@ -1,80 +0,0 @@
-/*
- * Copyright (c) 2017-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_GCGEMMINTERLEAVE4X4KERNEL_H
-#define ARM_COMPUTE_GCGEMMINTERLEAVE4X4KERNEL_H
-
-#include "arm_compute/core/GLES_COMPUTE/IGCKernel.h"
-
-namespace arm_compute
-{
-class IGCTensor;
-
-/** OpenGL ES kernel which interleaves the elements of a matrix A in chunk of 4x4
- *
- * This function puts the values in a 4x4 block of Matrix A on the same row (Interleaved values)
- *
- * @f[
- * \left( \begin{array}{cccc}
- * a00 & a01 & a02 & a03 \\
- * a10 & a11 & a12 & a13 \\
- * a20 & a21 & a22 & a23 \\
- * a30 & a31 & a32 & a33 \\
- * \end{array} \right)
- * \rightarrow
- * \left( \begin{array}{ccccccccccccccccc}
- * a00 & a10 & a20 & a30 & a01 & a11 & a21 & a31 & a02 & a12 & a22 & a32 & a03 & a13 & a23 & a33 \\
- * \end{array} \right)
- * @f]
- *
- * After this operation, the output matrix will have the following shape: [ height * 4, ceil(width / 4.0f) ]
- */
-class GCGEMMInterleave4x4Kernel : public IGCKernel
-{
-public:
- /** Default constructor */
- GCGEMMInterleave4x4Kernel();
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- GCGEMMInterleave4x4Kernel(const GCGEMMInterleave4x4Kernel &) = delete;
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- GCGEMMInterleave4x4Kernel &operator=(const GCGEMMInterleave4x4Kernel &) = delete;
- /** Allow instances of this class to be moved */
- GCGEMMInterleave4x4Kernel(GCGEMMInterleave4x4Kernel &&) = default;
- /** Allow instances of this class to be moved */
- GCGEMMInterleave4x4Kernel &operator=(GCGEMMInterleave4x4Kernel &&) = default;
- /** Initialise the kernel's input and output.
- *
- * @param[in] input Input tensor. Data types supported: F16, F32
- * @param[out] output Output tensor. Data type supported: same as @p input
- */
- void configure(const IGCTensor *input, IGCTensor *output);
-
- // Inherited methods overridden
- void run(const Window &window) override;
-
-private:
- const IGCTensor *_input;
- IGCTensor *_output;
-};
-}
-#endif /* ARM_COMPUTE_GCGEMMINTERLEAVE4X4KERNEL_H */
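
A CPU reference of the 4x4 interleave documented above (not the GLES shader): each 4x4 block is emitted column by column onto a single row, matching the a00 a10 a20 a30 a01 ... ordering in the formula. Dimensions are assumed divisible by 4 here; the real kernel handles the padded remainder.

```cpp
#include <cstddef>
#include <vector>

// Reference sketch: interleave a rows x cols matrix in 4x4 blocks.
std::vector<float> interleave_4x4(const std::vector<float> &in, size_t rows, size_t cols)
{
    std::vector<float> out(rows * cols);
    size_t idx = 0;
    for(size_t r = 0; r < rows; r += 4)     // 4 input rows feed one output row
    {
        for(size_t c = 0; c < cols; c += 4) // walk 4x4 blocks left to right
        {
            for(size_t cc = 0; cc < 4; ++cc) // within a block: column-major order
            {
                for(size_t rr = 0; rr < 4; ++rr)
                {
                    out[idx++] = in[(r + rr) * cols + (c + cc)];
                }
            }
        }
    }
    return out;
}
```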
diff --git a/arm_compute/core/GLES_COMPUTE/kernels/GCGEMMMatrixAccumulateBiasesKernel.h b/arm_compute/core/GLES_COMPUTE/kernels/GCGEMMMatrixAccumulateBiasesKernel.h
deleted file mode 100644
index 95f991ee73..0000000000
--- a/arm_compute/core/GLES_COMPUTE/kernels/GCGEMMMatrixAccumulateBiasesKernel.h
+++ /dev/null
@@ -1,64 +0,0 @@
-/*
- * Copyright (c) 2017-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_GCGEMMMATRIXACCUMULATEBIASESKERNEL_H
-#define ARM_COMPUTE_GCGEMMMATRIXACCUMULATEBIASESKERNEL_H
-
-#include "arm_compute/core/GLES_COMPUTE/IGCKernel.h"
-
-namespace arm_compute
-{
-/** Interface to add a bias to each row of the input tensor
- *
- */
-class GCGEMMMatrixAccumulateBiasesKernel : public IGCKernel
-{
-public:
- /** Default constructor */
- GCGEMMMatrixAccumulateBiasesKernel();
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- GCGEMMMatrixAccumulateBiasesKernel(const GCGEMMMatrixAccumulateBiasesKernel &) = delete;
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- GCGEMMMatrixAccumulateBiasesKernel &operator=(const GCGEMMMatrixAccumulateBiasesKernel &) = delete;
- /** Allow instances of this class to be moved */
- GCGEMMMatrixAccumulateBiasesKernel(GCGEMMMatrixAccumulateBiasesKernel &&) = default;
- /** Allow instances of this class to be moved */
- GCGEMMMatrixAccumulateBiasesKernel &operator=(GCGEMMMatrixAccumulateBiasesKernel &&) = default;
- /** Set the accumulate buffer and the biases of the kernel.
- *
- * @param[in, out] accum The accumulate tensor to convert. Data types supported: F16/F32
- * @param[in]      biases The shared biases tensor to append. It must be a 1D tensor. Data types supported: Same as @p accum
- */
- void configure(IGCTensor *accum, const IGCTensor *biases);
-
- // Inherited methods overridden:
- void run(const Window &window) override;
-
-private:
- IGCTensor *_accum;
- const IGCTensor *_biases;
- gles::NDRange _lws;
-};
-}
-
-#endif /*ARM_COMPUTE_GCGEMMMATRIXACCUMULATEBIASESKERNEL_H */
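
The operation itself is simple enough to state as a two-line reference, assuming a row-major accumulator: the 1D biases vector is added to every row in place.

```cpp
#include <cstddef>
#include <vector>

// Reference sketch: accum (rows x biases.size(), row-major) += biases per row.
void accumulate_biases(std::vector<float> &accum, const std::vector<float> &biases, size_t rows)
{
    const size_t cols = biases.size();
    for(size_t r = 0; r < rows; ++r)
    {
        for(size_t c = 0; c < cols; ++c)
        {
            accum[r * cols + c] += biases[c];
        }
    }
}
```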
diff --git a/arm_compute/core/GLES_COMPUTE/kernels/GCGEMMMatrixAdditionKernel.h b/arm_compute/core/GLES_COMPUTE/kernels/GCGEMMMatrixAdditionKernel.h
deleted file mode 100644
index e4157a1327..0000000000
--- a/arm_compute/core/GLES_COMPUTE/kernels/GCGEMMMatrixAdditionKernel.h
+++ /dev/null
@@ -1,70 +0,0 @@
-/*
- * Copyright (c) 2017-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_GCGEMMMATRIXADDITIONKERNEL_H
-#define ARM_COMPUTE_GCGEMMMATRIXADDITIONKERNEL_H
-
-#include "arm_compute/core/GLES_COMPUTE/IGCKernel.h"
-
-namespace arm_compute
-{
-class IGCTensor;
-
-/** OpenGL ES kernel to perform the in-place matrix addition between 2 matrices, taking into account that the second matrix might be weighted by a scalar value beta.
- * The matrices must have the same dimensions
- *
- * @note This kernel is computed if and only if beta != 0.0.
- */
-class GCGEMMMatrixAdditionKernel : public IGCKernel
-{
-public:
- /** Default constructor */
- GCGEMMMatrixAdditionKernel();
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- GCGEMMMatrixAdditionKernel(const GCGEMMMatrixAdditionKernel &) = delete;
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- GCGEMMMatrixAdditionKernel &operator=(const GCGEMMMatrixAdditionKernel &) = delete;
- /** Allow instances of this class to be moved */
- GCGEMMMatrixAdditionKernel(GCGEMMMatrixAdditionKernel &&) = default;
- /** Allow instances of this class to be moved */
- GCGEMMMatrixAdditionKernel &operator=(GCGEMMMatrixAdditionKernel &&) = default;
- /** Initialise the kernel's input, output and beta value
- *
- * @note The input and output tensors must have the same dimensions
- *
- * @param[in] input Input tensor (Matrix C). Data types supported: F32
- * @param[in, out] output Output tensor. If this kernel is used to finalize the GEMM result (alpha * AB + beta * C), output must contain the result obtained by @ref GCGEMMMatrixMultiplyKernel. Data type supported: same as @p input
- * @param[in] beta Weight of matrix C
- */
- void configure(const IGCTensor *input, IGCTensor *output, float beta);
-
- // Inherited methods overridden:
- void run(const Window &window) override;
-
-private:
- const IGCTensor *_input;
- IGCTensor *_output;
-};
-}
-
-#endif /* ARM_COMPUTE_GCGEMMMATRIXADDITIONKERNEL_H */
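
In scalar terms the kernel finalises the GEMM result already sitting in the output buffer as output += beta * C, and is skipped entirely when beta == 0. A reference sketch under those assumptions:

```cpp
#include <cstddef>
#include <vector>

// Reference sketch: in-place weighted addition of matrix C onto the GEMM result.
void gemm_matrix_addition(const std::vector<float> &c, std::vector<float> &output, float beta)
{
    if(beta == 0.0f)
    {
        return; // the kernel is only computed when beta != 0
    }
    for(size_t i = 0; i < c.size(); ++i)
    {
        output[i] += beta * c[i];
    }
}
```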
diff --git a/arm_compute/core/GLES_COMPUTE/kernels/GCGEMMMatrixMultiplyKernel.h b/arm_compute/core/GLES_COMPUTE/kernels/GCGEMMMatrixMultiplyKernel.h
deleted file mode 100644
index 4dcae2e536..0000000000
--- a/arm_compute/core/GLES_COMPUTE/kernels/GCGEMMMatrixMultiplyKernel.h
+++ /dev/null
@@ -1,92 +0,0 @@
-/*
- * Copyright (c) 2017-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_GCGEMMMATRIXMULTIPLYKERNEL_H
-#define ARM_COMPUTE_GCGEMMMATRIXMULTIPLYKERNEL_H
-
-#include "arm_compute/core/GLES_COMPUTE/IGCKernel.h"
-#include "arm_compute/core/GPUTarget.h"
-
-namespace arm_compute
-{
-class IGCTensor;
-
-/** GLES Compute kernel to multiply two input matrices "A" and "B" or to multiply a vector "A" by a matrix "B". All elements of the output matrix/vector will be multiplied by alpha
- *
- * @attention The second input tensor must have at least 2 dimensions (matrix)
- *
- */
-class GCGEMMMatrixMultiplyKernel : public IGCKernel
-{
-public:
- /** Default constructor */
- GCGEMMMatrixMultiplyKernel();
-
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- GCGEMMMatrixMultiplyKernel(const GCGEMMMatrixMultiplyKernel &) = delete;
-
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- GCGEMMMatrixMultiplyKernel &operator=(const GCGEMMMatrixMultiplyKernel &) = delete;
-
- /** Allow instances of this class to be moved */
- GCGEMMMatrixMultiplyKernel(GCGEMMMatrixMultiplyKernel &&) = default;
-
- /** Allow instances of this class to be moved */
- GCGEMMMatrixMultiplyKernel &operator=(GCGEMMMatrixMultiplyKernel &&) = default;
-
- /** Initialise the kernel's input, output and alpha
- *
- * @param[in] input0 Input tensor containing the interleaved Matrix A or the vector A. Data types supported: F16/F32
- * @param[in] input1 Input tensor containing the transposed Matrix B if the first input tensor A is not a vector.
- *                                       If the output tensor is a vector, input1 must contain the non-reshaped matrix B. Data type supported: same as @p input0
- * @param[out] output Output tensor to store the result of matrix multiplication. Data type supported: same as @p input0
- * @param[in] alpha Weight of the matrix product
- * @param[in] is_interleaved_transposed (Optional) True if input0 and input1 have been reshaped respectively using @ref GCGEMMInterleave4x4Kernel and @ref GCGEMMTranspose1xWKernel
- * @param[in] reshape_info (Optional) GEMM reshape info. If is_interleaved_transposed = true, this object must contain the information to understand how the matrix A and matrix B have been reshaped
- */
- void configure(const IGCTensor *input0, const IGCTensor *input1, IGCTensor *output, float alpha, bool is_interleaved_transposed = true, const GEMMReshapeInfo &reshape_info = GEMMReshapeInfo());
- /** Static function to check if given info will lead to a valid configuration of @ref GCGEMMMatrixMultiplyKernel
- *
- * @param[in] input0 Input tensor containing the Matrix A. Data types supported: F16/F32
- * @param[in] input1 Input tensor containing the Matrix B. Data type supported: same as @p input0
- * @param[in] output Output tensor to store the result of matrix multiplication. Data type supported: same as @p input0
- * @param[in] alpha Weight of the matrix product
- * @param[in] is_interleaved_transposed True if input0 and input1 have been reshaped respectively using @ref GCGEMMInterleave4x4Kernel and @ref GCGEMMTranspose1xWKernel
- * @param[in] reshape_info GEMM reshape info. If is_interleaved_transposed = true, this object must contain the information to understand how the matrix A and matrix B have been reshaped
- * @param[in] gpu_target GPU Target
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *input0, const ITensorInfo *input1, const ITensorInfo *output, float alpha, bool is_interleaved_transposed, const GEMMReshapeInfo &reshape_info,
- GPUTarget gpu_target);
-
- // Inherited methods overridden:
- void run(const Window &window) override;
-
-private:
- const IGCTensor *_input0;
- const IGCTensor *_input1;
- IGCTensor *_output;
-};
-}
-#endif /* ARM_COMPUTE_GCGEMMMATRIXMULTIPLYKERNEL_H */
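
A plain (non-reshaped) reference of the product the kernel computes, out = alpha * A * B; the actual kernel normally consumes A interleaved by GCGEMMInterleave4x4Kernel and B transposed by GCGEMMTranspose1xWKernel when is_interleaved_transposed is true. Row-major layout is an assumption of this sketch.

```cpp
#include <cstddef>
#include <vector>

// Reference sketch: (m x k) * (k x n) matrix product scaled by alpha,
// row-major, `out` pre-sized to m * n.
void gemm(const std::vector<float> &a, const std::vector<float> &b,
          std::vector<float> &out, size_t m, size_t k, size_t n, float alpha)
{
    for(size_t i = 0; i < m; ++i)
    {
        for(size_t j = 0; j < n; ++j)
        {
            float acc = 0.0f;
            for(size_t p = 0; p < k; ++p)
            {
                acc += a[i * k + p] * b[p * n + j];
            }
            out[i * n + j] = alpha * acc;
        }
    }
}
```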
diff --git a/arm_compute/core/GLES_COMPUTE/kernels/GCGEMMTranspose1xWKernel.h b/arm_compute/core/GLES_COMPUTE/kernels/GCGEMMTranspose1xWKernel.h
deleted file mode 100644
index 29a4c8d209..0000000000
--- a/arm_compute/core/GLES_COMPUTE/kernels/GCGEMMTranspose1xWKernel.h
+++ /dev/null
@@ -1,67 +0,0 @@
-/*
- * Copyright (c) 2017-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_GCGEMMTRANSPOSE1XWKERNEL_H
-#define ARM_COMPUTE_GCGEMMTRANSPOSE1XWKERNEL_H
-
-#include "arm_compute/core/GLES_COMPUTE/IGCSimple2DKernel.h"
-
-namespace arm_compute
-{
-class IGCTensor;
-
-/** OpenGL ES kernel which transposes the elements of a matrix in chunks of 1xW, where W is equal to (16 / element size of the tensor)
- *
- * The following is an example of how the 1xW transposition works when the input data type is F32
- *
- * @f[
- * \left( \begin{array}{cccc}
- * a00 & a01 & a02 & a03 \\
- * a10 & a11 & a12 & a13 \\
- * a20 & a21 & a22 & a23 \\
- * a30 & a31 & a32 & a33 \\
- * \end{array} \right)
- * \rightarrow
- * \left( \begin{array}{ccccccccccccccccc}
- * a00 & a01 & a02 & a03 & a10 & a11 & a12 & a13 & a20 & a21 & a22 & a23 & a30 & a31 & a32 & a33 \\
- * \end{array} \right)
- * @f]
- *
- * @note The output matrix will have the following shape: [ height * W, ceil(width / W) ], where W = (16 / element size of the tensor)
- *
- */
-class GCGEMMTranspose1xWKernel : public IGCSimple2DKernel
-{
-public:
- /** Initialise the kernel's input and output.
- *
- * @param[in] input Input tensor. Data types supported: F16, F32
- * @param[out] output Output tensor. Data type supported: same as @p input
- */
- void configure(const IGCTensor *input, IGCTensor *output);
-
- // Inherited methods overridden:
- void run(const Window &window) override;
-};
-}
-#endif /* ARM_COMPUTE_GCGEMMTRANSPOSE1XWKERNEL_H */
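
A CPU reference of the 1xW transposition above for F32, where W = 16 / sizeof(float) = 4: each chunk of W columns is laid out row after row, matching the a00 a01 a02 a03 a10 ... ordering in the formula. Width divisible by W is assumed; the real kernel pads the last chunk.

```cpp
#include <cstddef>
#include <vector>

// Reference sketch: 1xW transposition of a rows x cols F32 matrix, W = 4.
std::vector<float> transpose_1xW(const std::vector<float> &in, size_t rows, size_t cols)
{
    const size_t W = 16 / sizeof(float); // 4 elements per 16-byte chunk
    std::vector<float> out(rows * cols);
    size_t idx = 0;
    for(size_t c = 0; c < cols; c += W)  // each chunk of W columns...
    {
        for(size_t r = 0; r < rows; ++r) // ...is written out row after row
        {
            for(size_t w = 0; w < W; ++w)
            {
                out[idx++] = in[r * cols + c + w];
            }
        }
    }
    return out;
}
```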
diff --git a/arm_compute/core/GLES_COMPUTE/kernels/GCIm2ColKernel.h b/arm_compute/core/GLES_COMPUTE/kernels/GCIm2ColKernel.h
deleted file mode 100644
index 7d1a53c4c3..0000000000
--- a/arm_compute/core/GLES_COMPUTE/kernels/GCIm2ColKernel.h
+++ /dev/null
@@ -1,128 +0,0 @@
-/*
- * Copyright (c) 2017-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#ifndef ARM_COMPUTE_GCIM2COLKERNEL_H
-#define ARM_COMPUTE_GCIM2COLKERNEL_H
-
-#include "arm_compute/core/GLES_COMPUTE/IGCKernel.h"
-
-namespace arm_compute
-{
-class IGCTensor;
-class Size2D;
-
-/** Interface for the im2col reshape kernel.
- *
- * Rearranges image blocks into columns. It is used to strip out each convolution block to a single column.
- * It is used to transform a convolution to a plain matrix multiplication.
- *
- * For example taking into account the image below and assuming 3x3 image blocks with stride of 1 we have:
- * @f[
- * \left( \begin{array}{cccc}
- * a00 & a01 & a02 & a03 \\
- * a10 & a11 & a12 & a13 \\
- * a20 & a21 & a22 & a23 \\
- * a30 & a31 & a32 & a33 \\
- * \end{array} \right)
- * =
- * \left( \begin{array}{ccccccccc}
- * a00 & a01 & a02 & a10 & a11 & a12 & a20 & a21 & a22 \\
- * a01 & a02 & a03 & a11 & a12 & a13 & a21 & a22 & a23 \\
- * a10 & a11 & a12 & a20 & a21 & a22 & a30 & a31 & a32 \\
- * a11 & a12 & a13 & a21 & a22 & a23 & a31 & a32 & a33 \\
- * \end{array} \right)
- * @f]
- */
-class GCIm2ColKernel : public IGCKernel
-{
-public:
- /** Default constructor */
- GCIm2ColKernel();
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- GCIm2ColKernel(const GCIm2ColKernel &) = delete;
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- GCIm2ColKernel &operator=(const GCIm2ColKernel &) = delete;
- /** Allow instances of this class to be moved */
- GCIm2ColKernel(GCIm2ColKernel &&) = default;
- /** Allow instances of this class to be moved */
- GCIm2ColKernel &operator=(GCIm2ColKernel &&) = default;
- /** Set the input and output of the kernel.
- *
- * @param[in] input The input tensor to convert. 3 lower dimensions represent a single input [width, height, IFM],
- *                          while every optional dimension from 4 and above represents a batch of inputs. Data types supported: F16/F32
- * @param[out] output The output tensor. First 2 lower dimensions represent a transform of each 3D input,
- * while every dimension above represents a batch. Data types supported: Same as @p input
- * @param[in] kernel_dims The kernel dimensions (width and height).
- * @param[in] conv_info Contains padding and stride information described in @ref PadStrideInfo.
- * @param[in]  has_bias    In case biases are provided, expands the matrix with 1.
- * @param[in] dilation (Optional) Dilation, in elements, across x and y. Defaults to (1, 1).
- */
- void configure(const IGCTensor *input, IGCTensor *output, const Size2D &kernel_dims, const PadStrideInfo &conv_info, bool has_bias, const Size2D &dilation = Size2D(1U, 1U));
-
- // Inherited methods overridden:
- void run(const Window &window) override;
-
- /** Static function to check if given info will lead to a valid configuration of @ref GCIm2ColKernel
- *
- * @param[in] input The input tensor to convert. 3 lower dimensions represent a single input [width, height, IFM],
- *                        while every optional dimension from 4 and above represents a batch of inputs. Data types supported: F16/F32
- * @param[in] output The output tensor. First 2 lower dimensions represent a transform of each 3D input,
- * while every dimension above represents a batch. Data types supported: Same as @p input
- * @param[in] kernel_dims The kernel dimensions (width and height).
- * @param[in] conv_info Contains padding and stride information described in @ref PadStrideInfo.
- * @param[in] has_bias    In case biases are provided, expands the matrix with 1.
- * @param[in] dilation (Optional) Dilation, in elements, across x and y. Defaults to (1, 1).
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *input, const ITensorInfo *output, const Size2D &kernel_dims, const PadStrideInfo &conv_info, bool has_bias, const Size2D &dilation = Size2D(1U, 1U));
-
-private:
- /** Run the reshape kernel optimised for the special case (stride is 1, padding is 0 and the kernel's lower 3 dimensions are the same as the input's)
- *
- * @param[in] window Region on which to execute the kernel. (Must be a valid region of the window returned by window()).
- */
- void run_reduced(const Window &window);
- /** Run the generic convolution layer input reshape kernel
- *
- * @param[in] window Region on which to execute the kernel. (Must be a valid region of the window returned by window()).
- */
- void run_generic(const Window &window);
-
- /** Common signature for the kernel to run */
- using Im2ColFunction = void (GCIm2ColKernel::*)(const Window &);
-
-private:
- const IGCTensor *_input;
- IGCTensor *_output;
- std::pair<unsigned int, unsigned int> _convolved_dims;
- std::pair<unsigned int, unsigned int> _kernel_dims;
- unsigned int _num_elems_processed_per_iteration;
- Im2ColFunction _run_func;
-};
-}
-
-#endif /*ARM_COMPUTE_GCIM2COLKERNEL_H */
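
A single-channel CPU reference of the im2col rearrangement documented above (the kernel additionally handles IFM planes, the optional bias column and dilation): each convolution window is flattened into one row, which is what turns the convolution into a plain matrix multiplication. Names and the zero-padding behaviour are illustrative.

```cpp
#include <cstddef>
#include <vector>

// Reference sketch: im2col on one w x h plane with a kw x kh kernel.
std::vector<float> im2col(const std::vector<float> &img, int w, int h,
                          int kw, int kh, int stride, int pad)
{
    const int out_w = (w + 2 * pad - kw) / stride + 1;
    const int out_h = (h + 2 * pad - kh) / stride + 1;
    std::vector<float> cols;
    cols.reserve(static_cast<size_t>(out_w) * out_h * kw * kh);
    for(int oy = 0; oy < out_h; ++oy)
    {
        for(int ox = 0; ox < out_w; ++ox)      // one output row per window
        {
            for(int ky = 0; ky < kh; ++ky)
            {
                for(int kx = 0; kx < kw; ++kx) // flatten the kw x kh window
                {
                    const int x = ox * stride - pad + kx;
                    const int y = oy * stride - pad + ky;
                    const bool inside = x >= 0 && x < w && y >= 0 && y < h;
                    cols.push_back(inside ? img[y * w + x] : 0.0f);
                }
            }
        }
    }
    return cols;
}
```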
diff --git a/arm_compute/core/GLES_COMPUTE/kernels/GCNormalizationLayerKernel.h b/arm_compute/core/GLES_COMPUTE/kernels/GCNormalizationLayerKernel.h
deleted file mode 100644
index dd00caecfb..0000000000
--- a/arm_compute/core/GLES_COMPUTE/kernels/GCNormalizationLayerKernel.h
+++ /dev/null
@@ -1,72 +0,0 @@
-/*
- * Copyright (c) 2017-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_GCNORMALIZATIONLAYERKERNEL_H
-#define ARM_COMPUTE_GCNORMALIZATIONLAYERKERNEL_H
-
-#include "arm_compute/core/GLES_COMPUTE/IGCKernel.h"
-
-namespace arm_compute
-{
-class IGCTensor;
-
-/** Interface for the normalization layer kernel.
- */
-class GCNormalizationLayerKernel : public IGCKernel
-{
-public:
- /** Constructor */
- GCNormalizationLayerKernel();
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- GCNormalizationLayerKernel(const GCNormalizationLayerKernel &) = delete;
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- GCNormalizationLayerKernel &operator=(const GCNormalizationLayerKernel &) = delete;
- /** Default Move Constructor. */
- GCNormalizationLayerKernel(GCNormalizationLayerKernel &&) = default;
- /** Default move assignment operator */
- GCNormalizationLayerKernel &operator=(GCNormalizationLayerKernel &&) = default;
- /** Default destructor */
- ~GCNormalizationLayerKernel() = default;
- /** Set the input and output tensors.
- *
- * @param[in] input Source tensor. 3 lower dims represent a single input with dimensions [width, height, IFM],
- * and an optional 4th dimension for batch of inputs. Data types supported: F32.
- * @param[in]  squared_input Source tensor with each element squared. 3 lower dims represent a single input with dimensions [width, height, IFM].
- *                           Data types should match the input type.
- * @param[out] output Destination tensor. Output will have the same number of dimensions as input. Data types should match the input type.
- * @param[in] norm_info Normalization layer information like the normalization type, normalization size and other parameters.
- */
- void configure(const IGCTensor *input, const IGCTensor *squared_input, IGCTensor *output, NormalizationLayerInfo norm_info);
-
- // Inherited methods overridden:
- void run(const Window &window) override;
- BorderSize border_size() const override;
-
-private:
- const IGCTensor *_input;
- const IGCTensor *_squared_input;
- IGCTensor *_output;
- BorderSize _border_size;
-};
-}
-#endif /*ARM_COMPUTE_GCNORMALIZATIONLAYERKERNEL_H */
diff --git a/arm_compute/core/GLES_COMPUTE/kernels/GCNormalizePlanarYUVLayerKernel.h b/arm_compute/core/GLES_COMPUTE/kernels/GCNormalizePlanarYUVLayerKernel.h
deleted file mode 100644
index 5156da8b2c..0000000000
--- a/arm_compute/core/GLES_COMPUTE/kernels/GCNormalizePlanarYUVLayerKernel.h
+++ /dev/null
@@ -1,84 +0,0 @@
-/*
- * Copyright (c) 2017-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_GCNORMALIZEPLANARYUVLAYERKERNEL_H
-#define ARM_COMPUTE_GCNORMALIZEPLANARYUVLAYERKERNEL_H
-
-#include "arm_compute/core/GLES_COMPUTE/IGCKernel.h"
-
-namespace arm_compute
-{
-class IGCTensor;
-
-/** Interface for the NormalizePlanarYUV layer kernel.
- */
-class GCNormalizePlanarYUVLayerKernel : public IGCKernel
-{
-public:
- /** Constructor */
- GCNormalizePlanarYUVLayerKernel();
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- GCNormalizePlanarYUVLayerKernel(const GCNormalizePlanarYUVLayerKernel &) = delete;
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- GCNormalizePlanarYUVLayerKernel &operator=(const GCNormalizePlanarYUVLayerKernel &) = delete;
- /** Default Move Constructor. */
- GCNormalizePlanarYUVLayerKernel(GCNormalizePlanarYUVLayerKernel &&) = default;
- /** Default move assignment operator */
- GCNormalizePlanarYUVLayerKernel &operator=(GCNormalizePlanarYUVLayerKernel &&) = default;
- /** Default destructor */
- ~GCNormalizePlanarYUVLayerKernel() = default;
-
- /** Set the input and output tensors.
- *
- * @param[in] input Source tensor. 3 lower dimensions represent a single input with dimensions [width, height, channels].
- * Data types supported: F16.
- * @param[out] output Destination tensor. Data type supported: same as @p input
- * @param[in] mean Mean values tensor. 1 dimension with size equal to the number of input channels. Data types supported: same as @p input
- * @param[in]  std    Standard deviation values tensor. 1 dimension with size equal to the number of input channels.
- * Data types supported: same as @p input
- */
- void configure(const IGCTensor *input, IGCTensor *output, const IGCTensor *mean, const IGCTensor *std);
- /** Static function to check if given info will lead to a valid configuration of @ref GCNormalizePlanarYUVLayerKernel
- *
- * @param[in] input Source tensor info. 3 lower dimensions represent a single input with dimensions [width, height, channels].
- * Data types supported: F16.
- * @param[in]  output Destination tensor info. Data type supported: same as @p input
- * @param[in] mean Mean values tensor info. 1 dimension with size equal to the number of input channels. Data types supported: same as @p input
- * @param[in] std Standard deviation values tensor info. 1 dimension with size equal to the number of input channels.
- * Data types supported: same as @p input
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *input, const ITensorInfo *output, const ITensorInfo *mean, const ITensorInfo *std);
-
- // Inherited methods overridden:
- void run(const Window &window) override;
-
-private:
- const IGCTensor *_input;
- IGCTensor *_output;
- const IGCTensor *_mean;
- const IGCTensor *_std;
-};
-}
-#endif /*ARM_COMPUTE_GCNORMALIZEPLANARYUVLAYERKERNEL_H */
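
Per element the normalisation above is out = (in - mean[c]) / std[c], with one mean and one standard deviation per channel plane. A reference sketch assuming planar [width, height, channels] layout:

```cpp
#include <cstddef>
#include <vector>

// Reference sketch: per-channel normalisation of a planar image. `out` is
// assumed pre-sized to match `in`.
void normalize_planar_yuv(const std::vector<float> &in, std::vector<float> &out,
                          const std::vector<float> &mean, const std::vector<float> &std_dev,
                          size_t width, size_t height)
{
    const size_t plane = width * height;
    for(size_t c = 0; c < mean.size(); ++c)
    {
        for(size_t i = 0; i < plane; ++i)
        {
            out[c * plane + i] = (in[c * plane + i] - mean[c]) / std_dev[c];
        }
    }
}
```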
diff --git a/arm_compute/core/GLES_COMPUTE/kernels/GCPixelWiseMultiplicationKernel.h b/arm_compute/core/GLES_COMPUTE/kernels/GCPixelWiseMultiplicationKernel.h
deleted file mode 100644
index 0c4b656175..0000000000
--- a/arm_compute/core/GLES_COMPUTE/kernels/GCPixelWiseMultiplicationKernel.h
+++ /dev/null
@@ -1,70 +0,0 @@
-/*
- * Copyright (c) 2017-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_GCPIXELWISEMULTIPLICATIONKERNEL_H
-#define ARM_COMPUTE_GCPIXELWISEMULTIPLICATIONKERNEL_H
-
-#include "arm_compute/core/GLES_COMPUTE/IGCKernel.h"
-#include "arm_compute/core/Types.h"
-
-namespace arm_compute
-{
-class IGCTensor;
-
-/** Interface for the pixelwise multiplication kernel.
- *
- */
-class GCPixelWiseMultiplicationKernel : public IGCKernel
-{
-public:
- /** Default constructor.*/
- GCPixelWiseMultiplicationKernel();
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- GCPixelWiseMultiplicationKernel(const GCPixelWiseMultiplicationKernel &) = delete;
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- GCPixelWiseMultiplicationKernel &operator=(const GCPixelWiseMultiplicationKernel &) = delete;
- /** Allow instances of this class to be moved */
- GCPixelWiseMultiplicationKernel(GCPixelWiseMultiplicationKernel &&) = default;
- /** Allow instances of this class to be moved */
- GCPixelWiseMultiplicationKernel &operator=(GCPixelWiseMultiplicationKernel &&) = default;
- /** Initialise the kernel's input, output and border mode.
- *
- * @param[in] input1 An input tensor. Data types supported: F32.
- * @param[in] input2 An input tensor. Data types supported: same as @p input1.
- * @param[out] output The output tensor. Data types supported: same as @p input1.
- * @param[in] scale Scale to apply after multiplication.
- * Scale must be positive and its value must be either 1/255 or 1/2^n where n is between 0 and 15.
- */
- void configure(const IGCTensor *input1, const IGCTensor *input2, IGCTensor *output, float scale);
-
- // Inherited methods overridden:
- void run(const Window &window) override;
-
-private:
- const IGCTensor *_input1;
- const IGCTensor *_input2;
- IGCTensor *_output;
-};
-}
-
-#endif /*ARM_COMPUTE_GCPIXELWISEMULTIPLICATIONKERNEL_H */
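
The element-wise product itself is trivial; the interesting part is the documented scale constraint (1/255 or 1/2^n with 0 <= n <= 15), which a sketch can check explicitly. Function names and the boolean return are illustrative.

```cpp
#include <cstddef>
#include <vector>

// Reference sketch: out = in1 * in2 * scale after validating the scale value.
bool pixelwise_multiply(const std::vector<float> &in1, const std::vector<float> &in2,
                        std::vector<float> &out, float scale)
{
    bool valid = (scale == 1.f / 255.f);
    for(int n = 0; n <= 15 && !valid; ++n)
    {
        valid = (scale == 1.f / (1 << n)); // powers of two compare exactly
    }
    if(!valid)
    {
        return false; // the real kernel would reject this configuration
    }
    for(size_t i = 0; i < in1.size(); ++i)
    {
        out[i] = in1[i] * in2[i] * scale;
    }
    return true;
}
```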
diff --git a/arm_compute/core/GLES_COMPUTE/kernels/GCPoolingLayerKernel.h b/arm_compute/core/GLES_COMPUTE/kernels/GCPoolingLayerKernel.h
deleted file mode 100644
index 7a2fb84f34..0000000000
--- a/arm_compute/core/GLES_COMPUTE/kernels/GCPoolingLayerKernel.h
+++ /dev/null
@@ -1,85 +0,0 @@
-/*
- * Copyright (c) 2017-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_GCPOOLINGLAYERKERNEL_H
-#define ARM_COMPUTE_GCPOOLINGLAYERKERNEL_H
-
-#include "arm_compute/core/GLES_COMPUTE/IGCKernel.h"
-
-#include "arm_compute/core/Error.h"
-
-namespace arm_compute
-{
-class IGCTensor;
-
-/** Interface for the pooling layer kernel */
-class GCPoolingLayerKernel : public IGCKernel
-{
-public:
- /** Default constructor */
- GCPoolingLayerKernel();
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- GCPoolingLayerKernel(const GCPoolingLayerKernel &) = delete;
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- GCPoolingLayerKernel &operator=(const GCPoolingLayerKernel &) = delete;
- /** Allow instances of this class to be moved */
- GCPoolingLayerKernel(GCPoolingLayerKernel &&) = default;
- /** Allow instances of this class to be moved */
- GCPoolingLayerKernel &operator=(GCPoolingLayerKernel &&) = default;
- /** Default destructor */
- ~GCPoolingLayerKernel() = default;
-
- /** Set the input and output tensors.
- *
- * @param[in] input Source tensor. Data types supported: F16/F32.
- * @param[out] output Destination tensor. Data types supported: Same as @p input.
- * @param[in] pool_info Contains pooling operation information described in @ref PoolingLayerInfo.
- * @param[out] indices (optional) The indices of the maximal values. Data type supported: U32.
- */
- void configure(const IGCTensor *input, IGCTensor *output, const PoolingLayerInfo &pool_info, IGCTensor *indices = nullptr);
-
- /** Static function to check if given info will lead to a valid configuration of @ref GCPoolingLayerKernel
- *
- * @param[in] input Source tensor info. Data types supported: F16/F32.
- * @param[in] output Destination tensor info. Data types supported: Same as @p input.
- * @param[in] pool_info Contains pooling operation information described in @ref PoolingLayerInfo.
- * @param[in] indices (optional) The indices of the maximal values. Data type supported: U32.
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *input, const ITensorInfo *output, const PoolingLayerInfo &pool_info, const ITensorInfo *indices = nullptr);
-
- // Inherited methods overridden:
- void run(const Window &window) override;
- BorderSize border_size() const override;
-
-private:
- const IGCTensor *_input;
- IGCTensor *_output;
- IGCTensor *_indices;
- PoolingLayerInfo _pool_info;
- BorderSize _border_size;
- unsigned int _num_elems_processed_per_iteration;
-};
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_GCPOOLINGLAYERKERNEL_H */
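
A sketch of what the optional `indices` output means, using a fixed 2x2 max pool on a single plane with even dimensions (the real kernel takes its window, stride and padding from PoolingLayerInfo): each output element stores the window maximum, and `indices` records the flat position of that maximum in the input.

```cpp
#include <cstddef>
#include <cstdint>
#include <limits>
#include <vector>

// Reference sketch: 2x2 max pooling with max-value indices; `out` and
// `indices` are assumed pre-sized to (w/2) * (h/2).
void max_pool_2x2(const std::vector<float> &in, std::vector<float> &out,
                  std::vector<uint32_t> &indices, size_t w, size_t h)
{
    const size_t ow = w / 2, oh = h / 2;
    for(size_t oy = 0; oy < oh; ++oy)
    {
        for(size_t ox = 0; ox < ow; ++ox)
        {
            float    best     = -std::numeric_limits<float>::infinity();
            uint32_t best_idx = 0;
            for(size_t ky = 0; ky < 2; ++ky)
            {
                for(size_t kx = 0; kx < 2; ++kx)
                {
                    const size_t idx = (oy * 2 + ky) * w + (ox * 2 + kx);
                    if(in[idx] > best)
                    {
                        best     = in[idx];
                        best_idx = static_cast<uint32_t>(idx);
                    }
                }
            }
            out[oy * ow + ox]     = best;
            indices[oy * ow + ox] = best_idx;
        }
    }
}
```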
diff --git a/arm_compute/core/GLES_COMPUTE/kernels/GCScaleKernel.h b/arm_compute/core/GLES_COMPUTE/kernels/GCScaleKernel.h
deleted file mode 100644
index 754f15cbd8..0000000000
--- a/arm_compute/core/GLES_COMPUTE/kernels/GCScaleKernel.h
+++ /dev/null
@@ -1,54 +0,0 @@
-/*
- * Copyright (c) 2016-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_GCSCALEKERNEL_H
-#define ARM_COMPUTE_GCSCALEKERNEL_H
-
-#include "arm_compute/core/GLES_COMPUTE/IGCSimple3DKernel.h"
-#include "arm_compute/core/Types.h"
-
-namespace arm_compute
-{
-class IGCTensor;
-
-/** Interface for the scale kernel */
-class GCScaleKernel : public IGCSimple3DKernel
-{
-public:
- /** Initialise the kernel's inputs, output and interpolation policy
- *
- * @param[in] input Source tensor. Data types supported: F16
- * @param[out] output Destination tensor. Data types supported: Same as @p input
- * All but the lowest two dimensions must be the same size as in the input tensor, i.e. scaling is only performed within the XY-plane.
- * @param[in] policy Interpolation type to use
- * @param[in] border_undefined True if the border mode is undefined. False if it's replicate or constant.
- * @param[in] sampling_policy (Optional) Sampling policy used by the interpolation. Defaults to @ref SamplingPolicy::CENTER
- */
- void configure(const IGCTensor *input, IGCTensor *output, InterpolationPolicy policy, bool border_undefined, SamplingPolicy sampling_policy = SamplingPolicy::CENTER);
-
- // Inherited methods overridden:
- void run(const Window &window) override;
- BorderSize border_size() const override;
-};
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_GCSCALEKERNEL_H */
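
A sketch of XY-plane scaling under NEAREST_NEIGHBOR interpolation with a CENTER sampling policy, matching the interface above; BILINEAR would blend the four surrounding pixels instead. The exact coordinate mapping of the real kernel is not shown in this header, so the mapping below is an assumption.

```cpp
#include <algorithm>
#include <cstddef>
#include <vector>

// Reference sketch: nearest-neighbour resize of one plane; `out` pre-sized.
void scale_nearest(const std::vector<float> &in, size_t iw, size_t ih,
                   std::vector<float> &out, size_t ow, size_t oh)
{
    const float sx = static_cast<float>(iw) / ow;
    const float sy = static_cast<float>(ih) / oh;
    for(size_t y = 0; y < oh; ++y)
    {
        for(size_t x = 0; x < ow; ++x)
        {
            // CENTER policy: sample at the centre of each destination pixel.
            const size_t ix = std::min(iw - 1, static_cast<size_t>((x + 0.5f) * sx));
            const size_t iy = std::min(ih - 1, static_cast<size_t>((y + 0.5f) * sy));
            out[y * ow + x] = in[iy * iw + ix];
        }
    }
}
```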
diff --git a/arm_compute/core/GLES_COMPUTE/kernels/GCSoftmaxLayerKernel.h b/arm_compute/core/GLES_COMPUTE/kernels/GCSoftmaxLayerKernel.h
deleted file mode 100644
index 280efe11f8..0000000000
--- a/arm_compute/core/GLES_COMPUTE/kernels/GCSoftmaxLayerKernel.h
+++ /dev/null
@@ -1,109 +0,0 @@
-/*
- * Copyright (c) 2017-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_GCSOFTMAXLAYERKERNEL_H
-#define ARM_COMPUTE_GCSOFTMAXLAYERKERNEL_H
-
-#include "arm_compute/core/GLES_COMPUTE/IGCSimple3DKernel.h"
-
-namespace arm_compute
-{
-class IGCTensor;
-
-/** Interface for identifying the max value of 1D Logits */
-class GCLogits1DMaxKernel : public IGCSimple3DKernel
-{
-public:
- /** Set the input and output tensors.
- *
- * @param[in] input Source tensor. Data types supported: F16/F32
- * @param[out] output Destination tensor. Data types supported: same as @p input
- */
- void configure(const IGCTensor *input, IGCTensor *output);
-};
-
-/** Interface for shifting the logits values around the max value and exponentiating the result */
-class GCLogits1DShiftExpSumKernel : public IGCKernel
-{
-public:
- /** Default constructor */
- GCLogits1DShiftExpSumKernel();
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- GCLogits1DShiftExpSumKernel(const GCLogits1DShiftExpSumKernel &) = delete;
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- GCLogits1DShiftExpSumKernel &operator=(const GCLogits1DShiftExpSumKernel &) = delete;
- /** Allow instances of this class to be moved */
- GCLogits1DShiftExpSumKernel(GCLogits1DShiftExpSumKernel &&) = default;
- /** Allow instances of this class to be moved */
- GCLogits1DShiftExpSumKernel &operator=(GCLogits1DShiftExpSumKernel &&) = default;
- /** Set the input and output tensors.
- *
- * @param[in] input Source tensor. Data types supported: F16/F32
- * @param[in] max Max values tensor. Data types supported: same as @p input
- * @param[out] output Destination tensor. Data types supported: same as @p input
- * @param[out] sum Sum of 1D logits tensor. Data types supported: same as @p input
- */
- void configure(const IGCTensor *input, const IGCTensor *max, IGCTensor *output, IGCTensor *sum);
-
- // Inherited methods overridden:
- void run(const Window &window) override;
-
-private:
- const IGCTensor *_input;
- const IGCTensor *_max;
- IGCTensor *_output;
- IGCTensor *_sum;
-};
-
-/** Interface for calculating the final step of the Softmax Layer where each logit value is multiplied by the inverse of the sum of the logits. */
-class GCLogits1DNormKernel : public IGCKernel
-{
-public:
- /** Default constructor */
- GCLogits1DNormKernel();
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- GCLogits1DNormKernel(const GCLogits1DNormKernel &) = delete;
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- GCLogits1DNormKernel &operator=(const GCLogits1DNormKernel &) = delete;
- /** Allow instances of this class to be moved */
- GCLogits1DNormKernel(GCLogits1DNormKernel &&) = default;
- /** Allow instances of this class to be moved */
- GCLogits1DNormKernel &operator=(GCLogits1DNormKernel &&) = default;
- /** Set the input and output tensors.
- *
- * @param[in] input Source tensor. Data types supported: F16/F32
- * @param[in] sum Sum tensor. Dimensions should be dim(input)-1. Data types supported: same as @p input
- * @param[out] output Destination tensor. Data types supported: same as @p input
- */
- void configure(const IGCTensor *input, const IGCTensor *sum, IGCTensor *output);
-
- // Inherited methods overridden:
- void run(const Window &window) override;
-
-private:
- const IGCTensor *_input;
- const IGCTensor *_sum;
- IGCTensor *_output;
-};
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_GCSOFTMAXLAYERKERNEL_H */
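The three kernels removed above split a numerically stable softmax across GPU passes: find the row max, subtract it before exponentiating and summing, then scale by the reciprocal of the sum. A minimal scalar sketch of that math (illustrative only; softmax_1d is not a library function):

#include <algorithm>
#include <cmath>
#include <cstddef>
#include <vector>

// Scalar reference of the three GPU passes (max, shift+exp+sum, normalize).
std::vector<float> softmax_1d(const std::vector<float> &logits)
{
    // Pass 1 (GCLogits1DMaxKernel): row maximum, for numerical stability.
    const float max_val = *std::max_element(logits.begin(), logits.end());

    // Pass 2 (GCLogits1DShiftExpSumKernel): shift, exponentiate, accumulate the sum.
    std::vector<float> out(logits.size());
    float sum = 0.f;
    for(std::size_t i = 0; i < logits.size(); ++i)
    {
        out[i] = std::exp(logits[i] - max_val);
        sum += out[i];
    }

    // Pass 3 (GCLogits1DNormKernel): multiply each value by the inverse of the sum.
    const float inv_sum = 1.f / sum;
    for(float &v : out)
    {
        v *= inv_sum;
    }
    return out;
}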
diff --git a/arm_compute/core/GLES_COMPUTE/kernels/GCTensorShiftKernel.h b/arm_compute/core/GLES_COMPUTE/kernels/GCTensorShiftKernel.h
deleted file mode 100644
index 5243e54daf..0000000000
--- a/arm_compute/core/GLES_COMPUTE/kernels/GCTensorShiftKernel.h
+++ /dev/null
@@ -1,84 +0,0 @@
-/*
- * Copyright (c) 2017-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_GCTENSORSHIFTKERNEL_H
-#define ARM_COMPUTE_GCTENSORSHIFTKERNEL_H
-
-#include "arm_compute/core/GLES_COMPUTE/IGCKernel.h"
-#include "arm_compute/core/GLES_COMPUTE/OpenGLES.h"
-
-namespace arm_compute
-{
-class IGCTensor;
-/** Interface for the kernel to shift valid data on a tensor.
- *
- * For example, shifting 3x3 valid data with a padding of 1 to the right:
- * @f[
- * \left( \begin{array}{ccccc}
- * 0 & 0 & 0 & 0 & 0 \\
- * a00 & a01 & a02 & 0 & 0 \\
- * a10 & a11 & a12 & 0 & 0 \\
- * a20 & a21 & a22 & 0 & 0 \\
- * 0 & 0 & 0 & 0 & 0 \\
- * \end{array} \right)
- * =
- * \left( \begin{array}{ccccc}
- * 0 & 0 & 0 & 0 & 0 \\
- * 0 & a00 & a01 & a02 & 0 \\
- * 0 & a10 & a11 & a12 & 0 \\
- * 0 & a20 & a21 & a22 & 0 \\
- * 0 & 0 & 0 & 0 & 0 \\
- * \end{array} \right)
- * @f]
- */
-class GCTensorShiftKernel : public IGCKernel
-{
-public:
- /** Default constructor */
- GCTensorShiftKernel();
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- GCTensorShiftKernel(const GCTensorShiftKernel &) = delete;
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- GCTensorShiftKernel &operator=(const GCTensorShiftKernel &) = delete;
- /** Allow instances of this class to be moved */
- GCTensorShiftKernel(GCTensorShiftKernel &&) = default;
- /** Allow instances of this class to be moved */
- GCTensorShiftKernel &operator=(GCTensorShiftKernel &&) = default;
- /** Default destructor */
- ~GCTensorShiftKernel() = default;
- /** Set the input of the kernel.
- *
- * @param[in,out] input Source tensor. Data types supported: F16/F32
- */
- void configure(IGCTensor *input);
-
- // Inherited methods overridden:
- void run(const Window &window) override;
-
-private:
- IGCTensor *_input;
- gles::NDRange _lws;
- int _left_padding;
-};
-}
-#endif /*ARM_COMPUTE_GCTENSORSHIFTKERNEL_H */
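The removed GCTensorShiftKernel moves the valid region of each row to the right by the left padding, as the matrices above illustrate. A scalar sketch of that shift, assuming a row-major float image and a hypothetical shift_rows_right helper (not the GLES shader):

#include <vector>

// Shift the valid region of every row right by `left_padding` elements,
// zero-filling the vacated columns.
void shift_rows_right(std::vector<std::vector<float>> &image, int left_padding)
{
    for(auto &row : image)
    {
        for(int x = static_cast<int>(row.size()) - 1; x >= left_padding; --x)
        {
            row[x] = row[x - left_padding]; // copy from the unshifted position
        }
        for(int x = 0; x < left_padding; ++x)
        {
            row[x] = 0.f; // new left border
        }
    }
}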
diff --git a/arm_compute/core/GLES_COMPUTE/kernels/GCTransposeKernel.h b/arm_compute/core/GLES_COMPUTE/kernels/GCTransposeKernel.h
deleted file mode 100644
index a981ae6d1f..0000000000
--- a/arm_compute/core/GLES_COMPUTE/kernels/GCTransposeKernel.h
+++ /dev/null
@@ -1,52 +0,0 @@
-/*
- * Copyright (c) 2017-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_GCTRANSPOSEKERNEL_H
-#define ARM_COMPUTE_GCTRANSPOSEKERNEL_H
-
-#include "arm_compute/core/GLES_COMPUTE/IGCSimple2DKernel.h"
-
-namespace arm_compute
-{
-class IGCTensor;
-
-/** OpenGL ES kernel which transposes the elements of a matrix.
- *
- * [width, height, batch] -> [height, width, batch]
- *
- */
-class GCTransposeKernel : public IGCSimple2DKernel
-{
-public:
- /** Initialise the kernel's input and output.
- *
- * @param[in] input Input tensor. Data types supported: F16/F32
- * @param[out] output Output tensor. Data type supported: Same as @p input
- */
- void configure(const IGCTensor *input, IGCTensor *output);
-
- // Inherited methods overridden:
- void run(const Window &window) override;
-};
-}
-#endif /* ARM_COMPUTE_GCTRANSPOSEKERNEL_H */
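For reference, the [width, height, batch] -> [height, width, batch] mapping performed by the removed GCTransposeKernel amounts to the following scalar loop (a sketch under the stated layout assumption; transpose_whb is a hypothetical name, not library API):

#include <cstddef>
#include <vector>

// Transpose each batch independently: src is [w, h, b] row-major (x fastest),
// dst is [h, w, b].
std::vector<float> transpose_whb(const std::vector<float> &src, std::size_t w, std::size_t h, std::size_t b)
{
    std::vector<float> dst(src.size());
    for(std::size_t n = 0; n < b; ++n)
    {
        for(std::size_t y = 0; y < h; ++y)
        {
            for(std::size_t x = 0; x < w; ++x)
            {
                // Element (x, y) of the source becomes element (y, x) of the
                // destination, whose row width is h.
                dst[n * w * h + x * h + y] = src[n * w * h + y * w + x];
            }
        }
    }
    return dst;
}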
diff --git a/arm_compute/core/GLES_COMPUTE/kernels/GCWeightsReshapeKernel.h b/arm_compute/core/GLES_COMPUTE/kernels/GCWeightsReshapeKernel.h
deleted file mode 100644
index 134346b8da..0000000000
--- a/arm_compute/core/GLES_COMPUTE/kernels/GCWeightsReshapeKernel.h
+++ /dev/null
@@ -1,90 +0,0 @@
-/*
- * Copyright (c) 2017-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_GCWEIGHTSRESHAPEKERNEL_H
-#define ARM_COMPUTE_GCWEIGHTSRESHAPEKERNEL_H
-
-#include "arm_compute/core/GLES_COMPUTE/IGCKernel.h"
-
-namespace arm_compute
-{
-/** GLES Compute kernel to perform reshaping on the weights used by convolution and locally connected layers
- *
- * Rearranges each 3-dimensional kernel into a single row, leading to a matrix with linearized kernels.
- * In combination with @ref GCIm2ColKernel, it can transform a convolution into a matrix multiplication.
- *
- * For example, assuming a 3D weight kernel of 3x3 dimensions and a depth of 2, we have:
- * @f[
- * \left( \begin{array}{ccc}
- * a000 & a001 & a002 \\
- * a010 & a011 & a012 \\
- * a020 & a021 & a022 \\
- * \end{array} \right)
- * \left( \begin{array}{ccc}
- * a100 & a101 & a102 \\
- * a110 & a111 & a112 \\
- * a120 & a121 & a122 \\
- * \end{array} \right)
- * \rightarrow
- * \left( \begin{array}{ccccccccc}
- * a000 & a001 & a002 & a010 & a011 & a012 & a020 & a021 & a022 & a100 & a101 & a102 & a110 & a111 & a112 & a120 & a121 & a122 \\
- * \end{array} \right)
- * @f]
- */
-class GCWeightsReshapeKernel : public IGCKernel
-{
-public:
- /** Constructor.*/
- GCWeightsReshapeKernel();
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- GCWeightsReshapeKernel(const GCWeightsReshapeKernel &) = delete;
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- GCWeightsReshapeKernel &operator=(const GCWeightsReshapeKernel &) = delete;
- /** Allow instances of this class to be moved */
- GCWeightsReshapeKernel(GCWeightsReshapeKernel &&) = default;
- /** Allow instances of this class to be moved */
- GCWeightsReshapeKernel &operator=(GCWeightsReshapeKernel &&) = default;
- /** Default destructor */
- ~GCWeightsReshapeKernel() = default;
-
- /** Set the input and output of the kernel.
- *
- * @param[in] input The input tensor to convert. Weights are 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM] if shared,
- * and 5D tensor with dimensions [kernel_x, kernel_y, IFM, OFM, batches] if unshared. Data types supported: F16, F32
- * @param[in] biases The shared biases tensor to append. Bias is 1D tensor with dimensions [OFM] if shared and 2D tensor with
- * dimensions [OFM, batches] if unshared. Data types supported: Same as @p input
- * @warning Appending biases to weights reshaped matrix is not supported for quantized asymmetric types.
- * @param[out] output The output tensor. Should be a 2D Tensor. Data types supported: Same as @p input
- */
- void configure(const IGCTensor *input, const IGCTensor *biases, IGCTensor *output);
-
- // Inherited methods overridden:
- void run(const Window &window) override;
-
-private:
- const IGCTensor *_input;
- const IGCTensor *_biases;
- IGCTensor *_output;
-};
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_GCWEIGHTSRESHAPEKERNEL_H */
\ No newline at end of file
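The linearization documented above, with the optional bias appended as a trailing column, can be sketched in plain C++ as follows (illustrative only; reshape_weights and its layout assumption are not part of the library):

#include <cstddef>
#include <vector>

// Linearize [kernel_x, kernel_y, IFM, OFM] weights (x fastest) into one row
// per OFM, optionally appending the bias as a trailing column.
std::vector<std::vector<float>> reshape_weights(const std::vector<float> &w,
                                                std::size_t kx, std::size_t ky,
                                                std::size_t ifm, std::size_t ofm,
                                                const std::vector<float> *biases)
{
    std::vector<std::vector<float>> rows(ofm);
    for(std::size_t o = 0; o < ofm; ++o)
    {
        for(std::size_t c = 0; c < ifm; ++c)
        {
            for(std::size_t y = 0; y < ky; ++y)
            {
                for(std::size_t x = 0; x < kx; ++x)
                {
                    // Matches the documented order: a000 a001 a002 a010 ... a122
                    rows[o].push_back(w[((o * ifm + c) * ky + y) * kx + x]);
                }
            }
        }
        if(biases != nullptr)
        {
            rows[o].push_back((*biases)[o]); // appended bias column
        }
    }
    return rows;
}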
diff --git a/arm_compute/core/GPUTarget.h b/arm_compute/core/GPUTarget.h
index 4959ee5e8a..b107a52d9f 100644
--- a/arm_compute/core/GPUTarget.h
+++ b/arm_compute/core/GPUTarget.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018-2019 ARM Limited.
+ * Copyright (c) 2018-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -21,10 +21,10 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
-#ifndef ARM_COMPUTE_GPUTARGET_H
-#define ARM_COMPUTE_GPUTARGET_H
+#ifndef ACL_ARM_COMPUTE_CORE_GPUTARGET_H
+#define ACL_ARM_COMPUTE_CORE_GPUTARGET_H
-#include "arm_compute/core/Helpers.h"
+#include "support/Traits.h"
#include <string>
@@ -33,25 +33,38 @@ namespace arm_compute
/** Available GPU Targets */
enum class GPUTarget
{
- UNKNOWN = 0x101,
- GPU_ARCH_MASK = 0xF00,
- MIDGARD = 0x100,
- BIFROST = 0x200,
- VALHALL = 0x300,
- T600 = 0x110,
- T700 = 0x120,
- T800 = 0x130,
- G71 = 0x210,
- G72 = 0x220,
- G51 = 0x230,
- G51BIG = 0x231,
- G51LIT = 0x232,
- G52 = 0x240,
- G52LIT = 0x241,
- G76 = 0x250,
- G77 = 0x310,
- TBOX = 0x320,
- TODX = 0x330,
+ UNKNOWN = 0x101,
+ GPU_ARCH_MASK = 0xF00,
+ GPU_GENERATION_MASK = 0x0F0,
+ MIDGARD = 0x100,
+ BIFROST = 0x200,
+ VALHALL = 0x300,
+    FIFTHGEN            = 0x400,
+ T600 = 0x110,
+ T700 = 0x120,
+ T800 = 0x130,
+ G71 = 0x210,
+ G72 = 0x220,
+ G51 = 0x221,
+ G51BIG = 0x222,
+ G51LIT = 0x223,
+ G31 = 0x224,
+ G76 = 0x230,
+ G52 = 0x231,
+ G52LIT = 0x232,
+ G77 = 0x310,
+ G57 = 0x311,
+ G78 = 0x320,
+ G68 = 0x321,
+ G78AE = 0x330,
+ G710 = 0x340,
+ G610 = 0x341,
+ G510 = 0x342,
+ G310 = 0x343,
+ G715 = 0x350,
+ G615 = 0x351,
+ G720 = 0x410,
+    G620                = 0x411
};
/** Enable bitwise operations on GPUTarget enumerations */
@@ -104,4 +117,4 @@ inline bool gpu_target_is_in(GPUTarget target_to_check, GPUTarget target)
return target_to_check == target;
}
} // namespace arm_compute
-#endif /* ARM_COMPUTE_GPUTARGET_H */
+#endif // ACL_ARM_COMPUTE_CORE_GPUTARGET_H
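The widened GPUTarget encoding keeps the architecture in the 0xF00 nibble and introduces a per-architecture generation in the new GPU_GENERATION_MASK (0x0F0) nibble. A small sketch of how these masks decode a target (helper names are illustrative, not library API):

#include <cstdint>

// Architecture lives in the 0xF00 nibble, generation in the 0x0F0 nibble.
constexpr std::uint32_t arch_of(std::uint32_t target)
{
    return target & 0xF00; // MIDGARD / BIFROST / VALHALL / FIFTHGEN
}
constexpr std::uint32_t generation_of(std::uint32_t target)
{
    return target & 0x0F0; // position within an architecture
}

// G57 (0x311) is a Valhall part of the same generation as G77 (0x310),
// while G78 (0x320) belongs to the next Valhall generation.
static_assert(arch_of(0x311) == 0x300, "G57 decodes to VALHALL");
static_assert(generation_of(0x311) == generation_of(0x310), "G57 and G77 share a generation");
static_assert(generation_of(0x320) != generation_of(0x310), "G78 is a later generation");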
diff --git a/arm_compute/core/HOGInfo.h b/arm_compute/core/HOGInfo.h
deleted file mode 100644
index 3cc472b274..0000000000
--- a/arm_compute/core/HOGInfo.h
+++ /dev/null
@@ -1,152 +0,0 @@
-/*
- * Copyright (c) 2016-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_HOGINFO_H
-#define ARM_COMPUTE_HOGINFO_H
-
-#include "arm_compute/core/Size2D.h"
-#include "arm_compute/core/Types.h"
-
-#include <cstddef>
-
-namespace arm_compute
-{
-/** Store the HOG's metadata */
-class HOGInfo
-{
-public:
- /** Default constructor */
- HOGInfo();
- /** Default destructor */
- virtual ~HOGInfo() = default;
- /** Allow instances of this class to be copy constructed */
- HOGInfo(const HOGInfo &) = default;
- /** Allow instances of this class to be copied */
- HOGInfo &operator=(const HOGInfo &) = default;
- /** Allow instances of this class to be move constructed */
- HOGInfo(HOGInfo &&) = default;
- /** Allow instances of this class to be moved */
- HOGInfo &operator=(HOGInfo &&) = default;
- /** Constructor
- *
- * @param[in] cell_size Cell size in pixels
- * @param[in] block_size Block size in pixels. Must be a multiple of cell_size.
- * @param[in] detection_window_size Detection window size in pixels. Must be a multiple of block_size and block_stride.
- * @param[in] block_stride          Distance in pixels between 2 consecutive blocks along the x and y directions. Must be a multiple of cell_size
- * @param[in] num_bins Number of histogram bins for each cell
- * @param[in] normalization_type (Optional) Normalization type to use for each block
- * @param[in] l2_hyst_threshold (Optional) Threshold used for L2HYS_NORM normalization method
- * @param[in] phase_type (Optional) Type of @ref PhaseType
- */
- HOGInfo(const Size2D &cell_size, const Size2D &block_size, const Size2D &detection_window_size, const Size2D &block_stride, size_t num_bins,
- HOGNormType normalization_type = HOGNormType::L2HYS_NORM, float l2_hyst_threshold = 0.2f, PhaseType phase_type = PhaseType::UNSIGNED);
- /** Initialize the metadata structure with the given parameters
- *
- * @param[in] cell_size Cell size in pixels
- * @param[in] block_size Block size in pixels. Must be a multiple of cell_size.
- * @param[in] detection_window_size Detection window size in pixels. Must be a multiple of block_size and block_stride.
- * @param[in] block_stride          Distance in pixels between 2 consecutive blocks along the x and y directions. Must be a multiple of cell_size
- * @param[in] num_bins Number of histogram bins for each cell
- * @param[in] normalization_type (Optional) Normalization type to use for each block
- * @param[in] l2_hyst_threshold (Optional) Threshold used for L2HYS_NORM normalization method
- * @param[in] phase_type (Optional) Type of @ref PhaseType
- */
- void init(const Size2D &cell_size, const Size2D &block_size, const Size2D &detection_window_size, const Size2D &block_stride, size_t num_bins,
- HOGNormType normalization_type = HOGNormType::L2HYS_NORM, float l2_hyst_threshold = 0.2f, PhaseType phase_type = PhaseType::UNSIGNED);
- /** The cell size in pixels
- *
- * @return The cell size in pixels
- */
- const Size2D &cell_size() const;
- /** The block size in pixels
- *
- * @return The block size in pixels
- */
- const Size2D &block_size() const;
- /** The detection window size in pixels
- *
- * @return The detection window size in pixels
- */
- const Size2D &detection_window_size() const;
- /** The block stride in pixels. The block stride is the distance between 2 consecutive blocks
- *
- * @return The block stride in pixels
- */
- const Size2D &block_stride() const;
- /** The number of histogram bins for each cell
- *
- * @return The number of histogram bins for each cell
- */
- size_t num_bins() const;
- /** The normalization type
- *
- * @return The normalization type
- */
- HOGNormType normalization_type() const;
- /** Threshold used for L2HYS_NORM normalization type
- *
- * @return Threshold used for L2HYS_NORM normalization type
- */
- float l2_hyst_threshold() const;
- /** The type of @ref PhaseType
- *
- * @return The type of @ref PhaseType
- */
- PhaseType phase_type() const;
- /** The size of HOG descriptor
- *
- * @return The size of HOG descriptor
- */
- size_t descriptor_size() const;
- /** Calculates the number of cells for each block
- *
- * @return The Size2D data object which stores the number of cells along the x and y directions
- */
- Size2D num_cells_per_block() const;
-
- /** Calculates the number of cells per block stride
- *
- * @return The Size2D data object which stores the number of cells per block stride along the x and y directions
- */
- Size2D num_cells_per_block_stride() const;
- /** Calculates the number of block positions for the given image size
- *
- * @param[in] image_size The input image size data object
- *
- * @return The Size2D data object which stores the number of block positions along the x and y directions
- */
- Size2D num_block_positions_per_image(const Size2D &image_size) const;
-
-private:
- Size2D _cell_size;
- Size2D _block_size;
- Size2D _detection_window_size;
- Size2D _block_stride;
- size_t _num_bins;
- HOGNormType _normalization_type;
- float _l2_hyst_threshold;
- PhaseType _phase_type;
- size_t _descriptor_size;
-};
-}
-#endif /*ARM_COMPUTE_HOGINFO_H */
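For context on the removed HOGInfo class: descriptor_size() ties the metadata together, since the descriptor holds one num_bins histogram per cell, for every cell in every block position inside the detection window. A rough sketch of that computation (illustrative; the exact formula lived in the removed implementation):

#include <cstddef>

struct SizeXY
{
    std::size_t w;
    std::size_t h;
};

// Descriptor length = block positions in the window * cells per block * bins.
std::size_t hog_descriptor_size(SizeXY window, SizeXY block, SizeXY block_stride,
                                SizeXY cell, std::size_t num_bins)
{
    const std::size_t blocks_x = (window.w - block.w) / block_stride.w + 1;
    const std::size_t blocks_y = (window.h - block.h) / block_stride.h + 1;
    const std::size_t cells_x  = block.w / cell.w;
    const std::size_t cells_y  = block.h / cell.h;
    return blocks_x * blocks_y * cells_x * cells_y * num_bins;
}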
diff --git a/arm_compute/core/Helpers.h b/arm_compute/core/Helpers.h
index 09c672ecfa..960201510a 100644
--- a/arm_compute/core/Helpers.h
+++ b/arm_compute/core/Helpers.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2016-2020 ARM Limited.
+ * Copyright (c) 2016-2021, 2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -24,23 +24,17 @@
#ifndef ARM_COMPUTE_HELPERS_H
#define ARM_COMPUTE_HELPERS_H
-#include "arm_compute/core/Coordinates.h"
#include "arm_compute/core/Error.h"
#include "arm_compute/core/IAccessWindow.h"
-#include "arm_compute/core/Steps.h"
-#include "arm_compute/core/Strides.h"
-#include "arm_compute/core/TensorShape.h"
+#include "arm_compute/core/ITensor.h"
#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Validate.h"
#include "arm_compute/core/Window.h"
-#include "support/MemorySupport.h"
#include <array>
#include <cstddef>
#include <cstdint>
-#include <memory>
#include <tuple>
-#include <type_traits>
-#include <utility>
namespace arm_compute
{
@@ -48,307 +42,6 @@ class IKernel;
class ITensor;
class ITensorInfo;
-/** Disable bitwise operations by default */
-template <typename T>
-struct enable_bitwise_ops
-{
- static constexpr bool value = false; /**< Disabled */
-};
-
-#ifndef DOXYGEN_SKIP_THIS
-template <typename T>
-typename std::enable_if<enable_bitwise_ops<T>::value, T>::type operator&(T lhs, T rhs)
-{
- using underlying_type = typename std::underlying_type<T>::type;
- return static_cast<T>(static_cast<underlying_type>(lhs) & static_cast<underlying_type>(rhs));
-}
-#endif /* DOXYGEN_SKIP_THIS */
-
-/** Helper function to create and return a unique_ptr pointing to a CL/GLES kernel object.
- * It also calls the kernel's configuration.
- *
- * @param[in] args All the arguments that need to be passed to the kernel's configuration.
- *
- * @return A unique pointer pointing to a CL/GLES kernel object
- */
-template <typename Kernel, typename... T>
-std::unique_ptr<Kernel> create_configure_kernel(T &&... args)
-{
- std::unique_ptr<Kernel> k = arm_compute::support::cpp14::make_unique<Kernel>();
- k->configure(std::forward<T>(args)...);
- return k;
-}
-
-/** Helper function to create and return a unique_ptr pointing to a CL/GLES kernel object
- *
- * @return A unique pointer pointing to a Kernel object
- */
-template <typename Kernel>
-std::unique_ptr<Kernel> create_kernel()
-{
- std::unique_ptr<Kernel> k = arm_compute::support::cpp14::make_unique<Kernel>();
- return k;
-}
-
-namespace traits
-{
-/** Check if a type T is contained in a tuple Tuple of types */
-template <typename T, typename Tuple>
-struct is_contained;
-
-template <typename T>
-struct is_contained<T, std::tuple<>> : std::false_type
-{
-};
-
-template <typename T, typename... Ts>
-struct is_contained<T, std::tuple<T, Ts...>> : std::true_type
-{
-};
-
-template <typename T, typename U, typename... Ts>
-struct is_contained<T, std::tuple<U, Ts...>> : is_contained<T, std::tuple<Ts...>>
-{
-};
-}
-
-/** Computes bilinear interpolation using the pointer to the top-left pixel and the pixel's distance between
- * the real coordinates and the smallest following integer coordinates. Input must be in single channel format.
- *
- * @param[in] pixel_ptr Pointer to the top-left pixel value of a single channel input.
- * @param[in] stride Stride to access the bottom-left and bottom-right pixel values
- * @param[in] dx Pixel's distance between the X real coordinate and the smallest X following integer
- * @param[in] dy Pixel's distance between the Y real coordinate and the smallest Y following integer
- *
- * @note dx and dy must be in the range [0, 1.0]
- *
- * @return The bilinear interpolated pixel value
- */
-template <typename T>
-inline T delta_bilinear_c1(const T *pixel_ptr, size_t stride, float dx, float dy)
-{
- ARM_COMPUTE_ERROR_ON(pixel_ptr == nullptr);
-
- const float dx1 = 1.0f - dx;
- const float dy1 = 1.0f - dy;
-
- const T a00 = *pixel_ptr;
- const T a01 = *(pixel_ptr + 1);
- const T a10 = *(pixel_ptr + stride);
- const T a11 = *(pixel_ptr + stride + 1);
-
- const float w1 = dx1 * dy1;
- const float w2 = dx * dy1;
- const float w3 = dx1 * dy;
- const float w4 = dx * dy;
-
- return static_cast<T>(a00 * w1 + a01 * w2 + a10 * w3 + a11 * w4);
-}
-
-/** Computes bilinear interpolation for quantized input and output, using the pointer to the top-left pixel and the pixel's distance between
- * the real coordinates and the smallest following integer coordinates. Input must be QASYMM8 and in single channel format.
- *
- * @param[in] pixel_ptr Pointer to the top-left pixel value of a single channel input.
- * @param[in] stride Stride to access the bottom-left and bottom-right pixel values
- * @param[in] dx Pixel's distance between the X real coordinate and the smallest X following integer
- * @param[in] dy Pixel's distance between the Y real coordinate and the smallest Y following integer
- * @param[in] iq_info Input QuantizationInfo
- * @param[in] oq_info Output QuantizationInfo
- *
- * @note dx and dy must be in the range [0, 1.0]
- *
- * @return The bilinear interpolated pixel value
- */
-inline uint8_t delta_bilinear_c1_quantized(const uint8_t *pixel_ptr, size_t stride, float dx, float dy, UniformQuantizationInfo iq_info, UniformQuantizationInfo oq_info)
-{
- ARM_COMPUTE_ERROR_ON(pixel_ptr == nullptr);
-
- const float dx1 = 1.0f - dx;
- const float dy1 = 1.0f - dy;
-
- const float a00 = dequantize_qasymm8(*pixel_ptr, iq_info);
- const float a01 = dequantize_qasymm8(*(pixel_ptr + 1), iq_info);
- const float a10 = dequantize_qasymm8(*(pixel_ptr + stride), iq_info);
- const float a11 = dequantize_qasymm8(*(pixel_ptr + stride + 1), iq_info);
-
- const float w1 = dx1 * dy1;
- const float w2 = dx * dy1;
- const float w3 = dx1 * dy;
- const float w4 = dx * dy;
- float res = a00 * w1 + a01 * w2 + a10 * w3 + a11 * w4;
- return static_cast<uint8_t>(quantize_qasymm8(res, oq_info));
-}
-
-/** Computes bilinear interpolation for quantized input and output, using the pointer to the top-left pixel and the pixel's distance between
- * the real coordinates and the smallest following integer coordinates. Input must be QASYMM8_SIGNED and in single channel format.
- *
- * @param[in] pixel_ptr Pointer to the top-left pixel value of a single channel input.
- * @param[in] stride Stride to access the bottom-left and bottom-right pixel values
- * @param[in] dx Pixel's distance between the X real coordinate and the smallest X following integer
- * @param[in] dy Pixel's distance between the Y real coordinate and the smallest Y following integer
- * @param[in] iq_info Input QuantizationInfo
- * @param[in] oq_info Output QuantizationInfo
- *
- * @note dx and dy must be in the range [0, 1.0]
- *
- * @return The bilinear interpolated pixel value
- */
-inline int8_t delta_bilinear_c1_quantized(const int8_t *pixel_ptr, size_t stride, float dx, float dy, UniformQuantizationInfo iq_info, UniformQuantizationInfo oq_info)
-{
- ARM_COMPUTE_ERROR_ON(pixel_ptr == nullptr);
-
- const float dx1 = 1.0f - dx;
- const float dy1 = 1.0f - dy;
-
- const float a00 = dequantize_qasymm8_signed(*pixel_ptr, iq_info);
- const float a01 = dequantize_qasymm8_signed(*(pixel_ptr + 1), iq_info);
- const float a10 = dequantize_qasymm8_signed(*(pixel_ptr + stride), iq_info);
- const float a11 = dequantize_qasymm8_signed(*(pixel_ptr + stride + 1), iq_info);
-
- const float w1 = dx1 * dy1;
- const float w2 = dx * dy1;
- const float w3 = dx1 * dy;
- const float w4 = dx * dy;
- float res = a00 * w1 + a01 * w2 + a10 * w3 + a11 * w4;
- return static_cast<int8_t>(quantize_qasymm8_signed(res, oq_info));
-}
-
-/** Computes linear interpolation using the pointer to the top pixel and the pixel's distance between
- * the real coordinates and the smallest following integer coordinates. Input must be in single channel format.
- *
- * @param[in] pixel_ptr Pointer to the top pixel value of a single channel input.
- * @param[in] stride Stride to access the bottom pixel value
- * @param[in] dy Pixel's distance between the Y real coordinate and the smallest Y following integer
- *
- * @note dy must be in the range [0, 1.0]
- *
- * @return The linear interpolated pixel value
- */
-template <typename T>
-inline T delta_linear_c1_y(const T *pixel_ptr, size_t stride, float dy)
-{
- ARM_COMPUTE_ERROR_ON(pixel_ptr == nullptr);
-
- const float dy1 = 1.0f - dy;
-
- const T a00 = *pixel_ptr;
- const T a10 = *(pixel_ptr + stride);
-
- const float w1 = dy1;
- const float w3 = dy;
-
- return static_cast<T>(a00 * w1 + a10 * w3);
-}
-/** Computes linear interpolation using the pointer to the left pixel and the pixel's distance between
- * the real coordinates and the smallest following integer coordinates. Input must be in single channel format.
- *
- * @param[in] pixel_ptr Pointer to the left pixel value of a single channel input.
- * @param[in] dx Pixel's distance between the X real coordinate and the smallest X following integer
- *
- * @note dx must be in the range [0, 1.0]
- *
- * @return The linear interpolated pixel value
- */
-template <typename T>
-inline T delta_linear_c1_x(const T *pixel_ptr, float dx)
-{
- ARM_COMPUTE_ERROR_ON(pixel_ptr == nullptr);
-
- const T a00 = *pixel_ptr;
- const T a01 = *(pixel_ptr + 1);
-
- const float dx1 = 1.0f - dx;
-
- const float w1 = dx1;
- const float w2 = dx;
-
- return static_cast<T>(a00 * w1 + a01 * w2);
-}
-/** Return the pixel at (x,y) using bilinear interpolation.
- *
- * @warning Only works if the iterator was created with an IImage
- *
- * @param[in] first_pixel_ptr Pointer to the first pixel of a single channel input.
- * @param[in] stride Stride in bytes of the image;
- * @param[in] x X position of the wanted pixel
- * @param[in] y Y position of the wanted pixel
- *
- * @return The pixel at (x, y) using bilinear interpolation.
- */
-template <typename T>
-inline T pixel_bilinear_c1(const T *first_pixel_ptr, size_t stride, float x, float y)
-{
- ARM_COMPUTE_ERROR_ON(first_pixel_ptr == nullptr);
-
- const int32_t xi = std::floor(x);
- const int32_t yi = std::floor(y);
-
- const float dx = x - xi;
- const float dy = y - yi;
-
- return delta_bilinear_c1(first_pixel_ptr + xi + yi * stride, stride, dx, dy);
-}
-
-/** Return the pixel at (x,y) using bilinear interpolation by clamping when out of borders. The image must be single channel input
- *
- * @warning Only works if the iterator was created with an IImage
- *
- * @param[in] first_pixel_ptr Pointer to the first pixel of a single channel image.
- * @param[in] stride Stride in bytes of the image
- * @param[in] width Width of the image
- * @param[in] height Height of the image
- * @param[in] x X position of the wanted pixel
- * @param[in] y Y position of the wanted pixel
- *
- * @return The pixel at (x, y) using bilinear interpolation.
- */
-template <typename T>
-inline uint8_t pixel_bilinear_c1_clamp(const T *first_pixel_ptr, size_t stride, size_t width, size_t height, float x, float y)
-{
- ARM_COMPUTE_ERROR_ON(first_pixel_ptr == nullptr);
-
- x = std::max(-1.f, std::min(x, static_cast<float>(width)));
- y = std::max(-1.f, std::min(y, static_cast<float>(height)));
-
- const float xi = std::floor(x);
- const float yi = std::floor(y);
-
- const float dx = x - xi;
- const float dy = y - yi;
-
- if(dx == 0.0f)
- {
- if(dy == 0.0f)
- {
- return static_cast<T>(first_pixel_ptr[static_cast<int32_t>(xi) + static_cast<int32_t>(yi) * stride]);
- }
- return delta_linear_c1_y(first_pixel_ptr + static_cast<int32_t>(xi) + static_cast<int32_t>(yi) * stride, stride, dy);
- }
- if(dy == 0.0f)
- {
- return delta_linear_c1_x(first_pixel_ptr + static_cast<int32_t>(xi) + static_cast<int32_t>(yi) * stride, dx);
- }
- return delta_bilinear_c1(first_pixel_ptr + static_cast<int32_t>(xi) + static_cast<int32_t>(yi) * stride, stride, dx, dy);
-}
-
-/** Return the pixel at (x,y) using area interpolation by clamping when out of borders. The image must be single channel U8
- *
- * @note The interpolation area depends on the width and height ratio of the input and output images
- * @note Currently average of the contributing pixels is calculated
- *
- * @param[in] first_pixel_ptr Pointer to the first pixel of a single channel U8 image.
- * @param[in] stride Stride in bytes of the image
- * @param[in] width Width of the image
- * @param[in] height Height of the image
- * @param[in] wr              Width ratio between the input image width and the output image width.
- * @param[in] hr              Height ratio between the input image height and the output image height.
- * @param[in] x X position of the wanted pixel
- * @param[in] y Y position of the wanted pixel
- *
- * @return The pixel at (x, y) using area interpolation.
- */
-inline uint8_t pixel_area_c1u8_clamp(const uint8_t *first_pixel_ptr, size_t stride, size_t width, size_t height, float wr, float hr, int x, int y);
-
/** Iterator updated by @ref execute_window_loop for each window element */
class Iterator
{
@@ -362,6 +55,16 @@ public:
*/
Iterator(const ITensor *tensor, const Window &window);
+ /** Create a container iterator for the tensor with the specified number of dimensions, stride, buffer pointer and window.
+ *
+ * @param[in] num_dims The number of dimensions.
+ * @param[in] strides The strides in bytes.
+ * @param[in] buffer The data buffer.
+ * @param[in] offset The offset in bytes from the beginning of the buffer to the first element of the tensor.
+ * @param[in] window The window which will be used to iterate over the tensor.
+ */
+ Iterator(size_t num_dims, const Strides &strides, uint8_t *buffer, size_t offset, const Window &window);
+
/** Increment the iterator along the specified dimension of the step value associated to the dimension.
*
* @warning It is the caller's responsibility to call increment(dimension+1) when reaching the end of a dimension, the iterator will not check for overflow.
@@ -376,7 +79,7 @@ public:
*
* @return The current position of the iterator in bytes relative to the first element.
*/
- constexpr int offset() const;
+ constexpr size_t offset() const;
/** Return a pointer to the current pixel.
*
@@ -393,18 +96,27 @@ public:
void reset(size_t dimension);
private:
+ /** Initialize a container iterator for the tensor with the specified number of dimensions, stride, buffer pointer and window.
+ *
+ * @param[in] num_dims The number of dimensions.
+ * @param[in] strides The strides in bytes.
+ * @param[in] buffer The data buffer.
+ * @param[in] offset The offset in bytes from the beginning of the buffer to the first element of the tensor.
+ * @param[in] window The window which will be used to iterate over the tensor.
+ */
+ void initialize(size_t num_dims, const Strides &strides, uint8_t *buffer, size_t offset, const Window &window);
+
uint8_t *_ptr;
class Dimension
{
public:
- constexpr Dimension()
- : _dim_start(0), _stride(0)
+ constexpr Dimension() : _dim_start(0), _stride(0)
{
}
- int _dim_start;
- int _stride;
+ size_t _dim_start;
+ size_t _stride;
};
std::array<Dimension, Coordinates::num_max_dimensions> _dims;
@@ -419,180 +131,7 @@ private:
* @param[in,out] iterators Tensor iterators which will be updated by this function before calling lambda_function.
*/
template <typename L, typename... Ts>
-inline void execute_window_loop(const Window &w, L &&lambda_function, Ts &&... iterators);
-
-/** Update window and padding size for each of the access patterns.
- *
- * First the window size is reduced based on all access patterns that are not
- * allowed to modify the padding of the underlying tensor. Then the padding of
- * the remaining tensors is increased to match the window.
- *
- * @param[in] win Window that is used by the kernel.
- * @param[in] patterns Access patterns used to calculate the final window and padding.
- *
- * @return True if the window has been changed. Changes to the padding do not
- * influence the returned value.
- */
-template <typename... Ts>
-bool update_window_and_padding(Window &win, Ts &&... patterns)
-{
- bool window_changed = false;
-
- utility::for_each([&](const IAccessWindow & w)
- {
- window_changed |= w.update_window_if_needed(win);
- },
- patterns...);
-
- bool padding_changed = false;
-
- utility::for_each([&](IAccessWindow & w)
- {
- padding_changed |= w.update_padding_if_needed(win);
- },
- patterns...);
-
- return window_changed;
-}
-
-/** Calculate the maximum window for a given tensor shape and border setting
- *
- * @param[in] valid_region Valid region object defining the shape of the tensor space for which the window is created.
- * @param[in] steps (Optional) Number of elements processed for each step.
- * @param[in] skip_border (Optional) If true exclude the border region from the window.
- * @param[in] border_size (Optional) Border size.
- *
- * @return The maximum window the kernel can be executed on.
- */
-Window calculate_max_window(const ValidRegion &valid_region, const Steps &steps = Steps(), bool skip_border = false, BorderSize border_size = BorderSize());
-
-/** Calculate the maximum window for a given tensor shape and border setting
- *
- * @param[in] info Tensor info object defining the shape of the object for which the window is created.
- * @param[in] steps (Optional) Number of elements processed for each step.
- * @param[in] skip_border (Optional) If true exclude the border region from the window.
- * @param[in] border_size (Optional) Border size.
- *
- * @return The maximum window the kernel can be executed on.
- */
-inline Window calculate_max_window(const ITensorInfo &info, const Steps &steps = Steps(), bool skip_border = false, BorderSize border_size = BorderSize())
-{
- return calculate_max_window(info.valid_region(), steps, skip_border, border_size);
-}
-
-/** Calculate the maximum window used by a horizontal kernel for a given tensor shape and border setting
- *
- * @param[in] valid_region Valid region object defining the shape of the tensor space for which the window is created.
- * @param[in] steps (Optional) Number of elements processed for each step.
- * @param[in] skip_border (Optional) If true exclude the border region from the window.
- * @param[in] border_size (Optional) Border size. The border region will be excluded from the window.
- *
- * @return The maximum window the kernel can be executed on.
- */
-Window calculate_max_window_horizontal(const ValidRegion &valid_region, const Steps &steps = Steps(), bool skip_border = false, BorderSize border_size = BorderSize());
-
-/** Calculate the maximum window used by a horizontal kernel for a given tensor shape and border setting
- *
- * @param[in] info Tensor info object defining the shape of the object for which the window is created.
- * @param[in] steps (Optional) Number of elements processed for each step.
- * @param[in] skip_border (Optional) If true exclude the border region from the window.
- * @param[in] border_size (Optional) Border size.
- *
- * @return The maximum window the kernel can be executed on.
- */
-inline Window calculate_max_window_horizontal(const ITensorInfo &info, const Steps &steps = Steps(), bool skip_border = false, BorderSize border_size = BorderSize())
-{
- return calculate_max_window_horizontal(info.valid_region(), steps, skip_border, border_size);
-}
-
-/** Calculate the maximum window for a given tensor shape and border setting. The window will also include the border.
- *
- * @param[in] valid_region Valid region object defining the shape of the tensor space for which the window is created.
- * @param[in] steps (Optional) Number of elements processed for each step.
- * @param[in] border_size (Optional) Border size. The border region will be included in the window.
- *
- * @return The maximum window the kernel can be executed on.
- */
-Window calculate_max_enlarged_window(const ValidRegion &valid_region, const Steps &steps = Steps(), BorderSize border_size = BorderSize());
-
-/** Calculate the maximum window for a given tensor shape and border setting. The window will also include the border.
- *
- * @param[in] info Tensor info object defining the shape of the object for which the window is created.
- * @param[in] steps (Optional) Number of elements processed for each step.
- * @param[in] border_size (Optional) Border size. The border region will be included in the window.
- *
- * @return The maximum window the kernel can be executed on.
- */
-inline Window calculate_max_enlarged_window(const ITensorInfo &info, const Steps &steps = Steps(), BorderSize border_size = BorderSize())
-{
- return calculate_max_enlarged_window(info.valid_region(), steps, border_size);
-}
-
-/** Intersect multiple valid regions.
- *
- * @param[in] regions Valid regions.
- *
- * @return Intersection of all regions.
- */
-template <typename... Ts>
-ValidRegion intersect_valid_regions(const Ts &... regions)
-{
- auto intersect = [](const ValidRegion & r1, const ValidRegion & r2) -> ValidRegion
- {
- ValidRegion region;
-
- for(size_t d = 0; d < std::min(r1.anchor.num_dimensions(), r2.anchor.num_dimensions()); ++d)
- {
- region.anchor.set(d, std::max(r1.anchor[d], r2.anchor[d]));
- }
-
- for(size_t d = 0; d < std::min(r1.shape.num_dimensions(), r2.shape.num_dimensions()); ++d)
- {
- region.shape.set(d, std::min(r1.shape[d], r2.shape[d]));
- }
-
- return region;
- };
-
- return utility::foldl(intersect, regions...);
-}
-
-/** Create a strides object based on the provided strides and the tensor dimensions.
- *
- * @param[in] info Tensor info object providing the shape of the tensor for unspecified strides.
- * @param[in] stride_x Stride to be used in X dimension (in bytes).
- * @param[in] fixed_strides Strides to be used in higher dimensions starting at Y (in bytes).
- *
- * @return Strides object based on the specified strides. Missing strides are
- * calculated based on the tensor shape and the strides of lower dimensions.
- */
-template <typename T, typename... Ts>
-inline Strides compute_strides(const ITensorInfo &info, T stride_x, Ts &&... fixed_strides)
-{
- const TensorShape &shape = info.tensor_shape();
-
- // Create strides object
- Strides strides(stride_x, fixed_strides...);
-
- for(size_t i = 1 + sizeof...(Ts); i < info.num_dimensions(); ++i)
- {
- strides.set(i, shape[i - 1] * strides[i - 1]);
- }
-
- return strides;
-}
-
-/** Create a strides object based on the tensor dimensions.
- *
- * @param[in] info Tensor info object used to compute the strides.
- *
- * @return Strides object based on element size and tensor shape.
- */
-template <typename... Ts>
-inline Strides compute_strides(const ITensorInfo &info)
-{
- return compute_strides(info, info.element_size());
-}
+inline void execute_window_loop(const Window &w, L &&lambda_function, Ts &&...iterators);
/** Permutes given Dimensions according to a permutation vector
*
@@ -605,7 +144,7 @@ template <typename T>
inline void permute(Dimensions<T> &dimensions, const PermutationVector &perm)
{
auto dimensions_copy = utility::make_array<Dimensions<T>::num_max_dimensions>(dimensions.begin(), dimensions.end());
- for(unsigned int i = 0; i < perm.num_dimensions(); ++i)
+ for (unsigned int i = 0; i < perm.num_dimensions(); ++i)
{
T dimension_val = (perm[i] < dimensions.num_dimensions()) ? dimensions_copy[perm[i]] : 0;
dimensions.set(i, dimension_val);
@@ -622,86 +161,13 @@ inline void permute(Dimensions<T> &dimensions, const PermutationVector &perm)
inline void permute(TensorShape &shape, const PermutationVector &perm)
{
TensorShape shape_copy = shape;
- for(unsigned int i = 0; i < perm.num_dimensions(); ++i)
+ for (unsigned int i = 0; i < perm.num_dimensions(); ++i)
{
size_t dimension_val = (perm[i] < shape.num_dimensions()) ? shape_copy[perm[i]] : 1;
- shape.set(i, dimension_val, false); // Avoid changes in _num_dimension
+ shape.set(i, dimension_val, false, false); // Avoid changes in _num_dimension
}
}
-/** Auto initialize the tensor info (shape, number of channels and data type) if the current assignment is empty.
- *
- * @param[in,out] info Tensor info used to check and assign.
- * @param[in] shape New shape.
- * @param[in] num_channels New number of channels.
- * @param[in] data_type New data type
- * @param[in] quantization_info (Optional) New quantization info
- *
- * @return True if the tensor info has been initialized
- */
-bool auto_init_if_empty(ITensorInfo &info,
- const TensorShape &shape,
- int num_channels, DataType data_type,
- QuantizationInfo quantization_info = QuantizationInfo());
-
-/** Auto initialize the tensor info using another tensor info.
- *
- * @param info_sink Tensor info used to check and assign
- * @param info_source Tensor info used to assign
- *
- * @return True if the tensor info has been initialized
- */
-bool auto_init_if_empty(ITensorInfo &info_sink, const ITensorInfo &info_source);
-
-/** Set the shape to the specified value if the current assignment is empty.
- *
- * @param[in,out] info Tensor info used to check and assign.
- * @param[in] shape New shape.
- *
- * @return True if the shape has been changed.
- */
-bool set_shape_if_empty(ITensorInfo &info, const TensorShape &shape);
-
-/** Set the format, data type and number of channels to the specified value if
- * the current data type is unknown.
- *
- * @param[in,out] info Tensor info used to check and assign.
- * @param[in] format New format.
- *
- * @return True if the format has been changed.
- */
-bool set_format_if_unknown(ITensorInfo &info, Format format);
-
-/** Set the data type and number of channels to the specified value if
- * the current data type is unknown.
- *
- * @param[in,out] info Tensor info used to check and assign.
- * @param[in] data_type New data type.
- *
- * @return True if the data type has been changed.
- */
-bool set_data_type_if_unknown(ITensorInfo &info, DataType data_type);
-
-/** Set the data layout to the specified value if
- * the current data layout is unknown.
- *
- * @param[in,out] info Tensor info used to check and assign.
- * @param[in] data_layout New data layout.
- *
- * @return True if the data type has been changed.
- */
-bool set_data_layout_if_unknown(ITensorInfo &info, DataLayout data_layout);
-
-/** Set the quantization info to the specified value if
- * the current quantization info is empty and the data type of asymmetric quantized type
- *
- * @param[in,out] info Tensor info used to check and assign.
- * @param[in] quantization_info Quantization info
- *
- * @return True if the quantization info has been changed.
- */
-bool set_quantization_info_if_empty(ITensorInfo &info, QuantizationInfo quantization_info);
-
/** Helper function to calculate the Valid Region for Scale.
*
* @param[in] src_info Input tensor info used to check.
@@ -712,8 +178,11 @@ bool set_quantization_info_if_empty(ITensorInfo &info, QuantizationInfo quantiza
*
* @return The corresponding valid region
*/
-ValidRegion calculate_valid_region_scale(const ITensorInfo &src_info, const TensorShape &dst_shape,
- InterpolationPolicy interpolate_policy, SamplingPolicy sampling_policy, bool border_undefined);
+ValidRegion calculate_valid_region_scale(const ITensorInfo &src_info,
+ const TensorShape &dst_shape,
+ InterpolationPolicy interpolate_policy,
+ SamplingPolicy sampling_policy,
+ bool border_undefined);
/** Convert a linear index into n-dimensional coordinates.
*
@@ -733,6 +202,22 @@ inline Coordinates index2coords(const TensorShape &shape, int index);
*/
inline int coords2index(const TensorShape &shape, const Coordinates &coord);
+/** Returns a static map used to find an index or dimension based on a data layout
+ *
+ * *** Layouts ***
+ *
+ * *** 4D ***
+ * [N C H W]
+ * [3 2 1 0]
+ * [N H W C]
+ *
+ * *** 5D ***
+ * [N C D H W]
+ * [4 3 2 1 0]
+ * [N D H W C]
+ */
+const std::map<DataLayout, std::vector<DataLayoutDimension>> &get_layout_map();
+
/** Get the index of the given dimension.
*
* @param[in] data_layout The data layout.
@@ -740,7 +225,8 @@ inline int coords2index(const TensorShape &shape, const Coordinates &coord);
*
* @return The int conversion of the requested data layout index.
*/
-inline size_t get_data_layout_dimension_index(const DataLayout data_layout, const DataLayoutDimension data_layout_dimension);
+inline size_t get_data_layout_dimension_index(const DataLayout &data_layout,
+ const DataLayoutDimension &data_layout_dimension);
/** Get the DataLayoutDimension of a given index and layout.
*
@@ -749,22 +235,7 @@ inline size_t get_data_layout_dimension_index(const DataLayout data_layout, cons
*
* @return The dimension which this index is requested for.
*/
-inline DataLayoutDimension get_index_data_layout_dimension(const DataLayout data_layout, const size_t index);
-
-/** Calculate the normalization dimension index for a given normalization type
- *
- * @param[in] layout Data layout of the input and output tensor
- * @param[in] info Normalization info
- *
- * @return Normalization dimension index
- */
-inline unsigned int get_normalization_dimension_index(DataLayout layout, const NormalizationLayerInfo &info)
-{
- const unsigned int width_idx = get_data_layout_dimension_index(layout, DataLayoutDimension::WIDTH);
- const unsigned int channel_idx = get_data_layout_dimension_index(layout, DataLayoutDimension::CHANNEL);
-
- return info.is_in_map() ? width_idx : channel_idx;
-}
+inline DataLayoutDimension get_index_data_layout_dimension(const DataLayout &data_layout, const size_t index);
/** Calculate the number of output tiles required by Winograd Convolution layer. This utility function can be used by the Winograd input transform
* to know the number of tiles on the x and y direction
@@ -776,10 +247,17 @@ inline unsigned int get_normalization_dimension_index(DataLayout layout, const N
*
* @return the number of output tiles along the x and y directions of size "output_tile_size"
*/
-inline Size2D compute_winograd_convolution_tiles(const Size2D &in_dims, const Size2D &kernel_size, const Size2D &output_tile_size, const PadStrideInfo &conv_info)
+inline Size2D compute_winograd_convolution_tiles(const Size2D &in_dims,
+ const Size2D &kernel_size,
+ const Size2D &output_tile_size,
+ const PadStrideInfo &conv_info)
{
- int num_tiles_x = std::ceil((in_dims.width - (kernel_size.width - 1) + conv_info.pad_left() + conv_info.pad_right()) / static_cast<float>(output_tile_size.width));
- int num_tiles_y = std::ceil((in_dims.height - (kernel_size.height - 1) + conv_info.pad_top() + conv_info.pad_bottom()) / static_cast<float>(output_tile_size.height));
+ int num_tiles_x =
+ std::ceil((in_dims.width - (kernel_size.width - 1) + conv_info.pad_left() + conv_info.pad_right()) /
+ static_cast<float>(output_tile_size.width));
+ int num_tiles_y =
+ std::ceil((in_dims.height - (kernel_size.height - 1) + conv_info.pad_top() + conv_info.pad_bottom()) /
+ static_cast<float>(output_tile_size.height));
    // Clamp in case padding is provided but the convolution is 1D
num_tiles_x = std::min(num_tiles_x, static_cast<int>(in_dims.width));
@@ -808,40 +286,12 @@ inline T wrap_around(T x, T m)
*/
inline Coordinates &convert_negative_axis(Coordinates &coords, int max_value)
{
- for(unsigned int i = 0; i < coords.num_dimensions(); ++i)
+ for (unsigned int i = 0; i < coords.num_dimensions(); ++i)
{
coords[i] = wrap_around(coords[i], max_value);
}
return coords;
}
-
-/** Given an integer value, this function returns the next power of two
- *
- * @param[in] x Input value
- *
- * @return the next power of two
- */
-inline unsigned int get_next_power_two(unsigned int x)
-{
- // Decrement by 1
- x--;
-
- // Shift right by 1
- x |= x >> 1u;
- // Shift right by 2
- x |= x >> 2u;
- // Shift right by 4
- x |= x >> 4u;
- // Shift right by 8
- x |= x >> 8u;
- // Shift right by 16
- x |= x >> 16u;
-
- // Increment by 1
- x++;
-
- return x;
-}
} // namespace arm_compute
#include "arm_compute/core/Helpers.inl"
diff --git a/arm_compute/core/Helpers.inl b/arm_compute/core/Helpers.inl
index 233d46bb86..60a21e9418 100644
--- a/arm_compute/core/Helpers.inl
+++ b/arm_compute/core/Helpers.inl
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2016-2020 ARM Limited.
+ * Copyright (c) 2016-2021, 2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -22,68 +22,19 @@
* SOFTWARE.
*/
#include "arm_compute/core/Error.h"
-#include "arm_compute/core/Validate.h"
#include <cmath>
#include <numeric>
namespace arm_compute
{
-inline uint8_t pixel_area_c1u8_clamp(const uint8_t *first_pixel_ptr, size_t stride, size_t width, size_t height, float wr, float hr, int x, int y)
-{
- ARM_COMPUTE_ERROR_ON(first_pixel_ptr == nullptr);
-
- // Calculate sampling position
- float in_x = (x + 0.5f) * wr - 0.5f;
- float in_y = (y + 0.5f) * hr - 0.5f;
-
- // Get bounding box offsets
- int x_from = std::floor(x * wr - 0.5f - in_x);
- int y_from = std::floor(y * hr - 0.5f - in_y);
- int x_to = std::ceil((x + 1) * wr - 0.5f - in_x);
- int y_to = std::ceil((y + 1) * hr - 0.5f - in_y);
-
- // Clamp position to borders
- in_x = std::max(-1.f, std::min(in_x, static_cast<float>(width)));
- in_y = std::max(-1.f, std::min(in_y, static_cast<float>(height)));
-
- // Clamp bounding box offsets to borders
- x_from = ((in_x + x_from) < -1) ? -1 : x_from;
- y_from = ((in_y + y_from) < -1) ? -1 : y_from;
- x_to = ((in_x + x_to) > width) ? (width - in_x) : x_to;
- y_to = ((in_y + y_to) > height) ? (height - in_y) : y_to;
-
- // Get pixel index
- const int xi = std::floor(in_x);
- const int yi = std::floor(in_y);
-
- // Bounding box elements in each dimension
- const int x_elements = (x_to - x_from + 1);
- const int y_elements = (y_to - y_from + 1);
- ARM_COMPUTE_ERROR_ON(x_elements == 0 || y_elements == 0);
-
- // Sum pixels in area
- int sum = 0;
- for(int j = yi + y_from, je = yi + y_to; j <= je; ++j)
- {
- const uint8_t *ptr = first_pixel_ptr + j * stride + xi + x_from;
- sum = std::accumulate(ptr, ptr + x_elements, sum);
- }
-
- // Return average
- return sum / (x_elements * y_elements);
-}
-
template <size_t dimension>
struct IncrementIterators
{
template <typename T, typename... Ts>
- static void unroll(T &&it, Ts &&... iterators)
+ static void unroll(T &&it, Ts &&...iterators)
{
- auto increment = [](T && it)
- {
- it.increment(dimension);
- };
+ auto increment = [](T &&it) { it.increment(dimension); };
utility::for_each(increment, std::forward<T>(it), std::forward<Ts>(iterators)...);
}
static void unroll()
@@ -96,14 +47,14 @@ template <size_t dim>
struct ForEachDimension
{
template <typename L, typename... Ts>
- static void unroll(const Window &w, Coordinates &id, L &&lambda_function, Ts &&... iterators)
+ static void unroll(const Window &w, Coordinates &id, L &&lambda_function, Ts &&...iterators)
{
const auto &d = w[dim - 1];
- for(auto v = d.start(); v < d.end(); v += d.step(), IncrementIterators < dim - 1 >::unroll(iterators...))
+ for (auto v = d.start(); v < d.end(); v += d.step(), IncrementIterators<dim - 1>::unroll(iterators...))
{
id.set(dim - 1, v);
- ForEachDimension < dim - 1 >::unroll(w, id, lambda_function, iterators...);
+ ForEachDimension<dim - 1>::unroll(w, id, lambda_function, iterators...);
}
}
};
@@ -112,7 +63,7 @@ template <>
struct ForEachDimension<0>
{
template <typename L, typename... Ts>
- static void unroll(const Window &w, Coordinates &id, L &&lambda_function, Ts &&... iterators)
+ static void unroll(const Window &w, Coordinates &id, L &&lambda_function, Ts &&...iterators)
{
ARM_COMPUTE_UNUSED(w, iterators...);
lambda_function(id);
@@ -120,49 +71,60 @@ struct ForEachDimension<0>
};
template <typename L, typename... Ts>
-inline void execute_window_loop(const Window &w, L &&lambda_function, Ts &&... iterators)
+inline void execute_window_loop(const Window &w, L &&lambda_function, Ts &&...iterators)
{
w.validate();
- for(unsigned int i = 0; i < Coordinates::num_max_dimensions; ++i)
+ for (unsigned int i = 0; i < Coordinates::num_max_dimensions; ++i)
{
ARM_COMPUTE_ERROR_ON(w[i].step() == 0);
}
Coordinates id;
- ForEachDimension<Coordinates::num_max_dimensions>::unroll(w, id, std::forward<L>(lambda_function), std::forward<Ts>(iterators)...);
+ ForEachDimension<Coordinates::num_max_dimensions>::unroll(w, id, std::forward<L>(lambda_function),
+ std::forward<Ts>(iterators)...);
}
-inline constexpr Iterator::Iterator()
- : _ptr(nullptr), _dims()
+inline constexpr Iterator::Iterator() : _ptr(nullptr), _dims()
{
}
-inline Iterator::Iterator(const ITensor *tensor, const Window &win)
- : Iterator()
+inline Iterator::Iterator(const ITensor *tensor, const Window &win) : Iterator()
{
ARM_COMPUTE_ERROR_ON(tensor == nullptr);
ARM_COMPUTE_ERROR_ON(tensor->info() == nullptr);
- const ITensorInfo *info = tensor->info();
- const Strides &strides = info->strides_in_bytes();
+ initialize(tensor->info()->num_dimensions(), tensor->info()->strides_in_bytes(), tensor->buffer(),
+ tensor->info()->offset_first_element_in_bytes(), win);
+}
+
+inline Iterator::Iterator(size_t num_dims, const Strides &strides, uint8_t *buffer, size_t offset, const Window &win)
+ : Iterator()
+{
+ initialize(num_dims, strides, buffer, offset, win);
+}
+
+inline void
+Iterator::initialize(size_t num_dims, const Strides &strides, uint8_t *buffer, size_t offset, const Window &win)
+{
+ ARM_COMPUTE_ERROR_ON(buffer == nullptr);
- _ptr = tensor->buffer() + info->offset_first_element_in_bytes();
+ _ptr = buffer + offset;
//Initialize the stride for each dimension and calculate the position of the first element of the iteration:
- for(unsigned int n = 0; n < info->num_dimensions(); ++n)
+ for (unsigned int n = 0; n < num_dims; ++n)
{
_dims[n]._stride = win[n].step() * strides[n];
- std::get<0>(_dims)._dim_start += strides[n] * win[n].start();
+ std::get<0>(_dims)._dim_start += static_cast<size_t>(strides[n]) * win[n].start();
}
//Copy the starting point to all the dimensions:
- for(unsigned int n = 1; n < Coordinates::num_max_dimensions; ++n)
+ for (unsigned int n = 1; n < Coordinates::num_max_dimensions; ++n)
{
_dims[n]._dim_start = std::get<0>(_dims)._dim_start;
}
- ARM_COMPUTE_ERROR_ON_WINDOW_DIMENSIONS_GTE(win, info->num_dimensions());
+ ARM_COMPUTE_ERROR_ON_WINDOW_DIMENSIONS_GTE(win, num_dims);
}
inline void Iterator::increment(const size_t dimension)
@@ -171,13 +133,13 @@ inline void Iterator::increment(const size_t dimension)
_dims[dimension]._dim_start += _dims[dimension]._stride;
- for(unsigned int n = 0; n < dimension; ++n)
+ for (unsigned int n = 0; n < dimension; ++n)
{
_dims[n]._dim_start = _dims[dimension]._dim_start;
}
}
-inline constexpr int Iterator::offset() const
+inline constexpr size_t Iterator::offset() const
{
return _dims.at(0)._dim_start;
}
@@ -193,100 +155,12 @@ inline void Iterator::reset(const size_t dimension)
_dims[dimension]._dim_start = _dims[dimension + 1]._dim_start;
- for(unsigned int n = 0; n < dimension; ++n)
+ for (unsigned int n = 0; n < dimension; ++n)
{
_dims[n]._dim_start = _dims[dimension]._dim_start;
}
}
-inline bool auto_init_if_empty(ITensorInfo &info,
- const TensorShape &shape,
- int num_channels,
- DataType data_type,
- QuantizationInfo quantization_info)
-{
- if(info.tensor_shape().total_size() == 0)
- {
- info.set_data_type(data_type);
- info.set_num_channels(num_channels);
- info.set_tensor_shape(shape);
- info.set_quantization_info(quantization_info);
- return true;
- }
-
- return false;
-}
-
-inline bool auto_init_if_empty(ITensorInfo &info_sink, const ITensorInfo &info_source)
-{
- if(info_sink.tensor_shape().total_size() == 0)
- {
- info_sink.set_data_type(info_source.data_type());
- info_sink.set_num_channels(info_source.num_channels());
- info_sink.set_tensor_shape(info_source.tensor_shape());
- info_sink.set_quantization_info(info_source.quantization_info());
- info_sink.set_data_layout(info_source.data_layout());
- return true;
- }
-
- return false;
-}
-
-inline bool set_shape_if_empty(ITensorInfo &info, const TensorShape &shape)
-{
- if(info.tensor_shape().total_size() == 0)
- {
- info.set_tensor_shape(shape);
- return true;
- }
-
- return false;
-}
-
-inline bool set_format_if_unknown(ITensorInfo &info, Format format)
-{
- if(info.data_type() == DataType::UNKNOWN)
- {
- info.set_format(format);
- return true;
- }
-
- return false;
-}
-
-inline bool set_data_type_if_unknown(ITensorInfo &info, DataType data_type)
-{
- if(info.data_type() == DataType::UNKNOWN)
- {
- info.set_data_type(data_type);
- return true;
- }
-
- return false;
-}
-
-inline bool set_data_layout_if_unknown(ITensorInfo &info, DataLayout data_layout)
-{
- if(info.data_layout() == DataLayout::UNKNOWN)
- {
- info.set_data_layout(data_layout);
- return true;
- }
-
- return false;
-}
-
-inline bool set_quantization_info_if_empty(ITensorInfo &info, QuantizationInfo quantization_info)
-{
- if(info.quantization_info().empty() && (is_data_type_quantized_asymmetric(info.data_type())))
- {
- info.set_quantization_info(quantization_info);
- return true;
- }
-
- return false;
-}
-
inline Coordinates index2coords(const TensorShape &shape, int index)
{
int num_elements = shape.total_size();
@@ -294,9 +168,9 @@ inline Coordinates index2coords(const TensorShape &shape, int index)
ARM_COMPUTE_ERROR_ON_MSG(index < 0 || index >= num_elements, "Index has to be in [0, num_elements)!");
ARM_COMPUTE_ERROR_ON_MSG(num_elements == 0, "Cannot create coordinate from empty shape!");
- Coordinates coord{ 0 };
+ Coordinates coord{0};
- for(int d = shape.num_dimensions() - 1; d >= 0; --d)
+ for (int d = shape.num_dimensions() - 1; d >= 0; --d)
{
num_elements /= shape[d];
coord.set(d, index / num_elements);
@@ -315,7 +189,7 @@ inline int coords2index(const TensorShape &shape, const Coordinates &coord)
int index = 0;
int stride = 1;
- for(unsigned int d = 0; d < coord.num_dimensions(); ++d)
+ for (unsigned int d = 0; d < coord.num_dimensions(); ++d)
{
index += coord[d] * stride;
stride *= shape[d];
@@ -324,61 +198,23 @@ inline int coords2index(const TensorShape &shape, const Coordinates &coord)
return index;
}
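
As a concrete check of the two mappings above (the shape is illustrative), a linear index decomposes into per-dimension coordinates and recomposes to the same value:

#include "arm_compute/core/Error.h"
#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/TensorShape.h"

using namespace arm_compute;

void index_roundtrip_example()
{
    const TensorShape shape(4U, 3U);                      // 12 elements, dimension 0 innermost
    const Coordinates coord = index2coords(shape, 7);     // coord[0] == 3, coord[1] == 1
    const int         index = coords2index(shape, coord); // back to 7
    ARM_COMPUTE_ERROR_ON(index != 7);
}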
-inline size_t get_data_layout_dimension_index(const DataLayout data_layout, const DataLayoutDimension data_layout_dimension)
+inline size_t get_data_layout_dimension_index(const DataLayout &data_layout,
+ const DataLayoutDimension &data_layout_dimension)
{
- ARM_COMPUTE_ERROR_ON_MSG(data_layout == DataLayout::UNKNOWN, "Cannot retrieve the dimension index for an unknown layout!");
-
- /* Return the index based on the data layout
- * [N C H W]
- * [3 2 1 0]
- * [N H W C]
- */
- switch(data_layout_dimension)
- {
- case DataLayoutDimension::CHANNEL:
- return (data_layout == DataLayout::NCHW) ? 2 : 0;
- break;
- case DataLayoutDimension::HEIGHT:
- return (data_layout == DataLayout::NCHW) ? 1 : 2;
- break;
- case DataLayoutDimension::WIDTH:
- return (data_layout == DataLayout::NCHW) ? 0 : 1;
- break;
- case DataLayoutDimension::BATCHES:
- return 3;
- break;
- default:
- break;
- }
- ARM_COMPUTE_ERROR("Data layout index not supported!");
+ ARM_COMPUTE_ERROR_ON_MSG(data_layout == DataLayout::UNKNOWN,
+ "Cannot retrieve the dimension index for an unknown layout!");
+ const auto &dims = get_layout_map().at(data_layout);
+ const auto &it = std::find(dims.cbegin(), dims.cend(), data_layout_dimension);
+ ARM_COMPUTE_ERROR_ON_MSG(it == dims.cend(), "Invalid dimension for the given layout.");
+ return it - dims.cbegin();
}
-inline DataLayoutDimension get_index_data_layout_dimension(const DataLayout data_layout, const size_t index)
+inline DataLayoutDimension get_index_data_layout_dimension(const DataLayout &data_layout, const size_t index)
{
- ARM_COMPUTE_ERROR_ON_MSG(data_layout == DataLayout::UNKNOWN, "Cannot retrieve the dimension index for an unknown layout!");
-
- /* Return the index based on the data layout
- * [N C H W]
- * [3 2 1 0]
- * [N H W C]
- */
- switch(index)
- {
- case 0:
- return (data_layout == DataLayout::NCHW) ? DataLayoutDimension::WIDTH : DataLayoutDimension::CHANNEL;
- break;
- case 1:
- return (data_layout == DataLayout::NCHW) ? DataLayoutDimension::HEIGHT : DataLayoutDimension::WIDTH;
- break;
- case 2:
- return (data_layout == DataLayout::NCHW) ? DataLayoutDimension::CHANNEL : DataLayoutDimension::HEIGHT;
- break;
- case 3:
- return DataLayoutDimension::BATCHES;
- break;
- default:
- ARM_COMPUTE_ERROR("Index value not supported!");
- break;
- }
+ ARM_COMPUTE_ERROR_ON_MSG(data_layout == DataLayout::UNKNOWN,
+ "Cannot retrieve the layout dimension for an unknown layout!");
+ const auto &dims = get_layout_map().at(data_layout);
+ ARM_COMPUTE_ERROR_ON_MSG(index >= dims.size(), "Invalid index for the given layout.");
+ return dims[index];
}
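
With the table-driven rewrite, both lookups reduce to a query of the layout map; the values below follow the NCHW/NHWC orderings that the deleted switch statements documented:

#include "arm_compute/core/Error.h"
#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/Types.h"

using namespace arm_compute;

void layout_lookup_example()
{
    // NHWC stores channels innermost, so CHANNEL sits at dimension index 0 ...
    const size_t c_idx = get_data_layout_dimension_index(DataLayout::NHWC, DataLayoutDimension::CHANNEL); // 0
    // ... while NCHW keeps WIDTH innermost.
    const size_t w_idx = get_data_layout_dimension_index(DataLayout::NCHW, DataLayoutDimension::WIDTH); // 0
    // The inverse query returns the dimension stored at a given index.
    const DataLayoutDimension d = get_index_data_layout_dimension(DataLayout::NCHW, 2); // CHANNEL
    ARM_COMPUTE_UNUSED(c_idx, w_idx, d);
}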
} // namespace arm_compute
diff --git a/arm_compute/core/IAccessWindow.h b/arm_compute/core/IAccessWindow.h
index 227d1c4bb2..9c9fb90915 100644
--- a/arm_compute/core/IAccessWindow.h
+++ b/arm_compute/core/IAccessWindow.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2019 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -100,7 +100,10 @@ public:
* @return a valid region.
*
*/
- virtual ValidRegion compute_valid_region(const Window &window, ValidRegion input_valid_region, bool border_undefined, BorderSize border_size) const = 0;
+ virtual ValidRegion compute_valid_region(const Window &window,
+ ValidRegion input_valid_region,
+ bool border_undefined,
+ BorderSize border_size) const = 0;
};
/** Implementation of a rectangular access pattern. */
@@ -161,7 +164,10 @@ public:
* @param[in] border_undefined (Optional) Undefined borders are excluded from the valid region.
* @param[in] border_size (Optional) Size of the border around the XY-plane of the tensor.
*/
- void set_valid_region(const Window &window, const ValidRegion &input_valid_region, bool border_undefined = false, const BorderSize &border_size = BorderSize(0));
+ void set_valid_region(const Window &window,
+ const ValidRegion &input_valid_region,
+ bool border_undefined = false,
+ const BorderSize &border_size = BorderSize(0));
/** Compute the valid region based on access pattern, valid region of the inputs and border mode.
*
@@ -189,7 +195,10 @@ public:
* @return a valid region.
*
*/
- ValidRegion compute_valid_region(const Window &window, ValidRegion input_valid_region, bool border_undefined, BorderSize border_size) const override;
+ ValidRegion compute_valid_region(const Window &window,
+ ValidRegion input_valid_region,
+ bool border_undefined,
+ BorderSize border_size) const override;
bool update_window_if_needed(Window &window) const override;
bool update_padding_if_needed(const Window &window) override;
diff --git a/arm_compute/core/IArray.h b/arm_compute/core/IArray.h
index c6a1499698..3471fc9a86 100644
--- a/arm_compute/core/IArray.h
+++ b/arm_compute/core/IArray.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2016-2019 ARM Limited.
+ * Copyright (c) 2016-2021 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -25,30 +25,24 @@
#define ARM_COMPUTE_IARRAY_H
#include "arm_compute/core/Error.h"
+
#include <cstddef>
#include <cstdint>
namespace arm_compute
{
-struct KeyPoint;
-struct Coordinates2D;
-struct DetectionWindow;
-class Size2D;
-
/** Array of type T */
template <class T>
class IArray
{
public:
/** Default constructor */
- IArray()
- : _num_values(0), _max_size(0) {};
+ IArray() : _num_values(0), _max_size(0){};
/** Constructor: initializes an array which can contain up to max_num_points values
*
* @param[in] max_num_values Maximum number of values the array will be able to store
*/
- IArray(size_t max_num_values)
- : _num_values(0), _max_size(max_num_values)
+ IArray(size_t max_num_values) : _num_values(0), _max_size(max_num_values)
{
}
/** Maximum number of values which can be stored in this array
@@ -78,7 +72,7 @@ public:
bool push_back(const T &val)
{
ARM_COMPUTE_ERROR_ON(0 == _max_size);
- if(_num_values >= max_num_values())
+ if (_num_values >= max_num_values())
{
_num_values = max_num_values() + 1;
return false;
@@ -135,14 +129,6 @@ private:
size_t _num_values;
size_t _max_size;
};
-/** Interface for Array of Key Points. */
-using IKeyPointArray = IArray<KeyPoint>;
-/** Interface for Array of 2D Coordinates. */
-using ICoordinates2DArray = IArray<Coordinates2D>;
-/** Interface for Array of Detection Windows. */
-using IDetectionWindowArray = IArray<DetectionWindow>;
-/** Interface for Array of 2D Sizes. */
-using ISize2DArray = IArray<Size2D>;
/** Interface for Array of uint8s. */
using IUInt8Array = IArray<uint8_t>;
/** Interface for Array of uint16s. */
@@ -155,5 +141,5 @@ using IInt16Array = IArray<int16_t>;
using IInt32Array = IArray<int32_t>;
/** Interface for Array of floats. */
using IFloatArray = IArray<float>;
-}
+} // namespace arm_compute
#endif /* ARM_COMPUTE_IARRAY_H */
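
The push_back contract above is easy to miss: once the array is full, further pushes fail and the overflow is latched. A sketch, assuming the heap-backed Array implementation from arm_compute/runtime/Array.h:

#include "arm_compute/core/Error.h"
#include "arm_compute/runtime/Array.h"

using namespace arm_compute;

void overflow_example()
{
    Array<int32_t> values(2); // capacity for two elements

    bool ok = values.push_back(1) && values.push_back(2); // fills the array
    ok      = values.push_back(3);                        // fails: capacity exceeded

    // The internal count is bumped past the capacity so overflow() reports the loss.
    ARM_COMPUTE_ERROR_ON(ok || !values.overflow());
}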
diff --git a/arm_compute/core/IDevice.h b/arm_compute/core/IDevice.h
index 5cffe646d4..12efa91e19 100644
--- a/arm_compute/core/IDevice.h
+++ b/arm_compute/core/IDevice.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2020 ARM Limited.
+ * Copyright (c) 2020-2021 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -34,7 +34,6 @@ enum class DeviceType
{
NEON,
CL,
- GLES
};
/** Interface for device object */
diff --git a/arm_compute/core/IDistribution.h b/arm_compute/core/IDistribution.h
deleted file mode 100644
index cd6f25fd47..0000000000
--- a/arm_compute/core/IDistribution.h
+++ /dev/null
@@ -1,59 +0,0 @@
-/*
- * Copyright (c) 2016-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_IDISTRIBUTION_H
-#define ARM_COMPUTE_IDISTRIBUTION_H
-
-#include <cstddef>
-#include <cstdint>
-
-namespace arm_compute
-{
-/** Interface for distribution objects */
-class IDistribution
-{
-public:
- /** Default virtual destructor */
- virtual ~IDistribution() = default;
- /** Returns the dimensions of the distribution.
- *
- * @note This is fixed to 1-dimensional distribution for now.
- * @return Dimensions of the distribution.
- */
- virtual size_t dimensions() const = 0;
- /** Returns the total size in bytes of the distribution.
- *
- * @return Total size of the distribution in bytes.
- */
- virtual size_t size() const = 0;
- /** Returns a pointer to the start of the distribution.
- * Other elements of the array can be accessed using buffer()[idx] for 0 <= idx < num_bins()
- *
- * @return Pointer to the start of the distribution.
- */
- virtual uint32_t *buffer() const = 0;
- /** Clears the distribution by setting every element to zero. */
- void clear() const;
-};
-}
-#endif /* ARM_COMPUTE_IDISTRIBUTION_H */
diff --git a/arm_compute/core/IDistribution1D.h b/arm_compute/core/IDistribution1D.h
deleted file mode 100644
index 081ba580db..0000000000
--- a/arm_compute/core/IDistribution1D.h
+++ /dev/null
@@ -1,84 +0,0 @@
-/*
- * Copyright (c) 2016-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_IDISTRIBUTION1D_H
-#define ARM_COMPUTE_IDISTRIBUTION1D_H
-
-#include "arm_compute/core/IDistribution.h"
-
-#include <cstddef>
-#include <cstdint>
-
-namespace arm_compute
-{
-/** 1D Distribution interface */
-class IDistribution1D : public IDistribution
-{
-public:
- /** Constructor: Creates a 1D Distribution of a consecutive interval [offset, offset + range - 1]
- * defined by a start offset and valid range, divided equally into num_bins parts.
- *
- * @param[in] num_bins The number of bins the distribution is divided in.
- * @param[in] offset The start of the values to use.
- * @param[in] range The total number of the consecutive values of the distribution interval.
- */
- IDistribution1D(size_t num_bins, int32_t offset, uint32_t range);
- /** Returns the number of bins that the distribution has.
- *
- * @return Number of bins of the distribution.
- */
- size_t num_bins() const;
- /** Returns the offset of the distribution.
- *
- * @return Offset of the distribution.
- */
- int32_t offset() const;
- /** Returns the range of the distribution.
- *
- * @return Range of the distribution.
- */
- uint32_t range() const;
- /** Returns the window of the distribution, which is the range divided by the number of bins.
- *
- * @note If range is not divided by the number of bins then it is invalid.
- *
- * @return Window of the distribution.
- */
- uint32_t window() const;
- /** Sets the range of the distribution.
- *
- * @param[in] range New range of the distribution to be set.
- */
- void set_range(uint32_t range);
-
- // Inherited methods overridden:
- size_t size() const override;
- size_t dimensions() const override;
-
-private:
- size_t _num_bins; /**< Number of bins. */
- int32_t _offset; /**< Offset, which indicate the start of the usable values. */
- uint32_t _range; /**< The total number of consecutive values of the distribution interval */
-};
-}
-#endif /* ARM_COMPUTE_IDISTRIBUTION1D_H */
diff --git a/arm_compute/core/IHOG.h b/arm_compute/core/IHOG.h
deleted file mode 100644
index bf8bd73087..0000000000
--- a/arm_compute/core/IHOG.h
+++ /dev/null
@@ -1,54 +0,0 @@
-/*
- * Copyright (c) 2016-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_IHOG_H
-#define ARM_COMPUTE_IHOG_H
-
-#include "arm_compute/core/Types.h"
-
-#include <cstddef>
-
-namespace arm_compute
-{
-class HOGInfo;
-/** Interface for HOG data-object */
-class IHOG
-{
-public:
- /** Interface to be implemented by the child class to return the HOG's metadata
- *
- * @return A pointer to the HOG's metadata.
- */
- virtual const HOGInfo *info() const = 0;
- /** Default virtual destructor */
- virtual ~IHOG() = default;
- /** Pointer to the first element of the array which stores the linear SVM coefficients of HOG descriptor
- *
- * @note Other elements of the array can be accessed using descriptor()[idx] for idx=[0, descriptor_size() - 1]
- *
- * @return A pointer to the first element of the array which stores the linear SVM coefficients of HOG descriptor
- */
- virtual float *descriptor() const = 0;
-};
-}
-#endif /* ARM_COMPUTE_IHOG_H */
diff --git a/arm_compute/core/IKernel.h b/arm_compute/core/IKernel.h
index cb1ddb1d7f..403a2c724e 100644
--- a/arm_compute/core/IKernel.h
+++ b/arm_compute/core/IKernel.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2016-2019 ARM Limited.
+ * Copyright (c) 2016-2021 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -57,6 +57,11 @@ public:
* @return The maximum window the kernel can be executed on.
*/
const Window &window() const;
+ /** Function to check if the embedded window of this kernel has been configured
+ *
+ * @return True if the window has been configured
+ */
+ bool is_window_configured() const;
protected:
/** Configure the kernel's window
@@ -68,5 +73,5 @@ protected:
private:
Window _window;
};
-}
+} // namespace arm_compute
#endif /*ARM_COMPUTE_IKERNEL_H */
diff --git a/arm_compute/core/ILut.h b/arm_compute/core/ILut.h
deleted file mode 100644
index d1a03af969..0000000000
--- a/arm_compute/core/ILut.h
+++ /dev/null
@@ -1,69 +0,0 @@
-/*
- * Copyright (c) 2016-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_ILUT_H
-#define ARM_COMPUTE_ILUT_H
-
-#include "arm_compute/core/Types.h"
-
-#include <cstddef>
-
-namespace arm_compute
-{
-/** Lookup Table object interface. */
-class ILut
-{
-public:
- /** Default virtual destructor */
- virtual ~ILut() = default;
- /** Returns the total number of elements in the LUT.
- *
- * @return Total number of elements.
- */
- virtual size_t num_elements() const = 0;
- /** Indicates the offset that needs to be applied to the raw index before performing a lookup in the LUT.
- *
- * @return The normalization offset.
- */
- virtual uint32_t index_offset() const = 0;
- /** Returns the total size in bytes of the LUT.
- *
- * @return Total size of the LUT in bytes.
- */
- virtual size_t size_in_bytes() const = 0;
- /** Returns the type of the LUT.
- *
- * @return The type of the LUT.
- */
- virtual DataType type() const = 0;
- /** Returns a pointer to the start of the LUT.
- * Other elements of the LUT can be accessed using buffer()[idx] for 0 <= idx < num_elements().
- *
- * @return Pointer to the start of the lut.
- */
- virtual uint8_t *buffer() const = 0;
- /** Clears the LUT by setting every element to zero. */
- virtual void clear() = 0;
-};
-}
-#endif /* ARM_COMPUTE_ILUT_H */
diff --git a/arm_compute/core/IMultiHOG.h b/arm_compute/core/IMultiHOG.h
deleted file mode 100644
index ab79fac154..0000000000
--- a/arm_compute/core/IMultiHOG.h
+++ /dev/null
@@ -1,61 +0,0 @@
-/*
- * Copyright (c) 2016-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_IMULTIHOG_H
-#define ARM_COMPUTE_IMULTIHOG_H
-
-#include "arm_compute/core/IHOG.h"
-
-#include <cstddef>
-
-namespace arm_compute
-{
-/** Interface for storing multiple HOG data-objects */
-class IMultiHOG
-{
-public:
- /** Default destructor */
- virtual ~IMultiHOG() = default;
- /** The number of HOG models stored
- *
- * @return The number of HOG models stored
- */
- virtual size_t num_models() const = 0;
- /** Return a pointer to the requested HOG model
- *
- * @param[in] index The index of the wanted HOG model.
- *
- * @return A pointer pointed to the HOG model
- */
- virtual IHOG *model(size_t index) = 0;
- /** Return a const pointer to the requested HOG model
- *
- * @param[in] index The index of the wanted HOG model.
- *
- * @return A const pointer pointed to the HOG model
- */
- virtual const IHOG *model(size_t index) const = 0;
-};
-}
-
-#endif /* ARM_COMPUTE_IMULTIHOG_H */
diff --git a/arm_compute/core/IPyramid.h b/arm_compute/core/IPyramid.h
deleted file mode 100644
index b2a74656b6..0000000000
--- a/arm_compute/core/IPyramid.h
+++ /dev/null
@@ -1,56 +0,0 @@
-/*
- * Copyright (c) 2016-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_IPYRAMID_H
-#define ARM_COMPUTE_IPYRAMID_H
-
-#include "arm_compute/core/ITensor.h"
-#include "arm_compute/core/PyramidInfo.h"
-#include "arm_compute/core/Types.h"
-
-#include <cstddef>
-
-namespace arm_compute
-{
-/** Interface for pyramid data-object */
-class IPyramid
-{
-public:
- /** Default virtual destructor */
- virtual ~IPyramid() = default;
- /** Interface to be implemented by the child class to return the Pyramid's metadata
- *
- * @return A pointer to the Pyramid's metadata.
- */
- virtual const PyramidInfo *info() const = 0;
- /** Retrieves a level of the pyramid as a ITensor pointer
- *
- * @param[in] index The index of the level, such that index is less than levels.
- *
- * @return An ITensor pointer
- */
- virtual ITensor *get_pyramid_level(size_t index) const = 0;
-};
-}
-
-#endif /* ARM_COMPUTE_IPYRAMID_H */
diff --git a/arm_compute/core/ITensor.h b/arm_compute/core/ITensor.h
index 501279eb25..aad8313261 100644
--- a/arm_compute/core/ITensor.h
+++ b/arm_compute/core/ITensor.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2016-2019 ARM Limited.
+ * Copyright (c) 2016-2021 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -32,7 +32,7 @@ namespace arm_compute
{
class Coordinates;
-/** Interface for NEON tensor */
+/** Interface for CPU tensor */
class ITensor
{
public:
@@ -90,11 +90,13 @@ public:
bool is_used() const;
/** Marks a tensor as unused */
void mark_as_unused() const;
+ /** Marks a tensor as used */
+ void mark_as_used() const;
private:
- mutable bool _is_used = { true }; /**< Flag that marks if the tensor is used or not */
+ mutable bool _is_used = {true}; /**< Flag that marks if the tensor is used or not */
};
using IImage = ITensor;
-}
+} // namespace arm_compute
#endif /*ARM_COMPUTE_ITENSOR_H */
diff --git a/arm_compute/core/ITensorInfo.h b/arm_compute/core/ITensorInfo.h
index f2b4c155aa..c42f4b57a1 100644
--- a/arm_compute/core/ITensorInfo.h
+++ b/arm_compute/core/ITensorInfo.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2016-2019 ARM Limited.
+ * Copyright (c) 2016-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -28,18 +28,46 @@
#include "arm_compute/core/Strides.h"
#include "arm_compute/core/TensorShape.h"
#include "arm_compute/core/Types.h"
-#include "arm_compute/core/Utils.h"
-#include "arm_compute/core/utils/misc/ICloneable.h"
#include "arm_compute/core/utils/misc/Utility.h"
+#include "support/ICloneable.h"
+
#include <cstddef>
namespace arm_compute
{
+class QuantizationInfo;
+// Note: Any changes to the fields of the class below that have setters should be mirrored
+// (if possible) in the auto_init_if_empty function in AutoConfiguration.h
+
/** Store the tensor's metadata */
class ITensorInfo : public misc::ICloneable<ITensorInfo>
{
public:
+ using TensorDimsState = std::vector<int>;
+ /** An id that uniquely identifies an ITensorInfo within some domain (e.g. a workload)
+ */
+ using Id = int32_t;
+ /** An invalid tensor id within a domain */
+ static constexpr Id invalid_tensor_id = 0;
+ /** Get the value representing dynamic dimension state
+ *
+ * @return Value representing dynamic dimension state
+ *
+ */
+ static constexpr int32_t get_dynamic_state_value()
+ {
+ return _dynamic_dimension;
+ }
+ /** Get the value representing static dimension state
+ *
+ * @return Value representing static dimension state
+ *
+ */
+ static constexpr int32_t get_static_state_value()
+ {
+ return _static_dimension;
+ }
/** Default virtual destructor */
virtual ~ITensorInfo() = default;
/** Set the data type to the specified value.
@@ -81,6 +109,17 @@ public:
* @return Reference to this ITensorInfo object
*/
virtual ITensorInfo &set_tensor_shape(const TensorShape &shape) = 0;
+ /** Set the state for each dimension of the tensor
+ *
+ * This sets the state of each dimension of the shape in terms of dynamic behavior, using -1 to mark a dynamic dimension.
+ * The index in the state maps 1 to 1 onto the shape dimension index.
+ * For example, to express [?, 3, 3] as a dynamic input, set the state to [-1, 3, 3].
+ *
+ * @param[in] state Tensor dimensions state
+ *
+ * @return Reference to this ITensorInfo object
+ */
+ virtual ITensorInfo &set_tensor_dims_state(const TensorDimsState &state) = 0;
/** Set the quantization settings (scale and offset) of the tensor.
*
* @param[in] quantization_info QuantizationInfo containing the scale and offset
@@ -107,6 +146,17 @@ public:
* @return True if the strides or the offset to the first element have changed.
*/
virtual bool auto_padding() = 0;
+ /** Set the lock paddings flag of the tensor.
+ * It should be set to true when the tensor may be mapped to a camera or frame buffer.
+ *
+ * @param[in] flag Lock paddings flag value
+ *
+ * @return Reference to this ITensorInfo object
+ */
+ virtual ITensorInfo &set_lock_paddings(bool flag) = 0;
+ /** Get the lock paddings flag value
+ *
+ * @return lock paddings flag value
+ */
+ virtual bool lock_paddings() const = 0;
/** Update the offset to the first element, the strides and the total size.
*
* @note This function can only increase the offset, strides and total size.
@@ -170,6 +220,11 @@ public:
* @return A vector with the size for each dimension of the tensor
*/
virtual const TensorShape &tensor_shape() const = 0;
+ /** State of each dimension of the tensor shape
+ *
+ * @return A vector with the state for each dimension of the tensor, where -1 specifies dynamic dimension
+ */
+ virtual const TensorDimsState &tensor_dims_state() const = 0;
/** Data type used for each element of the tensor
*
* @return Tensor data type
@@ -205,6 +260,11 @@ public:
* @return True if it's dynamic else false
*/
virtual bool is_dynamic() const = 0;
+ /** Flag indicating whether the values of the tensor are constant, meaning that they cannot change during kernel/function execution.
+ *
+ * @return True if values are constant else false
+ */
+ virtual bool are_values_constant() const = 0;
/** Set the flag whether the tensor size can be changed.
*
* @param[in] is_resizable Flag that marks the tensor if it can be changed or not.
@@ -212,13 +272,13 @@ public:
* @return Reference to this ITensorInfo object
*/
virtual ITensorInfo &set_is_resizable(bool is_resizable) = 0;
- /** Set the flag whether the tensor size is dynamic.
+ /** Set the flag indicating whether the tensor values are constant during kernel/function execution.
*
- * @param[in] is_dynamic Flag that marks the tensor if it's dynamic.
+ * @param[in] are_values_constant Flag that marks whether the tensor values are constant.
*
* @return Reference to this ITensorInfo object
*/
- virtual ITensorInfo &set_is_dynamic(bool is_dynamic) = 0;
+ virtual ITensorInfo &set_are_values_constant(bool are_values_constant) = 0;
/** Valid region of the tensor. All elements in the valid region have defined values, i.e. are not undefined.
*
* @return The valid region.
@@ -240,7 +300,20 @@ public:
* @return A DataLayout containing the layout data information.
*/
virtual DataLayout data_layout() const = 0;
-
+ /** Get the workload tensor id of the tensor.
+ *
+ * @return Workload tensor id of the tensor
+ */
+ virtual Id id() const = 0;
+ /** Set the tensor id
+ *
+ * @param[in] id Id to set
+ *
+ * @return Reference to this ITensorInfo object
+ */
+ virtual ITensorInfo &set_id(ITensorInfo::Id id) = 0;
+ /** Check if the tensor id is valid
+ *
+ * @return True if the tensor id is valid, false otherwise
+ */
+ bool has_valid_id() const
+ {
+ return id() != invalid_tensor_id;
+ }
/** If infos are broadcast compatible tensor info's, return the broadcasted shape and the intersection of
* the broadcasted valid regions of the tensors.
*
@@ -256,23 +329,23 @@ public:
* not broadcast compatible.
*/
template <typename... Infos>
- static std::pair<TensorShape, ValidRegion> broadcast_shape_and_valid_region(const Infos &... infos)
+ static std::pair<TensorShape, ValidRegion> broadcast_shape_and_valid_region(const Infos &...infos)
{
TensorShape bc_shape = TensorShape::broadcast_shape(infos.tensor_shape()...);
- ValidRegion bc_valid_region{ Coordinates(), bc_shape };
+ ValidRegion bc_valid_region{Coordinates(), bc_shape};
- auto broadcast_valid_region = [&bc_valid_region](const ITensorInfo & info)
+ auto broadcast_valid_region = [&bc_valid_region](const ITensorInfo &info)
{
- if(info.num_dimensions() != 0)
+ if (info.num_dimensions() != 0)
{
- for(size_t d = 0; d < bc_valid_region.shape.num_dimensions(); ++d)
+ for (size_t d = 0; d < bc_valid_region.shape.num_dimensions(); ++d)
{
const bool is_broadcast = (info.tensor_shape()[d] == 1);
const int anchor_max = std::max(bc_valid_region.anchor[d], info.valid_region().anchor[d]);
const size_t valid_min = std::min(bc_valid_region.shape[d], info.valid_region().shape[d]);
- if(!is_broadcast || (valid_min == 0))
+ if (!is_broadcast || (valid_min == 0))
{
bc_valid_region.anchor.set(d, anchor_max);
bc_valid_region.shape.set(d, valid_min);
@@ -285,6 +358,10 @@ public:
return std::pair<TensorShape, ValidRegion>(bc_shape, bc_valid_region);
}
+
+private:
+ static constexpr int32_t _dynamic_dimension = -1;
+ static constexpr int32_t _static_dimension = 0;
};
} // namespace arm_compute
#endif /*ARM_COMPUTE_TENSORINFO_H */
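
The broadcast helper retained at the end of the interface can be exercised with the concrete TensorInfo implementation (arm_compute/core/TensorInfo.h); the shapes below are illustrative:

#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/TensorShape.h"

using namespace arm_compute;

void broadcast_example()
{
    // A [1, 3] tensor broadcasts against a [4, 3] tensor along dimension 0.
    const TensorInfo a(TensorShape(1U, 3U), 1, DataType::F32);
    const TensorInfo b(TensorShape(4U, 3U), 1, DataType::F32);

    const auto result = ITensorInfo::broadcast_shape_and_valid_region(a, b);
    // result.first  == TensorShape(4U, 3U), the broadcast shape;
    // result.second is the intersection of the broadcast valid regions.
}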
diff --git a/arm_compute/core/ITensorPack.h b/arm_compute/core/ITensorPack.h
new file mode 100644
index 0000000000..f456c50769
--- /dev/null
+++ b/arm_compute/core/ITensorPack.h
@@ -0,0 +1,116 @@
+/*
+ * Copyright (c) 2020-2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_ITENSORPACK_H
+#define ARM_COMPUTE_ITENSORPACK_H
+
+#include "arm_compute/core/experimental/Types.h"
+
+#include <cstddef>
+#include <cstdint>
+#include <unordered_map>
+
+namespace arm_compute
+{
+// Forward declaration
+class ITensor;
+
+/** Tensor packing service */
+class ITensorPack
+{
+public:
+ struct PackElement
+ {
+ PackElement() = default;
+ PackElement(int id, ITensor *tensor) : id(id), tensor(tensor), ctensor(nullptr)
+ {
+ }
+ PackElement(int id, const ITensor *ctensor) : id(id), tensor(nullptr), ctensor(ctensor)
+ {
+ }
+
+ int id{-1};
+ ITensor *tensor{nullptr};
+ const ITensor *ctensor{nullptr};
+ };
+
+public:
+ /** Default Constructor */
+ ITensorPack() = default;
+ /** Initializer list Constructor */
+ ITensorPack(std::initializer_list<PackElement> l);
+ /** Add tensor to the pack
+ *
+ * @param[in] id ID/type of the tensor to add
+ * @param[in] tensor Tensor to add
+ */
+ void add_tensor(int id, ITensor *tensor);
+
+ /** Add const tensor to the pack
+ *
+ * @param[in] id ID/type of the tensor to add
+ * @param[in] tensor Tensor to add
+ */
+ void add_tensor(int id, const ITensor *tensor);
+
+ /** Add const tensor to the pack
+ *
+ * @param[in] id ID/type of the tensor to add
+ * @param[in] tensor Tensor to add
+ */
+ void add_const_tensor(int id, const ITensor *tensor);
+ /** Get tensor of a given id from the pack
+ *
+ * @param[in] id ID of tensor to extract
+ *
+ * @return The pointer to the tensor if it exists and was added as non-const, else nullptr
+ */
+ ITensor *get_tensor(int id);
+ /** Get constant tensor of a given id
+ *
+ * @param[in] id ID of tensor to extract
+ *
+ * @return The const pointer to the tensor if it exists, else nullptr
+ */
+ const ITensor *get_const_tensor(int id) const;
+ /** Remove the tensor stored with the given id
+ *
+ * @param[in] id ID of tensor to remove
+ */
+ void remove_tensor(int id);
+ /** Pack size accessor
+ *
+ * @return Number of tensors registered to the pack
+ */
+ size_t size() const;
+ /** Checks if pack is empty
+ *
+ * @return True if empty else false
+ */
+ bool empty() const;
+
+private:
+ std::unordered_map<int, PackElement> _pack{}; /**< Container with the packed tensors */
+};
+} // namespace arm_compute
+#endif /*ARM_COMPUTE_ITENSORPACK_H */
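
A short usage sketch of the new pack; the ACL_SRC/ACL_DST IDs come from the TensorType enum in arm_compute/core/experimental/Types.h (already included by this header), and the tensors are placeholders:

#include "arm_compute/core/Error.h"
#include "arm_compute/core/ITensorPack.h"
#include "arm_compute/runtime/Tensor.h"

using namespace arm_compute;

void pack_example(Tensor &src, Tensor &dst)
{
    // Register an input and an output under well-known IDs.
    ITensorPack pack{{ACL_SRC, &src}, {ACL_DST, &dst}};

    ITensor       *out = pack.get_tensor(ACL_DST);      // nullptr if the entry was added as const
    const ITensor *in  = pack.get_const_tensor(ACL_SRC); // const view of the entry

    ARM_COMPUTE_ERROR_ON(in == nullptr || out == nullptr || pack.size() != 2);
}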
diff --git a/arm_compute/core/KernelDescriptors.h b/arm_compute/core/KernelDescriptors.h
index de08288dec..168a06a55c 100644
--- a/arm_compute/core/KernelDescriptors.h
+++ b/arm_compute/core/KernelDescriptors.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019-2020 ARM Limited.
+ * Copyright (c) 2019-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -21,113 +21,139 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
-#ifndef ARM_COMPUTE_CORE_KERNEL_DESCRIPTORS_H
-#define ARM_COMPUTE_CORE_KERNEL_DESCRIPTORS_H
+#ifndef ACL_ARM_COMPUTE_CORE_KERNELDESCRIPTORS_H
+#define ACL_ARM_COMPUTE_CORE_KERNELDESCRIPTORS_H
#include "arm_compute/core/PixelValue.h"
#include "arm_compute/core/Types.h"
+#include "arm_compute/function_info/ActivationLayerInfo.h"
namespace arm_compute
{
/** Descriptor for FFT scale kernels */
struct FFTScaleKernelInfo
{
- float scale{ 0.f }; /**< Axis to perform the kernel on. */
- bool conjugate{ true }; /**< Flag to conjugate the output/ */
+ float scale{0.f}; /**< Scaling factor to apply. */
+ bool conjugate{true}; /**< Flag to conjugate the output. */
};
/** Descriptor for FFT digit reverse kernels */
struct FFTDigitReverseKernelInfo
{
- unsigned int axis{ 0 }; /**< Axis to perform the kernel on. */
- bool conjugate{ false }; /**< Flag to conjugate the output/ */
+ unsigned int axis{0}; /**< Axis to perform the kernel on. */
+ bool conjugate{false}; /**< Flag to conjugate the output. */
};
/** Descriptor used by the FFT core kernels */
struct FFTRadixStageKernelInfo
{
- unsigned int axis{ 0 }; /**< Axis to run the kernel on. */
- unsigned int radix{ 0 }; /**< Radix to use. */
- unsigned int Nx{ 0 }; /**< Nx coefficient. */
- bool is_first_stage{ false }; /**< Flags if the FFT kernels is the first stage of a decomposed FFT. */
+ unsigned int axis{0}; /**< Axis to run the kernel on. */
+ unsigned int radix{0}; /**< Radix to use. */
+ unsigned int Nx{0}; /**< Nx coefficient. */
+ bool is_first_stage{false}; /**< Flags if the FFT kernel is the first stage of a decomposed FFT. */
};
+class ITensorInfo;
/** Descriptor used by the GEMM kernels */
struct GEMMKernelInfo
{
GEMMKernelInfo() = default;
- GEMMKernelInfo(
- unsigned int im,
- unsigned int in,
- unsigned int ik,
- unsigned int idepth_output_gemm3d,
- bool ireinterpret_input_as_3d,
- bool ibroadcast_bias,
- bool ifp_mixed_precision,
- ActivationLayerInfo iactivation_info,
- int inmult_transpose1xW_width,
- int imult_interleave4x4_height,
- GEMMLHSMatrixInfo ilhs_info,
- GEMMRHSMatrixInfo irhs_info,
- int32_t ina_offset,
- int32_t inb_offset)
- : m(im), n(in), k(ik), depth_output_gemm3d(idepth_output_gemm3d), reinterpret_input_as_3d(ireinterpret_input_as_3d), broadcast_bias(ibroadcast_bias), fp_mixed_precision(ifp_mixed_precision),
- activation_info(iactivation_info), mult_transpose1xW_width(inmult_transpose1xW_width), mult_interleave4x4_height(imult_interleave4x4_height), lhs_info(ilhs_info), rhs_info(irhs_info),
- a_offset(ina_offset), b_offset(inb_offset)
+ GEMMKernelInfo(unsigned int im,
+ unsigned int in,
+ unsigned int ik,
+ unsigned int idepth_output_gemm3d,
+ bool ireinterpret_input_as_3d,
+ bool ibroadcast_bias,
+ bool ifp_mixed_precision,
+ bool ihas_pad_y,
+ ActivationLayerInfo iactivation_info,
+ int inmult_transpose1xW_width,
+ int imult_interleave4x4_height,
+ GEMMLHSMatrixInfo ilhs_info,
+ GEMMRHSMatrixInfo irhs_info,
+ int32_t ina_offset,
+ int32_t inb_offset)
+ : m(im),
+ n(in),
+ k(ik),
+ depth_output_gemm3d(idepth_output_gemm3d),
+ reinterpret_input_as_3d(ireinterpret_input_as_3d),
+ broadcast_bias(ibroadcast_bias),
+ fp_mixed_precision(ifp_mixed_precision),
+ has_pad_y(ihas_pad_y),
+ activation_info(iactivation_info),
+ mult_transpose1xW_width(inmult_transpose1xW_width),
+ mult_interleave4x4_height(imult_interleave4x4_height),
+ lhs_info(ilhs_info),
+ rhs_info(irhs_info),
+ a_offset(ina_offset),
+ b_offset(inb_offset)
{
}
- unsigned int m{ 0 }; /**< Number of LHS rows*/
- unsigned int n{ 0 }; /**< Number of RHS columns*/
- unsigned int k{ 0 }; /**< Number of LHS columns or RHS rows */
- unsigned int depth_output_gemm3d{ 0 }; /**< Depth of the output tensor in case is reinterpreted as 3D */
- bool reinterpret_input_as_3d{ false }; /**< Flag used to reinterpret the input as 3D */
- bool broadcast_bias{ false }; /**< Flag used to broadcast the bias addition */
- bool fp_mixed_precision{ false }; /**< Flag used to indicate wider accumulators (32 bit instead of 16 for FP16). */
- ActivationLayerInfo activation_info{}; /**< Activation function to perform after the matrix multiplication */
- int mult_transpose1xW_width{ 1 }; /**< Multiplication factor for the width of the 1xW transposed block */
- int mult_interleave4x4_height{ 1 }; /**< Multiplication factor for the height of the 4x4 interleaved block */
- GEMMLHSMatrixInfo lhs_info{}; /**< LHS matrix information used to retrieve the number of rows processed by each thread */
- GEMMRHSMatrixInfo rhs_info{}; /**< RHS matrix information used for reshaping the RHS matrix */
- int32_t a_offset{ 0 }; /**< Offset to be added to each element of the matrix A */
- int32_t b_offset{ 0 }; /**< Offset to be added to each element of the matrix B */
- GEMMLowpOutputStageInfo output_stage{}; /**< GEMMLowp output stage information */
+ unsigned int m{0}; /**< Number of LHS rows*/
+ unsigned int n{0}; /**< Number of RHS columns*/
+ unsigned int k{0}; /**< Number of LHS columns or RHS rows */
+ unsigned int depth_output_gemm3d{0}; /**< Depth of the output tensor in case it is reinterpreted as 3D */
+ bool reinterpret_input_as_3d{false}; /**< Flag used to reinterpret the input as 3D */
+ bool broadcast_bias{false}; /**< Flag used to broadcast the bias addition */
+ bool fp_mixed_precision{false}; /**< Flag used to indicate wider accumulators (32 bit instead of 16 for FP16). */
+ bool has_pad_y{
+ false}; /**< Flag used to indicate if the input/output tensors have internal pad on the y direction */
+ ActivationLayerInfo activation_info{}; /**< Activation function to perform after the matrix multiplication */
+ int mult_transpose1xW_width{1}; /**< Multiplication factor for the width of the 1xW transposed block */
+ int mult_interleave4x4_height{1}; /**< Multiplication factor for the height of the 4x4 interleaved block */
+ GEMMLHSMatrixInfo
+ lhs_info{}; /**< LHS matrix information used to retrieve the number of rows processed by each thread */
+ GEMMRHSMatrixInfo rhs_info{}; /**< RHS matrix information used for reshaping the RHS matrix */
+ int32_t a_offset{0}; /**< Offset to be added to each element of the matrix A */
+ int32_t b_offset{0}; /**< Offset to be added to each element of the matrix B */
+ GEMMLowpOutputStageInfo output_stage{}; /**< GEMMLowp output stage information */
};
-/** Descriptor used by the depthwise convolution kernels */
-struct DWCKernelInfo
+/** Compute descriptor used by the depthwise convolution native kernel */
+struct DWCComputeKernelInfo
{
- ActivationLayerInfo activation_info{}; /**< Activation function to perform after the depthwise convolution */
+ unsigned int n0{1}; /**< Number of columns processed by each thread */
+ unsigned int m0{1}; /**< Number of rows processed by each thread */
+ bool export_input_to_cl_image{false}; /**< Export input to cl_image */
+ bool export_weights_to_cl_image{false}; /**< Export the weights to cl_image */
};
-/** Descriptor used by the depthwise convolution kernels to retrieve the number of output elements processed by each thread */
-struct DWCWeightsKernelInfo
+/** Compute descriptor used by the direct convolution kernel */
+struct DirectConvComputeKernelInfo
{
- unsigned int n0{ 0 }; /**< Number of columns processed by each thread */
+ int32_t m0{1}; /**< Number of rows to be processed by the kernel */
+ int32_t n0{1}; /**< Number of columns to be processed by the kernel */
+ int32_t k0{1}; /**< Number of partial accumulations to be processed in a single iteration by the kernel */
+ bool export_weights_to_cl_image{false}; /**< Flag to export the weights to cl_image */
+ bool export_output_to_cl_image{false}; /**< Flag to export the output to cl_image */
+ bool export_input_to_cl_image{false}; /**< Flag to export the input to cl_image */
};
/** Descriptor used by the softmax kernels */
struct SoftmaxKernelInfo
{
- float beta{ 1.f }; /**< A scaling factor for the exponent with default value 1.0 */
- bool is_log{ false }; /**< Flag used to perform Log Softmax operation */
- DataType input_data_type{ DataType::UNKNOWN }; /**< Input tensor data type */
+ float beta{1.f}; /**< A scaling factor for the exponent with default value 1.0 */
+ bool is_log{false}; /**< Flag used to perform Log Softmax operation */
+ DataType input_data_type{DataType::UNKNOWN}; /**< Input tensor data type */
+ int32_t axis{0}; /**< The dimension in which to apply softmax. */
};
/** Descriptor used by the direct convolution layer output stage kernels */
struct DirectConvolutionLayerOutputStageKernelInfo
{
- int32_t result_fixedpoint_multiplier{ 0 }; /**< Result output stage multiplier used for quantizing */
- int32_t result_shift{ 0 }; /**< Result output stage shift used for quantizing */
- int32_t result_offset_after_shift{ 0 }; /**< Result offset used for quantizing */
- DataType output_data_type{ DataType::UNKNOWN }; /**< Output tensor data type to use if the output is not initialized */
+ int32_t result_fixedpoint_multiplier{0}; /**< Result output stage multiplier used for quantizing */
+ int32_t result_shift{0}; /**< Result output stage shift used for quantizing */
+ int32_t result_offset_after_shift{0}; /**< Result offset used for quantizing */
+ DataType output_data_type{
+ DataType::UNKNOWN}; /**< Output tensor data type to use if the output is not initialized */
};
struct InstanceNormalizationLayerKernelInfo
{
/** Default constructor */
- InstanceNormalizationLayerKernelInfo()
- : InstanceNormalizationLayerKernelInfo(1.f, 0.f, 1e-12, true)
+ InstanceNormalizationLayerKernelInfo() : InstanceNormalizationLayerKernelInfo(1.f, 0.f, 1e-12, true)
{
}
/** Constructor
@@ -164,10 +190,10 @@ struct GEMMLowpReductionKernelInfo
{
}
- int32_t k{ 0 }; /**< Number of matrix columns/rows */
- bool is_reshaped{ false }; /**< True if the input tensor has been reshaped */
- int32_t scalar{ 0 }; /**< Scalar value to multiply each reduced column/row by */
- bool mul_by_scalar{ false }; /**< True if each column/row reduction has to be multiplied by a scalar value */
+ int32_t k{0}; /**< Number of matrix columns/rows */
+ bool is_reshaped{false}; /**< True if the input tensor has been reshaped */
+ int32_t scalar{0}; /**< Scalar value to multiply each reduced column/row by */
+ bool mul_by_scalar{false}; /**< True if each column/row reduction has to be multiplied by a scalar value */
};
struct ScaleKernelInfo
@@ -180,19 +206,22 @@ struct ScaleKernelInfo
* @param[in] sampling_policy (Optional) Sampling policy used by the interpolation. Defaults to @ref SamplingPolicy::CENTER
* @param[in] use_padding (Optional) Is padding in use or not. Defaults to true.
* @param[in] align_corners (Optional) Align corners of input and output, only affecting bilinear policy with TOP_LEFT sampling policy. Defaults to false.
+ * @param[in] data_layout (Optional) Data layout used by the layer. Defaults to @ref DataLayout::UNKNOWN
*/
ScaleKernelInfo(InterpolationPolicy interpolation_policy,
BorderMode border_mode,
PixelValue constant_border_value = PixelValue(),
SamplingPolicy sampling_policy = SamplingPolicy::CENTER,
bool use_padding = true,
- bool align_corners = false)
- : interpolation_policy{ interpolation_policy },
- border_mode{ border_mode },
- constant_border_value{ constant_border_value },
- sampling_policy{ sampling_policy },
- use_padding{ use_padding },
- align_corners{ align_corners }
+ bool align_corners = false,
+ DataLayout data_layout = DataLayout::UNKNOWN) noexcept
+ : interpolation_policy{interpolation_policy},
+ border_mode{border_mode},
+ constant_border_value{constant_border_value},
+ sampling_policy{sampling_policy},
+ use_padding{use_padding},
+ align_corners{align_corners},
+ data_layout{data_layout}
{
}
@@ -202,6 +231,23 @@ struct ScaleKernelInfo
SamplingPolicy sampling_policy; /**< Sampling policy used by the interpolation. */
bool use_padding; /**< Indication of using padding */
bool align_corners; /**< Align corners of input and output */
+ DataLayout data_layout; /**< Data layout to use */
+};
+
+struct MatMulKernelInfo
+{
+ MatMulKernelInfo() = default;
+ MatMulKernelInfo(
+ bool adj_lhs, bool adj_rhs, int m0 = 1, int n0 = 1, int k0 = 1, bool export_rhs_to_cl_image = false)
+ : adj_lhs{adj_lhs}, adj_rhs{adj_rhs}, m0{m0}, n0{n0}, k0{k0}, export_rhs_to_cl_image{export_rhs_to_cl_image}
+ {
+ }
+ bool adj_lhs{false}; /**< Adjoint LHS flag */
+ bool adj_rhs{false}; /**< Adjoint RHS flag */
+ int m0{1}; /**< Number of output rows processed by each work-item */
+ int n0{1}; /**< Number of output columns processed by each work-item */
+ int k0{1}; /**< Number of inner accumulations */
+ bool export_rhs_to_cl_image{false}; /**< Flag to know whether the RHS tensor should be exported to cl_image */
};
} // namespace arm_compute
-#endif /* ARM_COMPUTE_CORE_KERNEL_DESCRIPTORS_H */
+#endif // ACL_ARM_COMPUTE_CORE_KERNELDESCRIPTORS_H
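
Construction of the new MatMul descriptor is a plain constructor call; the block sizes below are illustrative tuning values, not recommended defaults:

#include "arm_compute/core/KernelDescriptors.h"

using namespace arm_compute;

// Describe a matmul with a transposed RHS, computing 4x4 output blocks
// with 8 inner accumulations per work-item iteration.
const MatMulKernelInfo matmul_info(/* adj_lhs */ false,
                                   /* adj_rhs */ true,
                                   /* m0 */ 4,
                                   /* n0 */ 4,
                                   /* k0 */ 8,
                                   /* export_rhs_to_cl_image */ false);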
diff --git a/arm_compute/core/Log.h b/arm_compute/core/Log.h
index 1515557f4c..03b861f765 100644
--- a/arm_compute/core/Log.h
+++ b/arm_compute/core/Log.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2019 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -34,11 +34,11 @@
#define ARM_COMPUTE_CREATE_DEFAULT_CORE_LOGGER() \
do \
{ \
- if(arm_compute::logging::LoggerRegistry::get().logger("CORE") == nullptr) \
+ if (arm_compute::logging::LoggerRegistry::get().logger("CORE") == nullptr) \
{ \
arm_compute::logging::LoggerRegistry::get().create_reserved_loggers(); \
} \
- } while(false)
+ } while (false)
#else /* ARM_COMPUTE_LOGGING_ENABLED */
#define ARM_COMPUTE_CREATE_DEFAULT_CORE_LOGGER()
#endif /* ARM_COMPUTE_LOGGING_ENABLED */
@@ -53,7 +53,7 @@
{ \
ARM_COMPUTE_CREATE_DEFAULT_CORE_LOGGER(); \
ARM_COMPUTE_LOG_MSG("CORE", log_level, msg); \
- } while(false)
+ } while (false)
/** Log a message with format to the core system logger
*
@@ -66,7 +66,7 @@
{ \
ARM_COMPUTE_CREATE_DEFAULT_CORE_LOGGER(); \
ARM_COMPUTE_LOG_MSG_WITH_FORMAT("CORE", log_level, fmt, __VA_ARGS__); \
- } while(false)
+ } while (false)
/** Log a stream to the core system logger
*
@@ -78,7 +78,7 @@
{ \
ARM_COMPUTE_CREATE_DEFAULT_CORE_LOGGER(); \
ARM_COMPUTE_LOG_STREAM("CORE", log_level, ss); \
- } while(false)
+ } while (false)
/** Log information level message to the core system logger
*
@@ -89,7 +89,7 @@
{ \
ARM_COMPUTE_CREATE_DEFAULT_CORE_LOGGER(); \
ARM_COMPUTE_LOG_MSG_CORE(arm_compute::logging::LogLevel::INFO, msg); \
- } while(false)
+ } while (false)
/** Log information level formatted message to the core system logger
*
@@ -101,7 +101,7 @@
{ \
ARM_COMPUTE_CREATE_DEFAULT_CORE_LOGGER(); \
ARM_COMPUTE_LOG_MSG_WITH_FORMAT_CORE(arm_compute::logging::LogLevel::INFO, #fmt, __VA_ARGS__); \
- } while(false)
+ } while (false)
/** Log information level stream to the core system logger
*
@@ -112,6 +112,6 @@
{ \
ARM_COMPUTE_CREATE_DEFAULT_CORE_LOGGER(); \
ARM_COMPUTE_LOG_STREAM_CORE(arm_compute::logging::LogLevel::INFO, ss); \
- } while(false)
+ } while (false)
#endif /* ARM_COMPUTE_LOGGING_MACROS_H */
diff --git a/arm_compute/core/MultiImageInfo.h b/arm_compute/core/MultiImageInfo.h
deleted file mode 100644
index fcd7ba744d..0000000000
--- a/arm_compute/core/MultiImageInfo.h
+++ /dev/null
@@ -1,66 +0,0 @@
-/*
- * Copyright (c) 2016-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_MULTIIMAGEINFO_H
-#define ARM_COMPUTE_MULTIIMAGEINFO_H
-
-#include "arm_compute/core/Types.h"
-
-namespace arm_compute
-{
-/** Store the multi-planar image's metadata */
-class MultiImageInfo
-{
-public:
- /** Constructor */
- MultiImageInfo();
- /** Initialize the metadata structure with the given parameters
- *
- * @param[in] width Width of the image (in number of pixels)
- * @param[in] height Height of the image (in number of pixels)
- * @param[in] format Colour format of the image.
- */
- void init(unsigned int width, unsigned int height, Format format);
- /** Colour format of the image
- *
- * @return Colour format of the image
- */
- Format format() const;
- /** Width in pixels
- *
- * @return The width in pixels
- */
- unsigned int width() const;
- /** Height in pixels
- *
- * @return The height in pixels
- */
- unsigned int height() const;
-
-protected:
- unsigned int _width;
- unsigned int _height;
- Format _format;
-};
-}
-#endif /*ARM_COMPUTE_MULTIIMAGEINFO_H */
diff --git a/arm_compute/core/NEON/INESimpleKernel.h b/arm_compute/core/NEON/INESimpleKernel.h
deleted file mode 100644
index 5d9c1ec1e2..0000000000
--- a/arm_compute/core/NEON/INESimpleKernel.h
+++ /dev/null
@@ -1,34 +0,0 @@
-/*
- * Copyright (c) 2016-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_INESIMPLEKERNEL_H
-#define ARM_COMPUTE_INESIMPLEKERNEL_H
-
-#include "arm_compute/core/CPP/ICPPSimpleKernel.h"
-
-namespace arm_compute
-{
-/** Interface for simple NEON kernels having 1 tensor input and 1 tensor output */
-using INESimpleKernel = ICPPSimpleKernel;
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_INESIMPLEKERNEL_H */
diff --git a/arm_compute/core/NEON/NEAsymm.h b/arm_compute/core/NEON/NEAsymm.h
deleted file mode 100644
index e4f4250d16..0000000000
--- a/arm_compute/core/NEON/NEAsymm.h
+++ /dev/null
@@ -1,760 +0,0 @@
-/*
- * Copyright (c) 2017-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_NEASYMM_H
-#define ARM_COMPUTE_NEASYMM_H
-
-#include "arm_compute/core/NEON/NEMath.h"
-#include <arm_neon.h>
-
-namespace arm_compute
-{
-using qasymm8x8_t = uint8x8_t; /**< 8 bit quantized asymmetric vector with 8 elements */
-using qasymm8x8x2_t = uint8x8x2_t; /**< 8 bit quantized asymmetric vector with 16 elements */
-using qasymm8x8x3_t = uint8x8x3_t; /**< 8 bit quantized asymmetric vector with 24 elements */
-using qasymm8x8x4_t = uint8x8x4_t; /**< 8 bit quantized asymmetric vector with 32 elements */
-using qasymm8x16_t = uint8x16_t; /**< 8 bit quantized asymmetric vector with 16 elements */
-
-using qasymm8x8_signed_t = int8x8_t; /**< 8 bit quantized signed asymmetric vector with 8 elements */
-using qasymm8x8x2_signed_t = int8x8x2_t; /**< 8 bit quantized signed asymmetric vector with 16 elements */
-using qasymm8x8x3_signed_t = int8x8x3_t; /**< 8 bit quantized signed asymmetric vector with 24 elements */
-using qasymm8x8x4_signed_t = int8x8x4_t; /**< 8 bit quantized signed asymmetric vector with 32 elements */
-using qasymm8x16_signed_t = int8x16_t; /**< 8 bit quantized signed asymmetric vector with 16 elements */
-
-/** Perform a multiply-accumulate on all 16 components of a QASYMM8 vector
- *
- * vd*vs + vo
- *
- * @param[in] vd Input vector value in QASYMM8 format
- * @param[in] vs Vector multiplier in F32 format. The multiplier value must be duplicated across all four lanes.
- * @param[in] vo Vector addend in F32 format. The addend value must be duplicated across all four lanes.
- *
- * @return A 16-component vector in QASYMM8 format, saturated to fit
- */
-uint8x16_t vmlaq_qasymm8(qasymm8x16_t vd, float32x4_t vs, float32x4_t vo);
-
-/** Perform a multiply-accumulate on all 16 components of a QASYMM8_SIGNED vector
- *
- * vd*vs + vo
- *
- * @param[in] vd Input vector value in QASYMM8_SIGNED format
- * @param[in] vs Vector multiplier in F32 format. The multiplier value must be duplicated across all four lanes.
- * @param[in] vo Vector addend in F32 format. The addend value must be duplicated across all four lanes.
- *
- * @return A 16-component vector in QASYMM8_SIGNED format, saturated to fit
- */
-int8x16_t vmlaq_qasymm8_signed(qasymm8x16_signed_t vd, float32x4_t vs, float32x4_t vo);
-
-/** Performs final quantization step on 16 elements
- *
- * @tparam is_bounded_relu Specifies whether a fused bounded relu should be applied
- *
- * @param in_s32 Input to be quantized.
- * @param result_fixedpoint_multiplier Result multiplier parameter
- * @param result_shift Result shift parameter
- * @param result_offset_after_shift_s32 Result offset parameter
- * @param min_u8 Relu lower bound
- * @param max_u8 Relu upper bound
- *
- * @return Quantized values
- */
-template <bool is_bounded_relu>
-uint8x16_t finalize_quantization(int32x4x4_t &in_s32,
- int result_fixedpoint_multiplier,
- int32_t result_shift,
- int32x4_t result_offset_after_shift_s32,
- uint8x16_t min_u8,
- uint8x16_t max_u8)
-{
- const static int32x4_t zero_s32 = vdupq_n_s32(0);
-
- if(result_shift < 0)
- {
- in_s32.val[0] = vmulq_n_s32(in_s32.val[0], (1 << (-result_shift)));
- in_s32.val[1] = vmulq_n_s32(in_s32.val[1], (1 << (-result_shift)));
- in_s32.val[2] = vmulq_n_s32(in_s32.val[2], (1 << (-result_shift)));
- in_s32.val[3] = vmulq_n_s32(in_s32.val[3], (1 << (-result_shift)));
-
- in_s32.val[0] = vqrdmulhq_n_s32(in_s32.val[0], result_fixedpoint_multiplier);
- in_s32.val[1] = vqrdmulhq_n_s32(in_s32.val[1], result_fixedpoint_multiplier);
- in_s32.val[2] = vqrdmulhq_n_s32(in_s32.val[2], result_fixedpoint_multiplier);
- in_s32.val[3] = vqrdmulhq_n_s32(in_s32.val[3], result_fixedpoint_multiplier);
- }
- else
- {
- // Fixed point multiplication with vector saturating rounding doubling multiply high with scalar
- in_s32.val[0] = vqrdmulhq_n_s32(in_s32.val[0], result_fixedpoint_multiplier);
- in_s32.val[1] = vqrdmulhq_n_s32(in_s32.val[1], result_fixedpoint_multiplier);
- in_s32.val[2] = vqrdmulhq_n_s32(in_s32.val[2], result_fixedpoint_multiplier);
- in_s32.val[3] = vqrdmulhq_n_s32(in_s32.val[3], result_fixedpoint_multiplier);
-
- // Round to the nearest division by a power-of-two using result_shift_s32
- in_s32.val[0] = rounding_divide_by_pow2(in_s32.val[0], result_shift);
- in_s32.val[1] = rounding_divide_by_pow2(in_s32.val[1], result_shift);
- in_s32.val[2] = rounding_divide_by_pow2(in_s32.val[2], result_shift);
- in_s32.val[3] = rounding_divide_by_pow2(in_s32.val[3], result_shift);
- }
-
- // Add the offset terms
- in_s32.val[0] = vaddq_s32(in_s32.val[0], result_offset_after_shift_s32);
- in_s32.val[1] = vaddq_s32(in_s32.val[1], result_offset_after_shift_s32);
- in_s32.val[2] = vaddq_s32(in_s32.val[2], result_offset_after_shift_s32);
- in_s32.val[3] = vaddq_s32(in_s32.val[3], result_offset_after_shift_s32);
-
- // Saturate negative values
- in_s32.val[0] = vmaxq_s32(in_s32.val[0], zero_s32);
- in_s32.val[1] = vmaxq_s32(in_s32.val[1], zero_s32);
- in_s32.val[2] = vmaxq_s32(in_s32.val[2], zero_s32);
- in_s32.val[3] = vmaxq_s32(in_s32.val[3], zero_s32);
-
- // Convert S32 to S16
- const int16x8x2_t in_s16 =
- {
- {
- vcombine_s16(vqmovn_s32(in_s32.val[0]), vqmovn_s32(in_s32.val[1])),
- vcombine_s16(vqmovn_s32(in_s32.val[2]), vqmovn_s32(in_s32.val[3]))
- }
- };
-
- // Convert S16 to U8
- uint8x16_t out_u8 = vcombine_u8(vqmovun_s16(in_s16.val[0]), vqmovun_s16(in_s16.val[1]));
-
- if(is_bounded_relu)
- {
- out_u8 = vmaxq_u8(out_u8, min_u8);
- out_u8 = vminq_u8(out_u8, max_u8);
- }
-
- return out_u8;
-}
-
-/** Performs final quantization step on 16 elements
- *
- * @tparam is_bounded_relu Specifies whether a fused bounded relu should be applied
- *
- * @param in_s32 Input to be quantized.
- * @param result_fixedpoint_multiplier Result multiplier parameter
- * @param result_shift Result shift parameter
- * @param result_offset_after_shift_s32 Result offset parameter
- * @param min_s8 Relu lower bound
- * @param max_s8 Relu upper bound
- *
- * @return Quantized values
- */
-template <bool is_bounded_relu>
-int8x16_t finalize_quantization(int32x4x4_t &in_s32,
- int result_fixedpoint_multiplier,
- int32_t result_shift,
- int32x4_t result_offset_after_shift_s32,
- int8x16_t min_s8,
- int8x16_t max_s8)
-{
- if(result_shift < 0)
- {
- in_s32.val[0] = vmulq_n_s32(in_s32.val[0], (1 << (-result_shift)));
- in_s32.val[1] = vmulq_n_s32(in_s32.val[1], (1 << (-result_shift)));
- in_s32.val[2] = vmulq_n_s32(in_s32.val[2], (1 << (-result_shift)));
- in_s32.val[3] = vmulq_n_s32(in_s32.val[3], (1 << (-result_shift)));
-
- in_s32.val[0] = vqrdmulhq_n_s32(in_s32.val[0], result_fixedpoint_multiplier);
- in_s32.val[1] = vqrdmulhq_n_s32(in_s32.val[1], result_fixedpoint_multiplier);
- in_s32.val[2] = vqrdmulhq_n_s32(in_s32.val[2], result_fixedpoint_multiplier);
- in_s32.val[3] = vqrdmulhq_n_s32(in_s32.val[3], result_fixedpoint_multiplier);
- }
- else
- {
- // Fixed point multiplication with vector saturating rounding doubling multiply high with scalar
- in_s32.val[0] = vqrdmulhq_n_s32(in_s32.val[0], result_fixedpoint_multiplier);
- in_s32.val[1] = vqrdmulhq_n_s32(in_s32.val[1], result_fixedpoint_multiplier);
- in_s32.val[2] = vqrdmulhq_n_s32(in_s32.val[2], result_fixedpoint_multiplier);
- in_s32.val[3] = vqrdmulhq_n_s32(in_s32.val[3], result_fixedpoint_multiplier);
-
- // Round to the nearest division by a power-of-two using result_shift_s32
- in_s32.val[0] = rounding_divide_by_pow2(in_s32.val[0], result_shift);
- in_s32.val[1] = rounding_divide_by_pow2(in_s32.val[1], result_shift);
- in_s32.val[2] = rounding_divide_by_pow2(in_s32.val[2], result_shift);
- in_s32.val[3] = rounding_divide_by_pow2(in_s32.val[3], result_shift);
- }
-
- // Add the offset terms
- in_s32.val[0] = vaddq_s32(in_s32.val[0], result_offset_after_shift_s32);
- in_s32.val[1] = vaddq_s32(in_s32.val[1], result_offset_after_shift_s32);
- in_s32.val[2] = vaddq_s32(in_s32.val[2], result_offset_after_shift_s32);
- in_s32.val[3] = vaddq_s32(in_s32.val[3], result_offset_after_shift_s32);
-
- // Convert S32 to S16
- const int16x8x2_t in_s16 =
- {
- {
- vcombine_s16(vqmovn_s32(in_s32.val[0]), vqmovn_s32(in_s32.val[1])),
- vcombine_s16(vqmovn_s32(in_s32.val[2]), vqmovn_s32(in_s32.val[3]))
- }
- };
-
- // Convert S16 to S8
- int8x16_t out_s8 = vcombine_s8(vqmovn_s16(in_s16.val[0]), vqmovn_s16(in_s16.val[1]));
-
- if(is_bounded_relu)
- {
- out_s8 = vmaxq_s8(out_s8, min_s8);
- out_s8 = vminq_s8(out_s8, max_s8);
- }
-
- return out_s8;
-}
-
-/** Performs final quantization step on 16 elements for symmetric quantization
- *
- * @tparam is_bounded_relu Specifies whether a fused bounded relu should be applied
- *
- * @param in_s32 Input to be quantized.
- * @param result_fixedpoint_multiplier Result multiplier parameter
- * @param result_shift Result shift parameter
- * @param result_offset_after_shift_s32 Result offset parameter
- * @param min_s8 Relu lower bound
- * @param max_s8 Relu upper bound
- *
- * @return Quantized values
- */
-template <bool is_bounded_relu>
-inline int8x16_t finalize_quantization_symm(int32x4x4_t &in_s32,
- const int32x4x4_t &result_fixedpoint_multiplier,
- const int32x4x4_t &result_shift,
- const int32x4_t &result_offset_after_shift_s32,
- const int8x16_t &min_s8,
- const int8x16_t &max_s8)
-{
- const static int32x4_t one_s32 = vdupq_n_s32(1);
-
- // Fixed point multiplication with vector saturating rounding doubling multiply high with scalar
- int32x4x4_t res_shift_gt0 =
- {
- vqrdmulhq_s32(in_s32.val[0], result_fixedpoint_multiplier.val[0]),
- vqrdmulhq_s32(in_s32.val[1], result_fixedpoint_multiplier.val[1]),
- vqrdmulhq_s32(in_s32.val[2], result_fixedpoint_multiplier.val[2]),
- vqrdmulhq_s32(in_s32.val[3], result_fixedpoint_multiplier.val[3]),
- };
- // Round to the nearest division by a power-of-two using result_shift_s32
- res_shift_gt0.val[0] = rounding_divide_by_pow2(res_shift_gt0.val[0], result_shift.val[0]);
- res_shift_gt0.val[1] = rounding_divide_by_pow2(res_shift_gt0.val[1], result_shift.val[1]);
- res_shift_gt0.val[2] = rounding_divide_by_pow2(res_shift_gt0.val[2], result_shift.val[2]);
- res_shift_gt0.val[3] = rounding_divide_by_pow2(res_shift_gt0.val[3], result_shift.val[3]);
-
- int32x4x4_t res_shift_lt0 =
- {
- vmulq_s32(in_s32.val[0], vshlq_s32(one_s32, vnegq_s32(result_shift.val[0]))),
- vmulq_s32(in_s32.val[1], vshlq_s32(one_s32, vnegq_s32(result_shift.val[1]))),
- vmulq_s32(in_s32.val[2], vshlq_s32(one_s32, vnegq_s32(result_shift.val[2]))),
- vmulq_s32(in_s32.val[3], vshlq_s32(one_s32, vnegq_s32(result_shift.val[3]))),
- };
- res_shift_lt0.val[0] = vqrdmulhq_s32(res_shift_lt0.val[0], result_fixedpoint_multiplier.val[0]);
- res_shift_lt0.val[1] = vqrdmulhq_s32(res_shift_lt0.val[1], result_fixedpoint_multiplier.val[1]);
- res_shift_lt0.val[2] = vqrdmulhq_s32(res_shift_lt0.val[2], result_fixedpoint_multiplier.val[2]);
- res_shift_lt0.val[3] = vqrdmulhq_s32(res_shift_lt0.val[3], result_fixedpoint_multiplier.val[3]);
-
- // Select result depending on shift value
- const uint32x4x4_t mask_lt0 =
- {
-#ifdef __aarch64__
- vcltzq_s32(result_shift.val[0]),
- vcltzq_s32(result_shift.val[1]),
- vcltzq_s32(result_shift.val[2]),
- vcltzq_s32(result_shift.val[3]),
-#else //__aarch64__
- vcltq_s32(result_shift.val[0], vdupq_n_s32(0)),
- vcltq_s32(result_shift.val[1], vdupq_n_s32(0)),
- vcltq_s32(result_shift.val[2], vdupq_n_s32(0)),
- vcltq_s32(result_shift.val[3], vdupq_n_s32(0)),
-#endif //__aarch64__
- };
-
- in_s32.val[0] = vbslq_s32(mask_lt0.val[0], res_shift_lt0.val[0], res_shift_gt0.val[0]);
- in_s32.val[1] = vbslq_s32(mask_lt0.val[1], res_shift_lt0.val[1], res_shift_gt0.val[1]);
- in_s32.val[2] = vbslq_s32(mask_lt0.val[2], res_shift_lt0.val[2], res_shift_gt0.val[2]);
- in_s32.val[3] = vbslq_s32(mask_lt0.val[3], res_shift_lt0.val[3], res_shift_gt0.val[3]);
-
- // Add the offset terms
- in_s32.val[0] = vaddq_s32(in_s32.val[0], result_offset_after_shift_s32);
- in_s32.val[1] = vaddq_s32(in_s32.val[1], result_offset_after_shift_s32);
- in_s32.val[2] = vaddq_s32(in_s32.val[2], result_offset_after_shift_s32);
- in_s32.val[3] = vaddq_s32(in_s32.val[3], result_offset_after_shift_s32);
-
- // Convert S32 to S16
- const int16x8x2_t in_s16 =
- {
- {
- vcombine_s16(vqmovn_s32(in_s32.val[0]), vqmovn_s32(in_s32.val[1])),
- vcombine_s16(vqmovn_s32(in_s32.val[2]), vqmovn_s32(in_s32.val[3]))
- }
- };
-
- // Convert S16 to S8
- int8x16_t out_s8 = vcombine_s8(vqmovn_s16(in_s16.val[0]), vqmovn_s16(in_s16.val[1]));
-
- if(is_bounded_relu)
- {
- out_s8 = vmaxq_s8(out_s8, min_s8);
- out_s8 = vminq_s8(out_s8, max_s8);
- }
-
- return out_s8;
-}
-
-/** Performs final quantization step on single element
- *
- * @tparam is_bounded_relu Specifies whether a fused bounded relu should be applied
- *
- * @param[in] in_value Input to be quantized.
- * @param[in] result_fixedpoint_multiplier Result multiplier parameter
- * @param[in] result_shift Result shift parameter
- * @param[in] result_offset_after_shift_s32 Result offset parameter
- * @param[in] min_u8 Relu lower bound
- * @param[in] max_u8 Relu upper bound
- *
- * @return Quantized value
- */
-template <bool is_bounded_relu>
-inline uint8_t finalize_quantization(int32_t in_value, int result_fixedpoint_multiplier,
- int32_t result_shift, int32_t result_offset_after_shift_s32,
- uint8_t min_u8, uint8_t max_u8)
-{
- int32x4_t in_s32 = vdupq_n_s32(in_value);
-
- if(result_shift < 0)
- {
- in_value = vgetq_lane_s32(vqrdmulhq_n_s32(vmulq_n_s32(in_s32, (1 << (-result_shift))), result_fixedpoint_multiplier), 0);
- }
- else
- {
- // Fixed point multiplication with vector saturating rounding doubling multiply high with scalar
- in_value = vgetq_lane_s32(vqrdmulhq_n_s32(in_s32, result_fixedpoint_multiplier), 0);
- // Shift value by result_shift_s32
- in_value = rounding_divide_by_pow2(in_value, result_shift);
- }
-
- // Add the offset term
- in_value += result_offset_after_shift_s32;
-
- // Bound the result
- uint8_t out_u8 = static_cast<uint8_t>(std::max<int32_t>(0, std::min<int32_t>(255, in_value)));
- if(is_bounded_relu)
- {
- out_u8 = static_cast<uint8_t>(std::max(min_u8, std::min(max_u8, out_u8)));
- }
-
- return out_u8;
-}
-
-/** Performs final quantization step on single element
- *
- * @tparam is_bounded_relu Specifies whether a fused bounded relu should be applied
- *
- * @param[in] in_value Input to be quantized.
- * @param[in] result_fixedpoint_multiplier Result multiplier parameter
- * @param[in] result_shift Result shift parameter
- * @param[in] result_offset_after_shift_s32 Result offset parameter
- * @param[in] min_s8 Relu lower bound
- * @param[in] max_s8 Relu upper bound
- *
- * @return Quantized value
- */
-template <bool is_bounded_relu>
-inline int8_t finalize_quantization(int32_t in_value, int result_fixedpoint_multiplier,
- int32_t result_shift, int32_t result_offset_after_shift_s32,
- int8_t min_s8, int8_t max_s8)
-{
- int32x4_t in_s32 = vdupq_n_s32(in_value);
-
- if(result_shift < 0)
- {
- in_value = vgetq_lane_s32(vqrdmulhq_n_s32(vmulq_n_s32(in_s32, (1 << (-result_shift))), result_fixedpoint_multiplier), 0);
- }
- else
- {
- // Fixed point multiplication with vector saturating rounding doubling multiply high with scalar
- in_value = vgetq_lane_s32(vqrdmulhq_n_s32(in_s32, result_fixedpoint_multiplier), 0);
-
- // Shift value by result_shift_s32
- in_value = rounding_divide_by_pow2(in_value, result_shift);
- }
-
- // Add the offset term
- in_value += result_offset_after_shift_s32;
-
- // Bound the result
- int8_t out_s8 = static_cast<int8_t>(std::max<int32_t>(-128, std::min<int32_t>(127, in_value)));
- if(is_bounded_relu)
- {
- out_s8 = static_cast<int8_t>(std::max(min_s8, std::min(max_s8, out_s8)));
- }
-
- return out_s8;
-}
-
-/** Dequantize a neon vector holding 8 quantized values.
- *
- * @param[in] qv Input values to be dequantized.
- * @param[in] qi Quantization information to be used in the computation.
- *
- * @return Dequantized values in a neon vector
- */
-inline float32x4x2_t vdequantize(const uint8x8_t &qv, const UniformQuantizationInfo &qi)
-{
- const float scale = qi.scale;
- const int offset = qi.offset;
- const int32x4_t voffset = vdupq_n_s32(offset);
- const float32x4_t vscale = vdupq_n_f32(scale);
- const float32x4x2_t vdequantized_input =
- {
- {
- vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(vmovl_u8(qv)))), voffset)), vscale),
- vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(vmovl_u8(qv)))), voffset)), vscale),
- }
- };
- return vdequantized_input;
-}
-
-/** Dequantize a neon vector holding 8 signed quantized values.
- *
- * @param[in] qv Input values to be dequantized.
- * @param[in] qi Quantization information to be used in the computation.
- *
- * @return Dequantized values in a neon vector
- */
-inline float32x4x2_t vdequantize(const int8x8_t &qv, const UniformQuantizationInfo &qi)
-{
- const float scale = qi.scale;
- const int offset = qi.offset;
- const int32x4_t voffset = vdupq_n_s32(offset);
- const float32x4_t vscale = vdupq_n_f32(scale);
- const float32x4x2_t vdequantized_input =
- {
- {
- vmulq_f32(vcvtq_f32_s32(vsubq_s32(vmovl_s16(vget_low_s16(vmovl_s8(qv))), voffset)), vscale),
- vmulq_f32(vcvtq_f32_s32(vsubq_s32(vmovl_s16(vget_high_s16(vmovl_s8(qv))), voffset)), vscale),
- }
- };
- return vdequantized_input;
-}
-
-/** Dequantize a neon vector holding 16 quantized values.
- *
- * @param[in] qv Input values to be dequantized.
- * @param[in] qi Quantization information to be used in the computation.
- *
- * @return Dequantized values in a neon vector
- */
-inline float32x4x4_t vdequantize(const uint8x16_t &qv, const UniformQuantizationInfo &qi)
-{
- const float scale = qi.scale;
- const int offset = qi.offset;
- const int32x4_t voffset = vdupq_n_s32(offset);
- const float32x4_t vscale = vdupq_n_f32(scale);
- const float32x4x4_t vdequantized_input =
- {
- {
- vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(vmovl_u8(vget_low_u8(qv))))), voffset)), vscale),
- vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(vmovl_u8(vget_low_u8(qv))))), voffset)), vscale),
- vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(vmovl_u8(vget_high_u8(qv))))), voffset)), vscale),
- vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(vmovl_u8(vget_high_u8(qv))))), voffset)), vscale),
- }
- };
- return vdequantized_input;
-}
-
-/** Dequantize a neon vector holding 16 signed quantized values.
- *
- * @param[in] qv Input values to be dequantized.
- * @param[in] qi Quantization information to be used in the computation.
- *
- * @return Dequantized values in a neon vector
- */
-inline float32x4x4_t vdequantize(const int8x16_t &qv, const UniformQuantizationInfo &qi)
-{
- const float scale = qi.scale;
- const int offset = qi.offset;
- const int32x4_t voffset = vdupq_n_s32(offset);
- const float32x4_t vscale = vdupq_n_f32(scale);
- const float32x4x4_t vdequantized_input =
- {
- {
- vmulq_f32(vcvtq_f32_s32(vsubq_s32(vmovl_s16(vget_low_s16(vmovl_s8(vget_low_s8(qv)))), voffset)), vscale),
- vmulq_f32(vcvtq_f32_s32(vsubq_s32(vmovl_s16(vget_high_s16(vmovl_s8(vget_low_s8(qv)))), voffset)), vscale),
- vmulq_f32(vcvtq_f32_s32(vsubq_s32(vmovl_s16(vget_low_s16(vmovl_s8(vget_high_s8(qv)))), voffset)), vscale),
- vmulq_f32(vcvtq_f32_s32(vsubq_s32(vmovl_s16(vget_high_s16(vmovl_s8(vget_high_s8(qv)))), voffset)), vscale),
- }
- };
- return vdequantized_input;
-}
-
-/** Dequantize a neon vector holding 16 quantized values, following an asymmetric quantization scheme.
- *
- * @param[in] qv Input values to be dequantized.
- * @param[in] scale Quantization scaling factor.
- * @param[in] offset Zero quantization offset.
- *
- * @return Dequantized values in a neon vector
- */
-inline float32x4x4_t vdequantize(const uint8x16_t &qv, float scale, int32_t offset)
-{
- const int32x4_t voffset = vdupq_n_s32(offset);
- const float32x4_t vscale = vdupq_n_f32(scale);
- const float32x4x4_t vdequantized_input =
- {
- {
- vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(vmovl_u8(vget_low_u8(qv))))), voffset)), vscale),
- vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(vmovl_u8(vget_low_u8(qv))))), voffset)), vscale),
- vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(vmovl_u8(vget_high_u8(qv))))), voffset)), vscale),
- vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(vmovl_u8(vget_high_u8(qv))))), voffset)), vscale),
- }
- };
- return vdequantized_input;
-}
-
-/** Dequantize a neon vector holding 16 signed asymmetric quantized values.
- *
- * @param[in] qv Input values to be dequantized.
- * @param[in] scale Quantization scaling factor.
- * @param[in] offset Zero quantization offset.
- *
- * @return Dequantized values in a neon vector
- */
-inline float32x4x4_t vdequantize(const int8x16_t &qv, float scale, int32_t offset)
-{
- const int32x4_t voffset = vdupq_n_s32(offset);
- const float32x4_t vscale = vdupq_n_f32(scale);
- const float32x4x4_t vdequantized_input =
- {
- {
- vmulq_f32(vcvtq_f32_s32(vsubq_s32(vmovl_s16(vget_low_s16(vmovl_s8(vget_low_s8(qv)))), voffset)), vscale),
- vmulq_f32(vcvtq_f32_s32(vsubq_s32(vmovl_s16(vget_high_s16(vmovl_s8(vget_low_s8(qv)))), voffset)), vscale),
- vmulq_f32(vcvtq_f32_s32(vsubq_s32(vmovl_s16(vget_low_s16(vmovl_s8(vget_high_s8(qv)))), voffset)), vscale),
- vmulq_f32(vcvtq_f32_s32(vsubq_s32(vmovl_s16(vget_high_s16(vmovl_s8(vget_high_s8(qv)))), voffset)), vscale),
- }
- };
- return vdequantized_input;
-}
-
-/** Dequantize a neon vector holding 16 quantized values, following a symmetric quantization scheme.
- *
- * @param[in] qv Input values to be dequantized.
- * @param[in] vscale Vector containing quantization scaling factors.
- *
- * @return Dequantized values in a neon vector
- */
-inline float32x4x4_t vdequantize(const int8x16_t &qv, const float32x4x4_t vscale)
-{
- const float32x4x4_t vdequantized_input =
- {
- {
- vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(vmovl_s8(vget_low_s8(qv))))), vscale.val[0]),
- vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(vmovl_s8(vget_low_s8(qv))))), vscale.val[1]),
- vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(vmovl_s8(vget_high_s8(qv))))), vscale.val[2]),
- vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(vmovl_s8(vget_high_s8(qv))))), vscale.val[3]),
- }
- };
- return vdequantized_input;
-}
-
-/** Dequantize a neon vector holding 16 quantized values, following a symmetric quantization scheme.
- *
- * @param[in] qv Input values to be dequantized.
- * @param[in] scale Quantization scaling factor.
- *
- * @return Dequantized values in a neon vector
- */
-inline float32x4x4_t vdequantize(const int8x16_t &qv, float scale)
-{
- const float32x4_t vscale = vdupq_n_f32(scale);
- const float32x4x4_t vdequantized_input =
- {
- {
- vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(vmovl_s8(vget_low_s8(qv))))), vscale),
- vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(vmovl_s8(vget_low_s8(qv))))), vscale),
- vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(vmovl_s8(vget_high_s8(qv))))), vscale),
- vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(vmovl_s8(vget_high_s8(qv))))), vscale),
- }
- };
- return vdequantized_input;
-}
-
-/** Quantize a neon vector holding 8 floating point values.
- *
- * @param[in] qv Input values to be quantized.
- * @param[in] qi Quantization information to be used in the computation.
- *
- * @return A neon vector holding the quantized values
- */
-inline uint8x8_t vquantize(const float32x4x2_t &qv, const UniformQuantizationInfo &qi)
-{
- const float scale = qi.scale;
- const int offset = qi.offset;
- const float32x4_t voffset = vdupq_n_f32(offset);
- const float32x4_t vinvscale = vdupq_n_f32(1.f / scale);
- const int32x4x4_t rf =
- {
- {
-#ifdef __aarch64__
- vcvtnq_s32_f32(vmlaq_f32(voffset, qv.val[0], vinvscale)),
- vcvtnq_s32_f32(vmlaq_f32(voffset, qv.val[1], vinvscale)),
-#else //__aarch64__
- vcvtq_s32_f32(vmlaq_f32(voffset, qv.val[0], vinvscale)),
- vcvtq_s32_f32(vmlaq_f32(voffset, qv.val[1], vinvscale)),
-#endif //__aarch64__
- }
- };
- return vqmovun_s16(vcombine_s16(vqmovn_s32(rf.val[0]), vqmovn_s32(rf.val[1])));
-}
-
-/** Quantize a neon vector holding 8 floating point values.
- *
- * @param[in] qv Input values to be quantized.
- * @param[in] qi Quantization information to be used in the computation.
- *
- * @return A neon vector holding the signed quantized values
- */
-inline int8x8_t vquantize_signed(const float32x4x2_t &qv, const UniformQuantizationInfo &qi)
-{
- const float scale = qi.scale;
- const int offset = qi.offset;
- const float32x4_t voffset = vdupq_n_f32(offset);
- const float32x4_t vinvscale = vdupq_n_f32(1.f / scale);
- const int32x4x4_t rf =
- {
- {
-#ifdef __aarch64__
- vcvtnq_s32_f32(vmlaq_f32(voffset, qv.val[0], vinvscale)),
- vcvtnq_s32_f32(vmlaq_f32(voffset, qv.val[1], vinvscale)),
-#else //__aarch64__
- vcvtq_s32_f32(vmlaq_f32(voffset, qv.val[0], vinvscale)),
- vcvtq_s32_f32(vmlaq_f32(voffset, qv.val[1], vinvscale)),
-#endif //__aarch64__
- }
- };
- return vqmovn_s16(vcombine_s16(vqmovn_s32(rf.val[0]), vqmovn_s32(rf.val[1])));
-}
-
-/** Quantize a neon vector holding 16 floating point values.
- *
- * @param[in] qv Input values to be quantized.
- * @param[in] qi Quantization information to be used in the computation.
- *
- * @return A neon vector holding the quantized values
- */
-inline uint8x16_t vquantize(const float32x4x4_t &qv, const UniformQuantizationInfo &qi)
-{
- const float scale = qi.scale;
- const int offset = qi.offset;
- const float32x4_t voffset = vdupq_n_f32(offset);
- const float32x4_t vinvscale = vdupq_n_f32(1.f / scale);
- const int32x4x4_t rf =
- {
- {
-#ifdef __aarch64__
- vcvtnq_s32_f32(vmlaq_f32(voffset, qv.val[0], vinvscale)),
- vcvtnq_s32_f32(vmlaq_f32(voffset, qv.val[1], vinvscale)),
- vcvtnq_s32_f32(vmlaq_f32(voffset, qv.val[2], vinvscale)),
- vcvtnq_s32_f32(vmlaq_f32(voffset, qv.val[3], vinvscale)),
-#else //__aarch64__
- vcvtq_s32_f32(vmlaq_f32(voffset, qv.val[0], vinvscale)),
- vcvtq_s32_f32(vmlaq_f32(voffset, qv.val[1], vinvscale)),
- vcvtq_s32_f32(vmlaq_f32(voffset, qv.val[2], vinvscale)),
- vcvtq_s32_f32(vmlaq_f32(voffset, qv.val[3], vinvscale)),
-#endif //__aarch64__
- }
- };
- const uint8x8_t pa = vqmovun_s16(vcombine_s16(vqmovn_s32(rf.val[0]), vqmovn_s32(rf.val[1])));
- const uint8x8_t pb = vqmovun_s16(vcombine_s16(vqmovn_s32(rf.val[2]), vqmovn_s32(rf.val[3])));
- return vcombine_u8(pa, pb);
-}
-
-/** Quantize a neon vector holding 16 floating point values into signed 8-bit values.
- *
- * @param[in] qv Input values to be quantized.
- * @param[in] qi Quantization information to be used in the computation.
- *
- * @return A neon vector holding the quantized values
- */
-inline int8x16_t vquantize_signed(const float32x4x4_t &qv, const UniformQuantizationInfo &qi)
-{
- const float scale = qi.scale;
- const int offset = qi.offset;
- const float32x4_t voffset = vdupq_n_f32(offset);
- const float32x4_t vinvscale = vdupq_n_f32(1.f / scale);
- const int32x4x4_t rf =
- {
- {
-#ifdef __aarch64__
- vcvtnq_s32_f32(vmlaq_f32(voffset, qv.val[0], vinvscale)),
- vcvtnq_s32_f32(vmlaq_f32(voffset, qv.val[1], vinvscale)),
- vcvtnq_s32_f32(vmlaq_f32(voffset, qv.val[2], vinvscale)),
- vcvtnq_s32_f32(vmlaq_f32(voffset, qv.val[3], vinvscale)),
-#else //__aarch64__
- vcvtq_s32_f32(vmlaq_f32(voffset, qv.val[0], vinvscale)),
- vcvtq_s32_f32(vmlaq_f32(voffset, qv.val[1], vinvscale)),
- vcvtq_s32_f32(vmlaq_f32(voffset, qv.val[2], vinvscale)),
- vcvtq_s32_f32(vmlaq_f32(voffset, qv.val[3], vinvscale)),
-#endif //__aarch64__
- }
- };
- const int8x8_t pa = vqmovn_s16(vcombine_s16(vqmovn_s32(rf.val[0]), vqmovn_s32(rf.val[1])));
- const int8x8_t pb = vqmovn_s16(vcombine_s16(vqmovn_s32(rf.val[2]), vqmovn_s32(rf.val[3])));
- return vcombine_s8(pa, pb);
-}
-
-/** Quantize to QASYMM16 a neon vector holding 16 floating point values.
- *
- * @param[in] qv Input values to be quantized.
- * @param[in] qi Quantization information to be used in the computation.
- *
- * @return A neon vector holding the quantized values
- */
-inline uint16x8x2_t vquantize_qasymm16(const float32x4x4_t &qv, const UniformQuantizationInfo &qi)
-{
- const float scale = qi.scale;
- const int offset = qi.offset;
- const float32x4_t voffset = vdupq_n_f32(offset);
- const float32x4_t vinvscale = vdupq_n_f32(1.f / scale);
- const int32x4x4_t rf =
- {
- {
-#ifdef __aarch64__
- vcvtnq_s32_f32(vmlaq_f32(voffset, qv.val[0], vinvscale)),
- vcvtnq_s32_f32(vmlaq_f32(voffset, qv.val[1], vinvscale)),
- vcvtnq_s32_f32(vmlaq_f32(voffset, qv.val[2], vinvscale)),
- vcvtnq_s32_f32(vmlaq_f32(voffset, qv.val[3], vinvscale)),
-#else //__aarch64__
- vcvtq_s32_f32(vmlaq_f32(voffset, qv.val[0], vinvscale)),
- vcvtq_s32_f32(vmlaq_f32(voffset, qv.val[1], vinvscale)),
- vcvtq_s32_f32(vmlaq_f32(voffset, qv.val[2], vinvscale)),
- vcvtq_s32_f32(vmlaq_f32(voffset, qv.val[3], vinvscale)),
-#endif //__aarch64__
- }
- };
- const uint16x8_t pa = vcombine_u16(vqmovun_s32(rf.val[0]), vqmovun_s32(rf.val[1]));
- const uint16x8_t pb = vcombine_u16(vqmovun_s32(rf.val[2]), vqmovun_s32(rf.val[3]));
- return { pa, pb };
-}
-} // namespace arm_compute
-#include "arm_compute/core/NEON/NEAsymm.inl"
-#endif // ARM_COMPUTE_NEASYMM_H
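
A scalar model of the requantization that finalize_quantization vectorizes may help when reading the NEON code above. The helpers below follow the standard gemmlowp-style definitions of the saturating rounding doubling multiply-high (what vqrdmulhq_n_s32 computes) and the rounding divide by a power of two; they are a sketch under that assumption, not the library's implementation:

    #include <algorithm>
    #include <cmath>
    #include <cstdint>

    // Scalar model of vqrdmulhq_n_s32: saturating rounding doubling
    // multiply returning the high 32 bits, i.e. round(a * b / 2^31).
    int32_t srdh_mul(int32_t a, int32_t b)
    {
        if (a == INT32_MIN && b == INT32_MIN)
            return INT32_MAX; // the single overflowing case saturates
        const int64_t ab    = static_cast<int64_t>(a) * b;
        const int64_t nudge = ab >= 0 ? (1 << 30) : 1 - (1 << 30);
        return static_cast<int32_t>((ab + nudge) / (INT64_C(1) << 31));
    }

    // Scalar model of rounding_divide_by_pow2: x / 2^exponent, rounded
    // to nearest with ties away from zero (exponent >= 0).
    int32_t round_div_pow2(int32_t x, int exponent)
    {
        const int32_t mask      = (int32_t(1) << exponent) - 1;
        const int32_t remainder = x & mask;
        const int32_t threshold = (mask >> 1) + (x < 0 ? 1 : 0);
        return (x >> exponent) + (remainder > threshold ? 1 : 0);
    }

    // One lane of the shift >= 0 path of finalize_quantization:
    // multiply, shift, add offset, clamp to the U8 range.
    uint8_t requantize_u8(int32_t acc, int32_t multiplier, int32_t shift, int32_t offset)
    {
        const int32_t v = round_div_pow2(srdh_mul(acc, multiplier), shift) + offset;
        return static_cast<uint8_t>(std::max(0, std::min(255, v)));
    }

    // Scalar model of vdequantize / vquantize for one U8 lane with
    // UniformQuantizationInfo {scale, offset}. Note: on AArch64 vcvtnq
    // rounds ties to even, while std::lround rounds ties away from zero,
    // so the two can differ by one on exact halves.
    float dequantize_lane(uint8_t q, float scale, int32_t offset)
    {
        return scale * static_cast<float>(static_cast<int32_t>(q) - offset);
    }

    uint8_t quantize_lane(float v, float scale, int32_t offset)
    {
        const int32_t q = static_cast<int32_t>(std::lround(v / scale)) + offset;
        return static_cast<uint8_t>(std::max(0, std::min(255, q)));
    }
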
diff --git a/arm_compute/core/NEON/NEAsymm.inl b/arm_compute/core/NEON/NEAsymm.inl
deleted file mode 100644
index 71205e0403..0000000000
--- a/arm_compute/core/NEON/NEAsymm.inl
+++ /dev/null
@@ -1,92 +0,0 @@
-/*
- * Copyright (c) 2017-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-namespace arm_compute
-{
-inline qasymm8x16_t vmlaq_qasymm8(qasymm8x16_t vd, float32x4_t vs, float32x4_t vo)
-{
- // Convert uint8 vectors to uint16 vectors
- const uint8x8_t vd_low = vget_low_u8(vd);
- const uint8x8_t vd_high = vget_high_u8(vd);
- uint16x8_t vd_low_u16x8 = vmovl_u8(vd_low);
- uint16x8_t vd_high_u16x8 = vmovl_u8(vd_high);
- // Convert uint16 vectors to uint32 vectors
- uint32x4_t A_u32x4 = vmovl_u16(vget_low_u16(vd_low_u16x8));
- uint32x4_t B_u32x4 = vmovl_u16(vget_high_u16(vd_low_u16x8));
- uint32x4_t C_u32x4 = vmovl_u16(vget_low_u16(vd_high_u16x8));
- uint32x4_t D_u32x4 = vmovl_u16(vget_high_u16(vd_high_u16x8));
- // Convert uint32 vectors to float32 vectors
- float32x4_t A_f32x4 = vcvtq_f32_u32(A_u32x4);
- float32x4_t B_f32x4 = vcvtq_f32_u32(B_u32x4);
- float32x4_t C_f32x4 = vcvtq_f32_u32(C_u32x4);
- float32x4_t D_f32x4 = vcvtq_f32_u32(D_u32x4);
- // vd = vd*vs + vo
- A_f32x4 = vmlaq_f32(vo, A_f32x4, vs);
- B_f32x4 = vmlaq_f32(vo, B_f32x4, vs);
- C_f32x4 = vmlaq_f32(vo, C_f32x4, vs);
- D_f32x4 = vmlaq_f32(vo, D_f32x4, vs);
- // Convert float32 vectors to uint32 vectors
- A_u32x4 = vcvtq_u32_f32(A_f32x4);
- B_u32x4 = vcvtq_u32_f32(B_f32x4);
- C_u32x4 = vcvtq_u32_f32(C_f32x4);
- D_u32x4 = vcvtq_u32_f32(D_f32x4);
- // Convert uint32 vectors to uint16 vectors (with saturation)
- vd_low_u16x8 = vcombine_u16(vqmovn_u32(A_u32x4), vqmovn_u32(B_u32x4));
- vd_high_u16x8 = vcombine_u16(vqmovn_u32(C_u32x4), vqmovn_u32(D_u32x4));
- // convert uint16 vectors to uint8 vectors (with saturation)
- return vcombine_u8(vqmovn_u16(vd_low_u16x8), vqmovn_u16(vd_high_u16x8));
-}
-inline qasymm8x16_signed_t vmlaq_qasymm8_signed(qasymm8x16_signed_t vd, float32x4_t vs, float32x4_t vo)
-{
- // Convert int8 vectors to int16 vectors
- const int8x8_t vd_low = vget_low_s8(vd);
- const int8x8_t vd_high = vget_high_s8(vd);
- int16x8_t vd_low_s16x8 = vmovl_s8(vd_low);
- int16x8_t vd_high_s16x8 = vmovl_s8(vd_high);
- // Convert int16 vectors to int32 vectors
- int32x4_t A_s32x4 = vmovl_s16(vget_low_s16(vd_low_s16x8));
- int32x4_t B_s32x4 = vmovl_s16(vget_high_s16(vd_low_s16x8));
- int32x4_t C_s32x4 = vmovl_s16(vget_low_s16(vd_high_s16x8));
- int32x4_t D_s32x4 = vmovl_s16(vget_high_s16(vd_high_s16x8));
- // Convert int32 vectors to float32 vectors
- float32x4_t A_f32x4 = vcvtq_f32_s32(A_s32x4);
- float32x4_t B_f32x4 = vcvtq_f32_s32(B_s32x4);
- float32x4_t C_f32x4 = vcvtq_f32_s32(C_s32x4);
- float32x4_t D_f32x4 = vcvtq_f32_s32(D_s32x4);
- // vd = vd*vs + vo
- A_f32x4 = vmlaq_f32(vo, A_f32x4, vs);
- B_f32x4 = vmlaq_f32(vo, B_f32x4, vs);
- C_f32x4 = vmlaq_f32(vo, C_f32x4, vs);
- D_f32x4 = vmlaq_f32(vo, D_f32x4, vs);
- // Convert float32 vectors to int32 vectors
- A_s32x4 = vcvtq_s32_f32(A_f32x4);
- B_s32x4 = vcvtq_s32_f32(B_f32x4);
- C_s32x4 = vcvtq_s32_f32(C_f32x4);
- D_s32x4 = vcvtq_s32_f32(D_f32x4);
- // Convert int32 vectors to int16 vectors (with saturation)
- vd_low_s16x8 = vcombine_s16(vqmovn_s32(A_s32x4), vqmovn_s32(B_s32x4));
- vd_high_s16x8 = vcombine_s16(vqmovn_s32(C_s32x4), vqmovn_s32(D_s32x4));
- // convert int16 vectors to int8 vectors (with saturation)
- return vcombine_s8(vqmovn_s16(vd_low_s16x8), vqmovn_s16(vd_high_s16x8));
-}
-} // namespace arm_compute
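
The widen-to-float, multiply-accumulate, narrow-with-saturation pipeline above reduces, per lane, to the scalar sketch below (an illustration of the semantics, not a replacement for the vector code):

    #include <cstdint>

    // Scalar model of one lane of vmlaq_qasymm8: widen to float, compute
    // vd*vs + vo, then convert back with the same semantics as the NEON
    // path (vcvtq_u32_f32 truncates toward zero and clamps negatives to 0;
    // the u32 -> u16 -> u8 narrowing steps saturate).
    uint8_t mla_qasymm8_lane(uint8_t vd, float vs, float vo)
    {
        const float f = static_cast<float>(vd) * vs + vo;
        if (f <= 0.0f)
            return 0;   // negative results clamp to 0
        if (f >= 255.0f)
            return 255; // large results saturate at 255
        return static_cast<uint8_t>(f); // conversion truncates toward zero
    }
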
diff --git a/arm_compute/core/NEON/NEColorConvertHelper.inl b/arm_compute/core/NEON/NEColorConvertHelper.inl
deleted file mode 100644
index 2cf52e58d2..0000000000
--- a/arm_compute/core/NEON/NEColorConvertHelper.inl
+++ /dev/null
@@ -1,1045 +0,0 @@
-/*
- * Copyright (c) 2016-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/core/Error.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/IMultiImage.h"
-#include "arm_compute/core/NEON/NEMath.h"
-#include "arm_compute/core/Utils.h"
-
-#include <arm_neon.h>
-
-namespace
-{
-#ifndef DOXYGEN_SKIP_THIS
-constexpr float red_coef_bt709 = 1.5748F;
-constexpr float green_coef_bt709 = -0.1873f;
-constexpr float green_coef2_bt709 = -0.4681f;
-constexpr float blue_coef_bt709 = 1.8556f;
-
-constexpr float rgb2yuv_bt709_kr = 0.2126f;
-constexpr float rgb2yuv_bt709_kb = 0.0722f;
-// K_g = 1 - K_r - K_b
-constexpr float rgb2yuv_bt709_kg = 0.7152f;
-// C_u = 1 / (2 * (1 - K_b))
-constexpr float rgb2yuv_bt709_cu = 0.5389f;
-// C_v = 1 / (2 * (1 - K_r))
-constexpr float rgb2yuv_bt709_cv = 0.6350f;
-
-constexpr float rgb2u8_red_coef = 0.2126f;
-constexpr float rgb2u8_green_coef = 0.7152f;
-constexpr float rgb2u8_blue_coef = 0.0722f;
-
-inline float32x4_t rgb_to_greyscale_calculation(const float32x4_t &rcolor, const float32x4_t &gcolor, const float32x4_t &bcolor,
- const float rcoef, const float gcoef, const float bcoef)
-{
- float32x4_t greyscale = vmulq_n_f32(rcolor, rcoef);
- greyscale = vmlaq_n_f32(greyscale, gcolor, gcoef);
- greyscale = vmlaq_n_f32(greyscale, bcolor, bcoef);
- return greyscale;
-}
-
-inline void rgb_to_u8_conversion(const uint8x16x3_t &in, uint8x16_t &out)
-{
- float32x4x4_t out_float32;
-
- // Convert the three (R, G, B) uint8x16_t vectors to three sets of four float32x4_t vectors
- const float32x4x4_t r_float32 = arm_compute::convert_uint8x16_to_float32x4x4(in.val[0]);
- const float32x4x4_t g_float32 = arm_compute::convert_uint8x16_to_float32x4x4(in.val[1]);
- const float32x4x4_t b_float32 = arm_compute::convert_uint8x16_to_float32x4x4(in.val[2]);
-
- // New greyscale image = (RED_COEFF * R) + (GREEN_COEFF * G) + (BLUE_COEFF * B)
- // Compute the four greyscale float32x4_t vectors from the three RGB sets
- out_float32.val[0] = rgb_to_greyscale_calculation(r_float32.val[0], g_float32.val[0], b_float32.val[0],
- rgb2u8_red_coef, rgb2u8_green_coef, rgb2u8_blue_coef);
-
- out_float32.val[1] = rgb_to_greyscale_calculation(r_float32.val[1], g_float32.val[1], b_float32.val[1],
- rgb2u8_red_coef, rgb2u8_green_coef, rgb2u8_blue_coef);
-
- out_float32.val[2] = rgb_to_greyscale_calculation(r_float32.val[2], g_float32.val[2], b_float32.val[2],
- rgb2u8_red_coef, rgb2u8_green_coef, rgb2u8_blue_coef);
-
- out_float32.val[3] = rgb_to_greyscale_calculation(r_float32.val[3], g_float32.val[3], b_float32.val[3],
- rgb2u8_red_coef, rgb2u8_green_coef, rgb2u8_blue_coef);
-
- // Convert the four greyscale float32x4_t vectors back to one uint8x16_t
- arm_compute::convert_float32x4x4_to_uint8x16(out_float32, out);
-}
-
-inline void rgb_to_yuv_calculation(const float32x4_t &rvec, const float32x4_t &gvec, const float32x4_t &bvec,
- float32x4_t &yvec, float32x4_t &uvec, float32x4_t &vvec)
-{
- /*
- Y'= 0.2126*R' + 0.7152*G' + 0.0722*B'
- U'=-0.1146*R' - 0.3854*G' + 0.5000*B'
- V'= 0.5000*R' - 0.4542*G' - 0.0458*B'
- */
- const auto c128 = vdupq_n_f32(128.f);
-
- // Y = R * K_r + G * (1 - K_r - K_b) + B * K_b
- yvec = vmulq_n_f32(rvec, rgb2yuv_bt709_kr);
- yvec = vmlaq_n_f32(yvec, gvec, rgb2yuv_bt709_kg);
- yvec = vmlaq_n_f32(yvec, bvec, rgb2yuv_bt709_kb);
-
- // U = (B - Y) / (2 * (1 - K_b))
- uvec = vsubq_f32(bvec, yvec);
- uvec = vmlaq_n_f32(c128, uvec, rgb2yuv_bt709_cu);
-
- // V = (R - Y) / (2 * (1 - K_r))
- vvec = vsubq_f32(rvec, yvec);
- vvec = vmlaq_n_f32(c128, vvec, rgb2yuv_bt709_cv);
-}
-
-inline void yuyv_to_rgb_calculation(const float32x4_t &yvec_val, float32x4_t uvec_val, const float32x4_t &yyvec_val,
- float32x4_t vvec_val, unsigned char *output_ptr, const bool alpha)
-{
- float32x4x3_t rgb1, rgb2;
-
- // Compute: cb - 128 and cr - 128;
- const auto c128 = vdupq_n_f32(128.f);
- uvec_val = vsubq_f32(uvec_val, c128);
- vvec_val = vsubq_f32(vvec_val, c128);
-
- // Compute:
- // r = 0.0000f*f_u + 1.5748f*f_v;
- // g = -0.1873f*f_u - 0.4681f*f_v;
- // b = 1.8556f*f_u + 0.0000f*f_v;
- const auto red = vmulq_n_f32(vvec_val, red_coef_bt709);
- const auto blue = vmulq_n_f32(uvec_val, blue_coef_bt709);
- const auto green = vaddq_f32(vmulq_n_f32(uvec_val, green_coef_bt709),
- vmulq_n_f32(vvec_val, green_coef2_bt709));
-
- // Compute the final r,g,b values using y1 for the first texel and y2 for the second one.
- // the result is stored in two float32x4x3_t which then are converted to one uint8x8x3_t
- // and written back to memory using vst3 instruction
-
- rgb1.val[0] = vaddq_f32(yvec_val, red);
- rgb1.val[1] = vaddq_f32(yvec_val, green);
- rgb1.val[2] = vaddq_f32(yvec_val, blue);
-
- rgb2.val[0] = vaddq_f32(yyvec_val, red);
- rgb2.val[1] = vaddq_f32(yyvec_val, green);
- rgb2.val[2] = vaddq_f32(yyvec_val, blue);
-
- uint8x8x3_t u8_rgb;
- arm_compute::convert_float32x4x3_to_uint8x8x3(rgb1, rgb2, u8_rgb);
-
- if(!alpha)
- {
- vst3_lane_u8(&output_ptr[0], u8_rgb, 0);
- vst3_lane_u8(&output_ptr[3], u8_rgb, 4);
- vst3_lane_u8(&output_ptr[6], u8_rgb, 1);
- vst3_lane_u8(&output_ptr[9], u8_rgb, 5);
- vst3_lane_u8(&output_ptr[12], u8_rgb, 2);
- vst3_lane_u8(&output_ptr[15], u8_rgb, 6);
- vst3_lane_u8(&output_ptr[18], u8_rgb, 3);
- vst3_lane_u8(&output_ptr[21], u8_rgb, 7);
- }
- else
- {
- uint8x8x4_t u8_rgba;
- u8_rgba.val[0] = u8_rgb.val[0];
- u8_rgba.val[1] = u8_rgb.val[1];
- u8_rgba.val[2] = u8_rgb.val[2];
- u8_rgba.val[3] = vdup_n_u8(255);
- vst4_lane_u8(&output_ptr[0], u8_rgba, 0);
- vst4_lane_u8(&output_ptr[4], u8_rgba, 4);
- vst4_lane_u8(&output_ptr[8], u8_rgba, 1);
- vst4_lane_u8(&output_ptr[12], u8_rgba, 5);
- vst4_lane_u8(&output_ptr[16], u8_rgba, 2);
- vst4_lane_u8(&output_ptr[20], u8_rgba, 6);
- vst4_lane_u8(&output_ptr[24], u8_rgba, 3);
- vst4_lane_u8(&output_ptr[28], u8_rgba, 7);
- }
-}
-
-inline uint8x16x3_t load_rgb(const unsigned char *const ptr, const bool alpha)
-{
- uint8x16x3_t rgb;
-
- if(alpha)
- {
- const auto tmp = vld4q_u8(ptr);
- rgb.val[0] = tmp.val[0];
- rgb.val[1] = tmp.val[1];
- rgb.val[2] = tmp.val[2];
- }
- else
- {
- rgb = vld3q_u8(ptr);
- }
-
- return rgb;
-}
-
-inline void rgb_to_yuv_conversion(uint8x16x3_t &vec_top, uint8x16x3_t &vec_bottom)
-{
- // Convert the uint8x16_t to float32x4x4_t
- const float32x4x4_t frvec_top = arm_compute::convert_uint8x16_to_float32x4x4(vec_top.val[0]);
- const float32x4x4_t fgvec_top = arm_compute::convert_uint8x16_to_float32x4x4(vec_top.val[1]);
- const float32x4x4_t fbvec_top = arm_compute::convert_uint8x16_to_float32x4x4(vec_top.val[2]);
-
- const float32x4x4_t frvec_bottom = arm_compute::convert_uint8x16_to_float32x4x4(vec_bottom.val[0]);
- const float32x4x4_t fgvec_bottom = arm_compute::convert_uint8x16_to_float32x4x4(vec_bottom.val[1]);
- const float32x4x4_t fbvec_bottom = arm_compute::convert_uint8x16_to_float32x4x4(vec_bottom.val[2]);
-
- float32x4x4_t fyvec_top, fuvec_top, fvvec_top;
- float32x4x4_t fyvec_bottom, fuvec_bottom, fvvec_bottom;
-
- for(auto i = 0; i < 4; ++i)
- {
- rgb_to_yuv_calculation(frvec_top.val[i], fgvec_top.val[i], fbvec_top.val[i],
- fyvec_top.val[i], fuvec_top.val[i], fvvec_top.val[i]);
- rgb_to_yuv_calculation(frvec_bottom.val[i], fgvec_bottom.val[i], fbvec_bottom.val[i],
- fyvec_bottom.val[i], fuvec_bottom.val[i], fvvec_bottom.val[i]);
- }
-
- arm_compute::convert_float32x4x4_to_uint8x16(fyvec_top, vec_top.val[0]);
- arm_compute::convert_float32x4x4_to_uint8x16(fuvec_top, vec_top.val[1]);
- arm_compute::convert_float32x4x4_to_uint8x16(fvvec_top, vec_top.val[2]);
- arm_compute::convert_float32x4x4_to_uint8x16(fyvec_bottom, vec_bottom.val[0]);
- arm_compute::convert_float32x4x4_to_uint8x16(fuvec_bottom, vec_bottom.val[1]);
- arm_compute::convert_float32x4x4_to_uint8x16(fvvec_bottom, vec_bottom.val[2]);
-}
-
-inline void store_rgb_to_nv12(const uint8x16_t &rvec_top, const uint8x16_t &gvec_top, const uint8x16_t &bvec_top,
- const uint8x16_t &rvec_bottom, const uint8x16_t &gvec_bottom, const uint8x16_t &bvec_bottom,
- unsigned char *const __restrict out_y_top, unsigned char *const __restrict out_y_bottom,
- unsigned char *const __restrict out_uv)
-{
- uint8x16x3_t vec_top, vec_bottom;
- vec_top.val[0] = rvec_top;
- vec_top.val[1] = gvec_top;
- vec_top.val[2] = bvec_top;
- vec_bottom.val[0] = rvec_bottom;
- vec_bottom.val[1] = gvec_bottom;
- vec_bottom.val[2] = bvec_bottom;
-
- rgb_to_yuv_conversion(vec_top, vec_bottom);
-
- vst1q_u8(out_y_top, vec_top.val[0]);
- vst1q_u8(out_y_bottom, vec_bottom.val[0]);
-
- const auto uvec = vuzpq_u8(vec_top.val[1], vec_bottom.val[1]);
- const auto vvec = vuzpq_u8(vec_top.val[2], vec_bottom.val[2]);
- const auto utmp = vrhaddq_u8(uvec.val[0], uvec.val[1]);
- const auto vtmp = vrhaddq_u8(vvec.val[0], vvec.val[1]);
-
- uint8x8x2_t uvvec;
- uvvec.val[0] = vhadd_u8(vget_low_u8(utmp), vget_high_u8(utmp));
- uvvec.val[1] = vhadd_u8(vget_low_u8(vtmp), vget_high_u8(vtmp));
-
- vst2_u8(out_uv, uvvec);
-}
-
-inline void store_rgb_to_iyuv(const uint8x16_t &rvec_top, const uint8x16_t &gvec_top, const uint8x16_t &bvec_top,
- const uint8x16_t &rvec_bottom, const uint8x16_t &gvec_bottom, const uint8x16_t &bvec_bottom,
- unsigned char *const __restrict out_y_top, unsigned char *const __restrict out_y_bottom,
- unsigned char *const __restrict out_u,
- unsigned char *const __restrict out_v)
-{
- uint8x16x3_t vec_top, vec_bottom;
- vec_top.val[0] = rvec_top;
- vec_top.val[1] = gvec_top;
- vec_top.val[2] = bvec_top;
- vec_bottom.val[0] = rvec_bottom;
- vec_bottom.val[1] = gvec_bottom;
- vec_bottom.val[2] = bvec_bottom;
-
- rgb_to_yuv_conversion(vec_top, vec_bottom);
-
- vst1q_u8(out_y_top, vec_top.val[0]);
- vst1q_u8(out_y_bottom, vec_bottom.val[0]);
-
- const auto uvvec_top = vuzpq_u8(vec_top.val[1], vec_top.val[2]);
- const auto uvvec_bottom = vuzpq_u8(vec_bottom.val[1], vec_bottom.val[2]);
- const auto uvvec = vhaddq_u8(vrhaddq_u8(uvvec_top.val[0], uvvec_top.val[1]),
- vrhaddq_u8(uvvec_bottom.val[0], uvvec_bottom.val[1]));
-
- vst1_u8(out_u, vget_low_u8(uvvec));
- vst1_u8(out_v, vget_high_u8(uvvec));
-}
-
-inline void store_rgb_to_yuv4(const uint8x16_t &rvec, const uint8x16_t &gvec, const uint8x16_t &bvec,
- unsigned char *const __restrict out_y,
- unsigned char *const __restrict out_u,
- unsigned char *const __restrict out_v)
-{
- // Convert the uint8x16_t to float32x4x4_t
- const float32x4x4_t frvec = arm_compute::convert_uint8x16_to_float32x4x4(rvec);
- const float32x4x4_t fgvec = arm_compute::convert_uint8x16_to_float32x4x4(gvec);
- const float32x4x4_t fbvec = arm_compute::convert_uint8x16_to_float32x4x4(bvec);
-
- float32x4x4_t fyvec, fuvec, fvvec;
- for(auto i = 0; i < 4; ++i)
- {
- rgb_to_yuv_calculation(frvec.val[i], fgvec.val[i], fbvec.val[i],
- fyvec.val[i], fuvec.val[i], fvvec.val[i]);
- }
-
- uint8x16_t yvec, uvec, vvec;
- arm_compute::convert_float32x4x4_to_uint8x16(fyvec, yvec);
- arm_compute::convert_float32x4x4_to_uint8x16(fuvec, uvec);
- arm_compute::convert_float32x4x4_to_uint8x16(fvvec, vvec);
-
- vst1q_u8(out_y, yvec);
- vst1q_u8(out_u, uvec);
- vst1q_u8(out_v, vvec);
-}
-#endif /* DOXYGEN_SKIP_THIS */
-}
-
-namespace arm_compute
-{
-/** Convert RGB to RGBX.
- *
- * @param[in] input Input RGB data buffer.
- * @param[out] output Output RGBX buffer.
- * @param[in] win Window for iterating the buffers.
- *
- */
-void colorconvert_rgb_to_rgbx(const void *__restrict input, void *__restrict output, const Window &win)
-{
- ARM_COMPUTE_ERROR_ON(nullptr == input);
- ARM_COMPUTE_ERROR_ON(nullptr == output);
-
- const auto input_ptr = static_cast<const IImage *__restrict>(input);
- const auto output_ptr = static_cast<IImage *__restrict>(output);
-
- Iterator in(input_ptr, win);
- Iterator out(output_ptr, win);
-
- execute_window_loop(win, [&](const Coordinates &)
- {
- const auto ta1 = vld3q_u8(in.ptr());
- uint8x16x4_t ta2;
- ta2.val[0] = ta1.val[0];
- ta2.val[1] = ta1.val[1];
- ta2.val[2] = ta1.val[2];
- ta2.val[3] = vdupq_n_u8(255);
- vst4q_u8(out.ptr(), ta2);
- },
- in, out);
-}
-
-/** Convert RGB to U8.
- *
- * @param[in] input Input RGB data buffer.
- * @param[out] output Output U8 buffer.
- * @param[in] win Window for iterating the buffers.
- *
- */
-void colorconvert_rgb_to_u8(const void *__restrict input, void *__restrict output, const Window &win)
-{
- ARM_COMPUTE_ERROR_ON(nullptr == input);
- ARM_COMPUTE_ERROR_ON(nullptr == output);
-
- const auto input_ptr = static_cast<const IImage *__restrict>(input);
- const auto output_ptr = static_cast<IImage *__restrict>(output);
-
- Iterator in(input_ptr, win);
- Iterator out(output_ptr, win);
-
- execute_window_loop(win, [&](const Coordinates &)
- {
- const auto ta1 = vld3q_u8(in.ptr());
- uint8x16_t ta2;
- rgb_to_u8_conversion(ta1, ta2);
- vst1q_u8(out.ptr(), ta2);
- },
- in, out);
-}
-
-/** Convert RGBX to RGB.
- *
- * @param[in] input Input RGBX data buffer.
- * @param[out] output Output RGB buffer.
- * @param[in] win Window for iterating the buffers.
- *
- */
-void colorconvert_rgbx_to_rgb(const void *input, void *output, const Window &win)
-{
- ARM_COMPUTE_ERROR_ON(nullptr == input);
- ARM_COMPUTE_ERROR_ON(nullptr == output);
-
- const auto input_ptr = static_cast<const IImage *__restrict>(input);
- const auto output_ptr = static_cast<IImage *__restrict>(output);
-
- Iterator in(input_ptr, win);
- Iterator out(output_ptr, win);
-
- execute_window_loop(win, [&](const Coordinates &)
- {
- const auto ta1 = vld4q_u8(in.ptr());
- uint8x16x3_t ta2;
- ta2.val[0] = ta1.val[0];
- ta2.val[1] = ta1.val[1];
- ta2.val[2] = ta1.val[2];
- vst3q_u8(out.ptr(), ta2);
- },
- in, out);
-}
-
-/** Convert YUYV to RGB.
- *
- * @param[in] input Input YUYV data buffer.
- * @param[out] output Output RGB buffer.
- * @param[in] win Window for iterating the buffers.
- *
- */
-template <bool yuyv, bool alpha>
-void colorconvert_yuyv_to_rgb(const void *__restrict input, void *__restrict output, const Window &win)
-{
- ARM_COMPUTE_ERROR_ON(nullptr == input);
- ARM_COMPUTE_ERROR_ON(nullptr == output);
-
- const auto input_ptr = static_cast<const IImage *__restrict>(input);
- const auto output_ptr = static_cast<IImage *__restrict>(output);
-
- constexpr auto element_size = alpha ? 32 : 24;
- constexpr auto shift = yuyv ? 0 : 1;
-
- Iterator in(input_ptr, win);
- Iterator out(output_ptr, win);
-
- execute_window_loop(win, [&](const Coordinates &)
- {
- const auto ta = vld4q_u8(in.ptr());
- //ta.val[0] = Y0 Y2 Y4 Y6 ...
- //ta.val[1] = U0 U2 U4 U6 ...
- //ta.val[2] = Y1 Y3 Y5 Y7 ...
- //ta.val[3] = V0 V2 V4 V6 ...
-
- // Convert the uint8x16x4_t to float32x4x4_t
- const float32x4x4_t yvec = arm_compute::convert_uint8x16_to_float32x4x4(ta.val[0 + shift]);
- const float32x4x4_t uvec = arm_compute::convert_uint8x16_to_float32x4x4(ta.val[1 - shift]);
- const float32x4x4_t yyvec = arm_compute::convert_uint8x16_to_float32x4x4(ta.val[2 + shift]);
- const float32x4x4_t vvec = arm_compute::convert_uint8x16_to_float32x4x4(ta.val[3 - shift]);
-
- yuyv_to_rgb_calculation(yvec.val[0], uvec.val[0], yyvec.val[0], vvec.val[0], out.ptr() + 0 * element_size, alpha);
- yuyv_to_rgb_calculation(yvec.val[1], uvec.val[1], yyvec.val[1], vvec.val[1], out.ptr() + 1 * element_size, alpha);
- yuyv_to_rgb_calculation(yvec.val[2], uvec.val[2], yyvec.val[2], vvec.val[2], out.ptr() + 2 * element_size, alpha);
- yuyv_to_rgb_calculation(yvec.val[3], uvec.val[3], yyvec.val[3], vvec.val[3], out.ptr() + 3 * element_size, alpha);
- },
- in, out);
-}
-
-/** Convert NV12 to RGB.
- *
- * @param[in] input Input NV12 data buffer.
- * @param[out] output Output RGB buffer.
- * @param[in] win Window for iterating the buffers.
- *
- */
-template <bool uv, bool alpha>
-void colorconvert_nv12_to_rgb(const void *__restrict input, void *__restrict output, const Window &win)
-{
- ARM_COMPUTE_ERROR_ON(nullptr == input);
- ARM_COMPUTE_ERROR_ON(nullptr == output);
- win.validate();
-
- const auto input_ptr = static_cast<const IMultiImage *__restrict>(input);
- const auto output_ptr = static_cast<IImage *__restrict>(output);
-
- constexpr auto element_size = alpha ? 32 : 24;
- const auto out_stride = output_ptr->info()->strides_in_bytes().y();
- constexpr auto shift = uv ? 0 : 1;
-
- // UV's width and height are subsampled
- Window win_uv(win);
- win_uv.set(Window::DimX, Window::Dimension(win_uv.x().start() / 2, win_uv.x().end() / 2, win.x().step() / 2));
- win_uv.set(Window::DimY, Window::Dimension(win_uv.y().start() / 2, win_uv.y().end() / 2, 1));
- win_uv.validate();
-
- Iterator in_y(input_ptr->plane(0), win);
- Iterator in_uv(input_ptr->plane(1), win_uv);
- Iterator out(output_ptr, win);
-
- execute_window_loop(win, [&](const Coordinates &)
- {
- const auto ta_y_top = vld2q_u8(in_y.ptr());
- const auto ta_y_bottom = vld2q_u8(in_y.ptr() + input_ptr->plane(0)->info()->strides_in_bytes().y());
- const auto ta_uv = vld2q_u8(in_uv.ptr());
- //ta_y.val[0] = Y0 Y2 Y4 Y6 ...
- //ta_y.val[1] = Y1 Y3 Y5 Y7 ...
- //ta_uv.val[0] = U0 U2 U4 U6 ...
- //ta_uv.val[1] = V0 V2 V4 V6 ...
-
- // Convert the uint8x16x4_t to float32x4x4_t
- float32x4x4_t yvec_top = arm_compute::convert_uint8x16_to_float32x4x4(ta_y_top.val[0]);
- float32x4x4_t yyvec_top = arm_compute::convert_uint8x16_to_float32x4x4(ta_y_top.val[1]);
- float32x4x4_t yvec_bottom = arm_compute::convert_uint8x16_to_float32x4x4(ta_y_bottom.val[0]);
- float32x4x4_t yyvec_bottom = arm_compute::convert_uint8x16_to_float32x4x4(ta_y_bottom.val[1]);
- float32x4x4_t uvec = arm_compute::convert_uint8x16_to_float32x4x4(ta_uv.val[0 + shift]);
- float32x4x4_t vvec = arm_compute::convert_uint8x16_to_float32x4x4(ta_uv.val[1 - shift]);
-
- yuyv_to_rgb_calculation(yvec_top.val[0], uvec.val[0], yyvec_top.val[0], vvec.val[0], out.ptr() + 0 * element_size, alpha);
- yuyv_to_rgb_calculation(yvec_top.val[1], uvec.val[1], yyvec_top.val[1], vvec.val[1], out.ptr() + 1 * element_size, alpha);
- yuyv_to_rgb_calculation(yvec_top.val[2], uvec.val[2], yyvec_top.val[2], vvec.val[2], out.ptr() + 2 * element_size, alpha);
- yuyv_to_rgb_calculation(yvec_top.val[3], uvec.val[3], yyvec_top.val[3], vvec.val[3], out.ptr() + 3 * element_size, alpha);
-
- yuyv_to_rgb_calculation(yvec_bottom.val[0], uvec.val[0], yyvec_bottom.val[0], vvec.val[0], out.ptr() + out_stride + 0 * element_size, alpha);
- yuyv_to_rgb_calculation(yvec_bottom.val[1], uvec.val[1], yyvec_bottom.val[1], vvec.val[1], out.ptr() + out_stride + 1 * element_size, alpha);
- yuyv_to_rgb_calculation(yvec_bottom.val[2], uvec.val[2], yyvec_bottom.val[2], vvec.val[2], out.ptr() + out_stride + 2 * element_size, alpha);
- yuyv_to_rgb_calculation(yvec_bottom.val[3], uvec.val[3], yyvec_bottom.val[3], vvec.val[3], out.ptr() + out_stride + 3 * element_size, alpha);
- },
- in_y, in_uv, out);
-}
-
-/** Convert IYUV to RGB.
- *
- * @param[in] input Input IYUV data buffer.
- * @param[out] output Output RGB buffer.
- * @param[in] win Window for iterating the buffers.
- *
- */
-template <bool alpha>
-void colorconvert_iyuv_to_rgb(const void *__restrict input, void *__restrict output, const Window &win)
-{
- ARM_COMPUTE_ERROR_ON(nullptr == input);
- ARM_COMPUTE_ERROR_ON(nullptr == output);
- win.validate();
-
- const auto input_ptr = static_cast<const IMultiImage *__restrict>(input);
- const auto output_ptr = static_cast<IImage *__restrict>(output);
-
- constexpr auto element_size = alpha ? 32 : 24;
- const auto out_stride = output_ptr->info()->strides_in_bytes().y();
-
- // UV's width and height are subsampled
- Window win_uv(win);
- win_uv.set(Window::DimX, Window::Dimension(win_uv.x().start() / 2, win_uv.x().end() / 2, win_uv.x().step() / 2));
- win_uv.set(Window::DimY, Window::Dimension(win_uv.y().start() / 2, win_uv.y().end() / 2, 1));
- win_uv.validate();
-
- Iterator in_y(input_ptr->plane(0), win);
- Iterator in_u(input_ptr->plane(1), win_uv);
- Iterator in_v(input_ptr->plane(2), win_uv);
- Iterator out(output_ptr, win);
-
- execute_window_loop(win, [&](const Coordinates &)
- {
- const auto *y_top_ptr = in_y.ptr();
- const auto *y_bottom_ptr = in_y.ptr() + input_ptr->plane(0)->info()->strides_in_bytes().y();
- const auto *u_ptr = in_u.ptr();
- const auto *v_ptr = in_v.ptr();
-
- // Work-around for an issue in GCC 9 and newer, where vld2q can lead to poor register allocation
-#if defined(__aarch64__)
- const auto ta0_y_top = vld1q_u8(y_top_ptr);
- const auto ta1_y_top = vld1q_u8(y_top_ptr + 16);
- const auto ta0_y_bottom = vld1q_u8(y_bottom_ptr);
- const auto ta1_y_bottom = vld1q_u8(y_bottom_ptr + 16);
- const auto ta_u = vld1q_u8(u_ptr);
- const auto ta_v = vld1q_u8(v_ptr);
-
- // Convert the uint8x16x4_t to float32x4x4_t
- float32x4x4_t yvec_top = arm_compute::convert_uint8x16_to_float32x4x4(vuzp1q_u8(ta0_y_top, ta1_y_top));
- float32x4x4_t yyvec_top = arm_compute::convert_uint8x16_to_float32x4x4(vuzp2q_u8(ta0_y_top, ta1_y_top));
- float32x4x4_t yvec_bottom = arm_compute::convert_uint8x16_to_float32x4x4(vuzp1q_u8(ta0_y_bottom, ta1_y_bottom));
- float32x4x4_t yyvec_bottom = arm_compute::convert_uint8x16_to_float32x4x4(vuzp2q_u8(ta0_y_bottom, ta1_y_bottom));
- float32x4x4_t uvec = arm_compute::convert_uint8x16_to_float32x4x4(ta_u);
- float32x4x4_t vvec = arm_compute::convert_uint8x16_to_float32x4x4(ta_v);
-#else /* defined(__aarch64__) */
- const auto ta_y_top = vld2q_u8(y_top_ptr);
- const auto ta_y_bottom = vld2q_u8(y_bottom_ptr);
- const auto ta_u = vld1q_u8(u_ptr);
- const auto ta_v = vld1q_u8(v_ptr);
- //ta_y.val[0] = Y0 Y2 Y4 Y6 ...
- //ta_y.val[1] = Y1 Y3 Y5 Y7 ...
- //ta_u.val[0] = U0 U2 U4 U6 ...
- //ta_v.val[0] = V0 V2 V4 V6 ...
-
- // Convert the uint8x16x4_t to float32x4x4_t
- float32x4x4_t yvec_top = arm_compute::convert_uint8x16_to_float32x4x4(ta_y_top.val[0]);
- float32x4x4_t yyvec_top = arm_compute::convert_uint8x16_to_float32x4x4(ta_y_top.val[1]);
- float32x4x4_t yvec_bottom = arm_compute::convert_uint8x16_to_float32x4x4(ta_y_bottom.val[0]);
- float32x4x4_t yyvec_bottom = arm_compute::convert_uint8x16_to_float32x4x4(ta_y_bottom.val[1]);
- float32x4x4_t uvec = arm_compute::convert_uint8x16_to_float32x4x4(ta_u);
- float32x4x4_t vvec = arm_compute::convert_uint8x16_to_float32x4x4(ta_v);
-#endif /* defined(__aarch64__) */
-
- yuyv_to_rgb_calculation(yvec_top.val[0], uvec.val[0], yyvec_top.val[0], vvec.val[0], out.ptr() + 0 * element_size, alpha);
- yuyv_to_rgb_calculation(yvec_top.val[1], uvec.val[1], yyvec_top.val[1], vvec.val[1], out.ptr() + 1 * element_size, alpha);
- yuyv_to_rgb_calculation(yvec_top.val[2], uvec.val[2], yyvec_top.val[2], vvec.val[2], out.ptr() + 2 * element_size, alpha);
- yuyv_to_rgb_calculation(yvec_top.val[3], uvec.val[3], yyvec_top.val[3], vvec.val[3], out.ptr() + 3 * element_size, alpha);
-
- yuyv_to_rgb_calculation(yvec_bottom.val[0], uvec.val[0], yyvec_bottom.val[0], vvec.val[0], out.ptr() + out_stride + 0 * element_size, alpha);
- yuyv_to_rgb_calculation(yvec_bottom.val[1], uvec.val[1], yyvec_bottom.val[1], vvec.val[1], out.ptr() + out_stride + 1 * element_size, alpha);
- yuyv_to_rgb_calculation(yvec_bottom.val[2], uvec.val[2], yyvec_bottom.val[2], vvec.val[2], out.ptr() + out_stride + 2 * element_size, alpha);
- yuyv_to_rgb_calculation(yvec_bottom.val[3], uvec.val[3], yyvec_bottom.val[3], vvec.val[3], out.ptr() + out_stride + 3 * element_size, alpha);
- },
- in_y, in_u, in_v, out);
-}
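-
-/* Editorial note (not from the original file): on AArch64 the two plain vld1q_u8
- * loads plus vuzp1q_u8/vuzp2q_u8 reproduce the even/odd de-interleave that the
- * single vld2q_u8 performs in the other branch, so both paths feed identical
- * Y0 Y2 Y4... and Y1 Y3 Y5... vectors into the conversion math. */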
-
-/** Convert YUYV to NV12.
- *
- * @param[in] input Input YUYV data buffer.
- * @param[out] output Output NV12 buffer.
- * @param[in] win Window for iterating the buffers.
- *
- */
-template <bool yuyv>
-void colorconvert_yuyv_to_nv12(const void *__restrict input, void *__restrict output, const Window &win)
-{
- ARM_COMPUTE_ERROR_ON(nullptr == input);
- ARM_COMPUTE_ERROR_ON(nullptr == output);
- win.validate();
-
- const auto input_ptr = static_cast<const IImage *__restrict>(input);
- const auto output_ptr = static_cast<IMultiImage *__restrict>(output);
-
- constexpr auto shift = yuyv ? 0 : 1;
-
- // NV12's UV's width and height are subsampled
- Window win_uv(win);
- win_uv.set(Window::DimX, Window::Dimension(win_uv.x().start() / 2, win_uv.x().end() / 2, win_uv.x().step() / 2));
- win_uv.set(Window::DimY, Window::Dimension(win_uv.y().start() / 2, win_uv.y().end() / 2, 1));
- win_uv.validate();
-
- Iterator in(input_ptr, win);
- Iterator out_y(output_ptr->plane(0), win);
- Iterator out_uv(output_ptr->plane(1), win_uv);
-
- execute_window_loop(win, [&](const Coordinates &)
- {
- const auto ta_top = vld4q_u8(in.ptr());
- const auto ta_bottom = vld4q_u8(in.ptr() + input_ptr->info()->strides_in_bytes().y());
- //ta.val[0] = Y0 Y2 Y4 Y6 ...
- //ta.val[1] = U0 U2 U4 U6 ...
- //ta.val[2] = Y1 Y3 Y5 Y7 ...
- //ta.val[3] = V0 V2 V4 V6 ...
-
- uint8x16x2_t yvec;
- yvec.val[0] = ta_top.val[0 + shift];
- yvec.val[1] = ta_top.val[2 + shift];
- vst2q_u8(out_y.ptr(), yvec);
-
- uint8x16x2_t yyvec;
- yyvec.val[0] = ta_bottom.val[0 + shift];
- yyvec.val[1] = ta_bottom.val[2 + shift];
- vst2q_u8(out_y.ptr() + output_ptr->plane(0)->info()->strides_in_bytes().y(), yyvec);
-
- uint8x16x2_t uvvec;
- uvvec.val[0] = vhaddq_u8(ta_top.val[1 - shift], ta_bottom.val[1 - shift]);
- uvvec.val[1] = vhaddq_u8(ta_top.val[3 - shift], ta_bottom.val[3 - shift]);
- vst2q_u8(out_uv.ptr(), uvvec);
- },
- in, out_y, out_uv);
-}
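-
-/* Scalar sketch of the vertical chroma averaging done with vhaddq_u8 above
- * (a truncating halving add). The helper name is editorial, not from the
- * original file. */
-inline uint8_t chroma_vertical_average_sketch(uint8_t top, uint8_t bottom)
-{
-    // vhaddq_u8 computes (a + b) >> 1 per lane without the sum overflowing 8 bits
-    return static_cast<uint8_t>((static_cast<uint16_t>(top) + bottom) >> 1);
-}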
-
-/** Convert IYUV to NV12.
- *
- * @param[in] input Input IYUV data buffer.
- * @param[out] output Output NV12 buffer.
- * @param[in] win Window for iterating the buffers.
- *
- */
-void colorconvert_iyuv_to_nv12(const void *__restrict input, void *__restrict output, const Window &win)
-{
- ARM_COMPUTE_ERROR_ON(nullptr == input);
- ARM_COMPUTE_ERROR_ON(nullptr == output);
- win.validate();
-
- const auto input_ptr = static_cast<const IMultiImage *__restrict>(input);
- const auto output_ptr = static_cast<IMultiImage *__restrict>(output);
-
- // UV's width and height are subsampled
- Window win_uv(win);
- win_uv.set(Window::DimX, Window::Dimension(win_uv.x().start() / 2, win_uv.x().end() / 2, win_uv.x().step() / 2));
- win_uv.set(Window::DimY, Window::Dimension(win_uv.y().start() / 2, win_uv.y().end() / 2, 1));
- win_uv.validate();
-
- Iterator in_y(input_ptr->plane(0), win);
- Iterator in_u(input_ptr->plane(1), win_uv);
- Iterator in_v(input_ptr->plane(2), win_uv);
- Iterator out_y(output_ptr->plane(0), win);
- Iterator out_uv(output_ptr->plane(1), win_uv);
-
- execute_window_loop(win, [&](const Coordinates &)
- {
- const auto ta_y_top = vld2q_u8(in_y.ptr());
- const auto ta_y_bottom = vld2q_u8(in_y.ptr() + input_ptr->plane(0)->info()->strides_in_bytes().y());
- uint8x16x2_t ta_uv;
- ta_uv.val[0] = vld1q_u8(in_u.ptr());
- ta_uv.val[1] = vld1q_u8(in_v.ptr());
- //ta_y.val[0] = Y0 Y2 Y4 Y6 ...
- //ta_y.val[1] = Y1 Y3 Y5 Y7 ...
- //ta_uv.val[0] = U0 U2 U4 U6 ...
- //ta_uv.val[1] = V0 V2 V4 V6 ...
-
- vst2q_u8(out_y.ptr(), ta_y_top);
- vst2q_u8(out_y.ptr() + output_ptr->plane(0)->info()->strides_in_bytes().y(), ta_y_bottom);
- vst2q_u8(out_uv.ptr(), ta_uv);
- },
- in_y, in_u, in_v, out_y, out_uv);
-}
-
-/** Convert NV12 to IYUV.
- *
- * @param[in] input Input NV12 data buffer.
- * @param[out] output Output IYUV buffer.
- * @param[in] win Window for iterating the buffers.
- *
- */
-template <bool uv>
-void colorconvert_nv12_to_iyuv(const void *__restrict input, void *__restrict output, const Window &win)
-{
- ARM_COMPUTE_ERROR_ON(nullptr == input);
- ARM_COMPUTE_ERROR_ON(nullptr == output);
- win.validate();
-
- const auto input_ptr = static_cast<const IMultiImage *__restrict>(input);
- const auto output_ptr = static_cast<IMultiImage *__restrict>(output);
-
- constexpr auto shift = uv ? 0 : 1;
-
- // UV's width and height are subsampled
- Window win_uv(win);
- win_uv.set(Window::DimX, Window::Dimension(win_uv.x().start() / 2, win_uv.x().end() / 2, win_uv.x().step() / 2));
- win_uv.set(Window::DimY, Window::Dimension(win_uv.y().start() / 2, win_uv.y().end() / 2, 1));
- win_uv.validate();
-
- Iterator in_y(input_ptr->plane(0), win);
- Iterator in_uv(input_ptr->plane(1), win_uv);
- Iterator out_y(output_ptr->plane(0), win);
- Iterator out_u(output_ptr->plane(1), win_uv);
- Iterator out_v(output_ptr->plane(2), win_uv);
-
- execute_window_loop(win, [&](const Coordinates &)
- {
- const auto ta_y_top = vld2q_u8(in_y.ptr());
- const auto ta_y_bottom = vld2q_u8(in_y.ptr() + input_ptr->plane(0)->info()->strides_in_bytes().y());
- const auto ta_uv = vld2q_u8(in_uv.ptr());
- //ta_y.val[0] = Y0 Y2 Y4 Y6 ...
- //ta_y.val[1] = Y1 Y3 Y5 Y7 ...
- //ta_uv.val[0] = U0 U2 U4 U6 ...
- //ta_uv.val[1] = V0 V2 V4 V6 ...
-
- vst2q_u8(out_y.ptr(), ta_y_top);
- vst2q_u8(out_y.ptr() + output_ptr->plane(0)->info()->strides_in_bytes().y(), ta_y_bottom);
- vst1q_u8(out_u.ptr(), ta_uv.val[0 + shift]);
- vst1q_u8(out_v.ptr(), ta_uv.val[1 - shift]);
- },
- in_y, in_uv, out_y, out_u, out_v);
-}
-
-/** Convert YUYV to IYUV.
- *
- * @param[in] input Input YUYV data buffer.
- * @param[out] output Output IYUV buffer.
- * @param[in] win Window for iterating the buffers.
- *
- */
-template <bool yuyv>
-void colorconvert_yuyv_to_iyuv(const void *__restrict input, void *__restrict output, const Window &win)
-{
- ARM_COMPUTE_ERROR_ON(nullptr == input);
- ARM_COMPUTE_ERROR_ON(nullptr == output);
- win.validate();
-
- const auto input_ptr = static_cast<const IImage *__restrict>(input);
- const auto output_ptr = static_cast<IMultiImage *__restrict>(output);
-
- constexpr auto shift = yuyv ? 0 : 1;
-
- // Destination's UV's width and height are subsampled
- Window win_uv(win);
- win_uv.set(Window::DimX, Window::Dimension(win_uv.x().start() / 2, win_uv.x().end() / 2, win_uv.x().step() / 2));
- win_uv.set(Window::DimY, Window::Dimension(win_uv.y().start() / 2, win_uv.y().end() / 2, 1));
- win_uv.validate();
-
- Iterator in(input_ptr, win);
- Iterator out_y(output_ptr->plane(0), win);
- Iterator out_u(output_ptr->plane(1), win_uv);
- Iterator out_v(output_ptr->plane(2), win_uv);
-
- execute_window_loop(win, [&](const Coordinates &)
- {
- const auto ta_top = vld4q_u8(in.ptr());
- const auto ta_bottom = vld4q_u8(in.ptr() + input_ptr->info()->strides_in_bytes().y());
- //ta.val[0] = Y0 Y2 Y4 Y6 ...
- //ta.val[1] = U0 U2 U4 U6 ...
- //ta.val[2] = Y1 Y3 Y5 Y7 ...
- //ta.val[3] = V0 V2 V4 V6 ...
-
- uint8x16x2_t yvec;
- yvec.val[0] = ta_top.val[0 + shift];
- yvec.val[1] = ta_top.val[2 + shift];
- vst2q_u8(out_y.ptr(), yvec);
-
- uint8x16x2_t yyvec;
- yyvec.val[0] = ta_bottom.val[0 + shift];
- yyvec.val[1] = ta_bottom.val[2 + shift];
- vst2q_u8(out_y.ptr() + output_ptr->plane(0)->info()->strides_in_bytes().y(), yyvec);
-
- uint8x16_t uvec;
- uvec = vhaddq_u8(ta_top.val[1 - shift], ta_bottom.val[1 - shift]);
- vst1q_u8(out_u.ptr(), uvec);
-
- uint8x16_t vvec;
- vvec = vhaddq_u8(ta_top.val[3 - shift], ta_bottom.val[3 - shift]);
- vst1q_u8(out_v.ptr(), vvec);
- },
- in, out_y, out_u, out_v);
-}
-
-/** Convert NV12 to YUV4.
- *
- * @param[in] input Input NV12 data buffer.
- * @param[out] output Output YUV4 buffer.
- * @param[in] win Window for iterating the buffers.
- *
- */
-template <bool uv>
-void colorconvert_nv12_to_yuv4(const void *__restrict input, void *__restrict output, const Window &win)
-{
- ARM_COMPUTE_ERROR_ON(nullptr == input);
- ARM_COMPUTE_ERROR_ON(nullptr == output);
- win.validate();
-
- const auto input_ptr = static_cast<const IMultiImage *__restrict>(input);
- const auto output_ptr = static_cast<IMultiImage *__restrict>(output);
-
- constexpr auto shift = uv ? 0 : 1;
-
- // UV's width and height are subsampled
- Window win_uv(win);
- win_uv.set(Window::DimX, Window::Dimension(win_uv.x().start() / 2, win_uv.x().end() / 2, win_uv.x().step() / 2));
- win_uv.set(Window::DimY, Window::Dimension(win_uv.y().start() / 2, win_uv.y().end() / 2, 1));
- win_uv.validate();
-
- Iterator in_y(input_ptr->plane(0), win);
- Iterator in_uv(input_ptr->plane(1), win_uv);
- Iterator out_y(output_ptr->plane(0), win);
- Iterator out_u(output_ptr->plane(1), win);
- Iterator out_v(output_ptr->plane(2), win);
-
- execute_window_loop(win, [&](const Coordinates &)
- {
- const auto ta_y_top = vld2q_u8(in_y.ptr());
- const auto ta_y_bottom = vld2q_u8(in_y.ptr() + input_ptr->plane(0)->info()->strides_in_bytes().y());
- const auto ta_uv = vld2q_u8(in_uv.ptr());
- //ta_y.val[0] = Y0 Y2 Y4 Y6 ...
- //ta_y.val[1] = Y1 Y3 Y5 Y7 ...
- //ta_uv.val[0] = U0 U2 U4 U6 ...
- //ta_uv.val[1] = V0 V2 V4 V6 ...
-
- vst2q_u8(out_y.ptr(), ta_y_top);
- vst2q_u8(out_y.ptr() + output_ptr->plane(0)->info()->strides_in_bytes().y(), ta_y_bottom);
-
- uint8x16x2_t uvec;
- uvec.val[0] = ta_uv.val[0 + shift];
- uvec.val[1] = ta_uv.val[0 + shift];
- vst2q_u8(out_u.ptr(), uvec);
- vst2q_u8(out_u.ptr() + output_ptr->plane(1)->info()->strides_in_bytes().y(), uvec);
-
- uint8x16x2_t vvec;
- vvec.val[0] = ta_uv.val[1 - shift];
- vvec.val[1] = ta_uv.val[1 - shift];
- vst2q_u8(out_v.ptr(), vvec);
- vst2q_u8(out_v.ptr() + output_ptr->plane(2)->info()->strides_in_bytes().y(), vvec);
- },
- in_y, in_uv, out_y, out_u, out_v);
-}
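-
-/* Editorial sketch of the nearest-neighbour chroma upsampling above: storing
- * the same vector in both lanes of vst2q_u8 duplicates each U/V sample
- * horizontally, and the second strided store duplicates it vertically. The
- * helper is illustrative only, not part of the original file. */
-inline void chroma_upsample2x2_sketch(uint8_t c, uint8_t *row0, uint8_t *row1)
-{
-    row0[0] = row0[1] = c; // horizontal duplication (two equal lanes of vst2q_u8)
-    row1[0] = row1[1] = c; // vertical duplication (store to the next output row)
-}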
-
-/** Convert IYUV to YUV4.
- *
- * @param[in] input Input IYUV data buffer.
- * @param[out] output Output YUV4 buffer.
- * @param[in] win Window for iterating the buffers.
- *
- */
-void colorconvert_iyuv_to_yuv4(const void *__restrict input, void *__restrict output, const Window &win)
-{
- ARM_COMPUTE_ERROR_ON(nullptr == input);
- ARM_COMPUTE_ERROR_ON(nullptr == output);
- win.validate();
-
- const auto input_ptr = static_cast<const IMultiImage *__restrict>(input);
- const auto output_ptr = static_cast<IMultiImage *__restrict>(output);
-
- // UV's width and height are subsampled
- Window win_uv(win);
- win_uv.set(Window::DimX, Window::Dimension(win_uv.x().start() / 2, win_uv.x().end() / 2, win_uv.x().step() / 2));
- win_uv.set(Window::DimY, Window::Dimension(win_uv.y().start() / 2, win_uv.y().end() / 2, 1));
- win_uv.validate();
-
- Iterator in_y(input_ptr->plane(0), win);
- Iterator in_u(input_ptr->plane(1), win_uv);
- Iterator in_v(input_ptr->plane(2), win_uv);
- Iterator out_y(output_ptr->plane(0), win);
- Iterator out_u(output_ptr->plane(1), win);
- Iterator out_v(output_ptr->plane(2), win);
-
- execute_window_loop(win, [&](const Coordinates &)
- {
- const auto ta_y_top = vld2q_u8(in_y.ptr());
- const auto ta_y_bottom = vld2q_u8(in_y.ptr() + input_ptr->plane(0)->info()->strides_in_bytes().y());
- const auto ta_u = vld1q_u8(in_u.ptr());
- const auto ta_v = vld1q_u8(in_v.ptr());
- //ta_y.val[0] = Y0 Y2 Y4 Y6 ...
- //ta_y.val[1] = Y1 Y3 Y5 Y7 ...
- //ta_u = U0 U2 U4 U6 ...
- //ta_v = V0 V2 V4 V6 ...
-
- vst2q_u8(out_y.ptr(), ta_y_top);
- vst2q_u8(out_y.ptr() + output_ptr->plane(0)->info()->strides_in_bytes().y(), ta_y_bottom);
-
- uint8x16x2_t uvec;
- uvec.val[0] = ta_u;
- uvec.val[1] = ta_u;
- vst2q_u8(out_u.ptr(), uvec);
- vst2q_u8(out_u.ptr() + output_ptr->plane(1)->info()->strides_in_bytes().y(), uvec);
-
- uint8x16x2_t vvec;
- vvec.val[0] = ta_v;
- vvec.val[1] = ta_v;
- vst2q_u8(out_v.ptr(), vvec);
- vst2q_u8(out_v.ptr() + output_ptr->plane(2)->info()->strides_in_bytes().y(), vvec);
- },
- in_y, in_u, in_v, out_y, out_u, out_v);
-}
-
-/** Convert RGB to NV12.
- *
- * @param[in] input Input RGB data buffer.
- * @param[out] output Output NV12 buffer.
- * @param[in] win Window for iterating the buffers.
- *
- */
-template <bool alpha>
-void colorconvert_rgb_to_nv12(const void *__restrict input, void *__restrict output, const Window &win)
-{
- ARM_COMPUTE_ERROR_ON(nullptr == input);
- ARM_COMPUTE_ERROR_ON(nullptr == output);
- win.validate();
-
- const auto input_ptr = static_cast<const IImage *__restrict>(input);
- const auto output_ptr = static_cast<IMultiImage *__restrict>(output);
-
- // UV's width and height are subsampled
- Window win_uv(win);
- win_uv.set(Window::DimX, Window::Dimension(win_uv.x().start() / 2, win_uv.x().end() / 2, win_uv.x().step() / 2));
- win_uv.set(Window::DimY, Window::Dimension(win_uv.y().start() / 2, win_uv.y().end() / 2, 1));
- win_uv.validate();
-
- Iterator in(input_ptr, win);
- Iterator out_y(output_ptr->plane(0), win);
- Iterator out_uv(output_ptr->plane(1), win_uv);
-
- execute_window_loop(win, [&](const Coordinates &)
- {
- const auto ta_rgb_top = load_rgb(in.ptr(), alpha);
- const auto ta_rgb_bottom = load_rgb(in.ptr() + input_ptr->info()->strides_in_bytes().y(), alpha);
- //ta_rgb.val[0] = R0 R1 R2 R3 ...
- //ta_rgb.val[1] = G0 G1 G2 G3 ...
- //ta_rgb.val[2] = B0 B1 B2 B3 ...
-
- store_rgb_to_nv12(ta_rgb_top.val[0], ta_rgb_top.val[1], ta_rgb_top.val[2],
- ta_rgb_bottom.val[0], ta_rgb_bottom.val[1], ta_rgb_bottom.val[2],
- out_y.ptr(), out_y.ptr() + output_ptr->plane(0)->info()->strides_in_bytes().y(),
- out_uv.ptr());
- },
- in, out_y, out_uv);
-}
-
-/** Convert RGB to IYUV.
- *
- * @param[in] input Input RGB data buffer.
- * @param[out] output Output IYUV buffer.
- * @param[in] win Window for iterating the buffers.
- *
- */
-template <bool alpha>
-void colorconvert_rgb_to_iyuv(const void *__restrict input, void *__restrict output, const Window &win)
-{
- ARM_COMPUTE_ERROR_ON(nullptr == input);
- ARM_COMPUTE_ERROR_ON(nullptr == output);
- win.validate();
-
- const auto input_ptr = static_cast<const IImage *__restrict>(input);
- const auto output_ptr = static_cast<IMultiImage *__restrict>(output);
-
- // UV's width and height are subsampled
- Window win_uv(win);
- win_uv.set(Window::DimX, Window::Dimension(win_uv.x().start() / 2, win_uv.x().end() / 2, win_uv.x().step() / 2));
- win_uv.set(Window::DimY, Window::Dimension(win_uv.y().start() / 2, win_uv.y().end() / 2, 1));
- win_uv.validate();
-
- Iterator in(input_ptr, win);
- Iterator out_y(output_ptr->plane(0), win);
- Iterator out_u(output_ptr->plane(1), win_uv);
- Iterator out_v(output_ptr->plane(2), win_uv);
-
- execute_window_loop(win, [&](const Coordinates &)
- {
- const auto ta_rgb_top = load_rgb(in.ptr(), alpha);
- const auto ta_rgb_bottom = load_rgb(in.ptr() + input_ptr->info()->strides_in_bytes().y(), alpha);
- //ta_rgb.val[0] = R0 R1 R2 R3 ...
- //ta_rgb.val[1] = G0 G1 G2 G3 ...
- //ta_rgb.val[2] = B0 B1 B2 B3 ...
-
- store_rgb_to_iyuv(ta_rgb_top.val[0], ta_rgb_top.val[1], ta_rgb_top.val[2],
- ta_rgb_bottom.val[0], ta_rgb_bottom.val[1], ta_rgb_bottom.val[2],
- out_y.ptr(), out_y.ptr() + output_ptr->plane(0)->info()->strides_in_bytes().y(),
- out_u.ptr(), out_v.ptr());
- },
- in, out_y, out_u, out_v);
-}
-
-/** Convert RGB to YUV4.
- *
- * @param[in] input Input RGB data buffer.
- * @param[out] output Output YUV4 buffer.
- * @param[in] win Window for iterating the buffers.
- *
- */
-template <bool alpha>
-void colorconvert_rgb_to_yuv4(const void *__restrict input, void *__restrict output, const Window &win)
-{
- ARM_COMPUTE_ERROR_ON(nullptr == input);
- ARM_COMPUTE_ERROR_ON(nullptr == output);
- win.validate();
-
- const auto input_ptr = static_cast<const IImage *__restrict>(input);
- const auto output_ptr = static_cast<IMultiImage *__restrict>(output);
-
- Iterator in(input_ptr, win);
- Iterator out_y(output_ptr->plane(0), win);
- Iterator out_u(output_ptr->plane(1), win);
- Iterator out_v(output_ptr->plane(2), win);
-
- execute_window_loop(win, [&](const Coordinates &)
- {
- const auto ta_rgb = load_rgb(in.ptr(), alpha);
- //ta_rgb.val[0] = R0 R1 R2 R3 ...
- //ta_rgb.val[1] = G0 G1 G2 G3 ...
- //ta_rgb.val[2] = B0 B1 B2 B3 ...
-
- store_rgb_to_yuv4(ta_rgb.val[0], ta_rgb.val[1], ta_rgb.val[2],
- out_y.ptr(), out_u.ptr(), out_v.ptr());
- },
- in, out_y, out_u, out_v);
-}
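-
-/* Editorial scalar sketch of the forward RGB -> YUV step behind store_rgb_to_yuv4
- * and the related store helpers. The BT.709-style constants are an assumption
- * for illustration and are not taken from this file. */
-inline void rgb_to_yuv_scalar_sketch(float r, float g, float b, float &y, float &u, float &v)
-{
-    y = 0.2126f * r + 0.7152f * g + 0.0722f * b; // luma as a weighted sum
-    u = (b - y) / 1.8556f + 128.f;               // blue-difference chroma, re-centred
-    v = (r - y) / 1.5748f + 128.f;               // red-difference chroma, re-centred
-}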
-} // namespace arm_compute
diff --git a/arm_compute/core/NEON/NEKernels.h b/arm_compute/core/NEON/NEKernels.h
deleted file mode 100644
index 38701f434a..0000000000
--- a/arm_compute/core/NEON/NEKernels.h
+++ /dev/null
@@ -1,155 +0,0 @@
-/*
- * Copyright (c) 2016-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_NEKERNELS_H
-#define ARM_COMPUTE_NEKERNELS_H
-
-/* Header grouping all the NEON kernels */
-#include "arm_compute/core/NEON/kernels/NEAbsoluteDifferenceKernel.h"
-#include "arm_compute/core/NEON/kernels/NEAccumulateKernel.h"
-#include "arm_compute/core/NEON/kernels/NEActivationLayerKernel.h"
-#include "arm_compute/core/NEON/kernels/NEArithmeticAdditionKernel.h"
-#include "arm_compute/core/NEON/kernels/NEArithmeticSubtractionKernel.h"
-#include "arm_compute/core/NEON/kernels/NEBatchConcatenateLayerKernel.h"
-#include "arm_compute/core/NEON/kernels/NEBatchNormalizationLayerKernel.h"
-#include "arm_compute/core/NEON/kernels/NEBatchToSpaceLayerKernel.h"
-#include "arm_compute/core/NEON/kernels/NEBitwiseAndKernel.h"
-#include "arm_compute/core/NEON/kernels/NEBitwiseNotKernel.h"
-#include "arm_compute/core/NEON/kernels/NEBitwiseOrKernel.h"
-#include "arm_compute/core/NEON/kernels/NEBitwiseXorKernel.h"
-#include "arm_compute/core/NEON/kernels/NEBoundingBoxTransformKernel.h"
-#include "arm_compute/core/NEON/kernels/NEBox3x3Kernel.h"
-#include "arm_compute/core/NEON/kernels/NECannyEdgeKernel.h"
-#include "arm_compute/core/NEON/kernels/NEChannelCombineKernel.h"
-#include "arm_compute/core/NEON/kernels/NEChannelExtractKernel.h"
-#include "arm_compute/core/NEON/kernels/NEChannelShuffleLayerKernel.h"
-#include "arm_compute/core/NEON/kernels/NECol2ImKernel.h"
-#include "arm_compute/core/NEON/kernels/NEColorConvertKernel.h"
-#include "arm_compute/core/NEON/kernels/NEConvertFullyConnectedWeightsKernel.h"
-#include "arm_compute/core/NEON/kernels/NEConvertQuantizedSignednessKernel.h"
-#include "arm_compute/core/NEON/kernels/NEConvolutionKernel.h"
-#include "arm_compute/core/NEON/kernels/NECopyKernel.h"
-#include "arm_compute/core/NEON/kernels/NECropKernel.h"
-#include "arm_compute/core/NEON/kernels/NECumulativeDistributionKernel.h"
-#include "arm_compute/core/NEON/kernels/NEDepthConcatenateLayerKernel.h"
-#include "arm_compute/core/NEON/kernels/NEDepthConvertLayerKernel.h"
-#include "arm_compute/core/NEON/kernels/NEDepthToSpaceLayerKernel.h"
-#include "arm_compute/core/NEON/kernels/NEDepthwiseConvolutionLayer3x3Kernel.h"
-#include "arm_compute/core/NEON/kernels/NEDepthwiseConvolutionLayerNativeKernel.h"
-#include "arm_compute/core/NEON/kernels/NEDequantizationLayerKernel.h"
-#include "arm_compute/core/NEON/kernels/NEDerivativeKernel.h"
-#include "arm_compute/core/NEON/kernels/NEDilateKernel.h"
-#include "arm_compute/core/NEON/kernels/NEDirectConvolutionLayerKernel.h"
-#include "arm_compute/core/NEON/kernels/NEDirectConvolutionLayerOutputStageKernel.h"
-#include "arm_compute/core/NEON/kernels/NEElementwiseOperationKernel.h"
-#include "arm_compute/core/NEON/kernels/NEElementwiseUnaryKernel.h"
-#include "arm_compute/core/NEON/kernels/NEErodeKernel.h"
-#include "arm_compute/core/NEON/kernels/NEFFTDigitReverseKernel.h"
-#include "arm_compute/core/NEON/kernels/NEFFTRadixStageKernel.h"
-#include "arm_compute/core/NEON/kernels/NEFFTScaleKernel.h"
-#include "arm_compute/core/NEON/kernels/NEFastCornersKernel.h"
-#include "arm_compute/core/NEON/kernels/NEFillArrayKernel.h"
-#include "arm_compute/core/NEON/kernels/NEFillBorderKernel.h"
-#include "arm_compute/core/NEON/kernels/NEFillInnerBorderKernel.h"
-#include "arm_compute/core/NEON/kernels/NEFlattenLayerKernel.h"
-#include "arm_compute/core/NEON/kernels/NEFloorKernel.h"
-#include "arm_compute/core/NEON/kernels/NEFuseBatchNormalizationKernel.h"
-#include "arm_compute/core/NEON/kernels/NEGEMMAssemblyBaseKernel.h"
-#include "arm_compute/core/NEON/kernels/NEGEMMInterleave4x4Kernel.h"
-#include "arm_compute/core/NEON/kernels/NEGEMMLowpMatrixMultiplyKernel.h"
-#include "arm_compute/core/NEON/kernels/NEGEMMLowpOffsetContributionKernel.h"
-#include "arm_compute/core/NEON/kernels/NEGEMMLowpOffsetContributionOutputStageKernel.h"
-#include "arm_compute/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ScaleKernel.h"
-#include "arm_compute/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel.h"
-#include "arm_compute/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel.h"
-#include "arm_compute/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel.h"
-#include "arm_compute/core/NEON/kernels/NEGEMMLowpReductionKernel.h"
-#include "arm_compute/core/NEON/kernels/NEGEMMMatrixAccumulateBiasesKernel.h"
-#include "arm_compute/core/NEON/kernels/NEGEMMMatrixAdditionKernel.h"
-#include "arm_compute/core/NEON/kernels/NEGEMMMatrixMultiplyKernel.h"
-#include "arm_compute/core/NEON/kernels/NEGEMMMatrixVectorMultiplyKernel.h"
-#include "arm_compute/core/NEON/kernels/NEGEMMTranspose1xWKernel.h"
-#include "arm_compute/core/NEON/kernels/NEGatherKernel.h"
-#include "arm_compute/core/NEON/kernels/NEGaussian3x3Kernel.h"
-#include "arm_compute/core/NEON/kernels/NEGaussian5x5Kernel.h"
-#include "arm_compute/core/NEON/kernels/NEGaussianPyramidKernel.h"
-#include "arm_compute/core/NEON/kernels/NEGenerateProposalsLayerKernel.h"
-#include "arm_compute/core/NEON/kernels/NEHOGDescriptorKernel.h"
-#include "arm_compute/core/NEON/kernels/NEHOGDetectorKernel.h"
-#include "arm_compute/core/NEON/kernels/NEHarrisCornersKernel.h"
-#include "arm_compute/core/NEON/kernels/NEHeightConcatenateLayerKernel.h"
-#include "arm_compute/core/NEON/kernels/NEHistogramKernel.h"
-#include "arm_compute/core/NEON/kernels/NEIm2ColKernel.h"
-#include "arm_compute/core/NEON/kernels/NEInstanceNormalizationLayerKernel.h"
-#include "arm_compute/core/NEON/kernels/NEIntegralImageKernel.h"
-#include "arm_compute/core/NEON/kernels/NEL2NormalizeLayerKernel.h"
-#include "arm_compute/core/NEON/kernels/NELKTrackerKernel.h"
-#include "arm_compute/core/NEON/kernels/NELocallyConnectedMatrixMultiplyKernel.h"
-#include "arm_compute/core/NEON/kernels/NEMagnitudePhaseKernel.h"
-#include "arm_compute/core/NEON/kernels/NEMeanStdDevKernel.h"
-#include "arm_compute/core/NEON/kernels/NEMeanStdDevNormalizationKernel.h"
-#include "arm_compute/core/NEON/kernels/NEMedian3x3Kernel.h"
-#include "arm_compute/core/NEON/kernels/NEMemsetKernel.h"
-#include "arm_compute/core/NEON/kernels/NEMinMaxLayerKernel.h"
-#include "arm_compute/core/NEON/kernels/NEMinMaxLocationKernel.h"
-#include "arm_compute/core/NEON/kernels/NENonLinearFilterKernel.h"
-#include "arm_compute/core/NEON/kernels/NENonMaximaSuppression3x3Kernel.h"
-#include "arm_compute/core/NEON/kernels/NENormalizationLayerKernel.h"
-#include "arm_compute/core/NEON/kernels/NEPadLayerKernel.h"
-#include "arm_compute/core/NEON/kernels/NEPermuteKernel.h"
-#include "arm_compute/core/NEON/kernels/NEPixelWiseMultiplicationKernel.h"
-#include "arm_compute/core/NEON/kernels/NEPoolingLayerKernel.h"
-#include "arm_compute/core/NEON/kernels/NEPriorBoxLayerKernel.h"
-#include "arm_compute/core/NEON/kernels/NEQLSTMLayerNormalizationKernel.h"
-#include "arm_compute/core/NEON/kernels/NEQuantizationLayerKernel.h"
-#include "arm_compute/core/NEON/kernels/NEROIAlignLayerKernel.h"
-#include "arm_compute/core/NEON/kernels/NEROIPoolingLayerKernel.h"
-#include "arm_compute/core/NEON/kernels/NERangeKernel.h"
-#include "arm_compute/core/NEON/kernels/NEReductionOperationKernel.h"
-#include "arm_compute/core/NEON/kernels/NERemapKernel.h"
-#include "arm_compute/core/NEON/kernels/NEReorgLayerKernel.h"
-#include "arm_compute/core/NEON/kernels/NEReshapeLayerKernel.h"
-#include "arm_compute/core/NEON/kernels/NEReverseKernel.h"
-#include "arm_compute/core/NEON/kernels/NEScaleKernel.h"
-#include "arm_compute/core/NEON/kernels/NEScharr3x3Kernel.h"
-#include "arm_compute/core/NEON/kernels/NESelectKernel.h"
-#include "arm_compute/core/NEON/kernels/NESobel3x3Kernel.h"
-#include "arm_compute/core/NEON/kernels/NESobel5x5Kernel.h"
-#include "arm_compute/core/NEON/kernels/NESobel7x7Kernel.h"
-#include "arm_compute/core/NEON/kernels/NESoftmaxLayerKernel.h"
-#include "arm_compute/core/NEON/kernels/NESpaceToBatchLayerKernel.h"
-#include "arm_compute/core/NEON/kernels/NESpaceToDepthLayerKernel.h"
-#include "arm_compute/core/NEON/kernels/NEStackLayerKernel.h"
-#include "arm_compute/core/NEON/kernels/NEStridedSliceKernel.h"
-#include "arm_compute/core/NEON/kernels/NETableLookupKernel.h"
-#include "arm_compute/core/NEON/kernels/NEThresholdKernel.h"
-#include "arm_compute/core/NEON/kernels/NETileKernel.h"
-#include "arm_compute/core/NEON/kernels/NETransposeKernel.h"
-#include "arm_compute/core/NEON/kernels/NEUpsampleLayerKernel.h"
-#include "arm_compute/core/NEON/kernels/NEWarpKernel.h"
-#include "arm_compute/core/NEON/kernels/NEWeightsReshapeKernel.h"
-#include "arm_compute/core/NEON/kernels/NEWidthConcatenateLayerKernel.h"
-#include "arm_compute/core/NEON/kernels/NEWinogradConvolutionLayerKernel.h"
-#include "arm_compute/core/NEON/kernels/NEYOLOLayerKernel.h"
-
-#endif /* ARM_COMPUTE_NEKERNELS_H */
diff --git a/arm_compute/core/NEON/NEMath.h b/arm_compute/core/NEON/NEMath.h
deleted file mode 100644
index 8827bbf459..0000000000
--- a/arm_compute/core/NEON/NEMath.h
+++ /dev/null
@@ -1,307 +0,0 @@
-/*
- * Copyright (c) 2016-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_NEMATH_H
-#define ARM_COMPUTE_NEMATH_H
-
-#include <arm_neon.h>
-#include <array>
-
-namespace arm_compute
-{
-/** Calculate floor of a vector.
- *
- * @param[in] val Input vector value in F32 format.
- *
- * @return The calculated floor vector.
- */
-float32x4_t vfloorq_f32(float32x4_t val);
-
-/** Round each element of a vector to the nearest integer, with ties to even.
- *
- * @param[in] val Input vector value in F32 format.
- *
- * @return The calculated round vector.
- */
-float32x4_t vroundq_rte_f32(float32x4_t val);
-
-/** Calculate inverse square root.
- *
- * @param[in] x Input value.
- *
- * @return The calculated inverse square root.
- */
-float32x2_t vinvsqrt_f32(float32x2_t x);
-
-/** Calculate inverse square root.
- *
- * @param[in] x Input value.
- *
- * @return The calculated inverse square root.
- */
-float32x4_t vinvsqrtq_f32(float32x4_t x);
-
-/** Calculate reciprocal.
- *
- * @param[in] x Input value.
- *
- * @return The calculated reciprocal.
- */
-float32x2_t vinv_f32(float32x2_t x);
-
-/** Calculate reciprocal.
- *
- * @param[in] x Input value.
- *
- * @return The calculated reciprocal.
- */
-float32x4_t vinvq_f32(float32x4_t x);
-
-/** Perform a 7th degree polynomial approximation using Estrin's method.
- *
- * @param[in] x Input vector value in F32 format.
- * @param[in] coeffs Polynomial coefficients table.
- *
- * @return The calculated approximation.
- */
-float32x4_t vtaylor_polyq_f32(float32x4_t x, const std::array<float32x4_t, 8> &coeffs);
-
-/** Calculate exponential.
- *
- * @param[in] x Input vector value in F32 format.
- *
- * @return The calculated exponential.
- */
-float32x4_t vexpq_f32(float32x4_t x);
-
-/** Calculate logarithm.
- *
- * @param[in] x Input vector value in F32 format.
- *
- * @return The calculated logarithm.
- */
-float32x4_t vlogq_f32(float32x4_t x);
-
-/** Calculate hyperbolic tangent.
- *
- * tanh(x) = (e^(2x) - 1)/(e^(2x) + 1)
- *
- * @note We clamp x to [-10,10] to avoid overflow issues, matching the bounds used by the implementation.
- *
- * @param[in] val Input vector value in F32 format.
- *
- * @return The calculated Hyperbolic Tangent.
- */
-float32x4_t vtanhq_f32(float32x4_t val);
-
-/** Calculate the n-th power of a number.
- *
- * pow(x,n) = e^(n*log(x))
- *
- * @param[in] val Input vector value in F32 format.
- * @param[in] n Power to raise the input to.
- *
- * @return The calculated power.
- */
-float32x4_t vpowq_f32(float32x4_t val, float32x4_t n);
-
-/** Round to the nearest division by a power-of-two using exponent
- *
- * @note This function calculates x / 2^n rounded to the nearest integer (ties rounded away from zero), where n = exponent
- *
- * @param[in] x Vector of 4 elements
- * @param[in] exponent Vector of 4 elements with integer value used to round to nearest division by a power-of-two
- *
- * @return the nearest division by a power-of-two using exponent
- */
-int32x4_t rounding_divide_by_pow2(int32x4_t x, int32x4_t exponent);
-
-/** Round to the nearest division by a power-of-two using exponent
- *
- * @note This function calculates x / 2^n rounded to the nearest integer (ties rounded away from zero), where n = exponent
- *
- * @param[in] x Vector of 4 elements
- * @param[in] exponent Integer value used to round to nearest division by a power-of-two
- *
- * @return the nearest division by a power-of-two using exponent
- */
-int32x4_t rounding_divide_by_pow2(int32x4_t x, int exponent);
-
-/** Round to the nearest division by a power-of-two using exponent
- *
- * @note This function calculates x / 2^n rounded to the nearest integer (ties rounded away from zero), where n = exponent
- *
- * @param[in] x Element to divide.
- * @param[in] exponent Integer value used to round to nearest division by a power-of-two
- *
- * @return the nearest division by a power-of-two using exponent
- */
-int32_t rounding_divide_by_pow2(int32_t x, int exponent);
-
-/** Converts from uint8x16 to float32x4x4_t
- *
- * @param[in] in Vector of uint8 to be converted
- *
- * @return Converted vector of float
- */
-float32x4x4_t convert_uint8x16_to_float32x4x4(const uint8x16_t &in);
-
-/** Converts from int8x16 to float32x4x4_t
- *
- * @param[in] in Vector of int8 to be converted
- *
- * @return Converted vector of float
- */
-float32x4x4_t convert_int8x16_to_float32x4x4(const int8x16_t &in);
-
-/** Converts the specified templated vector of 16 elements to float32x4x4_t
- *
- * @param[in] in Input vector to be converted
- *
- * @return Converted vector of float
- */
-template <typename T>
-float32x4x4_t convert_to_float32x4x4(const T &in);
-
-/** Converts from two float32x4x3_t to just one uint8x8x3_t
- *
- * @param[in] in1 First input vector of float to be converted
- * @param[in] in2 Second input vector of float to be converted
- * @param[out] out Converted output vector uint8 to store the result
- */
-void convert_float32x4x3_to_uint8x8x3(const float32x4x3_t &in1, const float32x4x3_t &in2, uint8x8x3_t &out);
-
-/** Converts from a float32x4x4_t to just one uint8x16_t
- *
- * @param[in] in Vector of float to be converted
- * @param[out] out Converted vector of uint8 to store the result
- */
-void convert_float32x4x4_to_uint8x16(const float32x4x4_t &in, uint8x16_t &out);
-
-/** Converts from float32x4x4_t to just one int8x16_t
- *
- * @param[in] in Vector of float to be converted
- * @param[out] out Converted vector of uint8 to store the result
- */
-void convert_float32x4x4_to_int8x16(const float32x4x4_t &in, int8x16_t &out);
-
-/** Calculate sine.
- *
- * @param[in] val Input vector value in radians, F32 format.
- *
- * @return The calculated sine.
- */
-float32x4_t vsinq_f32(float32x4_t val);
-
-/** Calculate sine.
- *
- * @param[in] val Input vector value in radians, F32 format.
- *
- * @return The calculated sine.
- */
-float32x2_t vsin_f32(float32x2_t val);
-
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-/** Calculate hyperbolic tangent.
- *
- * tanh(x) = (e^(2x) - 1)/(e^(2x) + 1)
- *
- * @note We clamp x to [-10,10] to avoid overflow issues, matching the bounds used by the implementation.
- *
- * @param[in] val Input vector value in F16 format.
- *
- * @return The calculated Hyperbolic Tangent.
- */
-float16x8_t vtanhq_f16(float16x8_t val);
-
-/** Round each element of a vector to the nearest integer, with ties to even.
- *
- * @param[in] val Input vector value in F16 format.
- *
- * @return The calculated round vector.
- */
-float16x8_t vroundq_rte_f16(float16x8_t val);
-
-/** Calculate reciprocal.
- *
- * @param[in] x Input value.
- *
- * @return The calculated reciprocal.
- */
-float16x4_t vinv_f16(float16x4_t x);
-
-/** Calculate reciprocal.
- *
- * @param[in] x Input value.
- *
- * @return The calculated reciprocal.
- */
-float16x8_t vinvq_f16(float16x8_t x);
-
-/** Calculate inverse square root.
- *
- * @param[in] x Input value.
- *
- * @return The calculated inverse square root.
- */
-float16x4_t vinvsqrt_f16(float16x4_t x);
-
-/** Calculate inverse square root.
- *
- * @param[in] x Input value.
- *
- * @return The calculated inverse square root.
- */
-float16x8_t vinvsqrtq_f16(float16x8_t x);
-
-/** Calculate exponential.
- *
- * @param[in] x Input vector value in F16 format.
- *
- * @return The calculated exponential.
- */
-float16x8_t vexpq_f16(float16x8_t x);
-
-/** Calculate the n-th power of a number.
- *
- * pow(x,n) = e^(n*log(x))
- *
- * @param[in] val Input vector value in F16 format.
- * @param[in] n Power to raise the input to.
- *
- * @return The calculated power.
- */
-float16x8_t vpowq_f16(float16x8_t val, float16x8_t n);
-
-/** Calculate sine.
- *
- * @param[in] val Input vector value in radians, F16 format.
- *
- * @return The calculated sine.
- */
-float16x8_t vsinq_f16(float16x8_t val);
-
-#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
-} // namespace arm_compute
-#include "arm_compute/core/NEON/NEMath.inl"
-#endif /* ARM_COMPUTE_NEMATH_H */
diff --git a/arm_compute/core/NEON/NEMath.inl b/arm_compute/core/NEON/NEMath.inl
deleted file mode 100644
index 032bfde238..0000000000
--- a/arm_compute/core/NEON/NEMath.inl
+++ /dev/null
@@ -1,529 +0,0 @@
-/*
- * Copyright (c) 2016-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include <cmath>
-#include <limits>
-
-#ifndef M_PI
-#define M_PI (3.14159265358979323846)
-#endif // M_PI
-
-namespace arm_compute
-{
-/** Exponent polynomial coefficients */
-const std::array<float32x4_t, 8> exp_tab =
-{
- {
- vdupq_n_f32(1.f),
- vdupq_n_f32(0.0416598916054f),
- vdupq_n_f32(0.500000596046f),
- vdupq_n_f32(0.0014122662833f),
- vdupq_n_f32(1.00000011921f),
- vdupq_n_f32(0.00833693705499f),
- vdupq_n_f32(0.166665703058f),
- vdupq_n_f32(0.000195780929062f),
- }
-};
-
-/** Logarithm polynomial coefficients */
-const std::array<float32x4_t, 8> log_tab =
-{
- {
- vdupq_n_f32(-2.29561495781f),
- vdupq_n_f32(-2.47071170807f),
- vdupq_n_f32(-5.68692588806f),
- vdupq_n_f32(-0.165253549814f),
- vdupq_n_f32(5.17591238022f),
- vdupq_n_f32(0.844007015228f),
- vdupq_n_f32(4.58445882797f),
- vdupq_n_f32(0.0141278216615f),
- }
-};
-
-/** Sin polynomial coefficients */
-constexpr float te_sin_coeff2 = 0.166666666666f; // 1/(2*3)
-constexpr float te_sin_coeff3 = 0.05f; // 1/(4*5)
-constexpr float te_sin_coeff4 = 0.023809523810f; // 1/(6*7)
-constexpr float te_sin_coeff5 = 0.013888888889f; // 1/(8*9)
-
-#ifndef DOXYGEN_SKIP_THIS
-inline float32x4_t vfloorq_f32(float32x4_t val)
-{
- static const float32x4_t CONST_1 = vdupq_n_f32(1.f);
-
- const int32x4_t z = vcvtq_s32_f32(val);
- const float32x4_t r = vcvtq_f32_s32(z);
-
- return vbslq_f32(vcgtq_f32(r, val), vsubq_f32(r, CONST_1), r);
-}
-
-inline float32x4_t vroundq_rte_f32(float32x4_t val)
-{
-#ifdef __aarch64__
- return vrndnq_f32(val);
-#else // __aarch64__
- static const float32x4_t CONST_HALF_FLOAT = vdupq_n_f32(0.5f);
- static const float32x4_t CONST_1_FLOAT = vdupq_n_f32(1.f);
- static const int32x4_t CONST_1_INT = vdupq_n_s32(1);
- const float32x4_t floor_val = vfloorq_f32(val);
- const float32x4_t diff = vsubq_f32(val, floor_val);
-
- /*
- * Select the floor value when (diff < 0.5) || ((diff == 0.5) && (floor_val % 2 == 0)).
- * This condition is checked by vorrq_u32(vcltq_f32(diff, CONST_HALF_FLOAT) ,vandq_u32(vceqq_f32(diff, CONST_HALF_FLOAT) , vmvnq_u32(vtstq_s32(vandq_s32(vcvtq_s32_f32(floor_val), CONST_1_INT),CONST_1_INT))))
- */
-
- return vbslq_f32(vorrq_u32(vcltq_f32(diff, CONST_HALF_FLOAT), vandq_u32(vceqq_f32(diff, CONST_HALF_FLOAT), vmvnq_u32(vtstq_s32(vandq_s32(vcvtq_s32_f32(floor_val), CONST_1_INT), CONST_1_INT)))),
- floor_val, vaddq_f32(floor_val, CONST_1_FLOAT));
-#endif // __aarch64__
-}
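-
-/* Worked example (editorial): round-to-nearest-even maps 0.5 -> 0, 1.5 -> 2,
- * 2.5 -> 2 and -1.5 -> -2, matching vrndnq_f32 on AArch64 and the vbslq_f32
- * fallback above on other targets. */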
-
-inline float32x2_t vinvsqrt_f32(float32x2_t x)
-{
- float32x2_t sqrt_reciprocal = vrsqrte_f32(x);
- sqrt_reciprocal = vmul_f32(vrsqrts_f32(vmul_f32(x, sqrt_reciprocal), sqrt_reciprocal), sqrt_reciprocal);
- sqrt_reciprocal = vmul_f32(vrsqrts_f32(vmul_f32(x, sqrt_reciprocal), sqrt_reciprocal), sqrt_reciprocal);
-
- return sqrt_reciprocal;
-}
-
-inline float32x4_t vinvsqrtq_f32(float32x4_t x)
-{
- float32x4_t sqrt_reciprocal = vrsqrteq_f32(x);
- sqrt_reciprocal = vmulq_f32(vrsqrtsq_f32(vmulq_f32(x, sqrt_reciprocal), sqrt_reciprocal), sqrt_reciprocal);
- sqrt_reciprocal = vmulq_f32(vrsqrtsq_f32(vmulq_f32(x, sqrt_reciprocal), sqrt_reciprocal), sqrt_reciprocal);
-
- return sqrt_reciprocal;
-}
-
-inline float32x2_t vinv_f32(float32x2_t x)
-{
- float32x2_t recip = vrecpe_f32(x);
- recip = vmul_f32(vrecps_f32(x, recip), recip);
- recip = vmul_f32(vrecps_f32(x, recip), recip);
- return recip;
-}
-
-inline float32x4_t vinvq_f32(float32x4_t x)
-{
- float32x4_t recip = vrecpeq_f32(x);
- recip = vmulq_f32(vrecpsq_f32(x, recip), recip);
- recip = vmulq_f32(vrecpsq_f32(x, recip), recip);
- return recip;
-}
-
-inline float32x4_t vtaylor_polyq_f32(float32x4_t x, const std::array<float32x4_t, 8> &coeffs)
-{
- float32x4_t A = vmlaq_f32(coeffs[0], coeffs[4], x);
- float32x4_t B = vmlaq_f32(coeffs[2], coeffs[6], x);
- float32x4_t C = vmlaq_f32(coeffs[1], coeffs[5], x);
- float32x4_t D = vmlaq_f32(coeffs[3], coeffs[7], x);
- float32x4_t x2 = vmulq_f32(x, x);
- float32x4_t x4 = vmulq_f32(x2, x2);
- float32x4_t res = vmlaq_f32(vmlaq_f32(A, B, x2), vmlaq_f32(C, D, x2), x4);
- return res;
-}
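-
-/* Editorial note: this is Estrin's scheme. With A = c[0] + c[4]*x, B = c[2] + c[6]*x,
- * C = c[1] + c[5]*x and D = c[3] + c[7]*x, the result (A + B*x^2) + (C + D*x^2)*x^4
- * expands to c[0] + c[4]*x + c[2]*x^2 + c[6]*x^3 + c[1]*x^4 + c[5]*x^5 + c[3]*x^6 + c[7]*x^7,
- * i.e. the coefficient tables above store the polynomial in this permuted order,
- * trading Horner's long dependency chain for shorter independent chains. */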
-
-inline float32x4_t vexpq_f32(float32x4_t x)
-{
- static const float32x4_t CONST_LN2 = vdupq_n_f32(0.6931471805f); // ln(2)
- static const float32x4_t CONST_INV_LN2 = vdupq_n_f32(1.4426950408f); // 1/ln(2)
- static const float32x4_t CONST_INF = vdupq_n_f32(std::numeric_limits<float>::infinity());
- static const float32x4_t CONST_MAX_INPUT = vdupq_n_f32(88.7f);
- static const float32x4_t CONST_0 = vdupq_n_f32(0.f);
- static const int32x4_t CONST_NEGATIVE_126 = vdupq_n_s32(-126);
-
- // Perform range reduction [-log(2),log(2)]
- int32x4_t m = vcvtq_s32_f32(vmulq_f32(x, CONST_INV_LN2));
- float32x4_t val = vmlsq_f32(x, vcvtq_f32_s32(m), CONST_LN2);
-
- // Polynomial Approximation
- float32x4_t poly = vtaylor_polyq_f32(val, exp_tab);
-
- // Reconstruct
- poly = vreinterpretq_f32_s32(vqaddq_s32(vreinterpretq_s32_f32(poly), vqshlq_n_s32(m, 23)));
- poly = vbslq_f32(vcltq_s32(m, CONST_NEGATIVE_126), CONST_0, poly); // Handle underflow
- poly = vbslq_f32(vcgtq_f32(x, CONST_MAX_INPUT), CONST_INF, poly); // Handle overflow
-
- return poly;
-}
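-
-/* Editorial note on the reconstruction above: the input is split as
- * x = m*ln(2) + val, so e^x = 2^m * e^val; adding (m << 23) to the bit pattern
- * of the polynomial result scales it by 2^m, since 23 is the width of the
- * float32 mantissa and the shift lands in the exponent field. */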
-
-inline float32x4_t vlogq_f32(float32x4_t x)
-{
- static const int32x4_t CONST_127 = vdupq_n_s32(127); // 127
- static const float32x4_t CONST_LN2 = vdupq_n_f32(0.6931471805f); // ln(2)
-
- // Extract exponent
- int32x4_t m = vsubq_s32(vreinterpretq_s32_u32(vshrq_n_u32(vreinterpretq_u32_f32(x), 23)), CONST_127);
- float32x4_t val = vreinterpretq_f32_s32(vsubq_s32(vreinterpretq_s32_f32(x), vshlq_n_s32(m, 23)));
-
- // Polynomial Approximation
- float32x4_t poly = vtaylor_polyq_f32(val, log_tab);
-
- // Reconstruct
- poly = vmlaq_f32(poly, vcvtq_f32_s32(m), CONST_LN2);
-
- return poly;
-}
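-
-/* Editorial note: here the input is decomposed as x = 2^m * val with val in
- * [1, 2), so log(x) = log(val) + m*ln(2), which is exactly what the final
- * vmlaq_f32 reconstruction computes. */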
-
-inline float32x4_t vtanhq_f32(float32x4_t val)
-{
- static const float32x4_t CONST_1 = vdupq_n_f32(1.f);
- static const float32x4_t CONST_2 = vdupq_n_f32(2.f);
- static const float32x4_t CONST_MIN_TANH = vdupq_n_f32(-10.f);
- static const float32x4_t CONST_MAX_TANH = vdupq_n_f32(10.f);
-
- float32x4_t x = vminq_f32(vmaxq_f32(val, CONST_MIN_TANH), CONST_MAX_TANH);
- float32x4_t exp2x = vexpq_f32(vmulq_f32(CONST_2, x));
- float32x4_t num = vsubq_f32(exp2x, CONST_1);
- float32x4_t den = vaddq_f32(exp2x, CONST_1);
- float32x4_t tanh = vmulq_f32(num, vinvq_f32(den));
- return tanh;
-}
-
-inline float32x4_t vpowq_f32(float32x4_t val, float32x4_t n)
-{
- return vexpq_f32(vmulq_f32(n, vlogq_f32(val)));
-}
-
-inline float32x4_t vsinq_f32(float32x4_t val)
-{
- const float32x4_t pi_v = vdupq_n_f32(M_PI);
- const float32x4_t pio2_v = vdupq_n_f32(M_PI / 2);
- const float32x4_t ipi_v = vdupq_n_f32(1 / M_PI);
-
- //Find positive or negative
- const int32x4_t c_v = vabsq_s32(vcvtq_s32_f32(vmulq_f32(val, ipi_v)));
- const uint32x4_t sign_v = vcleq_f32(val, vdupq_n_f32(0));
- const uint32x4_t odd_v = vandq_u32(vreinterpretq_u32_s32(c_v), vdupq_n_u32(1));
-
- uint32x4_t neg_v = veorq_u32(odd_v, sign_v);
-
- //Modulus a - (n * int(a*(1/n)))
- float32x4_t ma = vsubq_f32(vabsq_f32(val), vmulq_f32(pi_v, vcvtq_f32_s32(c_v)));
- const uint32x4_t reb_v = vcgeq_f32(ma, pio2_v);
-
- //Rebase a between 0 and pi/2
- ma = vbslq_f32(reb_v, vsubq_f32(pi_v, ma), ma);
-
- //Taylor series
- const float32x4_t ma2 = vmulq_f32(ma, ma);
-
- //2nd elem: x^3 / 3!
- float32x4_t elem = vmulq_f32(vmulq_f32(ma, ma2), vdupq_n_f32(te_sin_coeff2));
- float32x4_t res = vsubq_f32(ma, elem);
-
- //3rd elem: x^5 / 5!
- elem = vmulq_f32(vmulq_f32(elem, ma2), vdupq_n_f32(te_sin_coeff3));
- res = vaddq_f32(res, elem);
-
- //4th elem: x^7 / 7!
- elem = vmulq_f32(vmulq_f32(elem, ma2), vdupq_n_f32(te_sin_coeff4));
- res = vsubq_f32(res, elem);
-
- //5th elem: x^9 / 9!
- elem = vmulq_f32(vmulq_f32(elem, ma2), vdupq_n_f32(te_sin_coeff5));
- res = vaddq_f32(res, elem);
-
- //Change of sign
- neg_v = vshlq_n_u32(neg_v, 31);
- res = vreinterpretq_f32_u32(veorq_u32(vreinterpretq_u32_f32(res), neg_v));
- return res;
-}
-
-inline float32x2_t vsin_f32(float32x2_t val)
-{
- const float32x2_t pi_v = vdup_n_f32(M_PI);
- const float32x2_t pio2_v = vdup_n_f32(M_PI / 2);
- const float32x2_t ipi_v = vdup_n_f32(1 / M_PI);
-
- //Find positive or negative
- const int32x2_t c_v = vabs_s32(vcvt_s32_f32(vmul_f32(val, ipi_v)));
- const uint32x2_t sign_v = vcle_f32(val, vdup_n_f32(0));
- const uint32x2_t odd_v = vand_u32(vreinterpret_u32_s32(c_v), vdup_n_u32(1));
-
- uint32x2_t neg_v = veor_u32(odd_v, sign_v);
-
- //Modulus a - (n * int(a*(1/n)))
- float32x2_t ma = vsub_f32(vabs_f32(val), vmul_f32(pi_v, vcvt_f32_s32(c_v)));
- const uint32x2_t reb_v = vcge_f32(ma, pio2_v);
-
- //Rebase a between 0 and pi/2
- ma = vbsl_f32(reb_v, vsub_f32(pi_v, ma), ma);
-
- //Taylor series
- const float32x2_t ma2 = vmul_f32(ma, ma);
-
- //2nd elem: x^3 / 3!
- float32x2_t elem = vmul_f32(vmul_f32(ma, ma2), vdup_n_f32(te_sin_coeff2));
- float32x2_t res = vsub_f32(ma, elem);
-
- //3rd elem: x^5 / 5!
- elem = vmul_f32(vmul_f32(elem, ma2), vdup_n_f32(te_sin_coeff3));
- res = vadd_f32(res, elem);
-
- //4th elem: x^7 / 7!
- elem = vmul_f32(vmul_f32(elem, ma2), vdup_n_f32(te_sin_coeff4));
- res = vsub_f32(res, elem);
-
- //5th elem: x^9 / 9!
- elem = vmul_f32(vmul_f32(elem, ma2), vdup_n_f32(te_sin_coeff5));
- res = vadd_f32(res, elem);
-
- //Change of sign
- neg_v = vshl_n_u32(neg_v, 31);
- res = vreinterpret_f32_u32(veor_u32(vreinterpret_u32_f32(res), neg_v));
- return res;
-}
-
-#endif /* DOXYGEN_SKIP_THIS */
-
-inline int32x4_t rounding_divide_by_pow2(int32x4_t x, int32x4_t exponent)
-{
- const int32x4_t shift_vec = vnegq_s32(exponent);
- const int32x4_t fixup = vshrq_n_s32(vandq_s32(x, shift_vec), 31);
- const int32x4_t fixed_up_x = vqaddq_s32(x, fixup);
- return vrshlq_s32(fixed_up_x, shift_vec);
-}
-
-inline int32x4_t rounding_divide_by_pow2(int32x4_t x, int exponent)
-{
- const int32x4_t shift_vec = vdupq_n_s32(-exponent);
- const int32x4_t fixup = vshrq_n_s32(vandq_s32(x, shift_vec), 31);
- const int32x4_t fixed_up_x = vqaddq_s32(x, fixup);
- return vrshlq_s32(fixed_up_x, shift_vec);
-}
-
-inline int32_t rounding_divide_by_pow2(int32_t x, int exponent)
-{
- const int32_t mask = (1 << exponent) - 1;
- const int32_t threshold = (mask >> 1) + (x < 0 ? 1 : 0);
- return (x >> exponent) + ((x & mask) > threshold ? 1 : 0);
-}
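-
-/* Worked example (editorial): with exponent = 2 the scalar overload gives
- * rounding_divide_by_pow2(10, 2) == 3 (10/4 = 2.5 rounds away from zero) and
- * rounding_divide_by_pow2(-10, 2) == -3, matching the vector overloads built
- * on vrshlq_s32 plus the sign fixup. */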
-
-inline float32x4x4_t convert_uint8x16_to_float32x4x4(const uint8x16_t &in)
-{
- float32x4x4_t out;
-
- const auto tmp1 = vmovl_u8(vget_low_u8(in));
- out.val[0] = vcvtq_f32_u32(vmovl_u16(vget_low_u16(tmp1)));
- out.val[1] = vcvtq_f32_u32(vmovl_u16(vget_high_u16(tmp1)));
-
- const auto tmp2 = vmovl_u8(vget_high_u8(in));
- out.val[2] = vcvtq_f32_u32(vmovl_u16(vget_low_u16(tmp2)));
- out.val[3] = vcvtq_f32_u32(vmovl_u16(vget_high_u16(tmp2)));
- return out;
-}
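-
-/* Usage sketch (editorial, hypothetical values): widen a vector of 16 bytes
- * into four float vectors, as the colour-convert kernels above do for Y/U/V data.
- *   const uint8x16_t bytes = vdupq_n_u8(200);
- *   const float32x4x4_t f32 = convert_uint8x16_to_float32x4x4(bytes); // every lane 200.0f
- */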
-
-inline float32x4x4_t convert_int8x16_to_float32x4x4(const int8x16_t &in)
-{
- float32x4x4_t out;
-
- const auto tmp1 = vmovl_s8(vget_low_s8(in));
- out.val[0] = vcvtq_f32_s32(vmovl_s16(vget_low_s16(tmp1)));
- out.val[1] = vcvtq_f32_s32(vmovl_s16(vget_high_s16(tmp1)));
-
- const auto tmp2 = vmovl_s8(vget_high_s8(in));
- out.val[2] = vcvtq_f32_s32(vmovl_s16(vget_low_s16(tmp2)));
- out.val[3] = vcvtq_f32_s32(vmovl_s16(vget_high_s16(tmp2)));
- return out;
-}
-
-template <>
-inline float32x4x4_t convert_to_float32x4x4(const uint8x16_t &in)
-{
- return convert_uint8x16_to_float32x4x4(in);
-}
-
-template <>
-inline float32x4x4_t convert_to_float32x4x4(const int8x16_t &in)
-{
- return convert_int8x16_to_float32x4x4(in);
-}
-
-inline void convert_float32x4x3_to_uint8x8x3(const float32x4x3_t &in1, const float32x4x3_t &in2, uint8x8x3_t &out)
-{
- out.val[0] = vqmovn_u16(vcombine_u16(vqmovn_u32(vcvtq_u32_f32(in1.val[0])),
- vqmovn_u32(vcvtq_u32_f32(in2.val[0]))));
- out.val[1] = vqmovn_u16(vcombine_u16(vqmovn_u32(vcvtq_u32_f32(in1.val[1])),
- vqmovn_u32(vcvtq_u32_f32(in2.val[1]))));
- out.val[2] = vqmovn_u16(vcombine_u16(vqmovn_u32(vcvtq_u32_f32(in1.val[2])),
- vqmovn_u32(vcvtq_u32_f32(in2.val[2]))));
-}
-
-inline void convert_float32x4x4_to_uint8x16(const float32x4x4_t &in, uint8x16_t &out)
-{
- const auto low = vcombine_u16(vqmovn_u32(vcvtq_u32_f32(in.val[0])),
- vqmovn_u32(vcvtq_u32_f32(in.val[1])));
- const auto high = vcombine_u16(vqmovn_u32(vcvtq_u32_f32(in.val[2])),
- vqmovn_u32(vcvtq_u32_f32(in.val[3])));
- out = vcombine_u8(vqmovn_u16(low), vqmovn_u16(high));
-}
-
-inline void convert_float32x4x4_to_int8x16(const float32x4x4_t &in, int8x16_t &out)
-{
- const auto low = vcombine_s16(vqmovn_s32(vcvtq_s32_f32(in.val[0])),
- vqmovn_s32(vcvtq_s32_f32(in.val[1])));
- const auto high = vcombine_s16(vqmovn_s32(vcvtq_s32_f32(in.val[2])),
- vqmovn_s32(vcvtq_s32_f32(in.val[3])));
- out = vcombine_s8(vqmovn_s16(low), vqmovn_s16(high));
-}
-
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-#ifndef DOXYGEN_SKIP_THIS
-inline float16x8_t vfloorq_f16(float16x8_t val)
-{
- static const float16x8_t CONST_1 = vdupq_n_f16(1.f);
-
- const int16x8_t z = vcvtq_s16_f16(val);
- const float16x8_t r = vcvtq_f16_s16(z);
-
- return vbslq_f16(vcgtq_f16(r, val), vsubq_f16(r, CONST_1), r);
-}
-
-inline float16x8_t vroundq_rte_f16(float16x8_t val)
-{
- return vrndnq_f16(val);
-}
-
-inline float16x4_t vinvsqrt_f16(float16x4_t x)
-{
- float16x4_t sqrt_reciprocal = vrsqrte_f16(x);
- sqrt_reciprocal = vmul_f16(vrsqrts_f16(vmul_f16(x, sqrt_reciprocal), sqrt_reciprocal), sqrt_reciprocal);
- sqrt_reciprocal = vmul_f16(vrsqrts_f16(vmul_f16(x, sqrt_reciprocal), sqrt_reciprocal), sqrt_reciprocal);
- return sqrt_reciprocal;
-}
-
-inline float16x8_t vinvsqrtq_f16(float16x8_t x)
-{
- float16x8_t sqrt_reciprocal = vrsqrteq_f16(x);
- sqrt_reciprocal = vmulq_f16(vrsqrtsq_f16(vmulq_f16(x, sqrt_reciprocal), sqrt_reciprocal), sqrt_reciprocal);
- sqrt_reciprocal = vmulq_f16(vrsqrtsq_f16(vmulq_f16(x, sqrt_reciprocal), sqrt_reciprocal), sqrt_reciprocal);
- return sqrt_reciprocal;
-}
-
-inline float16x4_t vinv_f16(float16x4_t x)
-{
- float16x4_t recip = vrecpe_f16(x);
- recip = vmul_f16(vrecps_f16(x, recip), recip);
- recip = vmul_f16(vrecps_f16(x, recip), recip);
- return recip;
-}
-
-inline float16x8_t vinvq_f16(float16x8_t x)
-{
- float16x8_t recip = vrecpeq_f16(x);
- recip = vmulq_f16(vrecpsq_f16(x, recip), recip);
- recip = vmulq_f16(vrecpsq_f16(x, recip), recip);
- return recip;
-}
-
-inline float16x8_t vtanhq_f16(float16x8_t val)
-{
- const float16x8_t CONST_1 = vdupq_n_f16(1.f);
- const float16x8_t CONST_2 = vdupq_n_f16(2.f);
- const float16x8_t CONST_MIN_TANH = vdupq_n_f16(-10.f);
- const float16x8_t CONST_MAX_TANH = vdupq_n_f16(10.f);
-
- const float16x8_t x = vminq_f16(vmaxq_f16(val, CONST_MIN_TANH), CONST_MAX_TANH);
- const float16x8_t exp2x = vexpq_f16(vmulq_f16(CONST_2, x));
- const float16x8_t num = vsubq_f16(exp2x, CONST_1);
- const float16x8_t den = vaddq_f16(exp2x, CONST_1);
- const float16x8_t tanh = vmulq_f16(num, vinvq_f16(den));
- return tanh;
-}
-
-inline float16x8_t vtaylor_polyq_f16(float16x8_t x, const std::array<float16x8_t, 8> &coeffs)
-{
- const float16x8_t A = vaddq_f16(coeffs[0], vmulq_f16(coeffs[4], x));
- const float16x8_t B = vaddq_f16(coeffs[2], vmulq_f16(coeffs[6], x));
- const float16x8_t C = vaddq_f16(coeffs[1], vmulq_f16(coeffs[5], x));
- const float16x8_t D = vaddq_f16(coeffs[3], vmulq_f16(coeffs[7], x));
- const float16x8_t x2 = vmulq_f16(x, x);
- const float16x8_t x4 = vmulq_f16(x2, x2);
- const float16x8_t res = vaddq_f16(vaddq_f16(A, vmulq_f16(B, x2)), vmulq_f16(vaddq_f16(C, vmulq_f16(D, x2)), x4));
- return res;
-}
-
-inline float16x8_t vexpq_f16(float16x8_t x)
-{
- // TODO (COMPMID-1535) : Revisit FP16 approximations
- const float32x4_t x_high = vcvt_f32_f16(vget_high_f16(x));
- const float32x4_t x_low = vcvt_f32_f16(vget_low_f16(x));
-
- const float16x8_t res = vcombine_f16(vcvt_f16_f32(vexpq_f32(x_low)), vcvt_f16_f32(vexpq_f32(x_high)));
- return res;
-}
-
-inline float16x8_t vlogq_f16(float16x8_t x)
-{
- // TODO (COMPMID-1535) : Revisit FP16 approximations
- const float32x4_t x_high = vcvt_f32_f16(vget_high_f16(x));
- const float32x4_t x_low = vcvt_f32_f16(vget_low_f16(x));
-
- const float16x8_t res = vcombine_f16(vcvt_f16_f32(vlogq_f32(x_low)), vcvt_f16_f32(vlogq_f32(x_high)));
- return res;
-}
-
-inline float16x8_t vpowq_f16(float16x8_t val, float16x8_t n)
-{
- // TODO (giaiod01) - COMPMID-1535
- float32x4_t n0_f32 = vcvt_f32_f16(vget_low_f16(n));
- float32x4_t n1_f32 = vcvt_f32_f16(vget_high_f16(n));
- float32x4_t val0_f32 = vcvt_f32_f16(vget_low_f16(val));
- float32x4_t val1_f32 = vcvt_f32_f16(vget_high_f16(val));
-
- float32x4_t res0_f32 = vexpq_f32(vmulq_f32(n0_f32, vlogq_f32(val0_f32)));
- float32x4_t res1_f32 = vexpq_f32(vmulq_f32(n1_f32, vlogq_f32(val1_f32)));
-
- return vcombine_f16(vcvt_f16_f32(res0_f32), vcvt_f16_f32(res1_f32));
-}
-
-inline float16x8_t vsinq_f16(float16x8_t val)
-{
- const float32x4_t val_high = vcvt_f32_f16(vget_high_f16(val));
- const float32x4_t val_low = vcvt_f32_f16(vget_low_f16(val));
-
- const float32x4_t res_high = vsinq_f32(val_high);
- const float32x4_t res_low = vsinq_f32(val_low);
-
- return vcombine_f16(vcvt_f16_f32(res_low), vcvt_f16_f32(res_high));
-}
-
-inline float16x4_t vsin_f16(float16x4_t val)
-{
- const float32x4_t val_f32 = vcvt_f32_f16(val);
- const float32x2_t val_high = vget_high_f32(val_f32);
- const float32x2_t val_low = vget_low_f32(val_f32);
-
- const float32x2_t res_high = vsin_f32(val_high);
- const float32x2_t res_low = vsin_f32(val_low);
-
- return vcvt_f16_f32(vcombine_f32(res_low, res_high));
-}
-
-#endif /* DOXYGEN_SKIP_THIS */
-#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
-} // namespace arm_compute
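The deleted FP16 reciprocal and reciprocal-square-root helpers above follow the standard NEON pattern: start from the coarse hardware estimate (vrsqrte/vrecpe) and sharpen it with two Newton-Raphson steps (vrsqrts/vrecps). Below is a minimal scalar sketch of the same iteration; the bit-trick seed and the function name are illustrative stand-ins, not part of the library.

#include <cstdint>
#include <cstring>

// Scalar model of vinvsqrt_f16: crude seed, then two Newton-Raphson steps.
float inv_sqrt_sketch(float x)
{
    // Bit-trick initial guess, standing in for the vrsqrte table lookup.
    uint32_t i;
    std::memcpy(&i, &x, sizeof(i));
    i = 0x5f3759dfu - (i >> 1);
    float y;
    std::memcpy(&y, &i, sizeof(y));
    // Each step computes y * (3 - x*y*y) / 2, which is exactly the
    // vmul(vrsqrts(x*y, y), y) composition used in the deleted code.
    y = y * (1.5f - 0.5f * x * y * y);
    y = y * (1.5f - 0.5f * x * y * y);
    return y;
}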
diff --git a/arm_compute/core/NEON/NESymm.h b/arm_compute/core/NEON/NESymm.h
deleted file mode 100644
index d6c5a7073a..0000000000
--- a/arm_compute/core/NEON/NESymm.h
+++ /dev/null
@@ -1,256 +0,0 @@
-/*
- * Copyright (c) 2019-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_NESYMM_H
-#define ARM_COMPUTE_NESYMM_H
-
-#include "arm_compute/core/NEON/NEMath.h"
-#include "arm_compute/core/utils/quantization/AsymmHelpers.h"
-#include <arm_neon.h>
-
-namespace arm_compute
-{
-using qsymm8_t = int8_t; /**< 8 bit quantized symmetric scalar value */
-using qsymm16_t = int16_t; /**< 16 bit quantized symmetric scalar value */
-
-using qsymm16x8_t = int16x8_t; /**< 16 bit quantized symmetric vector with 8 elements */
-using qsymm16x8x2_t = int16x8x2_t; /**< 16 bit quantized symmetric vector with 16 elements */
-
-/** Performs final quantization step on 8 signed 16-bit elements
- *
- * @tparam is_bounded_relu Specifies whether a fused bounded ReLU should be applied
- *
- * @param[in] in_s32 Input to be quantized.
- * @param[in] result_fixedpoint_multiplier Result multiplier parameter
- * @param[in] result_shift Result shift parameter
- * @param[in] min_s16 Relu lower bound
- * @param[in] max_s16 Relu upper bound
- *
- * @return Quantized values
- */
-template <bool is_bounded_relu>
-int16x8_t finalize_quantization_int16(int32x4x2_t &in_s32,
- int result_fixedpoint_multiplier,
- int32_t result_shift,
- int16x8_t min_s16,
- int16x8_t max_s16)
-{
- if(result_shift < 0)
- {
- in_s32.val[0] = vmulq_n_s32(in_s32.val[0], (1 << -result_shift));
- in_s32.val[1] = vmulq_n_s32(in_s32.val[1], (1 << -result_shift));
-
- in_s32.val[0] = vqrdmulhq_n_s32(in_s32.val[0], result_fixedpoint_multiplier);
- in_s32.val[1] = vqrdmulhq_n_s32(in_s32.val[1], result_fixedpoint_multiplier);
- }
- else
- {
- // Fixed point multiplication with vector saturating rounding doubling multiply high with scalar
- in_s32.val[0] = vqrdmulhq_n_s32(in_s32.val[0], result_fixedpoint_multiplier);
- in_s32.val[1] = vqrdmulhq_n_s32(in_s32.val[1], result_fixedpoint_multiplier);
- // Round to the nearest division by a power-of-two using result_shift_s32
- in_s32.val[0] = rounding_divide_by_pow2(in_s32.val[0], result_shift);
- in_s32.val[1] = rounding_divide_by_pow2(in_s32.val[1], result_shift);
- }
-
- // Convert S32 to S16
- int16x8_t out_s16 = vcombine_s16(vqmovn_s32(in_s32.val[0]), vqmovn_s32(in_s32.val[1]));
-
- if(is_bounded_relu)
- {
- out_s16 = vmaxq_s16(out_s16, min_s16);
- out_s16 = vminq_s16(out_s16, max_s16);
- }
-
- return out_s16;
-}
-
-/** Performs the final quantization step on a single signed 16-bit element
- *
- * @tparam is_bounded_relu Specifies whether a fused bounded ReLU should be applied
- *
- * @param[in] in_value Input to be quantized.
- * @param[in] result_fixedpoint_multiplier Result multiplier parameter
- * @param[in] result_shift Result shift parameter
- * @param[in] min_s16 Relu lower bound
- * @param[in] max_s16 Relu upper bound
- *
- * @return Quantized values
- */
-template <bool is_bounded_relu>
-inline int16_t finalize_quantization_int16(int32_t in_value, int result_fixedpoint_multiplier,
- int32_t result_shift, int16_t min_s16, int16_t max_s16)
-{
- if(result_shift < 0)
- {
- const int64_t in_64 = static_cast<int64_t>(in_value) * (1 << (-result_shift)) * static_cast<int64_t>(result_fixedpoint_multiplier);
- in_value = static_cast<int32_t>((in_64 + (1 << 30)) >> 31);
- }
- else
- {
- // Fixed point multiplication with vector saturating rounding doubling multiply high with scalar
- const int64_t in_64 = static_cast<int64_t>(in_value) * static_cast<int64_t>(result_fixedpoint_multiplier);
- // Shift value by result_shift_s32
- in_value = rounding_divide_by_pow2(static_cast<int32_t>((in_64 + (1 << 30)) >> 31), result_shift);
- }
-
- // Bound the result
- int16_t out_s16 = static_cast<int16_t>(std::max<int32_t>(-32768, std::min<int32_t>(32767, in_value)));
-
- if(is_bounded_relu)
- {
- out_s16 = static_cast<int16_t>(std::max(min_s16, std::min(max_s16, out_s16)));
- }
-
- return out_s16;
-}
-
-/** Dequantize a neon vector holding 8 16-bit quantized values.
- *
- * @param[in] qv Input values to be dequantized.
- * @param[in] scale Quantization scale
- *
- * @return Dequantized values in a neon vector
- */
-inline float32x4x2_t vdequantize_int16(const int16x8_t &qv, float scale)
-{
- const float32x4_t vscale = vdupq_n_f32(scale);
- const float32x4x2_t vdequantized_input =
- {
- {
- vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(qv))), vscale),
- vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(qv))), vscale)
- }
- };
- return vdequantized_input;
-}
-
-/** Quantize a neon vector holding 8 floating point values.
- *
- * @param[in] qv Input values to be quantized.
- * @param[in] scale Quantization scale
- *
- * @return A neon vector holding the quantized values
- */
-inline int16x8_t vquantize_int16(const float32x4x2_t &qv, float scale)
-{
- const float32x4_t vinvscale = vdupq_n_f32(1.f / scale);
-
- const int32x4x2_t rf =
- {
- {
-#ifdef __aarch64__
- vcvtnq_s32_f32(vmulq_f32(qv.val[0], vinvscale)),
- vcvtnq_s32_f32(vmulq_f32(qv.val[1], vinvscale))
-#else //__aarch64__
- vcvtq_s32_f32(vmulq_f32(qv.val[0], vinvscale)),
- vcvtq_s32_f32(vmulq_f32(qv.val[1], vinvscale))
-#endif //__aarch64__
- }
- };
- return vcombine_s16(vqmovn_s32(rf.val[0]), vqmovn_s32(rf.val[1]));
-}
-
-/** Dequantize a neon vector holding 16 16-bit quantized values.
- *
- * @param[in] qv Input values to be dequantized.
- * @param[in] qi Quantization information to be used in the computation.
- *
- * @return Dequantized values in a neon vector
- */
-inline float32x4x4_t vdequantize(const int16x8x2_t &qv, const UniformQuantizationInfo &qi)
-{
- const float scale = qi.scale;
- const float32x4_t vscale = vdupq_n_f32(scale);
- const float32x4x4_t vdequantized_input =
- {
- {
- vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(qv.val[0]))), vscale),
- vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(qv.val[0]))), vscale),
- vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(qv.val[1]))), vscale),
- vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(qv.val[1]))), vscale),
- }
- };
- return vdequantized_input;
-}
-
-/** Quantize a neon vector holding 16 floating point values.
- *
- * @param[in] qv Input values to be quantized.
- * @param[in] qi Quantization information to be used in the computation.
- *
- * @return A neon vector holding the quantized values
- */
-inline qsymm16x8x2_t vquantize_qsymm16(const float32x4x4_t &qv, const UniformQuantizationInfo &qi)
-{
- const float scale = qi.scale;
- ARM_COMPUTE_ERROR_ON(scale == 0.f);
- const float32x4_t vinvscale = vdupq_n_f32(1.f / scale);
- const int32x4x4_t rf =
- {
- {
-#ifdef __aarch64__
- vcvtnq_s32_f32(vmulq_f32(qv.val[0], vinvscale)),
- vcvtnq_s32_f32(vmulq_f32(qv.val[1], vinvscale)),
- vcvtnq_s32_f32(vmulq_f32(qv.val[2], vinvscale)),
- vcvtnq_s32_f32(vmulq_f32(qv.val[3], vinvscale)),
-#else //__aarch64__
- vcvtq_s32_f32(vmulq_f32(qv.val[0], vinvscale)),
- vcvtq_s32_f32(vmulq_f32(qv.val[1], vinvscale)),
- vcvtq_s32_f32(vmulq_f32(qv.val[2], vinvscale)),
- vcvtq_s32_f32(vmulq_f32(qv.val[3], vinvscale)),
-#endif //__aarch64__
- }
- };
- const qsymm16x8x2_t res =
- {
- vcombine_s16(vqmovn_s32(rf.val[0]), vqmovn_s32(rf.val[1])),
- vcombine_s16(vqmovn_s32(rf.val[2]), vqmovn_s32(rf.val[3])),
- };
-
- return res;
-}
-
-/** Multiply a neon vector using quantized multiplier and shift
- *
- * @param[in] input Input vector whose values are to be multiplied.
- * @param[in] qmul  Quantized multiplier
- * @param[in] shift Bit shift; positive values shift left, negative values shift right
- *
- * @return A neon vector holding the multiplied value
- */
-inline int32x4x2_t multiply_by_quantized_multiplier_2row(int32x4x2_t input, int32_t qmul, int32_t shift)
-{
- const auto left_shift = shift > 0 ? shift : 0;
- const auto right_shift = shift > 0 ? 0 : -shift;
- const auto one_shifted = 1 << left_shift;
-
- int32x4x2_t result;
- result.val[0] = rounding_divide_by_pow2(vqrdmulhq_n_s32(vmulq_n_s32(input.val[0], one_shifted), qmul), right_shift);
- result.val[1] = rounding_divide_by_pow2(vqrdmulhq_n_s32(vmulq_n_s32(input.val[1], one_shifted), qmul), right_shift);
-
- return result;
-}
-
-} // namespace arm_compute
-#endif // ARM_COMPUTE_NESYMM_H
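The requantization path removed here boils down to a saturating rounding doubling multiply-high by a Q31 fixed-point multiplier, followed by a rounding power-of-two shift. The following is a hedged scalar model of the result_shift >= 0 branch (the negative-shift branch pre-multiplies by 2^-shift first, as the deleted code shows); the function name is illustrative.

#include <algorithm>
#include <cstdint>

// Scalar model of finalize_quantization_int16 for result_shift >= 0.
int16_t requantize_s16_sketch(int32_t v, int32_t multiplier, int32_t shift)
{
    // Saturating rounding doubling multiply-high (the vqrdmulhq_n_s32 step):
    // take the top 32 bits of 2*v*multiplier with round-to-nearest.
    const int64_t prod = static_cast<int64_t>(v) * static_cast<int64_t>(multiplier);
    int32_t high = static_cast<int32_t>((prod + (int64_t{1} << 30)) >> 31);
    // Rounding divide by 2^shift (the rounding_divide_by_pow2 step).
    if (shift > 0)
    {
        high = (high + (1 << (shift - 1))) >> shift;
    }
    // Saturate down to S16, as vqmovn_s32 does on the vector path.
    return static_cast<int16_t>(std::max(-32768, std::min(32767, high)));
}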
diff --git a/arm_compute/core/NEON/kernels/NEAbsoluteDifferenceKernel.h b/arm_compute/core/NEON/kernels/NEAbsoluteDifferenceKernel.h
deleted file mode 100644
index 7d35e40284..0000000000
--- a/arm_compute/core/NEON/kernels/NEAbsoluteDifferenceKernel.h
+++ /dev/null
@@ -1,86 +0,0 @@
-/*
- * Copyright (c) 2016-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_NEABSOLUTEDIFFERENCEKERNEL_H
-#define ARM_COMPUTE_NEABSOLUTEDIFFERENCEKERNEL_H
-
-#include "arm_compute/core/NEON/INEKernel.h"
-
-namespace arm_compute
-{
-class ITensor;
-
-/** Interface for the absolute difference kernel
- *
- * Absolute difference is computed by:
- * @f[ output(x,y) = | input1(x,y) - input2(x,y) | @f]
- */
-class NEAbsoluteDifferenceKernel : public INEKernel
-{
-public:
- const char *name() const override
- {
- return "NEAbsoluteDifferenceKernel";
- }
- /** Default constructor */
- NEAbsoluteDifferenceKernel();
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- NEAbsoluteDifferenceKernel(const NEAbsoluteDifferenceKernel &) = delete;
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- NEAbsoluteDifferenceKernel &operator=(const NEAbsoluteDifferenceKernel &) = delete;
- /** Allow instances of this class to be moved */
- NEAbsoluteDifferenceKernel(NEAbsoluteDifferenceKernel &&) = default;
- /** Allow instances of this class to be moved */
- NEAbsoluteDifferenceKernel &operator=(NEAbsoluteDifferenceKernel &&) = default;
- /** Default destructor */
- ~NEAbsoluteDifferenceKernel() = default;
-
- /** Set the inputs and output tensors
- *
- * @param[in] input1 Source tensor. Data types supported: U8/S16
- * @param[in] input2 Source tensor. Data types supported: U8/S16
- * @param[out] output Destination tensor. Data types supported: U8/S16
- */
- void configure(const ITensor *input1, const ITensor *input2, ITensor *output);
-
- // Inherited methods overridden:
- void run(const Window &window, const ThreadInfo &info) override;
-
-private:
- /** Common signature for all the specialised absolute difference functions
- *
- * @param[in] input1 An input tensor. Data types supported: U8/S16.
- * @param[in] input2 An input tensor. Data types supported: U8/S16.
- * @param[out] output The output tensor. Data types supported: U8 (only if both inputs are U8), S16.
- * @param[in] window Region on which to execute the kernel.
- */
- using AbsDiffFunction = void(const ITensor *input1, const ITensor *input2, ITensor *output, const Window &window);
-
- /** Absolute difference function to use for the particular tensor formats passed to configure() */
- AbsDiffFunction *_func;
- const ITensor *_input1;
- const ITensor *_input2;
- ITensor *_output;
-};
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_NEABSOLUTEDIFFERENCEKERNEL_H */
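For the U8/U8 configuration documented above, the per-element work reduces to a single NEON absolute-difference instruction per 16 pixels. Here is a self-contained sketch of that fast path; the real kernel dispatches S16 and mixed-type variants through AbsDiffFunction, so the loop structure and names here are illustrative only.

#include <arm_neon.h>
#include <cstdint>

void abs_diff_u8_sketch(const uint8_t *a, const uint8_t *b, uint8_t *out, int n)
{
    int i = 0;
    for (; i + 16 <= n; i += 16)
    {
        // |a - b| for 16 unsigned bytes at a time.
        vst1q_u8(out + i, vabdq_u8(vld1q_u8(a + i), vld1q_u8(b + i)));
    }
    for (; i < n; ++i) // scalar tail
    {
        out[i] = static_cast<uint8_t>(a[i] > b[i] ? a[i] - b[i] : b[i] - a[i]);
    }
}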
diff --git a/arm_compute/core/NEON/kernels/NEAccumulateKernel.h b/arm_compute/core/NEON/kernels/NEAccumulateKernel.h
deleted file mode 100644
index 367385dd7a..0000000000
--- a/arm_compute/core/NEON/kernels/NEAccumulateKernel.h
+++ /dev/null
@@ -1,139 +0,0 @@
-/*
- * Copyright (c) 2016-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_NEACCUMULATEKERNEL_H
-#define ARM_COMPUTE_NEACCUMULATEKERNEL_H
-
-#include "arm_compute/core/NEON/INESimpleKernel.h"
-
-#include <cstdint>
-
-namespace arm_compute
-{
-class ITensor;
-
-/** Interface for the accumulate kernel
- *
- * Accumulation is computed by:
- * @f[ accum(x,y) = accum(x,y) + input(x,y) @f]
- */
-class NEAccumulateKernel : public INESimpleKernel
-{
-public:
- const char *name() const override
- {
- return "NEAccumulateKernel";
- }
- /** Set the input and accumulation tensors
- *
- * @param[in] input Source tensor. Data type supported: U8.
- * @param[out] accum Destination tensor. Data type supported: S16.
- */
- void configure(const ITensor *input, ITensor *accum);
-
- // Inherited methods overridden:
- void run(const Window &window, const ThreadInfo &info) override;
-};
-
-/** Interface for the accumulate weighted kernel
- *
- * Weighted accumulation is computed:
- * @f[ accum(x,y) = (1 - \alpha)*accum(x,y) + \alpha*input(x,y) @f]
- *
- * Where @f$ 0 \le \alpha \le 1 @f$
- * Conceptually, the rounding for this is defined as:
- * @f[ output(x,y)= uint8( (1 - \alpha) * float32( int32( output(x,y) ) ) + \alpha * float32( int32( input(x,y) ) ) ) @f]
-*/
-class NEAccumulateWeightedKernel : public INESimpleKernel
-{
-public:
- const char *name() const override
- {
- return "NEAccumulateWeightedKernel";
- }
- /** Default constructor */
- NEAccumulateWeightedKernel();
- /** Set the input and accumulation tensors, and the scale value
- *
- * @param[in] input Source tensor. Data type supported: U8.
- * @param[in] alpha Scalar value in the range [0.0f, 1.0f]
- * @param[in,out] accum Accumulated tensor. Data type supported: U8.
- */
- void configure(const ITensor *input, float alpha, ITensor *accum);
-
- // Inherited methods overridden:
- void run(const Window &window, const ThreadInfo &info) override;
-
-protected:
- float _alpha;
-};
-
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-/** Interface for the accumulate weighted kernel using F16 */
-class NEAccumulateWeightedFP16Kernel : public NEAccumulateWeightedKernel
-{
-public:
- const char *name() const override
- {
- return "NEAccumulateWeightedFP16Kernel";
- }
- // Inherited methods overridden:
- void run(const Window &window, const ThreadInfo &info) override;
-};
-#else /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
-/** Interface for the accumulate weighted kernel using F16 */
-using NEAccumulateWeightedFP16Kernel = NEAccumulateWeightedKernel;
-#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
-
-/** Interface for the accumulate squared kernel
- *
- * The accumulation of squares is computed:
- * @f[ accum(x,y) = saturate_{int16} ( (uint16) accum(x,y) + (((uint16)(input(x,y)^2)) >> (shift)) ) @f]
- *
- * Where @f$ 0 \le shift \le 15 @f$
-*/
-class NEAccumulateSquaredKernel : public INESimpleKernel
-{
-public:
- const char *name() const override
- {
- return "NEAccumulateSquaredKernel";
- }
- /** Default constructor */
- NEAccumulateSquaredKernel();
- /** Set the input and accumulation tensors and the shift value.
- *
- * @param[in] input Source tensor. Data type supported: U8.
- * @param[in] shift Shift value in the range of [0, 15]
- * @param[in,out] accum Accumulated tensor. Data type supported: S16.
- */
- void configure(const ITensor *input, uint32_t shift, ITensor *accum);
-
- // Inherited methods overridden:
- void run(const Window &window, const ThreadInfo &info) override;
-
-private:
- uint32_t _shift;
-};
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_NEACCUMULATEKERNEL_H */
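The weighted-accumulation formula above mixes the running buffer and the new frame in float before truncating back to U8. A scalar sketch of exactly that rounding behaviour follows; it is illustrative, not the library's vectorised implementation.

#include <cstdint>

// accum(x,y) = (1 - alpha) * accum(x,y) + alpha * input(x,y), 0 <= alpha <= 1.
void accumulate_weighted_sketch(const uint8_t *input, uint8_t *accum, int n, float alpha)
{
    for (int i = 0; i < n; ++i)
    {
        const float mixed = (1.0f - alpha) * static_cast<float>(accum[i])
                          + alpha * static_cast<float>(input[i]);
        accum[i] = static_cast<uint8_t>(mixed); // truncating, per the formula
    }
}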
diff --git a/arm_compute/core/NEON/kernels/NEActivationLayerKernel.h b/arm_compute/core/NEON/kernels/NEActivationLayerKernel.h
deleted file mode 100644
index 82103b988b..0000000000
--- a/arm_compute/core/NEON/kernels/NEActivationLayerKernel.h
+++ /dev/null
@@ -1,120 +0,0 @@
-/*
- * Copyright (c) 2017-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_NEACTIVATIONLAYERKERNEL_H
-#define ARM_COMPUTE_NEACTIVATIONLAYERKERNEL_H
-
-#include "arm_compute/core/NEON/INEKernel.h"
-#include "arm_compute/core/utils/misc/Traits.h"
-
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-#include <arm_fp16.h>
-#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
-
-namespace arm_compute
-{
-class ITensor;
-
-/** Interface for the activation layer kernel. */
-class NEActivationLayerKernel : public INEKernel
-{
-public:
- const char *name() const override
- {
- return "NEActivationLayerKernel";
- }
- /** Constructor */
- NEActivationLayerKernel();
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- NEActivationLayerKernel(const NEActivationLayerKernel &) = delete;
- /** Default move constructor */
- NEActivationLayerKernel(NEActivationLayerKernel &&) = default;
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- NEActivationLayerKernel &operator=(const NEActivationLayerKernel &) = delete;
- /** Default move assignment operator */
- NEActivationLayerKernel &operator=(NEActivationLayerKernel &&) = default;
- /** Set the input and output tensor.
- *
- * @note If the output tensor is a nullptr, the activation function will be performed in-place
- *
- * @param[in, out] input Source tensor. In case of @p output tensor = nullptr, this tensor will store the result
- * of the activation function. Data types supported: QASYMM8/QASYMM8_SIGNED/QSYMM16/F16/F32.
- * @param[out] output Destination tensor. Data type supported: same as @p input
- * @param[in] activation_info Activation layer information.
- */
- void configure(ITensor *input, ITensor *output, ActivationLayerInfo activation_info);
- /** Static function to check if given info will lead to a valid configuration of @ref NEActivationLayerKernel
- *
- * @param[in] input Source tensor info. In case of @p output tensor info = nullptr, this tensor will store the result
- * of the activation function. Data types supported: QASYMM8/QASYMM8_SIGNED/QSYMM16/F16/F32.
- * @param[in] output Destination tensor info. Data type supported: same as @p input
- * @param[in] act_info Activation layer information.
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *input, const ITensorInfo *output, const ActivationLayerInfo &act_info);
-
- // Inherited methods overridden:
- void run(const Window &window, const ThreadInfo &info) override;
-
-private:
- using ActivationFunction = ActivationLayerInfo::ActivationFunction;
- /** Common signature for all the specialised @ref NEActivationLayerKernel functions
- *
- * @param[in] window Region on which to execute the kernel.
- */
- using ActivationFunctionExecutorPtr = void (NEActivationLayerKernel::*)(const Window &window);
- /** Function to apply an activation function on a tensor.
- *
- * @param[in] window Region on which to execute the kernel
- */
- template <ActivationLayerInfo::ActivationFunction F, typename T>
- typename std::enable_if<arm_compute::utils::traits::is_floating_point<T>::value, void>::type
- activation(const Window &window);
- /** Function to apply an activation function on a tensor.
- *
- * @param[in] window Region on which to execute the kernel
- */
- template <ActivationLayerInfo::ActivationFunction F, typename T>
- typename std::enable_if<std::is_same<T, qasymm8_t>::value, void>::type activation(const Window &window);
- /** Function to apply an activation function on a tensor.
- *
- * @param[in] window Region on which to execute the kernel
- */
- template <ActivationLayerInfo::ActivationFunction F, typename T>
- typename std::enable_if<std::is_same<T, qasymm8_signed_t>::value, void>::type activation(const Window &window);
- /** Function to apply an activation function on a tensor.
- *
- * @param[in] window Region on which to execute the kernel
- */
- template <ActivationLayerInfo::ActivationFunction F, typename T>
- typename std::enable_if<std::is_same<T, qsymm16_t>::value, void>::type activation(const Window &window);
-
-private:
- ITensor *_input;
- ITensor *_output;
- ActivationFunctionExecutorPtr _func;
- ActivationLayerInfo _act_info;
-};
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_NEACTIVATIONLAYERKERNEL_H */
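As the private section shows, the kernel binds one specialised activation<F, T> instantiation to a pointer-to-member at configure() time, so run() performs no per-call dispatch. Below is a stripped-down model of that pattern; the class and function names are invented for illustration.

#include <cstddef>

class TinyActivation
{
public:
    // Select the specialised function once, up front.
    void configure(bool use_relu)
    {
        _func = use_relu ? &TinyActivation::relu : &TinyActivation::identity;
    }
    // run() just invokes whatever configure() chose.
    void run(float *data, size_t n) { (this->*_func)(data, n); }

private:
    using FuncPtr = void (TinyActivation::*)(float *, size_t);
    void relu(float *d, size_t n)
    {
        for (size_t i = 0; i < n; ++i)
            d[i] = d[i] > 0.f ? d[i] : 0.f;
    }
    void identity(float *, size_t) {}
    FuncPtr _func{ nullptr };
};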
diff --git a/arm_compute/core/NEON/kernels/NEArithmeticAdditionKernel.h b/arm_compute/core/NEON/kernels/NEArithmeticAdditionKernel.h
deleted file mode 100644
index 36d257b886..0000000000
--- a/arm_compute/core/NEON/kernels/NEArithmeticAdditionKernel.h
+++ /dev/null
@@ -1,108 +0,0 @@
-/*
- * Copyright (c) 2016-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_NEARITHMETICADDITIONKERNEL_H
-#define ARM_COMPUTE_NEARITHMETICADDITIONKERNEL_H
-
-#include "arm_compute/core/NEON/INEKernel.h"
-#include "arm_compute/core/Types.h"
-
-namespace arm_compute
-{
-class ITensor;
-
-/** Interface for the kernel to perform addition between two tensors */
-class NEArithmeticAdditionKernel : public INEKernel
-{
-public:
- const char *name() const override
- {
- return "NEArithmeticAdditionKernel";
- }
- /** Default constructor */
- NEArithmeticAdditionKernel();
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- NEArithmeticAdditionKernel(const NEArithmeticAdditionKernel &) = delete;
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- NEArithmeticAdditionKernel &operator=(const NEArithmeticAdditionKernel &) = delete;
- /** Allow instances of this class to be moved */
- NEArithmeticAdditionKernel(NEArithmeticAdditionKernel &&) = default;
- /** Allow instances of this class to be moved */
- NEArithmeticAdditionKernel &operator=(NEArithmeticAdditionKernel &&) = default;
- /** Default destructor */
- ~NEArithmeticAdditionKernel() = default;
-
- /** Initialise the kernel's input, output and border mode.
- *
- * Valid configurations (Input1,Input2) -> Output :
- *
- * - (U8,U8) -> U8
- * - (U8,U8) -> S16
- * - (S16,U8) -> S16
- * - (U8,S16) -> S16
- * - (S16,S16) -> S16
- * - (F16,F16) -> F16
- * - (F32,F32) -> F32
- * - (QASYMM8,QASYMM8) -> QASYMM8
- * - (QASYMM8_SIGNED,QASYMM8_SIGNED) -> QASYMM8_SIGNED
- * - (QSYMM16,QSYMM16) -> QSYMM16
- *
- * @param[in] input1 An input tensor. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/F16/F32
- * @param[in] input2 An input tensor. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/F16/F32
- * @param[out] output The output tensor. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/F16/F32.
- * @param[in] policy Overflow policy.
- */
- void configure(const ITensor *input1, const ITensor *input2, ITensor *output, ConvertPolicy policy);
- /** Static function to check if given info will lead to a valid configuration of @ref NEArithmeticAdditionKernel
- *
- * @param[in] input1 An input tensor. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/F16/F32
- * @param[in] input2 An input tensor. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/F16/F32
- * @param[in] output The output tensor. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/F16/F32.
- * @param[in] policy Overflow policy.
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, ConvertPolicy policy);
-
- // Inherited methods overridden:
- void run(const Window &window, const ThreadInfo &info) override;
-
-private:
- /** Common signature for all the specialised add functions
- *
- * @param[in] input1 An input tensor. Data types supported: U8/QASYMM8/S16/QSYMM16/F16/F32
- * @param[in] input2 An input tensor. Data types supported: U8/QASYMM8/S16/QSYMM16/F16/F32
- * @param[out] output The output tensor. Data types supported: U8/QASYMM8/S16/QSYMM16/F16/F32.
- * @param[in] policy Overflow policy.
- * @param[in] window Region on which to execute the kernel.
- */
- using AddFunction = void(const ITensor *input1, const ITensor *input2, ITensor *output, ConvertPolicy policy, const Window &window);
- /** Add function to use for the particular tensor types passed to configure() */
- AddFunction *_func;
- const ITensor *_input1;
- const ITensor *_input2;
- ITensor *_output;
- ConvertPolicy _policy;
-};
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_NEARITHMETICADDITIONKERNEL_H */
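The ConvertPolicy argument decides between wrapping and saturating arithmetic for the integer configurations listed above. For the (S16,S16) -> S16 case that is the difference between the plain and the saturating NEON add, as this small sketch assumes:

#include <arm_neon.h>

int16x8_t add_s16_sketch(int16x8_t a, int16x8_t b, bool saturate)
{
    return saturate ? vqaddq_s16(a, b)  // SATURATE: clamp at -32768/32767
                    : vaddq_s16(a, b);  // WRAP: modular two's-complement add
}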
diff --git a/arm_compute/core/NEON/kernels/NEArithmeticSubtractionKernel.h b/arm_compute/core/NEON/kernels/NEArithmeticSubtractionKernel.h
deleted file mode 100644
index f75c6bfb98..0000000000
--- a/arm_compute/core/NEON/kernels/NEArithmeticSubtractionKernel.h
+++ /dev/null
@@ -1,109 +0,0 @@
-/*
- * Copyright (c) 2016-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_NEARITHMETICSUBTRACTIONKERNEL_H
-#define ARM_COMPUTE_NEARITHMETICSUBTRACTIONKERNEL_H
-
-#include "arm_compute/core/NEON/INEKernel.h"
-#include "arm_compute/core/Types.h"
-
-namespace arm_compute
-{
-class ITensor;
-
-/** Interface for the kernel to perform subtraction between two tensors */
-class NEArithmeticSubtractionKernel : public INEKernel
-{
-public:
- const char *name() const override
- {
- return "NEArithmeticSubtractionKernel";
- }
- /** Default constructor */
- NEArithmeticSubtractionKernel();
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- NEArithmeticSubtractionKernel(const NEArithmeticSubtractionKernel &) = delete;
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- NEArithmeticSubtractionKernel &operator=(const NEArithmeticSubtractionKernel &) = delete;
- /** Allow instances of this class to be moved */
- NEArithmeticSubtractionKernel(NEArithmeticSubtractionKernel &&) = default;
- /** Allow instances of this class to be moved */
- NEArithmeticSubtractionKernel &operator=(NEArithmeticSubtractionKernel &&) = default;
- /** Default destructor */
- ~NEArithmeticSubtractionKernel() = default;
-
- /** Initialise the kernel's input and output.
- *
- * Valid configurations (Input1,Input2) -> Output :
- *
- * - (U8,U8) -> U8
- * - (U8,U8) -> S16
- * - (QASYMM8, QASYMM8) -> QASYMM8
- * - (QASYMM8_SIGNED, QASYMM8_SIGNED) -> QASYMM8_SIGNED
- * - (S16,U8) -> S16
- * - (U8,S16) -> S16
- * - (S16,S16) -> S16
- * - (F16,F16) -> F16
- * - (F32,F32) -> F32
- *
- * @param[in] input1 An input tensor. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/QSYMM16/S16/F16/F32
- * @param[in] input2 An input tensor. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/QSYMM16/S16/F16/F32
- * @param[out] output The output tensor. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/QSYMM16/S16/F16/F32.
- * @param[in] policy Overflow policy. Convert policy cannot be WRAP if datatype is quantized.
- */
- void configure(const ITensor *input1, const ITensor *input2, ITensor *output, ConvertPolicy policy);
- /** Static function to check if given info will lead to a valid configuration of @ref NEArithmeticSubtractionKernel
- *
- * @note Convert policy cannot be WRAP if datatype is QASYMM8
- *
- * @param[in] input1 An input tensor info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/QSYMM16/S16/F16/F32
- * @param[in] input2 An input tensor info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/QSYMM16/S16/F16/F32
- * @param[in] output The output tensor info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/QSYMM16/S16/F16/F32.
- * @param[in] policy Policy to use to handle overflow. Convert policy cannot be WRAP if datatype is quantized.
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, ConvertPolicy policy);
-
- // Inherited methods overridden:
- void run(const Window &window, const ThreadInfo &info) override;
-
-private:
- /** Common signature for all the specialised sub functions
- *
- * @param[in] input1 An input tensor. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/QSYMM16/S16/F16/F32
- * @param[in] input2 An input tensor. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/QSYMM16/S16/F16/F32
- * @param[out] output The output tensor. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/QSYMM16/S16/F16/F32.
- * @param[in] window Region on which to execute the kernel.
- * @param[in] is_sat Flag to indicate if the policy is SATURATE.
- */
- using SubFunction = void(const ITensor *input1, const ITensor *input2, ITensor *output, const Window &window, bool is_sat);
- /** Sub function to use for the particular tensor types passed to configure() */
- SubFunction *_func;
- const ITensor *_input1;
- const ITensor *_input2;
- ITensor *_output;
- ConvertPolicy _policy;
-};
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_NEARITHMETICSUBTRACTIONKERNEL_H */
diff --git a/arm_compute/core/NEON/kernels/NEBatchConcatenateLayerKernel.h b/arm_compute/core/NEON/kernels/NEBatchConcatenateLayerKernel.h
deleted file mode 100644
index f943744ba0..0000000000
--- a/arm_compute/core/NEON/kernels/NEBatchConcatenateLayerKernel.h
+++ /dev/null
@@ -1,91 +0,0 @@
-/*
- * Copyright (c) 2019-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#ifndef ARM_COMPUTE_NEBATCHCONCATENATEKERNEL_H
-#define ARM_COMPUTE_NEBATCHCONCATENATEKERNEL_H
-
-#include "arm_compute/core/NEON/INEKernel.h"
-
-namespace arm_compute
-{
-// Forward declarations
-class ITensor;
-
-/** Interface for the batch concatenate kernel.
- * The input tensor will be concatenated into the output tensor.
- */
-class NEBatchConcatenateLayerKernel : public INEKernel
-{
-public:
- const char *name() const override
- {
- return "NEBatchConcatenateLayerKernel";
- }
- /** Default constructor */
- NEBatchConcatenateLayerKernel();
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- NEBatchConcatenateLayerKernel(const NEBatchConcatenateLayerKernel &) = delete;
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- NEBatchConcatenateLayerKernel &operator=(const NEBatchConcatenateLayerKernel &) = delete;
- /** Allow instances of this class to be moved */
- NEBatchConcatenateLayerKernel(NEBatchConcatenateLayerKernel &&) = default;
- /** Allow instances of this class to be moved */
- NEBatchConcatenateLayerKernel &operator=(NEBatchConcatenateLayerKernel &&) = default;
- /** Default destructor */
- ~NEBatchConcatenateLayerKernel() = default;
- /** Initialise the kernel's inputs and output
- *
- * @param[in] input Input tensor. Data types supported: All.
- * @param[in] batch_offset The offset on axis 3.
- * @param[in,out] output Output tensor. Data types supported: Same as @p input.
- *
- * @note The output tensor's two lowest dimensions can't be smaller than the input's.
- * @note The difference between each of the two lowest dimensions of the input and output must be divisible by 2.
- *
- */
- void configure(const ITensor *input, unsigned int batch_offset, ITensor *output);
- /** Static function to check if given info will lead to a valid configuration of @ref NEBatchConcatenateLayerKernel
- *
- * @param[in] input Input tensor info. Data types supported: All.
- * @param[in] batch_offset The offset on axis 3.
- * @param[in] output Output tensor info. Data types supported: Same as @p input.
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *input, unsigned int batch_offset, const ITensorInfo *output);
-
- // Inherited methods overridden:
- void run(const Window &window, const ThreadInfo &info) override;
-
-private:
- using BatchConcatFunction = void(const ITensor *in, ITensor *out, unsigned int batch_offset, const Window &window);
-
-private:
- BatchConcatFunction *_func;
- const ITensor *_input;
- ITensor *_output;
- unsigned int _batch_offset;
-};
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_NEBATCHCONCATENATEKERNEL_H */
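Concatenating along axis 3 amounts to copying the whole input into the destination at an element offset of batch_offset times the size of one batch. A hedged, contiguous-layout sketch follows; the real kernel works window-by-window and handles arbitrary strides, so the names and layout assumption here are illustrative.

#include <cstddef>
#include <cstring>

void concat_batches_sketch(const float *in, float *out,
                           size_t w, size_t h, size_t c,
                           size_t in_batches, size_t batch_offset)
{
    const size_t batch_elems = w * h * c; // elements in one batch slice
    // Place the input's batches starting at slot batch_offset of the output.
    std::memcpy(out + batch_offset * batch_elems, in,
                in_batches * batch_elems * sizeof(float));
}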
diff --git a/arm_compute/core/NEON/kernels/NEBatchNormalizationLayerKernel.h b/arm_compute/core/NEON/kernels/NEBatchNormalizationLayerKernel.h
deleted file mode 100644
index d59ed7baf0..0000000000
--- a/arm_compute/core/NEON/kernels/NEBatchNormalizationLayerKernel.h
+++ /dev/null
@@ -1,152 +0,0 @@
-/*
- * Copyright (c) 2017-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_NEBATCHNORMALIZATIONLAYERKERNEL_H
-#define ARM_COMPUTE_NEBATCHNORMALIZATIONLAYERKERNEL_H
-
-#include "arm_compute/core/NEON/INEKernel.h"
-
-namespace arm_compute
-{
-class ITensor;
-
-/** Interface for the batch normalization layer kernel.
- */
-class NEBatchNormalizationLayerKernel : public INEKernel
-{
-public:
- const char *name() const override
- {
- return "NEBatchNormalizationLayerKernel";
- }
- /** Default constructor */
- NEBatchNormalizationLayerKernel();
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- NEBatchNormalizationLayerKernel(const NEBatchNormalizationLayerKernel &) = delete;
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- NEBatchNormalizationLayerKernel &operator=(const NEBatchNormalizationLayerKernel &) = delete;
- /** Default Move Constructor. */
- NEBatchNormalizationLayerKernel(NEBatchNormalizationLayerKernel &&) = default;
- /** Default move assignment operator */
- NEBatchNormalizationLayerKernel &operator=(NEBatchNormalizationLayerKernel &&) = default;
- /** Default destructor */
- ~NEBatchNormalizationLayerKernel() = default;
- /** Set the input and output tensors.
- *
- * @note If the output tensor is a nullptr, the batch normalization function will be performed in-place
- *
- * @param[in, out] input Source tensor. In case of @p output tensor = nullptr, this tensor will store the result.
- * 3 lower dimensions represent a single input with dimensions [width, height, FM].
- * The rest are optional and used for representing batches. Data types supported: F16/F32.
- * @param[out] output Destination tensor. Output will have the same number of dimensions as input. Data type supported: same as @p input
- * @param[in] mean Mean values tensor. 1 dimension with size equal to the feature maps [FM]. Data types supported: Same as @p input
- * @param[in] var Variance values tensor. 1 dimension with size equal to the feature maps [FM]. Data types supported: Same as @p input
- * @param[in] beta (Optional) Beta values tensor info. 1 dimension with size equal to the feature maps [FM]. If not provided, default value for beta is 0. Data types supported: Same as @p input
- * @param[in] gamma (Optional) Gamma values tensor info. 1 dimension with size equal to the feature maps [FM]. If not provided, default value for gamma is 1. Data types supported: Same as @p input
- * @param[in] epsilon (Optional) Small value to avoid division with zero. Default value is 0.001f.
- * @param[in] act_info (Optional) Activation layer information in case of a fused activation. Only RELU, BOUNDED_RELU and LU_BOUNDED_RELU supported.
- */
- void configure(ITensor *input, ITensor *output, const ITensor *mean, const ITensor *var, const ITensor *beta = nullptr, const ITensor *gamma = nullptr, float epsilon = 0.001f,
- ActivationLayerInfo act_info = ActivationLayerInfo());
- /** Static function to check if given info will lead to a valid configuration of @ref NEBatchNormalizationLayerKernel
- *
- * @param[in] input Source tensor info. In case of @p output tensor = nullptr, this tensor will store the result.
- * 3 lower dimensions represent a single input with dimensions [width, height, FM].
- * The rest are optional and used for representing batches. Data types supported: F16/F32.
- * @param[in] output Destination tensor info. Output will have the same number of dimensions as input. Data type supported: same as @p input
- * @param[in] mean Mean values tensor info. 1 dimension with size equal to the feature maps [FM]. Data types supported: Same as @p input
- * @param[in] var Variance values tensor info. 1 dimension with size equal to the feature maps [FM]. Data types supported: Same as @p input
- * @param[in] beta (Optional) Beta values tensor info. 1 dimension with size equal to the feature maps [FM]. If not provided, default value for beta is 0. Data types supported: Same as @p input
- * @param[in] gamma (Optional) Gamma values tensor info. 1 dimension with size equal to the feature maps [FM]. If not provided, default value for gamma is 1. Data types supported: Same as @p input
- * @param[in] epsilon (Optional) Small value to avoid division with zero. Default value is 0.001f.
- * @param[in] act_info (Optional) Activation layer information in case of a fused activation. Only RELU, BOUNDED_RELU and LU_BOUNDED_RELU supported.
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *input, const ITensorInfo *output,
- const ITensorInfo *mean, const ITensorInfo *var,
- const ITensorInfo *beta = nullptr, const ITensorInfo *gamma = nullptr,
- float epsilon = 0.001f, ActivationLayerInfo act_info = ActivationLayerInfo());
-
- // Inherited methods overridden:
- void run(const Window &window, const ThreadInfo &info) override;
-
-private:
- /** Configure execution function in case of non-fused activation **/
- void configure_non_fused();
- /** Configure execution function in case of fused activation **/
- void configure_fused();
-
- /** Template function to run batch normalization on fp16
- *
- * @tparam fused_activation Boolean that flags whether it is a fused activation
- *
- * @param[in] window Region on which to execute the kernel. (Must be a valid region of the window returned by window()).
- */
- template <bool fused_activation, typename F>
- void batch_normalization_fp16_nchw(const Window &window);
- /** Template function to run batch normalization on fp16 on tensors with NHWC format
- *
- * @tparam fused_activation Boolean that flags whether it is a fused activation
- *
- * @param[in] window Region on which to execute the kernel. (Must be a valid region of the window returned by window()).
- */
- template <bool fused_activation, typename F>
- void batch_normalization_fp16_nhwc(const Window &window);
- /** Template function to run batch normalization on fp32
- *
- * @tparam fused_activation Boolean that flags whether it is a fused activation
- * @tparam F Activation function functor to run
- *
- * @param[in] window Region on which to execute the kernel. (Must be a valid region of the window returned by window()).
- */
- template <bool fused_activation, typename F>
- void batch_normalization_fp32_nchw(const Window &window);
- /** Template function to run batch normalization on fp32 on tensors with NHWC format
- *
- * @tparam fused_activation Boolean that flags whether it is a fused activation
- * @tparam F Activation function functor to run
- *
- * @param[in] window Region on which to execute the kernel. (Must be a valid region of the window returned by window()).
- */
- template <bool fused_activation, typename F>
- void batch_normalization_fp32_nhwc(const Window &window);
- /** Common signature for all the batch normalization functions
- *
- * @param[in] window Region on which to execute the kernel.
- */
- using BatchNormFunctionPtr = void (NEBatchNormalizationLayerKernel::*)(const Window &window);
-
-private:
- BatchNormFunctionPtr _func;
- ITensor *_input;
- ITensor *_output;
- const ITensor *_mean;
- const ITensor *_var;
- const ITensor *_gamma;
- const ITensor *_beta;
- float _epsilon;
- ActivationLayerInfo _act_info;
-};
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_NEBATCHNORMALIZATIONLAYERKERNEL_H */
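Per feature map, the kernel above applies out = gamma * (x - mean) / sqrt(var + epsilon) + beta, with beta defaulting to 0 and gamma to 1 when the optional tensors are absent. A one-element scalar sketch of that formula:

#include <cmath>

float batch_norm_one_sketch(float x, float mean, float var,
                            float beta = 0.f, float gamma = 1.f,
                            float epsilon = 0.001f)
{
    // Normalise, then scale and shift.
    return gamma * (x - mean) / std::sqrt(var + epsilon) + beta;
}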
diff --git a/arm_compute/core/NEON/kernels/NEBatchToSpaceLayerKernel.h b/arm_compute/core/NEON/kernels/NEBatchToSpaceLayerKernel.h
deleted file mode 100644
index 61e47b0ea4..0000000000
--- a/arm_compute/core/NEON/kernels/NEBatchToSpaceLayerKernel.h
+++ /dev/null
@@ -1,101 +0,0 @@
-/*
- * Copyright (c) 2019-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_NEBATCHTOSPACELAYERKERNEL_H
-#define ARM_COMPUTE_NEBATCHTOSPACELAYERKERNEL_H
-
-#include "arm_compute/core/NEON/INEKernel.h"
-
-namespace arm_compute
-{
-class ITensor;
-
-/** Interface for the batch to space kernel */
-class NEBatchToSpaceLayerKernel : public INEKernel
-{
-public:
- const char *name() const override
- {
- return "NEBatchToSpaceLayerKernel";
- }
- /** Default constructor */
- NEBatchToSpaceLayerKernel();
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- NEBatchToSpaceLayerKernel(const NEBatchToSpaceLayerKernel &) = delete;
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- NEBatchToSpaceLayerKernel &operator=(const NEBatchToSpaceLayerKernel &) = delete;
- /** Allow instances of this class to be moved */
- NEBatchToSpaceLayerKernel(NEBatchToSpaceLayerKernel &&) = default;
- /** Allow instances of this class to be moved */
- NEBatchToSpaceLayerKernel &operator=(NEBatchToSpaceLayerKernel &&) = default;
- /** Default destructor */
- ~NEBatchToSpaceLayerKernel() = default;
- /** Initialise the kernel's inputs and output.
- *
- * @param[in] input Tensor input. Supported tensor rank: 4. Data types supported: All.
- * @param[in] block_shape 1-D tensor with shape [M]. Data types supported: S32
- * @param[out] output Tensor output. Data types supported: same as @p input
- */
- void configure(const ITensor *input, const ITensor *block_shape, ITensor *output);
- /** Initialise the kernel's inputs and output (Static block shape).
- *
- * @param[in] input Tensor input. Supported tensor rank: 4. Data types supported: All.
- * @param[in] block_shape_x Block shape x value.
- * @param[in] block_shape_y Block shape y value.
- * @param[out] output Tensor output. Data types supported: same as @p input
- */
- void configure(const ITensor *input, const int32_t block_shape_x, const int32_t block_shape_y, ITensor *output);
- /** Static function to check if given info will lead to a valid configuration of @ref NEBatchToSpaceLayerKernel
- *
- * @param[in] input Tensor input. Supported tensor rank: 4. Data types supported: All.
- * @param[in] block_shape 1-D tensor with shape [M]. Data types supported: S32
- * @param[in] output Tensor output. Data types supported: same as @p input
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *input, const ITensorInfo *block_shape, const ITensorInfo *output);
- /** Static function to check if given info will lead to a valid configuration of @ref NEBatchToSpaceLayerKernel (Static block shape).
- *
- * @param[in] input Tensor input. Supported tensor rank: 4. Data types supported: All.
- * @param[in] block_shape_x Block shape x value.
- * @param[in] block_shape_y Block shape y value.
- * @param[in] output Tensor output. Data types supported: same as @p input
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *input, const int32_t block_shape_x, const int32_t block_shape_y, const ITensorInfo *output);
-
- // Inherited methods overridden:
- void run(const Window &window, const ThreadInfo &info) override;
-
-private:
- const ITensor *_input; /**< Source tensor */
- const ITensor *_block_shape; /**< Block shape tensor */
- ITensor *_output; /**< Destination tensor */
- DataLayout _data_layout; /**< Data layout to be used at run-time */
-
- int32_t _block_shape_x;
- int32_t _block_shape_y;
-};
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_NEBATCHTOSPACELAYERKERNEL_H */
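Batch-to-space folds groups of batches back into spatial blocks. The sketch below shows the conventional NHWC index mapping for a static 2-D block shape, under the assumption that the block offset varies slowest across batches; the deleted kernel also supports NCHW and a runtime block-shape tensor, so treat this purely as an illustration of the index math.

#include <cstddef>

void batch_to_space_nhwc_sketch(const float *in, float *out,
                                size_t out_n, size_t h, size_t w, size_t c,
                                size_t block_x, size_t block_y)
{
    // Input: [out_n * block_x * block_y, h, w, c]
    // Output: [out_n, h * block_y, w * block_x, c]
    for (size_t n = 0; n < out_n; ++n)
        for (size_t by = 0; by < block_y; ++by)
            for (size_t bx = 0; bx < block_x; ++bx)
            {
                const size_t in_batch = (by * block_x + bx) * out_n + n;
                for (size_t y = 0; y < h; ++y)
                    for (size_t x = 0; x < w; ++x)
                        for (size_t ch = 0; ch < c; ++ch)
                        {
                            const size_t oy = y * block_y + by;
                            const size_t ox = x * block_x + bx;
                            out[((n * (h * block_y) + oy) * (w * block_x) + ox) * c + ch] =
                                in[((in_batch * h + y) * w + x) * c + ch];
                        }
            }
}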
diff --git a/arm_compute/core/NEON/kernels/NEBitwiseAndKernel.h b/arm_compute/core/NEON/kernels/NEBitwiseAndKernel.h
deleted file mode 100644
index 7a777678dc..0000000000
--- a/arm_compute/core/NEON/kernels/NEBitwiseAndKernel.h
+++ /dev/null
@@ -1,72 +0,0 @@
-/*
- * Copyright (c) 2016-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_NEBITWISEANDKERNEL_H
-#define ARM_COMPUTE_NEBITWISEANDKERNEL_H
-
-#include "arm_compute/core/NEON/INEKernel.h"
-
-namespace arm_compute
-{
-class ITensor;
-
-/** Interface for the kernel to perform bitwise AND between XY-planes of two tensors
- *
- * Result is computed by:
- * @f[ output(x,y) = input1(x,y) \land input2(x,y) @f]
- */
-class NEBitwiseAndKernel : public INEKernel
-{
-public:
- const char *name() const override
- {
- return "NEBitwiseAndKernel";
- }
- /** Default constructor */
- NEBitwiseAndKernel();
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- NEBitwiseAndKernel(const NEBitwiseAndKernel &) = delete;
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- NEBitwiseAndKernel &operator=(const NEBitwiseAndKernel &) = delete;
- /** Allow instances of this class to be moved */
- NEBitwiseAndKernel(NEBitwiseAndKernel &&) = default;
- /** Allow instances of this class to be moved */
- NEBitwiseAndKernel &operator=(NEBitwiseAndKernel &&) = default;
- /** Initialise the kernel's inputs and output
- *
- * @param[in] input1 An input tensor. Data type supported: U8.
- * @param[in] input2 An input tensor. Data type supported: U8
- * @param[out] output Output tensor. Data type supported: U8.
- */
- void configure(const ITensor *input1, const ITensor *input2, ITensor *output);
-
- // Inherited methods overridden:
- void run(const Window &window, const ThreadInfo &info) override;
-
-private:
- const ITensor *_input1; /**< Source tensor 1 */
- const ITensor *_input2; /**< Source tensor 2 */
- ITensor *_output; /**< Destination tensor */
-};
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_NEBITWISEANDKERNEL_H */
diff --git a/arm_compute/core/NEON/kernels/NEBitwiseNotKernel.h b/arm_compute/core/NEON/kernels/NEBitwiseNotKernel.h
deleted file mode 100644
index 3fb8c083a8..0000000000
--- a/arm_compute/core/NEON/kernels/NEBitwiseNotKernel.h
+++ /dev/null
@@ -1,70 +0,0 @@
-/*
- * Copyright (c) 2016-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_NEBITWISENOTKERNEL_H
-#define ARM_COMPUTE_NEBITWISENOTKERNEL_H
-
-#include "arm_compute/core/NEON/INEKernel.h"
-
-namespace arm_compute
-{
-class ITensor;
-
-/** Interface for the kernel to perform bitwise NOT operation
- *
- * Result is computed by:
- * @f[ output(x,y) = \lnot input(x,y) @f]
- */
-class NEBitwiseNotKernel : public INEKernel
-{
-public:
- const char *name() const override
- {
- return "NEBitwiseNotKernel";
- }
- /** Default constructor */
- NEBitwiseNotKernel();
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- NEBitwiseNotKernel(const NEBitwiseNotKernel &) = delete;
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- NEBitwiseNotKernel &operator=(const NEBitwiseNotKernel &) = delete;
- /** Allow instances of this class to be moved */
- NEBitwiseNotKernel(NEBitwiseNotKernel &&) = default;
- /** Allow instances of this class to be moved */
- NEBitwiseNotKernel &operator=(NEBitwiseNotKernel &&) = default;
- /** Initialise the kernel's input and output
- *
- * @param[in] input An input tensor. Data type supported: U8.
- * @param[out] output The output tensor. Data type supported: U8.
- */
- void configure(const ITensor *input, ITensor *output);
-
- // Inherited methods overridden:
- void run(const Window &window, const ThreadInfo &info) override;
-
-private:
- const ITensor *_input; /**< Source tensor */
- ITensor *_output; /**< Destination tensor */
-};
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_NEBITWISENOTKERNEL_H */
diff --git a/arm_compute/core/NEON/kernels/NEBitwiseOrKernel.h b/arm_compute/core/NEON/kernels/NEBitwiseOrKernel.h
deleted file mode 100644
index 5b532510ad..0000000000
--- a/arm_compute/core/NEON/kernels/NEBitwiseOrKernel.h
+++ /dev/null
@@ -1,72 +0,0 @@
-/*
- * Copyright (c) 2016-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_NEBITWISEORKERNEL_H
-#define ARM_COMPUTE_NEBITWISEORKERNEL_H
-
-#include "arm_compute/core/NEON/INEKernel.h"
-
-namespace arm_compute
-{
-class ITensor;
-
-/** Interface for the kernel to perform bitwise inclusive OR between two tensors
- *
- * Result is computed by:
- * @f[ output(x,y) = input1(x,y) \lor input2(x,y) @f]
- */
-class NEBitwiseOrKernel : public INEKernel
-{
-public:
- const char *name() const override
- {
- return "NEBitwiseOrKernel";
- }
- /** Default constructor */
- NEBitwiseOrKernel();
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- NEBitwiseOrKernel(const NEBitwiseOrKernel &) = delete;
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- NEBitwiseOrKernel &operator=(const NEBitwiseOrKernel &) = delete;
- /** Allow instances of this class to be moved */
- NEBitwiseOrKernel(NEBitwiseOrKernel &&) = default;
- /** Allow instances of this class to be moved */
- NEBitwiseOrKernel &operator=(NEBitwiseOrKernel &&) = default;
- /** Initialise the kernel's inputs and output.
- *
- * @param[in] input1 An input tensor. Data type supported: U8.
- * @param[in] input2 An input tensor. Data type supported: U8.
- * @param[out] output Output tensor. Data type supported: U8.
- */
- void configure(const ITensor *input1, const ITensor *input2, ITensor *output);
-
- // Inherited methods overridden:
- void run(const Window &window, const ThreadInfo &info) override;
-
-private:
- const ITensor *_input1; /**< Source tensor 1 */
- const ITensor *_input2; /**< Source tensor 2 */
- ITensor *_output; /**< Destination tensor */
-};
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_NEBITWISEORKERNEL_H */
diff --git a/arm_compute/core/NEON/kernels/NEBitwiseXorKernel.h b/arm_compute/core/NEON/kernels/NEBitwiseXorKernel.h
deleted file mode 100644
index 0d9120501b..0000000000
--- a/arm_compute/core/NEON/kernels/NEBitwiseXorKernel.h
+++ /dev/null
@@ -1,72 +0,0 @@
-/*
- * Copyright (c) 2016-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_NEBITWISEXORKERNEL_H
-#define ARM_COMPUTE_NEBITWISEXORKERNEL_H
-
-#include "arm_compute/core/NEON/INEKernel.h"
-
-namespace arm_compute
-{
-class ITensor;
-
-/** Interface for the kernel to perform bitwise exclusive OR (XOR) between two tensors
- *
- * Result is computed by:
- * @f[ output(x,y) = input1(x,y) \oplus input2(x,y) @f]
- */
-class NEBitwiseXorKernel : public INEKernel
-{
-public:
- const char *name() const override
- {
- return "NEBitwiseXorKernel";
- }
- /** Default constructor */
- NEBitwiseXorKernel();
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- NEBitwiseXorKernel(const NEBitwiseXorKernel &) = delete;
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- NEBitwiseXorKernel &operator=(const NEBitwiseXorKernel &) = delete;
- /** Allow instances of this class to be moved */
- NEBitwiseXorKernel(NEBitwiseXorKernel &&) = default;
- /** Allow instances of this class to be moved */
- NEBitwiseXorKernel &operator=(NEBitwiseXorKernel &&) = default;
- /** Initialise the kernel's inputs and output.
- *
- * @param[in] input1 An input tensor. Data type supported: U8.
- * @param[in] input2 An input tensor. Data type supported: U8.
- * @param[out] output The output tensor. Data type supported: U8.
- */
- void configure(const ITensor *input1, const ITensor *input2, ITensor *output);
-
- // Inherited methods overridden:
- void run(const Window &window, const ThreadInfo &info) override;
-
-private:
- const ITensor *_input1; /**< Source tensor 1 */
- const ITensor *_input2; /**< Source tensor 2 */
- ITensor *_output; /**< Destination tensor */
-};
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_NEBITWISEXORKERNEL_H */
diff --git a/arm_compute/core/NEON/kernels/NEBoundingBoxTransformKernel.h b/arm_compute/core/NEON/kernels/NEBoundingBoxTransformKernel.h
deleted file mode 100644
index e94f228f2a..0000000000
--- a/arm_compute/core/NEON/kernels/NEBoundingBoxTransformKernel.h
+++ /dev/null
@@ -1,95 +0,0 @@
-/*
- * Copyright (c) 2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_NEBOUNDINGBOXTRANSFORMKERNEL_H
-#define ARM_COMPUTE_NEBOUNDINGBOXTRANSFORMKERNEL_H
-
-#include "arm_compute/core/NEON/INEKernel.h"
-
-namespace arm_compute
-{
-class ITensor;
-
-/** Interface for the bounding box kernel */
-class NEBoundingBoxTransformKernel : public INEKernel
-{
-public:
- const char *name() const override
- {
- return "NEBoundingBoxTransformKernel";
- }
-
- /** Default constructor */
- NEBoundingBoxTransformKernel();
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- NEBoundingBoxTransformKernel(const NEBoundingBoxTransformKernel &) = delete;
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- NEBoundingBoxTransformKernel &operator=(const NEBoundingBoxTransformKernel &) = delete;
- /** Allow instances of this class to be moved */
- NEBoundingBoxTransformKernel(NEBoundingBoxTransformKernel &&) = default;
- /** Allow instances of this class to be moved */
- NEBoundingBoxTransformKernel &operator=(NEBoundingBoxTransformKernel &&) = default;
- /** Default destructor */
- ~NEBoundingBoxTransformKernel() = default;
-
- /** Set the input and output tensors.
- *
- * @param[in] boxes Source tensor. Bounding box proposals in pixel coordinates. Size(M, 4), format [x1, y1, x2, y2]. Data types supported: QASYMM16/F16/F32.
- * @param[out] pred_boxes Destination tensor. Pixel coordinates of the transformed bounding boxes. Size (M, 4*K), format [x1, y1, x2, y2]. Data types supported: Same as @p boxes
- * @param[in] deltas Bounding box translations and scales. Size (M, 4*K), format [dx, dy, dw, dh], K is the number of classes.
- * Data types supported: QASYMM8 if @p boxes is QASYMM16, otherwise same as @p boxes.
- * @param[in] info Contains BoundingBox operation information described in @ref BoundingBoxTransformInfo.
- *
- * @note Only single image prediction is supported. Height and Width (and scale) of the image will be contained in the BoundingBoxTransformInfo struct.
- *
- */
- void configure(const ITensor *boxes, ITensor *pred_boxes, const ITensor *deltas, const BoundingBoxTransformInfo &info);
-
- /** Static function to check if given info will lead to a valid configuration of @ref NEBoundingBoxTransformKernel
- *
- * @param[in] boxes Source tensor info. Bounding box proposals in pixel coordinates. Size(M, 4), format [x1, y1, x2, y2]. Data types supported: QASYMM16/F16/F32.
- * @param[in] pred_boxes Destination tensor info. Pixel coordinates of the transformed bounding boxes. Size (M, 4*K), format [x1, y1, x2, y2]. Data types supported: Same as @p boxes
- * @param[in] deltas Bounding box translations and scales. Size (M, 4*K), format [dx, dy, dw, dh], K is the number of classes.
- * Data types supported: QASYMM8 if @p boxes is QASYMM16, otherwise same as @p boxes.
- * @param[in] info Contains BoundingBox operation information described in @ref BoundingBoxTransformInfo.
- *
- * @note Only single image prediction is supported. Height and Width (and scale) of the image will be contained in the BoundingBoxTransformInfo struct.
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *boxes, const ITensorInfo *pred_boxes, const ITensorInfo *deltas, const BoundingBoxTransformInfo &info);
-
- // Inherited methods overridden:
- void run(const Window &window, const ThreadInfo &info) override;
-
-private:
- template <typename T>
- void internal_run(const Window &window);
-
- const ITensor *_boxes;
- ITensor *_pred_boxes;
- const ITensor *_deltas;
- BoundingBoxTransformInfo _bbinfo;
-};
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_NEBOUNDINGBOXTRANSFORMKERNEL_H */
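The transform this kernel applies per box and class is the usual Faster R-CNN-style delta decoding; a scalar sketch for float data (the BoundingBoxTransformInfo scaling and the clamping to image bounds are omitted, and the +1 width/height convention is an assumption):

    #include <cmath>

    // Decode one proposal [x1, y1, x2, y2] with deltas [dx, dy, dw, dh].
    void decode_box(const float box[4], const float delta[4], float pred[4])
    {
        const float w     = box[2] - box[0] + 1.f;
        const float h     = box[3] - box[1] + 1.f;
        const float ctr_x = box[0] + 0.5f * w;
        const float ctr_y = box[1] + 0.5f * h;

        const float pred_ctr_x = delta[0] * w + ctr_x;   // shift centre
        const float pred_ctr_y = delta[1] * h + ctr_y;
        const float pred_w     = std::exp(delta[2]) * w; // rescale size
        const float pred_h     = std::exp(delta[3]) * h;

        pred[0] = pred_ctr_x - 0.5f * pred_w;
        pred[1] = pred_ctr_y - 0.5f * pred_h;
        pred[2] = pred_ctr_x + 0.5f * pred_w;
        pred[3] = pred_ctr_y + 0.5f * pred_h;
    }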
diff --git a/arm_compute/core/NEON/kernels/NEBox3x3Kernel.h b/arm_compute/core/NEON/kernels/NEBox3x3Kernel.h
deleted file mode 100644
index 448e33be3c..0000000000
--- a/arm_compute/core/NEON/kernels/NEBox3x3Kernel.h
+++ /dev/null
@@ -1,71 +0,0 @@
-/*
- * Copyright (c) 2016-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_NEBOX3x3KERNEL_H
-#define ARM_COMPUTE_NEBOX3x3KERNEL_H
-
-#include "arm_compute/core/NEON/INESimpleKernel.h"
-
-namespace arm_compute
-{
-class ITensor;
-
-/** NEON kernel to perform a Box 3x3 filter */
-class NEBox3x3Kernel : public INESimpleKernel
-{
-public:
- const char *name() const override
- {
- return "NEBox3x3Kernel";
- }
- /** Set the source, destination and border mode of the kernel
- *
- * @param[in] input Source tensor. Data type supported: U8.
- * @param[out] output Destination tensor. Data type supported: U8.
- * @param[in] border_undefined True if the border mode is undefined. False if it's replicate or constant.
- */
- void configure(const ITensor *input, ITensor *output, bool border_undefined);
- // Inherited methods overridden:
- void run(const Window &window, const ThreadInfo &info) override;
- BorderSize border_size() const override;
-};
-
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-/** NEON kernel to perform a Box 3x3 filter for FP16 datatype
- */
-class NEBox3x3FP16Kernel : public NEBox3x3Kernel
-{
-public:
- const char *name() const override
- {
- return "NEBox3x3FP16Kernel";
- }
- // Inherited methods overridden:
- void run(const Window &window, const ThreadInfo &info) override;
-};
-#else /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
-/** NEON kernel to perform a Box 3x3 filter for FP16 datatype */
-using NEBox3x3FP16Kernel = NEBox3x3Kernel;
-#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_NEBOX3x3KERNEL_H */
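The Box 3x3 filter is simply the mean of each 3x3 neighbourhood; a scalar reference for the U8 path (borders are left untouched here, and truncating division is an assumption about the rounding):

    #include <cstddef>
    #include <cstdint>

    void box3x3(const uint8_t *src, uint8_t *dst, size_t width, size_t height)
    {
        for (size_t y = 1; y + 1 < height; ++y)
        {
            for (size_t x = 1; x + 1 < width; ++x)
            {
                unsigned int sum = 0;
                for (int dy = -1; dy <= 1; ++dy)
                    for (int dx = -1; dx <= 1; ++dx)
                        sum += src[(y + dy) * width + (x + dx)];
                dst[y * width + x] = static_cast<uint8_t>(sum / 9); // mean of 9 pixels
            }
        }
    }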
diff --git a/arm_compute/core/NEON/kernels/NECannyEdgeKernel.h b/arm_compute/core/NEON/kernels/NECannyEdgeKernel.h
deleted file mode 100644
index 1979c5bd2b..0000000000
--- a/arm_compute/core/NEON/kernels/NECannyEdgeKernel.h
+++ /dev/null
@@ -1,189 +0,0 @@
-/*
- * Copyright (c) 2016-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_NECANNYEDGEKERNEL_H
-#define ARM_COMPUTE_NECANNYEDGEKERNEL_H
-
-#include "arm_compute/core/NEON/INEKernel.h"
-
-#include <cstdint>
-
-namespace arm_compute
-{
-class ITensor;
-
-/** Computes magnitude and quantised phase from inputs gradients. */
-class NEGradientKernel : public INEKernel
-{
-public:
- const char *name() const override
- {
- return "NEGradientKernel";
- }
- /** Default constructor */
- NEGradientKernel();
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- NEGradientKernel(const NEGradientKernel &) = delete;
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- NEGradientKernel &operator=(const NEGradientKernel &) = delete;
- /** Allow instances of this class to be moved */
- NEGradientKernel(NEGradientKernel &&) = default;
- /** Allow instances of this class to be moved */
- NEGradientKernel &operator=(NEGradientKernel &&) = default;
- /** Default destructor */
- virtual ~NEGradientKernel() = default;
-
- /** Initialise the kernel's sources, destinations and border mode.
- *
- * @note gx, gy and magnitude must all be of the same bit depth (either 16 or 32)
- *
- * @param[in] gx Source tensor - Gx component. Data type supported: S16/S32.
- * @param[in] gy Source tensor - Gy component. Data type supported: same as @p gx.
- * @param[out] magnitude Destination tensor - Magnitude. Data type supported: U16 (if the data type of @p gx is S16) / U32 (if the data type of @p gx is S32).
- * @param[out] phase Destination tensor - Quantized phase. Data type supported: U8.
- * @param[in] norm_type Normalization type. If 1, L1-Norm; otherwise, L2-Norm.
- */
- virtual void configure(const ITensor *gx, const ITensor *gy, ITensor *magnitude, ITensor *phase, int32_t norm_type);
-
- // Inherited methods overridden:
- void run(const Window &window, const ThreadInfo &info) override;
-
-protected:
- /** Common signature for all the specialised gradient functions
- *
- * @param[in] gx_ptr Pointer to the first input tensor.
- * @param[in] gy_ptr Pointer to the second input tensor.
- * @param[out] magnitude_ptr Pointer to the first output tensor
- * @param[out] phase_ptr Pointer to the second output tensor
- */
- using GradientFunction = void(const void *__restrict gx_ptr, const void *__restrict gy_ptr, void *__restrict magnitude_ptr, void *__restrict phase_ptr);
-
- GradientFunction *_func; /**< Gradient function to use for the particular tensor types passed to configure() */
- const ITensor *_gx; /**< Source tensor - Gx component */
- const ITensor *_gy; /**< Source tensor - Gy component */
- ITensor *_magnitude; /**< Destination tensor - Magnitude */
- ITensor *_phase; /**< Destination tensor - Quantized phase */
-};
-
-/** NEON kernel to perform Non-Maxima suppression for Canny Edge.
- *
- * @note This kernel is meant to be used alongside CannyEdge and performs a non-maxima suppression using magnitude and phase of input
- * to characterize points as possible edges. Thus, at the end, each point will be set to EDGE, NO_EDGE or MAYBE.
- *
- * @note Hysteresis is computed in @ref NEEdgeTraceKernel
- */
-class NEEdgeNonMaxSuppressionKernel : public INEKernel
-{
-public:
- const char *name() const override
- {
- return "NEEdgeNonMaxSuppressionKernel";
- }
- /** Default constructor */
- NEEdgeNonMaxSuppressionKernel();
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- NEEdgeNonMaxSuppressionKernel(const NEEdgeNonMaxSuppressionKernel &) = delete;
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- NEEdgeNonMaxSuppressionKernel &operator=(const NEEdgeNonMaxSuppressionKernel &) = delete;
- /** Allow instances of this class to be moved */
- NEEdgeNonMaxSuppressionKernel(NEEdgeNonMaxSuppressionKernel &&) = default;
- /** Allow instances of this class to be moved */
- NEEdgeNonMaxSuppressionKernel &operator=(NEEdgeNonMaxSuppressionKernel &&) = default;
- /** Default destructor */
- ~NEEdgeNonMaxSuppressionKernel() = default;
-
- /** Initialise the kernel's sources, destination and border mode.
- *
- * @param[in] magnitude Source tensor - Magnitude. Data type supported: U16/U32.
- * @param[in] phase Source tensor - Quantized phase. Data type supported: U8.
- * @param[out] output Output tensor. Data type supported: U8. It will be filled with 0 for "no edge", 127 for "maybe", 255 for "edge"
- * @param[in] upper_thr Upper threshold used for the hysteresis
- * @param[in] lower_thr Lower threshold used for the hysteresis
- * @param[in] border_undefined True if the border mode is undefined. False if it's replicate or constant.
- */
- void configure(const ITensor *magnitude, const ITensor *phase, ITensor *output, int32_t upper_thr, int32_t lower_thr, bool border_undefined);
-
- // Inherited methods overridden:
- void run(const Window &window, const ThreadInfo &info) override;
- BorderSize border_size() const override;
-
-private:
- /** Common signature for all the specialised non-maxima suppression functions
- *
- * @param[in] magnitude_ptr Pointer to the first input tensor.
- * @param[in] phase_ptr Pointer to the second input tensor.
- * @param[out] output_ptr Pointer to the output tensor
- * @param[in] stride_mag Stride of the magnitude tensor
- * @param[in] upper_thr Upper threshold used for the hysteresis
- * @param[in] lower_thr Lower threshold used for the hysteresis
- */
- using EdgeNonMaxSupprFunction = void(const void *__restrict magnitude_ptr, const void *__restrict phase_ptr, void *__restrict output_ptr, const uint32_t stride_mag, const int32_t upper_thr,
- const int32_t lower_thr);
-
- EdgeNonMaxSupprFunction *_func; /**< Non-Maxima suppression function to use for the particular tensor types passed to configure() */
- const ITensor *_magnitude; /**< Source tensor - Magnitude */
- const ITensor *_phase; /**< Source tensor - Quantized phase */
- ITensor *_output; /**< Destination tensor */
- int32_t _lower_thr; /**< Lower threshold used for the hysteresis */
- int32_t _upper_thr; /**< Upper threshold used for the hysteresis */
-};
-
-/** NEON kernel to perform Edge tracing */
-class NEEdgeTraceKernel : public INEKernel
-{
-public:
- const char *name() const override
- {
- return "NEEdgeTraceKernel";
- }
- /** Default constructor */
- NEEdgeTraceKernel();
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- NEEdgeTraceKernel(const NEEdgeTraceKernel &) = delete;
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- NEEdgeTraceKernel &operator=(const NEEdgeTraceKernel &) = delete;
- /** Allow instances of this class to be moved */
- NEEdgeTraceKernel(NEEdgeTraceKernel &&) = default;
- /** Allow instances of this class to be moved */
- NEEdgeTraceKernel &operator=(NEEdgeTraceKernel &&) = default;
- /** Default destructor */
- ~NEEdgeTraceKernel() = default;
-
- /** Initialise the kernel's source, destination and border mode.
- *
- * @param[in,out] input Source tensor. Data type supported: U8. Must contain 0 for "no edge", 127 for "maybe", 255 for "edge"
- * @param[in,out] output Destination tensor. Data type supported: U8. Must be initialized to 0 (No edge).
- */
- void configure(ITensor *input, ITensor *output);
-
- // Inherited methods overridden:
- void run(const Window &window, const ThreadInfo &info) override;
- BorderSize border_size() const override;
- bool is_parallelisable() const override;
-
-private:
- ITensor *_input; /**< Source tensor */
- ITensor *_output; /**< Destination tensor */
-};
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_NECANNYEDGEKERNEL_H */
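As a plain-C++ reference for NEGradientKernel's contract, magnitude and a coarse 4-bin phase for one S16 pixel; the kernel works vectorised and integer-only, so the float angle math and the exact bin boundaries here are simplifications:

    #include <cmath>
    #include <cstdint>
    #include <cstdlib>

    void gradient_l1(int16_t gx, int16_t gy, uint16_t &mag, uint8_t &phase)
    {
        // L1 norm (norm_type == 1); the L2 variant is sqrt(gx*gx + gy*gy)
        mag = static_cast<uint16_t>(std::abs(gx) + std::abs(gy));
        // Quantise the angle into the 0/45/90/135 degree bins consumed by
        // the non-maxima suppression step
        const float deg = std::fmod(std::atan2(float(gy), float(gx)) * 180.f / 3.14159265f + 180.f, 180.f);
        phase = static_cast<uint8_t>((deg + 22.5f) / 45.f) % 4;
    }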
diff --git a/arm_compute/core/NEON/kernels/NEChannelCombineKernel.h b/arm_compute/core/NEON/kernels/NEChannelCombineKernel.h
deleted file mode 100644
index 8f019384d9..0000000000
--- a/arm_compute/core/NEON/kernels/NEChannelCombineKernel.h
+++ /dev/null
@@ -1,129 +0,0 @@
-/*
- * Copyright (c) 2016-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_NECHANNELCOMBINEKERNEL_H
-#define ARM_COMPUTE_NECHANNELCOMBINEKERNEL_H
-
-#include "arm_compute/core/NEON/INEKernel.h"
-
-#include <array>
-#include <cstdint>
-
-namespace arm_compute
-{
-class IMultiImage;
-class ITensor;
-using IImage = ITensor;
-
-/** Interface for the channel combine kernel */
-class NEChannelCombineKernel : public INEKernel
-{
-public:
- const char *name() const override
- {
- return "NEChannelCombineKernel";
- }
- /** Default constructor */
- NEChannelCombineKernel();
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- NEChannelCombineKernel(const NEChannelCombineKernel &) = delete;
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- NEChannelCombineKernel &operator=(const NEChannelCombineKernel &) = delete;
- /** Allow instances of this class to be moved */
- NEChannelCombineKernel(NEChannelCombineKernel &&) = default;
- /** Allow instances of this class to be moved */
- NEChannelCombineKernel &operator=(NEChannelCombineKernel &&) = default;
- /** Default destructor */
- ~NEChannelCombineKernel() = default;
-
- /** Configure function's inputs and outputs.
- *
- * @param[in] plane0 The 2D plane that forms channel 0. Data type supported: U8
- * @param[in] plane1 The 2D plane that forms channel 1. Data type supported: U8
- * @param[in] plane2 The 2D plane that forms channel 2. Data type supported: U8
- * @param[in] plane3 The 2D plane that forms channel 3. Data type supported: U8
- * @param[out] output The single planar output tensor. Formats supported: RGB888/RGBA8888/UYVY422/YUYV422
- */
- void configure(const ITensor *plane0, const ITensor *plane1, const ITensor *plane2, const ITensor *plane3, ITensor *output);
- /** Configure function's inputs and outputs.
- *
- * @param[in] plane0 The 2D plane that forms channel 0. Data type supported: U8
- * @param[in] plane1 The 2D plane that forms channel 1. Data type supported: U8
- * @param[in] plane2 The 2D plane that forms channel 2. Data type supported: U8
- * @param[out] output The multi planar output tensor. Formats supported: NV12/NV21/IYUV/YUV444
- */
- void configure(const IImage *plane0, const IImage *plane1, const IImage *plane2, IMultiImage *output);
-
- // Inherited methods overridden:
- void run(const Window &window, const ThreadInfo &info) override;
- bool is_parallelisable() const override;
-
-private:
- /** Combine 3 planes to form a three channel single plane tensor.
- *
- * @param[in] win Region on which to execute the kernel.
- */
- void combine_3C(const Window &win);
- /** Combine 4 planes to form a four channel single plane tensor.
- *
- * @param[in] win Region on which to execute the kernel.
- */
- void combine_4C(const Window &win);
- /** Combine 3 planes to form a single plane YUV tensor.
- *
- * @param[in] win Region on which to execute the kernel.
- */
- template <bool is_yuyv>
- void combine_YUV_1p(const Window &win);
- /** Combine 3 planes to form a two plane YUV tensor.
- *
- * @param[in] win Region on which to execute the kernel.
- */
- void combine_YUV_2p(const Window &win);
- /** Combine 3 planes to form a three plane YUV tensor.
- *
- * @param[in] win Region on which to execute the kernel.
- */
- void combine_YUV_3p(const Window &win);
- /** Copies a full plane to the output tensor.
- *
- * @param[in] win Region on which to execute the kernel.
- */
- void copy_plane(const Window &win, uint32_t plane_id);
- /** Common signature for all the specialised ChannelCombine functions
- *
- * @param[in] window Region on which to execute the kernel.
- */
- using ChannelCombineFunction = void (NEChannelCombineKernel::*)(const Window &window);
- /** ChannelCombine function to use for the particular tensor types passed to configure() */
- ChannelCombineFunction _func;
- std::array<const ITensor *, 4> _planes;
- ITensor *_output;
- IMultiImage *_output_multi;
- std::array<uint32_t, 3> _x_subsampling;
- std::array<uint32_t, 3> _y_subsampling;
- unsigned int _num_elems_processed_per_iteration;
- bool _is_parallelizable;
-};
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_NECHANNELCOMBINEKERNEL_H */
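The single-plane RGB paths of the combine reduce to a plain interleave; a scalar sketch of the three-channel case for one row (the RGBA and YUV paths add a fourth plane or chroma subsampling on top of the same idea):

    #include <cstddef>
    #include <cstdint>

    // Interleave three U8 planes into packed RGB888 for one row of `width` pixels.
    void combine_rgb888_row(const uint8_t *r, const uint8_t *g, const uint8_t *b,
                            uint8_t *out, size_t width)
    {
        for (size_t x = 0; x < width; ++x)
        {
            out[3 * x + 0] = r[x];
            out[3 * x + 1] = g[x];
            out[3 * x + 2] = b[x];
        }
    }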
diff --git a/arm_compute/core/NEON/kernels/NEChannelExtractKernel.h b/arm_compute/core/NEON/kernels/NEChannelExtractKernel.h
deleted file mode 100644
index 8d62016fe5..0000000000
--- a/arm_compute/core/NEON/kernels/NEChannelExtractKernel.h
+++ /dev/null
@@ -1,113 +0,0 @@
-/*
- * Copyright (c) 2016-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_NECHANNELEXTRACTKERNEL_H
-#define ARM_COMPUTE_NECHANNELEXTRACTKERNEL_H
-
-#include "arm_compute/core/NEON/INESimpleKernel.h"
-#include "arm_compute/core/Types.h"
-
-#include <cstdint>
-
-namespace arm_compute
-{
-class IMultiImage;
-class ITensor;
-using IImage = ITensor;
-
-/** Interface for the channel extract kernel */
-class NEChannelExtractKernel : public INESimpleKernel
-{
-public:
- const char *name() const override
- {
- return "NEChannelExtractKernel";
- }
- /** Default constructor */
- NEChannelExtractKernel();
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- NEChannelExtractKernel(const NEChannelExtractKernel &) = delete;
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- NEChannelExtractKernel &operator=(const NEChannelExtractKernel &) = delete;
- /** Allow instances of this class to be moved */
- NEChannelExtractKernel(NEChannelExtractKernel &&) = default;
- /** Allow instances of this class to be moved */
- NEChannelExtractKernel &operator=(NEChannelExtractKernel &&) = default;
- /** Default destructor */
- ~NEChannelExtractKernel() = default;
-
- /** Set the input and output of the kernel
- *
- * @param[in] input Source tensor. Formats supported: RGB888/RGBA8888/YUYV422/UYVY422
- * @param[in] channel Channel to extract.
- * @param[out] output Destination tensor. Format supported: U8
- */
- void configure(const ITensor *input, Channel channel, ITensor *output);
- /** Set the input and output of the kernel
- *
- * @param[in] input Multi-planar source image. Formats supported: NV12/NV21/IYUV/YUV444
- * @param[in] channel Channel to extract.
- * @param[out] output Single-planar destination image. Format supported: U8
- */
- void configure(const IMultiImage *input, Channel channel, IImage *output);
-
- // Inherited methods overridden:
- void run(const Window &window, const ThreadInfo &info) override;
-
-private:
- /** Extract one channel from a two channel planar tensor.
- *
- * @param[in] win Region on which to execute the kernel.
- */
- void extract_1C_from_2C_img(const Window &win);
- /** Extract one channel from a three channel planar tensor.
- *
- * @param[in] win Region on which to execute the kernel.
- */
- void extract_1C_from_3C_img(const Window &win);
- /** Extract one channel from a four channel planar tensor.
- *
- * @param[in] win Region on which to execute the kernel.
- */
- void extract_1C_from_4C_img(const Window &win);
- /** Extract U/V channel from a single planar YUYV/UYVY tensor.
- *
- * @param[in] win Region on which to execute the kernel.
- */
- void extract_YUYV_uv(const Window &win);
- /** Copies a full plane to the output tensor.
- *
- * @param[in] win Region on which to execute the kernel.
- */
- void copy_plane(const Window &win);
- /** Common signature for all the specialised ChannelExtract functions
- *
- * @param[in] window Region on which to execute the kernel.
- */
- using ChannelExtractFunction = void (NEChannelExtractKernel::*)(const Window &window);
- /** ChannelExtract function to use for the particular tensor types passed to configure() */
- ChannelExtractFunction _func;
- unsigned int _lut_index;
-};
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_NECHANNELEXTRACTKERNEL_H */
diff --git a/arm_compute/core/NEON/kernels/NEChannelShuffleLayerKernel.h b/arm_compute/core/NEON/kernels/NEChannelShuffleLayerKernel.h
deleted file mode 100644
index 71659c4fcb..0000000000
--- a/arm_compute/core/NEON/kernels/NEChannelShuffleLayerKernel.h
+++ /dev/null
@@ -1,80 +0,0 @@
-/*
- * Copyright (c) 2018-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_NECHANNELSHUFFLELAYERKERNEL_H
-#define ARM_COMPUTE_NECHANNELSHUFFLELAYERKERNEL_H
-
-#include "arm_compute/core/NEON/INEKernel.h"
-
-namespace arm_compute
-{
-// Forward declarations
-class ITensor;
-
-/** Interface for the channel shuffle kernel */
-class NEChannelShuffleLayerKernel : public INEKernel
-{
-public:
- const char *name() const override
- {
- return "NEChannelShuffleLayerKernel";
- }
- /** Default constructor */
- NEChannelShuffleLayerKernel();
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- NEChannelShuffleLayerKernel(const NEChannelShuffleLayerKernel &) = delete;
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- NEChannelShuffleLayerKernel &operator=(const NEChannelShuffleLayerKernel &) = delete;
- /** Allow instances of this class to be moved */
- NEChannelShuffleLayerKernel(NEChannelShuffleLayerKernel &&) = default;
- /** Allow instances of this class to be moved */
- NEChannelShuffleLayerKernel &operator=(NEChannelShuffleLayerKernel &&) = default;
- /** Default destructor */
- ~NEChannelShuffleLayerKernel() = default;
- /** Configure function's inputs and outputs.
- *
- * @param[in] input Input tensor. Data types supported: All
- * @param[out] output Output tensor. Data type supported: Same as @p input
- * @param[in] num_groups Number of groups. Must be greater than 1 and the number of channels of the tensors must be a multiple of the number of groups.
- */
- void configure(const ITensor *input, ITensor *output, unsigned int num_groups);
- /** Static function to check if given info will lead to a valid configuration of @ref NEChannelShuffleLayerKernel
- *
- * @param[in] input Input tensor info. Data types supported: All
- * @param[in] output Output tensor info. Data type supported: Same as @p input
- * @param[in] num_groups Number of groups. Must be greater than 1 and the number of channels of the tensors must be a multiple of the number of groups.
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *input, const ITensorInfo *output, unsigned int num_groups);
-
- // Inherited methods overridden:
- void run(const Window &window, const ThreadInfo &info) override;
-
-private:
- const ITensor *_input;
- ITensor *_output;
- unsigned int _num_groups;
-};
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_NECHANNELSHUFFLELAYERKERNEL_H */
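Channel shuffle is a fixed permutation of the channel index: with G groups and K = C / G channels per group, input channel c = g * K + i moves to position i * G + g. A sketch of just the index mapping:

    #include <cstddef>

    // channels must be a multiple of num_groups, as configure() requires.
    size_t shuffled_channel(size_t c, size_t channels, size_t num_groups)
    {
        const size_t k = channels / num_groups; // channels per group
        return (c % k) * num_groups + c / k;    // (g*k + i) -> (i*G + g)
    }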
diff --git a/arm_compute/core/NEON/kernels/NECol2ImKernel.h b/arm_compute/core/NEON/kernels/NECol2ImKernel.h
deleted file mode 100644
index 9aa1062622..0000000000
--- a/arm_compute/core/NEON/kernels/NECol2ImKernel.h
+++ /dev/null
@@ -1,115 +0,0 @@
-/*
- * Copyright (c) 2017-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_NECOL2IMKERNEL_H
-#define ARM_COMPUTE_NECOL2IMKERNEL_H
-
-#include "arm_compute/core/NEON/INEKernel.h"
-
-#include "arm_compute/core/Size2D.h"
-
-namespace arm_compute
-{
-class ITensor;
-
-/** NEON kernel to perform col2im reshaping.
- *
- * Rearranges each matrix column into image blocks. It's the inverse operation of @ref NEIm2ColKernel.
- *
- * For example, a vector of 9 elements can be reshaped to a block (image) of 3x3:
- *
- * @f[
- * \left( \begin{array}{ccccccccc}
- * a0 & a1 & a2 & a3 & a4 & a5 & a6 & a7 & a8 \\
- * \end{array} \right)
- * \rightarrow
- * \left( \begin{array}{ccc}
- * a0 & a1 & a2 \\
- * a3 & a4 & a5 \\
- * a6 & a7 & a8 \\
- * \end{array} \right)
- * @f]
- */
-class NECol2ImKernel : public INEKernel
-{
-public:
- const char *name() const override
- {
- return "NECol2ImKernel";
- }
- /** Default constructor */
- NECol2ImKernel();
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- NECol2ImKernel(const NECol2ImKernel &) = delete;
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- NECol2ImKernel &operator=(const NECol2ImKernel &) = delete;
- /** Allow instances of this class to be moved */
- NECol2ImKernel(NECol2ImKernel &&) = default;
- /** Allow instances of this class to be moved */
- NECol2ImKernel &operator=(NECol2ImKernel &&) = default;
- /** Default destructor */
- ~NECol2ImKernel() = default;
-
- /** Set the input and output of the kernel.
- *
- * @param[in] input The input tensor to convert. Data types supported: All
- * @param[out] output The output tensor. 3 lower dimensions represent a single output [width, height, OFM],
- * while the rest represent batch of outputs. Data types supported: Same as @p input
- * @param[in] convolved_dims Output convolved dimensions.
- */
- void configure(const ITensor *input, ITensor *output, const Size2D &convolved_dims);
- /** Static function to check if given info will lead to a valid configuration of @ref NECol2ImKernel
- *
- * @param[in] input The input tensor to convert. Data types supported: All
- * @param[in] output The output tensor. 3 lower dimensions represent a single output [width, height, OFM],
- * while the rest represent batch of outputs. Data types supported: Same as @p input
- * @param[in] convolved_dims Output convolved dimensions.
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *input, const ITensorInfo *output, const Size2D &convolved_dims);
-
- // Inherited methods overridden:
- void run(const Window &window, const ThreadInfo &info) override;
-
-private:
- /** Template function to run the col2im
- *
- * @param[in] window Region on which to execute the kernel. (Must be a valid region of the window returned by window()).
- */
- template <typename T>
- void run_col2im(const Window &window);
-
- /** Common signature for all the specialised col2im functions
- *
- * @param[in] window Region on which to execute the kernel.
- */
- using Col2ImFunctionPtr = void (NECol2ImKernel::*)(const Window &window);
-
- Col2ImFunctionPtr _func;
- const ITensor *_input;
- ITensor *_output;
- Size2D _convolved_dims;
-};
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_NECOL2IMKERNEL_H */
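The reshape documented above is a fixed index remap; a sketch for a single batch, under the assumption that the input matrix is stored with one row per convolved position and one column per output channel (the kernel's real in-memory layout may differ):

    #include <cstddef>

    // mat is (w*h) x ofm, row-major; img is laid out as [ofm][h][w].
    void col2im(const float *mat, float *img, size_t ofm, size_t w, size_t h)
    {
        for (size_t m = 0; m < ofm; ++m)       // output channel
            for (size_t n = 0; n < w * h; ++n) // convolved position
                img[(m * h + n / w) * w + n % w] = mat[n * ofm + m];
    }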
diff --git a/arm_compute/core/NEON/kernels/NEColorConvertKernel.h b/arm_compute/core/NEON/kernels/NEColorConvertKernel.h
deleted file mode 100644
index 3059288ab4..0000000000
--- a/arm_compute/core/NEON/kernels/NEColorConvertKernel.h
+++ /dev/null
@@ -1,93 +0,0 @@
-/*
- * Copyright (c) 2016-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_NECOLORCONVERTKERNEL_H
-#define ARM_COMPUTE_NECOLORCONVERTKERNEL_H
-
-#include "arm_compute/core/NEON/INEKernel.h"
-
-namespace arm_compute
-{
-class IMultiImage;
-class ITensor;
-using IImage = ITensor;
-
-/** Interface for the color convert kernel */
-class NEColorConvertKernel : public INEKernel
-{
-public:
- const char *name() const override
- {
- return "NEColorConvertKernel";
- }
- /** Default constructor */
- NEColorConvertKernel();
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- NEColorConvertKernel(const NEColorConvertKernel &) = delete;
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- NEColorConvertKernel &operator=(const NEColorConvertKernel &) = delete;
- /** Allow instances of this class to be moved */
- NEColorConvertKernel(NEColorConvertKernel &&) = default;
- /** Allow instances of this class to be moved */
- NEColorConvertKernel &operator=(NEColorConvertKernel &&) = default;
- /** Default destructor */
- ~NEColorConvertKernel() = default;
-
- /** Set the input and output of the kernel
- *
- * @param[in] input Source tensor. Formats supported: RGBA8888/UYVY422/YUYV422/RGB888
- * @param[out] output Destination tensor. Formats supported: RGB888 (if the formats of @p input are RGBA8888/UYVY422/YUYV422),
- * RGBA8888 (if the formats of @p input are UYVY422/YUYV422/RGB888),
- * U8 (if the format of @p input is RGB888)
- */
- void configure(const ITensor *input, ITensor *output);
- /** Set the input and output of the kernel
- *
- * @param[in] input Multi-planar source image. Formats supported: NV12/NV21/IYUV
- * @param[out] output Single-planar destination image. Formats supported: RGB888/RGBA8888
- */
- void configure(const IMultiImage *input, IImage *output);
- /** Set the input and output of the kernel
- *
- * @param[in] input Single-planar source image. Formats supported: RGB888/RGBA8888/UYVY422/YUYV422
- * @param[out] output Multi-planar destination image. Formats supported: NV12/IYUV/YUV444 (if the formats of @p input are RGB888/RGBA8888)
- */
- void configure(const IImage *input, IMultiImage *output);
- /** Set the input and output of the kernel
- *
- * @param[in] input Multi-planar source image. Formats supported: NV12/NV21/IYUV
- * @param[out] output Multi-planar destination image. Formats supported: YUV444/IYUV (if the formats of @p input are NV12/NV21)/NV12 (if the format of @p input is IYUV)
- */
- void configure(const IMultiImage *input, IMultiImage *output);
-
- // Inherited methods overridden:
- void run(const Window &window, const ThreadInfo &info) override;
-
-private:
- using ColorConvertFunction = void(const void *__restrict input_ptr, void *__restrict output_ptr, const Window &win);
- const void *_input;
- void *_output;
- ColorConvertFunction *_func;
-};
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_NECOLORCONVERTKERNEL_H */
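For orientation, the YUV-to-RGB paths boil down to per-pixel BT.601-style arithmetic; the coefficients below are the textbook full-range ones, not necessarily the kernel's exact constants:

    #include <algorithm>
    #include <cstdint>

    void yuv_to_rgb(uint8_t y, uint8_t u, uint8_t v,
                    uint8_t &r, uint8_t &g, uint8_t &b)
    {
        const float fy = float(y);
        const float fu = float(u) - 128.f; // centre the chroma channels
        const float fv = float(v) - 128.f;
        auto clamp_u8 = [](float x) { return static_cast<uint8_t>(std::min(std::max(x, 0.f), 255.f)); };
        r = clamp_u8(fy + 1.402f * fv);
        g = clamp_u8(fy - 0.344f * fu - 0.714f * fv);
        b = clamp_u8(fy + 1.772f * fu);
    }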
diff --git a/arm_compute/core/NEON/kernels/NEConvertFullyConnectedWeightsKernel.h b/arm_compute/core/NEON/kernels/NEConvertFullyConnectedWeightsKernel.h
deleted file mode 100644
index d45191949a..0000000000
--- a/arm_compute/core/NEON/kernels/NEConvertFullyConnectedWeightsKernel.h
+++ /dev/null
@@ -1,95 +0,0 @@
-/*
- * Copyright (c) 2018-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_NECONVERTFULLYCONNECTEDWEIGHTSKERNEL_H
-#define ARM_COMPUTE_NECONVERTFULLYCONNECTEDWEIGHTSKERNEL_H
-
-#include "arm_compute/core/NEON/INEKernel.h"
-
-namespace arm_compute
-{
-// Forward declarations
-class ITensor;
-
-/** Interface to convert the 2D Fully Connected weights from NCHW to NHWC or vice versa.
- *
- * @note This function can be applied to the 2D weights used by a Fully Connected layer if:
- * - It follows a Convolution layer
- * - The data layout used by the network does not match the one the model has been trained in.
- *
- * @note This function assumes the weights are already reshaped (transposed)
- */
-class NEConvertFullyConnectedWeightsKernel : public INEKernel
-{
-public:
- const char *name() const override
- {
- return "NEConvertFullyConnectedWeightsKernel";
- }
- /** Default constructor */
- NEConvertFullyConnectedWeightsKernel();
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- NEConvertFullyConnectedWeightsKernel(const NEConvertFullyConnectedWeightsKernel &) = delete;
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- NEConvertFullyConnectedWeightsKernel &operator=(const NEConvertFullyConnectedWeightsKernel &) = delete;
- /** Allow instances of this class to be moved */
- NEConvertFullyConnectedWeightsKernel(NEConvertFullyConnectedWeightsKernel &&) = default;
- /** Allow instances of this class to be moved */
- NEConvertFullyConnectedWeightsKernel &operator=(NEConvertFullyConnectedWeightsKernel &&) = default;
- /** Default destructor */
- ~NEConvertFullyConnectedWeightsKernel() = default;
- /** Set the input and output tensor.
- *
- * @param[in] input Source weights tensor to convert. Must be 2 dimensional. Data types supported: All.
- * @param[out] output The converted weights tensor. Shape and Data Type: Same as @p input.
- * @param[in] original_input_shape Shape of the original input tensor (the one entering fully connected layer).
- * @param[in] data_layout The data layout the weights have been trained in.
- */
- void configure(const ITensor *input, ITensor *output, const TensorShape &original_input_shape, DataLayout data_layout);
- /** Static function to check if given info will lead to a valid configuration of @ref NEConvertFullyConnectedWeightsKernel
- *
- * @param[in] input Source weights tensor info to convert. Must be 2 dimensional. Data types supported: All.
- * @param[in] output The converted weights tensor info. Shape and Data Type: Same as @p input.
- * @param[in] original_input_shape Shape of the original input tensor (the one entering fully connected layer).
- * @param[in] data_layout The data layout the weights have been trained in.
- */
- static Status validate(const ITensorInfo *input, const ITensorInfo *output, const TensorShape &original_input_shape, DataLayout data_layout);
-
- // Inherited methods overridden:
- void run(const Window &window, const ThreadInfo &info) override;
-
-private:
- /** Template function to run the permute
- *
- * @param[in] window Region on which to execute the kernel. (Must be a valid region of the window returned by window()).
- */
- template <typename T>
- void run_convert_fc_weights(const Window &window);
-
- const ITensor *_input;
- ITensor *_output;
- unsigned int _factor1; /* equal to the number of elements per original input plane if @p data_layout == NCHW; its number of channels otherwise */
- unsigned int _factor2; /* equal to the number of elements per original input plane if @p data_layout == NHWC; its number of channels otherwise */
-};
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_NECONVERTFULLYCONNECTEDWEIGHTSKERNEL_H */
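Given the _factor1/_factor2 definitions above, the per-row permutation reduces to one expression; a sketch assuming factor1 is the number of elements per plane in the trained layout and factor2 the channel count, as the member comments describe:

    #include <cstddef>

    // A trained-NCHW row index c*factor1 + s (factor1 = H*W, factor2 = C)
    // maps to the NHWC index s*factor2 + c; swapping the factors covers
    // the opposite direction.
    size_t convert_fc_row(size_t row, size_t factor1, size_t factor2)
    {
        return (row % factor1) * factor2 + row / factor1;
    }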
diff --git a/arm_compute/core/NEON/kernels/NEConvertQuantizedSignednessKernel.h b/arm_compute/core/NEON/kernels/NEConvertQuantizedSignednessKernel.h
deleted file mode 100644
index 6ec2793484..0000000000
--- a/arm_compute/core/NEON/kernels/NEConvertQuantizedSignednessKernel.h
+++ /dev/null
@@ -1,76 +0,0 @@
-/*
- * Copyright (c) 2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_NECONVERTQUANTIZEDSIGNEDNESSKERNEL_H
-#define ARM_COMPUTE_NECONVERTQUANTIZEDSIGNEDNESSKERNEL_H
-
-#include "arm_compute/core/NEON/INEKernel.h"
-#include "arm_compute/core/Types.h"
-
-namespace arm_compute
-{
-// Forward declarations
-class ITensor;
-
-/** NEON kernel to convert asymmetric unsigned to asymmetric signed and vice-versa */
-class NEConvertQuantizedSignednessKernel : public INEKernel
-{
-public:
- const char *name() const override
- {
- return "NEConvertQuantizedSignednessKernel";
- }
- /** Default constructor */
- NEConvertQuantizedSignednessKernel();
- /** Prevent instances of this class from being copied (As this class contains pointers). */
- NEConvertQuantizedSignednessKernel(const NEConvertQuantizedSignednessKernel &) = delete;
- /** Prevent instances of this class from being copied (As this class contains pointers). */
- NEConvertQuantizedSignednessKernel &operator=(const NEConvertQuantizedSignednessKernel &) = delete;
- /** Allow instances of this class to be moved */
- NEConvertQuantizedSignednessKernel(NEConvertQuantizedSignednessKernel &&) = default;
- /** Allow instances of this class to be moved */
- NEConvertQuantizedSignednessKernel &operator=(NEConvertQuantizedSignednessKernel &&) = default;
- /** Initialize the kernel's input, output.
- *
- * @param[in] input Source tensor. Data types supported: QASYMM8/QASYMM8_SIGNED.
- * @param[out] output Destination tensor. Data types supported: opposite of @p input.
- */
- void configure(const ITensor *input, ITensor *output);
-    /** Static function to check if given info will lead to a valid configuration of @ref NEConvertQuantizedSignednessKernel
- *
- * @param[in] input Source tensor. Data types supported: QASYMM8/QASYMM8_SIGNED.
- * @param[in] output Destination tensor. Data types supported: opposite of @p input.
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *input, const ITensorInfo *output);
-
- // Inherited methods overridden:
- void run(const Window &window, const ThreadInfo &info) override;
-
-private:
- const ITensor *_input;
- ITensor *_output;
-};
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_NECONVERTQUANTIZEDSIGNEDNESSKERNEL_H */
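
A minimal usage sketch, assuming the pre-removal include path and the runtime Tensor/NEScheduler helpers; the scale/offset pair is illustrative, and shifting the output offset by 128 is an assumed convention that keeps the represented real values unchanged:

#include "arm_compute/core/Error.h"
#include "arm_compute/core/NEON/kernels/NEConvertQuantizedSignednessKernel.h"
#include "arm_compute/runtime/NEON/NEScheduler.h"
#include "arm_compute/runtime/Tensor.h"

using namespace arm_compute;

void flip_signedness()
{
    Tensor u8_in, s8_out;
    u8_in.allocator()->init(TensorInfo(TensorShape(64U, 64U), 1, DataType::QASYMM8,
                                       QuantizationInfo(0.05f, 140)));
    s8_out.allocator()->init(TensorInfo(TensorShape(64U, 64U), 1, DataType::QASYMM8_SIGNED,
                                        QuantizationInfo(0.05f, 12))); // 140 - 128 (assumption)

    NEConvertQuantizedSignednessKernel k;
    ARM_COMPUTE_ERROR_THROW_ON(NEConvertQuantizedSignednessKernel::validate(u8_in.info(), s8_out.info()));
    k.configure(&u8_in, &s8_out);

    u8_in.allocator()->allocate();
    s8_out.allocator()->allocate();
    NEScheduler::get().schedule(&k, Window::DimY);
}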
diff --git a/arm_compute/core/NEON/kernels/NEConvolutionKernel.h b/arm_compute/core/NEON/kernels/NEConvolutionKernel.h
deleted file mode 100644
index 2b271de56b..0000000000
--- a/arm_compute/core/NEON/kernels/NEConvolutionKernel.h
+++ /dev/null
@@ -1,267 +0,0 @@
-/*
- * Copyright (c) 2016-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_NECONVOLUTIONKERNEL_H
-#define ARM_COMPUTE_NECONVOLUTIONKERNEL_H
-
-#include "arm_compute/core/NEON/INEKernel.h"
-#include "arm_compute/core/NEON/INESimpleKernel.h"
-
-#include <array>
-#include <cstdint>
-#include <vector>
-
-namespace arm_compute
-{
-class ITensor;
-
-/****************************************************************************************\
- * Square Convolution *
-\****************************************************************************************/
-
-/** Interface for the kernel to run an arbitrary size convolution on a tensor. (Currently supports 3x3, 5x5, 7x7 and 9x9).
- * The client can supply a convolution matrix \f$ C_{m,n} \f$.
- * @f{eqnarray}{
- * k_0 &=& \frac{m}{2} \\
- * l_0 &=& \frac{n}{2} \\
- * sum &=& \sum_{k=0,l=0}^{k=m-1,l=n-1} input(x+k-k_0, y+l-l_0) C_{k,l}
- * @f}
- *
- * @note The above equation for this function is similar to the default OpenCV Filter2D function,
- * which actually computes a correlation and not a convolution.
- * In case of a real convolution the convolution matrix should be flipped both horizontally and vertically.
- */
-template <unsigned int matrix_size>
-class NEConvolutionKernel : public INESimpleKernel
-{
-public:
- const char *name() const override
- {
- return "NEConvolutionKernel";
- }
- /** Default constructor */
- NEConvolutionKernel();
- /** Initialise the kernel's input, output and border mode.
- *
- * @param[in] input Source tensor. Data type supported: U8.
- * @param[out] output Destination tensor. Data types supported: U8, S16.
- * @param[in] conv Convolution matrix to apply to the input tensor.
- * @param[in] scale Scale of the convolution matrix. If 0 is passed, it will be set to the sum of the coefficients of the convolution or 1 if they add up to 0.
- * @param[in] border_undefined True if the border mode is undefined. False if it's replicate or constant.
- */
- void configure(const ITensor *input, ITensor *output, const int16_t *conv, uint32_t scale, bool border_undefined);
-
- // Inherited methods overridden:
- void run(const Window &window, const ThreadInfo &info) override;
- BorderSize border_size() const override;
-
-private:
- template <typename OutputType>
- void convolution(const Window &win);
-
-protected:
- uint32_t _scale; /**< scale of the convolution */
- std::array<int16_t, matrix_size *matrix_size> _convolution; /**< convolution matrix */
-};
-
-/** Interface for the kernel which applies a 3x3 convolution to a tensor.*/
-using NEConvolution3x3Kernel = NEConvolutionKernel<3>;
-/** Interface for the kernel which applies a 5x5 convolution to a tensor.*/
-using NEConvolution5x5Kernel = NEConvolutionKernel<5>;
-/** Interface for the kernel which applies a 7x7 convolution to a tensor.*/
-using NEConvolution7x7Kernel = NEConvolutionKernel<7>;
-/** Interface for the kernel which applies a 9x9 convolution to a tensor.*/
-using NEConvolution9x9Kernel = NEConvolutionKernel<9>;
-
-/****************************************************************************************\
- * Separable Square Convolution *
-\****************************************************************************************/
-
-/** Kernel for the Horizontal pass of a Separable Convolution */
-template <unsigned int matrix_size>
-class NESeparableConvolutionHorKernel : public INESimpleKernel
-{
-public:
- const char *name() const override
- {
- return "NESeparableConvolutionHorKernel";
- }
- /** Default constructor */
- NESeparableConvolutionHorKernel();
-
- /** Initialise the kernel's input, output and border mode.
- *
- * @param[in] input Source tensor. Data type supported: U8.
- * @param[out] output Destination tensor. Data types supported: U16, S16, S32.
- * @param[in] conv_row Convolution matrix to apply to the input tensor.
- * @param[in] border_undefined True if the border mode is undefined. False if it's replicate or constant.
- */
- void configure(const ITensor *input, ITensor *output, const int16_t *conv_row, bool border_undefined);
-
- // Inherited methods overridden:
- void run(const Window &window, const ThreadInfo &info) override;
- BorderSize border_size() const override;
-
-private:
-    /** Apply the object's convolution to the given window of the input tensor.
- *
- * @param[in] window Window to apply the convolution on.
- */
- template <typename OutputType>
- void convolve(const Window &window);
-
- std::array<int16_t, matrix_size> _conv_row; /**< Convolution coefficients */
- BorderSize _border_size; /**< Border size */
-};
-
-/** Interface for the kernel which applies a 5x1 horizontal convolution to a tensor.*/
-using NESeparableConvolution5x5HorKernel = NESeparableConvolutionHorKernel<5>;
-/** Interface for the kernel which applies a 7x1 horizontal convolution to a tensor.*/
-using NESeparableConvolution7x7HorKernel = NESeparableConvolutionHorKernel<7>;
-/** Interface for the kernel which applies a 9x1 horizontal convolution to a tensor.*/
-using NESeparableConvolution9x9HorKernel = NESeparableConvolutionHorKernel<9>;
-
-/** Kernel for the Vertical pass of a Separable Convolution */
-template <unsigned int matrix_size>
-class NESeparableConvolutionVertKernel : public INESimpleKernel
-{
-public:
- const char *name() const override
- {
- return "NESeparableConvolutionVertKernel";
- }
- /** Default constructor */
- NESeparableConvolutionVertKernel();
-
- /** Initialise the kernel's input, output and border mode.
- *
-     * @param[in]  input            Source tensor. Data types supported: U16, S16, S32.
-     * @param[out] output           Destination tensor. Data types supported: U8, S16.
- * @param[in] conv_col Convolution matrix to apply to the input tensor.
- * @param[in] scale Scale of the convolution matrix
- * @param[in] border_undefined True if the border mode is undefined. False if it's replicate or constant.
- */
- void configure(const ITensor *input, ITensor *output, const int16_t *conv_col, uint32_t scale, bool border_undefined);
-
- // Inherited methods overridden:
- void run(const Window &window, const ThreadInfo &info) override;
- BorderSize border_size() const override;
-
-private:
- /** Apply the object's convolution to the given window of the input tensor.
- * This function is used if the intermediate values have been stored as U16.
- *
- * @param[in] win Window to apply the convolution on.
- */
- template <typename OutputType>
- void convolution_u16(const Window &win);
- /** Apply the object's convolution to the given window of the input tensor.
- * This function is used if the intermediate values have been stored as S16.
- *
- * @param[in] win Window to apply the convolution on.
- */
- template <typename OutputType>
- void convolution_s16(const Window &win);
- /** Apply the object's convolution to the given window of the input tensor.
- * This function is used if the intermediate values have been stored as S32.
- *
- * @param[in] win Window to apply the convolution on.
- */
- template <typename OutputType>
- void convolution_s32(const Window &win);
-
- std::array<int16_t, matrix_size> _conv_col; /**< Convolution coefficients */
- uint32_t _scale; /**< Convolution's scale */
-};
-
-/** Interface for the kernel which applies a 1x5 vertical convolution to a tensor.*/
-using NESeparableConvolution5x5VertKernel = NESeparableConvolutionVertKernel<5>;
-/** Interface for the kernel which applies a 1x7 vertical convolution to a tensor.*/
-using NESeparableConvolution7x7VertKernel = NESeparableConvolutionVertKernel<7>;
-/** Interface for the kernel which applies a 1x9 vertical convolution to a tensor.*/
-using NESeparableConvolution9x9VertKernel = NESeparableConvolutionVertKernel<9>;
-
-/****************************************************************************************\
- * Rectangle Convolution *
-\****************************************************************************************/
-
-/** Kernel for running a convolution on a rectangular matrix.
- *
- * @note Supports matrix sizes that are combinations of 3, 5, 7 and 9.
- */
-class NEConvolutionRectangleKernel : public INEKernel
-{
-public:
- const char *name() const override
- {
- return "NEConvolutionRectangleKernel";
- }
- /** Default constructor */
- NEConvolutionRectangleKernel();
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- NEConvolutionRectangleKernel(NEConvolutionRectangleKernel &) = delete;
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- NEConvolutionRectangleKernel &operator=(NEConvolutionRectangleKernel &) = delete;
- /** Allow instances of this class to be moved */
- NEConvolutionRectangleKernel(NEConvolutionRectangleKernel &&) = default;
- /** Allow instances of this class to be moved */
- NEConvolutionRectangleKernel &operator=(NEConvolutionRectangleKernel &&) = default;
- /** Initialise the kernel's input, output and border mode.
- *
- * @param[in] input Source tensor. Data type supported: U8.
-     * @param[out] output           Destination tensor. Data types supported: U8, S16.
- * @param[in] conv Convolution matrix to apply to the input tensor.
- * @param[in] width Width of convolution matrix (Number of columns)
- * @param[in] height Height of convolution matrix (Number of rows)
- * @param[in] scale Scale of the convolution matrix. If 0 is passed, it will be set to the sum of the coefficients of the convolution or 1 if they add up to 0.
- * @param[in] border_undefined True if the border mode is undefined. False if it's replicate or constant.
- */
- void configure(const ITensor *input, ITensor *output, const int16_t *conv, uint32_t width, uint32_t height, uint32_t scale, bool border_undefined);
-
- // Inherited methods overridden:
- void run(const Window &window, const ThreadInfo &info) override;
- BorderSize border_size() const override;
-
-private:
- unsigned int get_index(uint32_t val);
- /** Apply the object's convolution to the given window of the input tensor.
- *
- * @param[in] win Window to apply the convolution on.
- */
- template <typename OutputType, unsigned int rows, unsigned int cols>
- void convolution(const Window &win);
-
-protected:
- const ITensor *_input; /**< Input tensor */
- ITensor *_output; /**< Output tensor */
- uint32_t _scale; /**< Scale of the convolution */
- std::vector<int16_t> _convolution; /**< Convolution matrix */
- BorderSize _border_size; /**< Calculated border width */
- uint32_t _func_idx; /**< Index used to specify convolution function to be used */
- const static unsigned int _nr_supported_sizes
- {
- 4
-    }; /**< Number of supported convolution matrix sizes */
-};
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_NECONVOLUTIONKERNEL_H */
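
A minimal sketch driving the 3x3 alias declared above, assuming the pre-removal include path; the Gaussian coefficients, shapes and function name are illustrative, and border filling is assumed to be handled by a separate pass:

#include "arm_compute/core/NEON/kernels/NEConvolutionKernel.h"
#include "arm_compute/runtime/NEON/NEScheduler.h"
#include "arm_compute/runtime/Tensor.h"

using namespace arm_compute;

void gaussian_3x3()
{
    Tensor src, dst;
    src.allocator()->init(TensorInfo(TensorShape(640U, 480U), 1, DataType::U8));
    dst.allocator()->init(TensorInfo(TensorShape(640U, 480U), 1, DataType::U8));

    // 3x3 Gaussian coefficients: they sum to 16, so passing scale = 0 selects 16 automatically.
    const int16_t coeffs[9] = { 1, 2, 1,
                                2, 4, 2,
                                1, 2, 1 };

    NEConvolution3x3Kernel conv;
    conv.configure(&src, &dst, coeffs, 0 /* scale */, false /* border is replicate/constant */);

    src.allocator()->allocate();
    dst.allocator()->allocate();
    // A border-filling pass (e.g. NEFillBorderKernel) is assumed to have run on src first.
    NEScheduler::get().schedule(&conv, Window::DimY);
}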
diff --git a/arm_compute/core/NEON/kernels/NECopyKernel.h b/arm_compute/core/NEON/kernels/NECopyKernel.h
deleted file mode 100644
index d2dbbaef98..0000000000
--- a/arm_compute/core/NEON/kernels/NECopyKernel.h
+++ /dev/null
@@ -1,78 +0,0 @@
-/*
- * Copyright (c) 2018-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_NECOPYKERNEL_H
-#define ARM_COMPUTE_NECOPYKERNEL_H
-
-#include "arm_compute/core/NEON/INEKernel.h"
-#include "arm_compute/core/Types.h"
-
-namespace arm_compute
-{
-class ITensor;
-
-/** NEON kernel to perform a copy between two tensors */
-class NECopyKernel : public INEKernel
-{
-public:
- const char *name() const override
- {
- return "NECopyKernel";
- }
- /** Default constructor */
- NECopyKernel();
- /** Prevent instances of this class from being copied (As this class contains pointers). */
- NECopyKernel(const NECopyKernel &) = delete;
- /** Prevent instances of this class from being copied (As this class contains pointers). */
- NECopyKernel &operator=(const NECopyKernel &) = delete;
- /** Allow instances of this class to be moved */
- NECopyKernel(NECopyKernel &&) = default;
- /** Allow instances of this class to be moved */
- NECopyKernel &operator=(NECopyKernel &&) = default;
- /** Initialize the kernel's input, output.
- *
- * @param[in] input Source tensor. Data types supported: All
- * @param[out] output Destination tensor. Data types supported: same as @p input.
- * @param[in] padding (Optional) Padding to be applied to the input tensor
- */
- void configure(const ITensor *input, ITensor *output, const PaddingList &padding = PaddingList());
- /** Static function to check if given info will lead to a valid configuration of @ref NECopyKernel
- *
- * @param[in] input Source tensor. Data types supported: All
- * @param[in] output Destination tensor. Data types supported: same as @p input.
- * @param[in] padding (Optional) Padding to be applied to the input tensor
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *input, const ITensorInfo *output, const PaddingList &padding = PaddingList());
-
- // Inherited methods overridden:
- void run(const Window &window, const ThreadInfo &info) override;
-
-private:
- const ITensor *_input;
- ITensor *_output;
- PaddingList _padding;
-};
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_NECOPYKERNEL_H */
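
A minimal sketch of a padded copy, assuming the pre-removal include path; the helper name, shapes and padding amounts are illustrative (one element before and after dimension 0 grows the output from 32 to 34 on x):

#include "arm_compute/core/Error.h"
#include "arm_compute/core/NEON/kernels/NECopyKernel.h"
#include "arm_compute/runtime/NEON/NEScheduler.h"
#include "arm_compute/runtime/Tensor.h"

using namespace arm_compute;

void padded_copy()
{
    Tensor src, dst;
    src.allocator()->init(TensorInfo(TensorShape(32U, 32U), 1, DataType::F32));
    dst.allocator()->init(TensorInfo(TensorShape(34U, 32U), 1, DataType::F32));

    const PaddingList padding = { { 1, 1 } }; // (before, after) for dimension 0

    NECopyKernel copy;
    ARM_COMPUTE_ERROR_THROW_ON(NECopyKernel::validate(src.info(), dst.info(), padding));
    copy.configure(&src, &dst, padding);

    src.allocator()->allocate();
    dst.allocator()->allocate();
    NEScheduler::get().schedule(&copy, Window::DimY);
}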
diff --git a/arm_compute/core/NEON/kernels/NECropKernel.h b/arm_compute/core/NEON/kernels/NECropKernel.h
deleted file mode 100644
index ba58ab1e58..0000000000
--- a/arm_compute/core/NEON/kernels/NECropKernel.h
+++ /dev/null
@@ -1,114 +0,0 @@
-/*
- * Copyright (c) 2019-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_NEON_CROP_KERNEL_H
-#define ARM_COMPUTE_NEON_CROP_KERNEL_H
-
-#include "arm_compute/core/NEON/INEKernel.h"
-#include "arm_compute/core/Types.h"
-#include "arm_compute/core/Types.h"
-
-namespace arm_compute
-{
-// Forward declarations
-class ITensor;
-
-/** Interface for the kernel to perform tensor cropping */
-class NECropKernel : public INEKernel
-{
-public:
- const char *name() const override
- {
- return "NECropKernel";
- }
- /** Default constructor */
- NECropKernel();
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- NECropKernel(const NECropKernel &) = delete;
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- NECropKernel &operator=(const NECropKernel &) = delete;
- /** Allow instances of this class to be moved */
- NECropKernel(NECropKernel &&) = default;
- /** Allow instances of this class to be moved */
- NECropKernel &operator=(NECropKernel &&) = default;
- /** Default destructor */
- ~NECropKernel() = default;
- /** Configure kernel
- *
- * @note Supported tensor rank: up to 4
- * @note Padding not supported.
- *
- * @param[in] input Source tensor. Data type supported: U8/U16/S16/U32/S32/F16/F32. Data layouts supported: NHWC.
- * @param[in] crop_boxes Tensor containing all possible boxes used to crop the image, each represented by 4 normalized values.
- * Data type supported: F32
- * @param[in] box_ind One dimensional tensor mapping the @p crop_box_ind to the index of the 3D image in @p input.
- * Data type supported: F32
- * @param[out] output Destination tensor. Data type supported: F32
- * @param[in] crop_box_ind Index of the crop box to be used from @p crop_boxes. Default is 0.
- * @param[in] extrapolation_value Value to be used for values outside of the image. Default is 0.
- */
- void configure(const ITensor *input, const ITensor *crop_boxes, const ITensor *box_ind, ITensor *output, uint32_t crop_box_ind = 0, float extrapolation_value = 0);
-
-    /** Static function to check if given info will lead to a valid configuration of @ref NECropKernel
- *
- * @note Supported tensor rank: up to 4
- * @note Padding not supported.
- *
- * @param[in] input Source tensor info. Data type supported: U8/U16/S16/U32/S32/F16/F32. Data layouts supported: NHWC.
- * @param[in] crop_boxes Tensor info for tensor containing all possible boxes used to crop the image. Data type supported: F32
- * @param[in] box_ind Tensor info for the one dimensional tensor mapping the @p crop_box_ind to the index of the 3D image
- * in @p input. Data type supported: F32
-     * @param[in]  output              Destination tensor info. Data type supported: F32
- * @param[in] crop_box_ind Index of the crop box to be used from @p crop_boxes. Default is 0.
- * @param[in] extrapolation_value Value to be used for values outside of the image. Default is 0.
- */
- static Status validate(const ITensorInfo *input, const ITensorInfo *crop_boxes, const ITensorInfo *box_ind, const ITensorInfo *output, uint32_t crop_box_ind = 0, float extrapolation_value = 0);
-
- /** Configure output tensor's shape as this can only be determined at runtime. */
- void configure_output_shape();
-
- // Inherited methods overridden:
- void run(const Window &window, const ThreadInfo &info) override;
-
- /** Function to use for in bounds crop for the particular tensor types passed to configure() */
- using InBoundsCropFunction = void(const ITensor *, const ITensor *, float *, Coordinates, int32_t, int32_t, int32_t, bool, bool);
-
-private:
- const ITensor *_input;
- const ITensor *_crop_boxes;
- const ITensor *_box_ind;
- ITensor *_output;
-
- Coordinates _start;
- Coordinates _end;
- uint32_t _crop_box_ind;
- float _extrapolation_value;
- /** The number of rows out of bounds at the start and end of output. */
- std::array<uint32_t, 2> _rows_out_of_bounds;
- /** The number of columns out of bounds at the start and end of output. */
- std::array<uint32_t, 2> _cols_out_of_bounds;
-
- NECropKernel::InBoundsCropFunction *_in_bounds_crop_function;
-};
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_NEON_CROP_KERNEL_H */
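
A sketch of the two-phase flow this kernel documents, where the output shape only becomes known once the box values exist; include path, shapes and the ordering around allocation are assumptions:

#include "arm_compute/core/NEON/kernels/NECropKernel.h"
#include "arm_compute/runtime/NEON/NEScheduler.h"
#include "arm_compute/runtime/Tensor.h"

using namespace arm_compute;

void crop_one_box()
{
    TensorInfo image_info(TensorShape(3U, 224U, 224U, 1U), 1, DataType::U8); // NHWC: (C, W, H, N)
    image_info.set_data_layout(DataLayout::NHWC);

    Tensor image, boxes, box_ind, out;
    image.allocator()->init(image_info);
    boxes.allocator()->init(TensorInfo(TensorShape(4U, 1U), 1, DataType::F32));   // one box, 4 normalized values
    box_ind.allocator()->init(TensorInfo(TensorShape(1U), 1, DataType::F32));     // maps the box to image 0
    out.allocator()->init(TensorInfo(TensorShape(3U, 1U, 1U), 1, DataType::F32)); // placeholder shape

    NECropKernel crop;
    crop.configure(&image, &boxes, &box_ind, &out, 0 /* crop_box_ind */, 0.f /* extrapolation_value */);

    image.allocator()->allocate();
    boxes.allocator()->allocate();
    box_ind.allocator()->allocate();
    // ... fill image, boxes and box_ind here: the box values are only known at runtime ...

    crop.configure_output_shape(); // derives the real output shape from the box
    out.allocator()->allocate();
    NEScheduler::get().schedule(&crop, Window::DimY);
}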
diff --git a/arm_compute/core/NEON/kernels/NECumulativeDistributionKernel.h b/arm_compute/core/NEON/kernels/NECumulativeDistributionKernel.h
deleted file mode 100644
index 52442c3920..0000000000
--- a/arm_compute/core/NEON/kernels/NECumulativeDistributionKernel.h
+++ /dev/null
@@ -1,83 +0,0 @@
-/*
- * Copyright (c) 2016-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_NECUMULATIVEDISTRIBUTIONKERNEL_H
-#define ARM_COMPUTE_NECUMULATIVEDISTRIBUTIONKERNEL_H
-
-#include "arm_compute/core/NEON/INEKernel.h"
-
-#include <cstdint>
-
-namespace arm_compute
-{
-class IDistribution1D;
-class ILut;
-class ITensor;
-using IImage = ITensor;
-
-/** Interface for the cumulative distribution (cumulative summation) calculation kernel.
- *
- * This kernel calculates the cumulative sum of a given distribution (meaning that each output element
- * is the sum of all its previous elements including itself) and creates a lookup table with the normalized
- * pixel intensities, which is used to improve the contrast of the image.
- */
-class NECumulativeDistributionKernel : public INEKernel
-{
-public:
- const char *name() const override
- {
- return "NECumulativeDistributionKernel";
- }
- /** Default constructor */
- NECumulativeDistributionKernel();
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- NECumulativeDistributionKernel(const NECumulativeDistributionKernel &) = delete;
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- NECumulativeDistributionKernel &operator=(const NECumulativeDistributionKernel &) = delete;
- /** Allow instances of this class to be moved */
- NECumulativeDistributionKernel(NECumulativeDistributionKernel &&) = default;
- /** Allow instances of this class to be moved */
- NECumulativeDistributionKernel &operator=(NECumulativeDistributionKernel &&) = default;
- /** Set the input and output distribution.
- *
- * @param[in] input Input image. Data type supported: U8
- * @param[in] distribution Unnormalized 256-bin distribution of the input image.
-     * @param[out] cumulative_sum Cumulative distribution (summed histogram). Should be the same size as @p distribution.
- * @param[out] output Equalization lookup table. Should consist of 256 entries of U8 elements.
- */
- void configure(const IImage *input, const IDistribution1D *distribution, IDistribution1D *cumulative_sum, ILut *output);
-
- // Inherited methods overridden:
- void run(const Window &window, const ThreadInfo &info) override;
- bool is_parallelisable() const override;
-
-private:
- const IImage *_input; /**< Input image. */
- const IDistribution1D *_distribution; /**< Input histogram of the input image. */
-    IDistribution1D       *_cumulative_sum; /**< The cumulative distribution. */
- ILut *_output; /**< Output with the equalization lookup table. */
-private:
- static const uint32_t _histogram_size = 256; /**< Default histogram size of 256. */
-};
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_NECUMULATIVEDISTRIBUTIONKERNEL_H */
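
A sketch of the histogram-equalization chain this kernel belongs to, assuming the pre-removal include path, that the histogram was produced beforehand (e.g. by NEHistogramKernel), and that the runtime Distribution1D and Lut objects are usable straight after construction:

#include "arm_compute/core/NEON/kernels/NECumulativeDistributionKernel.h"
#include "arm_compute/runtime/Distribution1D.h"
#include "arm_compute/runtime/Lut.h"
#include "arm_compute/runtime/NEON/NEScheduler.h"
#include "arm_compute/runtime/Tensor.h"

using namespace arm_compute;

void build_equalization_lut()
{
    Tensor image;
    image.allocator()->init(TensorInfo(TensorShape(640U, 480U), 1, DataType::U8));
    image.allocator()->allocate();

    Distribution1D hist(256, 0, 256);      // unnormalized 256-bin histogram, filled beforehand
    Distribution1D cum_sum(256, 0, 256);   // receives the summed histogram
    Lut            lut(256, DataType::U8); // receives the 256-entry equalization table

    NECumulativeDistributionKernel cd;
    cd.configure(&image, &hist, &cum_sum, &lut);
    NEScheduler::get().schedule(&cd, Window::DimY); // not parallelisable: runs on one thread
}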
diff --git a/arm_compute/core/NEON/kernels/NEDepthConcatenateLayerKernel.h b/arm_compute/core/NEON/kernels/NEDepthConcatenateLayerKernel.h
deleted file mode 100644
index 6690ac2236..0000000000
--- a/arm_compute/core/NEON/kernels/NEDepthConcatenateLayerKernel.h
+++ /dev/null
@@ -1,91 +0,0 @@
-/*
- * Copyright (c) 2017-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#ifndef ARM_COMPUTE_NEDEPTHCONCATENATEKERNEL_H
-#define ARM_COMPUTE_NEDEPTHCONCATENATEKERNEL_H
-
-#include "arm_compute/core/NEON/INEKernel.h"
-
-namespace arm_compute
-{
-// Forward declarations
-class ITensor;
-
-/** Interface for the depth concatenate kernel.
- * The input tensor will be concatenated into the output tensor.
- */
-class NEDepthConcatenateLayerKernel : public INEKernel
-{
-public:
- const char *name() const override
- {
- return "NEDepthConcatenateLayerKernel";
- }
- /** Default constructor */
- NEDepthConcatenateLayerKernel();
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- NEDepthConcatenateLayerKernel(const NEDepthConcatenateLayerKernel &) = delete;
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- NEDepthConcatenateLayerKernel &operator=(const NEDepthConcatenateLayerKernel &) = delete;
- /** Allow instances of this class to be moved */
- NEDepthConcatenateLayerKernel(NEDepthConcatenateLayerKernel &&) = default;
- /** Allow instances of this class to be moved */
- NEDepthConcatenateLayerKernel &operator=(NEDepthConcatenateLayerKernel &&) = default;
- /** Default destructor */
- ~NEDepthConcatenateLayerKernel() = default;
- /** Initialise the kernel's inputs and output
- *
- * @param[in] input Input tensor. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32.
- * @param[in] depth_offset The offset on the Z axis.
- * @param[in,out] output Output tensor. Data types supported: Same as @p input.
- *
-     * @note The output tensor's two lowest dimensions can't be smaller than the input's.
-     * @note The gaps between the two lowest dimensions of input and output need to be divisible by 2.
- *
- */
- void configure(const ITensor *input, unsigned int depth_offset, ITensor *output);
- /** Static function to check if given info will lead to a valid configuration of @ref NEDepthConcatenateLayerKernel
- *
- * @param[in] input Input tensor info. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32.
- * @param[in] depth_offset The offset on the Z axis.
- * @param[in] output Output tensor info. Data types supported: Same as @p input.
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *input, unsigned int depth_offset, const ITensorInfo *output);
-
- // Inherited methods overridden:
- void run(const Window &window, const ThreadInfo &info) override;
-
-private:
- using DepthConcatFunction = void(const ITensor *in, ITensor *out, unsigned int depth_offset, const Window &window);
-
-private:
- DepthConcatFunction *_func;
- const ITensor *_input;
- ITensor *_output;
- unsigned int _depth_offset;
-};
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_NEDEPTHCONCATENATEKERNEL_H */
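
A sketch of concatenating two tensors along depth, one kernel per input, each writing its slice of the shared output at the documented Z offset; include path, shapes and helper name are illustrative:

#include "arm_compute/core/NEON/kernels/NEDepthConcatenateLayerKernel.h"
#include "arm_compute/runtime/NEON/NEScheduler.h"
#include "arm_compute/runtime/Tensor.h"

using namespace arm_compute;

void concat_along_depth()
{
    Tensor in0, in1, out;
    in0.allocator()->init(TensorInfo(TensorShape(32U, 32U, 16U), 1, DataType::F32));
    in1.allocator()->init(TensorInfo(TensorShape(32U, 32U, 24U), 1, DataType::F32));
    out.allocator()->init(TensorInfo(TensorShape(32U, 32U, 40U), 1, DataType::F32)); // 16 + 24 planes

    NEDepthConcatenateLayerKernel k0, k1;
    k0.configure(&in0, 0, &out);  // writes planes [0, 16)
    k1.configure(&in1, 16, &out); // writes planes [16, 40)

    in0.allocator()->allocate();
    in1.allocator()->allocate();
    out.allocator()->allocate();
    NEScheduler::get().schedule(&k0, Window::DimY);
    NEScheduler::get().schedule(&k1, Window::DimY);
}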
diff --git a/arm_compute/core/NEON/kernels/NEDepthConvertLayerKernel.h b/arm_compute/core/NEON/kernels/NEDepthConvertLayerKernel.h
deleted file mode 100644
index 5cda3203ed..0000000000
--- a/arm_compute/core/NEON/kernels/NEDepthConvertLayerKernel.h
+++ /dev/null
@@ -1,94 +0,0 @@
-/*
- * Copyright (c) 2016-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_DEPTHCONVERTKERNEL_H
-#define ARM_COMPUTE_DEPTHCONVERTKERNEL_H
-
-#include "arm_compute/core/NEON/INEKernel.h"
-
-namespace arm_compute
-{
-class ITensor;
-
-/** Depth conversion kernel
- * This function ignores the scale and zeroPoint of quantized tensors, i.e. a QASYMM8 input is treated as plain uint8 values.
- */
-class NEDepthConvertLayerKernel : public INEKernel
-{
-public:
- const char *name() const override
- {
- return "NEDepthConvertLayerKernel";
- }
- /** Default constructor*/
- NEDepthConvertLayerKernel();
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- NEDepthConvertLayerKernel(const NEDepthConvertLayerKernel &) = delete;
- /** Default move constructor */
- NEDepthConvertLayerKernel(NEDepthConvertLayerKernel &&) = default;
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- NEDepthConvertLayerKernel &operator=(const NEDepthConvertLayerKernel &) = delete;
- /** Default move assignment operator */
- NEDepthConvertLayerKernel &operator=(NEDepthConvertLayerKernel &&) = default;
- /** Set the input and output of the kernel
- *
- * Valid conversions Input -> Output :
- *
- * - QASYMM8_SIGNED -> S16, S32, F32, F16
- * - QASYMM8 -> U16, S16, S32, F32, F16
- * - U8 -> U16, S16, S32, F32, F16
- * - U16 -> U8, U32
- * - S16 -> QASYMM8_SIGNED, U8, S32
- * - BFLOAT16 -> F32
- * - F16 -> QASYMM8_SIGNED, QASYMM8, F32, S32, U8
- * - S32 -> QASYMM8_SIGNED, QASYMM8, F16, F32, U8
- * - F32 -> QASYMM8_SIGNED, QASYMM8, BFLOAT16, F16, S32, U8
- *
- * @param[in] input The input tensor to convert. Data types supported: QASYMM8_SIGNED/QASYMM8/U8/U16/S16/BFLOAT16/F16/F32.
- * @param[out] output The output tensor. Data types supported: QASYMM8_SIGNED/QASYMM8/U8/U16/S16/U32/S32/BFLOAT16/F16/F32.
- * @param[in] policy Conversion policy.
- * @param[in] shift (Optional) Value for down/up conversions. Must be 0 <= shift < 8.
- */
- void configure(const ITensor *input, ITensor *output, ConvertPolicy policy, uint32_t shift = 0);
- /** Static function to check if given info will lead to a valid configuration of @ref NEDepthConvertLayerKernel
- *
- * @param[in] input Source tensor info. Data types supported: QASYMM8_SIGNED/QASYMM8/U8/U16/S16/BFLOAT16/F16/F32.
- * @param[in] output Destination tensor info. Data type supported: QASYMM8_SIGNED/QASYMM8/U8/U16/S16/U32/S32/BFLOAT16/F16/F32.
- * @param[in] policy Conversion policy
- * @param[in] shift (Optional) Value for down/up conversions. Must be 0 <= shift < 8.
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *input, const ITensorInfo *output, ConvertPolicy policy, uint32_t shift = 0);
-
- // Inherited methods overridden:
- void run(const Window &window, const ThreadInfo &info) override;
-
-private:
- const ITensor *_input;
- ITensor *_output;
- ConvertPolicy _policy;
- uint32_t _shift;
-};
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_NEDEPTHCONVERTKERNEL_H */
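
A sketch of an up-conversion from the table above (U8 to S16 with a left shift of 3); include path, helper name and shapes are illustrative:

#include "arm_compute/core/Error.h"
#include "arm_compute/core/NEON/kernels/NEDepthConvertLayerKernel.h"
#include "arm_compute/runtime/NEON/NEScheduler.h"
#include "arm_compute/runtime/Tensor.h"

using namespace arm_compute;

void widen_u8_to_s16()
{
    Tensor u8_in, s16_out;
    u8_in.allocator()->init(TensorInfo(TensorShape(64U, 64U), 1, DataType::U8));
    s16_out.allocator()->init(TensorInfo(TensorShape(64U, 64U), 1, DataType::S16));

    NEDepthConvertLayerKernel convert;
    ARM_COMPUTE_ERROR_THROW_ON(NEDepthConvertLayerKernel::validate(u8_in.info(), s16_out.info(),
                                                                   ConvertPolicy::SATURATE, 3));
    convert.configure(&u8_in, &s16_out, ConvertPolicy::SATURATE, 3);

    u8_in.allocator()->allocate();
    s16_out.allocator()->allocate();
    NEScheduler::get().schedule(&convert, Window::DimY);
}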
diff --git a/arm_compute/core/NEON/kernels/NEDepthToSpaceLayerKernel.h b/arm_compute/core/NEON/kernels/NEDepthToSpaceLayerKernel.h
deleted file mode 100644
index 0b645887ee..0000000000
--- a/arm_compute/core/NEON/kernels/NEDepthToSpaceLayerKernel.h
+++ /dev/null
@@ -1,81 +0,0 @@
-/*
- * Copyright (c) 2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_NEDEPTHTOSPACELAYERKERNEL_H
-#define ARM_COMPUTE_NEDEPTHTOSPACELAYERKERNEL_H
-
-#include "arm_compute/core/NEON/INEKernel.h"
-
-namespace arm_compute
-{
-// Forward declarations
-class ITensor;
-
-/** Interface for the depth to space kernel */
-class NEDepthToSpaceLayerKernel : public INEKernel
-{
-public:
- const char *name() const override
- {
- return "NEDepthToSpaceLayerKernel";
- }
- /** Default constructor */
- NEDepthToSpaceLayerKernel();
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- NEDepthToSpaceLayerKernel(const NEDepthToSpaceLayerKernel &) = delete;
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- NEDepthToSpaceLayerKernel &operator=(const NEDepthToSpaceLayerKernel &) = delete;
- /** Allow instances of this class to be moved */
- NEDepthToSpaceLayerKernel(NEDepthToSpaceLayerKernel &&) = default;
- /** Allow instances of this class to be moved */
- NEDepthToSpaceLayerKernel &operator=(NEDepthToSpaceLayerKernel &&) = default;
- /** Default destructor */
- ~NEDepthToSpaceLayerKernel() = default;
- /** Initialise the kernel's inputs and output.
- *
-     * @param[in]  input       Tensor input. Supported tensor rank: 4. Data types supported: All
-     * @param[out] output      Tensor output. Data types supported: same as @p input
-     * @param[in]  block_shape Block shape value.
- */
- void configure(const ITensor *input, ITensor *output, int32_t block_shape);
- /** Static function to check if given info will lead to a valid configuration of @ref NEDepthToSpaceLayerKernel.
- *
- * @param[in] input Tensor input info. Supported tensor rank: 4. Data types supported: All
- * @param[in] output Tensor output info. Data types supported: same as @p input
- * @param[in] block_shape Block shape value.
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *input, const ITensorInfo *output, int32_t block_shape);
-
- // Inherited methods overridden:
- void run(const Window &window, const ThreadInfo &info) override;
-
-private:
- const ITensor *_input; /**< Source tensor */
- ITensor *_output; /**< Destination tensor */
- int32_t _block_shape; /**< Block shape */
- DataLayout _data_layout; /**< Data layout of the operation */
-};
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_NEDEPTHTOSPACELAYERKERNEL_H */
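
A sketch with block_shape = 2: a (W, H, C) = (16, 16, 8) input becomes (32, 32, 2), with C divided by the squared block shape; include path and helper name are illustrative:

#include "arm_compute/core/Error.h"
#include "arm_compute/core/NEON/kernels/NEDepthToSpaceLayerKernel.h"
#include "arm_compute/runtime/NEON/NEScheduler.h"
#include "arm_compute/runtime/Tensor.h"

using namespace arm_compute;

void depth_to_space_block2()
{
    Tensor in, out;
    in.allocator()->init(TensorInfo(TensorShape(16U, 16U, 8U), 1, DataType::F32));
    out.allocator()->init(TensorInfo(TensorShape(32U, 32U, 2U), 1, DataType::F32));

    NEDepthToSpaceLayerKernel d2s;
    ARM_COMPUTE_ERROR_THROW_ON(NEDepthToSpaceLayerKernel::validate(in.info(), out.info(), 2));
    d2s.configure(&in, &out, 2);

    in.allocator()->allocate();
    out.allocator()->allocate();
    NEScheduler::get().schedule(&d2s, Window::DimY);
}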
diff --git a/arm_compute/core/NEON/kernels/NEDepthwiseConvolutionLayer3x3Kernel.h b/arm_compute/core/NEON/kernels/NEDepthwiseConvolutionLayer3x3Kernel.h
deleted file mode 100644
index 227ddb4743..0000000000
--- a/arm_compute/core/NEON/kernels/NEDepthwiseConvolutionLayer3x3Kernel.h
+++ /dev/null
@@ -1,96 +0,0 @@
-/*
- * Copyright (c) 2017-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_NEDEPTHWISECONVOLUTIONKERNEL3x3_H
-#define ARM_COMPUTE_NEDEPTHWISECONVOLUTIONKERNEL3x3_H
-
-#include "arm_compute/core/NEON/INEKernel.h"
-
-namespace arm_compute
-{
-// Forward declarations
-class ITensor;
-
-/** Interface for the kernel to run a 3x3 depthwise convolution on a tensor. */
-class NEDepthwiseConvolutionLayer3x3Kernel : public INEKernel
-{
-public:
- const char *name() const override
- {
- return "NEDepthwiseConvolutionLayer3x3Kernel";
- }
- /** Default constructor */
- NEDepthwiseConvolutionLayer3x3Kernel();
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- NEDepthwiseConvolutionLayer3x3Kernel(const NEDepthwiseConvolutionLayer3x3Kernel &) = delete;
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- NEDepthwiseConvolutionLayer3x3Kernel &operator=(const NEDepthwiseConvolutionLayer3x3Kernel &) = delete;
- /** Default Move Constructor. */
- NEDepthwiseConvolutionLayer3x3Kernel(NEDepthwiseConvolutionLayer3x3Kernel &&) = default;
- /** Default move assignment operator */
- NEDepthwiseConvolutionLayer3x3Kernel &operator=(NEDepthwiseConvolutionLayer3x3Kernel &&) = default;
- /** Initialize the function's source, destination, conv and border_size.
- *
- * @note Supported data layouts: NCHW and NHWC
- *
- * @param[in] input Source tensor. DataType supported: QASYMM8/QASYMM8_SIGNED/F16/F32.
- * @param[in] weights Weights tensor. This is a 3D tensor with dimensions [3, 3, IFM] for NCHW or [IFM, 3, 3] if NHWC data layout. Data type supported: Same as @p input.
- * @param[out] output Destination tensor. Data type supported: Same as @p input.
- * @param[in] conv_info Padding and stride information to use for the convolution.
- * @param[in] depth_multiplier (Optional) Multiplier to apply to the input's depth in order to retrieve the output's depth. Defaults to 1.
- * @param[in] dilation (Optional) Dilation, in elements, across x and y. Defaults to (1, 1).
- *
- */
- void configure(const ITensor *input, const ITensor *weights, ITensor *output, const PadStrideInfo &conv_info, unsigned int depth_multiplier = 1, const Size2D &dilation = Size2D(1U, 1U));
- /** Static function to check if given info will lead to a valid configuration of @ref NEDepthwiseConvolutionLayer3x3Kernel
- *
- * @note Supported data layouts: NCHW and NHWC
- *
- * @param[in] input Source tensor info. DataType supported: QASYMM8/QASYMM8_SIGNED/F16/F32.
- * @param[in] weights Weights tensor info. This is a 3D tensor with dimensions [3, 3, IFM] for NCHW or [IFM, 3, 3] if NHWC data layout. Data type supported: Same as @p input.
- * @param[in] output Destination tensor info. Data type supported: Same as @p input.
- * @param[in] conv_info Padding and stride information to use for the convolution.
- * @param[in] depth_multiplier (Optional) Multiplier to apply to the input's depth in order to retrieve the output's depth. Defaults to 1.
- * @param[in] dilation (Optional) Dilation, in elements, across x and y. Defaults to (1, 1).
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *output, const PadStrideInfo &conv_info, unsigned int depth_multiplier = 1,
- const Size2D &dilation = Size2D(1U, 1U));
-
- // Inherited methods overridden:
- void run(const Window &window, const ThreadInfo &info) override;
- BorderSize border_size() const override;
-
-private:
- BorderSize _border_size;
- const ITensor *_input;
- ITensor *_output;
- const ITensor *_weights;
- PadStrideInfo _conv_info;
- unsigned int _num_elems_written_per_iteration;
- unsigned int _depth_multiplier;
- Size2D _dilation;
-};
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_NEDEPTHWISECONVOLUTIONKERNEL3x3_H */
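
A sketch for the NCHW weight layout documented above ([3, 3, IFM]), with stride 1 and one pixel of padding so the spatial size is preserved; include path, shapes and helper name are illustrative:

#include "arm_compute/core/Error.h"
#include "arm_compute/core/NEON/kernels/NEDepthwiseConvolutionLayer3x3Kernel.h"
#include "arm_compute/runtime/NEON/NEScheduler.h"
#include "arm_compute/runtime/Tensor.h"

using namespace arm_compute;

void depthwise_3x3_nchw()
{
    Tensor in, weights, out;
    in.allocator()->init(TensorInfo(TensorShape(32U, 32U, 8U), 1, DataType::F32));
    weights.allocator()->init(TensorInfo(TensorShape(3U, 3U, 8U), 1, DataType::F32)); // [3, 3, IFM]
    out.allocator()->init(TensorInfo(TensorShape(32U, 32U, 8U), 1, DataType::F32));

    const PadStrideInfo conv_info(1, 1, 1, 1); // stride (1, 1), padding (1, 1)

    NEDepthwiseConvolutionLayer3x3Kernel dw;
    ARM_COMPUTE_ERROR_THROW_ON(NEDepthwiseConvolutionLayer3x3Kernel::validate(in.info(), weights.info(),
                                                                              out.info(), conv_info));
    dw.configure(&in, &weights, &out, conv_info);

    in.allocator()->allocate();
    weights.allocator()->allocate();
    out.allocator()->allocate();
    NEScheduler::get().schedule(&dw, Window::DimY);
}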
diff --git a/arm_compute/core/NEON/kernels/NEDepthwiseConvolutionLayerNativeKernel.h b/arm_compute/core/NEON/kernels/NEDepthwiseConvolutionLayerNativeKernel.h
deleted file mode 100644
index 9737c9932e..0000000000
--- a/arm_compute/core/NEON/kernels/NEDepthwiseConvolutionLayerNativeKernel.h
+++ /dev/null
@@ -1,129 +0,0 @@
-/*
- * Copyright (c) 2019-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_NEDEPTHWISECONVOLUTIONLAYERNATIVEKERNEL_H
-#define ARM_COMPUTE_NEDEPTHWISECONVOLUTIONLAYERNATIVEKERNEL_H
-
-#include "arm_compute/core/NEON/INEKernel.h"
-#include "arm_compute/core/utils/misc/Requires.h"
-
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-#include <arm_neon.h>
-#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-
-namespace arm_compute
-{
-// Forward declarations
-class ITensor;
-
-/** Interface for the kernel to run a depthwise convolution native on a tensor. */
-class NEDepthwiseConvolutionLayerNativeKernel : public INEKernel
-{
-public:
- const char *name() const override
- {
- return "NEDepthwiseConvolutionLayerNativeKernel";
- }
- /** Default constructor */
- NEDepthwiseConvolutionLayerNativeKernel();
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- NEDepthwiseConvolutionLayerNativeKernel(const NEDepthwiseConvolutionLayerNativeKernel &) = delete;
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- NEDepthwiseConvolutionLayerNativeKernel &operator=(const NEDepthwiseConvolutionLayerNativeKernel &) = delete;
- /** Default Move Constructor. */
- NEDepthwiseConvolutionLayerNativeKernel(NEDepthwiseConvolutionLayerNativeKernel &&) = default;
- /** Default move assignment operator */
- NEDepthwiseConvolutionLayerNativeKernel &operator=(NEDepthwiseConvolutionLayerNativeKernel &&) = default;
- /** Initialize the function's source, destination and parameters.
- *
- * @note Supported data layouts: NHWC
- *
- * @param[in] input Source tensor. DataType supported: QASYMM8/QASYMM8_SIGNED/F16/F32.
- * @param[in] weights Weights tensor. This is a 3D tensor with dimensions [IFM, W, H].
- * Data type supported: Same as @p input or QASYMM8/QASYMM8_SIGNED/QSYMM8_PER_CHANNEL when @p input is QASYMM8/QASYMM8_SIGNED.
- * @param[in] biases Biases tensor. A 1D tensor with dimensions [IFM]. Must be nullptr if not needed.
- * Data type supported: Same as @p input, S32 when input is QASYMM8/QASYMM8_SIGNED.
- * @param[out] output Destination tensor. Data type supported: Same as @p input.
- * @param[in] conv_info Padding and stride information to use for the convolution.
- * @param[in] depth_multiplier (Optional) Multiplier to apply to the input's depth in order to retrieve the output's depth. Defaults to 1.
- * @param[in] dilation (Optional) Dilation, in elements, across x and y. Defaults to (1, 1).
- *
- */
- void configure(const ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, const PadStrideInfo &conv_info, unsigned int depth_multiplier = 1,
- const Size2D &dilation = Size2D(1U, 1U));
- /** Static function to check if given info will lead to a valid configuration of @ref NEDepthwiseConvolutionLayerNativeKernel
- *
- * @note Supported data layouts: NHWC
- *
- * @param[in] input Source tensor info. DataType supported: QASYMM8/QASYMM8_SIGNED/F16/F32.
- * @param[in] weights Weights tensor info. This is a 3D tensor with dimensions [IFM, W, H].
- * Data type supported: Same as @p input or QASYMM8/QASYMM8_SIGNED/QSYMM8_PER_CHANNEL when @p input is QASYMM8/QASYMM8_SIGNED.
- * @param[in] biases Biases tensor info. A 1D tensor with dimensions [IFM]. Must be nullptr if not needed.
- * Data type supported: Same as @p input, S32 when input is QASYMM8/QASYMM8_SIGNED.
- * @param[in] output Destination tensor info. Data type supported: Same as @p input.
- * @param[in] conv_info Padding and stride information to use for the convolution.
- * @param[in] depth_multiplier (Optional) Multiplier to apply to the input's depth in order to retrieve the output's depth. Defaults to 1.
- * @param[in] dilation (Optional) Dilation, in elements, across x and y. Defaults to (1, 1).
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info, unsigned int depth_multiplier = 1,
- const Size2D &dilation = Size2D(1U, 1U));
-
- // Inherited methods overridden:
- void run(const Window &window, const ThreadInfo &info) override;
- BorderSize border_size() const override;
-
-private:
- template < typename T, typename TW, int S, typename std::enable_if < std::is_same<T, float>::value
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
- || std::is_same<T, float16_t>::value
-#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
- ,
- int >::type = 0 >
- void run_depthwise(const Window &window, bool has_biases);
-
- template < typename T, typename TW, int S, REQUIRES_TA(std::is_same<T, uint8_t>::value || std::is_same<T, int8_t>::value) >
- void run_depthwise(const Window &window, bool has_biases);
-
- /** Common signature for all the specialised depthwise convolution native functions
- *
- * @param[in] window Region on which to execute the kernel.
- */
- using DepthwiseFunctionPtr = void (NEDepthwiseConvolutionLayerNativeKernel::*)(const Window &window, bool has_biases);
-
- DepthwiseFunctionPtr _func;
- BorderSize _border_size;
- const ITensor *_input;
- const ITensor *_weights;
- const ITensor *_biases;
- ITensor *_output;
- PadStrideInfo _conv_info;
- unsigned int _depth_multiplier;
- Size2D _dilation;
- std::vector<int> _output_multiplier;
- std::vector<int> _output_shift;
- bool _has_biases;
-};
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_NEDEPTHWISECONVOLUTIONLAYERNATIVEKERNEL_H */
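
A sketch for the NHWC-only native kernel, with biases and the [IFM, W, H] weight layout documented above; in ACL's dimension ordering an NHWC shape is written (C, W, H), and the remaining names and shapes are illustrative:

#include "arm_compute/core/NEON/kernels/NEDepthwiseConvolutionLayerNativeKernel.h"
#include "arm_compute/runtime/NEON/NEScheduler.h"
#include "arm_compute/runtime/Tensor.h"

using namespace arm_compute;

void depthwise_native_nhwc()
{
    TensorInfo in_info(TensorShape(8U, 32U, 32U), 1, DataType::F32);  // (C, W, H)
    TensorInfo w_info(TensorShape(8U, 3U, 3U), 1, DataType::F32);     // [IFM, W, H]
    TensorInfo b_info(TensorShape(8U), 1, DataType::F32);             // [IFM]
    TensorInfo out_info(TensorShape(8U, 32U, 32U), 1, DataType::F32);
    in_info.set_data_layout(DataLayout::NHWC);
    w_info.set_data_layout(DataLayout::NHWC);
    out_info.set_data_layout(DataLayout::NHWC);

    Tensor in, weights, biases, out;
    in.allocator()->init(in_info);
    weights.allocator()->init(w_info);
    biases.allocator()->init(b_info);
    out.allocator()->init(out_info);

    NEDepthwiseConvolutionLayerNativeKernel dw;
    dw.configure(&in, &weights, &biases, &out, PadStrideInfo(1, 1, 1, 1));

    in.allocator()->allocate();
    weights.allocator()->allocate();
    biases.allocator()->allocate();
    out.allocator()->allocate();
    NEScheduler::get().schedule(&dw, Window::DimY);
}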
diff --git a/arm_compute/core/NEON/kernels/NEDequantizationLayerKernel.h b/arm_compute/core/NEON/kernels/NEDequantizationLayerKernel.h
deleted file mode 100644
index 3792fb3bd7..0000000000
--- a/arm_compute/core/NEON/kernels/NEDequantizationLayerKernel.h
+++ /dev/null
@@ -1,76 +0,0 @@
-/*
- * Copyright (c) 2017-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_NEDEQUANTIZATIONLAYERKERNEL_H
-#define ARM_COMPUTE_NEDEQUANTIZATIONLAYERKERNEL_H
-
-#include "arm_compute/core/NEON/INEKernel.h"
-
-namespace arm_compute
-{
-class ITensor;
-
-/** Interface for the dequantization layer kernel. */
-class NEDequantizationLayerKernel : public INEKernel
-{
-public:
- const char *name() const override
- {
- return "NEDequantizationLayerKernel";
- }
- /** Default constructor */
- NEDequantizationLayerKernel();
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- NEDequantizationLayerKernel(const NEDequantizationLayerKernel &) = delete;
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- NEDequantizationLayerKernel &operator=(const NEDequantizationLayerKernel &) = delete;
- /** Default Move Constructor. */
- NEDequantizationLayerKernel(NEDequantizationLayerKernel &&) = default;
- /** Default move assignment operator */
- NEDequantizationLayerKernel &operator=(NEDequantizationLayerKernel &&) = default;
- /** Default destructor */
- ~NEDequantizationLayerKernel() = default;
- /** Set input, output tensors.
- *
- * @param[in] input Source tensor. Data type supported: QASYMM8/QSYMM8_PER_CHANNEL/QSYMM8/QSYMM16.
- * @param[out] output Destination tensor with the same dimensions of input. Data type supported: F16/F32.
- */
- void configure(const ITensor *input, ITensor *output);
- /** Static function to check if given info will lead to a valid configuration of @ref NEDequantizationLayerKernel
- *
- * @param[in] input Input tensor info. Data types supported: QASYMM8/QSYMM8_PER_CHANNEL/QSYMM8/QSYMM16.
- * @param[in] output Output tensor info. Data types supported: F16/F32.
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *input, const ITensorInfo *output);
-
- // Inherited methods overridden:
- void run(const Window &window, const ThreadInfo &info) override;
-
-private:
- const ITensor *_input;
- ITensor *_output;
-};
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_NEDEQUANTIZATIONLAYERKERNEL_H */
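
A sketch of dequantizing a QASYMM8 tensor to F32, where each value maps to scale * (q - offset); include path, helper name and the quantization parameters are illustrative:

#include "arm_compute/core/Error.h"
#include "arm_compute/core/NEON/kernels/NEDequantizationLayerKernel.h"
#include "arm_compute/runtime/NEON/NEScheduler.h"
#include "arm_compute/runtime/Tensor.h"

using namespace arm_compute;

void dequantize_to_f32()
{
    Tensor q_in, f_out;
    q_in.allocator()->init(TensorInfo(TensorShape(64U, 64U), 1, DataType::QASYMM8,
                                      QuantizationInfo(0.1f, 128)));
    f_out.allocator()->init(TensorInfo(TensorShape(64U, 64U), 1, DataType::F32));

    NEDequantizationLayerKernel deq;
    ARM_COMPUTE_ERROR_THROW_ON(NEDequantizationLayerKernel::validate(q_in.info(), f_out.info()));
    deq.configure(&q_in, &f_out);

    q_in.allocator()->allocate();
    f_out.allocator()->allocate();
    NEScheduler::get().schedule(&deq, Window::DimY);
}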
diff --git a/arm_compute/core/NEON/kernels/NEDerivativeKernel.h b/arm_compute/core/NEON/kernels/NEDerivativeKernel.h
deleted file mode 100644
index 20aee9b5ce..0000000000
--- a/arm_compute/core/NEON/kernels/NEDerivativeKernel.h
+++ /dev/null
@@ -1,98 +0,0 @@
-/*
- * Copyright (c) 2016-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_NEDERIVATIVEKERNEL_H
-#define ARM_COMPUTE_NEDERIVATIVEKERNEL_H
-
-#include "arm_compute/core/NEON/INEKernel.h"
-
-namespace arm_compute
-{
-class ITensor;
-
-/** Interface for the kernel to run the derivative along the X/Y directions on a tensor.
- *
- */
-class NEDerivativeKernel : public INEKernel
-{
-public:
- const char *name() const override
- {
- return "NEDerivativeKernel";
- }
- /** Default constructor */
- NEDerivativeKernel();
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- NEDerivativeKernel(const NEDerivativeKernel &) = delete;
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- NEDerivativeKernel &operator=(const NEDerivativeKernel &) = delete;
- /** Allow instances of this class to be moved */
- NEDerivativeKernel(NEDerivativeKernel &&) = default;
- /** Allow instances of this class to be moved */
- NEDerivativeKernel &operator=(NEDerivativeKernel &&) = default;
- /** Initialise the kernel's sources, destination and border
- *
- * @note At least one of output_x or output_y must be set
- *
- * @param[in] input Source tensor. Data type supported: U8.
- * @param[out] output_x (Optional) Destination tensor for the X gradient. Data type supported: S16.
- * @param[out] output_y (Optional) Destination tensor for the Y gradient. Data type supported: S16.
- * @param[in] border_undefined True if the border mode is undefined. False if it's replicate or constant.
- */
- void configure(const ITensor *input, ITensor *output_x, ITensor *output_y, bool border_undefined);
-
- // Inherited methods overridden:
- void run(const Window &window, const ThreadInfo &info) override;
- BorderSize border_size() const override;
-
-private:
- /** Function to perform derivative along the X direction on the given window
- *
- * @param[in] window Region on which to execute the kernel
- */
- void derivative_x(const Window &window);
- /** Function to perform derivative along the Y direction on the given window
- *
- * @param[in] window Region on which to execute the kernel
- */
- void derivative_y(const Window &window);
- /** Function to perform derivative along the X and Y direction on the given window
- *
- * @param[in] window Region on which to execute the kernel
- */
- void derivative_xy(const Window &window);
- /** Common signature for all the specialised derivative functions
- *
- * @param[in] window Region on which to execute the kernel.
- */
- using DerivativeFunction = void (NEDerivativeKernel::*)(const Window &window);
- /** Derivative function to use for the particular tensor types passed to configure() */
- DerivativeFunction _func;
-
-private:
- const ITensor *_input; /**< Input tensor */
- ITensor *_output_x; /**< Output tensor - Derivative along the X direction */
- ITensor *_output_y; /**< Output tensor - Derivative along the Y direction */
-};
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_NEDERIVATIVEKERNEL_H */
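
A sketch of driving the derivative kernel above, under the documented U8 input / S16 gradient contract. At least one of the two gradient outputs must be set (pass nullptr to skip one); shapes are invented for the example, and border_size() tells the caller how much border the kernel reads.

#include "arm_compute/core/NEON/kernels/NEDerivativeKernel.h"
#include "arm_compute/runtime/NEON/NEScheduler.h"
#include "arm_compute/runtime/Tensor.h"

using namespace arm_compute;

int main()
{
    Tensor src, gx, gy;
    src.allocator()->init(TensorInfo(TensorShape(64U, 64U), 1, DataType::U8));
    gx.allocator()->init(TensorInfo(TensorShape(64U, 64U), 1, DataType::S16));
    gy.allocator()->init(TensorInfo(TensorShape(64U, 64U), 1, DataType::S16));

    NEDerivativeKernel kernel;
    kernel.configure(&src, &gx, &gy, /* border_undefined = */ true);

    src.allocator()->allocate();
    gx.allocator()->allocate();
    gy.allocator()->allocate();

    const BorderSize border = kernel.border_size(); // border elements the kernel needs around the window
    (void)border;
    NEScheduler::get().schedule(&kernel, Window::DimY);
    return 0;
}
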
diff --git a/arm_compute/core/NEON/kernels/NEDilateKernel.h b/arm_compute/core/NEON/kernels/NEDilateKernel.h
deleted file mode 100644
index 00a954d958..0000000000
--- a/arm_compute/core/NEON/kernels/NEDilateKernel.h
+++ /dev/null
@@ -1,53 +0,0 @@
-/*
- * Copyright (c) 2016-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_NEDILATEKERNEL_H
-#define ARM_COMPUTE_NEDILATEKERNEL_H
-
-#include "arm_compute/core/NEON/INESimpleKernel.h"
-
-namespace arm_compute
-{
-class ITensor;
-
-/** Interface for the kernel to perform boolean image dilation */
-class NEDilateKernel : public INESimpleKernel
-{
-public:
- const char *name() const override
- {
- return "NEDilateKernel";
- }
- /** Set the source, destination and border mode of the kernel
- *
- * @param[in] input Source tensor. Data type supported: U8
- * @param[out] output Destination tensor. Data type supported: U8
- * @param[in] border_undefined True if the border mode is undefined. False if it's replicate or constant.
- */
- void configure(const ITensor *input, ITensor *output, bool border_undefined);
- // Inherited methods overridden:
- void run(const Window &window, const ThreadInfo &info) override;
- BorderSize border_size() const override;
-};
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_NEDILATEKERNEL_H */
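
The dilate kernel is an INESimpleKernel with a single configure() overload; NEErodeKernel further down exposes the identical interface, so one sketch covers both. Shapes are assumed, and with border_undefined = false the caller is expected to have filled the border beforehand (for example with NEFillBorderKernel, sketched later in this diff).

#include "arm_compute/core/NEON/kernels/NEDilateKernel.h"
#include "arm_compute/runtime/NEON/NEScheduler.h"
#include "arm_compute/runtime/Tensor.h"

using namespace arm_compute;

int main()
{
    Tensor src, dst;
    src.allocator()->init(TensorInfo(TensorShape(64U, 64U), 1, DataType::U8));
    dst.allocator()->init(TensorInfo(TensorShape(64U, 64U), 1, DataType::U8));

    NEDilateKernel dilate;
    dilate.configure(&src, &dst, /* border_undefined = */ false);

    src.allocator()->allocate();
    dst.allocator()->allocate();
    NEScheduler::get().schedule(&dilate, Window::DimY);
    return 0;
}
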
diff --git a/arm_compute/core/NEON/kernels/NEDirectConvolutionLayerKernel.h b/arm_compute/core/NEON/kernels/NEDirectConvolutionLayerKernel.h
deleted file mode 100644
index 4ae283d69d..0000000000
--- a/arm_compute/core/NEON/kernels/NEDirectConvolutionLayerKernel.h
+++ /dev/null
@@ -1,100 +0,0 @@
-/*
- * Copyright (c) 2017-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_NEDIRECTCONVOLUTIONLAYERKERNEL_H
-#define ARM_COMPUTE_NEDIRECTCONVOLUTIONLAYERKERNEL_H
-
-#include "arm_compute/core/NEON/INEKernel.h"
-
-namespace arm_compute
-{
-class ITensor;
-
-/** NEON interface for Direct Convolution Layer kernel */
-class NEDirectConvolutionLayerKernel : public INEKernel
-{
-public:
- const char *name() const override
- {
- return "NEDirectConvolutionLayerKernel";
- }
- /** Default constructor */
- NEDirectConvolutionLayerKernel();
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- NEDirectConvolutionLayerKernel(const NEDirectConvolutionLayerKernel &) = delete;
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- NEDirectConvolutionLayerKernel &operator=(const NEDirectConvolutionLayerKernel &) = delete;
- /** Allow instances of this class to be moved */
- NEDirectConvolutionLayerKernel(NEDirectConvolutionLayerKernel &&) = default;
- /** Allow instances of this class to be moved */
- NEDirectConvolutionLayerKernel &operator=(NEDirectConvolutionLayerKernel &&) = default;
- /** Default destructor */
- ~NEDirectConvolutionLayerKernel() = default;
- /** Set the input, weights, and output tensors.
- *
- * @note: DirectConvolution only works in the following configurations:
- * 1x1 convolution with stride_x = 1/2/3, stride_y = 1/2/3
- * 3x3 convolution with stride_x = 1/2/3, stride_y = 1/2/3
- *
- * @param[in] input The input tensor to convolve. 3 lower dimensions represent a single input [width, height, IFM],
- * while every optional dimension from 4 and above represent a batch of inputs. Data types supported: F16/F32.
- * @param[in] weights Weights tensor. Weights are 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM].
- * The 3rd dimension must be the same as the input's volume 3rd dimension.
- * Data type supported: Same as @p input.
- * @param[out] output Output tensor.
- * The 3rd dimension must be equal to the 4th dimension of the @p weights tensor. Data types supported: F16/F32
- * @param[in] conv_info Contains padding and stride information described in @ref PadStrideInfo.
- */
- void configure(const ITensor *input, const ITensor *weights, ITensor *output, const PadStrideInfo &conv_info);
- /** Static function to check if given info will lead to a valid configuration of @ref NEDirectConvolutionLayerKernel
- *
- * @param[in] input The input tensor to convolve. 3 lower dimensions represent a single input [width, height, IFM],
- * while every optional dimension from 4 and above represent a batch of inputs. Data types supported: F16/F32.
- * @param[in] weights Weights tensor. Weights are 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM].
- * The 3rd dimension must be the same as the input's volume 3rd dimension.
- * Data type supported: Same as @p input.
- * @param[in] output Output tensor.
- * The 3rd dimension must be equal to the 4th dimension of the @p weights tensor. Data types supported: F16/F32
- * @param[in] conv_info Contains padding and stride information described in @ref PadStrideInfo.
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *output, const PadStrideInfo &conv_info);
-
- // Inherited methods overridden:
- void run(const Window &window, const ThreadInfo &info) override;
- BorderSize border_size() const override;
-
-private:
- const ITensor *_input;
- const ITensor *_weights;
- ITensor *_output;
- PadStrideInfo _conv_info;
- BorderSize _border_size;
- unsigned int _kernel_size;
- unsigned int _num_weight_elems_read_per_row;
- unsigned int _num_elems_read_per_iteration;
- unsigned int _num_elems_written_per_iteration;
-};
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_NEDIRECTCONVOLUTIONLAYERKERNEL_H */
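
A configuration sketch for the direct convolution kernel above, respecting the 1x1/3x3 restriction from the doc comment. Shapes follow the documented [width, height, IFM] and [kernel_x, kernel_y, IFM, OFM] layouts and are invented for the example.

#include "arm_compute/core/NEON/kernels/NEDirectConvolutionLayerKernel.h"
#include "arm_compute/runtime/NEON/NEScheduler.h"
#include "arm_compute/runtime/Tensor.h"

using namespace arm_compute;

int main()
{
    Tensor src, weights, dst;
    src.allocator()->init(TensorInfo(TensorShape(32U, 32U, 16U), 1, DataType::F32));       // [W, H, IFM]
    weights.allocator()->init(TensorInfo(TensorShape(3U, 3U, 16U, 8U), 1, DataType::F32)); // [kx, ky, IFM, OFM]
    dst.allocator()->init(TensorInfo(TensorShape(32U, 32U, 8U), 1, DataType::F32));        // OFM = 8

    const PadStrideInfo conv_info(1, 1, 1, 1); // stride 1, pad 1: "same" output for a 3x3 kernel

    ARM_COMPUTE_ERROR_THROW_ON(NEDirectConvolutionLayerKernel::validate(src.info(), weights.info(), dst.info(), conv_info));

    NEDirectConvolutionLayerKernel kernel;
    kernel.configure(&src, &weights, &dst, conv_info);

    src.allocator()->allocate();
    weights.allocator()->allocate();
    dst.allocator()->allocate();
    NEScheduler::get().schedule(&kernel, Window::DimZ);
    return 0;
}
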
diff --git a/arm_compute/core/NEON/kernels/NEDirectConvolutionLayerOutputStageKernel.h b/arm_compute/core/NEON/kernels/NEDirectConvolutionLayerOutputStageKernel.h
deleted file mode 100644
index b7632d70c4..0000000000
--- a/arm_compute/core/NEON/kernels/NEDirectConvolutionLayerOutputStageKernel.h
+++ /dev/null
@@ -1,102 +0,0 @@
-/*
- * Copyright (c) 2017-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_NEDIRECTCONVOLUTIONLAYEROUTPUTSTAGEKERNEL_H
-#define ARM_COMPUTE_NEDIRECTCONVOLUTIONLAYEROUTPUTSTAGEKERNEL_H
-
-#include "arm_compute/core/KernelDescriptors.h"
-#include "arm_compute/core/NEON/INEKernel.h"
-
-namespace arm_compute
-{
-class ITensor;
-/** NEON kernel to accumulate the biases, if provided, or downscale in case of quantized input.
- *
- * @note We assume bias to be shared
- * @note For quantized computations (i.e. @p input of S32 type) the output data type for auto-initialization must be passed as part
- * of the @ref DirectConvolutionLayerOutputStageKernelInfo.
- */
-class NEDirectConvolutionLayerOutputStageKernel : public INEKernel
-{
-public:
- const char *name() const override
- {
- return "NEDirectConvolutionLayerOutputStageKernel";
- }
- /** Default constructor */
- NEDirectConvolutionLayerOutputStageKernel();
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- NEDirectConvolutionLayerOutputStageKernel(const NEDirectConvolutionLayerOutputStageKernel &) = delete;
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- NEDirectConvolutionLayerOutputStageKernel &operator=(const NEDirectConvolutionLayerOutputStageKernel &) = delete;
- /** Allow instances of this class to be moved */
- NEDirectConvolutionLayerOutputStageKernel(NEDirectConvolutionLayerOutputStageKernel &&) = default;
- /** Allow instances of this class to be moved */
- NEDirectConvolutionLayerOutputStageKernel &operator=(NEDirectConvolutionLayerOutputStageKernel &&) = default;
- /** Default destructor */
- ~NEDirectConvolutionLayerOutputStageKernel() = default;
- /** Set the accumulate buffer and the biases of the kernel.
- *
- * @param[in, out] input Input to add the bias to. If @p output is not specified then accumulation is done in-place.
- * Data type supported: F16/F32/S32
- * @param[in] bias (Optional) The shared bias tensor to add. It must be a 1D tensor. Data type supported: Same as @p input
- * @param[out] output (Optional) If the output tensor is specified the accumulation is done out-of-place. (Defaults to nullptr)
- * Note that in-place computation is only supported for F16/F32. For S32 this must not be nullptr.
- * Data type supported: F16/F32 or QASYMM8/QASYMM8_SIGNED if @p input is S32
- * @param[in] info (Optional) DirectConvolutionLayerOutputStageKernel descriptor metadata
- */
- void configure(ITensor *input, const ITensor *bias = nullptr, ITensor *output = nullptr,
- const DirectConvolutionLayerOutputStageKernelInfo &info = DirectConvolutionLayerOutputStageKernelInfo());
- /** Static function to check if given info will lead to a valid configuration of @ref NEDirectConvolutionLayerOutputStageKernel
- *
- * @param[in] input Input to add the bias to. If @p output is not specified then accumulation is done in-place.
- * Data type supported: F16/F32/S32
- * @param[in] bias (Optional) The shared bias tensor to add. It must be a 1D tensor. Data type supported: Same as @p input
- * @param[in] output (Optional) If the output tensor is specified the accumulation is done out-of-place. (Defaults to nullptr)
- * Note that in-place computation is only supported for F16/F32. For S32 this must not be nullptr.
- * Data type supported: F16/F32 or QASYMM8/QASYMM8_SIGNED if @p input is S32
- * @param[in] info (Optional) DirectConvolutionLayerOutputStageKernel descriptor metadata
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *input, const ITensorInfo *bias = nullptr, const ITensorInfo *output = nullptr,
- const DirectConvolutionLayerOutputStageKernelInfo &info = DirectConvolutionLayerOutputStageKernelInfo());
-
- // Inherited methods overridden:
- void run(const Window &window, const ThreadInfo &info) override;
-
-private:
- using OutputStageKernel = void(ITensor *input, const ITensor *bias, const Window &window, ITensor *output,
- int result_fixedpoint_multiplier, int result_shift, int result_offset_after_shift);
-
-private:
- OutputStageKernel *_func;
- ITensor *_input;
- const ITensor *_bias;
- ITensor *_output;
- int _result_fixedpoint_multiplier;
- int _result_shift;
- int _result_offset_after_shift;
-};
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_NEDIRECTCONVOLUTIONLAYEROUTPUTSTAGEKERNEL_H */
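
For the quantized path (S32 accumulators), the doc comment above requires an out-of-place output and an output data type in the descriptor. A sketch under those assumptions; the requantization values are placeholders, and the descriptor field names are taken from KernelDescriptors.h as this diff assumes them to be.

#include "arm_compute/core/KernelDescriptors.h"
#include "arm_compute/core/NEON/kernels/NEDirectConvolutionLayerOutputStageKernel.h"
#include "arm_compute/runtime/NEON/NEScheduler.h"
#include "arm_compute/runtime/Tensor.h"

using namespace arm_compute;

int main()
{
    Tensor acc, bias, dst;
    acc.allocator()->init(TensorInfo(TensorShape(32U, 32U, 8U), 1, DataType::S32));
    bias.allocator()->init(TensorInfo(TensorShape(8U), 1, DataType::S32)); // shared 1D bias, same type as input
    dst.allocator()->init(TensorInfo(TensorShape(32U, 32U, 8U), 1, DataType::QASYMM8, QuantizationInfo(0.25f, 128)));

    DirectConvolutionLayerOutputStageKernelInfo info{};
    info.result_fixedpoint_multiplier = 1073741824; // placeholder requantization parameters
    info.result_shift                 = 1;
    info.result_offset_after_shift    = 128;
    info.output_data_type             = DataType::QASYMM8;

    NEDirectConvolutionLayerOutputStageKernel kernel;
    kernel.configure(&acc, &bias, &dst, info); // S32 input: output must not be nullptr

    acc.allocator()->allocate();
    bias.allocator()->allocate();
    dst.allocator()->allocate();
    NEScheduler::get().schedule(&kernel, Window::DimY);
    return 0;
}
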
diff --git a/arm_compute/core/NEON/kernels/NEElementwiseOperationKernel.h b/arm_compute/core/NEON/kernels/NEElementwiseOperationKernel.h
deleted file mode 100644
index 61c25e1a2a..0000000000
--- a/arm_compute/core/NEON/kernels/NEElementwiseOperationKernel.h
+++ /dev/null
@@ -1,214 +0,0 @@
-/*
- * Copyright (c) 2018-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_NEELEMENTWISEOPERATIONKERNEL_H
-#define ARM_COMPUTE_NEELEMENTWISEOPERATIONKERNEL_H
-
-#include "arm_compute/core/NEON/INEKernel.h"
-#include "arm_compute/core/Types.h"
-
-namespace arm_compute
-{
-class ITensor;
-
-/** Interface for an element-wise operation kernel
- *
- * Element-wise operation is computed by:
- * @f[ output(x,y) = OP(input1(x,y), input2(x,y))@f]
- *
- */
-class NEElementwiseOperationKernel : public INEKernel
-{
-public:
- const char *name() const override
- {
- return "NEElementwiseOperationKernel";
- }
- /** Default constructor */
- NEElementwiseOperationKernel();
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- NEElementwiseOperationKernel(const NEElementwiseOperationKernel &) = delete;
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- NEElementwiseOperationKernel &operator=(const NEElementwiseOperationKernel &) = delete;
- /** Allow instances of this class to be moved */
- NEElementwiseOperationKernel(NEElementwiseOperationKernel &&) = default;
- /** Allow instances of this class to be moved */
- NEElementwiseOperationKernel &operator=(NEElementwiseOperationKernel &&) = default;
- /** Default destructor */
- ~NEElementwiseOperationKernel() = default;
-
- // Inherited methods overridden:
- void run(const Window &window, const ThreadInfo &info) override;
-
- /** Common signature for all the specialised arithmetic functions
- *
- * @param[in] input1 First tensor input. Data types supported: QASYMM8/S16/F16/S32/F32.
- * @param[in] input2 Second tensor input. Data types supported: Same as @p input1.
- * @param[in] output Output tensor. Data types supported: Dependent on subclass.
- * @param[in] window Region on which to execute the kernel.
- */
- using ElementwiseFunction = void(const ITensor *input1, const ITensor *input2, ITensor *output, const Window &window);
-
-protected:
- /** Validate the arguments passed to the kernel
- *
- * @param[in] input1 First tensor input. Data types supported: QASYMM8/S16/F16/S32/F32.
- * @param[in] input2 Second tensor input. Data types supported: Same as @p input1.
- * @param[in] output Output tensor. Data types supported: Dependent on subclass.
- */
- static Status validate_arguments_common(const ITensorInfo &input1, const ITensorInfo &input2, const ITensorInfo &output);
-
- /** Common configure function for element-wise operators with no additional options (e.g. Min, Max, SquaredDiff)
- *
- */
- void configure_common(const ITensor *input1, const ITensor *input2, ITensor *output);
-
- /** Function to use for the particular tensor types passed to configure() */
- std::function<void(const ITensor *input1, const ITensor *input2, ITensor *output, const Window &window)> _function;
-
- const ITensor *_input1;
- const ITensor *_input2;
- ITensor *_output;
-};
-
-class NEArithmeticOperationKernel : public NEElementwiseOperationKernel
-{
-public:
- /** Default constructor */
- NEArithmeticOperationKernel() = default;
-
- /** Configure the kernel with the given arguments
- *
- * @param[in] op Arithmetic operation to be executed.
- * @param[in] input1 First tensor input. Data types supported: QASYMM8/S16/F16/S32/F32.
- * @param[in] input2 Second tensor input. Data types supported: Same as @p input1.
- * @param[out] output Output tensor. Data types supported: Same as @p input1.
- */
- void configure(ArithmeticOperation op, const ITensor *input1, const ITensor *input2, ITensor *output);
-
- /** Static function to check if given info will lead to a valid configuration of @ref NEArithmeticOperationKernel
- *
- * @param[in] op Arithmetic operation to be executed.
- * @param[in] input1 First tensor input info. Data types supported: QASYMM8/S16/F16/S32/F32.
- * @param[in] input2 Second tensor input info. Data types supported: Same as @p input1.
- * @param[in] output Output tensor info. Data types supported: Same as @p input1.
- *
- * @return a Status
- */
- static Status validate(ArithmeticOperation op, const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output);
-
-protected:
- // Inherited methods overridden:
- static Status validate_arguments(const ITensorInfo &input1, const ITensorInfo &input2, const ITensorInfo &output);
-};
-
-class NEDivisionOperationKernel : public NEArithmeticOperationKernel
-{
-public:
- /** Default constructor */
- NEDivisionOperationKernel() = default;
-
- /** Configure the kernel with the given arguments
- *
- * @param[in] input1 First tensor input. Data types supported: F16/F32.
- * @param[in] input2 Second tensor input. Data types supported: Same as @p input1.
- * @param[out] output Output tensor. Data types supported: Same as @p input1.
- */
- void configure(const ITensor *input1, const ITensor *input2, ITensor *output);
-
- /** Static function to check if given info will lead to a valid configuration of @ref NEDivisionOperationKernel
- *
- * @param[in] input1 First tensor input info. Data types supported: F16/F32.
- * @param[in] input2 Second tensor input info. Data types supported: Same as @p input1.
- * @param[in] output Output tensor info. Data types supported: Same as @p input1.
- *
- * @return a Status
- */
- static Status validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output);
-
-protected:
- // Inherited methods overridden:
- static Status validate_arguments(const ITensorInfo &input1, const ITensorInfo &input2, const ITensorInfo &output);
-};
-
-class NEPowerOperationKernel : public NEArithmeticOperationKernel
-{
-public:
- /** Default constructor */
- NEPowerOperationKernel() = default;
-
- /** Configure the kernel with the given arguments
- *
- * @param[in] input1 First tensor input. Data types supported: F16/F32.
- * @param[in] input2 Second tensor input. Data types supported: Same as @p input1.
- * @param[out] output Output tensor. Data types supported: Same as @p input1.
- */
- void configure(const ITensor *input1, const ITensor *input2, ITensor *output);
-
- /** Static function to check if given info will lead to a valid configuration of @ref NEPowerOperationKernel
- *
- * @param[in] input1 First tensor input info. Data types supported: F16/F32.
- * @param[in] input2 Second tensor input info. Data types supported: Same as @p input1.
- * @param[in] output Output tensor info. Data types supported: Same as @p input1.
- *
- * @return a Status
- */
- static Status validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output);
-
-protected:
- // Inherited methods overridden:
- static Status validate_arguments(const ITensorInfo &input1, const ITensorInfo &input2, const ITensorInfo &output);
-};
-
-class NEComparisonOperationKernel : public NEElementwiseOperationKernel
-{
-public:
- /** Default constructor */
- NEComparisonOperationKernel() = default;
-
- /** Configure the kernel with the given arguments
- *
- * @param[in] op Comparison operation to be executed.
- * @param[in] input1 First tensor input. Data types supported: QASYMM8/QASYMM8_SIGNED/S16/F16/S32/F32.
- * @param[in] input2 Second tensor input. Data types supported: Same as @p input1.
- * @param[out] output Output tensor. Data types supported: U16/U32.
- */
- void configure(ComparisonOperation op, const ITensor *input1, const ITensor *input2, ITensor *output);
-
- /** Static function to check if given info will lead to a valid configuration of @ref NEComparisonOperationKernel
- *
- * @param[in] op Comparison operation to be executed.
- * @param[in] input1 First tensor input info. Data types supported: QASYMM8/QASYMM8_SIGNED/S16/F16/S32/F32.
- * @param[in] input2 Second tensor input info. Data types supported: Same as @p input1.
- * @param[in] output Output tensor info. Data types supported: U16/U32.
- *
- * @return a Status
- */
- static Status validate(ComparisonOperation op, const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output);
-
-protected:
- // Inherited methods overridden:
- static Status validate_arguments(const ITensorInfo &input1, const ITensorInfo &input2, const ITensorInfo &output);
-};
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_NEELEMENTWISEOPERATIONKERNEL_H */
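
These kernels share the NEElementwiseOperationKernel base; the subclasses differ only in the operation enum and supported types. A sketch exercising the arithmetic and comparison variants, with assumed shapes and operation values (ArithmeticOperation::ADD and ComparisonOperation::Greater).

#include "arm_compute/core/NEON/kernels/NEElementwiseOperationKernel.h"
#include "arm_compute/runtime/NEON/NEScheduler.h"
#include "arm_compute/runtime/Tensor.h"

using namespace arm_compute;

int main()
{
    Tensor a, b, sum, mask;
    a.allocator()->init(TensorInfo(TensorShape(16U, 16U), 1, DataType::F32));
    b.allocator()->init(TensorInfo(TensorShape(16U, 16U), 1, DataType::F32));
    sum.allocator()->init(TensorInfo(TensorShape(16U, 16U), 1, DataType::F32));
    mask.allocator()->init(TensorInfo(TensorShape(16U, 16U), 1, DataType::U16));

    NEArithmeticOperationKernel add_kernel;
    add_kernel.configure(ArithmeticOperation::ADD, &a, &b, &sum); // sum = a + b, element-wise

    NEComparisonOperationKernel cmp_kernel;
    cmp_kernel.configure(ComparisonOperation::Greater, &a, &b, &mask); // mask = (a > b)

    a.allocator()->allocate();
    b.allocator()->allocate();
    sum.allocator()->allocate();
    mask.allocator()->allocate();

    NEScheduler::get().schedule(&add_kernel, Window::DimY);
    NEScheduler::get().schedule(&cmp_kernel, Window::DimY);
    return 0;
}
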
diff --git a/arm_compute/core/NEON/kernels/NEElementwiseUnaryKernel.h b/arm_compute/core/NEON/kernels/NEElementwiseUnaryKernel.h
deleted file mode 100644
index 9a41cecf19..0000000000
--- a/arm_compute/core/NEON/kernels/NEElementwiseUnaryKernel.h
+++ /dev/null
@@ -1,103 +0,0 @@
-/*
- * Copyright (c) 2018-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_NEELEMENTWISEUNARYKERNEL_H
-#define ARM_COMPUTE_NEELEMENTWISEUNARYKERNEL_H
-
-#include "arm_compute/core/NEON/INEKernel.h"
-#include "arm_compute/core/Types.h"
-
-namespace arm_compute
-{
-class ITensor;
-
-/** Interface for an element-wise unary operation kernel
- *
- * Element-wise operation is computed by:
- * @f[ output(x) = OP(input(x))@f]
- *
- */
-class NEElementwiseUnaryKernel : public INEKernel
-{
-public:
- const char *name() const override
- {
- return "NEElementwiseUnaryKernel";
- }
- /** Default constructor */
- NEElementwiseUnaryKernel();
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- NEElementwiseUnaryKernel(const NEElementwiseUnaryKernel &) = delete;
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- NEElementwiseUnaryKernel &operator=(const NEElementwiseUnaryKernel &) = delete;
- /** Allow instances of this class to be moved */
- NEElementwiseUnaryKernel(NEElementwiseUnaryKernel &&) = default;
- /** Allow instances of this class to be moved */
- NEElementwiseUnaryKernel &operator=(NEElementwiseUnaryKernel &&) = default;
- /** Default destructor */
- ~NEElementwiseUnaryKernel() = default;
-
- /** Configure the kernel with the given arguments
- *
- * @param[in] op Element-wise unary operation to be executed.
- * @param[in] input Input tensor. Data types supported: F16/F32.
- * @param[out] output Output tensor. Data types supported: Same as @p input.
- */
- void configure(ElementWiseUnary op, const ITensor *input, ITensor *output);
-
- /** Static function to check if given info will lead to a valid configuration of @ref NEElementwiseUnaryKernel
- *
- * @param[in] op Element-wise unary operation to be executed.
- * @param[in] input Input tensor info. Data types supported: F16/F32.
- * @param[in] output Output tensor info. Data types supported: Same as @p input.
- *
- * @return a Status
- */
- static Status validate(ElementWiseUnary op, const ITensorInfo *input, const ITensorInfo *output);
-
- // Inherited methods overridden:
- void run(const Window &window, const ThreadInfo &info) override;
-
-private:
- /** Common signature for all the specialised unary functions
- *
- * @param[in] window Region on which to execute the kernel.
- */
- using ElementwiseUnaryPtr = void (NEElementwiseUnaryKernel::*)(const Window &window);
-
- /** Template function to run elementwise unary operation
- *
- * @tparam ScalarType Scalar datatype
- *
- * @param[in] window Region on which to execute the kernel. (Must be a valid region of the window returned by window()).
- */
- template <typename ScalarType>
- void elementwise_op(const Window &window);
-
- ElementwiseUnaryPtr _func;
- const ITensor *_input;
- ITensor *_output;
- ElementWiseUnary _op;
-};
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_NEELEMENTWISEUNARYKERNEL_H */
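
The unary kernel follows the same pattern with a single input. A sketch computing a reciprocal square root, assuming ElementWiseUnary::RSQRT is among the supported operation values and using invented shapes.

#include "arm_compute/core/NEON/kernels/NEElementwiseUnaryKernel.h"
#include "arm_compute/runtime/NEON/NEScheduler.h"
#include "arm_compute/runtime/Tensor.h"

using namespace arm_compute;

int main()
{
    Tensor src, dst;
    src.allocator()->init(TensorInfo(TensorShape(16U, 16U), 1, DataType::F32));
    dst.allocator()->init(TensorInfo(TensorShape(16U, 16U), 1, DataType::F32));

    ARM_COMPUTE_ERROR_THROW_ON(NEElementwiseUnaryKernel::validate(ElementWiseUnary::RSQRT, src.info(), dst.info()));

    NEElementwiseUnaryKernel kernel;
    kernel.configure(ElementWiseUnary::RSQRT, &src, &dst); // dst = 1 / sqrt(src), element-wise

    src.allocator()->allocate();
    dst.allocator()->allocate();
    NEScheduler::get().schedule(&kernel, Window::DimY);
    return 0;
}
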
diff --git a/arm_compute/core/NEON/kernels/NEErodeKernel.h b/arm_compute/core/NEON/kernels/NEErodeKernel.h
deleted file mode 100644
index e3fcc2847e..0000000000
--- a/arm_compute/core/NEON/kernels/NEErodeKernel.h
+++ /dev/null
@@ -1,53 +0,0 @@
-/*
- * Copyright (c) 2016-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_NEERODEKERNEL_H
-#define ARM_COMPUTE_NEERODEKERNEL_H
-
-#include "arm_compute/core/NEON/INESimpleKernel.h"
-
-namespace arm_compute
-{
-class ITensor;
-
-/** Interface for the kernel to perform boolean image erosion */
-class NEErodeKernel : public INESimpleKernel
-{
-public:
- const char *name() const override
- {
- return "NEErodeKernel";
- }
- /** Set the source, destination and border mode of the kernel
- *
- * @param[in] input Source tensor. Data type supported: U8
- * @param[out] output Destination tensor. Data type supported: U8
- * @param[in] border_undefined True if the border mode is undefined. False if it's replicate or constant.
- */
- void configure(const ITensor *input, ITensor *output, bool border_undefined);
- // Inherited methods overridden:
- void run(const Window &window, const ThreadInfo &info) override;
- BorderSize border_size() const override;
-};
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_NEERODEKERNEL_H */
diff --git a/arm_compute/core/NEON/kernels/NEFFTDigitReverseKernel.h b/arm_compute/core/NEON/kernels/NEFFTDigitReverseKernel.h
deleted file mode 100644
index ed17e3b8d5..0000000000
--- a/arm_compute/core/NEON/kernels/NEFFTDigitReverseKernel.h
+++ /dev/null
@@ -1,93 +0,0 @@
-/*
- * Copyright (c) 2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_NEFFTDIGITREVERSEKERNEL_H
-#define ARM_COMPUTE_NEFFTDIGITREVERSEKERNEL_H
-
-#include "arm_compute/core/KernelDescriptors.h"
-#include "arm_compute/core/NEON/INEKernel.h"
-
-namespace arm_compute
-{
-// Forward declarations
-class ITensor;
-
-/** Interface for the digit reverse operation kernel. */
-class NEFFTDigitReverseKernel : public INEKernel
-{
-public:
- const char *name() const override
- {
- return "NEFFTDigitReverseKernel";
- }
- /** Constructor */
- NEFFTDigitReverseKernel();
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- NEFFTDigitReverseKernel(const NEFFTDigitReverseKernel &) = delete;
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- NEFFTDigitReverseKernel &operator=(const NEFFTDigitReverseKernel &) = delete;
- /** Default Move Constructor. */
- NEFFTDigitReverseKernel(NEFFTDigitReverseKernel &&) = default;
- /** Default move assignment operator */
- NEFFTDigitReverseKernel &operator=(NEFFTDigitReverseKernel &&) = default;
- /** Default destructor */
- ~NEFFTDigitReverseKernel() = default;
- /** Set the input and output tensors.
- *
- * @param[in] input Source tensor. Data types supported: F32. Number of channels supported: 1 (real tensor) or 2 (complex tensor).
- * @param[out] output Destination tensor. Data type supported: same as @p input. Number of channels supported: 2 (complex tensor).
- * @param[in] idx Digit reverse index tensor. Data type supported: U32
- * @param[in] config Kernel configuration.
- */
- void configure(const ITensor *input, ITensor *output, const ITensor *idx, const FFTDigitReverseKernelInfo &config);
-
- /** Static function to check if given info will lead to a valid configuration of @ref NEFFTDigitReverseKernel
- *
- * @param[in] input Source tensor info. Data types supported: F32. Number of channels supported: 1 (real tensor) or 2 (complex tensor).
- * @param[in] output Destination tensor info. Data type supported: same as @p input. Number of channels supported: 2 (complex tensor).
- * @param[in] idx Digit reverse index tensor info. Data type supported: U32
- * @param[in] config Kernel configuration
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *input, const ITensorInfo *output, const ITensorInfo *idx, const FFTDigitReverseKernelInfo &config);
-
- // Inherited methods overridden:
- void run(const Window &window, const ThreadInfo &info) override;
-
-private:
- using NEFFTDigitReverseKernelFunctionPtr = void (NEFFTDigitReverseKernel::*)(const Window &window);
-
- template <bool is_input_complex, bool is_conj>
- void digit_reverse_kernel_axis_0(const Window &window);
-
- template <bool is_input_complex, bool is_conj>
- void digit_reverse_kernel_axis_1(const Window &window);
-
- NEFFTDigitReverseKernelFunctionPtr _func;
- const ITensor *_input;
- ITensor *_output;
- const ITensor *_idx;
-};
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_NEFFTDIGITREVERSEKERNEL_H */
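
The FFT front-end reorders elements through a precomputed U32 index tensor. A configuration sketch assuming a 1D transform along axis 0 on a complex (2-channel) input; the descriptor field names (axis, conjugate) are assumed from KernelDescriptors.h, and populating the index tensor with digit-reversed indices is left to the caller.

#include "arm_compute/core/KernelDescriptors.h"
#include "arm_compute/core/NEON/kernels/NEFFTDigitReverseKernel.h"
#include "arm_compute/runtime/NEON/NEScheduler.h"
#include "arm_compute/runtime/Tensor.h"

using namespace arm_compute;

int main()
{
    Tensor src, dst, idx;
    src.allocator()->init(TensorInfo(TensorShape(8U), 2, DataType::F32)); // 2 channels = complex
    dst.allocator()->init(TensorInfo(TensorShape(8U), 2, DataType::F32));
    idx.allocator()->init(TensorInfo(TensorShape(8U), 1, DataType::U32)); // digit-reversed index table

    FFTDigitReverseKernelInfo cfg{};
    cfg.axis      = 0;     // dimension to reorder
    cfg.conjugate = false; // assumed field name

    NEFFTDigitReverseKernel kernel;
    kernel.configure(&src, &dst, &idx, cfg);

    src.allocator()->allocate();
    dst.allocator()->allocate();
    idx.allocator()->allocate();
    NEScheduler::get().schedule(&kernel, Window::DimY);
    return 0;
}
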
diff --git a/arm_compute/core/NEON/kernels/NEFFTRadixStageKernel.h b/arm_compute/core/NEON/kernels/NEFFTRadixStageKernel.h
deleted file mode 100644
index 6e16fca0fb..0000000000
--- a/arm_compute/core/NEON/kernels/NEFFTRadixStageKernel.h
+++ /dev/null
@@ -1,103 +0,0 @@
-/*
- * Copyright (c) 2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_NEFFTRADIXSTAGEKERNEL_H
-#define ARM_COMPUTE_NEFFTRADIXSTAGEKERNEL_H
-
-#include "arm_compute/core/KernelDescriptors.h"
-#include "arm_compute/core/NEON/INEKernel.h"
-
-#include <arm_neon.h>
-#include <set>
-
-namespace arm_compute
-{
-// Forward declarations
-class ITensor;
-
-/** Interface for the FFT kernel. */
-class NEFFTRadixStageKernel : public INEKernel
-{
-public:
- const char *name() const override
- {
- return "NEFFTRadixStageKernel";
- }
- /** Constructor */
- NEFFTRadixStageKernel();
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- NEFFTRadixStageKernel(const NEFFTRadixStageKernel &) = delete;
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- NEFFTRadixStageKernel &operator=(const NEFFTRadixStageKernel &) = delete;
- /** Default Move Constructor. */
- NEFFTRadixStageKernel(NEFFTRadixStageKernel &&) = default;
- /** Default move assignment operator */
- NEFFTRadixStageKernel &operator=(NEFFTRadixStageKernel &&) = default;
- /** Default destructor */
- ~NEFFTRadixStageKernel() = default;
- /** Set the input and output tensors.
- *
- * @note If the output tensor is nullptr, the FFT will be performed in-place
- *
- * @param[in,out] input Source tensor. Data types supported: F32. Number of channels supported: 2 (complex tensor).
- * @param[out] output Destination tensor. Data type supported: same as @p input. Number of channels supported: same as @p input.
- * @param[in] config FFT descriptor metadata.
- */
- void configure(ITensor *input, ITensor *output, const FFTRadixStageKernelInfo &config);
- /** Static function to check if given info will lead to a valid configuration of @ref NEFFTRadixStageKernel
- *
- * @param[in] input Source tensor info. Data types supported: F32. Number of channels supported: 2 (complex tensor).
- * @param[in] output Destination tensor info. Data type supported: same as @p input. Number of channels supported: same as @p input.
- * @param[in] config FFT descriptor metadata.
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *input, const ITensorInfo *output, const FFTRadixStageKernelInfo &config);
- /** Returns the radix values supported by the FFT kernel
- *
- * @return A set of supported radix values
- */
- static std::set<unsigned int> supported_radix();
-
- // Inherited methods overridden:
- void run(const Window &window, const ThreadInfo &info) override;
-
-private:
- ITensor *_input;
- ITensor *_output;
- bool _run_in_place;
- unsigned int _Nx;
- unsigned int _axis;
- unsigned int _radix;
-
- void set_radix_stage_axis0(const FFTRadixStageKernelInfo &config);
- void set_radix_stage_axis1(const FFTRadixStageKernelInfo &config);
-
- using FFTFunctionPointerAxis0 = std::function<void(float *, float *, unsigned int, unsigned int, const float32x2_t &, unsigned int)>;
- using FFTFunctionPointerAxis1 = std::function<void(float *, float *, unsigned int, unsigned int, const float32x2_t &, unsigned int, unsigned int)>;
-
- FFTFunctionPointerAxis0 _func_0;
- FFTFunctionPointerAxis1 _func_1;
-};
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_NEFFTRADIXSTAGEKERNEL_H */
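
Each radix stage performs one butterfly pass, and supported_radix() lets the caller factor the transform length before configuring stages. A single-stage sketch for an 8-point transform treated as one radix-8 pass along axis 0; the descriptor field names (axis, radix, Nx, is_first_stage) are assumed from KernelDescriptors.h.

#include "arm_compute/core/KernelDescriptors.h"
#include "arm_compute/core/NEON/kernels/NEFFTRadixStageKernel.h"
#include "arm_compute/runtime/NEON/NEScheduler.h"
#include "arm_compute/runtime/Tensor.h"

using namespace arm_compute;

int main()
{
    Tensor buf;
    buf.allocator()->init(TensorInfo(TensorShape(8U), 2, DataType::F32)); // complex, digit-reversed input

    const std::set<unsigned int> radixes = NEFFTRadixStageKernel::supported_radix();
    (void)radixes; // a real caller would pick factors of N from this set

    FFTRadixStageKernelInfo cfg{};
    cfg.axis           = 0;
    cfg.radix          = 8;
    cfg.Nx             = 1;    // butterfly span at the first stage
    cfg.is_first_stage = true; // assumed field names

    NEFFTRadixStageKernel kernel;
    kernel.configure(&buf, nullptr, cfg); // nullptr output runs in-place, per the note above

    buf.allocator()->allocate();
    NEScheduler::get().schedule(&kernel, Window::DimY);
    return 0;
}
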
diff --git a/arm_compute/core/NEON/kernels/NEFFTScaleKernel.h b/arm_compute/core/NEON/kernels/NEFFTScaleKernel.h
deleted file mode 100644
index 72963fa56d..0000000000
--- a/arm_compute/core/NEON/kernels/NEFFTScaleKernel.h
+++ /dev/null
@@ -1,84 +0,0 @@
-/*
- * Copyright (c) 2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_NEFFTSCALEKERNEL_H
-#define ARM_COMPUTE_NEFFTSCALEKERNEL_H
-
-#include "arm_compute/core/NEON/INEKernel.h"
-
-#include "arm_compute/core/KernelDescriptors.h"
-
-namespace arm_compute
-{
-// Forward declarations
-class ITensor;
-
-/** Interface for the inverse FFT scale kernel. */
-class NEFFTScaleKernel : public INEKernel
-{
-public:
- const char *name() const override
- {
- return "NEFFTScaleKernel";
- }
- /** Constructor */
- NEFFTScaleKernel();
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- NEFFTScaleKernel(const NEFFTScaleKernel &) = delete;
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- NEFFTScaleKernel &operator=(const NEFFTScaleKernel &) = delete;
- /** Default Move Constructor. */
- NEFFTScaleKernel(NEFFTScaleKernel &&) = default;
- /** Default move assignment operator */
- NEFFTScaleKernel &operator=(NEFFTScaleKernel &&) = default;
- /** Default destructor */
- ~NEFFTScaleKernel() = default;
- /** Set the input and output tensors.
- *
- * @param[in,out] input Source tensor. Data types supported: F32. Number of channels supported: 2 (complex tensor).
- * @param[out] output Destination tensor. Data type supported: same as @p input. Number of channels supported: 1 (real tensor) or 2 (complex tensor).
- * @param[in] config Kernel configuration
- */
- void configure(ITensor *input, ITensor *output, const FFTScaleKernelInfo &config);
- /** Static function to check if given info will lead to a valid configuration of @ref NEFFTScaleKernel
- *
- * @param[in] input Source tensor info. Data types supported: F32. Number of channels supported: 2 (complex tensor).
- * @param[in] output Destination tensor info. Data type supported: same as @p input. Number of channels supported: 1 (real tensor) or 2 (complex tensor).
- * @param[in] config Kernel configuration
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *input, const ITensorInfo *output, const FFTScaleKernelInfo &config);
-
- // Inherited methods overridden:
- void run(const Window &window, const ThreadInfo &info) override;
-
-private:
- ITensor *_input;
- ITensor *_output;
- float _scale;
- bool _run_in_place;
- bool _is_conj;
-};
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_NEFFTSCALEKERNEL_H */
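
The inverse FFT conventionally divides by the transform length as a final pass. A sketch scaling an 8-point complex buffer, assuming the descriptor field names (scale, conjugate) from KernelDescriptors.h and that a nullptr output selects in-place operation, mirroring the radix-stage kernel above.

#include "arm_compute/core/KernelDescriptors.h"
#include "arm_compute/core/NEON/kernels/NEFFTScaleKernel.h"
#include "arm_compute/runtime/NEON/NEScheduler.h"
#include "arm_compute/runtime/Tensor.h"

using namespace arm_compute;

int main()
{
    Tensor buf;
    buf.allocator()->init(TensorInfo(TensorShape(8U), 2, DataType::F32)); // complex buffer

    FFTScaleKernelInfo cfg{};
    cfg.scale     = 8.f;   // divide by N for the inverse transform
    cfg.conjugate = false; // assumed field name

    NEFFTScaleKernel kernel;
    kernel.configure(&buf, nullptr, cfg); // assumed in-place form

    buf.allocator()->allocate();
    NEScheduler::get().schedule(&kernel, Window::DimY);
    return 0;
}
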
diff --git a/arm_compute/core/NEON/kernels/NEFastCornersKernel.h b/arm_compute/core/NEON/kernels/NEFastCornersKernel.h
deleted file mode 100644
index c0196c711a..0000000000
--- a/arm_compute/core/NEON/kernels/NEFastCornersKernel.h
+++ /dev/null
@@ -1,76 +0,0 @@
-/*
- * Copyright (c) 2016-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_NEFASTCORNERSKERNEL_H
-#define ARM_COMPUTE_NEFASTCORNERSKERNEL_H
-
-#include "arm_compute/core/NEON/INEKernel.h"
-#include "arm_compute/core/Types.h"
-
-#include <cstdint>
-
-namespace arm_compute
-{
-class ITensor;
-using IImage = ITensor;
-
-/** NEON kernel to perform fast corners */
-class NEFastCornersKernel : public INEKernel
-{
-public:
- const char *name() const override
- {
- return "NEFastCornersKernel";
- }
- /** Constructor */
- NEFastCornersKernel();
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- NEFastCornersKernel(const NEFastCornersKernel &) = delete;
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- NEFastCornersKernel &operator=(const NEFastCornersKernel &) = delete;
- /** Allow instances of this class to be moved */
- NEFastCornersKernel(NEFastCornersKernel &&) = default;
- /** Allow instances of this class to be moved */
- NEFastCornersKernel &operator=(NEFastCornersKernel &&) = default;
- /** Initialise the kernel.
- *
- * @param[in] input Source image. Data type supported: U8.
- * @param[out] output Output image. Data type supported: U8.
- * @param[in] threshold Threshold on difference between intensity of the central pixel and pixels on Bresenham's circle of radius 3.
- * @param[in] non_max_suppression True if non-maxima suppression is applied, false otherwise.
- * @param[in] border_undefined True if the border mode is undefined. False if it's replicate or constant.
- */
- void configure(const IImage *input, IImage *output, uint8_t threshold, bool non_max_suppression, bool border_undefined);
-
- // Inherited methods overridden:
- void run(const Window &window, const ThreadInfo &info) override;
- BorderSize border_size() const override;
-
-private:
- const IImage *_input; /**< source image */
- IImage *_output; /**< intermediate results */
- uint8_t _threshold; /**< threshold on difference between intensity */
- bool _non_max_suppression; /**< true if non-maxima suppression is applied in the next stage */
-};
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_NEFASTCORNERSKERNEL_H */
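
FAST corner detection emits a response image that a later stage (for example NEFillArrayKernel, the next file in this diff) turns into a keypoint list. A sketch with an assumed threshold and image size.

#include "arm_compute/core/NEON/kernels/NEFastCornersKernel.h"
#include "arm_compute/runtime/NEON/NEScheduler.h"
#include "arm_compute/runtime/Tensor.h"

using namespace arm_compute;

int main()
{
    Tensor img, response;
    img.allocator()->init(TensorInfo(TensorShape(128U, 128U), 1, DataType::U8));
    response.allocator()->init(TensorInfo(TensorShape(128U, 128U), 1, DataType::U8));

    NEFastCornersKernel kernel;
    kernel.configure(&img, &response, /* threshold = */ 20,
                     /* non_max_suppression = */ true, /* border_undefined = */ true);

    img.allocator()->allocate();
    response.allocator()->allocate();
    NEScheduler::get().schedule(&kernel, Window::DimY);
    return 0;
}
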
diff --git a/arm_compute/core/NEON/kernels/NEFillArrayKernel.h b/arm_compute/core/NEON/kernels/NEFillArrayKernel.h
deleted file mode 100644
index e45caec34b..0000000000
--- a/arm_compute/core/NEON/kernels/NEFillArrayKernel.h
+++ /dev/null
@@ -1,77 +0,0 @@
-/*
- * Copyright (c) 2016-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_NEFILLARRAYKERNEL_H
-#define ARM_COMPUTE_NEFILLARRAYKERNEL_H
-
-#include "arm_compute/core/IArray.h"
-#include "arm_compute/core/NEON/INEKernel.h"
-#include "arm_compute/core/Types.h"
-
-#include <cstdint>
-
-namespace arm_compute
-{
-class ITensor;
-using IImage = ITensor;
-
-/** This kernel adds all texels greater than or equal to the threshold value to the keypoint array. */
-class NEFillArrayKernel : public INEKernel
-{
-public:
- const char *name() const override
- {
- return "NEFillArrayKernel";
- }
- /** Default constructor */
- NEFillArrayKernel();
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- NEFillArrayKernel(const NEFillArrayKernel &) = delete;
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- NEFillArrayKernel &operator=(const NEFillArrayKernel &) = delete;
- /** Allow instances of this class to be moved */
- NEFillArrayKernel(NEFillArrayKernel &&) = default;
- /** Allow instances of this class to be moved */
- NEFillArrayKernel &operator=(NEFillArrayKernel &&) = default;
- /** Default destructor */
- ~NEFillArrayKernel() = default;
-
- /** Initialise the kernel.
- *
- * @param[in] input Source image. Data type supported: U8.
- * @param[in] threshold Texels greater than the threshold will be added to the array.
- * @param[out] output Array of keypoints to store the results.
- */
- void configure(const IImage *input, uint8_t threshold, IKeyPointArray *output);
-
- // Inherited methods overridden:
- void run(const Window &window, const ThreadInfo &info) override;
- bool is_parallelisable() const override;
-
-private:
- const IImage *_input;
- IKeyPointArray *_output;
- uint8_t _threshold;
-};
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_NEFILLARRAYKERNEL_H*/
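
This kernel converts a thresholded response image into keypoints; a sketch using the runtime Array container (the capacity is arbitrary for the example). The scheduler consults the is_parallelisable() override before deciding whether to split the window across threads.

#include "arm_compute/core/NEON/kernels/NEFillArrayKernel.h"
#include "arm_compute/runtime/Array.h"
#include "arm_compute/runtime/NEON/NEScheduler.h"
#include "arm_compute/runtime/Tensor.h"

using namespace arm_compute;

int main()
{
    Tensor response;
    response.allocator()->init(TensorInfo(TensorShape(128U, 128U), 1, DataType::U8));

    KeyPointArray keypoints(10000); // capacity chosen arbitrarily

    NEFillArrayKernel kernel;
    kernel.configure(&response, /* threshold = */ 1, &keypoints);

    response.allocator()->allocate();
    NEScheduler::get().schedule(&kernel, Window::DimY);
    return 0;
}
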
diff --git a/arm_compute/core/NEON/kernels/NEFillBorderKernel.h b/arm_compute/core/NEON/kernels/NEFillBorderKernel.h
deleted file mode 100644
index 0c852e8232..0000000000
--- a/arm_compute/core/NEON/kernels/NEFillBorderKernel.h
+++ /dev/null
@@ -1,82 +0,0 @@
-/*
- * Copyright (c) 2016-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_NEFILLBORDERKERNEL_H
-#define ARM_COMPUTE_NEFILLBORDERKERNEL_H
-
-#include "arm_compute/core/NEON/INEKernel.h"
-#include "arm_compute/core/PixelValue.h"
-#include "arm_compute/core/Types.h"
-
-namespace arm_compute
-{
-// Forward declarations
-class ITensor;
-
-/** Interface for the kernel to fill borders */
-class NEFillBorderKernel : public INEKernel
-{
-public:
- const char *name() const override
- {
- return "NEFillBorderKernel";
- }
- /** Default Constructor */
- NEFillBorderKernel();
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- NEFillBorderKernel(const NEFillBorderKernel &) = delete;
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- NEFillBorderKernel &operator=(const NEFillBorderKernel &) = delete;
- /** Allow instances of this class to be moved */
- NEFillBorderKernel(NEFillBorderKernel &&) = default;
- /** Allow instances of this class to be moved */
- NEFillBorderKernel &operator=(NEFillBorderKernel &&) = default;
- /** Default destructor */
- ~NEFillBorderKernel() = default;
-
- /** Initialise the function.
- *
- * @note This kernel fills the borders within the XY-planes.
- *
- * @param[in,out] tensor Tensor to process. Data types supported: All.
- * @param[in] border_size Size of the border to fill in elements.
- * @param[in] border_mode Border mode to use when filling the border.
- * @param[in] constant_border_value (Optional) Constant value to use for borders if border_mode is set to CONSTANT.
- *
- */
- void configure(ITensor *tensor, BorderSize border_size, BorderMode border_mode, const PixelValue &constant_border_value = PixelValue());
-
- // Inherited methods overridden:
- void run(const Window &window, const ThreadInfo &info) override;
-
-private:
- void fill_replicate_single_channel(const Window &window);
- void fill_constant_value_single_channel(const Window &window);
-
- ITensor *_tensor;
- BorderSize _border_size;
- BorderMode _mode;
- PixelValue _constant_border_value;
-};
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_NEFILLBORDERKERNEL_H */
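
Border filling typically precedes running a kernel configured with border_undefined = false. A sketch filling a one-element constant border with zeros; in real use the size would come from the consuming kernel's border_size(). NEFillInnerBorderKernel, the next file in this diff, follows the same pattern minus the border-mode argument.

#include "arm_compute/core/NEON/kernels/NEFillBorderKernel.h"
#include "arm_compute/runtime/NEON/NEScheduler.h"
#include "arm_compute/runtime/Tensor.h"

using namespace arm_compute;

int main()
{
    Tensor t;
    t.allocator()->init(TensorInfo(TensorShape(64U, 64U), 1, DataType::F32));
    t.allocator()->allocate();

    NEFillBorderKernel fill;
    fill.configure(&t, BorderSize(1), BorderMode::CONSTANT, PixelValue(0.f));
    NEScheduler::get().schedule(&fill, Window::DimZ);
    return 0;
}
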
diff --git a/arm_compute/core/NEON/kernels/NEFillInnerBorderKernel.h b/arm_compute/core/NEON/kernels/NEFillInnerBorderKernel.h
deleted file mode 100644
index 9c1059e606..0000000000
--- a/arm_compute/core/NEON/kernels/NEFillInnerBorderKernel.h
+++ /dev/null
@@ -1,79 +0,0 @@
-/*
- * Copyright (c) 2016-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_NEFILLINNERBORDERKERNEL_H
-#define ARM_COMPUTE_NEFILLINNERBORDERKERNEL_H
-
-#include "arm_compute/core/NEON/INEKernel.h"
-#include "arm_compute/core/PixelValue.h"
-#include "arm_compute/core/Types.h"
-
-namespace arm_compute
-{
-class ITensor;
-
-/** Interface for the kernel to fill the interior borders */
-class NEFillInnerBorderKernel : public INEKernel
-{
-public:
- const char *name() const override
- {
- return "NEFillInnerBorderKernel";
- }
- /** Default constructor */
- NEFillInnerBorderKernel();
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- NEFillInnerBorderKernel(const NEFillInnerBorderKernel &) = delete;
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- NEFillInnerBorderKernel &operator=(const NEFillInnerBorderKernel &) = delete;
- /** Allow instances of this class to be moved */
- NEFillInnerBorderKernel(NEFillInnerBorderKernel &&) = default;
- /** Allow instances of this class to be moved */
- NEFillInnerBorderKernel &operator=(NEFillInnerBorderKernel &&) = default;
- /** Default destructor */
- ~NEFillInnerBorderKernel() = default;
-
- /** Initialise the function.
- *
- * @note This kernel fills the borders within the XY-planes.
- *
- * @param[in,out] input Tensor to process. Data types supported: U8/S16/S32/F32.
- * @param[in] border_size Size of the border to fill in elements.
- * @param[in] constant_border_value (Optional) Constant value used to fill the inner border. Defaults to a zero PixelValue.
- *
- */
- void configure(ITensor *input, BorderSize border_size, const PixelValue &constant_border_value = PixelValue());
-
- // Inherited methods overridden:
- void run(const Window &window, const ThreadInfo &info) override;
-
-private:
- template <typename T>
- void fill_value_single_channel(const Window &window);
-
- ITensor *_tensor;
- BorderSize _border_size;
- PixelValue _constant_border_value;
-};
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_NEFILLINNERBORDERKERNEL_H */
diff --git a/arm_compute/core/NEON/kernels/NEFlattenLayerKernel.h b/arm_compute/core/NEON/kernels/NEFlattenLayerKernel.h
deleted file mode 100644
index ba2f99857f..0000000000
--- a/arm_compute/core/NEON/kernels/NEFlattenLayerKernel.h
+++ /dev/null
@@ -1,81 +0,0 @@
-/*
- * Copyright (c) 2018-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_NEFLATTENLAYERKERNEL_H
-#define ARM_COMPUTE_NEFLATTENLAYERKERNEL_H
-
-#include "arm_compute/core/NEON/INEKernel.h"
-
-namespace arm_compute
-{
-class ITensor;
-
-/** Interface for the flatten layer kernel. */
-class NEFlattenLayerKernel : public INEKernel
-{
-public:
- const char *name() const override
- {
- return "NEFlattenLayerKernel";
- }
- /** Default constructor */
- NEFlattenLayerKernel();
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- NEFlattenLayerKernel(const NEFlattenLayerKernel &) = delete;
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- NEFlattenLayerKernel &operator=(const NEFlattenLayerKernel &) = delete;
- /** Allow instances of this class to be moved */
- NEFlattenLayerKernel(NEFlattenLayerKernel &&) = default;
- /** Allow instances of this class to be moved */
- NEFlattenLayerKernel &operator=(NEFlattenLayerKernel &&) = default;
- /** Default destructor */
- ~NEFlattenLayerKernel() = default;
-
- /** Set the input and output of the kernel.
- *
- * @param[in] input Input tensor to flatten with at least 3 dimensions.
- * The dimensions above the third will be interpreted as batches. Data types supported: All
- * @param[out] output Output tensor with shape [w*h*d, input_batches] where:
- * w = input tensor width, h = input tensor height and d = input tensor depth. Data type supported: same as @p input
- */
- void configure(const ITensor *input, ITensor *output);
- /** Static function to check if given info will lead to a valid configuration of @ref NEFlattenLayerKernel
- *
- * @param[in] input Input tensor info to flatten with at least 3 dimensions.
- * The dimensions above the third will be interpreted as batches. Data types supported: All
- * @param[in] output Output tensor info with shape [w*h*d, input_batches] where:
- * w = input tensor width, h = input tensor height and d = input tensor depth. Data type supported: same as @p input
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *input, const ITensorInfo *output);
-
- // Inherited methods overridden:
- void run(const Window &window, const ThreadInfo &info) override;
-
-private:
- const ITensor *_input;
- ITensor *_output;
-};
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_NEFLATTENLAYERKERNEL_H */
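
The shape mapping described in the comments above is worth pinning down with numbers; a trivial standalone sketch (plain C++, helper name hypothetical):

    #include <array>
    #include <cstddef>

    // A [w, h, d, batches] source flattens to [w * h * d, batches]:
    // one fully-flattened row per batch.
    std::array<std::size_t, 2> flatten_shape(std::size_t w, std::size_t h,
                                             std::size_t d, std::size_t batches)
    {
        return { w * h * d, batches };
    }

    // e.g. flatten_shape(7, 7, 64, 8) yields {3136, 8}.
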
diff --git a/arm_compute/core/NEON/kernels/NEFloorKernel.h b/arm_compute/core/NEON/kernels/NEFloorKernel.h
deleted file mode 100644
index 4cdd9f2ac0..0000000000
--- a/arm_compute/core/NEON/kernels/NEFloorKernel.h
+++ /dev/null
@@ -1,60 +0,0 @@
-/*
- * Copyright (c) 2017-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_NEFLOORKERNEL_H
-#define ARM_COMPUTE_NEFLOORKERNEL_H
-
-#include "arm_compute/core/NEON/INESimpleKernel.h"
-
-namespace arm_compute
-{
-class ITensor;
-
-/** NEON kernel to perform a floor operation */
-class NEFloorKernel : public INESimpleKernel
-{
-public:
- const char *name() const override
- {
- return "NEFloorKernel";
- }
- /** Set the source and destination of the kernel
- *
- * @param[in] input Source tensor. Data type supported: F16/F32.
- * @param[out] output Destination tensor. Same as @p input
- */
- void configure(const ITensor *input, ITensor *output);
- /** Static function to check if given info will lead to a valid configuration of @ref NEFloorKernel
- *
- * @param[in] input Source tensor info. Data type supported: F16/F32.
- * @param[in] output Destination tensor info. Same as @p input
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *input, const ITensorInfo *output);
-
- // Inherited methods overridden:
- void run(const Window &window, const ThreadInfo &info) override;
-};
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_NEFLOORKERNEL_H */
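
Like most kernels in the library, this one pairs a static validate() with configure(); a hedged sketch of the vet-before-configure pattern (it assumes Status's explicit bool conversion, which returns true on success):

    #include "arm_compute/core/NEON/kernels/NEFloorKernel.h"

    using namespace arm_compute;

    bool can_run_floor(const ITensorInfo *src, const ITensorInfo *dst)
    {
        // validate() returns a Status carrying an error message on failure,
        // so callers can fall back to another path instead of asserting.
        return bool(NEFloorKernel::validate(src, dst));
    }
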
diff --git a/arm_compute/core/NEON/kernels/NEFuseBatchNormalizationKernel.h b/arm_compute/core/NEON/kernels/NEFuseBatchNormalizationKernel.h
deleted file mode 100644
index f598530d1e..0000000000
--- a/arm_compute/core/NEON/kernels/NEFuseBatchNormalizationKernel.h
+++ /dev/null
@@ -1,116 +0,0 @@
-/*
- * Copyright (c) 2018-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_NEFUSEBATCHNORMALIZATIONKERNEL_H
-#define ARM_COMPUTE_NEFUSEBATCHNORMALIZATIONKERNEL_H
-
-#include "arm_compute/core/NEON/INEKernel.h"
-
-namespace arm_compute
-{
-// Forward declarations
-class ITensor;
-
-/** NEON kernel to fuse the batch normalization node to a preceding convolution node */
-class NEFuseBatchNormalizationKernel : public INEKernel
-{
-public:
- const char *name() const override
- {
- return "NEFuseBatchNormalizationKernel";
- }
- /** Default constructor */
- NEFuseBatchNormalizationKernel();
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- NEFuseBatchNormalizationKernel(const NEFuseBatchNormalizationKernel &) = delete;
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- NEFuseBatchNormalizationKernel &operator=(const NEFuseBatchNormalizationKernel &) = delete;
- /** Allow instances of this class to be moved */
- NEFuseBatchNormalizationKernel(NEFuseBatchNormalizationKernel &&) = default;
- /** Allow instances of this class to be moved */
- NEFuseBatchNormalizationKernel &operator=(NEFuseBatchNormalizationKernel &&) = default;
- /** Default destructor */
- ~NEFuseBatchNormalizationKernel() = default;
- /** Set the source and destination of the kernel
- *
- * @param[in] input_weights Input weights tensor for convolution or depthwise convolution layer. Data type supported: F16/F32. Data layout supported: NCHW, NHWC
- * @param[in] bn_mean Batch normalization layer mean tensor. Same as @p input_weights
- * @param[in] bn_var Batch normalization layer variance tensor. Same as @p input_weights
- * @param[out] fused_weights (Optional) Output fused weights tensor. It can be a nullptr in case of in-place computation. Same as @p input_weights
- * @param[out] fused_bias (Optional) Output fused bias tensor. It can be a nullptr in case of in-place computation and input_bias != nullptr. Same as @p input_weights
- * @param[in] input_bias (Optional) Input bias tensor for convolution or depthwise convolution layer. It can be a nullptr in case the bias tensor is not required. Same as @p input_weights
- * @param[in] bn_beta (Optional) Batch normalization layer beta tensor. It can be a nullptr in case the beta tensor is not required. Same as @p input_weights
- * @note if nullptr, bn_beta is set to 0.0
- * @param[in] bn_gamma (Optional) Batch normalization layer gamma tensor. It can be a nullptr in case the gamma tensor is not required. Same as @p input_weights
- * @note if nullptr, bn_gamma is set to 1.0
- * @param[in] epsilon (Optional) Batch normalization layer epsilon parameter. Defaults to 0.001f.
- * @param[in] fbn_type (Optional) Fused batch normalization type. Defaults to CONVOLUTION.
- */
- void configure(const ITensor *input_weights, const ITensor *bn_mean, const ITensor *bn_var, ITensor *fused_weights, ITensor *fused_bias,
- const ITensor *input_bias = nullptr, const ITensor *bn_beta = nullptr, const ITensor *bn_gamma = nullptr,
- float epsilon = 0.001f, FuseBatchNormalizationType fbn_type = FuseBatchNormalizationType::CONVOLUTION);
- /** Static function to check if given info will lead to a valid configuration of @ref NEFuseBatchNormalizationKernel
- *
- * @param[in] input_weights Input weights tensor info for convolution or depthwise convolution layer. Data type supported: F16/F32. Data layout supported: NCHW, NHWC
- * @param[in] bn_mean Batch normalization layer mean tensor info. Same as @p input_weights
- * @param[in] bn_var Batch normalization layer variance tensor info. Same as @p input_weights
- * @param[in] fused_weights (Optional) Output fused weights tensor info. It can be a nullptr in case of in-place computation. Same as @p input_weights
- * @param[in] fused_bias (Optional) Output fused bias tensor info. It can be a nullptr in case of in-place computation and input_bias != nullptr. Same as @p input_weights
- * @param[in] input_bias (Optional) Input bias tensor info for convolution or depthwise convolution layer. It can be a nullptr in case the bias tensor is not required. Same as @p input_weights
- * @param[in] bn_beta (Optional) Batch normalization layer beta tensor info. It can be a nullptr in case the beta tensor is not required. Same as @p input_weights
- * @note if nullptr, bn_beta is set to 0.0
- * @param[in] bn_gamma (Optional) Batch normalization layer gamma tensor info. It can be a nullptr in case the gamma tensor is not required. Same as @p input_weights
- * @note if nullptr, bn_gamma is set to 1.0
- * @param[in] epsilon (Optional) Batch normalization layer epsilon parameter. Defaults to 0.001f.
- * @param[in] fbn_type (Optional) Fused batch normalization type. Defaults to CONVOLUTION.
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *input_weights, const ITensorInfo *bn_mean, const ITensorInfo *bn_var,
- const ITensorInfo *fused_weights, const ITensorInfo *fused_bias,
- const ITensorInfo *input_bias = nullptr, const ITensorInfo *bn_beta = nullptr, const ITensorInfo *bn_gamma = nullptr,
- float epsilon = 0.001f, FuseBatchNormalizationType fbn_type = FuseBatchNormalizationType::CONVOLUTION);
-
- // Inherited methods overridden:
- void run(const Window &window, const ThreadInfo &info) override;
-
-private:
- const ITensor *_input_weights;
- const ITensor *_input_bias;
- const ITensor *_bn_mean;
- const ITensor *_bn_var;
- const ITensor *_bn_gamma;
- const ITensor *_bn_beta;
- ITensor *_fused_weights;
- ITensor *_fused_bias;
- float _epsilon;
- bool _run_in_place_weights;
- bool _run_in_place_bias;
-
- using FuseBatchNormFunction = void(const ITensor *input_weights, const ITensor *input_bias, ITensor *fused_weights, ITensor *fused_bias,
- const ITensor *bn_mean, const ITensor *bn_var, const ITensor *bn_beta, const ITensor *bn_gamma, float epsilon, const Window &window);
-
- FuseBatchNormFunction *_func;
-};
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_NEFUSEBATCHNORMALIZATIONKERNEL_H */
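
The fusion itself is a per-output-channel rescaling of the convolution weights and bias; a reference sketch of the math (standalone C++, not the library implementation, and ignoring the nullptr defaults for beta/gamma described above):

    #include <cmath>
    #include <cstddef>
    #include <vector>

    // Fold batch normalization into convolution parameters, per output channel c:
    //   scale      = gamma[c] / sqrt(var[c] + epsilon)
    //   w_fused    = w * scale
    //   b_fused[c] = (b[c] - mean[c]) * scale + beta[c]
    void fuse_bn(std::vector<float> &weights, std::vector<float> &bias,
                 const std::vector<float> &mean, const std::vector<float> &var,
                 const std::vector<float> &beta, const std::vector<float> &gamma,
                 float epsilon, std::size_t weights_per_channel)
    {
        for(std::size_t c = 0; c < bias.size(); ++c)
        {
            const float scale = gamma[c] / std::sqrt(var[c] + epsilon);
            for(std::size_t i = 0; i < weights_per_channel; ++i)
            {
                weights[c * weights_per_channel + i] *= scale;
            }
            bias[c] = (bias[c] - mean[c]) * scale + beta[c];
        }
    }
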
diff --git a/arm_compute/core/NEON/kernels/NEGEMMAssemblyBaseKernel.h b/arm_compute/core/NEON/kernels/NEGEMMAssemblyBaseKernel.h
deleted file mode 100644
index 6aa8e250a4..0000000000
--- a/arm_compute/core/NEON/kernels/NEGEMMAssemblyBaseKernel.h
+++ /dev/null
@@ -1,89 +0,0 @@
-/*
- * Copyright (c) 2017-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_NEGEMMASSEMBLYBASE_H
-#define ARM_COMPUTE_NEGEMMASSEMBLYBASE_H
-
-#include "arm_compute/core/NEON/INEKernel.h"
-
-namespace arm_compute
-{
-class ITensor;
-
-/** Base class for GEMM NEON kernels implemented in Assembly. */
-class NEGEMMAssemblyBaseKernel : public INEKernel
-{
-public:
- const char *name() const override
- {
- return "NEGEMMAssemblyBaseKernel";
- }
- /** Constructor */
- NEGEMMAssemblyBaseKernel()
- : _input0(nullptr), _input1(nullptr), _output(nullptr), _workspace(nullptr), _alpha(1.f), _beta(0.f), _is_transposed_0(false), _is_transposed_1(false)
- {
- }
-
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- NEGEMMAssemblyBaseKernel(const NEGEMMAssemblyBaseKernel &) = delete;
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- NEGEMMAssemblyBaseKernel &operator=(const NEGEMMAssemblyBaseKernel &) = delete;
- /** Allow instances of this class to be moved */
- NEGEMMAssemblyBaseKernel(NEGEMMAssemblyBaseKernel &&) = default;
- /** Allow instances of this class to be moved */
- NEGEMMAssemblyBaseKernel &operator=(NEGEMMAssemblyBaseKernel &&) = default;
-
- virtual ~NEGEMMAssemblyBaseKernel() = default;
-
- /** Initialise the kernel's input and output.
- *
- * The computed function is C = alpha * (A x B) + beta * C.
- *
- * @param[in] input0 Input tensor containing the Matrix A. Data types supported: F32
- * @param[in] input1 Input tensor containing the Matrix B. Data types supported: same as @p input0
- * @param[in,out] output Output tensor to store the result of matrix multiplication. If @p beta is not zero the values are multiplied by @p beta before the result is accumulated. Otherwise the values are overwritten by the result. Data types supported: same as @p input0.
- * @param[out] workspace Space for intermediate results.
- * @param[in] alpha Weight of the matrix product
- * @param[in] beta Weight of the accumulation.
- * @param[in] is_transposed_0 (Optional) True if @p input0 is transposed, false otherwise. Defaults to false.
- * @param[in] is_transposed_1 (Optional) True if @p input1 is transposed, false otherwise. Defaults to false.
- */
- void configure(const ITensor *input0, const ITensor *input1, ITensor *output, ITensor *workspace, float alpha = 1.f, float beta = 0.f, bool is_transposed_0 = false, bool is_transposed_1 = false)
- {
- internal_configure(input0, input1, output, workspace, alpha, beta, is_transposed_0, is_transposed_1);
- }
-
-protected:
- virtual void internal_configure(const ITensor *input0, const ITensor *input1, ITensor *output, ITensor *workspace, float alpha, float beta, bool is_transposed_0, bool is_transposed_1) = 0;
-
- const ITensor *_input0;
- const ITensor *_input1;
- ITensor *_output;
- ITensor *_workspace;
- float _alpha;
- float _beta;
- bool _is_transposed_0;
- bool _is_transposed_1;
-};
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_NEGEMMASSEMBLYBASE_H*/
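
A scalar reference of the C = alpha * (A x B) + beta * C contract documented above, spelling out that a zero beta overwrites the destination while a non-zero beta accumulates into it (illustrative only; the real kernels are hand-written assembly):

    #include <cstddef>

    // A is MxK, B is KxN, C is MxN, all row-major, neither operand transposed.
    void gemm_ref(const float *a, const float *b, float *c,
                  std::size_t m, std::size_t n, std::size_t k,
                  float alpha, float beta)
    {
        for(std::size_t i = 0; i < m; ++i)
        {
            for(std::size_t j = 0; j < n; ++j)
            {
                float acc = 0.f;
                for(std::size_t p = 0; p < k; ++p)
                {
                    acc += a[i * k + p] * b[p * n + j];
                }
                // beta == 0: overwrite; otherwise accumulate into the existing values.
                c[i * n + j] = alpha * acc + (beta != 0.f ? beta * c[i * n + j] : 0.f);
            }
        }
    }
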
diff --git a/arm_compute/core/NEON/kernels/NEGEMMInterleave4x4Kernel.h b/arm_compute/core/NEON/kernels/NEGEMMInterleave4x4Kernel.h
deleted file mode 100644
index b6e6beab53..0000000000
--- a/arm_compute/core/NEON/kernels/NEGEMMInterleave4x4Kernel.h
+++ /dev/null
@@ -1,91 +0,0 @@
-/*
- * Copyright (c) 2016-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_NEGEMMINTERLEAVE4x4KERNEL_H
-#define ARM_COMPUTE_NEGEMMINTERLEAVE4x4KERNEL_H
-
-#include "arm_compute/core/NEON/INESimpleKernel.h"
-
-namespace arm_compute
-{
-class ITensor;
-
-/** NEON kernel to interleave the elements of a matrix
- *
- * This function puts the values in a 4x4 block of Matrix A on the same row (Interleaved values)
- *
- * @f[
- * \left( \begin{array}{cccc}
- * a00 & a01 & a02 & a03 \\
- * a10 & a11 & a12 & a13 \\
- * a20 & a21 & a22 & a23 \\
- * a30 & a31 & a32 & a33 \\
- * \end{array} \right)
- * \rightarrow
- * \left( \begin{array}{ccccccccccccccccc}
- * a00 & a10 & a20 & a30 & a01 & a11 & a21 & a31 & a02 & a12 & a22 & a32 & a03 & a13 & a23 & a33 \\
- * \end{array} \right)
- * @f]
- *
- * After this operation, the output matrix will have the following shape: [ height * 4, ceil(width / 4.0f) ]
- */
-class NEGEMMInterleave4x4Kernel : public INESimpleKernel
-{
-public:
- const char *name() const override
- {
- return "NEGEMMInterleave4x4Kernel";
- }
- /** Constructor */
- NEGEMMInterleave4x4Kernel();
- /** Initialise the kernel's input and output.
- *
- * @param[in] input Input tensor. Data types supported: All
- * @param[out] output Output tensor which stores the interleaved matrix. Data type supported: same as @p input.
- */
- void configure(const ITensor *input, ITensor *output);
- /** Static function to check if given info will lead to a valid configuration of @ref NEGEMMInterleave4x4Kernel
- *
- * @param[in] input Input tensor info. Data types supported: All
- * @param[in] output Output tensor info which stores the interleaved matrix. Data type supported: same as @p input.
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *input, const ITensorInfo *output);
-
- // Inherited methods overridden:
- void run(const Window &window, const ThreadInfo &info) override;
-
-private:
- /** Common signature for all the transpose functions
- *
- * @param[in] input An input tensor. Data types supported: All
- * @param[out] output The output tensor. Data type supported: same as @p input
- * @param[in] window Region on which to execute the kernel.
- */
- using GEMMInterleaveFunction = void(const ITensor *input, ITensor *output, const Window &window);
-
- GEMMInterleaveFunction *_func; /**< GEMM interleave function to use for the particular tensor types passed to configure() */
-};
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_NEGEMMINTERLEAVE4x4KERNEL_H*/
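
A reference loop for the interleave pictured above: each group of 4 input rows is emitted column by column as a single output row four times as wide, so an MxK input becomes ceil(M/4) rows of 4*K values (sketch only; the kernel additionally handles heights that are not a multiple of 4):

    #include <cstddef>

    void interleave_4x4_ref(const float *in, float *out,
                            std::size_t width, std::size_t height)
    {
        std::size_t o = 0;
        for(std::size_t y = 0; y + 4 <= height; y += 4) // one output row per 4 input rows
        {
            for(std::size_t x = 0; x < width; ++x)      // walk the columns
            {
                for(std::size_t r = 0; r < 4; ++r)      // emit the 4 stacked values
                {
                    out[o++] = in[(y + r) * width + x];
                }
            }
        }
    }
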
diff --git a/arm_compute/core/NEON/kernels/NEGEMMLowpMatrixMultiplyKernel.h b/arm_compute/core/NEON/kernels/NEGEMMLowpMatrixMultiplyKernel.h
deleted file mode 100644
index 8f47c5089d..0000000000
--- a/arm_compute/core/NEON/kernels/NEGEMMLowpMatrixMultiplyKernel.h
+++ /dev/null
@@ -1,90 +0,0 @@
-/*
- * Copyright (c) 2017-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_NEGEMMLOWPMATRIXMULTIPLYKERNEL_H
-#define ARM_COMPUTE_NEGEMMLOWPMATRIXMULTIPLYKERNEL_H
-
-#include "arm_compute/core/NEON/INEKernel.h"
-
-namespace arm_compute
-{
-class ITensor;
-
-/** NEON kernel to multiply matrices
- *
- * @note @ref NEGEMMLowpMatrixMultiplyKernel is a low precision matrix product kernel.
- * This kernel performs the following computation:
- *
- * -# Convert the values of matrix A from int8 to int32
- * -# Convert the values of matrix B from int8 to int32
- * -# Compute the int32 matrix product of the resulting A * B and store the result as int32
- *
- */
-class NEGEMMLowpMatrixMultiplyKernel : public INEKernel
-{
-public:
- const char *name() const override
- {
- return "NEGEMMLowpMatrixMultiplyKernel";
- }
- /** Constructor */
- NEGEMMLowpMatrixMultiplyKernel();
- /** Prevent instances of this class from being copied (As this class contains pointers)*/
- NEGEMMLowpMatrixMultiplyKernel(const NEGEMMLowpMatrixMultiplyKernel &) = delete;
- /** Prevent instances of this class from being copied (As this class contains pointers)*/
- NEGEMMLowpMatrixMultiplyKernel &operator=(const NEGEMMLowpMatrixMultiplyKernel &) = delete;
- /** Allow instances of this class to be moved */
- NEGEMMLowpMatrixMultiplyKernel(NEGEMMLowpMatrixMultiplyKernel &&) = default;
- /** Allow instances of this class to be moved */
- NEGEMMLowpMatrixMultiplyKernel &operator=(NEGEMMLowpMatrixMultiplyKernel &&) = default;
- /** Initialise the kernel's input and output.
- *
- * The input matrices @p input0 and @p input1 must be the output of the kernels: @ref NEGEMMInterleave4x4Kernel and @ref NEGEMMTranspose1xWKernel. These two
- * kernels change the layout of the original matrices to be more cache-friendly.
- *
- * @param[in] input0 Input tensor containing the interleaved Matrix A. Data type supported: U8/QASYMM8/S8/QASYMM8_SIGNED
- * @param[in] input1 Input tensor containing the transposed1xW Matrix B. Data type supported: U8/QASYMM8/S8/QASYMM8_SIGNED/QSYMM8/QSYMM8_PER_CHANNEL
- * @param[out] output Output tensor to store the result of matrix multiplication. Data type supported: S32
- */
- void configure(const ITensor *input0, const ITensor *input1, ITensor *output);
- /** Static function to check if given info will lead to a valid configuration of @ref NEGEMMLowpMatrixMultiplyKernel
- *
- * @param[in] input0 Input tensor info containing the interleaved Matrix A. Data type supported: U8/QASYMM8/S8/QASYMM8_SIGNED
- * @param[in] input1 Input tensor info containing the transposed Matrix B. Data type supported: U8/QASYMM8/S8/QASYMM8_SIGNED/QSYMM8/QSYMM8_PER_CHANNEL
- * @param[in] output Output tensor info to store the result of matrix multiplication. Data type supported: S32
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *input0, const ITensorInfo *input1, const ITensorInfo *output);
-
- // Inherited methods overridden:
- void run(const Window &window, const ThreadInfo &info) override;
-
-private:
- const ITensor *_input0;
- const ITensor *_input1;
- ITensor *_output;
- bool _slide_matrix_b;
-};
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_NEGEMMLOWPMATRIXMULTIPLYKERNEL_H*/
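
Stripped of the cache-friendly operand reshaping, the computation above is a plain widening integer GEMM; a reference version over the original row-major layout (sketch, int8 path only):

    #include <cstddef>
    #include <cstdint>

    // int8 x int8 -> int32: widen each operand to int32, multiply, accumulate.
    void gemmlowp_ref(const int8_t *a, const int8_t *b, int32_t *c,
                      std::size_t m, std::size_t n, std::size_t k)
    {
        for(std::size_t i = 0; i < m; ++i)
        {
            for(std::size_t j = 0; j < n; ++j)
            {
                int32_t acc = 0;
                for(std::size_t p = 0; p < k; ++p)
                {
                    acc += static_cast<int32_t>(a[i * k + p]) * static_cast<int32_t>(b[p * n + j]);
                }
                c[i * n + j] = acc;
            }
        }
    }
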
diff --git a/arm_compute/core/NEON/kernels/NEGEMMLowpOffsetContributionKernel.h b/arm_compute/core/NEON/kernels/NEGEMMLowpOffsetContributionKernel.h
deleted file mode 100644
index b069e4cfac..0000000000
--- a/arm_compute/core/NEON/kernels/NEGEMMLowpOffsetContributionKernel.h
+++ /dev/null
@@ -1,103 +0,0 @@
-/*
- * Copyright (c) 2017-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_NEGEMMLOWPOFFSETCONTRIBUTIONKERNEL_H
-#define ARM_COMPUTE_NEGEMMLOWPOFFSETCONTRIBUTIONKERNEL_H
-
-#include "arm_compute/core/NEON/INEKernel.h"
-
-namespace arm_compute
-{
-class ITensor;
-
-/** NEON kernel used to add the offset contribution after @ref NEGEMMLowpMatrixMultiplyKernel. The computation is performed in-place
- *
- * This kernel takes a final int32 accumulator value (the output of @ref NEGEMMLowpMatrixMultiplyKernel),
- * and adds to it the offset contribution of matrix A and matrix B in-place.
- *
- * The final result is:
- *
- * mm_result[i][k] = mm_result[i][k] +
- * (vector_sum_col[k] * a_offset) +
- * (vector_sum_row[i] * b_offset) +
- * (a_offset * b_offset * k)
- *
- */
-class NEGEMMLowpOffsetContributionKernel : public INEKernel
-{
-public:
- const char *name() const override
- {
- return "NEGEMMLowpOffsetContributionKernel";
- }
- /** Constructor */
- NEGEMMLowpOffsetContributionKernel();
- /** Prevent instances of this class from being copied (As this class contains pointers)*/
- NEGEMMLowpOffsetContributionKernel(const NEGEMMLowpOffsetContributionKernel &) = delete;
- /** Prevent instances of this class from being copied (As this class contains pointers)*/
- NEGEMMLowpOffsetContributionKernel &operator=(const NEGEMMLowpOffsetContributionKernel &) = delete;
- /** Allow instances of this class to be moved */
- NEGEMMLowpOffsetContributionKernel(NEGEMMLowpOffsetContributionKernel &&) = default;
- /** Allow instances of this class to be moved */
- NEGEMMLowpOffsetContributionKernel &operator=(NEGEMMLowpOffsetContributionKernel &&) = default;
- /** Initialise the kernel's input and output.
- *
- * @param[in, out] mm_result Input tensor containing the result of @ref NEGEMMLowpMatrixMultiplyKernel. Data type supported: S32
- * @param[in] vector_sum_col Input row-vector of sums of all the entries in each column of matrix B.
- * Note: vector_sum_col can be a nullptr in case a_offset = 0. Data type supported: same as @p mm_result
- * @param[in] vector_sum_row Input row-vector of sums of all the entries in each row of matrix A.
- * Note: vector_sum_row can be a nullptr in case b_offset = 0. Data type supported: same as @p mm_result
- * @param[in] k Number of matrix A columns or Matrix B rows
- * @param[in] a_offset Offset to be added to each element of the matrix A.
- * @param[in] b_offset Offset to be added to each element of the matrix B.
- */
- void configure(ITensor *mm_result, const ITensor *vector_sum_col, const ITensor *vector_sum_row, int32_t k, int32_t a_offset, int32_t b_offset);
- /** Static function to check if given info will lead to a valid configuration of @ref NEGEMMLowpOffsetContributionKernel
- *
- * @param[in] mm_result Input tensor containing the result of @ref NEGEMMLowpMatrixMultiplyKernel. Data type supported: S32
- * @param[in] vector_sum_col Input row-vector of sums of all the entries in each column of matrix B.
- * Note: vector_sum_col can be a nullptr in case a_offset = 0. Data type supported: same as @p mm_result
- * @param[in] vector_sum_row Input row-vector of sums of all the entries in each row of matrix A.
- * Note: vector_sum_row can be a nullptr in case b_offset = 0. Data type supported: same as @p mm_result
- * @param[in] a_offset Offset to be added to each element of the matrix A.
- * @param[in] b_offset Offset to be added to each element of the matrix B.
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *mm_result, const ITensorInfo *vector_sum_col, const ITensorInfo *vector_sum_row, int32_t a_offset, int32_t b_offset);
-
- // Inherited methods overridden:
- void run(const Window &window, const ThreadInfo &info) override;
-
-private:
- const ITensor *_vector_sum_col;
- const ITensor *_vector_sum_row;
- ITensor *_mm_result;
- int32_t _a_offset;
- int32_t _b_offset;
- int32_t _k_offset;
- bool _slide_vector_sum_col;
-};
-} // namespace arm_compute
-
-#endif /* ARM_COMPUTE_NEGEMMLOWPOFFSETCONTRIBUTIONKERNEL_H */
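
The in-place update above, written out element by element (reference sketch; when a_offset or b_offset is zero the corresponding sum vector may be nullptr and its term drops out, matching the parameter notes):

    #include <cstddef>
    #include <cstdint>

    void offset_contribution_ref(int32_t *mm_result, const int32_t *vector_sum_col,
                                 const int32_t *vector_sum_row,
                                 std::size_t m, std::size_t n, int32_t k,
                                 int32_t a_offset, int32_t b_offset)
    {
        for(std::size_t i = 0; i < m; ++i)
        {
            for(std::size_t j = 0; j < n; ++j)
            {
                int32_t contribution = a_offset * b_offset * k; // constant term
                if(a_offset != 0)
                {
                    contribution += vector_sum_col[j] * a_offset;
                }
                if(b_offset != 0)
                {
                    contribution += vector_sum_row[i] * b_offset;
                }
                mm_result[i * n + j] += contribution;
            }
        }
    }
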
diff --git a/arm_compute/core/NEON/kernels/NEGEMMLowpOffsetContributionOutputStageKernel.h b/arm_compute/core/NEON/kernels/NEGEMMLowpOffsetContributionOutputStageKernel.h
deleted file mode 100644
index 0dc64c9842..0000000000
--- a/arm_compute/core/NEON/kernels/NEGEMMLowpOffsetContributionOutputStageKernel.h
+++ /dev/null
@@ -1,137 +0,0 @@
-/*
- * Copyright (c) 2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_NEGEMMLOWPOFFSETCONTRIBUTIONOUTPUTSTAGEKERNEL_H
-#define ARM_COMPUTE_NEGEMMLOWPOFFSETCONTRIBUTIONOUTPUTSTAGEKERNEL_H
-
-#include "arm_compute/core/NEON/INEKernel.h"
-
-namespace arm_compute
-{
-class ITensor;
-
-/** NEON kernel used to add the offset contribution and perform the output stage after @ref NEGEMMLowpMatrixMultiplyKernel.
- *
- * The computation is performed in-place
- *
- * This kernel takes a final int32 accumulator value (the output of @ref NEGEMMLowpMatrixMultiplyKernel),
- * and adds to it the offset contribution of matrix A and matrix B in-place.
- *
- * The output stage can perform either QuantizeDownInt32ToUint8Scale or QuantizeDownInt32ToUint8ScaleByFixedPoint for Uint8.
- * The output stage can perform either QuantizeDownInt32ToInt8Scale or QuantizeDownInt32ToInt8ScaleByFixedPoint for Int8.
- *
- * For QuantizeDownInt32ToUint8Scale/QuantizeDownInt32ToInt8Scale the final result is:
- *
- * ((mm_result'[i][k] + result_offset) * result_mult_int) >> result_shift
- *
- * For QuantizeDownInt32ToUint8ScaleByFixedPoint/QuantizeDownInt32ToInt8ScaleByFixedPoint the final result is:
- *
- * (FixedPointMul(mm_result'[i][k], result_fixedpoint_multiplier) >> result_shift) + result_offset_after_shift
- *
- * where FixedPointMul(x, y) is the nearest integer to the following
- * mathematical expression, evaluated without overflow or intermediate rounding:
- *
- * (x * y) / 2^31
- *
- * and mm_result'[i][k] = mm_result[i][k] +
- * (vector_sum_col[k] * a_offset) +
- * (vector_sum_row[i] * b_offset) +
- * (a_offset * b_offset * k)
- */
-class NEGEMMLowpOffsetContributionOutputStageKernel : public INEKernel
-{
-public:
- const char *name() const override
- {
- return "NEGEMMLowpOffsetContributionOutputStageKernel";
- }
- /** Constructor */
- NEGEMMLowpOffsetContributionOutputStageKernel();
- /** Prevent instances of this class from being copied (As this class contains pointers)*/
- NEGEMMLowpOffsetContributionOutputStageKernel(const NEGEMMLowpOffsetContributionOutputStageKernel &) = delete;
- /** Prevent instances of this class from being copied (As this class contains pointers)*/
- NEGEMMLowpOffsetContributionOutputStageKernel &operator=(const NEGEMMLowpOffsetContributionOutputStageKernel &) = delete;
- /** Allow instances of this class to be moved */
- NEGEMMLowpOffsetContributionOutputStageKernel(NEGEMMLowpOffsetContributionOutputStageKernel &&) = default;
- /** Allow instances of this class to be moved */
- NEGEMMLowpOffsetContributionOutputStageKernel &operator=(NEGEMMLowpOffsetContributionOutputStageKernel &&) = default;
- /** Initialise the kernel's input and output.
- *
- * @param[in] mm_result Input tensor containing the result of @ref NEGEMMLowpMatrixMultiplyKernel. Data type supported: S32
- * @param[in] vector_sum_col Input row-vector of sums of all the entries in each column of matrix B.
- * Note: vector_sum_col can be a nullptr in case a_offset = 0. Data type supported: same as @p mm_result
- * @param[in] vector_sum_row Input row-vector of sums of all the entries in each row of matrix A.
- * @param[in] bias Biases tensor. Only shared biases supported and it can be a nullptr if the addition of biases is not required.
- * Biases are 1D tensor with dimensions [OFM]. Data type supported: Same as @p mm_result.
- * @param[out] output Output tensor containing the final quantized result. Data type supported: QASYMM8/QASYMM8_SIGNED
- * @param[in] k Number of matrix A columns or Matrix B rows
- * @param[in] a_offset Offset to be added to each element of the matrix A.
- * @param[in] b_offset Offset to be added to each element of the matrix B.
- * @param[in] output_stage GEMMLowp output stage info, providing the type of quantization and the necessary parameters.
- */
- void configure(const ITensor *mm_result, const ITensor *vector_sum_col, const ITensor *vector_sum_row, const ITensor *bias, ITensor *output, int32_t k, int32_t a_offset, int32_t b_offset,
- GEMMLowpOutputStageInfo output_stage);
- /** Static function to check if given info will lead to a valid configuration of @ref NEGEMMLowpOffsetContributionOutputStageKernel
- *
- * @param[in] mm_result Input tensor info containing the result of @ref NEGEMMLowpMatrixMultiplyKernel. Data type supported: S32
- * @param[in] vector_sum_col Tensor info for the input row-vector of sums of all the entries in each column of matrix B.
- * Note: vector_sum_col can be a nullptr in case a_offset = 0. Data type supported: same as @p mm_result
- * @param[in] vector_sum_row Tensor info for the input row-vector of sums of all the entries in each row of matrix A.
- * Note: vector_sum_row can be a nullptr in case b_offset = 0. Data type supported: same as @p mm_result
- * @param[in] bias Biases tensor info. Only shared biases supported and it can be a nullptr if the addition of biases is not required.
- * Biases are 1D tensor with dimensions [OFM]. Data type supported: Same as @p mm_result.
- * @param[in] output Output tensor info containing the final quantized result. Data type supported: QASYMM8/QASYMM8_SIGNED
- * @param[in] a_offset Offset to be added to each element of the matrix A.
- * @param[in] b_offset Offset to be added to each element of the matrix B.
- * @param[in] output_stage GEMMLowp output stage info, providing the type of quantization and the necessary parameters.
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *mm_result, const ITensorInfo *vector_sum_col, const ITensorInfo *vector_sum_row, const ITensorInfo *bias, const ITensorInfo *output, int32_t a_offset,
- int32_t b_offset,
- GEMMLowpOutputStageInfo output_stage);
-
- // Inherited methods overridden:
- void run(const Window &window, const ThreadInfo &info) override;
-
- using NEGEMMLowpOffsetContributionOutputStageFunction = std::function<void(const Window &, const ITensor *, const ITensor *, const ITensor *, const ITensor *,
- ITensor *, int32_t, int32_t, int32_t, bool, GEMMLowpOutputStageInfo)>;
-
-private:
- /** Function to use for the particular tensors passed to configure() */
- NEGEMMLowpOffsetContributionOutputStageFunction _function;
- const ITensor *_vector_sum_col;
- const ITensor *_vector_sum_row;
- const ITensor *_bias;
- const ITensor *_mm_result;
- ITensor *_output;
- int32_t _a_offset;
- int32_t _b_offset;
- int32_t _k_offset;
- bool _slide_vector_sum_col;
- GEMMLowpOutputStageInfo _output_stage;
-};
-} // namespace arm_compute
-
-#endif /* ARM_COMPUTE_NEGEMMLOWPOFFSETCONTRIBUTIONOUTPUTSTAGEKERNEL_H */
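
FixedPointMul, as defined in the comment block above, is the gemmlowp-style rounding doubling high multiply; a standalone sketch (omitting the saturation corner case where both operands are INT32_MIN):

    #include <cstdint>

    // Nearest integer to (x * y) / 2^31, computed in 64-bit to avoid overflow.
    int32_t fixed_point_mul(int32_t x, int32_t y)
    {
        const int64_t ab    = static_cast<int64_t>(x) * static_cast<int64_t>(y);
        const int64_t nudge = (ab >= 0) ? (int64_t(1) << 30) : (1 - (int64_t(1) << 30));
        return static_cast<int32_t>((ab + nudge) / (int64_t(1) << 31));
    }
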
diff --git a/arm_compute/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ScaleKernel.h b/arm_compute/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ScaleKernel.h
deleted file mode 100644
index b4a1419c9b..0000000000
--- a/arm_compute/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ScaleKernel.h
+++ /dev/null
@@ -1,112 +0,0 @@
-/*
- * Copyright (c) 2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_NEGEMMLOWPQUANTIZEDOWNINT32SCALEKERNEL_H
-#define ARM_COMPUTE_NEGEMMLOWPQUANTIZEDOWNINT32SCALEKERNEL_H
-
-#include "arm_compute/core/NEON/INEKernel.h"
-
-namespace arm_compute
-{
-class ITensor;
-
-/** NEON kernel used to quantize down the int32 accumulator values of GEMMLowp to QASYMM8/QASYMM8_SIGNED
- *
- * This kernel takes a final int32 accumulator value (the output of @ref NEGEMMLowpMatrixMultiplyKernel), and processes it to obtain the final QASYMM8/QASYMM8_SIGNED value.
- * The following computations will be performed by the kernel:
- *
- * -# Add offset terms to final result
- * -# Multiply each entry of result by result_mult_int
- * -# Add bias to final result if bias tensor is not a nullptr
- * -# Shift the int32 accumulator by result_shift
- * -# Clamp the value between the specified min and max bounds
- * -# Clamp the resulting int32 values:
- *    - to the [0..255] range and cast to QASYMM8.
- *    - to the [-128..127] range and cast to QASYMM8_SIGNED.
- *
- */
-class NEGEMMLowpQuantizeDownInt32ScaleKernel : public INEKernel
-{
-public:
- const char *name() const override
- {
- return "NEGEMMLowpQuantizeDownInt32ScaleKernel";
- }
- /** Constructor */
- NEGEMMLowpQuantizeDownInt32ScaleKernel();
- /** Prevent instances of this class from being copied (As this class contains pointers)*/
- NEGEMMLowpQuantizeDownInt32ScaleKernel(const NEGEMMLowpQuantizeDownInt32ScaleKernel &) = delete;
- /** Prevent instances of this class from being copied (As this class contains pointers)*/
- NEGEMMLowpQuantizeDownInt32ScaleKernel &operator=(const NEGEMMLowpQuantizeDownInt32ScaleKernel &) = delete;
- /** Allow instances of this class to be moved */
- NEGEMMLowpQuantizeDownInt32ScaleKernel(NEGEMMLowpQuantizeDownInt32ScaleKernel &&) = default;
- /** Allow instances of this class to be moved */
- NEGEMMLowpQuantizeDownInt32ScaleKernel &operator=(NEGEMMLowpQuantizeDownInt32ScaleKernel &&) = default;
- /** Initialise the kernel's input and output.
- *
- * @param[in] input Input tensor. Data type supported: S32
- * @param[in] bias Biases tensor. Only shared biases supported and it can be a nullptr if the biases addition is not required.
- * Biases are 1D tensor with dimensions [OFM]. Data type supported: Same as @p input.
- * @param[out] output Output tensor. Data type supported: QASYMM8/QASYMM8_SIGNED
- * @param[in] output_stage GEMMLowp output stage metadata.
- */
- void configure(const ITensor *input, const ITensor *bias, ITensor *output, const GEMMLowpOutputStageInfo *output_stage);
- /** Static function to check if given info will lead to a valid configuration of @ref NEGEMMLowpQuantizeDownInt32ScaleKernel
- *
- * @param[in] input Input tensor. Data type supported: S32
- * @param[in] bias Biases tensor. Only shared biases supported and it can be a nullptr if the biases addition is not required.
- * Biases are 1D tensor with dimensions [OFM]. Data type supported: Same as @p input.
- * @param[in] output Output tensor. Data type supported: QASYMM8/QASYMM8_SIGNED
- * @param[in] output_stage GEMMLowp output stage metadata.
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output, const GEMMLowpOutputStageInfo *output_stage);
-
- // Inherited methods overridden:
- void run(const Window &window, const ThreadInfo &info) override;
-
-private:
- /** Template function to run the NEGEMMLowpQuantizeDownInt32ScaleKernel
- *
- * @param[in] window Region on which to execute the kernel. (Must be a valid region of the window returned by window()).
- */
- template <typename T>
- void run(const Window &window);
-
- /** Common signature for all the specialised NEGEMMLowpQuantizeDownInt32ScaleKernel functions
- *
- * @param[in] window Region on which to execute the kernel.
- */
- using QuantizeDownFunctionPtr = void (NEGEMMLowpQuantizeDownInt32ScaleKernel::*)(const Window &window);
-
- QuantizeDownFunctionPtr _func;
- const ITensor *_input;
- const ITensor *_bias;
- ITensor *_output;
- const GEMMLowpOutputStageInfo *_output_stage;
- bool _is_bounded_relu;
-};
-} // namespace arm_compute
-
-#endif /* ARM_COMPUTE_NEGEMMLOWPQUANTIZEDOWNINT32SCALEKERNEL_H */
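
Per element, the scale-based output stage amounts to the chain of steps listed above; a reference sketch for the QASYMM8 path (the QASYMM8_SIGNED path clamps to [-128, 127] instead; the exact point at which the bias is added is an assumption of this sketch):

    #include <algorithm>
    #include <cstdint>

    uint8_t quantize_down_scale_ref(int32_t acc, int32_t bias, int32_t result_offset,
                                    int32_t result_mult_int, int32_t result_shift,
                                    int32_t min_bound, int32_t max_bound)
    {
        int32_t v = ((acc + bias + result_offset) * result_mult_int) >> result_shift;
        v = std::max(min_bound, std::min(max_bound, v)); // bounded-ReLU style clamp
        v = std::max(0, std::min(255, v));               // final QASYMM8 range
        return static_cast<uint8_t>(v);
    }
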
diff --git a/arm_compute/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel.h b/arm_compute/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel.h
deleted file mode 100644
index 0806bd1df5..0000000000
--- a/arm_compute/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel.h
+++ /dev/null
@@ -1,116 +0,0 @@
-/*
- * Copyright (c) 2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_NEGEMMLOWPQUANTIZEDOWNINT32TOINT16SCALEBYFIXEDPOINTKERNEL_H
-#define ARM_COMPUTE_NEGEMMLOWPQUANTIZEDOWNINT32TOINT16SCALEBYFIXEDPOINTKERNEL_H
-
-#include "arm_compute/core/NEON/INEKernel.h"
-
-namespace arm_compute
-{
-class ITensor;
-
-/** NEON kernel used to quantize down the int32 accumulator values of GEMMLowp to QSYMM16
- *
- * This kernel takes a final int32 accumulator value (the output of @ref NEGEMMLowpMatrixMultiplyKernel), and processes it to obtain the final QSYMM16 value.
- * The following computations will be performed by the kernel:
- *
- * -# Compute fixed point multiplication between each entry of input by result_fixedpoint_multiplier
- * -# Add bias to final result if bias tensor is not a nullptr
- * -# Round to nearest division by a power-of-two using result_shift
- * -# Clamp the value between the specified min and max bounds
- * -# Clamp the resulting int32 values to the [-32768, 32767] range and cast to QSYMM16.
- *
- */
-class NEGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel : public INEKernel
-{
-public:
- const char *name() const override
- {
- return "NEGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel";
- }
- /** Constructor */
- NEGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel();
- /** Prevent instances of this class from being copied (As this class contains pointers)*/
- NEGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel(const NEGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel &) = delete;
- /** Prevent instances of this class from being copied (As this class contains pointers)*/
- NEGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel &operator=(const NEGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel &) = delete;
- /** Allow instances of this class to be moved */
- NEGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel(NEGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel &&) = default;
- /** Allow instances of this class to be moved */
- NEGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel &operator=(NEGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel &&) = default;
- /** Initialise the kernel's input and output.
- *
- * @param[in] input Input tensor. Data type supported: S32
- * @param[in] bias Biases tensor. Only shared biases supported and it can be a nullptr if the biases addition is not required.
- * Biases are 1D tensor with dimensions [OFM]. Data type supported: Same as @p input.
- * @param[out] output Output tensor. Data type supported: QSYMM16
- * @param[in] result_fixedpoint_multiplier Fixed point multiplier applied to each element of the input matrix
- * @param[in] result_shift Number of bits used for the rounding division by a power of two, applied after the fixed point multiplication
- * @param[in] min (Optional) Min value used to saturate down the output result before converting back to QSYMM16. Defaults to 0.
- * @param[in] max (Optional) Max value used to saturate up the output result before converting back to QSYMM16.
- * Along with @p min, this value can be used to implement "rectified linear unit" activation functions. Defaults to 0.
- */
- void configure(const ITensor *input, const ITensor *bias, ITensor *output, int result_fixedpoint_multiplier, int result_shift, int min = 0, int max = 0);
- /** Static function to check if given info will lead to a valid configuration of @ref NEGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel
- *
- * @param[in] input Input tensor info. Data type supported: S32
- * @param[in] bias Biases tensor info. Only shared biases supported and it can be a nullptr if the biases addition is not required.
- * Biases are 1D tensor info with dimensions [OFM]. Data type supported: Same as @p input.
- * @param[in] output Output tensor info. Data type supported: QSYMM16
- * @param[in] min (Optional) Min value used to saturate down the output result before converting back to QSYMM16. Defaults to 0.
- * @param[in] max (Optional) Max value used to saturate up the output result before converting back to QSYMM16.
- * Along with @p min, this value can be used to implement "rectified linear unit" activation functions. Defaults to 0.
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output, int min = 0, int max = 0);
-
- // Inherited methods overridden:
- void run(const Window &window, const ThreadInfo &info) override;
-
-private:
- /** Template function to run the NEGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel
- *
- * @param[in] window Region on which to execute the kernel. (Must be a valid region of the window returned by window()).
- */
- template <bool is_bounded_relu>
- void run(const Window &window);
-
- /** Common signature for all the specialised NEGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel functions
- *
- * @param[in] window Region on which to execute the kernel.
- */
- using QuantizeDownFunctionPtr = void (NEGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel::*)(const Window &window);
-
- QuantizeDownFunctionPtr _func;
- const ITensor *_input;
- const ITensor *_bias;
- ITensor *_output;
- int _result_fixedpoint_multiplier;
- int _result_shift;
- int _min;
- int _max;
-};
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_NEGEMMLOWPQUANTIZEDOWNINT32TOINT16SCALEBYFIXEDPOINTKERNEL_H */
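
Putting the listed steps together for a single accumulator value (sketch only; fixed_point_mul is the helper sketched after the offset-contribution output stage kernel, and the rounding here is a simple round-half-up rather than the library's exact round-to-nearest variant):

    #include <algorithm>
    #include <cstdint>

    int32_t fixed_point_mul(int32_t x, int32_t y); // as sketched earlier

    int16_t quantize_down_to_qsymm16_ref(int32_t acc, int32_t bias,
                                         int32_t result_fixedpoint_multiplier,
                                         int32_t result_shift,
                                         int32_t min_bound, int32_t max_bound)
    {
        int32_t v = fixed_point_mul(acc + bias, result_fixedpoint_multiplier);
        if(result_shift > 0)
        {
            v = (v + (1 << (result_shift - 1))) >> result_shift; // rounding divide by 2^shift
        }
        if(min_bound != max_bound) // treat equal bounds as "no clamp" (the defaults are 0, 0)
        {
            v = std::max(min_bound, std::min(max_bound, v));
        }
        v = std::max(-32768, std::min(32767, v)); // final QSYMM16 range
        return static_cast<int16_t>(v);
    }
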
diff --git a/arm_compute/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel.h b/arm_compute/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel.h
deleted file mode 100644
index 2b3657c728..0000000000
--- a/arm_compute/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel.h
+++ /dev/null
@@ -1,119 +0,0 @@
-/*
- * Copyright (c) 2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_NEGEMMLOWPQUANTIZEDOWNINT32TOINT8SCALEBYFIXEDPOINTKERNEL_H
-#define ARM_COMPUTE_NEGEMMLOWPQUANTIZEDOWNINT32TOINT8SCALEBYFIXEDPOINTKERNEL_H
-
-#include "arm_compute/core/NEON/INEKernel.h"
-
-namespace arm_compute
-{
-class ITensor;
-
-/** NEON kernel used to quantize down the int32 accumulator values of GEMMLowp to QASYMM8_SIGNED
- *
- * This kernel takes a final int32 accumulator value (the output of @ref NEGEMMLowpMatrixMultiplyKernel), and processes it to obtain the final QASYMM8_SIGNED value.
- * The following computations will be performed by the kernel:
- *
- * -# Compute fixed point multiplication of each entry of the input by result_fixedpoint_multiplier
- * -# Add bias to final result if bias tensor is not a nullptr
- * -# Round to nearest division by a power-of-two using result_shift
- * -# Add offset to each result
- * -# Clamp the value between the specified min and max bounds
- * -# Clamp the resulting int32 values to the [-128..127] range and cast to QASYMM8_SIGNED.
- *
- */
-class NEGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel : public INEKernel
-{
-public:
- const char *name() const override
- {
- return "NEGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel";
- }
- /** Constructor */
- NEGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel();
- /** Prevent instances of this class from being copied (As this class contains pointers)*/
- NEGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel(const NEGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel &) = delete;
- /** Prevent instances of this class from being copied (As this class contains pointers)*/
- NEGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel &operator=(const NEGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel &) = delete;
- /** Allow instances of this class to be moved */
- NEGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel(NEGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel &&) = default;
- /** Allow instances of this class to be moved */
- NEGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel &operator=(NEGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel &&) = default;
- /** Initialise the kernel's input and output.
- *
- * @param[in] input Input tensor. Data type supported: S32
- * @param[in] bias Biases tensor. Only shared biases supported and it can be a nullptr if the biases addition is not required.
- * Biases are 1D tensor with dimensions [OFM]. Data type supported: Same as @p input.
- * @param[out] output Output tensor. Data type supported: QASYMM8_SIGNED
- * @param[in] result_fixedpoint_multiplier Fixed point value to be multiplied with each element of the input matrix once the result_offset has been added
- * @param[in] result_shift Integer value used to round to nearest division by a power-of-two the result after the fixed point multiplication
- * @param[in] result_offset_after_shift Offset to be applied to result before converting it back to QASYMM8_SIGNED
- * @param[in] min (Optional) Min value used to saturate down the output result before converting back to QASYMM8_SIGNED
- * @param[in] max (Optional) Max value used to saturate up the output result before converting back to QASYMM8_SIGNED,
- * Along with @p min, this value can be used to implement "rectified linear unit" activation functions
- */
- void configure(const ITensor *input, const ITensor *bias, ITensor *output, int result_fixedpoint_multiplier, int result_shift, int result_offset_after_shift, int min = 0, int max = 0);
- /** Static function to check if given info will lead to a valid configuration of @ref NEGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel
- *
- * @param[in] input Input tensor. Data type supported: S32
- * @param[in] bias Biases tensor. Only shared biases supported and it can be a nullptr if the biases addition is not required.
- * Biases are 1D tensor with dimensions [OFM]. Data type supported: Same as @p input.
- * @param[in] output Output tensor. Data type supported: QASYMM8_SIGNED
- * @param[in] min (Optional) Min value used to saturate down the output result before converting back to QASYMM8_SIGNED
- * @param[in] max (Optional) Max value used to saturate up the output result before converting back to QASYMM8_SIGNED,
- * Along with @p min, this value can be used to implement "rectified linear unit" activation functions
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output, int min = 0, int max = 0);
-
- // Inherited methods overridden:
- void run(const Window &window, const ThreadInfo &info) override;
-
-private:
- /** Template function to run the NEGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel
- *
- * @param[in] window Region on which to execute the kernel. (Must be a valid region of the window returned by window()).
- */
- template <bool is_bounded_relu>
- void run(const Window &window);
-
- /** Common signature for all the specialised NEGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel functions
- *
- * @param[in] window Region on which to execute the kernel.
- */
- using QuantizeDownFunctionPtr = void (NEGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel::*)(const Window &window);
-
- QuantizeDownFunctionPtr _func;
- const ITensor *_input;
- const ITensor *_bias;
- ITensor *_output;
- int _result_fixedpoint_multiplier;
- int _result_shift;
- int _result_offset_after_shift;
- int _min;
- int _max;
-};
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_NEGEMMLOWPQUANTIZEDOWNINT32TOINT8SCALEBYFIXEDPOINTKERNEL_H */
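
The quantize-down kernels being removed above (Int16, Int8 and Uint8 variants) all share the fixed point pipeline listed in their class comments. As a reference, here is a minimal scalar sketch of that arithmetic, following the step order in the header; the rounding rules are assumed to match gemmlowp's reference, which the headers do not spell out:

#include <algorithm>
#include <climits>
#include <cstdint>

// gemmlowp-style fixed point helpers (an assumption about the exact rounding).
int32_t rounding_doubling_high_mul(int32_t a, int32_t b)
{
    if (a == b && a == INT32_MIN)
        return INT32_MAX; // saturate the single overflowing case
    const int64_t ab    = static_cast<int64_t>(a) * b;
    const int64_t nudge = ab >= 0 ? (1ll << 30) : (1 - (1ll << 30));
    return static_cast<int32_t>((ab + nudge) >> 31);
}

int32_t rounding_divide_by_pow2(int32_t x, int exponent)
{
    const int32_t mask      = (1 << exponent) - 1;
    const int32_t remainder = x & mask;
    const int32_t threshold = (mask >> 1) + (x < 0 ? 1 : 0);
    return (x >> exponent) + (remainder > threshold ? 1 : 0);
}

// One output element, following the step list in the header above.
int8_t quantize_down_int8(int32_t acc, int32_t bias, int32_t multiplier, int shift,
                          int32_t offset_after_shift, int32_t min, int32_t max)
{
    int32_t v = rounding_doubling_high_mul(acc, multiplier); // fixed point multiply
    v += bias;                                               // optional bias
    v = rounding_divide_by_pow2(v, shift);                   // round-to-nearest / 2^n
    v += offset_after_shift;                                 // re-quantization offset
    if (min != max)                                          // bounded ReLU (assumed
        v = std::min(std::max(v, min), max);                 //  disabled when min == max)
    return static_cast<int8_t>(std::min(127, std::max(-128, v))); // QASYMM8_SIGNED range
}
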
diff --git a/arm_compute/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel.h b/arm_compute/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel.h
deleted file mode 100644
index 2f099a3ebb..0000000000
--- a/arm_compute/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel.h
+++ /dev/null
@@ -1,119 +0,0 @@
-/*
- * Copyright (c) 2017-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_NEGEMMLOWPQUANTIZEDOWNINT32TOUINT8SCALEBYFIXEDPOINTKERNEL_H
-#define ARM_COMPUTE_NEGEMMLOWPQUANTIZEDOWNINT32TOUINT8SCALEBYFIXEDPOINTKERNEL_H
-
-#include "arm_compute/core/NEON/INEKernel.h"
-
-namespace arm_compute
-{
-class ITensor;
-
-/** NEON kernel used to quantize down the int32 accumulator values of GEMMLowp to QASYMM8
- *
- * This kernel takes a final int32 accumulator value (the output of @ref NEGEMMLowpMatrixMultiplyKernel), and processes it to obtain the final QASYMM8 value.
- * The following computations will be performed by the kernel:
- *
- * -# Compute fixed point multiplication of each entry of the input by result_fixedpoint_multiplier
- * -# Add bias to final result if bias tensor is not a nullptr
- * -# Round to nearest division by a power-of-two using result_shift
- * -# Add offset to each result
- * -# Clamp the value between the specified min and max bounds
- * -# Clamp the resulting int32 values to the [0..255] range and cast to QASYMM8.
- *
- */
-class NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel : public INEKernel
-{
-public:
- const char *name() const override
- {
- return "NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel";
- }
- /** Constructor */
- NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel();
- /** Prevent instances of this class from being copied (As this class contains pointers)*/
- NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel(const NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel &) = delete;
- /** Prevent instances of this class from being copied (As this class contains pointers)*/
- NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel &operator=(const NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel &) = delete;
- /** Allow instances of this class to be moved */
- NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel(NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel &&) = default;
- /** Allow instances of this class to be moved */
- NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel &operator=(NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel &&) = default;
- /** Initialise the kernel's input and output.
- *
- * @param[in] input Input tensor. Data type supported: S32
- * @param[in] bias Biases tensor. Only shared biases supported and it can be a nullptr if the biases addition is not required.
- * Biases are 1D tensor with dimensions [OFM]. Data type supported: Same as @p input.
- * @param[out] output Output tensor. Data type supported: QASYMM8
- * @param[in] result_fixedpoint_multiplier Fixed point value to be multiplied with each element of the input matrix once the result_offset has been added
- * @param[in] result_shift Integer value used to round to nearest division by a power-of-two the result after the fixed point multiplication
- * @param[in] result_offset_after_shift Offset to be applied to result before converting it back to QASYMM8
- * @param[in] min (Optional) Min value used to saturate down the output result before converting back to QASYMM8
- * @param[in] max (Optional) Max value used to saturate up the output result before converting back to QASYMM8,
- * Along with @p min, this value can be used to implement "rectified linear unit" activation functions
- */
- void configure(const ITensor *input, const ITensor *bias, ITensor *output, int result_fixedpoint_multiplier, int result_shift, int result_offset_after_shift, int min = 0, int max = 0);
- /** Static function to check if given info will lead to a valid configuration of @ref NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel
- *
- * @param[in] input Input tensor. Data type supported: S32
- * @param[in] bias Biases tensor. Only shared biases supported and it can be a nullptr if the biases addition is not required.
- * Biases are 1D tensor with dimensions [OFM]. Data type supported: Same as @p input.
- * @param[in] output Output tensor. Data type supported: QASYMM8
- * @param[in] min (Optional) Min value used to saturate down the output result before converting back to QASYMM8
- * @param[in] max (Optional) Max value used to saturate up the output result before converting back to QASYMM8,
- * Along with @p min, this value can be used to implement "rectified linear unit" activation functions
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output, int min = 0, int max = 0);
-
- // Inherited methods overridden:
- void run(const Window &window, const ThreadInfo &info) override;
-
-private:
- /** Template function to run the NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel
- *
- * @param[in] window Region on which to execute the kernel. (Must be a valid region of the window returned by window()).
- */
- template <bool is_bounded_relu>
- void run(const Window &window);
-
- /** Common signature for all the specialised NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel functions
- *
- * @param[in] window Region on which to execute the kernel.
- */
- using QuantizeDownFunctionPtr = void (NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel::*)(const Window &window);
-
- QuantizeDownFunctionPtr _func;
- const ITensor *_input;
- const ITensor *_bias;
- ITensor *_output;
- int _result_fixedpoint_multiplier;
- int _result_shift;
- int _result_offset_after_shift;
- int _min;
- int _max;
-};
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_NEGEMMLOWPQUANTIZEDOWNINT32TOUINT8SCALEBYFIXEDPOINTKERNEL_H */
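
The min/max pair documented above is how activations get fused into the quantize-down stage. A sketch of deriving those bounds from the QASYMM8 output quantization (scale s, zero point z); quantize_qasymm8 below is a hypothetical helper for illustration, not an ACL API:

#include <algorithm>
#include <cmath>
#include <cstdint>

// Hypothetical helper: quantize a real value into the QASYMM8 output domain.
int32_t quantize_qasymm8(float v, float scale, int32_t zero_point)
{
    const int32_t q = static_cast<int32_t>(std::lround(v / scale)) + zero_point;
    return std::min(255, std::max(0, q));
}

// ReLU:  clamp to [zero_point, 255]
// ReLU6: clamp to [zero_point, quantize_qasymm8(6.f, scale, zero_point)]
// These bounds are what gets passed as the (min, max) arguments of configure().
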
diff --git a/arm_compute/core/NEON/kernels/NEGEMMLowpReductionKernel.h b/arm_compute/core/NEON/kernels/NEGEMMLowpReductionKernel.h
deleted file mode 100644
index 1e472f5252..0000000000
--- a/arm_compute/core/NEON/kernels/NEGEMMLowpReductionKernel.h
+++ /dev/null
@@ -1,171 +0,0 @@
-/*
- * Copyright (c) 2017-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_NEGEMMLOWREDUCTIONKERNEL_H
-#define ARM_COMPUTE_NEGEMMLOWREDUCTIONKERNEL_H
-
-#include "arm_compute/core/NEON/INEKernel.h"
-
-namespace arm_compute
-{
-// Forward declarations
-class ITensor;
-struct GEMMLowpReductionKernelInfo;
-
-/** Common interface for all NEON reduction kernels */
-class INEGEMMLowpReductionKernel : public INEKernel
-{
-public:
- /** Constructor */
- INEGEMMLowpReductionKernel();
- /** Prevent instances of this class from being copied (As this class contains pointers)*/
- INEGEMMLowpReductionKernel(const INEGEMMLowpReductionKernel &) = delete;
- /** Prevent instances of this class from being copied (As this class contains pointers)*/
- INEGEMMLowpReductionKernel &operator=(const INEGEMMLowpReductionKernel &) = delete;
- /** Allow instances of this class to be moved */
- INEGEMMLowpReductionKernel(INEGEMMLowpReductionKernel &&) = default;
- /** Allow instances of this class to be moved */
- INEGEMMLowpReductionKernel &operator=(INEGEMMLowpReductionKernel &&) = default;
-
- /** Initialise the kernel's input and output.
- *
- * @param[in] input Input tensor. Data type supported: QASYMM8/QASYMM8_SIGNED
- * @param[out] output Output row-vector of sums of all the entries in each row/col of input tensor. Data type supported: S32
- * @param[in] info Kernel metadata:
- * - k Number of matrix columns/rows depending on the type of reduction.
- * - is_reshaped True if the matrix has been reshaped.
- * - scalar Scalar value to multiply each reduced column/row by.
- * - mul_by_scalar True if each reduced column/row must be multiplied by a scalar value.
- */
- virtual void configure(const ITensor *input, ITensor *output, const GEMMLowpReductionKernelInfo &info) = 0;
-
-protected:
- const ITensor *_input;
- ITensor *_output;
- int32_t _k;
- bool _is_reshaped;
- int32_t _scalar;
- bool _mul_by_scalar;
-};
-
-/** NEON kernel used to compute the row-vectors of sums of all the entries in each row of Matrix A.
- *
- * @note This stage is needed to handle the offset of matrix product
- * https://github.com/google/gemmlowp/blob/master/doc/low-precision.md
- */
-class NEGEMMLowpMatrixAReductionKernel : public INEGEMMLowpReductionKernel
-{
-public:
- const char *name() const override
- {
- return "NEGEMMLowpMatrixAReductionKernel";
- }
- /** Initialise the kernel's input and output.
- *
- * @param[in] mtx_a Input tensor. Data type supported: QASYMM8/QASYMM8_SIGNED
- * @param[out] vector_sum_row Output row-vector of sums of all the entries in each row of mtx_a. Data type supported: S32
- * @param[in] info Kernel metadata:
- * - k (num_mtx_a_cols) Number of matrix A columns
- * - is_reshaped (is_interleaved4x4) True if the matrix A has been interleaved4x4
- * - scalar Scalar value to multiply each reduced row by.
- * - mul_by_scalar True if each reduced row must be multiplied by a scalar value.
- */
- void configure(const ITensor *mtx_a, ITensor *vector_sum_row, const GEMMLowpReductionKernelInfo &info) override;
- /** Static function to check if given info will lead to a valid configuration of @ref NEGEMMLowpMatrixAReductionKernel
- *
- * @param[in] mtx_a Input tensor. Data type supported: QASYMM8/QASYMM8_SIGNED
- * @param[in] vector_sum_row Output row-vector of sums of all the entries in each row of mtx_a. Data type supported: S32
- * @param[in] info Kernel metadata:
- * - k (num_mtx_a_cols) Number of matrix A columns
- * - is_reshaped (is_interleaved4x4) True if the matrix A has been interleaved4x4
- * - scalar Scalar value to multiply each reduced row by.
- * - mul_by_scalar True if each reduced row must be multiplied by a scalar value.
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *mtx_a, const ITensorInfo *vector_sum_row, const GEMMLowpReductionKernelInfo &info);
-
- // Inherited methods overridden:
- void run(const Window &window, const ThreadInfo &info) override;
-
-private:
- /** Execution of the reduction kernel specialized on the input type
- *
- * @param[in] window Execution window
- */
- template <typename T>
- void run_internal(const Window &window);
-};
-
-/** NEON kernel used to compute the row-vectors of sums of all the entries in each column of Matrix B.
- *
- * @note This stage is needed to handle the offset of matrix product
- * https://github.com/google/gemmlowp/blob/master/doc/low-precision.md
- */
-class NEGEMMLowpMatrixBReductionKernel : public INEGEMMLowpReductionKernel
-{
-public:
- const char *name() const override
- {
- return "NEGEMMLowpMatrixBReductionKernel";
- }
- /** Initialise the kernel's input and output.
- *
- * @param[in] mtx_b Input tensor. Data type supported: QASYMM8/QASYMM8_SIGNED
- * @param[out] vector_sum_col Output row-vector of sums of all the entries in each column of mtx_b. Data type supported: S32
- * @param[in] info Kernel metadata:
- * - k (num_mtx_b_rows) Number of matrix B rows.
- * - is_reshaped (is_transposed1xW) True if the input tensor is transposed 1xW.
- * - scalar Scalar value to multiply each reduced column by.
- * - mul_by_scalar True if each reduced column must be multiplied by a scalar value.
- */
- void configure(const ITensor *mtx_b, ITensor *vector_sum_col, const GEMMLowpReductionKernelInfo &info) override;
- /** Static function to check if given info will lead to a valid configuration of @ref NEGEMMLowpMatrixBReductionKernel
- *
- * @param[in] mtx_b Input tensor. Data type supported: QASYMM8/QASYMM8_SIGNED
- * @param[in] vector_sum_col Output row-vector of sums of all the entries in each column of mtx_b. Data type supported: S32
- * @param[in] info Kernel metadata:
- * - k (num_mtx_b_rows) Number of matrix B rows.
- * - is_reshaped (is_transposed1xW) True if the input tensor is transposed 1xW.
- * - scalar Scalar value to multiply each reduced column by.
- * - mul_by_scalar True if each reduced column must be multiplied by a scalar value.
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *mtx_b, const ITensorInfo *vector_sum_col, const GEMMLowpReductionKernelInfo &info);
-
- // Inherited methods overridden:
- void run(const Window &window, const ThreadInfo &info) override;
-
-private:
- /** Execution of the reduction kernel specialized on the input type
- *
- * @param[in] window Execution window
- * @param[in] info Thread-related information
- */
- template <typename T>
- void run_internal(const Window &window, const ThreadInfo &info);
-};
-} // namespace arm_compute
-
-#endif /* ARM_COMPUTE_NEGEMMLOWREDUCTIONKERNEL_H */
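
Why GEMMLowp needs these reductions, per the gemmlowp low-precision doc linked above: with input offsets oA and oB, the quantized product decomposes as

  sum_k (A[i][k] + oA) * (B[k][j] + oB)
    = sum_k A[i][k] * B[k][j]      (raw int32 GEMM result)
    + oB * sum_k A[i][k]           (row sums of A: NEGEMMLowpMatrixAReductionKernel)
    + oA * sum_k B[k][j]           (column sums of B: NEGEMMLowpMatrixBReductionKernel)
    + K * oA * oB                  (constant term)

A scalar model of the Matrix A row-sum reduction, for reference:

#include <cstdint>
#include <vector>

std::vector<int32_t> row_sums(const std::vector<uint8_t> &a, int M, int K)
{
    std::vector<int32_t> sums(M, 0);
    for (int i = 0; i < M; ++i)
        for (int k = 0; k < K; ++k)
            sums[i] += a[i * K + k]; // widen each QASYMM8 entry into the S32 sum
    return sums;
}
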
diff --git a/arm_compute/core/NEON/kernels/NEGEMMMatrixAccumulateBiasesKernel.h b/arm_compute/core/NEON/kernels/NEGEMMMatrixAccumulateBiasesKernel.h
deleted file mode 100644
index a3ba57e4ab..0000000000
--- a/arm_compute/core/NEON/kernels/NEGEMMMatrixAccumulateBiasesKernel.h
+++ /dev/null
@@ -1,75 +0,0 @@
-/*
- * Copyright (c) 2017-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_NEGEMMMATRIXACCUMULATEBIASESKERNEL_H
-#define ARM_COMPUTE_NEGEMMMATRIXACCUMULATEBIASESKERNEL_H
-
-#include "arm_compute/core/NEON/INEKernel.h"
-
-namespace arm_compute
-{
-class ITensor;
-/** NEON kernel to add a bias to each row of the input tensor */
-class NEGEMMMatrixAccumulateBiasesKernel : public INEKernel
-{
-public:
- const char *name() const override
- {
- return "NEGEMMMatrixAccumulateBiasesKernel";
- }
- /** Default constructor */
- NEGEMMMatrixAccumulateBiasesKernel();
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- NEGEMMMatrixAccumulateBiasesKernel(const NEGEMMMatrixAccumulateBiasesKernel &) = delete;
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- NEGEMMMatrixAccumulateBiasesKernel &operator=(const NEGEMMMatrixAccumulateBiasesKernel &) = delete;
- /** Allow instances of this class to be moved */
- NEGEMMMatrixAccumulateBiasesKernel(NEGEMMMatrixAccumulateBiasesKernel &&) = default;
- /** Allow instances of this class to be moved */
- NEGEMMMatrixAccumulateBiasesKernel &operator=(NEGEMMMatrixAccumulateBiasesKernel &&) = default;
- /** Default destructor */
- ~NEGEMMMatrixAccumulateBiasesKernel() = default;
- /** Set the accumulate buffer and the biases of the kernel.
- *
- * @param[in, out] accum The accumulate tensor to convert. Data type supported: F32
- * @param[in] biases The shared biases tensor to append. It must be a 1D tensor. Data type supported: Same as @p accum
- */
- void configure(ITensor *accum, const ITensor *biases);
- /** Static function to check if given info will lead to a valid configuration of @ref NEGEMMMatrixAccumulateBiasesKernel
- *
- * @param[in] accum The accumulate tensor to convert. Data type supported: F32
- * @param[in] biases The shared biases tensor to append. It must be a 1D tensor. Data type supported: Same as @p accum
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *accum, const ITensorInfo *biases);
-
- // Inherited methods overridden:
- void run(const Window &window, const ThreadInfo &info) override;
-
-private:
- ITensor *_accum;
- const ITensor *_biases;
-};
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_NEGEMMMATRIXACCUMULATEBIASESKERNEL_H */
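
A scalar model of what this kernel computes, in place: the shared 1D bias is broadcast across every row of the accumulator.

#include <vector>

void accumulate_biases(std::vector<float> &accum, const std::vector<float> &biases,
                       int rows, int cols)
{
    for (int r = 0; r < rows; ++r)
        for (int c = 0; c < cols; ++c)
            accum[r * cols + c] += biases[c]; // same bias vector for every row
}
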
diff --git a/arm_compute/core/NEON/kernels/NEGEMMMatrixAdditionKernel.h b/arm_compute/core/NEON/kernels/NEGEMMMatrixAdditionKernel.h
deleted file mode 100644
index e528c59d8f..0000000000
--- a/arm_compute/core/NEON/kernels/NEGEMMMatrixAdditionKernel.h
+++ /dev/null
@@ -1,96 +0,0 @@
-/*
- * Copyright (c) 2016-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_NEGEMMMATRIXADDITIONKERNEL_H
-#define ARM_COMPUTE_NEGEMMMATRIXADDITIONKERNEL_H
-
-#include "arm_compute/core/NEON/INESimpleKernel.h"
-
-namespace arm_compute
-{
-class ITensor;
-
-/** NEON kernel to perform the in-place matrix addition between 2 matrices taking into account that the second matrix might be weighted by a scalar value beta:
- *
- * @note [ MTX_OUT = MTX_0 + beta * MTX_1 ] with MTX_0 and MTX_1 of the same size
- *
- * @note This stage is used to finalize the GEMM result; it is computed if and only if beta != 0.0. When this kernel is used to finalize the GEMM result, we have:
- * - MTX_0 = A * B * alpha, where MTX_0 is the output of @ref NEGEMMMatrixMultiplyKernel
- * - MTX_1 = C
- */
-class NEGEMMMatrixAdditionKernel : public INESimpleKernel
-{
-public:
- const char *name() const override
- {
- return "NEGEMMMatrixAdditionKernel";
- }
- /** Constructor */
- NEGEMMMatrixAdditionKernel();
- /** Prevent instances of this class from being copied */
- NEGEMMMatrixAdditionKernel(const NEGEMMMatrixAdditionKernel &) = delete;
- /** Prevent instances of this class from being copied */
- NEGEMMMatrixAdditionKernel &operator=(const NEGEMMMatrixAdditionKernel &) = delete;
- /** Allow instances of this class to be moved */
- NEGEMMMatrixAdditionKernel(NEGEMMMatrixAdditionKernel &&) = default;
- /** Allow instances of this class to be moved */
- NEGEMMMatrixAdditionKernel &operator=(NEGEMMMatrixAdditionKernel &&) = default;
- /** Initialise the kernel's input and output.
- *
- * @note The input and output tensor must have the same dimensions
- *
- * @param[in] input Input tensor (Matrix C). Data types supported: F16/F32
- * @param[in, out] output Output tensor. If this kernel is used to finalize the GEMM result, output contains the result obtained by the kernel @ref NEGEMMMatrixMultiplyKernel. Data type supported: the same as @p input.
- * @param[in] beta Weight of matrix C
- */
- void configure(const ITensor *input, ITensor *output, float beta);
- /** Static function to check if given info will lead to a valid configuration of @ref NEGEMMMatrixAdditionKernel.
- *
- * @note The input and output tensor must have the same dimensions
- *
- * @param[in] input Input tensor info (Matrix C). Data types supported: F16/F32
- * @param[in] output Output tensor info. If this kernel is used to finalize the GEMM result, output contains the result obtained by the kernel @ref NEGEMMMatrixMultiplyKernel. Data type supported: the same as @p input.
- * @param[in] beta Weight of matrix C
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *input, const ITensorInfo *output, float beta);
-
- // Inherited methods overridden:
- void run(const Window &window, const ThreadInfo &info) override;
-
-private:
- /** Common signature for all the matrix addition functions
- *
- * @param[in] input An input tensor. Data types supported: F16/F32
- * @param[out] output The output tensor. Data type supported: same as @p input
- * @param[in] window Region on which to execute the kernel.
- * @param[in] beta Weight of matrix C
- */
- using MatrixAdditionFunction = void(const ITensor *input, ITensor *output, const Window &window, float beta);
- /** Matrix addition function to use for the particular tensor types passed to configure() */
- MatrixAdditionFunction *_func;
- float _beta;
-};
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_NEGEMMMATRIXADDITIONKERNEL_H */
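
A scalar model of the finalization step MTX_OUT = MTX_0 + beta * MTX_1 described above, where output already holds MTX_0 = alpha * A * B and input is matrix C:

#include <cstddef>
#include <vector>

void matrix_addition(const std::vector<float> &input, std::vector<float> &output, float beta)
{
    for (std::size_t i = 0; i < output.size(); ++i)
        output[i] += beta * input[i]; // in-place beta-weighted addition
}
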
diff --git a/arm_compute/core/NEON/kernels/NEGEMMMatrixMultiplyKernel.h b/arm_compute/core/NEON/kernels/NEGEMMMatrixMultiplyKernel.h
deleted file mode 100644
index 841e08d0ef..0000000000
--- a/arm_compute/core/NEON/kernels/NEGEMMMatrixMultiplyKernel.h
+++ /dev/null
@@ -1,94 +0,0 @@
-/*
- * Copyright (c) 2017-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_NEGEMMMATRIXMULTIPLYKERNEL_H
-#define ARM_COMPUTE_NEGEMMMATRIXMULTIPLYKERNEL_H
-
-#include "arm_compute/core/NEON/INEKernel.h"
-
-namespace arm_compute
-{
-class ITensor;
-
-/** NEON kernel to multiply two input matrices "A" and "B". All elements of the output matrix/vector will be multiplied by alpha after the matrix multiplication
- *
- * @note If the output tensor is a matrix, the implementation assumes that the input tensors @p input0 and @p input1 are both matrices reshaped respectively with @ref NEGEMMInterleave4x4Kernel and @ref NEGEMMTranspose1xWKernel
- * @note If the output tensor is a vector and the data type is F32, the implementation assumes that the first input tensor @p input0 is a vector and the second input tensor @p input1 a matrix. The implementation also assumes that both tensors have not been reshaped
- *
- */
-class NEGEMMMatrixMultiplyKernel : public INEKernel
-{
-public:
- const char *name() const override
- {
- return "NEGEMMMatrixMultiplyKernel";
- }
- /** Constructor */
- NEGEMMMatrixMultiplyKernel();
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- NEGEMMMatrixMultiplyKernel(const NEGEMMMatrixMultiplyKernel &) = delete;
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- NEGEMMMatrixMultiplyKernel &operator=(const NEGEMMMatrixMultiplyKernel &) = delete;
- /** Allow instances of this class to be moved */
- NEGEMMMatrixMultiplyKernel(NEGEMMMatrixMultiplyKernel &&) = default;
- /** Allow instances of this class to be moved */
- NEGEMMMatrixMultiplyKernel &operator=(NEGEMMMatrixMultiplyKernel &&) = default;
- /** Initialise the kernel's input and output.
- *
- * @note If the output tensor is a matrix, the input matrices @p input0 and @p input1 should be the output of the kernels: @ref NEGEMMInterleave4x4Kernel and @ref NEGEMMTranspose1xWKernel
- * These two kernels change the layout of the original matrices to be more cache-friendly.
- *
- * @param[in] input0 Input tensor containing the interleaved Matrix A or the vector A. Data types supported: F16/F32
- * @param[in] input1 Input tensor containing the transposed Matrix B if the first input tensor A is not a vector.
- * If the output tensor is a vector, input1 must contain the matrix B not reshaped. Data type supported: same as @p input0
- * @param[out] output Output tensor to store the result of matrix multiplication. Data type supported: same as @p input0.
- * @param[in] alpha Weight of the matrix product
- * @param[in] is_interleaved (Optional) True if input0 and input1 have been reshaped respectively using @ref NEGEMMInterleave4x4Kernel and @ref NEGEMMTranspose1xWKernel
- * @param[in] reshape_info (Optional) GEMM reshape info. If is_interleaved = true, this object must contain the information to understand how the matrix A and matrix B have been reshaped
- */
- void configure(const ITensor *input0, const ITensor *input1, ITensor *output, float alpha, bool is_interleaved, const GEMMReshapeInfo &reshape_info = GEMMReshapeInfo());
- /** Static function to check if given info will lead to a valid configuration of @ref NEGEMMMatrixMultiplyKernel
- *
- * @param[in] input0 Input tensor containing the interleaved Matrix A or the vector A. Data types supported: F16/F32
- * @param[in] input1 Input tensor containing the transposed Matrix B if the first input tensor A is not a vector.
- * If the output tensor is a vector, input1 must contain the matrix B not reshaped. Data type supported: same as @p input0
- * @param[in] output Output tensor to store the result of matrix multiplication. Data type supported: same as @p input0.
- * @param[in] alpha Weight of the matrix product
- * @param[in] is_interleaved (Optional) True if input0 and input1 have been reshaped respectively using @ref NEGEMMInterleave4x4Kernel and @ref NEGEMMTranspose1xWKernel
- * @param[in] reshape_info (Optional) GEMM reshape info. If is_interleaved = true, this object must contain the information to understand how the matrix A and matrix B have been reshaped
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *input0, const ITensorInfo *input1, const ITensorInfo *output, float alpha, bool is_interleaved, const GEMMReshapeInfo &reshape_info);
-
- // Inherited methods overridden:
- void run(const Window &window, const ThreadInfo &info) override;
-
-private:
- const ITensor *_input0;
- const ITensor *_input1;
- ITensor *_output;
- float _alpha;
-};
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_NEGEMMMATRIXMULTIPLYKERNEL_H*/
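
Ignoring the cache-friendly interleaved/transposed layouts, the computation this kernel performs reduces to an alpha-scaled matrix product. A scalar reference for comparison:

#include <vector>

// output = alpha * A * B, with A of size MxK and B of size KxN, row-major.
void gemm_reference(const std::vector<float> &a, const std::vector<float> &b,
                    std::vector<float> &out, int M, int N, int K, float alpha)
{
    for (int i = 0; i < M; ++i)
        for (int j = 0; j < N; ++j)
        {
            float acc = 0.f;
            for (int k = 0; k < K; ++k)
                acc += a[i * K + k] * b[k * N + j];
            out[i * N + j] = alpha * acc; // alpha applied after the dot product
        }
}
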
diff --git a/arm_compute/core/NEON/kernels/NEGEMMMatrixVectorMultiplyKernel.h b/arm_compute/core/NEON/kernels/NEGEMMMatrixVectorMultiplyKernel.h
deleted file mode 100644
index f5635dd58c..0000000000
--- a/arm_compute/core/NEON/kernels/NEGEMMMatrixVectorMultiplyKernel.h
+++ /dev/null
@@ -1,99 +0,0 @@
-/*
- * Copyright (c) 2016-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_NEGEMMMATRIXVECTORMULTIPLYKERNEL_H_
-#define ARM_COMPUTE_NEGEMMMATRIXVECTORMULTIPLYKERNEL_H_
-
-#include "arm_compute/core/NEON/INESimpleKernel.h"
-
-namespace arm_compute
-{
-class ITensor;
-
-/** Interface for the GEMM matrix vector multiply kernel. **/
-class NEGEMMMatrixVectorMultiplyKernel : public INESimpleKernel
-{
-public:
- const char *name() const override
- {
- return "NEGEMMMatrixVectorMultiplyKernel";
- }
- /** Default constructor */
- NEGEMMMatrixVectorMultiplyKernel();
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- NEGEMMMatrixVectorMultiplyKernel(const NEGEMMMatrixVectorMultiplyKernel &) = delete;
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- NEGEMMMatrixVectorMultiplyKernel &operator=(const NEGEMMMatrixVectorMultiplyKernel &) = delete;
- /** Allow instances of this class to be moved */
- NEGEMMMatrixVectorMultiplyKernel(NEGEMMMatrixVectorMultiplyKernel &&) = default;
- /** Allow instances of this class to be moved */
- NEGEMMMatrixVectorMultiplyKernel &operator=(NEGEMMMatrixVectorMultiplyKernel &&) = default;
- /** Initialise the kernel's input and output.
- *
- * @param[in] input0 First input tensor. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32
- * @param[in] input1 Second input tensor. Data types supported: same as @p input0.
- * @param[out] output Output tensor which stores the interleaved matrix. Data type supported: same as @p input0, S32 for QASYMM8/QASYMM8_SIGNED input.
- */
- void configure(const ITensor *input0, const ITensor *input1, ITensor *output);
-
- /** Static function to check if given info will lead to a valid configuration of @ref NEGEMMMatrixVectorMultiplyKernel
- *
- * @param[in] input0 First input tensor. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32
- * @param[in] input1 Second input tensor. Data types supported: same as @p input0.
- * @param[in] output Output tensor which stores the interleaved matrix. Data type supported: same as @p input0, S32 for QASYMM8/QASYMM8_SIGNED input.
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *input0, const ITensorInfo *input1, const ITensorInfo *output);
-
- // Inherited methods overridden:
- void run(const Window &window, const ThreadInfo &info) override;
- BorderSize border_size() const override;
-
-private:
- /** Template function to run the matrix vector multiplication
- *
- * @tparam I0 Input 0 type
- * @tparam I1 Input 1 type
- * @tparam O Output type
- *
- * @param[in] window_in Input region. (Must be a valid region of the window returned by window()).
- * @param[in] window_w Weights region. (Must be a valid region of the window returned by window()).
- * @param[in] window_out Output region. (Must be a valid region of the window returned by window()).
- */
- template <typename I0, typename I1, typename O>
- void matrix_vector_multiply(const Window &window_in, const Window &window_w, const Window &window_out);
- /** Common signature for all the specialised matrix vector multiplication functions */
- using GEMMMatrixVectorMultiplyFunctionPtr = void (NEGEMMMatrixVectorMultiplyKernel::*)(const Window &window_in,
- const Window &window_w,
- const Window &window_out);
-
-private:
- GEMMMatrixVectorMultiplyFunctionPtr _func;
- const ITensor *_input0;
- const ITensor *_input1;
- ITensor *_output;
- BorderSize _border_size;
-};
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_NEGEMMMATRIXVECTORMULTIPLYKERNEL_H_*/
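
For the quantized case the header notes the output is S32, since each QASYMM8/QASYMM8_SIGNED product must be accumulated in a wider type. A scalar model of the matrix * vector product with widening accumulation:

#include <cstdint>
#include <vector>

std::vector<int32_t> matvec_u8(const std::vector<uint8_t> &m, const std::vector<uint8_t> &v,
                               int rows, int cols)
{
    std::vector<int32_t> out(rows, 0);
    for (int r = 0; r < rows; ++r)
        for (int c = 0; c < cols; ++c) // widen both operands before multiplying
            out[r] += static_cast<int32_t>(m[r * cols + c]) * static_cast<int32_t>(v[c]);
    return out;
}
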
diff --git a/arm_compute/core/NEON/kernels/NEGEMMTranspose1xWKernel.h b/arm_compute/core/NEON/kernels/NEGEMMTranspose1xWKernel.h
deleted file mode 100644
index 967a1b73dc..0000000000
--- a/arm_compute/core/NEON/kernels/NEGEMMTranspose1xWKernel.h
+++ /dev/null
@@ -1,95 +0,0 @@
-/*
- * Copyright (c) 2016-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_NEGEMMTRANSPOSE1xWKERNEL_H
-#define ARM_COMPUTE_NEGEMMTRANSPOSE1xWKERNEL_H
-
-#include "arm_compute/core/NEON/INESimpleKernel.h"
-
-namespace arm_compute
-{
-// Forward declarations
-class ITensor;
-
-/** NEON kernel which transposes the elements of a matrix in chunks of 1xW, where W is equal to (16 / element size of the tensor)
- *
- * Following an example of how the transposition1xW works when the input data is F32
- *
- * @f[
- * \left( \begin{array}{cccc}
- * a00 & a01 & a02 & a03 \\
- * a10 & a11 & a12 & a13 \\
- * a20 & a21 & a22 & a23 \\
- * a30 & a31 & a32 & a33 \\
- * \end{array} \right)
- * \rightarrow
- * \left( \begin{array}{ccccccccccccccccc}
- * a00 & a01 & a02 & a03 & a10 & a11 & a12 & a13 & a20 & a21 & a22 & a23 & a30 & a31 & a32 & a33 \\
- * \end{array} \right)
- * @f]
- *
- * The following is an example of how the transpose 1xW works when the input data type is F16
- *
- * @f[
- * \left( \begin{array}{cccccccc}
- * a00 & a01 & a02 & a03 & a04 & a05 & a06 & a07 \\
- * a10 & a11 & a12 & a13 & a14 & a15 & a16 & a17 \\
- * a20 & a21 & a22 & a23 & a24 & a25 & a26 & a27 \\
- * a30 & a31 & a32 & a33 & a34 & a35 & a36 & a37 \\
- * \end{array} \right)
- * \rightarrow
- * \left( \begin{array}{cccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccc}
- * a00 & a01 & a02 & a03 & a04 & a05 & a06 & a07 & a10 & a11 & a12 & a13 & a14 & a15 & a16 & a17 & a20 & a21 & a22 & a23 & a24 & a25 & a26 & a27 & a30 & a31 & a32 & a33 & a34 & a35 & a36 & a37\\
- * \end{array} \right)
- * @f]
- *
- * @note The output matrix will have the following shape: [ height * W, ceil(width / W) ], where W = (16 / element size of the tensor)
- *
- */
-class NEGEMMTranspose1xWKernel : public INESimpleKernel
-{
-public:
- const char *name() const override
- {
- return "NEGEMMTranspose1xWKernel";
- }
- /** Initialise the kernel's input and output.
- *
- * @param[in] input Input tensor. Data types supported: All
- * @param[out] output Output tensor. Data type supported: same as @p input.
- */
- void configure(const ITensor *input, ITensor *output);
- /** Static function to check if given info will lead to a valid configuration of @ref NEGEMMTranspose1xWKernel
- *
- * @param[in] input Input tensor info. Data types supported: All
- * @param[in] output Output tensor info. Data type supported: same as @p input.
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *input, const ITensorInfo *output);
-
- // Inherited methods overridden:
- void run(const Window &window, const ThreadInfo &info) override;
-};
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_NEGEMMTRANSPOSE1xWKERNEL_H */
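
A scalar model of the 1xW transpose documented above: each input row is emitted in chunks of W = 16 / sizeof(T) contiguous elements, giving an output of shape [height * W, ceil(width / W)]. The sketch below assumes the width is a multiple of W (the kernel pads otherwise):

#include <cstring>
#include <vector>

template <typename T>
std::vector<T> transpose_1xW(const std::vector<T> &in, int height, int width)
{
    const int      W = static_cast<int>(16 / sizeof(T)); // 4 for F32, 8 for F16, 16 for U8
    std::vector<T> out(in.size());
    for (int y = 0; y < height; ++y)
        for (int x = 0; x < width; x += W)
        {
            // Chunk [x, x+W) of input row y becomes one W-element segment
            // of output row x/W, at horizontal position y*W.
            const int out_row = x / W;
            std::memcpy(&out[(out_row * height + y) * W], &in[y * width + x], W * sizeof(T));
        }
    return out;
}
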
diff --git a/arm_compute/core/NEON/kernels/NEGatherKernel.h b/arm_compute/core/NEON/kernels/NEGatherKernel.h
deleted file mode 100644
index bfef40b53b..0000000000
--- a/arm_compute/core/NEON/kernels/NEGatherKernel.h
+++ /dev/null
@@ -1,113 +0,0 @@
-/*
- * Copyright (c) 2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#ifndef ARM_COMPUTE_NEGATHERKERNEL_H
-#define ARM_COMPUTE_NEGATHERKERNEL_H
-
-#include "arm_compute/core/NEON/INEKernel.h"
-#include "arm_compute/core/Types.h"
-
-namespace arm_compute
-{
-// Forward declarations
-class ITensor;
-
- /** Kernel to perform the gather operation on NEON */
-class NEGatherKernel : public INEKernel
-{
-public:
- /** Default constructor. */
- NEGatherKernel();
- /** Prevent instances of this class from being copied (As this class contains pointers). */
- NEGatherKernel(const NEGatherKernel &) = delete;
- /** Prevent instances of this class from being copied (As this class contains pointers). */
- NEGatherKernel &operator=(const NEGatherKernel &) = delete;
- /** Allow instances of this class to be moved. */
- NEGatherKernel(NEGatherKernel &&) = default;
- /** Allow instances of this class to be moved. */
- NEGatherKernel &operator=(NEGatherKernel &&) = default;
- /** Default destructor */
- ~NEGatherKernel() = default;
-
- /** Name of the kernel
- *
- * @return Kernel name
- */
- const char *name() const override
- {
- return "NEGatherKernel";
- }
- /** Initialise the kernel's inputs and outputs
- *
- * @param[in] input Source tensor. Supported tensor rank: up to 4. Data type supported: All
- * @param[in] indices Indices tensor. Supported tensor rank: up to 1. Must be one of the following types: U32/S32. Each value must be in range [0, input.shape[@p axis])
- * @param[out] output Destination tensor. Data type supported: Same as @p input
- * @param[in] axis (Optional) The axis in @p input to gather @p indices from. Negative values wrap around. Defaults to 0
- */
- void configure(const ITensor *input, const ITensor *indices, ITensor *output, int axis = 0);
- /** Static function to check if given info will lead to a valid configuration of @ref NEGatherKernel
- *
- * @param[in] input Source tensor info. Supported tensor rank: up to 4. Data type supported: All
- * @param[in] indices Indices tensor info. Supported tensor rank: up to 1. Must be one of the following types: U32/S32. Each value must be in range [0, input.shape[@p axis])
- * @param[in] output Destination tensor info. Data type supported: Same as @p input
- * @param[in] axis (Optional) The axis in @p input to gather @p indices from. Negative values wrap around. Defaults to 0
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *input, const ITensorInfo *indices, const ITensorInfo *output, int axis);
-
- // Inherited methods overridden:
- void run(const Window &window, const ThreadInfo &info) override;
-
-private:
- /** Implementation of the gather operation for 0 axis.
- *
- * For gather on the 0 axis, an element-by-element copy is performed.
- *
- * @param[in] window Region on which to execute the kernel. (Must be a region of the window returned by window())
- * @param[in] info Info about executing thread and CPU.
- */
- template <typename U>
- void gather_0_axis(const Window &window, const ThreadInfo &info);
-
- /** Implementation of the gather operation.
- *
- * For axis >= 1, a row-wise copy takes place.
- *
- * @param[in] window Region on which to execute the kernel. (Must be a region of the window returned by window())
- * @param[in] info Info about executing thread and CPU.
- */
- template <typename U>
- void gather_n_axis(const Window &window, const ThreadInfo &info);
-
- using kernel_ptr = void (NEGatherKernel::*)(const Window &window, const ThreadInfo &info);
-
- const ITensor *_input;
- const ITensor *_indices;
- int _axis;
- ITensor *_output;
- kernel_ptr _func;
-};
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_NEGATHERKERNEL_H */
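
The two private specializations above exist because of memory layout: with axis 0 innermost (contiguous), gathering on axis 0 must copy single elements, while gathering on a higher axis can copy whole contiguous runs. A scalar 2D model:

#include <cstdint>
#include <vector>

std::vector<float> gather_2d(const std::vector<float> &in, int dim0, int dim1,
                             const std::vector<uint32_t> &idx, int axis)
{
    std::vector<float> out;
    if (axis == 0)
    {
        for (int y = 0; y < dim1; ++y)
            for (uint32_t i : idx) // element-by-element copy
                out.push_back(in[y * dim0 + i]);
    }
    else // axis == 1
    {
        for (uint32_t i : idx)     // row-wise copy of a contiguous run
            out.insert(out.end(), in.begin() + i * dim0, in.begin() + (i + 1) * dim0);
    }
    return out;
}
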
diff --git a/arm_compute/core/NEON/kernels/NEGaussian3x3Kernel.h b/arm_compute/core/NEON/kernels/NEGaussian3x3Kernel.h
deleted file mode 100644
index fa92eef1b7..0000000000
--- a/arm_compute/core/NEON/kernels/NEGaussian3x3Kernel.h
+++ /dev/null
@@ -1,54 +0,0 @@
-/*
- * Copyright (c) 2016-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_NEGAUSSIAN3x3KERNEL_H
-#define ARM_COMPUTE_NEGAUSSIAN3x3KERNEL_H
-
-#include "arm_compute/core/NEON/INESimpleKernel.h"
-
-namespace arm_compute
-{
-class ITensor;
-
-/** NEON kernel to perform a Gaussian 3x3 filter */
-class NEGaussian3x3Kernel : public INESimpleKernel
-{
-public:
- const char *name() const override
- {
- return "NEGaussian3x3Kernel";
- }
- /** Set the source, destination and border mode of the kernel
- *
- * @param[in] input Source tensor. Data type supported: U8
- * @param[out] output Destination tensor. Data type supported: S16
- * @param[in] border_undefined True if the border mode is undefined. False if it's replicate or constant.
- */
- void configure(const ITensor *input, ITensor *output, bool border_undefined);
-
- // Inherited methods overridden:
- void run(const Window &window, const ThreadInfo &info) override;
- BorderSize border_size() const override;
-};
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_NEGAUSSIAN3x3KERNEL_H */
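
For reference, a scalar model of a 3x3 Gaussian filter assuming the common 1/16 binomial kernel [1 2 1; 2 4 2; 1 2 1]; the header above does not state the coefficients, so treat them as an assumption. Interior pixels only, since border handling is configured separately via border_undefined:

#include <cstdint>

int16_t gaussian3x3_at(const uint8_t *src, int stride, int x, int y)
{
    static const int k[3][3] = {{1, 2, 1}, {2, 4, 2}, {1, 2, 1}};
    int sum = 0;
    for (int dy = -1; dy <= 1; ++dy)
        for (int dx = -1; dx <= 1; ++dx)
            sum += k[dy + 1][dx + 1] * src[(y + dy) * stride + (x + dx)];
    return static_cast<int16_t>(sum / 16); // normalise by the kernel weight sum
}
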
diff --git a/arm_compute/core/NEON/kernels/NEGaussian5x5Kernel.h b/arm_compute/core/NEON/kernels/NEGaussian5x5Kernel.h
deleted file mode 100644
index 5e63e5136f..0000000000
--- a/arm_compute/core/NEON/kernels/NEGaussian5x5Kernel.h
+++ /dev/null
@@ -1,81 +0,0 @@
-/*
- * Copyright (c) 2016-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_NEGAUSSIAN5x5KERNEL_H
-#define ARM_COMPUTE_NEGAUSSIAN5x5KERNEL_H
-
-#include "arm_compute/core/NEON/INESimpleKernel.h"
-
-namespace arm_compute
-{
-class ITensor;
-
-/** NEON kernel to perform a Gaussian 5x5 filter (horizontal pass) */
-class NEGaussian5x5HorKernel : public INESimpleKernel
-{
-public:
- const char *name() const override
- {
- return "NEGaussian5x5HorKernel";
- }
- /** Default constructor */
- NEGaussian5x5HorKernel();
-
- /** Initialise the kernel's source, destination and border mode.
- *
- * @param[in] input Source tensor. Data type supported: U8.
- * @param[out] output Destination tensor. Data type supported: S16.
- * @param[in] border_undefined True if the border mode is undefined. False if it's replicate or constant.
- */
- void configure(const ITensor *input, ITensor *output, bool border_undefined);
-
- // Inherited methods overridden:
- void run(const Window &window, const ThreadInfo &info) override;
- BorderSize border_size() const override;
-
-private:
- BorderSize _border_size;
-};
-
-/** NEON kernel to perform a Gaussian 5x5 filter (vertical pass) */
-class NEGaussian5x5VertKernel : public INESimpleKernel
-{
-public:
- const char *name() const override
- {
- return "NEGaussian5x5VertKernel";
- }
- /** Initialise the kernel's source, destination and border mode.
- *
- * @param[in] input Source tensor. Data type supported: S16.
- * @param[out] output Destination tensor. Data type supported: U8.
- * @param[in] border_undefined True if the border mode is undefined. False if it's replicate or constant.
- */
- void configure(const ITensor *input, ITensor *output, bool border_undefined);
-
- // Inherited methods overridden:
- void run(const Window &window, const ThreadInfo &info) override;
- BorderSize border_size() const override;
-};
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_NEGAUSSIAN5x5KERNEL_H */
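
The 5x5 Gaussian is split into two kernels because the filter is separable: a horizontal 1x5 pass (U8 -> S16) followed by a vertical 5x1 pass (S16 -> U8) does the same work in 10 taps instead of 25. A scalar sketch assuming the binomial taps [1 4 6 4 1] with a final /256 normalisation (the header does not state the coefficients):

#include <cstdint>

static const int taps5[5] = {1, 4, 6, 4, 1};

int16_t hor_pass_at(const uint8_t *src, int x) // along a row
{
    int sum = 0;
    for (int i = -2; i <= 2; ++i)
        sum += taps5[i + 2] * src[x + i];
    return static_cast<int16_t>(sum);          // intermediate kept at full precision
}

uint8_t vert_pass_at(const int16_t *tmp, int stride, int x, int y) // along a column
{
    int sum = 0;
    for (int i = -2; i <= 2; ++i)
        sum += taps5[i + 2] * tmp[(y + i) * stride + x];
    return static_cast<uint8_t>(sum / 256);    // 16 * 16 normalisation over both passes
}
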
diff --git a/arm_compute/core/NEON/kernels/NEGaussianPyramidKernel.h b/arm_compute/core/NEON/kernels/NEGaussianPyramidKernel.h
deleted file mode 100644
index 4700325b5f..0000000000
--- a/arm_compute/core/NEON/kernels/NEGaussianPyramidKernel.h
+++ /dev/null
@@ -1,105 +0,0 @@
-/*
- * Copyright (c) 2016-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_NEGAUSSIANPYRAMIDKERNEL_H
-#define ARM_COMPUTE_NEGAUSSIANPYRAMIDKERNEL_H
-
-#include "arm_compute/core/NEON/INESimpleKernel.h"
-
-namespace arm_compute
-{
-class ITensor;
-
-/** NEON kernel to perform a GaussianPyramid (horizontal pass) */
-class NEGaussianPyramidHorKernel : public INESimpleKernel
-{
-public:
- const char *name() const override
- {
- return "NEGaussianPyramidHorKernel";
- }
- /** Default constructor */
- NEGaussianPyramidHorKernel();
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- NEGaussianPyramidHorKernel(NEGaussianPyramidHorKernel &) = delete;
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- NEGaussianPyramidHorKernel &operator=(NEGaussianPyramidHorKernel &) = delete;
- /** Allow instances of this class to be moved */
- NEGaussianPyramidHorKernel(NEGaussianPyramidHorKernel &&) = default;
- /** Allow instances of this class to be moved */
- NEGaussianPyramidHorKernel &operator=(NEGaussianPyramidHorKernel &&) = default;
- /** Default destructor */
- ~NEGaussianPyramidHorKernel() = default;
-
- /** Initialise the kernel's source, destination and border mode.
- *
- * @param[in] input Source tensor. Data type supported: U8.
- * @param[out] output Destination tensor. Output should have half the input width. Data type supported: S16.
- */
- void configure(const ITensor *input, ITensor *output);
-
- // Inherited methods overridden:
- void run(const Window &window, const ThreadInfo &info) override;
- BorderSize border_size() const override;
-
-private:
- int _l2_load_offset;
-};
-
-/** NEON kernel to perform a GaussianPyramid (vertical pass) */
-class NEGaussianPyramidVertKernel : public INESimpleKernel
-{
-public:
- const char *name() const override
- {
- return "NEGaussianPyramidVertKernel";
- }
- /** Default constructor */
- NEGaussianPyramidVertKernel();
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- NEGaussianPyramidVertKernel(NEGaussianPyramidVertKernel &) = delete;
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- NEGaussianPyramidVertKernel &operator=(NEGaussianPyramidVertKernel &) = delete;
- /** Allow instances of this class to be moved */
- NEGaussianPyramidVertKernel(NEGaussianPyramidVertKernel &&) = default;
- /** Allow instances of this class to be moved */
- NEGaussianPyramidVertKernel &operator=(NEGaussianPyramidVertKernel &&) = default;
- /** Default destructor */
- ~NEGaussianPyramidVertKernel() = default;
-
- /** Initialise the kernel's source, destination and border mode.
- *
- * @param[in] input Source tensor. Data type supported: S16.
- * @param[out] output Destination tensor. Output should have half the input height. Data type supported: U8.
- */
- void configure(const ITensor *input, ITensor *output);
-
- // Inherited methods overridden:
- void run(const Window &window, const ThreadInfo &info) override;
- BorderSize border_size() const override;
-
-private:
- int _t2_load_offset;
-};
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_NEGAUSSIANPYRAMIDKERNEL_H */
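The pyramid pair above is the decimating version of the same filter: convolve with [1 4 6 4 1] and keep every second sample, so the output halves in width (horizontal pass) and then in height (vertical pass). A hypothetical scalar model of the horizontal pass only (layout and border policy assumed):

    #include <cstddef>
    #include <cstdint>

    // Horizontal pyramid pass: 5-tap smoothing plus decimation by 2 (U8 -> S16).
    void pyramid_hor(const uint8_t *src, int16_t *dst, size_t w, size_t h)
    {
        const size_t half_w = w / 2;
        for(size_t y = 0; y < h; ++y)
            for(size_t xo = 1; xo + 1 < half_w; ++xo)
            {
                const uint8_t *p = src + y * w + 2 * xo; // sample every second column
                dst[y * half_w + xo] = p[-2] + 4 * p[-1] + 6 * p[0] + 4 * p[1] + p[2];
            }
    }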
diff --git a/arm_compute/core/NEON/kernels/NEGenerateProposalsLayerKernel.h b/arm_compute/core/NEON/kernels/NEGenerateProposalsLayerKernel.h
deleted file mode 100644
index 382ce54518..0000000000
--- a/arm_compute/core/NEON/kernels/NEGenerateProposalsLayerKernel.h
+++ /dev/null
@@ -1,85 +0,0 @@
-/*
- * Copyright (c) 2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_NEGENERATEPROPOSALSLAYERKERNEL_H
-#define ARM_COMPUTE_NEGENERATEPROPOSALSLAYERKERNEL_H
-
-#include "arm_compute/core/NEON/INEKernel.h"
-namespace arm_compute
-{
-class ITensor;
-
-/** Interface for Compute All Anchors kernel */
-class NEComputeAllAnchorsKernel : public INEKernel
-{
-public:
- const char *name() const override
- {
- return "NEComputeAllAnchorsKernel";
- }
-
- /** Default constructor */
- NEComputeAllAnchorsKernel();
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- NEComputeAllAnchorsKernel(const NEComputeAllAnchorsKernel &) = delete;
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- NEComputeAllAnchorsKernel &operator=(const NEComputeAllAnchorsKernel &) = delete;
- /** Allow instances of this class to be moved */
- NEComputeAllAnchorsKernel(NEComputeAllAnchorsKernel &&) = default;
- /** Allow instances of this class to be moved */
- NEComputeAllAnchorsKernel &operator=(NEComputeAllAnchorsKernel &&) = default;
- /** Default destructor */
- ~NEComputeAllAnchorsKernel() = default;
-
- /** Set the input and output tensors.
- *
- * @param[in] anchors Source tensor. Original set of anchors of size (4, A), where A is the number of anchors. Data types supported: QSYMM16/F16/F32
- * @param[out] all_anchors Destination tensor. Destination anchors of size (4, H*W*A) where H and W are the height and width of the feature map and A is the number of anchors. Data types supported: Same as @p anchors
- * @param[in] info Contains Compute Anchors operation information described in @ref ComputeAnchorsInfo
- *
- */
- void configure(const ITensor *anchors, ITensor *all_anchors, const ComputeAnchorsInfo &info);
-
- /** Static function to check if given info will lead to a valid configuration of @ref NEComputeAllAnchorsKernel
- *
- * @param[in] anchors Source tensor info. Original set of anchors of size (4, A), where A is the number of anchors. Data types supported: QSYMM16/F16/F32
- * @param[in] all_anchors Destination tensor info. Destination anchors of size (4, H*W*A) where H and W are the height and width of the feature map and A is the number of anchors. Data types supported: Same as @p anchors
- * @param[in] info Contains Compute Anchors operation information described in @ref ComputeAnchorsInfo
- *
- * @return a Status
- */
- static Status validate(const ITensorInfo *anchors, const ITensorInfo *all_anchors, const ComputeAnchorsInfo &info);
-
- // Inherited methods overridden:
- void run(const Window &window, const ThreadInfo &info) override;
-
-private:
- template <typename T>
- void internal_run(const Window &window);
-
- const ITensor *_anchors;
- ITensor *_all_anchors;
- ComputeAnchorsInfo _anchors_info;
-};
-} // namespace arm_compute
-#endif // ARM_COMPUTE_NEGENERATEPROPOSALSLAYERKERNEL_H
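What configure() above computes can be modelled in a few lines: every base anchor (x1, y1, x2, y2) is replicated at each feature-map cell, shifted by the cell coordinates times the network stride. The function name, the iteration order and the stride = 1 / spatial_scale convention are assumptions for illustration, not the library API:

    #include <cstddef>
    #include <vector>

    // Expand A base anchors (4 * A floats) to one anchor set per feature-map cell.
    std::vector<float> compute_all_anchors(const std::vector<float> &anchors,
                                           size_t feat_w, size_t feat_h, float stride)
    {
        const size_t A = anchors.size() / 4;
        std::vector<float> out;
        out.reserve(4 * A * feat_w * feat_h);
        for(size_t y = 0; y < feat_h; ++y)
            for(size_t x = 0; x < feat_w; ++x)
                for(size_t a = 0; a < A; ++a)
                {
                    out.push_back(anchors[4 * a + 0] + x * stride); // x1
                    out.push_back(anchors[4 * a + 1] + y * stride); // y1
                    out.push_back(anchors[4 * a + 2] + x * stride); // x2
                    out.push_back(anchors[4 * a + 3] + y * stride); // y2
                }
        return out;
    }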
diff --git a/arm_compute/core/NEON/kernels/NEHOGDescriptorKernel.h b/arm_compute/core/NEON/kernels/NEHOGDescriptorKernel.h
deleted file mode 100644
index edb2da58e2..0000000000
--- a/arm_compute/core/NEON/kernels/NEHOGDescriptorKernel.h
+++ /dev/null
@@ -1,149 +0,0 @@
-/*
- * Copyright (c) 2016-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_NEHOGDESCRIPTORKERNEL_H
-#define ARM_COMPUTE_NEHOGDESCRIPTORKERNEL_H
-
-#include "arm_compute/core/IHOG.h"
-#include "arm_compute/core/NEON/INEKernel.h"
-#include "arm_compute/core/Size2D.h"
-
-namespace arm_compute
-{
-class ITensor;
-
-/** NEON kernel to perform HOG Orientation Binning */
-class NEHOGOrientationBinningKernel : public INEKernel
-{
-public:
- const char *name() const override
- {
- return "NEHOGOrientationBinningKernel";
- }
- /** Default constructor */
- NEHOGOrientationBinningKernel();
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- NEHOGOrientationBinningKernel(const NEHOGOrientationBinningKernel &) = delete;
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- NEHOGOrientationBinningKernel &operator=(const NEHOGOrientationBinningKernel &) = delete;
- /** Allow instances of this class to be moved */
- NEHOGOrientationBinningKernel(NEHOGOrientationBinningKernel &&) = default;
- /** Allow instances of this class to be moved */
- NEHOGOrientationBinningKernel &operator=(NEHOGOrientationBinningKernel &&) = default;
- /** Default destructor */
- ~NEHOGOrientationBinningKernel() = default;
-
- /** Initialise the kernel's inputs, output and HOG's metadata
- *
- * @param[in] input_magnitude Input tensor which stores the magnitude of the gradient for each pixel. Data type supported: S16.
- * @param[in] input_phase Input tensor which stores the phase of the gradient for each pixel. Data type supported: U8
- * @param[out] output Output tensor which stores the local HOG for each cell. Data type supported: F32. Number of channels supported: equal to the number of histogram bins per cell
- * @param[in] hog_info HOG's metadata
- */
- void configure(const ITensor *input_magnitude, const ITensor *input_phase, ITensor *output, const HOGInfo *hog_info);
-
- // Inherited methods overridden:
- void run(const Window &window, const ThreadInfo &info) override;
-
-private:
- /** Common signature for all the specialised block normalization functions
- *
- * @param[in] mag_row_ptr Pointer to the first row of the cell in the magnitude tensor
- * @param[in] phase_row_ptr Pointer to the first row of the cell in the phase tensor
- * @param[out] output_ptr Pointer to the output cell of hog space tensor
- * @param[in] mag_stride Stride of the magnitude tensor
- * @param[in] phase_stride Stride of the phase tensor
- * @param[in] cell_width Width of the cell
- * @param[in] cell_height Height of the cell
- * @param[in] num_bins Number of bins for each cell
- * @param[in] phase_scale Scale factor to apply to the phase in order to calculate the histogram index
- */
- using OrientBinFunc = void(const int16_t *__restrict mag_row_ptr, const uint8_t *__restrict phase_row_ptr, float *__restrict output_ptr, size_t mag_stride, size_t phase_stride, size_t cell_width,
- size_t cell_height, size_t num_bins, float phase_scale);
- /** Orientation binning function to use for the particular cell width passed to configure() */
- OrientBinFunc *_func;
- const ITensor *_input_magnitude;
- const ITensor *_input_phase;
- ITensor *_output;
- size_t _cell_width;
- size_t _cell_height;
- size_t _num_bins;
- float _phase_scale;
-};
-
-/** NEON kernel to perform HOG block normalization */
-class NEHOGBlockNormalizationKernel : public INEKernel
-{
-public:
- const char *name() const override
- {
- return "NEHOGBlockNormalizationKernel";
- }
- /** Default constructor */
- NEHOGBlockNormalizationKernel();
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- NEHOGBlockNormalizationKernel(const NEHOGBlockNormalizationKernel &) = delete;
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- NEHOGBlockNormalizationKernel &operator=(const NEHOGBlockNormalizationKernel &) = delete;
- /** Allow instances of this class to be moved */
- NEHOGBlockNormalizationKernel(NEHOGBlockNormalizationKernel &&) = default;
- /** Allow instances of this class to be moved */
- NEHOGBlockNormalizationKernel &operator=(NEHOGBlockNormalizationKernel &&) = default;
- /** Default destructor */
- ~NEHOGBlockNormalizationKernel() = default;
-
- /** Initialise the kernel's input, output and HOG's metadata
- *
- * @param[in] input Input tensor which stores the local HOG for each cell. Data type supported: F32. Number of channels supported: equal to the number of histogram bins per cell
- * @param[out] output Output tensor which stores the normalised blocks. Data type supported: F32. Number of channels supported: equal to the number of histogram bins per block
- * @param[in] hog_info HOG's metadata
- */
- void configure(const ITensor *input, ITensor *output, const HOGInfo *hog_info);
-
- // Inherited methods overridden:
- void run(const Window &window, const ThreadInfo &info) override;
-
-private:
- /** Common signature for all the specialised block normalization functions
- *
- * @param[in] input_row_ptr Pointer to the first row of the block in the input hog space tensor
- * @param[out] output_ptr Pointer to the output block of the hog normalized space
- * @param[in] input_stride Stride of the input hog space tensor
- * @param[in] num_cells_per_block_height Number of cells per block along the Y direction
- * @param[in] num_bins_block_x Number of bins per block along the X direction
- * @param[in] num_bins_block Number of total bins per block
- * @param[in] l2_hyst_threshold Threshold to use for l2 hysteresis normalization
- */
- using BlockNormFunc = void(const float *input_row_ptr, float *output_ptr, size_t input_stride, size_t num_cells_per_block_height, size_t num_bins_block_x, size_t num_bins_block,
- float l2_hyst_threshold);
- /** Block normalization function to use for the particular normalization type passed to configure() */
- BlockNormFunc *_func;
- const ITensor *_input;
- ITensor *_output;
- Size2D _num_cells_per_block;
- Size2D _num_cells_per_block_stride;
- size_t _num_bins;
- float _l2_hyst_threshold;
-};
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_NEHOGDESCRIPTORKERNEL_H */
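The orientation-binning kernel above reduces each cell to a histogram: every pixel votes its gradient magnitude into the bin selected by its quantized phase. A nearest-bin scalar sketch (the real kernel is specialised per cell width and may distribute votes differently; phase_scale is the documented factor mapping a U8 phase to a bin index):

    #include <cstddef>
    #include <cstdint>

    // Build the num_bins histogram of one cell from S16 magnitudes and U8 phases.
    void cell_histogram(const int16_t *mag, const uint8_t *phase, float *hist,
                        size_t stride, size_t cell_w, size_t cell_h,
                        size_t num_bins, float phase_scale)
    {
        for(size_t b = 0; b < num_bins; ++b)
            hist[b] = 0.f;
        for(size_t y = 0; y < cell_h; ++y)
            for(size_t x = 0; x < cell_w; ++x)
            {
                const size_t bin = static_cast<size_t>(phase[y * stride + x] * phase_scale) % num_bins;
                hist[bin] += static_cast<float>(mag[y * stride + x]);
            }
    }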
diff --git a/arm_compute/core/NEON/kernels/NEHOGDetectorKernel.h b/arm_compute/core/NEON/kernels/NEHOGDetectorKernel.h
deleted file mode 100644
index acb35923d4..0000000000
--- a/arm_compute/core/NEON/kernels/NEHOGDetectorKernel.h
+++ /dev/null
@@ -1,89 +0,0 @@
-/*
- * Copyright (c) 2016-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_NEHOGDETECTORKERNEL_H
-#define ARM_COMPUTE_NEHOGDETECTORKERNEL_H
-
-#include "arm_compute/core/IArray.h"
-#include "arm_compute/core/IHOG.h"
-#include "arm_compute/core/NEON/INEKernel.h"
-#include "support/Mutex.h"
-
-namespace arm_compute
-{
-class ITensor;
-
-/** NEON kernel to perform HOG detector kernel using linear SVM */
-class NEHOGDetectorKernel : public INEKernel
-{
-public:
- const char *name() const override
- {
- return "NEHOGDetectorKernel";
- }
- /** Default constructor */
- NEHOGDetectorKernel();
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- NEHOGDetectorKernel(const NEHOGDetectorKernel &) = delete;
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- NEHOGDetectorKernel &operator=(const NEHOGDetectorKernel &) = delete;
- /** Prevent instances of this class from being moved (As this class contains non movable objects) */
- NEHOGDetectorKernel(NEHOGDetectorKernel &&) = delete;
- /** Prevent instances of this class from being moved (As this class contains non movable objects) */
- NEHOGDetectorKernel &operator=(NEHOGDetectorKernel &&) = delete;
- /** Default destructor */
- ~NEHOGDetectorKernel() = default;
-
- /** Initialise the kernel's input, HOG data-object, detection window, the stride of the detection window, the threshold and index of the object to detect
- *
- * @param[in] input Input tensor which stores the HOG descriptor obtained with @ref NEHOGOrientationBinningKernel. Data type supported: F32. Number of channels supported: equal to the number of histogram bins per block
- * @param[in] hog HOG data object used by @ref NEHOGOrientationBinningKernel and @ref NEHOGBlockNormalizationKernel
- * @param[out] detection_windows Array of @ref DetectionWindow. This array stores all the detected objects
- * @param[in] detection_window_stride Distance in pixels between 2 consecutive detection windows in x and y directions.
- * It must be multiple of the hog->info()->block_stride()
- * @param[in] threshold (Optional) Threshold for the distance between features and SVM classifying plane
- * @param[in] idx_class (Optional) Index of the class used for evaluating which class the detection window belongs to
- */
- void configure(const ITensor *input, const IHOG *hog, IDetectionWindowArray *detection_windows, const Size2D &detection_window_stride, float threshold = 0.0f, uint16_t idx_class = 0);
-
- // Inherited methods overridden:
- void run(const Window &window, const ThreadInfo &info) override;
-
-private:
- const ITensor *_input;
- IDetectionWindowArray *_detection_windows;
- const float *_hog_descriptor;
- float _bias;
- float _threshold;
- uint16_t _idx_class;
- size_t _num_bins_per_descriptor_x;
- size_t _num_blocks_per_descriptor_y;
- size_t _block_stride_width;
- size_t _block_stride_height;
- size_t _detection_window_width;
- size_t _detection_window_height;
- size_t _max_num_detection_windows;
- arm_compute::Mutex _mutex;
-};
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_NEHOGDETECTORKERNEL_H */
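The detector above is a sliding-window linear SVM: each candidate window's block descriptors are flattened and scored against the trained model, and windows whose score exceeds the threshold are appended (under the mutex) to the detection array. The scoring step in isolation, with assumed names:

    #include <cstddef>

    // Linear-SVM test for one window: dot(descriptor, model) + bias > threshold.
    bool window_matches(const float *window_descriptor, const float *model,
                        size_t length, float bias, float threshold)
    {
        float score = bias;
        for(size_t i = 0; i < length; ++i)
            score += window_descriptor[i] * model[i];
        return score > threshold;
    }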
diff --git a/arm_compute/core/NEON/kernels/NEHarrisCornersKernel.h b/arm_compute/core/NEON/kernels/NEHarrisCornersKernel.h
deleted file mode 100644
index a77fe16ac2..0000000000
--- a/arm_compute/core/NEON/kernels/NEHarrisCornersKernel.h
+++ /dev/null
@@ -1,105 +0,0 @@
-/*
- * Copyright (c) 2016-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_NEHARRISCORNERSKERNEL_H
-#define ARM_COMPUTE_NEHARRISCORNERSKERNEL_H
-
-#include "arm_compute/core/CPP/kernels/CPPCornerCandidatesKernel.h"
-#include "arm_compute/core/CPP/kernels/CPPSortEuclideanDistanceKernel.h"
-#include "arm_compute/core/IArray.h"
-#include "arm_compute/core/NEON/INEKernel.h"
-
-#include <cstdint>
-
-namespace arm_compute
-{
-class ITensor;
-using IImage = ITensor;
-
-/** Common interface for all Harris Score kernels */
-class INEHarrisScoreKernel : public INEKernel
-{
-public:
- /** Default constructor */
- INEHarrisScoreKernel();
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- INEHarrisScoreKernel(const INEHarrisScoreKernel &) = delete;
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- INEHarrisScoreKernel &operator=(const INEHarrisScoreKernel &) = delete;
- /** Allow instances of this class to be moved */
- INEHarrisScoreKernel(INEHarrisScoreKernel &&) = default;
- /** Allow instances of this class to be moved */
- INEHarrisScoreKernel &operator=(INEHarrisScoreKernel &&) = default;
- /** Default destructor */
- ~INEHarrisScoreKernel() = default;
-
-public:
- /** Set up the kernel parameters
- *
- * @param[in] input1 Source image (gradient X). Data types supported: S16/S32
- * @param[in] input2 Source image (gradient Y). Data types supported: same as @p input1
- * @param[out] output Destination image (harris score). Data types supported: F32
- * @param[in] norm_factor Normalization factor to use according to the gradient size (must be non-zero)
- * @param[in] strength_thresh Minimum threshold with which to eliminate Harris Corner scores (computed using the normalized Sobel kernel).
- * @param[in] sensitivity Sensitivity threshold k from the Harris-Stephens equation
- * @param[in] border_undefined True if the border mode is undefined. False if it's replicate or constant.
- */
- virtual void configure(const IImage *input1, const IImage *input2, IImage *output, float norm_factor, float strength_thresh, float sensitivity, bool border_undefined) = 0;
-
-protected:
- const IImage *_input1; /**< Source image - Gx component */
- const IImage *_input2; /**< Source image - Gy component */
- IImage *_output; /**< Destination image - Harris score */
- float _sensitivity; /**< Sensitivity value */
- float _strength_thresh; /**< Threshold value */
- float _norm_factor; /**< Normalization factor */
- BorderSize _border_size; /**< Border size */
-};
-
-/** Template NEON kernel to perform Harris Score.
- * The implementation supports block sizes of 3, 5, and 7.
- */
-template <int32_t block_size>
-class NEHarrisScoreKernel : public INEHarrisScoreKernel
-{
-public:
- const char *name() const override
- {
- return "NEHarrisScoreKernel";
- }
- /** Default constructor */
- NEHarrisScoreKernel();
- // Inherited methods overridden:
- void configure(const IImage *input1, const IImage *input2, IImage *output, float norm_factor, float strength_thresh, float sensitivity, bool border_undefined) override;
- BorderSize border_size() const override;
- void run(const Window &window, const ThreadInfo &info) override;
-
-private:
- /** Common signature for all the specialised harris score functions */
- using HarrisScoreFunction = void(const void *__restrict input1_ptr, const void *__restrict input2_ptr, void *__restrict output_ptr, int32_t input_stride,
- float norm_factor, float sensitivity, float strength_thresh);
- /** Harris Score function to use for the particular image types passed to configure() */
- HarrisScoreFunction *_func;
-};
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_NEHARRISCORNERSKERNEL_H */
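Behind the template above is the Harris-Stephens response: accumulate the structure tensor M over a block_size x block_size window of the (normalized) gradients, then score det(M) - k * trace(M)^2. A scalar sketch for one pixel, with gx/gy assumed to point at the centre of the block:

    #include <cstddef>
    #include <cstdint>

    // Harris response at one pixel; k is the sensitivity from the header docs.
    float harris_score(const int16_t *gx, const int16_t *gy, size_t stride,
                       int block_size, float norm_factor, float k)
    {
        const int half = block_size / 2;
        const int s    = static_cast<int>(stride);
        float a = 0.f, b = 0.f, c = 0.f; // M = [a b; b c]
        for(int y = -half; y <= half; ++y)
            for(int x = -half; x <= half; ++x)
            {
                const float ix = gx[y * s + x] * norm_factor;
                const float iy = gy[y * s + x] * norm_factor;
                a += ix * ix;
                b += ix * iy;
                c += iy * iy;
            }
        const float det   = a * c - b * b;
        const float trace = a + c;
        return det - k * trace * trace; // compared against strength_thresh by the caller
    }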
diff --git a/arm_compute/core/NEON/kernels/NEHeightConcatenateLayerKernel.h b/arm_compute/core/NEON/kernels/NEHeightConcatenateLayerKernel.h
deleted file mode 100644
index be81f2e963..0000000000
--- a/arm_compute/core/NEON/kernels/NEHeightConcatenateLayerKernel.h
+++ /dev/null
@@ -1,85 +0,0 @@
-/*
- * Copyright (c) 2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#ifndef ARM_COMPUTE_NEHEIGHTCONCATENATELAYERKERNEL_H
-#define ARM_COMPUTE_NEHEIGHTCONCATENATELAYERKERNEL_H
-
-#include "arm_compute/core/NEON/INEKernel.h"
-#include "arm_compute/core/Types.h"
-
-namespace arm_compute
-{
-// Forward declarations
-class ITensor;
-
-/** Interface for the height concatenate kernel.
- * The input tensor will be concatenated into the output tensor.
- */
-class NEHeightConcatenateLayerKernel : public INEKernel
-{
-public:
- const char *name() const override
- {
- return "NEHeightConcatenateLayerKernel";
- }
- /** Default constructor */
- NEHeightConcatenateLayerKernel();
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- NEHeightConcatenateLayerKernel(const NEHeightConcatenateLayerKernel &) = delete;
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- NEHeightConcatenateLayerKernel &operator=(const NEHeightConcatenateLayerKernel &) = delete;
- /** Allow instances of this class to be moved */
- NEHeightConcatenateLayerKernel(NEHeightConcatenateLayerKernel &&) = default;
- /** Allow instances of this class to be moved */
- NEHeightConcatenateLayerKernel &operator=(NEHeightConcatenateLayerKernel &&) = default;
- /** Default destructor */
- ~NEHeightConcatenateLayerKernel() = default;
- /** Initialise the kernel's inputs and output
- *
- * @param[in] input Input tensor. Data types supported: All
- * @param[in] height_offset The starting offset on the Y axis for the output tensor.
- * @param[in,out] output Output tensor. Data types supported: Same as @p input.
- *
- */
- void configure(const ITensor *input, unsigned int height_offset, ITensor *output);
- /** Static function to check if given info will lead to a valid configuration of @ref NEHeightConcatenateLayerKernel
- *
- * @param[in] input Input tensor info. Data types supported: All
- * @param[in] height_offset The starting offset on the Y axis for the output tensor.
- * @param[in] output Output tensor info. Data types supported: Same as @p input.
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *input, unsigned int height_offset, const ITensorInfo *output);
-
- // Inherited methods overridden:
- void run(const Window &window, const ThreadInfo &info) override;
-
-private:
- const ITensor *_input;
- ITensor *_output;
- unsigned int _height_offset;
-};
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_NEHEIGHTCONCATENATELAYERKERNEL_H */
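For a packed 2-D buffer the height concatenation above degenerates to a row-offset copy; the kernel generalises this to strided tensors of any data type. A sketch under that packed-layout assumption:

    #include <cstddef>
    #include <cstdint>
    #include <cstring>

    // Copy in_h rows of row_bytes each into out, starting at row height_offset.
    void concat_height(const uint8_t *in, size_t in_h, size_t row_bytes,
                       uint8_t *out, size_t height_offset)
    {
        std::memcpy(out + height_offset * row_bytes, in, in_h * row_bytes);
    }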
diff --git a/arm_compute/core/NEON/kernels/NEHistogramKernel.h b/arm_compute/core/NEON/kernels/NEHistogramKernel.h
deleted file mode 100644
index b1dd105676..0000000000
--- a/arm_compute/core/NEON/kernels/NEHistogramKernel.h
+++ /dev/null
@@ -1,135 +0,0 @@
-/*
- * Copyright (c) 2016-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_NEHISTOGRAMKERNEL_H
-#define ARM_COMPUTE_NEHISTOGRAMKERNEL_H
-
-#include "arm_compute/core/NEON/INEKernel.h"
-#include "support/Mutex.h"
-
-#include <cstddef>
-#include <cstdint>
-
-namespace arm_compute
-{
-class IDistribution1D;
-class ITensor;
-using IImage = ITensor;
-
-/** Interface for the histogram kernel */
-class NEHistogramKernel : public INEKernel
-{
-public:
- const char *name() const override
- {
- return "NEHistogramKernel";
- }
- /** Default constructor */
- NEHistogramKernel();
- /** Default destructor */
- ~NEHistogramKernel() = default;
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- NEHistogramKernel(const NEHistogramKernel &) = delete;
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- NEHistogramKernel &operator=(const NEHistogramKernel &) = delete;
- /** Prevent instances of this class from being moved (As this class contains non movable objects) */
- NEHistogramKernel(NEHistogramKernel &&) = delete;
- /** Prevent instances of this class from being moved (As this class contains non movable objects) */
- NEHistogramKernel &operator=(NEHistogramKernel &&) = delete;
-
- /** Set the input image and the distribution output.
- *
- * @param[in] input Source image. Data type supported: U8.
- * @param[out] output Destination distribution.
- * @param[in,out] local_hist Array that the threads use to save their local histograms.
- * Its size should be equal to (number_of_threads * num_bins),
- * and the Window::thread_id() is used to determine the part of the array
- * used by each thread.
- * @param[out] window_lut LUT with pre-calculated possible window values.
- * The size of the LUT should be equal to max_range_size and it will be filled
- * during the configure stage, is re-used in every run, and can therefore be
- * safely shared among threads.
- */
- void configure(const IImage *input, IDistribution1D *output, uint32_t *local_hist, uint32_t *window_lut);
- /** Set the input image and the distribution output.
- *
- * @note Used for histogram of fixed size equal to 256
- *
- * @param[in] input Source image. Data type supported: U8.
- * @param[out] output Destination distribution, which must have 256 bins.
- */
- void configure(const IImage *input, IDistribution1D *output);
-
- // Inherited methods overridden:
- void run(const Window &window, const ThreadInfo &info) override;
-
-private:
- /** Function to merge multiple partial histograms.
- *
- * @param[out] global_hist Pointer to the final histogram.
- * @param[in] local_hist Pointer to the partial histograms.
- * @param[in] bins Number of bins.
- */
- void merge_histogram(uint32_t *global_hist, const uint32_t *local_hist, size_t bins);
- /** Function to merge multiple minimum values of partial histograms.
- *
- * @param[out] global_min Pointer to the global min value.
- * @param[in] local_min Local min value.
- */
- void merge_min(uint8_t *global_min, const uint8_t &local_min);
- /** Function to perform histogram on the given window
- *
- * @param[in] win Region on which to execute the kernel
- * @param[in] info Info about the executing thread
- */
- void histogram_U8(Window win, const ThreadInfo &info);
- /** Function to perform histogram on the given window where histogram is
- * of fixed size 256 without ranges and offsets.
- *
- * @param[in] win Region on which to execute the kernel
- * @param[in] info Info about the executing thread
- */
- void histogram_fixed_U8(Window win, const ThreadInfo &info);
- /** Pre-calculate the pixel windowing for every possible pixel
- *
- * Calculate (V - offset) * numBins / range where V is every possible pixel value.
- *
- * @note We currently support U8 images, thus possible pixel values are between 0 and 255
- */
- void calculate_window_lut() const;
- /** Common signature for all the specialised Histogram functions
- *
- * @param[in] window Region on which to execute the kernel.
- */
- using HistogramFunctionPtr = void (NEHistogramKernel::*)(Window window, const ThreadInfo &info);
-
- HistogramFunctionPtr _func; ///< Histogram function to use for the particular image types passed to configure()
- const IImage *_input;
- IDistribution1D *_output;
- uint32_t *_local_hist;
- uint32_t *_window_lut;
- arm_compute::Mutex _hist_mtx;
- static constexpr unsigned int _max_range_size{ 256 }; ///< 256 possible pixel values as we handle only U8 images
-};
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_NEHISTOGRAMKERNEL_H */
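Two of the documented pieces are easy to model: the window LUT that precomputes (V - offset) * num_bins / range for every possible U8 value, and the mutex-guarded merge of per-thread partial histograms. A sketch, with clamping of out-of-range values as an assumption:

    #include <cstddef>
    #include <cstdint>

    // Precompute the bin index for each of the 256 possible U8 pixel values.
    void build_window_lut(uint32_t *lut, uint32_t num_bins, uint32_t offset, uint32_t range)
    {
        for(uint32_t v = 0; v < 256; ++v)
        {
            const uint32_t bin = (v < offset) ? 0 : ((v - offset) * num_bins) / range;
            lut[v] = (bin < num_bins) ? bin : num_bins - 1; // clamp (assumption)
        }
    }

    // Merge one thread's partial histogram into the global one.
    void merge_histogram(uint32_t *global_hist, const uint32_t *local_hist, size_t bins)
    {
        for(size_t b = 0; b < bins; ++b)
            global_hist[b] += local_hist[b];
    }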
diff --git a/arm_compute/core/NEON/kernels/NEIm2ColKernel.h b/arm_compute/core/NEON/kernels/NEIm2ColKernel.h
deleted file mode 100644
index 1c358b379d..0000000000
--- a/arm_compute/core/NEON/kernels/NEIm2ColKernel.h
+++ /dev/null
@@ -1,139 +0,0 @@
-/*
- * Copyright (c) 2017-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_NEIM2COLKERNEL_H
-#define ARM_COMPUTE_NEIM2COLKERNEL_H
-
-#include "arm_compute/core/NEON/INEKernel.h"
-
-namespace arm_compute
-{
-class ITensor;
-class Size2D;
-
-/** Interface for the im2col reshape kernel.
- *
- * Rearranges image blocks into columns. It is used to strip out each convolution block to a single column.
- * It is used to transform a convolution to a plain matrix multiplication.
- *
- * For example taking into account the image below and assuming 3x3 image blocks with stride of 1 we have:
- *
- * @f[
- * \left( \begin{array}{cccc}
- * a00 & a01 & a02 & a03 \\
- * a10 & a11 & a12 & a13 \\
- * a20 & a21 & a22 & a23 \\
- * a30 & a31 & a32 & a33 \\
- * \end{array} \right)
- * \rightarrow
- * \left( \begin{array}{ccccccccc}
- * a00 & a01 & a02 & a10 & a11 & a12 & a20 & a21 & a22 \\
- * a01 & a02 & a03 & a11 & a12 & a13 & a21 & a22 & a23 \\
- * a10 & a11 & a12 & a20 & a21 & a22 & a30 & a31 & a32 \\
- * a11 & a12 & a13 & a21 & a22 & a23 & a31 & a32 & a33 \\
- * \end{array} \right)
- * @f]
- */
-class NEIm2ColKernel : public INEKernel
-{
-public:
- const char *name() const override
- {
- return "NEIm2ColKernel";
- }
- /** Default constructor */
- NEIm2ColKernel();
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- NEIm2ColKernel(const NEIm2ColKernel &) = delete;
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- NEIm2ColKernel &operator=(const NEIm2ColKernel &) = delete;
- /** Allow instances of this class to be moved */
- NEIm2ColKernel(NEIm2ColKernel &&) = default;
- /** Allow instances of this class to be moved */
- NEIm2ColKernel &operator=(NEIm2ColKernel &&) = default;
- /** Default destructor */
- ~NEIm2ColKernel() = default;
-
- /** Set the input and output of the kernel.
- *
- * @param[in] input The input tensor to convert. 3 lower dimensions represent a single input [width, height, IFM],
- * while every optional dimension from 4 and above represent a batch of inputs.
- * Data types supported: QASYMM8/QASYMM8_SIGNED/BFLOAT16/F16/F32
- * Note: QASYMM8 works only for has_bias = false
- * @param[out] output The output tensor. Data types supported: Same as @p input
- * @param[in] kernel_dims The kernel dimensions (width and height).
- * @param[in] conv_info Contains padding and stride information described in @ref PadStrideInfo.
- * @param[in] has_bias If biases are provided, the matrix is expanded with 1s.
- * @param[in] dilation (Optional) Dilation, in elements, across x and y. Defaults to (1, 1).
- * @param[in] num_groups (Optional) Number of groups when performing a grouped convolution. num_groups != 1 is not supported
- */
- void configure(const ITensor *input, ITensor *output, const Size2D &kernel_dims, const PadStrideInfo &conv_info,
- bool has_bias, const Size2D &dilation = Size2D(1U, 1U), unsigned int num_groups = 1);
- /** Static function to check if given info will lead to a valid configuration of @ref NEIm2ColKernel
- *
- * @param[in] input The input tensor to convert. 3 lower dimensions represent a single input [width, height, IFM],
- * while every optional dimension from 4 and above represent a batch of inputs.
- * Data types supported: QASYMM8/QASYMM8_SIGNED/BFLOAT16/F16/F32
- * Note: QASYMM8 works only for has_bias = false
- * @param[in] output The output tensor. Data types supported: Same as @p input
- * @param[in] kernel_dims The kernel dimensions (width and height).
- * @param[in] conv_info Contains padding and stride information described in @ref PadStrideInfo.
- * @param[in] has_bias If biases are provided, the matrix is expanded with 1s.
- * @param[in] dilation (Optional) Dilation, in elements, across x and y. Defaults to (1, 1).
- * @param[in] num_groups (Optional) Number of groups when performing a grouped convolution. num_groups != 1 is not supported
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *input, const ITensorInfo *output, const Size2D &kernel_dims, const PadStrideInfo &conv_info,
- bool has_bias, const Size2D &dilation = Size2D(1U, 1U), unsigned int num_groups = 1);
-
- // Inherited methods overridden:
- void run(const Window &window, const ThreadInfo &info) override;
-
-private:
- /** Template function to run im2col
- *
- * @param[in] window Region on which to execute the kernel. (Must be a valid region of the window returned by window()).
- */
- template <typename T, bool has_pads, bool is_nchw>
- void run_im2col(const Window &window);
-
- /** Common signature for all the specialised im2col functions
- *
- * @param[in] window Region on which to execute the kernel.
- */
- using Im2ColFunctionPtr = void (NEIm2ColKernel::*)(const Window &window);
-
- Im2ColFunctionPtr _func;
- const ITensor *_input;
- ITensor *_output;
- std::pair<unsigned int, unsigned int> _convolved_dims;
- PadStrideInfo _conv_info;
- unsigned int _kernel_width;
- unsigned int _kernel_height;
- bool _has_bias;
- Size2D _dilation;
- DataLayout _data_layout;
-};
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_NEIM2COLKERNEL_H */
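A toy version of the transform illustrated above: 3x3 blocks, stride 1, no padding, single channel. Each output row is one flattened patch; for the 4x4 example this produces the 4x9 matrix shown (the real kernel also handles dilation, batching, NHWC, and the optional trailing 1 when has_bias is set):

    #include <cstddef>
    #include <vector>

    // im2col for a single-channel image with a 3x3 kernel, stride 1, no padding.
    std::vector<float> im2col_3x3(const std::vector<float> &img, size_t w, size_t h)
    {
        const size_t out_w = w - 2, out_h = h - 2;
        std::vector<float> cols;
        cols.reserve(out_w * out_h * 9);
        for(size_t y = 0; y < out_h; ++y)          // one output row per patch
            for(size_t x = 0; x < out_w; ++x)
                for(size_t ky = 0; ky < 3; ++ky)   // flatten the 3x3 block
                    for(size_t kx = 0; kx < 3; ++kx)
                        cols.push_back(img[(y + ky) * w + (x + kx)]);
        return cols;
    }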
diff --git a/arm_compute/core/NEON/kernels/NEInstanceNormalizationLayerKernel.h b/arm_compute/core/NEON/kernels/NEInstanceNormalizationLayerKernel.h
deleted file mode 100644
index 7c14e409c6..0000000000
--- a/arm_compute/core/NEON/kernels/NEInstanceNormalizationLayerKernel.h
+++ /dev/null
@@ -1,98 +0,0 @@
-/*
- * Copyright (c) 2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_NEINSTANCENORMALIZATIONLAYERKERNEL_H
-#define ARM_COMPUTE_NEINSTANCENORMALIZATIONLAYERKERNEL_H
-
-#include "arm_compute/core/NEON/INEKernel.h"
-
-namespace arm_compute
-{
-class ITensor;
-
-/** Interface for performing an instance normalization */
-class NEInstanceNormalizationLayerKernel : public INEKernel
-{
-public:
- const char *name() const override
- {
- return "NEInstanceNormalizationLayerKernel";
- }
- /** Default constructor */
- NEInstanceNormalizationLayerKernel();
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- NEInstanceNormalizationLayerKernel(const NEInstanceNormalizationLayerKernel &) = delete;
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- NEInstanceNormalizationLayerKernel &operator=(const NEInstanceNormalizationLayerKernel &) = delete;
- /** Allow instances of this class to be moved */
- NEInstanceNormalizationLayerKernel(NEInstanceNormalizationLayerKernel &&) = default;
- /** Allow instances of this class to be moved */
- NEInstanceNormalizationLayerKernel &operator=(NEInstanceNormalizationLayerKernel &&) = default;
- /** Default destructor */
- ~NEInstanceNormalizationLayerKernel() = default;
- /** Set the input and output tensors.
- *
- * @param[in, out] input Source tensor. Data types supported: F16/F32. Data layout supported: NCHW
- * If @p output is nullptr, this tensor will store the result of the normalization.
- * @param[out] output Destination tensor. Data types and data layouts supported: same as @p input.
- * @param[in] gamma (Optional) The scale scalar value applied to the normalized tensor. Defaults to 1.0
- * @param[in] beta (Optional) The offset scalar value applied to the normalized tensor. Defaults to 0.0
- * @param[in] epsilon (Optional) Lower bound value for the normalization. Defaults to 1e-12
- */
- void configure(ITensor *input, ITensor *output, float gamma = 1.0f, float beta = 0.0f, float epsilon = 1e-12f);
-
- /** Static function to check if given info will lead to a valid configuration of @ref NEInstanceNormalizationLayer.
- *
- * @param[in] input Source tensor info. Data types supported: F16/F32. Data layout supported: NCHW
- * @param[in] output Destination tensor info. Data types and data layouts supported: same as @p input.
- * @param[in] gamma (Optional) The scale scalar value applied to the normalized tensor. Defaults to 1.0
- * @param[in] beta (Optional) The offset scalar value applied to the normalized tensor. Defaults to 0.0
- * @param[in] epsilon (Optional) Lower bound value for the normalization. Defaults to 1e-12
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *input, const ITensorInfo *output, float gamma = 1.0f, float beta = 0.0f, float epsilon = 1e-12f);
-
- // Inherited methods overridden:
- void run(const Window &window, const ThreadInfo &info) override;
-
-private:
- /** Common signature for all the specialized instance normalization functions
- *
- * @param[in, out] input An input tensor. If @p output is nullptr, this tensor will store the result of the normalization.
- * @param[out] output The output tensor.
- * @param[in] gamma The scale scalar value applied to the normalized tensor.
- * @param[in] beta The offset scalar value applied to the normalized tensor.
- * @param[in] epsilon Lower bound value for the normalization.
- */
- using NormalizationFunction = void(ITensor *input, ITensor *output, float gamma, float beta, float epsilon, const Window &window);
-
- NormalizationFunction *_func;
- ITensor *_input;
- ITensor *_output;
- float _gamma;
- float _beta;
- float _epsilon;
-};
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_NEINSTANCENORMALIZATIONLAYERKERNEL_H */
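The operation behind the kernel above, reduced to a single H*W plane: normalise the plane by its own mean and variance, then apply the gamma scale and beta offset; epsilon is the documented lower bound inside the square root. A minimal sketch:

    #include <cmath>
    #include <cstddef>

    // In-place instance normalization of one feature-map plane of n elements.
    void instance_norm_plane(float *data, size_t n, float gamma, float beta, float epsilon)
    {
        float mean = 0.f;
        for(size_t i = 0; i < n; ++i)
            mean += data[i];
        mean /= static_cast<float>(n);

        float var = 0.f;
        for(size_t i = 0; i < n; ++i)
            var += (data[i] - mean) * (data[i] - mean);
        var /= static_cast<float>(n);

        const float inv_stddev = 1.f / std::sqrt(var + epsilon);
        for(size_t i = 0; i < n; ++i)
            data[i] = gamma * (data[i] - mean) * inv_stddev + beta;
    }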
diff --git a/arm_compute/core/NEON/kernels/NEIntegralImageKernel.h b/arm_compute/core/NEON/kernels/NEIntegralImageKernel.h
deleted file mode 100644
index 77ae7b9efa..0000000000
--- a/arm_compute/core/NEON/kernels/NEIntegralImageKernel.h
+++ /dev/null
@@ -1,54 +0,0 @@
-/*
- * Copyright (c) 2016-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_NEINTEGRALIMAGEKERNEL_H
-#define ARM_COMPUTE_NEINTEGRALIMAGEKERNEL_H
-
-#include "arm_compute/core/NEON/INESimpleKernel.h"
-
-namespace arm_compute
-{
-class ITensor;
-
-/** Kernel to perform an image integral on an image */
-class NEIntegralImageKernel : public INESimpleKernel
-{
-public:
- const char *name() const override
- {
- return "NEIntegralImageKernel";
- }
- /** Set the source, destination and border mode of the kernel
- *
- * @param[in] input Source tensor. Data type supported: U8
- * @param[out] output Destination tensor. Data type supported: U32
- */
- void configure(const ITensor *input, ITensor *output);
-
- // Inherited methods overridden:
- void run(const Window &window, const ThreadInfo &info) override;
- BorderSize border_size() const override;
- bool is_parallelisable() const override;
-};
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_NEINTEGRALIMAGEKERNEL_H */
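A reference implementation of the transform above: each U32 output element is the sum of all U8 input pixels above and to the left of it (inclusive), via the usual recurrence. The row-to-row dependency is visible in the recurrence, which is presumably why the kernel overrides is_parallelisable():

    #include <cstddef>
    #include <cstdint>

    // ii(x,y) = in(x,y) + ii(x-1,y) + ii(x,y-1) - ii(x-1,y-1)
    void integral_image(const uint8_t *in, uint32_t *out, size_t w, size_t h)
    {
        for(size_t y = 0; y < h; ++y)
            for(size_t x = 0; x < w; ++x)
            {
                const uint32_t left    = (x > 0) ? out[y * w + x - 1] : 0;
                const uint32_t up      = (y > 0) ? out[(y - 1) * w + x] : 0;
                const uint32_t up_left = (x > 0 && y > 0) ? out[(y - 1) * w + x - 1] : 0;
                out[y * w + x] = in[y * w + x] + left + up - up_left;
            }
    }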
diff --git a/arm_compute/core/NEON/kernels/NEL2NormalizeLayerKernel.h b/arm_compute/core/NEON/kernels/NEL2NormalizeLayerKernel.h
deleted file mode 100644
index 3937bf0163..0000000000
--- a/arm_compute/core/NEON/kernels/NEL2NormalizeLayerKernel.h
+++ /dev/null
@@ -1,90 +0,0 @@
-/*
- * Copyright (c) 2017-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_NEL2NORMALIZELAYERKERNEL_H
-#define ARM_COMPUTE_NEL2NORMALIZELAYERKERNEL_H
-
-#include "arm_compute/core/NEON/INEKernel.h"
-
-namespace arm_compute
-{
-class ITensor;
-
-/** Interface for performing a L2 normalize on a given axis given the square sum of it in this axis */
-class NEL2NormalizeLayerKernel : public INEKernel
-{
-public:
- const char *name() const override
- {
- return "NEL2NormalizeLayerKernel";
- }
- /** Default constructor */
- NEL2NormalizeLayerKernel();
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- NEL2NormalizeLayerKernel(const NEL2NormalizeLayerKernel &) = delete;
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- NEL2NormalizeLayerKernel &operator=(const NEL2NormalizeLayerKernel &) = delete;
- /** Allow instances of this class to be moved */
- NEL2NormalizeLayerKernel(NEL2NormalizeLayerKernel &&) = default;
- /** Allow instances of this class to be moved */
- NEL2NormalizeLayerKernel &operator=(NEL2NormalizeLayerKernel &&) = default;
- /** Default destructor */
- ~NEL2NormalizeLayerKernel() = default;
- /** Set the input and output tensors.
- *
- * @param[in] input Source tensor. Data types supported: F16/F32.
- * @param[in] sum Sum values tensor. Data types supported: same as @p input.
- * Sum will have the same number of dimensions as input.
- * @param[out] output Destination tensor. Data types and data layouts supported: same as @p input.
- * Output will have the same number of dimensions as input.
- * @param[in] axis Axis along which to reduce. Negative values wrap around. Maximum supported actual reduction axis : 2
- * @param[in] epsilon Lower bound value for the normalization.
- */
- void configure(const ITensor *input, const ITensor *sum, ITensor *output, int axis, float epsilon);
-
- /** Static function to check if given info will lead to a valid configuration of @ref NEL2NormalizeLayerKernel.
- *
- * @param[in] input Source tensor info. Data types supported: F16/F32.
- * @param[in] sum Sum values tensor info. Data types supported: same as @p input.
- * Sum will have the same number of dimensions as input.
- * @param[in] output Destination tensor info. Data types and data layouts supported: same as @p input.
- * Output will have the same number of dimensions as input.
- * @param[in] axis Axis along which to reduce. Negative values wrap around. Maximum supported actual reduction axis : 2
- * @param[in] epsilon Lower bound value for the normalization.
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *input, const ITensorInfo *sum, const ITensorInfo *output, int axis, float epsilon);
-
- // Inherited methods overridden:
- void run(const Window &window, const ThreadInfo &info) override;
-
-private:
- const ITensor *_input;
- const ITensor *_sum;
- ITensor *_output;
- unsigned int _actual_axis;
- float _epsilon;
-};
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_NEL2NORMALIZELAYERKERNEL_H */
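Given the precomputed sum-of-squares tensor the kernel consumes, the per-row operation is a single scale: divide each element by sqrt(max(sum, epsilon)), with epsilon as the documented lower bound. A sketch (the exact placement of the max with epsilon is an assumption):

    #include <algorithm>
    #include <cmath>
    #include <cstddef>

    // L2-normalize one row of n elements using its precomputed sum of squares.
    void l2_normalize_row(const float *in, float *out, size_t n, float sum_sq, float epsilon)
    {
        const float inv_norm = 1.f / std::sqrt(std::max(sum_sq, epsilon));
        for(size_t i = 0; i < n; ++i)
            out[i] = in[i] * inv_norm;
    }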
diff --git a/arm_compute/core/NEON/kernels/NELKTrackerKernel.h b/arm_compute/core/NEON/kernels/NELKTrackerKernel.h
deleted file mode 100644
index cf99bbe691..0000000000
--- a/arm_compute/core/NEON/kernels/NELKTrackerKernel.h
+++ /dev/null
@@ -1,149 +0,0 @@
-/*
- * Copyright (c) 2016-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_LKTRACKERKERNEL_H
-#define ARM_COMPUTE_LKTRACKERKERNEL_H
-
-#include "arm_compute/core/IArray.h"
-#include "arm_compute/core/NEON/INEKernel.h"
-#include "arm_compute/core/Types.h"
-
-#include <cstddef>
-#include <cstdint>
-#include <tuple>
-#include <utility>
-
-namespace arm_compute
-{
-class ITensor;
-
-/** Internal keypoint class for Lucas-Kanade Optical Flow */
-struct NELKInternalKeypoint
-{
- float x{ 0.f }; /**< x coordinate of the keypoint */
- float y{ 0.f }; /**< y coordinate of the keypoint */
- bool tracking_status{ false }; /**< the tracking status of the keypoint */
-};
-
-/** Interface for NEON Array of Internal Key Points. */
-using INELKInternalKeypointArray = IArray<NELKInternalKeypoint>;
-
-/** Interface for the Lucas-Kanade tracker kernel */
-class NELKTrackerKernel : public INEKernel
-{
-public:
- const char *name() const override
- {
- return "NELKTrackerKernel";
- }
- /** Default constructor */
- NELKTrackerKernel();
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- NELKTrackerKernel(const NELKTrackerKernel &) = delete;
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- NELKTrackerKernel &operator=(const NELKTrackerKernel &) = delete;
- /** Allow instances of this class to be moved */
- NELKTrackerKernel(NELKTrackerKernel &&) = default;
- /** Allow instances of this class to be moved */
- NELKTrackerKernel &operator=(NELKTrackerKernel &&) = default;
- /** Default destructor */
- ~NELKTrackerKernel() = default;
-
- /** Initialise the kernel input and output
- *
- * @param[in] input_old Pointer to the input old tensor. Data type supported: U8
- * @param[in] input_new Pointer to the input new tensor. Data type supported: U8
- * @param[in] old_scharr_gx Pointer to the input scharr X tensor. Data type supported: S16
- * @param[in] old_scharr_gy Pointer to the input scharr Y tensor. Data type supported: S16
- * @param[in] old_points Pointer to the IKeyPointArray storing old key points
- * @param[in] new_points_estimates Pointer to the IKeyPointArray storing new estimates key points
- * @param[out] new_points Pointer to the IKeyPointArray storing new key points
- * @param[in, out] old_points_internal Pointer to the array of NELKInternalKeypoint for old points
- * @param[out] new_points_internal Pointer to the array of NELKInternalKeypoint for new points
- * @param[in] termination The criterion used to terminate the search for each keypoint.
- * @param[in] use_initial_estimate The flag to indicate whether the initial estimated position should be used
- * @param[in] epsilon The error for terminating the algorithm
- * @param[in] num_iterations The maximum number of iterations before terminating the algorithm
- * @param[in] window_dimension The size of the window on which to perform the algorithm
- * @param[in] level The pyramid level
- * @param[in] num_levels The number of pyramid levels
- * @param[in] pyramid_scale Scale factor used for generating the pyramid
- */
- void configure(const ITensor *input_old, const ITensor *input_new, const ITensor *old_scharr_gx, const ITensor *old_scharr_gy,
- const IKeyPointArray *old_points, const IKeyPointArray *new_points_estimates, IKeyPointArray *new_points,
- INELKInternalKeypointArray *old_points_internal, INELKInternalKeypointArray *new_points_internal,
- Termination termination, bool use_initial_estimate, float epsilon, unsigned int num_iterations, size_t window_dimension,
- size_t level, size_t num_levels, float pyramid_scale);
-
- // Inherited methods overridden:
- void run(const Window &window, const ThreadInfo &info) override;
- BorderSize border_size() const override;
-
-private:
- /** Initialise the array of keypoints in the provided range
- *
- * @param[in] start Index of first element in the keypoints array to be initialised
- * @param[in] end Index after the last element in the keypoints array to be initialised
- */
- void init_keypoints(int start, int end);
- /** Compute the structure tensor A^T * A based on the Scharr gradients I_x and I_y
- *
- * @param[in] keypoint Keypoint for which gradients are computed
- * @param[out] bilinear_ix Intermediate interpolated data for X gradient
- * @param[out] bilinear_iy Intermediate interpolated data for Y gradient
- *
- * @return Values A11, A12, A22
- */
- std::tuple<int, int, int> compute_spatial_gradient_matrix(const NELKInternalKeypoint &keypoint, int32_t *bilinear_ix, int32_t *bilinear_iy);
- /** Compute the vector A^T * b, i.e. -sum(I_d * I_t) for d in {x,y}
- *
- * @param[in] old_keypoint Old keypoint for which gradient is computed
- * @param[in] new_keypoint New keypoint for which gradient is computed
- * @param[in] bilinear_ix Intermediate interpolated data for X gradient
- * @param[in] bilinear_iy Intermediate interpolated data for Y gradient
- *
- * @return Values b1, b2
- */
- std::pair<int, int> compute_image_mismatch_vector(const NELKInternalKeypoint &old_keypoint, const NELKInternalKeypoint &new_keypoint, const int32_t *bilinear_ix, const int32_t *bilinear_iy);
-
- const ITensor *_input_old;
- const ITensor *_input_new;
- const ITensor *_old_scharr_gx;
- const ITensor *_old_scharr_gy;
- IKeyPointArray *_new_points;
- const IKeyPointArray *_new_points_estimates;
- const IKeyPointArray *_old_points;
- INELKInternalKeypointArray *_old_points_internal;
- INELKInternalKeypointArray *_new_points_internal;
- Termination _termination;
- bool _use_initial_estimate;
- float _pyramid_scale;
- float _epsilon;
- unsigned int _num_iterations;
- int _window_dimension;
- unsigned int _level;
- unsigned int _num_levels;
- ValidRegion _valid_region;
-};
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_NELKTRACKERKERNEL_H */
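
For orientation, a minimal sketch of how this kernel's configure() was driven for one pyramid level, assuming the runtime-side Tensor and Array facilities that shipped alongside these headers. The helper name, sizes and termination settings are illustrative; in practice NEOpticalFlow prepared the pyramid images, Scharr gradients and keypoint arrays:

#include "arm_compute/core/NEON/kernels/NELKTrackerKernel.h"
#include "arm_compute/runtime/Array.h"
#include "arm_compute/runtime/Tensor.h"

using namespace arm_compute;

// Hypothetical helper: configure tracking for level 0 of a 3-level pyramid.
void configure_level0(const Tensor &old_img, const Tensor &new_img,
                      const Tensor &scharr_gx, const Tensor &scharr_gy,
                      KeyPointArray &old_pts, KeyPointArray &estimates, KeyPointArray &new_pts)
{
    // Internal scratch arrays, one slot per tracked keypoint
    Array<NELKInternalKeypoint> old_internal(old_pts.max_num_values());
    Array<NELKInternalKeypoint> new_internal(old_pts.max_num_values());

    NELKTrackerKernel kernel;
    kernel.configure(&old_img, &new_img, &scharr_gx, &scharr_gy,
                     &old_pts, &estimates, &new_pts, &old_internal, &new_internal,
                     Termination::TERM_CRITERIA_BOTH, true /* use_initial_estimate */,
                     0.01f /* epsilon */, 20 /* num_iterations */,
                     21 /* window_dimension */, 0 /* level */, 3 /* num_levels */,
                     0.5f /* pyramid_scale */);
}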
diff --git a/arm_compute/core/NEON/kernels/NELocallyConnectedMatrixMultiplyKernel.h b/arm_compute/core/NEON/kernels/NELocallyConnectedMatrixMultiplyKernel.h
deleted file mode 100644
index ad2a161296..0000000000
--- a/arm_compute/core/NEON/kernels/NELocallyConnectedMatrixMultiplyKernel.h
+++ /dev/null
@@ -1,77 +0,0 @@
-/*
- * Copyright (c) 2017-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_NELOCALLYCONNECTEDMATRIXMULTIPLYKERNEL_H
-#define ARM_COMPUTE_NELOCALLYCONNECTEDMATRIXMULTIPLYKERNEL_H
-
-#include "arm_compute/core/NEON/INEKernel.h"
-
-namespace arm_compute
-{
-class ITensor;
-
-/** NEON kernel to multiply each row of the first tensor with the lowest 2 dimensions of the second tensor. */
-class NELocallyConnectedMatrixMultiplyKernel : public INEKernel
-{
-public:
- const char *name() const override
- {
- return "NELocallyConnectedMatrixMultiplyKernel";
- }
- /** Default constructor */
- NELocallyConnectedMatrixMultiplyKernel();
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- NELocallyConnectedMatrixMultiplyKernel(const NELocallyConnectedMatrixMultiplyKernel &) = delete;
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- NELocallyConnectedMatrixMultiplyKernel &operator=(const NELocallyConnectedMatrixMultiplyKernel &) = delete;
- /** Allow instances of this class to be moved */
- NELocallyConnectedMatrixMultiplyKernel(NELocallyConnectedMatrixMultiplyKernel &&) = default;
- /** Allow instances of this class to be moved */
- NELocallyConnectedMatrixMultiplyKernel &operator=(NELocallyConnectedMatrixMultiplyKernel &&) = default;
- /** Initialise the kernel's input and output
- *
- * @param[in] input0 First input tensor. Data types supported: F16, F32
- * @param[in] input1 Second input tensor containing the Matrix B. Data type supported: same as @p input0
- * @param[out] output Output tensor to store the result of matrix multiplication. Data type supported: same as @p input0
- */
- void configure(const ITensor *input0, const ITensor *input1, ITensor *output);
- /** Static function to check if given info will lead to a valid configuration of @ref NELocallyConnectedMatrixMultiplyKernel
- *
- * @param[in] input0 First input tensor info. Data types supported: F16, F32
- * @param[in] input1 Second input tensor info. Data type supported: same as @p input0
- * @param[in] output Output tensor info. Data type supported: same as @p input0
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *input0, const ITensorInfo *input1, const ITensorInfo *output);
-
- // Inherited methods overridden:
- void run(const Window &window, const ThreadInfo &info) override;
-
-private:
- const ITensor *_input0;
- const ITensor *_input1;
- ITensor *_output;
-};
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_NELOCALLYCONNECTEDMATRIXMULTIPLYKERNEL_H */
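
The validate()-then-configure() pairing above is the pattern these kernels follow; a minimal sketch of it, assuming the runtime Tensor class and caller-initialised tensor infos (the helper name is illustrative):

#include "arm_compute/core/NEON/kernels/NELocallyConnectedMatrixMultiplyKernel.h"
#include "arm_compute/runtime/Tensor.h"

using namespace arm_compute;

// Hypothetical helper: check the configuration before committing to it.
bool setup(NELocallyConnectedMatrixMultiplyKernel &kernel,
           const Tensor &a, const Tensor &b, Tensor &out)
{
    const Status s = NELocallyConnectedMatrixMultiplyKernel::validate(a.info(), b.info(), out.info());
    if(s.error_code() != ErrorCode::OK)
    {
        return false; // report s.error_description() in real code
    }
    kernel.configure(&a, &b, &out);
    return true;
}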
diff --git a/arm_compute/core/NEON/kernels/NEMagnitudePhaseKernel.h b/arm_compute/core/NEON/kernels/NEMagnitudePhaseKernel.h
deleted file mode 100644
index 7ad5bf0d99..0000000000
--- a/arm_compute/core/NEON/kernels/NEMagnitudePhaseKernel.h
+++ /dev/null
@@ -1,101 +0,0 @@
-/*
- * Copyright (c) 2016-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_NEMAGNITUDEPHASEKERNEL_H
-#define ARM_COMPUTE_NEMAGNITUDEPHASEKERNEL_H
-
-#include "arm_compute/core/NEON/INEKernel.h"
-#include "arm_compute/core/Types.h"
-
-namespace arm_compute
-{
-class ITensor;
-
-/** Template interface for the kernel to compute magnitude and phase */
-template <MagnitudeType mag_type, PhaseType phase_type>
-class NEMagnitudePhaseKernel : public INEKernel
-{
-public:
- const char *name() const override
- {
- return "NEMagnitudePhaseKernel";
- }
- /** Default constructor */
- NEMagnitudePhaseKernel();
- /** Destructor */
- ~NEMagnitudePhaseKernel() = default;
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- NEMagnitudePhaseKernel(const NEMagnitudePhaseKernel &) = delete;
- /** Default move constructor */
- NEMagnitudePhaseKernel(NEMagnitudePhaseKernel &&) = default;
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- NEMagnitudePhaseKernel &operator=(const NEMagnitudePhaseKernel &) = delete;
- /** Default move assignment operator */
- NEMagnitudePhaseKernel &operator=(NEMagnitudePhaseKernel &&) = default;
-
- /** Initialise the kernel's inputs and outputs.
- *
- * @note At least one of magnitude or phase must be set
- *
- * @param[in] gx Gradient X tensor. Data type supported: S16.
- * @param[in] gy Gradient Y tensor. Data type supported: S16.
- * @param[out] magnitude (Optional) The output tensor - Magnitude. Data type supported: S16.
- * @param[out] phase (Optional) The output tensor - Phase. Data type supported: U8.
- */
- void configure(const ITensor *gx, const ITensor *gy, ITensor *magnitude, ITensor *phase);
-
- // Inherited methods overridden:
- void run(const Window &window, const ThreadInfo &info) override;
-
-private:
- /** Function to perform magnitude on the given window
- *
- * @param[in] window Region on which to execute the kernel
- */
- void magnitude(const Window &window);
- /** Function to perform phase on the given window
- *
- * @param[in] window Region on which to execute the kernel
- */
- void phase(const Window &window);
- /** Function to perform magnitude and phase on the given window
- *
- * @param[in] window Region on which to execute the kernel
- */
- void magnitude_phase(const Window &window);
-
-private:
- /** Common signature for all the specialised MagnitudePhase functions
- *
- * @param[in] window Region on which to execute the kernel.
- */
- using MagnitudePhaseFunctionPtr = void (NEMagnitudePhaseKernel::*)(const Window &window);
- /** MagnitudePhase function to use for the particular formats passed to configure() */
- MagnitudePhaseFunctionPtr _func;
- const ITensor *_gx; /**< Input gradient X */
- const ITensor *_gy; /**< Input gradient Y */
- ITensor *_magnitude; /**< Output - Magnitude */
- ITensor *_phase; /**< Output - Phase */
-};
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_NEMAGNITUDEPHASEKERNEL_H */
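
Because the class is templated on the norm and phase flavour, the specialisation is chosen at instantiation time. A sketch computing L2 magnitude and signed phase from S16 gradients, assuming the NEScheduler runtime entry point; the wrapper function is illustrative:

#include "arm_compute/core/NEON/kernels/NEMagnitudePhaseKernel.h"
#include "arm_compute/runtime/NEON/NEScheduler.h"
#include "arm_compute/runtime/Tensor.h"

using namespace arm_compute;

void magnitude_and_phase(const Tensor &gx, const Tensor &gy, Tensor &mag, Tensor &phase)
{
    NEMagnitudePhaseKernel<MagnitudeType::L2NORM, PhaseType::SIGNED> kernel;
    kernel.configure(&gx, &gy, &mag, &phase); // either output may be nullptr, but not both
    NEScheduler::get().schedule(&kernel, Window::DimY);
}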
diff --git a/arm_compute/core/NEON/kernels/NEMeanStdDevKernel.h b/arm_compute/core/NEON/kernels/NEMeanStdDevKernel.h
deleted file mode 100644
index 2197e3cfbe..0000000000
--- a/arm_compute/core/NEON/kernels/NEMeanStdDevKernel.h
+++ /dev/null
@@ -1,83 +0,0 @@
-/*
- * Copyright (c) 2016-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_NEMEANSTDDEVKERNEL_H
-#define ARM_COMPUTE_NEMEANSTDDEVKERNEL_H
-
-#include "arm_compute/core/NEON/INEKernel.h"
-#include "support/Mutex.h"
-
-#include <cstdint>
-
-namespace arm_compute
-{
-class ITensor;
-using IImage = ITensor;
-
-/** Interface for the kernel to calculate mean and standard deviation of input image pixels. */
-class NEMeanStdDevKernel : public INEKernel
-{
-public:
- const char *name() const override
- {
- return "NEMeanStdDevKernel";
- }
- /** Default constructor */
- NEMeanStdDevKernel();
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- NEMeanStdDevKernel(const NEMeanStdDevKernel &) = delete;
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- NEMeanStdDevKernel &operator=(const NEMeanStdDevKernel &) = delete;
- /** Prevent instances of this class from being moved (As this class contains non movable objects) */
- NEMeanStdDevKernel(NEMeanStdDevKernel &&) = delete;
- /** Prevent instances of this class from being moved (As this class contains non movable objects) */
- NEMeanStdDevKernel &operator=(NEMeanStdDevKernel &&) = delete;
- /** Default destructor */
- ~NEMeanStdDevKernel() = default;
-
- /** Initialise the kernel's input and outputs.
- *
- * @param[in] input Input image. Data type supported: U8.
- * @param[out] mean Output average pixel value.
- * @param[out] global_sum Keeps global sum of pixel values.
- * @param[out] stddev (Optional) Output standard deviation of pixel values.
- * @param[out] global_sum_squared (Optional if stddev is not set, required if stddev is set) Keeps global sum of squared pixel values.
- */
- void configure(const IImage *input, float *mean, uint64_t *global_sum, float *stddev = nullptr, uint64_t *global_sum_squared = nullptr);
-
- // Inherited methods overridden:
- void run(const Window &window, const ThreadInfo &info) override;
-
- BorderSize border_size() const override;
-
-private:
- const IImage *_input;
- float *_mean;
- float *_stddev;
- uint64_t *_global_sum;
- uint64_t *_global_sum_squared;
- arm_compute::Mutex _mtx;
- BorderSize _border_size;
-};
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_NEMEANSTDDEVKERNEL_H */
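
The kernel reduces into caller-owned scalars, serialising the final accumulation through its mutex. A sketch (U8 image assumed; border fill is omitted for brevity, and the accumulators must be zeroed before each run):

#include "arm_compute/core/NEON/kernels/NEMeanStdDevKernel.h"
#include "arm_compute/runtime/NEON/NEScheduler.h"
#include "arm_compute/runtime/Tensor.h"

#include <cstdint>

using namespace arm_compute;

void mean_stddev(const Tensor &image, float &mean, float &stddev)
{
    uint64_t sum = 0, sum_sq = 0; // shared accumulators, zeroed before each run

    NEMeanStdDevKernel kernel;
    kernel.configure(&image, &mean, &sum, &stddev, &sum_sq);
    NEScheduler::get().schedule(&kernel, Window::DimY);
}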
diff --git a/arm_compute/core/NEON/kernels/NEMeanStdDevNormalizationKernel.h b/arm_compute/core/NEON/kernels/NEMeanStdDevNormalizationKernel.h
deleted file mode 100644
index dc0455cc4c..0000000000
--- a/arm_compute/core/NEON/kernels/NEMeanStdDevNormalizationKernel.h
+++ /dev/null
@@ -1,98 +0,0 @@
-/*
- * Copyright (c) 2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_NEMEANSTDDEVNORMALIZATIONKERNEL_H
-#define ARM_COMPUTE_NEMEANSTDDEVNORMALIZATIONKERNEL_H
-
-#include "arm_compute/core/NEON/INEKernel.h"
-
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-#include <arm_fp16.h>
-#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
-
-namespace arm_compute
-{
-class ITensor;
-
-/** Interface for the kernel to normalize the input 2D tensor across the first dimension with respect to mean and standard deviation of the same dimension. */
-class NEMeanStdDevNormalizationKernel : public INEKernel
-{
-public:
- const char *name() const override
- {
- return "NEMeanStdDevNormalizationKernel";
- }
- /** Default constructor */
- NEMeanStdDevNormalizationKernel();
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- NEMeanStdDevNormalizationKernel(const NEMeanStdDevNormalizationKernel &) = delete;
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- NEMeanStdDevNormalizationKernel &operator=(const NEMeanStdDevNormalizationKernel &) = delete;
- /** Allow instances of this class to be moved */
- NEMeanStdDevNormalizationKernel(NEMeanStdDevNormalizationKernel &&) = default;
- /** Allow instances of this class to be moved */
- NEMeanStdDevNormalizationKernel &operator=(NEMeanStdDevNormalizationKernel &&) = default;
- /** Default destructor */
- ~NEMeanStdDevNormalizationKernel() = default;
- /** Initialise the kernel's input and outputs.
- *
- * @note If the output tensor is a nullptr, the normalization will be performed in-place.
- *
- * @param[in, out] input Source tensor with 2 dimensions. If @p output is nullptr,
- * this tensor will store the result of the normalization. Data types supported: F16/F32.
- * @param[out] output (Optional) Destination tensor. It can be nullptr in case of in-place computation. Data type supported: same as @p input
- * @param[in] epsilon (Optional) Small float to avoid division by zero in case of zero standard deviation. Defaults to 1e-8.
- */
- void configure(ITensor *input, ITensor *output = nullptr, float epsilon = 1e-8f);
- /** Static function to check if given info will lead to a valid configuration of @ref NEMeanStdDevNormalizationKernel
- *
- * @param[in] input Source tensor info with 2 dimensions. If @p output is nullptr,
- * this tensor will store the result of the normalization. Data types supported: F16/F32.
- * @param[in] output (Optional) Destination tensor info. It can be nullptr in case of in-place computation. Data type supported: same as @p input
- * @param[in] epsilon (Optional) Small float to avoid division by zero in case of zero standard deviation. Defaults to 1e-8.
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *input, const ITensorInfo *output = nullptr, float epsilon = 1e-8f);
-
- // Inherited methods overridden:
- void run(const Window &window, const ThreadInfo &info) override;
-
-private:
- /** Normalizes the input with respect to mean and standard deviation.
- *
- * @param[in] window Region on which to execute the kernel.
- */
- template <typename ScalarType, int size>
- void mean_stddev_normalization(const Window &window);
-
- ITensor *_input;
- ITensor *_output;
- float _epsilon;
-
- using MeanStdDevNormFunction = void (NEMeanStdDevNormalizationKernel::*)(const Window &window);
-
- MeanStdDevNormFunction _func;
-};
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_NEMEANSTDDEVNORMALIZATIONKERNEL_H */
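
In-place operation is the simplest use: omit the output and each row of the 2D input is normalised where it sits. A minimal sketch, assuming the runtime Tensor and NEScheduler pieces:

#include "arm_compute/core/NEON/kernels/NEMeanStdDevNormalizationKernel.h"
#include "arm_compute/runtime/NEON/NEScheduler.h"
#include "arm_compute/runtime/Tensor.h"

using namespace arm_compute;

void normalise_rows_in_place(Tensor &t)
{
    if(NEMeanStdDevNormalizationKernel::validate(t.info()).error_code() != ErrorCode::OK)
    {
        return;
    }
    NEMeanStdDevNormalizationKernel kernel;
    kernel.configure(&t); // output defaults to nullptr (in-place), epsilon to 1e-8
    NEScheduler::get().schedule(&kernel, Window::DimY);
}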
diff --git a/arm_compute/core/NEON/kernels/NEMedian3x3Kernel.h b/arm_compute/core/NEON/kernels/NEMedian3x3Kernel.h
deleted file mode 100644
index 3e86860f79..0000000000
--- a/arm_compute/core/NEON/kernels/NEMedian3x3Kernel.h
+++ /dev/null
@@ -1,54 +0,0 @@
-/*
- * Copyright (c) 2016-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_NEMEDIAN3x3KERNEL_H
-#define ARM_COMPUTE_NEMEDIAN3x3KERNEL_H
-
-#include "arm_compute/core/NEON/INESimpleKernel.h"
-
-namespace arm_compute
-{
-class ITensor;
-
-/** Kernel to perform a median filter on a tensor */
-class NEMedian3x3Kernel : public INESimpleKernel
-{
-public:
- const char *name() const override
- {
- return "NEMedian3x3Kernel";
- }
- /** Set the source, destination and border mode of the kernel
- *
- * @param[in] input Source tensor. Data type supported: U8
- * @param[out] output Destination tensor. Data type supported: U8
- * @param[in] border_undefined True if the border mode is undefined. False if it's replicate or constant.
- */
- void configure(const ITensor *input, ITensor *output, bool border_undefined);
-
- // Inherited methods overridden:
- void run(const Window &window, const ThreadInfo &info) override;
- BorderSize border_size() const override;
-};
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_NEMEDIAN3x3KERNEL_H */
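
A sketch of the filter's use (U8 in and out; a real caller pairs this with a border-fill kernel when the border mode is replicate or constant rather than undefined):

#include "arm_compute/core/NEON/kernels/NEMedian3x3Kernel.h"
#include "arm_compute/runtime/NEON/NEScheduler.h"
#include "arm_compute/runtime/Tensor.h"

using namespace arm_compute;

void median3x3(const Tensor &src, Tensor &dst)
{
    NEMedian3x3Kernel kernel;
    kernel.configure(&src, &dst, true /* border_undefined */);
    NEScheduler::get().schedule(&kernel, Window::DimY);
}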
diff --git a/arm_compute/core/NEON/kernels/NEMinMaxLayerKernel.h b/arm_compute/core/NEON/kernels/NEMinMaxLayerKernel.h
deleted file mode 100644
index 445e12af03..0000000000
--- a/arm_compute/core/NEON/kernels/NEMinMaxLayerKernel.h
+++ /dev/null
@@ -1,90 +0,0 @@
-/*
- * Copyright (c) 2017-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#ifndef ARM_COMPUTE_NEMINMAXLAYERKERNEL_H
-#define ARM_COMPUTE_NEMINMAXLAYERKERNEL_H
-
-#include "arm_compute/core/NEON/INEKernel.h"
-#include "support/Mutex.h"
-
-#include <cstdint>
-
-namespace arm_compute
-{
-class ITensor;
-
-/** Interface for the kernel to perform min max search on a 3D tensor. */
-class NEMinMaxLayerKernel : public INEKernel
-{
-public:
- const char *name() const override
- {
- return "NEMinMaxLayerKernel";
- }
- /** Default constructor */
- NEMinMaxLayerKernel();
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- NEMinMaxLayerKernel(const NEMinMaxLayerKernel &) = delete;
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- NEMinMaxLayerKernel &operator=(const NEMinMaxLayerKernel &) = delete;
- /** Prevent instances of this class from being moved (As this class contains non movable objects) */
- NEMinMaxLayerKernel(NEMinMaxLayerKernel &&) = delete;
- /** Prevent instances of this class from being moved (As this class contains non movable objects) */
- NEMinMaxLayerKernel &operator=(NEMinMaxLayerKernel &&) = delete;
- /** Default destructor */
- ~NEMinMaxLayerKernel() = default;
-
- /** Initialise the kernel's input and outputs.
- *
- * @note output[0] = minimum
- * @note output[1] = maximum
- *
- * @param[in] input Input tensor with at least 3 dimensions. The dimensions over the third will be interpreted as batches. Data type supported: F32.
- * @param[out] output Output tensor with shape [2, batches, ...] which stores the minimum and maximum value for each 3D input tensor.
- * The dimensions over the second must match the batched dimensions of the input tensor. Data types supported: F32
- */
- void configure(const ITensor *input, ITensor *output);
- /** Static function to check if given info will lead to a valid configuration of @ref NEMinMaxLayerKernel
- *
- * @param[in] input Input tensor info. Data types supported: F32.
- * @param[in] output Output tensor info with shape [2, batches, ...] which stores the minimum and maximum values for each 3D input tensor.
- * The dimensions over the second must match the batched dimensions of the input tensor. Data types supported: F32.
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *input, const ITensorInfo *output);
- /** Resets global minimum and maximum. */
- void reset();
-
- // Inherited methods overridden:
- void run(const Window &window, const ThreadInfo &info) override;
-
-private:
- void update_min_max(float *out_ptr, float min, float max);
- const ITensor *_input;
- ITensor *_output;
- arm_compute::Mutex _mtx;
-};
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_NEMINMAXLAYERKERNEL_H */
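
In the [2, batches] output, index 0 of each column is the minimum and index 1 the maximum for that batch. A sketch wiring it up for an F32 input (the input tensor is assumed already initialised; the helper name is illustrative):

#include "arm_compute/core/NEON/kernels/NEMinMaxLayerKernel.h"
#include "arm_compute/runtime/NEON/NEScheduler.h"
#include "arm_compute/runtime/Tensor.h"

using namespace arm_compute;

void minmax_per_batch(const Tensor &input, Tensor &minmax, unsigned int batches)
{
    // Shape [2, batches]: index 0 = minimum, index 1 = maximum, per batch
    minmax.allocator()->init(TensorInfo(TensorShape(2U, batches), 1, DataType::F32));

    NEMinMaxLayerKernel kernel;
    kernel.configure(&input, &minmax);

    minmax.allocator()->allocate();
    kernel.reset(); // seed the global min/max before each run
    NEScheduler::get().schedule(&kernel, Window::DimY);
}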
diff --git a/arm_compute/core/NEON/kernels/NEMinMaxLocationKernel.h b/arm_compute/core/NEON/kernels/NEMinMaxLocationKernel.h
deleted file mode 100644
index 597a093d70..0000000000
--- a/arm_compute/core/NEON/kernels/NEMinMaxLocationKernel.h
+++ /dev/null
@@ -1,171 +0,0 @@
-/*
- * Copyright (c) 2016-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_NEMINMAXLOCATIONKERNEL_H
-#define ARM_COMPUTE_NEMINMAXLOCATIONKERNEL_H
-
-#include "arm_compute/core/IArray.h"
-#include "arm_compute/core/NEON/INEKernel.h"
-#include "support/Mutex.h"
-
-#include <cstdint>
-
-namespace arm_compute
-{
-class ITensor;
-using IImage = ITensor;
-
-/** Interface for the kernel to perform min max search on an image. */
-class NEMinMaxKernel : public INEKernel
-{
-public:
- const char *name() const override
- {
- return "NEMinMaxKernel";
- }
- /** Default constructor */
- NEMinMaxKernel();
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- NEMinMaxKernel(const NEMinMaxKernel &) = delete;
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- NEMinMaxKernel &operator=(const NEMinMaxKernel &) = delete;
- /** Prevent instances of this class from being moved (As this class contains non movable objects) */
- NEMinMaxKernel(NEMinMaxKernel &&) = delete;
- /** Prevent instances of this class from being moved (As this class contains non movable objects) */
- NEMinMaxKernel &operator=(NEMinMaxKernel &&) = delete;
- /** Default destructor */
- ~NEMinMaxKernel() = default;
-
- /** Initialise the kernel's input and outputs.
- *
- * @param[in] input Input Image. Data types supported: U8/S16/F32.
- * @param[out] min Minimum value of image. Data types supported: S32 if input type is U8/S16, F32 if input type is F32.
- * @param[out] max Maximum value of image. Data types supported: S32 if input type is U8/S16, F32 if input type is F32.
- */
- void configure(const IImage *input, void *min, void *max);
- /** Resets global minimum and maximum. */
- void reset();
-
- // Inherited methods overridden:
- void run(const Window &window, const ThreadInfo &info) override;
-
-private:
- /** Performs the min/max algorithm on U8 images on a given window.
- *
- * @param win The window to run the algorithm on.
- */
- void minmax_U8(Window win);
- /** Performs the min/max algorithm on S16 images on a given window.
- *
- * @param win The window to run the algorithm on.
- */
- void minmax_S16(Window win);
- /** Performs the min/max algorithm on F32 images on a given window.
- *
- * @param win The window to run the algorithm on.
- */
- void minmax_F32(Window win);
- /** Common signature for all the specialised MinMax functions
- *
- * @param[in] window Region on which to execute the kernel.
- */
- using MinMaxFunction = void (NEMinMaxKernel::*)(Window window);
- /** MinMax function to use for the particular image types passed to configure() */
- MinMaxFunction _func;
- /** Helper to update min/max values **/
- template <typename T>
- void update_min_max(T min, T max);
-
- const IImage *_input; /**< Input image. */
- void *_min; /**< Minimum value. */
- void *_max; /**< Maximum value. */
- arm_compute::Mutex _mtx; /**< Mutex used for result reduction. */
-};
-
-/** Interface for the kernel to find min max locations of an image. */
-class NEMinMaxLocationKernel : public INEKernel
-{
-public:
- const char *name() const override
- {
- return "NEMinMaxLocationKernel";
- }
- /** Default constructor */
- NEMinMaxLocationKernel();
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- NEMinMaxLocationKernel(const NEMinMaxLocationKernel &) = delete;
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- NEMinMaxLocationKernel &operator=(const NEMinMaxLocationKernel &) = delete;
- /** Allow instances of this class to be moved */
- NEMinMaxLocationKernel(NEMinMaxLocationKernel &&) = default;
- /** Allow instances of this class to be moved */
- NEMinMaxLocationKernel &operator=(NEMinMaxLocationKernel &&) = default;
- /** Default destructor */
- ~NEMinMaxLocationKernel() = default;
-
- /** Initialise the kernel's input and outputs.
- *
- * @param[in] input Input Image. Data types supported: U8/S16/F32.
- * @param[out] min Minimum value of image. Data types supported: S32 if input type is U8/S16, F32 if input type is F32.
- * @param[out] max Maximum value of image. Data types supported: S32 if input type is U8/S16, F32 if input type is F32.
- * @param[out] min_loc Array of minimum value locations.
- * @param[out] max_loc Array of maximum value locations.
- * @param[out] min_count Number of minimum value encounters.
- * @param[out] max_count Number of maximum value encounters.
- */
- void configure(const IImage *input, void *min, void *max,
- ICoordinates2DArray *min_loc = nullptr, ICoordinates2DArray *max_loc = nullptr,
- uint32_t *min_count = nullptr, uint32_t *max_count = nullptr);
-
- // Inherited methods overridden:
- void run(const Window &window, const ThreadInfo &info) override;
- bool is_parallelisable() const override;
-
-private:
- /** Performs the min/max location algorithm on T type images on a given window.
- *
- * @param win The window to run the algorithm on.
- */
- template <class T, bool count_min, bool count_max, bool loc_min, bool loc_max>
- void minmax_loc(const Window &win);
- /** Common signature for all the specialised MinMaxLoc functions
- *
- * @param[in] window Region on which to execute the kernel.
- */
- using MinMaxLocFunction = void (NEMinMaxLocationKernel::*)(const Window &window);
- /** MinMaxLoc function to use for the particular image types passed to configure() */
- MinMaxLocFunction _func;
- /** Helper to create a function pointer table for the parameterized MinMaxLocation functions. */
- template <class T, typename>
- struct create_func_table;
-
- const IImage *_input; /**< Input image. */
- void *_min; /**< Minimum value. */
- void *_max; /**< Maximum value. */
- uint32_t *_min_count; /**< Count of minimum value encounters. */
- uint32_t *_max_count; /**< Count of maximum value encounters. */
- ICoordinates2DArray *_min_loc; /**< Locations of minimum values. */
- ICoordinates2DArray *_max_loc; /**< Locations of maximum values. */
-};
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_NEMINMAXLOCATIONKERNEL_H */
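
The two kernels are meant to run back to back, as the function-level NEMinMaxLocation did: the first finds the extrema, the second re-scans the image to collect counts and locations. A sketch for a U8 image (the array capacity of 64 is illustrative):

#include "arm_compute/core/NEON/kernels/NEMinMaxLocationKernel.h"
#include "arm_compute/runtime/Array.h"
#include "arm_compute/runtime/NEON/NEScheduler.h"
#include "arm_compute/runtime/Tensor.h"

#include <cstdint>

using namespace arm_compute;

void min_max_location(const Tensor &image)
{
    int32_t  min_val = 0, max_val = 0;   // S32 results for a U8 input
    uint32_t min_count = 0, max_count = 0;
    Coordinates2DArray min_loc(64), max_loc(64);

    NEMinMaxKernel minmax;
    minmax.configure(&image, &min_val, &max_val);
    minmax.reset();
    NEScheduler::get().schedule(&minmax, Window::DimY);

    NEMinMaxLocationKernel location;
    location.configure(&image, &min_val, &max_val, &min_loc, &max_loc, &min_count, &max_count);
    NEScheduler::get().schedule(&location, Window::DimY);
}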
diff --git a/arm_compute/core/NEON/kernels/NENonLinearFilterKernel.h b/arm_compute/core/NEON/kernels/NENonLinearFilterKernel.h
deleted file mode 100644
index 43594bacbf..0000000000
--- a/arm_compute/core/NEON/kernels/NENonLinearFilterKernel.h
+++ /dev/null
@@ -1,151 +0,0 @@
-/*
- * Copyright (c) 2016-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_NENONLINEARFILTERKERNEL_H
-#define ARM_COMPUTE_NENONLINEARFILTERKERNEL_H
-
-#include "arm_compute/core/NEON/INEKernel.h"
-#include "arm_compute/core/Types.h"
-
-#include <cstdint>
-
-namespace arm_compute
-{
-class ITensor;
-
-/** Interface for the kernel to apply a non-linear filter */
-class NENonLinearFilterKernel : public INEKernel
-{
-public:
- const char *name() const override
- {
- return "NENonLinearFilterKernel";
- }
- /** Default constructor */
- NENonLinearFilterKernel();
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- NENonLinearFilterKernel(const NENonLinearFilterKernel &) = delete;
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- NENonLinearFilterKernel &operator=(const NENonLinearFilterKernel &) = delete;
- /** Allow instances of this class to be moved */
- NENonLinearFilterKernel(NENonLinearFilterKernel &&) = default;
- /** Allow instances of this class to be moved */
- NENonLinearFilterKernel &operator=(NENonLinearFilterKernel &&) = default;
- /** Set the source, destination and border mode of the kernel
- *
- * @param[in] input Source tensor. Data type supported: U8
- * @param[out] output Destination tensor. Data type supported: U8
- * @param[in] function Non linear function to perform
- * @param[in] mask_size Mask size. Supported sizes: 3, 5
- * @param[in] pattern Mask pattern
- * @param[in] mask The given mask. Used only if @p pattern is set to PATTERN_OTHER
- * @param[in] border_undefined True if the border mode is undefined. False if it's replicate or constant.
- */
- void configure(const ITensor *input, ITensor *output, NonLinearFilterFunction function, unsigned int mask_size, MatrixPattern pattern, const uint8_t *mask, bool border_undefined);
-
- // Inherited methods overridden:
- void run(const Window &window, const ThreadInfo &info) override;
- BorderSize border_size() const override;
-
-private:
- /** Fill mask with the corresponding given pattern.
- *
- * @param[in,out] mask Mask to be filled according to pattern
- * @param[in] cols Columns (width) of mask
- * @param[in] rows Rows (height) of mask
- * @param[in] pattern Pattern to fill the mask according to
- */
- void fill_mask(uint8_t *mask, int cols, int rows, MatrixPattern pattern);
- /** Apply a median filter when given mask pattern is defined as box.
- *
- * @param[in] win Window to apply the filter on.
- */
- template <int mask_w, int mask_h>
- void median_filter_box(const Window &win);
- /** Apply a min filter when given mask pattern is defined as box.
- *
- * @param[in] win Window to apply the filter on.
- */
- template <int mask_w, int mask_h>
- void min_filter_box(const Window &win);
- /** Apply a max filter when given mask pattern is defined as box.
- *
- * @param[in] win Window to apply the filter on.
- */
- template <int mask_w, int mask_h>
- void max_filter_box(const Window &win);
- /** Apply a median filter when given mask pattern is defined as cross.
- *
- * @param[in] win Window to apply the filter on.
- */
- template <int mask_w, int mask_h>
- void median_filter_cross(const Window &win);
- /** Apply a min filter when given mask pattern is defined as cross.
- *
- * @param[in] win Window to apply the filter on.
- */
- template <int mask_w, int mask_h>
- void min_filter_cross(const Window &win);
- /** Apply a max filter when given mask pattern is defined as cross.
- *
- * @param[in] win Window to apply the filter on.
- */
- template <int mask_w, int mask_h>
- void max_filter_cross(const Window &win);
- /** Apply a median filter when given mask pattern is defined as disk.
- *
- * @param[in] win Window to apply the filter on.
- */
- template <int mask_w, int mask_h>
- void median_filter_disk(const Window &win);
- /** Apply a min filter when given mask pattern is defined as disk.
- *
- * @param[in] win Window to apply the filter on.
- */
- template <int mask_w, int mask_h>
- void min_filter_disk(const Window &win);
- /** Apply a max filter when given mask pattern is defined as disk.
- *
- * @param[in] win Window to apply the filter on.
- */
- template <int mask_w, int mask_h>
- void max_filter_disk(const Window &win);
- /** Apply a non-linear filter when given mask has user-defined pattern.
- *
- * @param[in] win Window to apply the filter on.
- */
- template <int mask_w, int mask_h>
- void non_linear_filter_generic(const Window &win);
-
-private:
- unsigned int _border_width;
- const ITensor *_input;
- ITensor *_output;
- const uint8_t *_mask;
- MatrixPattern _pattern;
- NonLinearFilterFunction _function;
- unsigned int _func_idx;
- BorderSize _border_size;
-};
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_NENONLINEARFILTERKERNEL_H */
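
A sketch of a 5x5 cross median. The mask buffer is only read for the PATTERN_OTHER case, but a valid pointer is still handed over; the zeroed placeholder below is an assumption of this sketch:

#include "arm_compute/core/NEON/kernels/NENonLinearFilterKernel.h"
#include "arm_compute/runtime/NEON/NEScheduler.h"
#include "arm_compute/runtime/Tensor.h"

#include <array>
#include <cstdint>

using namespace arm_compute;

void median5x5_cross(const Tensor &src, Tensor &dst)
{
    std::array<uint8_t, 25> mask{}; // placeholder; only meaningful for MatrixPattern::OTHER

    NENonLinearFilterKernel kernel;
    kernel.configure(&src, &dst, NonLinearFilterFunction::MEDIAN, 5 /* mask_size */,
                     MatrixPattern::CROSS, mask.data(), true /* border_undefined */);
    NEScheduler::get().schedule(&kernel, Window::DimY);
}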
diff --git a/arm_compute/core/NEON/kernels/NENonMaximaSuppression3x3Kernel.h b/arm_compute/core/NEON/kernels/NENonMaximaSuppression3x3Kernel.h
deleted file mode 100644
index e2ddec9a33..0000000000
--- a/arm_compute/core/NEON/kernels/NENonMaximaSuppression3x3Kernel.h
+++ /dev/null
@@ -1,108 +0,0 @@
-/*
- * Copyright (c) 2016-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_NENONMAXIMASUPPRESSION3x3KERNEL_H
-#define ARM_COMPUTE_NENONMAXIMASUPPRESSION3x3KERNEL_H
-
-#include "arm_compute/core/NEON/INEKernel.h"
-
-#include <cstdint>
-
-namespace arm_compute
-{
-class ITensor;
-
-/** Interface to perform Non-Maxima suppression over a 3x3 window using NEON
- *
- * @note Used by @ref NEFastCorners and @ref NEHarrisCorners
- */
-class NENonMaximaSuppression3x3Kernel : public INEKernel
-{
-public:
- const char *name() const override
- {
- return "NENonMaximaSuppression3x3Kernel";
- }
- /** Default constructor */
- NENonMaximaSuppression3x3Kernel();
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- NENonMaximaSuppression3x3Kernel(const NENonMaximaSuppression3x3Kernel &) = delete;
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- NENonMaximaSuppression3x3Kernel &operator=(const NENonMaximaSuppression3x3Kernel &) = delete;
- /** Allow instances of this class to be moved */
- NENonMaximaSuppression3x3Kernel(NENonMaximaSuppression3x3Kernel &&) = default;
- /** Allow instances of this class to be moved */
- NENonMaximaSuppression3x3Kernel &operator=(NENonMaximaSuppression3x3Kernel &&) = default;
- /** Default destructor */
- ~NENonMaximaSuppression3x3Kernel() = default;
-
- /** Initialise the kernel's sources, destinations and border mode.
- *
- * @param[in] input Source tensor. Data types supported: U8/F32
- * @param[out] output Destination tensor. Data types supported: same as @p input
- * @param[in] border_undefined True if the border mode is undefined. False if it's replicate or constant.
- */
- void configure(const ITensor *input, ITensor *output, bool border_undefined);
-
- // Inherited methods overridden:
- void run(const Window &window, const ThreadInfo &info) override;
- BorderSize border_size() const override;
-
-protected:
- /** Common signature for all the specialised non-maxima suppression 3x3 functions
- *
- * @param[in] input_ptr Pointer to the input tensor.
- * @param[out] output_ptr Pointer to the output tensor
- * @param[in] input_stride Stride of the input tensor
- */
- using NonMaxSuppr3x3Function = void(const void *__restrict input_ptr, void *__restrict output_ptr, const uint32_t input_stride);
-
- NonMaxSuppr3x3Function *_func; /**< Non-Maxima suppression function to use for the particular tensor types passed to configure() */
- const ITensor *_input; /**< Source tensor */
- ITensor *_output; /**< Destination tensor */
-};
-
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-/** NEON kernel to perform Non-Maxima suppression 3x3 with intermediate results in FP16 if the input data type is FP32
- */
-class NENonMaximaSuppression3x3FP16Kernel : public NENonMaximaSuppression3x3Kernel
-{
-public:
- const char *name() const override
- {
- return "NENonMaximaSuppression3x3FP16Kernel";
- }
- /** Initialise the kernel's sources, destinations and border mode.
- *
- * @param[in] input Source tensor. Data types supported: U8/F32.
- * @param[out] output Destination tensor. Data types supported: same as @p input
- * @param[in] border_undefined True if the border mode is undefined. False if it's replicate or constant.
- */
- void configure(const ITensor *input, ITensor *output, bool border_undefined);
-};
-#else /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
-/** NEON kernel to perform Non-Maxima suppression 3x3 with intermediate results in FP16 if the input data type is FP32 */
-using NENonMaximaSuppression3x3FP16Kernel = NENonMaximaSuppression3x3Kernel;
-#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_NENONMAXIMASUPPRESSION3x3KERNEL_H */
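
Note the fallback above: on builds without FP16 vector arithmetic the FP16 name degrades to a plain alias of the base kernel, so callers can spell it unconditionally. A sketch:

#include "arm_compute/core/NEON/kernels/NENonMaximaSuppression3x3Kernel.h"
#include "arm_compute/runtime/NEON/NEScheduler.h"
#include "arm_compute/runtime/Tensor.h"

using namespace arm_compute;

void suppress(const Tensor &src, Tensor &dst)
{
    NENonMaximaSuppression3x3FP16Kernel kernel; // plain 3x3 kernel on non-FP16 builds
    kernel.configure(&src, &dst, false /* border_undefined */);
    NEScheduler::get().schedule(&kernel, Window::DimY);
}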
diff --git a/arm_compute/core/NEON/kernels/NENormalizationLayerKernel.h b/arm_compute/core/NEON/kernels/NENormalizationLayerKernel.h
deleted file mode 100644
index 4727164d00..0000000000
--- a/arm_compute/core/NEON/kernels/NENormalizationLayerKernel.h
+++ /dev/null
@@ -1,110 +0,0 @@
-/*
- * Copyright (c) 2017-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_NENORMALIZATIONLAYERKERNEL_H
-#define ARM_COMPUTE_NENORMALIZATIONLAYERKERNEL_H
-
-#include "arm_compute/core/NEON/INEKernel.h"
-
-namespace arm_compute
-{
-class ITensor;
-
-/** Interface for the normalization layer kernel.
- */
-class NENormalizationLayerKernel : public INEKernel
-{
-public:
- const char *name() const override
- {
- return "NENormalizationLayerKernel";
- }
- /** Default constructor */
- NENormalizationLayerKernel();
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- NENormalizationLayerKernel(const NENormalizationLayerKernel &) = delete;
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- NENormalizationLayerKernel &operator=(const NENormalizationLayerKernel &) = delete;
- /** Default Move Constructor. */
- NENormalizationLayerKernel(NENormalizationLayerKernel &&) = default;
- /** Default move assignment operator */
- NENormalizationLayerKernel &operator=(NENormalizationLayerKernel &&) = default;
- /** Default destructor */
- ~NENormalizationLayerKernel() = default;
- /** Set the input and output tensors.
- *
- * @param[in] input Source tensor. 3 lower dims represent a single input with dimensions [width, height, IFM],
- * and an optional 4th dimension for batch of inputs. Data types supported: F16/F32. Data layouts supported: NCHW/NHWC.
- * @param[in] input_squared Source tensor in which each element has been squared. 3 lower dims represent a single input with dimensions [width, height, IFM].
- * Data type and layout supported: same as @p input.
- * @param[out] output Destination tensor. Output will have the same number of dimensions as input. Data type and layout supported: same as @p input.
- * @param[in] norm_info Normalization layer information like the normalization type, normalization size and other parameters.
- */
- void configure(const ITensor *input, const ITensor *input_squared, ITensor *output, NormalizationLayerInfo norm_info);
- /** Static function to check if given info will lead to a valid configuration of @ref NENormalizationLayerKernel
- *
- * @param[in] input Source tensor. 3 lower dims represent a single input with dimensions [width, height, IFM],
- * and an optional 4th dimension for batch of inputs. Data types supported: F16/F32. Data layouts supported: NCHW/NHWC.
- * @param[in] input_squared Source tensor info in which each element has been squared. 3 lower dims represent a single input with dimensions [width, height, IFM].
- * Data type and layout supported: same as @p input.
- * @param[in] output Destination tensor. Output will have the same number of dimensions as input. Data type and layout supported: same as @p input.
- * @param[in] norm_info Normalization layer information like the normalization type, normalization size and other parameters.
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *input, const ITensorInfo *input_squared, const ITensorInfo *output, NormalizationLayerInfo norm_info);
-
- // Inherited methods overridden:
- void run(const Window &window, const ThreadInfo &info) override;
- BorderSize border_size() const override;
-
-private:
- /** Function to perform normalization depending on the given template
- * dimension. The last template parameter specifies whether the
- * normalization is 1D or 2D.
- *
- * @note Only supported normalizations are:
- * - 1D over X or Z
- * - 2D over X and Y
- *
- * @param[in] window Region on which to execute the kernel.
- */
- template <typename T, unsigned int S, unsigned int dim, bool do_2D_norm>
- void normalize_float(const Window &window);
-
- /** Common signature for all the specialised normalization functions
- *
- * @param[in] window Region on which to execute the kernel.
- */
- using NormalizationFunction = void (NENormalizationLayerKernel::*)(const Window &window);
-
-private:
- NormalizationFunction _func;
- const ITensor *_input;
- const ITensor *_input_squared;
- ITensor *_output;
- NormalizationLayerInfo _norm_info;
- BorderSize _border_size;
-};
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_NENORMALIZATIONLAYERKERNEL_H */
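
The caller supplies both the input and its element-wise square (the function-level wrapper produced the square with a multiply kernel). A cross-map LRN sketch with a normalization size of 5, assuming the runtime Tensor and NEScheduler facilities:

#include "arm_compute/core/NEON/kernels/NENormalizationLayerKernel.h"
#include "arm_compute/runtime/NEON/NEScheduler.h"
#include "arm_compute/runtime/Tensor.h"

using namespace arm_compute;

void lrn_cross_map(const Tensor &input, const Tensor &input_squared, Tensor &output)
{
    const NormalizationLayerInfo info(NormType::CROSS_MAP, 5 /* norm_size */);
    if(NENormalizationLayerKernel::validate(input.info(), input_squared.info(),
                                            output.info(), info).error_code() != ErrorCode::OK)
    {
        return;
    }
    NENormalizationLayerKernel kernel;
    kernel.configure(&input, &input_squared, &output, info);
    NEScheduler::get().schedule(&kernel, Window::DimY);
}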
diff --git a/arm_compute/core/NEON/kernels/NEPadLayerKernel.h b/arm_compute/core/NEON/kernels/NEPadLayerKernel.h
deleted file mode 100644
index 4cbefbd1e3..0000000000
--- a/arm_compute/core/NEON/kernels/NEPadLayerKernel.h
+++ /dev/null
@@ -1,113 +0,0 @@
-/*
- * Copyright (c) 2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_NEPADLAYERKERNEL_H
-#define ARM_COMPUTE_NEPADLAYERKERNEL_H
-
-#include "arm_compute/core/NEON/INEKernel.h"
-
-namespace arm_compute
-{
-class ITensor;
-
-/** NEON kernel to add padding to a tensor
- *
- * Add padding given padding information
- */
-class NEPadLayerKernel : public INEKernel
-{
-public:
- const char *name() const override
- {
- return "NEPadLayerKernel";
- }
- /** Default constructor */
- NEPadLayerKernel();
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- NEPadLayerKernel(const NEPadLayerKernel &) = delete;
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- NEPadLayerKernel &operator=(const NEPadLayerKernel &) = delete;
- /** Allow instances of this class to be moved */
- NEPadLayerKernel(NEPadLayerKernel &&) = default;
- /** Allow instances of this class to be moved */
- NEPadLayerKernel &operator=(NEPadLayerKernel &&) = default;
- /** Default destructor */
- ~NEPadLayerKernel() = default;
-
- /** Initialise the function
- *
- * @param[in] input Source tensor. Data types supported: All.
- * @param[out] output Output tensor. Data type supported: same as @p input
- * @param[in] padding The padding for each spatial dimension of the input tensor. The pair padding[i]
- * specifies the front and the end padding in the i-th dimension.
- * @param[in] constant_value (Optional) Constant value to be used for the padding
- * @param[in] mode (Optional) Controls whether the padding should be filled with @p constant_value using CONSTANT.
- * Only CONSTANT padding mode is currently supported
- */
- void configure(ITensor *input, ITensor *output, const PaddingList &padding, const PixelValue constant_value = PixelValue(), const PaddingMode mode = PaddingMode::CONSTANT);
- /** Static function to check if given info will lead to a valid configuration of @ref NEPadLayerKernel.
- *
- * @param[in] input Source tensor info. Data types supported: All.
- * @param[in] output Output tensor info. Data type supported: same as @p input
- * @param[in] padding The padding for each spatial dimension of the input tensor. The pair padding[i]
- * specifies the front and the end padding in the i-th dimension.
- * @param[in] constant_value (Optional) Constant value to be used for the padding
- * @param[in] mode (Optional) Controls whether the padding should be filled with @p constant_value using CONSTANT.
- * Only CONSTANT padding mode is currently supported
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *input, const ITensorInfo *output, const PaddingList &padding, const PixelValue constant_value = PixelValue(), const PaddingMode mode = PaddingMode::CONSTANT);
-
- // Inherited methods overridden:
- void run(const Window &window, const ThreadInfo &info) override;
-
-private:
- /** Template function to run the padding function with constant padding
- *
- * @param[in] window Region on which to execute the kernel. (Must be a valid region of the window returned by window()).
- */
- template <typename T>
- void run_pad_constant(const Window &window);
-
- /** Function to run the padding function with constant padding for 3D input and 1D, 2D, 3D padding
- *
- * @param[in] window Region on which to execute the kernel. (Must be a valid region of the window returned by window()).
- */
- void run_pad_constant_uint8_3Dinput_3Dpad(const Window &window);
-
- /** Common signature for all the specialised padding functions
- *
- * @param[in] window Region on which to execute the kernel.
- */
- using PadFunctionPtr = void (NEPadLayerKernel::*)(const Window &window);
-
- PadFunctionPtr _func;
- const ITensor *_input;
- ITensor *_output;
- PaddingList _padding;
- PixelValue _constant_value;
- PaddingMode _mode;
-};
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_NEPADLAYERKERNEL_H */
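
Padding is expressed per dimension as (front, end) pairs. A sketch padding width by one element on each side and height by two, filled with the default zero value (the helper name is illustrative):

#include "arm_compute/core/NEON/kernels/NEPadLayerKernel.h"
#include "arm_compute/runtime/NEON/NEScheduler.h"
#include "arm_compute/runtime/Tensor.h"

using namespace arm_compute;

void pad_constant(Tensor &src, Tensor &dst)
{
    const PaddingList padding = { { 1, 1 }, { 2, 2 } }; // dim 0: 1 front/1 end; dim 1: 2/2

    NEPadLayerKernel kernel;
    kernel.configure(&src, &dst, padding, PixelValue() /* zero */, PaddingMode::CONSTANT);
    NEScheduler::get().schedule(&kernel, Window::DimY);
}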
diff --git a/arm_compute/core/NEON/kernels/NEPermuteKernel.h b/arm_compute/core/NEON/kernels/NEPermuteKernel.h
deleted file mode 100644
index 89dc4e6fc7..0000000000
--- a/arm_compute/core/NEON/kernels/NEPermuteKernel.h
+++ /dev/null
@@ -1,102 +0,0 @@
-/*
- * Copyright (c) 2018-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_NEPERMUTEKERNEL_H
-#define ARM_COMPUTE_NEPERMUTEKERNEL_H
-
-#include "arm_compute/core/NEON/INEKernel.h"
-
-namespace arm_compute
-{
-// Forward declarations
-class ITensor;
-
-/** NEON kernel to perform tensor permutation.
- *
- * Permutes a tensor according to a given permutation vector
- */
-class NEPermuteKernel : public INEKernel
-{
-public:
- const char *name() const override
- {
- return "NEPermuteKernel";
- }
- /** Default constructor */
- NEPermuteKernel();
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- NEPermuteKernel(const NEPermuteKernel &) = delete;
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- NEPermuteKernel &operator=(const NEPermuteKernel &) = delete;
- /** Allow instances of this class to be moved */
- NEPermuteKernel(NEPermuteKernel &&) = default;
- /** Allow instances of this class to be moved */
- NEPermuteKernel &operator=(NEPermuteKernel &&) = default;
- /** Default destructor */
- ~NEPermuteKernel() = default;
-
- /** Set the input and output of the kernel.
- *
- * @note Arbitrary permutation vectors are supported with rank not greater than 4
- *
- * @param[in] input The input tensor to permute. Data types supported: All
- * @param[out] output The output tensor. Data types supported: Same as @p input
- * @param[in] perm Permutation vector
- */
- void configure(const ITensor *input, ITensor *output, const PermutationVector &perm);
- /** Static function to check if given info will lead to a valid configuration of @ref NEPermuteKernel
- *
- * @note Arbitrary permutation vectors are supported with rank not greater than 4
- *
- * @param[in] input The input tensor to permute. Data types supported: All
- * @param[in] output The output tensor. Data types supported: Same as @p input
- * @param[in] perm Permutation vector
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *input, const ITensorInfo *output, const PermutationVector &perm);
-
- // Inherited methods overridden:
- void run(const Window &window, const ThreadInfo &info) override;
-
-private:
- /** Template function to run the permute
- *
- * @param[in] window Region on which to execute the kernel. (Must be a valid region of the window returned by window()).
- */
- template <typename T>
- void run_permute(const Window &window);
-
- /** Common signature for all the specialised permute functions
- *
- * @param[in] window Region on which to execute the kernel.
- */
- using PermuteFunctionPtr = void (NEPermuteKernel::*)(const Window &window);
-
- PermuteFunctionPtr _func;
- const ITensor *_input;
- ITensor *_output;
- PermutationVector _perm;
-};
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_NEPERMUTEKERNEL_H */
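The permute kernel follows the same configure()/validate() pattern. The sketch below assumes the conventional NCHW-to-NHWC permutation vector (2, 0, 1) and hypothetical shapes; in ACL coordinates an NCHW tensor is (W, H, C) and its NHWC counterpart is (C, W, H).

    // Hypothetical sketch: NCHW -> NHWC permutation with the deleted NEPermuteKernel.
    #include "arm_compute/core/NEON/kernels/NEPermuteKernel.h" // header deleted by this change
    #include "arm_compute/runtime/NEON/NEScheduler.h"
    #include "arm_compute/runtime/Tensor.h"

    using namespace arm_compute;

    void permute_example()
    {
        Tensor src, dst;
        src.allocator()->init(TensorInfo(TensorShape(4U, 4U, 3U), 1, DataType::F32)); // (W, H, C)
        dst.allocator()->init(TensorInfo(TensorShape(3U, 4U, 4U), 1, DataType::F32)); // (C, W, H)

        const PermutationVector perm(2U, 0U, 1U); // rank <= 4, per the documentation above

        if(NEPermuteKernel::validate(src.info(), dst.info(), perm))
        {
            NEPermuteKernel kernel;
            kernel.configure(&src, &dst, perm);
            src.allocator()->allocate();
            dst.allocator()->allocate();
            NEScheduler::get().schedule(&kernel, Window::DimY);
        }
    }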
diff --git a/arm_compute/core/NEON/kernels/NEPixelWiseMultiplicationKernel.h b/arm_compute/core/NEON/kernels/NEPixelWiseMultiplicationKernel.h
deleted file mode 100644
index 1a9dd6be2e..0000000000
--- a/arm_compute/core/NEON/kernels/NEPixelWiseMultiplicationKernel.h
+++ /dev/null
@@ -1,195 +0,0 @@
-/*
- * Copyright (c) 2016-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_NEPIXELWISEMULTIPLICATIONKERNEL_H
-#define ARM_COMPUTE_NEPIXELWISEMULTIPLICATIONKERNEL_H
-
-#include "arm_compute/core/NEON/INEKernel.h"
-#include "arm_compute/core/Types.h"
-
-namespace arm_compute
-{
-class ITensor;
-
-/** Interface for the kernel to perform pixel-wise multiplication between two tensors */
-class NEPixelWiseMultiplicationKernel : public INEKernel
-{
-public:
- const char *name() const override
- {
- return "NEPixelWiseMultiplicationKernel";
- }
- /** Default constructor */
- NEPixelWiseMultiplicationKernel();
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- NEPixelWiseMultiplicationKernel(const NEPixelWiseMultiplicationKernel &) = delete;
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- NEPixelWiseMultiplicationKernel &operator=(const NEPixelWiseMultiplicationKernel &) = delete;
- /** Allow instances of this class to be moved */
- NEPixelWiseMultiplicationKernel(NEPixelWiseMultiplicationKernel &&) = default;
- /** Allow instances of this class to be moved */
- NEPixelWiseMultiplicationKernel &operator=(NEPixelWiseMultiplicationKernel &&) = default;
- /** Default destructor */
- ~NEPixelWiseMultiplicationKernel() = default;
- /** Initialise the kernel's inputs, output and scaling behaviour.
- *
- * @note For @p scale equal to 1/255 only round to nearest even (implemented as round half up) is supported.
- * For all other scale values only round to zero (implemented as round towards minus infinity) is supported.
- *
- * @param[in] input1 An input tensor. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/F16/F32
- * @param[in] input2 An input tensor. Data types supported: U8, QASYMM8 (only if @p input1 is QASYMM8), QASYMM8_SIGNED (only if @p input1 is QASYMM8_SIGNED), S16, QSYMM16 (only if @p input1 is QSYMM16), F16 (only if @p input1 is F16), F32 (only if @p input1 is F32).
- * @param[out] output Output tensor. Data types supported:
- * - U8, only if both inputs are U8.
- * - QASYMM8, only if both inputs are QASYMM8.
- * - QASYMM8_SIGNED, only if @p input1 is QASYMM8_SIGNED.
- * - S16.
- * - QSYMM16, only if both inputs are QSYMM16.
- * - S32, only if both inputs are QSYMM16.
- * - F16, only if @p input1 is F16.
- * - F32, only if both inputs are F32.
- * @param[in] scale Scale to apply after multiplication.
- * Scale must be positive and its value must be either 1/255 or 1/2^n where n is between 0 and 15.
- * @param[in] overflow_policy Overflow policy. ConvertPolicy cannot be WRAP if datatype is QASYMM8, QASYMM8_SIGNED or QSYMM16.
- * @param[in] rounding_policy Rounding policy.
- */
- void configure(const ITensor *input1, const ITensor *input2, ITensor *output, float scale, ConvertPolicy overflow_policy, RoundingPolicy rounding_policy);
- /** Static function to check if given info will lead to a valid configuration of @ref NEPixelWiseMultiplicationKernel
- *
- * @note For @p scale equal to 1/255 only round to nearest even (implemented as round half up) is supported.
- * For all other scale values only round to zero (implemented as round towards minus infinity) is supported.
- *
- * @param[in] input1 An input tensor info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/QSYMM16/S16/F16/F32
- * @param[in] input2 An input tensor info. Data types supported: U8, QASYMM8 (only if @p input1 is QASYMM8), QASYMM8_SIGNED (only if @p input1 is QASYMM8_SIGNED), S16, QSYMM16 (only if @p input1 is QSYMM16), F16 (only if @p input1 is F16), F32 (only if @p input1 is F32).
- * @param[in] output Output tensor info. Data types supported:
- * - U8, only if both inputs are U8.
- * - QASYMM8, only if both inputs are QASYMM8.
- * - QASYMM8_SIGNED, only if @p input1 is QASYMM8_SIGNED.
- * - S16.
- * - QSYMM16, only if both inputs are QSYMM16.
- * - S32, only if both inputs are QSYMM16.
- * - F16, only if @p input1 is F16.
- * - F32, only if both inputs are F32.
- * @param[in] scale Scale to apply after multiplication.
- * Scale must be positive and its value must be either 1/255 or 1/2^n where n is between 0 and 15.
- * @param[in] overflow_policy Overflow policy. ConvertPolicy cannot be WRAP if datatype is QASYMM8, QASYMM8_SIGNED or QSYMM16.
- * @param[in] rounding_policy Rounding policy.
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, float scale, ConvertPolicy overflow_policy, RoundingPolicy rounding_policy);
-
- // Inherited methods overridden:
- void run(const Window &window, const ThreadInfo &info) override;
- BorderSize border_size() const override;
-
-private:
- /** Common signature for all the specialised multiplication functions with integer scaling factor
- *
- * @param[in] input1_ptr Pointer to the first input tensor.
- * @param[in] input2_ptr Pointer to the second input tensor.
- * @param[out] output_ptr Pointer to the output tensor.
- * @param[in] scale Integer scale factor.
- */
- using MulFunctionInt = void(const void *__restrict input1_ptr, const void *__restrict input2_ptr, void *__restrict output_ptr, int scale);
- /** Common signature for all the specialised multiplication functions with float scaling factor
- *
- * @param[in] input1_ptr Pointer to the first input tensor.
- * @param[in] input2_ptr Pointer to the second input tensor.
- * @param[out] output_ptr Pointer to the output tensor.
- * @param[in] scale Float scale factor.
- */
- using MulFunctionFloat = void(const void *__restrict input1_ptr, const void *__restrict input2_ptr, void *__restrict output_ptr, float scale);
- /** Common signature for all the specialised QASYMM8 multiplication functions with float scaling factor
- *
- * @param[in] input1_ptr Pointer to the first input tensor.
- * @param[in] input2_ptr Pointer to the second input tensor.
- * @param[out] output_ptr Pointer to the output tensor.
- * @param[in] scale Float scale factor.
- * @param[in] input1_qua_info Quantization Info of tensor input1.
- * @param[in] input2_qua_info Quantization Info of tensor input2.
- * @param[in] output_qua_info Quantization Info of tensor output.
- *
- */
- using MulFunctionQuantized = void(const void *__restrict input1_ptr, const void *__restrict input2_ptr, void *__restrict output_ptr, float scale,
- const UniformQuantizationInfo &input1_qua_info, const UniformQuantizationInfo &input2_qua_info, const UniformQuantizationInfo &output_qua_info);
-
- MulFunctionFloat *_func_float;
- MulFunctionInt *_func_int;
- MulFunctionQuantized *_func_quantized;
-
-private:
- const ITensor *_input1;
- const ITensor *_input2;
- ITensor *_output;
- float _scale;
- int _scale_exponent;
- bool _run_optimized_qasymm8;
-};
-
-/** Interface for the complex pixelwise multiplication kernel. */
-class NEComplexPixelWiseMultiplicationKernel : public INEKernel
-{
-public:
- const char *name() const override
- {
- return "NEComplexPixelWiseMultiplicationKernel";
- }
- /** Default constructor.*/
- NEComplexPixelWiseMultiplicationKernel();
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- NEComplexPixelWiseMultiplicationKernel(const NEComplexPixelWiseMultiplicationKernel &) = delete;
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- NEComplexPixelWiseMultiplicationKernel &operator=(const NEComplexPixelWiseMultiplicationKernel &) = delete;
- /** Allow instances of this class to be moved */
- NEComplexPixelWiseMultiplicationKernel(NEComplexPixelWiseMultiplicationKernel &&) = default;
- /** Allow instances of this class to be moved */
- NEComplexPixelWiseMultiplicationKernel &operator=(NEComplexPixelWiseMultiplicationKernel &&) = default;
- /** Initialise the kernel's inputs and output.
- *
- * @param[in] input1 An input tensor. Data types supported: F32. Number of channels supported: 2 (complex tensor).
- * @param[in] input2 An input tensor. Data types supported: same as @p input1. Number of channels supported: same as @p input1.
- * @param[out] output The output tensor. Data types supported: same as @p input1. Number of channels supported: same as @p input1.
- */
- void configure(const ITensor *input1, const ITensor *input2, ITensor *output);
- /** Static function to check if given info will lead to a valid configuration of @ref NEComplexPixelWiseMultiplicationKernel
- *
- * @param[in] input1 An input tensor info. Data types supported: F32. Number of channels supported: 2 (complex tensor).
- * @param[in] input2 An input tensor info. Data types supported: same as @p input1. Number of channels supported: same as @p input1.
- * @param[in] output The output tensor info. Data types supported: same as @p input1. Number of channels supported: same as @p input1.
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output);
-
- // Inherited methods overridden:
- void run(const Window &window, const ThreadInfo &info) override;
- BorderSize border_size() const override;
-
-private:
- const ITensor *_input1;
- const ITensor *_input2;
- ITensor *_output;
-};
-
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_NEPIXELWISEMULTIPLICATIONKERNEL_H */
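The scale constraint documented above (1/255 or 1/2^n with n in [0, 15]) is easy to trip over. This hedged sketch uses scale = 1.0, which satisfies the 1/2^n rule with n = 0, on two F32 inputs; shapes and the scheduler split are assumptions.

    // Hypothetical sketch: element-wise multiply of two F32 tensors with scale 1.
    #include "arm_compute/core/NEON/kernels/NEPixelWiseMultiplicationKernel.h" // deleted by this change
    #include "arm_compute/runtime/NEON/NEScheduler.h"
    #include "arm_compute/runtime/Tensor.h"

    using namespace arm_compute;

    void multiply_example()
    {
        Tensor a, b, out;
        const TensorInfo info(TensorShape(16U, 16U), 1, DataType::F32);
        a.allocator()->init(info);
        b.allocator()->init(info);
        out.allocator()->init(info);

        // scale = 1.f is valid as 1/2^0; WRAP would also be allowed here since no input is quantized.
        if(NEPixelWiseMultiplicationKernel::validate(a.info(), b.info(), out.info(), 1.f,
                                                     ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO))
        {
            NEPixelWiseMultiplicationKernel kernel;
            kernel.configure(&a, &b, &out, 1.f, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO);
            a.allocator()->allocate();
            b.allocator()->allocate();
            out.allocator()->allocate();
            NEScheduler::get().schedule(&kernel, Window::DimY);
        }
    }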
diff --git a/arm_compute/core/NEON/kernels/NEPoolingLayerKernel.h b/arm_compute/core/NEON/kernels/NEPoolingLayerKernel.h
deleted file mode 100644
index b0574b7cf6..0000000000
--- a/arm_compute/core/NEON/kernels/NEPoolingLayerKernel.h
+++ /dev/null
@@ -1,222 +0,0 @@
-/*
- * Copyright (c) 2017-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_NEPOOLINGLAYERKERNEL_H
-#define ARM_COMPUTE_NEPOOLINGLAYERKERNEL_H
-
-#include "arm_compute/core/NEON/INEKernel.h"
-
-namespace arm_compute
-{
-class ITensor;
-
-/** Interface for the pooling layer kernel */
-class NEPoolingLayerKernel : public INEKernel
-{
-public:
- const char *name() const override
- {
- return "NEPoolingLayerKernel";
- }
- /** Default constructor */
- NEPoolingLayerKernel();
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- NEPoolingLayerKernel(const NEPoolingLayerKernel &) = delete;
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- NEPoolingLayerKernel &operator=(const NEPoolingLayerKernel &) = delete;
- /** Allow instances of this class to be moved */
- NEPoolingLayerKernel(NEPoolingLayerKernel &&) = default;
- /** Allow instances of this class to be moved */
- NEPoolingLayerKernel &operator=(NEPoolingLayerKernel &&) = default;
- /** Default destructor */
- ~NEPoolingLayerKernel() = default;
- /** Set the input and output tensors.
- *
- * @note F16 is supported for pool sizes 2 and 3 only
- *
- * @param[in] input Source tensor. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32.
- * @param[out] output Destination tensor. Data types supported: Same as @p input.
- * @param[in] pool_info Contains pooling operation information described in @ref PoolingLayerInfo.
- * @param[out] indices (optional) The indices of the maximal values. Data type supported: U32.
- */
- void configure(const ITensor *input, ITensor *output, const PoolingLayerInfo &pool_info, ITensor *indices = nullptr);
- /** Static function to check if given info will lead to a valid configuration of @ref NEPoolingLayerKernel
- *
- * @note F16 is supported for pool sizes 2 and 3 only
- *
- * @param[in] input Source tensor. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32.
- * @param[in] output Destination tensor. Data types supported: Same as @p input.
- * @param[in] pool_info Contains pooling operation information described in @ref PoolingLayerInfo.
- * @param[in] indices (optional) The indices of the maximal values. Data type supported: U32.
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *input, const ITensorInfo *output, const PoolingLayerInfo &pool_info, const ITensorInfo *indices = nullptr);
-
- // Inherited methods overridden:
- void run(const Window &window, const ThreadInfo &info) override;
- BorderSize border_size() const override;
-
-private:
- /** Function to perform 2x2 pooling.
- *
- * @param[in] window_input Input region on which to execute the kernel.
- * @param[in] window Output region on which to execute the kernel.
- * @param[in] pooling_type Pooling operation to be computed.
- * @param[in] exclude_padding Flag to specify exclusion of padding from the operation.
- */
- void pooling2_f32_nchw(const Window &window_input, const Window &window, PoolingType pooling_type, bool exclude_padding = false);
- /** Function to perform 2x2 pooling and compute the pooling indices. The indices can be used for max unpool.
- *
- * @param[in] window_input Input region on which to execute the kernel.
- * @param[in] window Output region on which to execute the kernel.
- */
- void pooling2_f32_nchw_maxpool_indices(const Window &window_input, const Window &window);
- /** Function to perform 2x2 pooling and compute the pooling indices. The indices can be used for max unpool.
- *
- * @param[in] window_input Input region on which to execute the kernel.
- * @param[in] window Output region on which to execute the kernel.
- */
- void pooling2_f32_nhwc_maxpool_indices(const Window &window_input, const Window &window);
- /** Function to perform MxN pooling for 32-bit floating point values.
- *
- * @param[in] window_input Input region on which to execute the kernel.
- * @param[in] window Output region on which to execute the kernel.
- * @param[in] pooling_type Pooling operation to be computed.
- * @param[in] exclude_padding Flag to specify exclusion of padding from the operation.
- */
- void poolingMxN_f32_nchw(const Window &window_input, const Window &window, PoolingType pooling_type, bool exclude_padding = false);
- /** Function to perform MxN pooling for 32-bit floating point values (NHWC).
- *
- * @param[in] window_input Input region on which to execute the kernel.
- * @param[in] window Output region on which to execute the kernel.
- * @param[in] pooling_type Pooling operation to be computed.
- * @param[in] exclude_padding Flag to specify exclusion of padding from the operation.
- */
- void poolingMxN_f32_nhwc(const Window &window_input, const Window &window, PoolingType pooling_type, bool exclude_padding = false);
- /** Function to perform 7x7 pooling.
- *
- * @param[in] window_input Input region on which to execute the kernel.
- * @param[in] window Output region on which to execute the kernel.
- * @param[in] pooling_type Pooling operation to be computed.
- * @param[in] exclude_padding Flag to specify exclusion of padding from the operation.
- */
- void pooling7_f32_nchw(const Window &window_input, const Window &window, PoolingType pooling_type, bool exclude_padding = false);
- /** Function to perform 3x3 pooling.
- *
- * @param[in] window_input Input region on which to execute the kernel.
- * @param[in] window Output region on which to execute the kernel.
- * @param[in] pooling_type Pooling operation to be computed.
- * @param[in] exclude_padding Flag to specify exclusion of padding from the operation.
- */
- void pooling3_f32_nchw(const Window &window_input, const Window &window, PoolingType pooling_type, bool exclude_padding = false);
- /** Function to perform 2x2 pooling for float16_t.
- *
- * @param[in] window_input Input region on which to execute the kernel.
- * @param[in] window Output region on which to execute the kernel.
- * @param[in] pooling_type Pooling operation to be computed.
- * @param[in] exclude_padding Flag to specify exclusion of padding from the operation.
- */
- void pooling2_f16_nchw(const Window &window_input, const Window &window, PoolingType pooling_type, bool exclude_padding = false);
- /** Function to perform 3x3 pooling.
- *
- * @param[in] window_input Input region on which to execute the kernel.
- * @param[in] window Output region on which to execute the kernel.
- * @param[in] pooling_type Pooling operation to be computed.
- * @param[in] exclude_padding Flag to specify exclusion of padding from the operation.
- */
- void pooling3_f16_nchw(const Window &window_input, const Window &window, PoolingType pooling_type, bool exclude_padding = false);
- /** Function to perform MxN pooling for 16-bit floating point values.
- *
- * @param[in] window_input Input region on which to execute the kernel.
- * @param[in] window Output region on which to execute the kernel.
- * @param[in] pooling_type Pooling operation to be computed.
- * @param[in] exclude_padding Flag to specify exclusion of padding from the operation.
- */
- void poolingMxN_f16_nchw(const Window &window_input, const Window &window, PoolingType pooling_type, bool exclude_padding = false);
- /** Function to perform MxN pooling for 16-bit floating point values. (NHWC)
- *
- * @param[in] window_input Input region on which to execute the kernel.
- * @param[in] window Output region on which to execute the kernel.
- * @param[in] pooling_type Pooling operation to be computed.
- * @param[in] exclude_padding Flag to specify exclusion of padding from the operation.
- */
- void poolingMxN_f16_nhwc(const Window &window_input, const Window &window, PoolingType pooling_type, bool exclude_padding = false);
- /** Template function to perform 2x2 pooling for 8bit quantized fixed point. (NCHW)
- *
- * @param[in] window_input Input region on which to execute the kernel.
- * @param[in] window Output region on which to execute the kernel.
- * @param[in] pooling_type Pooling operation to be computed.
- * @param[in] exclude_padding Flag to specify exclusion of padding from the operation.
- */
- template <typename T>
- void pooling2_q8_nchw(const Window &window_input, const Window &window, PoolingType pooling_type, bool exclude_padding = false);
- /** Template function to perform 3x3 pooling for 8bit quantized fixed point. (NCHW)
- *
- * @param[in] window_input Input region on which to execute the kernel.
- * @param[in] window Output region on which to execute the kernel.
- * @param[in] pooling_type Pooling operation to be computed.
- * @param[in] exclude_padding Flag to specify exclusion of padding from the operation.
- */
- template <typename T>
- void pooling3_q8_nchw(const Window &window_input, const Window &window, PoolingType pooling_type, bool exclude_padding = false);
- /** Template function to perform MxN pooling for 8-bit quantized. (NCHW)
- *
- * @param[in] window_input Input region on which to execute the kernel.
- * @param[in] window Output region on which to execute the kernel.
- * @param[in] pooling_type Pooling operation to be computed.
- * @param[in] exclude_padding Flag to specify exclusion of padding from the operation.
- */
- template <typename T>
- void poolingMxN_q8_nchw(const Window &window_input, const Window &window, PoolingType pooling_type, bool exclude_padding = false);
- /** Template function to perform MxN pooling for 8-bit quantized. (NHWC)
- *
- * @param[in] window_input Input region on which to execute the kernel.
- * @param[in] window Output region on which to execute the kernel.
- * @param[in] pooling_type Pooling operation to be computed.
- * @param[in] exclude_padding Flag to specify exclusion of padding from the operation.
- */
- template <typename T>
- void poolingMxN_q8_nhwc(const Window &window_input, const Window &window, PoolingType pooling_type, bool exclude_padding = false);
- /** Common signature for all the specialised Pooling functions
- *
- * @param[in] window_input Input region on which to execute the kernel.
- * @param[in] window Output region on which to execute the kernel.
- * @param[in] pooling_type Pooling operation to be computed.
- * @param[in] exclude_padding Flag to specify exclusion of padding from the operation.
- */
- using PoolingFunction = void (NEPoolingLayerKernel::*)(const Window &window_input, const Window &window, PoolingType pooling_type, bool exclude_padding);
-
-private:
- PoolingFunction _func;
- const ITensor *_input;
- ITensor *_output;
- ITensor *_indices;
- PoolingLayerInfo _pool_info;
- DataLayout _data_layout;
- unsigned int _num_elems_processed_per_iteration;
- BorderSize _border_size;
- bool _is_square;
-};
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_NEPOOLINGLAYERKERNEL_H */
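A sketch of a 2x2, stride-2 max pool using the kernel above. The PoolingLayerInfo constructor arguments (type, size, layout, stride info) are assumed from this version of the library, and border handling, which the runtime NEPoolingLayer normally performs around this kernel, is glossed over.

    // Hypothetical sketch: 2x2 max pooling with stride 2 on an F32 NCHW tensor.
    #include "arm_compute/core/NEON/kernels/NEPoolingLayerKernel.h" // deleted by this change
    #include "arm_compute/runtime/NEON/NEScheduler.h"
    #include "arm_compute/runtime/Tensor.h"

    using namespace arm_compute;

    void pooling_example()
    {
        Tensor src, dst;
        src.allocator()->init(TensorInfo(TensorShape(8U, 8U, 2U), 1, DataType::F32));
        dst.allocator()->init(TensorInfo(TensorShape(4U, 4U, 2U), 1, DataType::F32));

        const PoolingLayerInfo pool_info(PoolingType::MAX, 2, DataLayout::NCHW, PadStrideInfo(2, 2, 0, 0));

        if(NEPoolingLayerKernel::validate(src.info(), dst.info(), pool_info))
        {
            NEPoolingLayerKernel kernel;
            kernel.configure(&src, &dst, pool_info); // indices omitted: plain pooling, no max-unpool
            src.allocator()->allocate();
            dst.allocator()->allocate();
            // NB: the kernel reports a border_size(); a real caller must fill the border first.
            NEScheduler::get().schedule(&kernel, Window::DimY);
        }
    }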
diff --git a/arm_compute/core/NEON/kernels/NEPriorBoxLayerKernel.h b/arm_compute/core/NEON/kernels/NEPriorBoxLayerKernel.h
deleted file mode 100644
index 6bf6574568..0000000000
--- a/arm_compute/core/NEON/kernels/NEPriorBoxLayerKernel.h
+++ /dev/null
@@ -1,98 +0,0 @@
-/*
- * Copyright (c) 2018-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_NEPRIORBOXLAYERKERNEL_H
-#define ARM_COMPUTE_NEPRIORBOXLAYERKERNEL_H
-
-#include "arm_compute/core/NEON/INEKernel.h"
-
-namespace arm_compute
-{
-class ITensor;
-
-/** Interface for the kernel to calculate prior boxes */
-class NEPriorBoxLayerKernel : public INEKernel
-{
-public:
- const char *name() const override
- {
- return "NEPriorBoxLayerKernel";
- }
- /** Default constructor */
- NEPriorBoxLayerKernel();
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- NEPriorBoxLayerKernel(const NEPriorBoxLayerKernel &) = delete;
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- NEPriorBoxLayerKernel &operator=(const NEPriorBoxLayerKernel &) = delete;
- /** Allow instances of this class to be moved */
- NEPriorBoxLayerKernel(NEPriorBoxLayerKernel &&) = default;
- /** Allow instances of this class to be moved */
- NEPriorBoxLayerKernel &operator=(NEPriorBoxLayerKernel &&) = default;
- /** Set the input and output tensors.
- *
- * @param[in] input1 First source tensor. Data types supported: F32. Data layouts supported: NCHW/NHWC.
- * @param[in] input2 Second source tensor. Data types and layouts supported: same as @p input1
- * @param[out] output Destination tensor. Output dimensions are [W * H * num_priors * 4, 2]. Data type supported: same as @p input1
- * @param[in] info Prior box layer info.
- */
- void configure(const ITensor *input1, const ITensor *input2, ITensor *output, const PriorBoxLayerInfo &info);
- /** Static function to check if given info will lead to a valid configuration of @ref NEPriorBoxLayerKernel
- *
- * @param[in] input1 First source tensor info. Data types supported: F32. Data layouts supported: NCHW/NHWC.
- * @param[in] input2 Second source tensor info. Data types and layouts supported: same as @p input1
- * @param[in]  output Destination tensor info. Output dimensions are [W * H * num_priors * 4, 2]. Data type supported: same as @p input1
- * @param[in] info Prior box layer info.
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, const PriorBoxLayerInfo &info);
-
- // Inherited methods overridden:
- void run(const Window &window, const ThreadInfo &info) override;
-
-private:
- /** Stores the coordinates of the calculated prior boxes.
- *
- * @param[out] out Output pointer.
- * @param[in] offset Output offset to write to.
- * @param[in] center_x Center pixel value on x-axis.
- * @param[in] center_y Center pixel value on y-axis.
- * @param[in] box_width Prior box width.
- * @param[in] box_height Prior box height.
- * @param[in] width Input width.
- * @param[in] height Input height.
- */
- void store_coordinates(float *out, const int offset, const float center_x, const float center_y, const float box_width, const float box_height, const int width, const int height);
- /** Function to calculate prior boxes.
- *
- * @param[in] window Input region on which to execute the kernel.
- */
- void calculate_prior_boxes(const Window &window);
-
- const ITensor *_input1;
- const ITensor *_input2;
- ITensor *_output;
- PriorBoxLayerInfo _info;
-};
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_NEPRIORBOXLAYERKERNEL_H */
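A hedged sketch for the prior box kernel; the PriorBoxLayerInfo arguments (min sizes, variances, offset) and all shapes are assumptions. With one min size and no aspect ratios, num_priors is 1, which fixes the documented output shape [W * H * num_priors * 4, 2].

    // Hypothetical sketch: prior boxes for an 8x8 feature map over a 32x32 image.
    #include "arm_compute/core/NEON/kernels/NEPriorBoxLayerKernel.h" // deleted by this change
    #include "arm_compute/runtime/NEON/NEScheduler.h"
    #include "arm_compute/runtime/Tensor.h"

    using namespace arm_compute;

    void prior_box_example()
    {
        Tensor feat, img, out;
        feat.allocator()->init(TensorInfo(TensorShape(8U, 8U, 16U), 1, DataType::F32));
        img.allocator()->init(TensorInfo(TensorShape(32U, 32U, 3U), 1, DataType::F32));
        out.allocator()->init(TensorInfo(TensorShape(256U, 2U), 1, DataType::F32)); // 8 * 8 * 1 * 4 = 256

        // Assumed PriorBoxLayerInfo arguments: min sizes, variances, offset.
        const PriorBoxLayerInfo info({ 30.f }, { 0.1f, 0.1f, 0.2f, 0.2f }, 0.5f);

        if(NEPriorBoxLayerKernel::validate(feat.info(), img.info(), out.info(), info))
        {
            NEPriorBoxLayerKernel kernel;
            kernel.configure(&feat, &img, &out, info);
            feat.allocator()->allocate();
            img.allocator()->allocate();
            out.allocator()->allocate();
            NEScheduler::get().schedule(&kernel, Window::DimY);
        }
    }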
diff --git a/arm_compute/core/NEON/kernels/NEQLSTMLayerNormalizationKernel.h b/arm_compute/core/NEON/kernels/NEQLSTMLayerNormalizationKernel.h
deleted file mode 100644
index f5e8da7feb..0000000000
--- a/arm_compute/core/NEON/kernels/NEQLSTMLayerNormalizationKernel.h
+++ /dev/null
@@ -1,137 +0,0 @@
-/*
- * Copyright (c) 2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_NEQLSTMLAYERNORMALIZATIONKERNEL_H
-#define ARM_COMPUTE_NEQLSTMLAYERNORMALIZATIONKERNEL_H
-
-#include "arm_compute/core/NEON/INEKernel.h"
-#include <functional>
-
-namespace arm_compute
-{
-class ITensor;
-
-/** NEON kernel to perform layer normalization */
-class NEQLSTMLayerNormalizationKernel : public INEKernel
-{
-public:
- const char *name() const override
- {
- return "NEQLSTMLayerNormalizationKernel";
- }
- /** Default constructor */
- NEQLSTMLayerNormalizationKernel() = default;
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- NEQLSTMLayerNormalizationKernel(const NEQLSTMLayerNormalizationKernel &) = delete;
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- NEQLSTMLayerNormalizationKernel &operator=(const NEQLSTMLayerNormalizationKernel &) = delete;
- /** Default Move Constructor. */
- NEQLSTMLayerNormalizationKernel(NEQLSTMLayerNormalizationKernel &&) = default;
- /** Default move assignment operator */
- NEQLSTMLayerNormalizationKernel &operator=(NEQLSTMLayerNormalizationKernel &&) = default;
- /** Default destructor */
- ~NEQLSTMLayerNormalizationKernel() = default;
-
- /** Set the input and output tensors.
- *
- * @param[in] input Source tensor. Data types supported: QSYMM16.
- * @param[out] output Destination tensor. Data types supported: Same as @p input.
- * @param[in] weight Weight tensor. Data types supported: Same as @p input.
- * @param[in] bias Bias tensor. Data types supported: S32
- */
- void configure(const ITensor *input, ITensor *output, const ITensor *weight, const ITensor *bias);
- /** Static function to check if given info will lead to a valid configuration of @ref NEQLSTMLayerNormalizationKernel
- *
- * @param[in] input Source tensor info. Data types supported: QSYMM16.
- * @param[in] output Destination tensor info. Data types supported: Same as @p input.
- * @param[in] weight Weight tensor info. Data types supported: Same as @p input.
- * @param[in] bias Bias tensor info. Data types supported: S32
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *input, const ITensorInfo *output, const ITensorInfo *weight, const ITensorInfo *bias);
- // Inherited methods overridden:
- void run(const Window &window, const ThreadInfo &info) override;
-
-private:
- // constants
- static constexpr uint32_t max_input_dimension{ 2 }; /**< The maximum input dimension supported */
- static constexpr uint32_t max_weight_dimension{ 1 }; /**< The maximum weight dimension supported */
- static constexpr uint32_t max_bias_dimension{ 1 }; /**< The maximum bias dimension supported */
- static constexpr uint32_t vector_size_byte{ 16 }; /**< Computation vector size in byte */
-
- using ComputeFuncType = std::function<void(NEQLSTMLayerNormalizationKernel &)>;
-
- ComputeFuncType _fn{}; /**< Function pointer to computation function */
-
- const ITensor *_input{ nullptr }; /**< Input tensor */
- const ITensor *_weight{ nullptr }; /**< Weight tensor */
- const ITensor *_bias{ nullptr }; /**< Bias tensor */
- ITensor *_output{ nullptr }; /**< Output tensor */
-
- int32_t _output_multiplier{}; /**< Multiplier for output values */
- int32_t _output_shift{}; /**< Shift value for output values */
-
- int32_t _window_start_x{}; /**< The beginning of x-axis iteration */
- int32_t _window_end_x{}; /**< The end of x-axis iteration */
- int32_t _window_step_x{}; /**< The size of x-axis iteration's step */
-
- Window _inout_window{}; /**< Window for input and output tensor */
- Window _weight_window{}; /**< Window for weight and bias tensor */
-
- /** Function to configure the initial window for the destination of the computation
- *
- * @param[in] target Target tensor to use for the output window
- *
- * @return The configured window
- */
- Window configure_window(ITensor *target);
- // Function to compute for data type QSYMM16
- void compute_qsymm16();
- /** Function to compute the sum and the sum of squares of the values at the given input pointer
- *
- * @param[in] input_ptr Pointer to the input array
- *
- * @return A pair holding the sum and the sum of squares
- */
- std::pair<int64_t, int64_t> sum_qsymm16(const int16_t *input_ptr);
- /** Function to normalize values using computed mean and standard deviation
- *
- * @param[in] input_ptr Pointer to input array
- * @param[in] output_ptr Pointer to output array
- * @param[in] weight_ptr Pointer to weight array
- * @param[in] bias_ptr Pointer to bias array
- * @param[in] mean Mean value
- * @param[in] inv_std_mul Quantized multiplier for standard deviation
- * @param[in] inv_std_shift Shift for standard deviation
- *
- */
- void normalize_qasymm16(const int16_t *input_ptr,
- int16_t *output_ptr,
- const int16_t *weight_ptr,
- const int32_t *bias_ptr,
- int32_t mean, int32_t inv_std_mul, int32_t inv_std_shift);
- /** Function to compute output quantization information */
- QuantizationInfo compute_output_qinfo();
-};
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_NEQLSTMLAYERNORMALIZATIONKERNEL_H */
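The QLSTM layer normalization kernel constrains the input to at most 2 dimensions and the weight/bias to 1, per the constants above. A minimal sketch under those constraints follows; the QSYMM16 scale is illustrative.

    // Hypothetical sketch: QSYMM16 layer normalization with S32 bias.
    #include "arm_compute/core/NEON/kernels/NEQLSTMLayerNormalizationKernel.h" // deleted by this change
    #include "arm_compute/runtime/NEON/NEScheduler.h"
    #include "arm_compute/runtime/Tensor.h"

    using namespace arm_compute;

    void qlstm_layer_norm_example()
    {
        Tensor src, dst, weight, bias;
        const QuantizationInfo qinfo(1.f / 32768.f, 0); // illustrative QSYMM16 scale
        src.allocator()->init(TensorInfo(TensorShape(32U, 2U), 1, DataType::QSYMM16, qinfo));
        dst.allocator()->init(TensorInfo(TensorShape(32U, 2U), 1, DataType::QSYMM16, qinfo));
        weight.allocator()->init(TensorInfo(TensorShape(32U), 1, DataType::QSYMM16, qinfo));
        bias.allocator()->init(TensorInfo(TensorShape(32U), 1, DataType::S32));

        if(NEQLSTMLayerNormalizationKernel::validate(src.info(), dst.info(), weight.info(), bias.info()))
        {
            NEQLSTMLayerNormalizationKernel kernel;
            kernel.configure(&src, &dst, &weight, &bias);
            for(Tensor *t : { &src, &dst, &weight, &bias })
            {
                t->allocator()->allocate();
            }
            NEScheduler::get().schedule(&kernel, Window::DimY);
        }
    }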
diff --git a/arm_compute/core/NEON/kernels/NEQuantizationLayerKernel.h b/arm_compute/core/NEON/kernels/NEQuantizationLayerKernel.h
deleted file mode 100644
index 087e767b73..0000000000
--- a/arm_compute/core/NEON/kernels/NEQuantizationLayerKernel.h
+++ /dev/null
@@ -1,102 +0,0 @@
-/*
- * Copyright (c) 2017-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_NEQUANTIZATIONLAYERKERNEL_H
-#define ARM_COMPUTE_NEQUANTIZATIONLAYERKERNEL_H
-
-#include "arm_compute/core/NEON/INEKernel.h"
-
-namespace arm_compute
-{
-class ITensor;
-
-/** Interface for the quantization layer kernel.
- *
- * @note The implementation supports only 3D input tensors
- *
- */
-class NEQuantizationLayerKernel : public INEKernel
-{
-public:
- const char *name() const override
- {
- return "NEQuantizationLayerKernel";
- }
- /** Default constructor */
- NEQuantizationLayerKernel();
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- NEQuantizationLayerKernel(const NEQuantizationLayerKernel &) = delete;
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- NEQuantizationLayerKernel &operator=(const NEQuantizationLayerKernel &) = delete;
- /** Default Move Constructor. */
- NEQuantizationLayerKernel(NEQuantizationLayerKernel &&) = default;
- /** Default move assignment operator */
- NEQuantizationLayerKernel &operator=(NEQuantizationLayerKernel &&) = default;
- /** Default destructor */
- ~NEQuantizationLayerKernel() = default;
- /** Set the input, output.
- *
- * @param[in] input Source tensor. The dimensions over the third will be interpreted as batches. Data types supported: QASYMM8/QASYMM8_SIGNED/F32/F16.
- * @param[out] output Destination tensor with the same dimensions as input. Data types supported: QASYMM8/QASYMM8_SIGNED/QASYMM16.
- *
- * @note Output auto initialization is not supported by this kernel
- */
- void configure(const ITensor *input, ITensor *output);
- /** Static function to check if given info will lead to a valid configuration of @ref NEQuantizationLayerKernel
- *
- * @param[in] input Input tensor info. Data types supported: QASYMM8/QASYMM8_SIGNED/F32/F16.
- * @param[in] output Output tensor info. Data types supported: QASYMM8/QASYMM8_SIGNED/QASYMM16.
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *input, const ITensorInfo *output);
-
- // Inherited methods overridden:
- void run(const Window &window, const ThreadInfo &info) override;
-
-private:
- /** Common signature for all the specialised @ref NEQuantizationLayerKernel functions
- *
- * @param[in] window Region on which to execute the kernel.
- */
- using QuantizationFunctionExecutorPtr = void (NEQuantizationLayerKernel::*)(const Window &window);
- /** Function to apply QASYMM8 or QASYMM8_SIGNED quantization on a tensor.
- *
- * @param[in] window Region on which to execute the kernel.
- */
- template <typename TIn, typename TOut>
- void run_quantize_qasymm8(const Window &window);
- /** Function to apply QASYMM16 quantization on a tensor.
- *
- * @param[in] window Region on which to execute the kernel.
- */
- template <typename T>
- void run_quantize_qasymm16(const Window &window);
-
- const ITensor *_input;
- ITensor *_output;
-
- QuantizationFunctionExecutorPtr _func;
-};
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_NEQUANTIZATIONLAYERKERNEL_H */
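Because output auto initialization is not supported by this kernel, the destination info, including its QuantizationInfo, must be fully set before configure(). The scale and offset below are illustrative, as are the shapes.

    // Hypothetical sketch: quantize an F32 tensor to QASYMM8.
    #include "arm_compute/core/NEON/kernels/NEQuantizationLayerKernel.h" // deleted by this change
    #include "arm_compute/runtime/NEON/NEScheduler.h"
    #include "arm_compute/runtime/Tensor.h"

    using namespace arm_compute;

    void quantization_example()
    {
        Tensor src, dst;
        src.allocator()->init(TensorInfo(TensorShape(16U, 16U), 1, DataType::F32));
        // The destination carries the target quantization parameters itself.
        dst.allocator()->init(TensorInfo(TensorShape(16U, 16U), 1, DataType::QASYMM8,
                                         QuantizationInfo(0.5f, 10)));

        if(NEQuantizationLayerKernel::validate(src.info(), dst.info()))
        {
            NEQuantizationLayerKernel kernel;
            kernel.configure(&src, &dst);
            src.allocator()->allocate();
            dst.allocator()->allocate();
            NEScheduler::get().schedule(&kernel, Window::DimY);
        }
    }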
diff --git a/arm_compute/core/NEON/kernels/NEROIAlignLayerKernel.h b/arm_compute/core/NEON/kernels/NEROIAlignLayerKernel.h
deleted file mode 100644
index bebcab5359..0000000000
--- a/arm_compute/core/NEON/kernels/NEROIAlignLayerKernel.h
+++ /dev/null
@@ -1,101 +0,0 @@
-/*
- * Copyright (c) 2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_NEROIALIGNLAYERKERNEL_H
-#define ARM_COMPUTE_NEROIALIGNLAYERKERNEL_H
-
-#include "arm_compute/core/NEON/INEKernel.h"
-
-namespace arm_compute
-{
-class ITensor;
-
-/** Interface for the RoIAlign kernel.
- */
-class NEROIAlignLayerKernel : public INEKernel
-{
-public:
- const char *name() const override
- {
- return "NEROIAlignLayerKernel";
- }
-
- /** Constructor */
- NEROIAlignLayerKernel();
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- NEROIAlignLayerKernel(const NEROIAlignLayerKernel &) = delete;
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- NEROIAlignLayerKernel &operator=(const NEROIAlignLayerKernel &) = delete;
- /** Default Move Constructor. */
- NEROIAlignLayerKernel(NEROIAlignLayerKernel &&) = default;
- /** Default move assignment operator. */
- NEROIAlignLayerKernel &operator=(NEROIAlignLayerKernel &&) = default;
- /** Default destructor */
- ~NEROIAlignLayerKernel() = default;
-
- /** Set the input and output tensors.
- *
- * @param[in] input Source tensor. Data types supported: QASYMM8/F16/F32.
- * @param[in]  rois      ROIs tensor, a 2D tensor of size [5, N] (where N is the number of ROIs) holding each ROI's batch index and its
- *                       top-left and bottom-right corner coordinates in the image, as [ batch_id, x1, y1, x2, y2 ].
- * Data types supported: QASYMM16 with scale of 0.125 and 0 offset if @p input is QASYMM8, otherwise same as @p input
- * @param[out] output Destination tensor. Data types supported: Same as @p input.
- * @param[in] pool_info Contains pooling operation information described in @ref ROIPoolingLayerInfo.
- *
- * @note The x and y dimensions of @p output tensor must be the same as @p pool_info 's pooled
- * width and pooled height.
- * @note The z dimensions of @p output tensor and @p input tensor must be the same.
- * @note The fourth dimension of @p output tensor must be the same as the number of elements in @p rois array.
- */
- void configure(const ITensor *input, const ITensor *rois, ITensor *output, const ROIPoolingLayerInfo &pool_info);
- /** Static function to check if given info will lead to a valid configuration of @ref NEROIAlignLayerKernel
- *
- * @param[in] input Source tensor info. Data types supported: QASYMM8/F16/F32.
- * @param[in] rois ROIs tensor info. Data types supported: QASYMM16 with scale of 0.125 and 0 offset if @p input is QASYMM8,
- * otherwise same as @p input
- * @param[in] output Destination tensor info. Data types supported: Same as @p input.
- * @param[in] pool_info Contains pooling operation information described in @ref ROIPoolingLayerInfo.
- *
- * @note The x and y dimensions of @p output tensor must be the same as @p pool_info 's pooled
- * width and pooled height.
- * @note The z dimensions of @p output tensor and @p input tensor must be the same.
- * @note The fourth dimension of @p output tensor must be the same as the number of elements in @p rois array.
- *
- * @return a Status
- */
- static Status validate(const ITensorInfo *input, const ITensorInfo *rois, ITensorInfo *output, const ROIPoolingLayerInfo &pool_info);
-
- // Inherited methods overridden:
- void run(const Window &window, const ThreadInfo &info) override;
-
-private:
- template <DataLayout data_layout, typename input_data_type, typename roi_data_type = input_data_type>
- void internal_run(const Window &window, const ThreadInfo &info);
-
- const ITensor *_input;
- ITensor *_output;
- const ITensor *_rois;
- ROIPoolingLayerInfo _pool_info;
-};
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_NEROIALIGNLAYERKERNEL_H*/
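A sketch of aligning four ROIs to 7x7 output tiles; the ROIPoolingLayerInfo constructor (pooled width, pooled height, spatial scale) is assumed for this version, and the ROI tensor follows the documented [ batch_id, x1, y1, x2, y2 ] layout with the same data type as the F32 input.

    // Hypothetical sketch: ROI Align of 4 ROIs to 7x7 on an F32 feature map.
    #include "arm_compute/core/NEON/kernels/NEROIAlignLayerKernel.h" // deleted by this change
    #include "arm_compute/runtime/NEON/NEScheduler.h"
    #include "arm_compute/runtime/Tensor.h"

    using namespace arm_compute;

    void roi_align_example()
    {
        Tensor feat, rois, out;
        feat.allocator()->init(TensorInfo(TensorShape(16U, 16U, 8U), 1, DataType::F32));
        rois.allocator()->init(TensorInfo(TensorShape(5U, 4U), 1, DataType::F32)); // 4 ROIs
        // Output: [pooled_w, pooled_h, channels, num_rois], matching the notes above.
        out.allocator()->init(TensorInfo(TensorShape(7U, 7U, 8U, 4U), 1, DataType::F32));

        const ROIPoolingLayerInfo pool_info(7U, 7U, 1.f); // assumed ctor: w, h, spatial scale

        if(NEROIAlignLayerKernel::validate(feat.info(), rois.info(), out.info(), pool_info))
        {
            NEROIAlignLayerKernel kernel;
            kernel.configure(&feat, &rois, &out, pool_info);
            feat.allocator()->allocate();
            rois.allocator()->allocate();
            out.allocator()->allocate();
            NEScheduler::get().schedule(&kernel, Window::DimY);
        }
    }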
diff --git a/arm_compute/core/NEON/kernels/NEROIPoolingLayerKernel.h b/arm_compute/core/NEON/kernels/NEROIPoolingLayerKernel.h
deleted file mode 100644
index 59a5017711..0000000000
--- a/arm_compute/core/NEON/kernels/NEROIPoolingLayerKernel.h
+++ /dev/null
@@ -1,81 +0,0 @@
-/*
- * Copyright (c) 2017-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_NEROIPOOLINGLAYERKERNEL_H
-#define ARM_COMPUTE_NEROIPOOLINGLAYERKERNEL_H
-
-#include "arm_compute/core/NEON/INEKernel.h"
-
-#include "arm_compute/core/IArray.h"
-
-namespace arm_compute
-{
-class ITensor;
-
-/** Interface for the ROI pooling layer kernel */
-class NEROIPoolingLayerKernel : public INEKernel
-{
-public:
- const char *name() const override
- {
- return "NEROIPoolingLayerKernel";
- }
- /** Default constructor */
- NEROIPoolingLayerKernel();
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- NEROIPoolingLayerKernel(const NEROIPoolingLayerKernel &) = delete;
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- NEROIPoolingLayerKernel &operator=(const NEROIPoolingLayerKernel &) = delete;
- /** Allow instances of this class to be moved */
- NEROIPoolingLayerKernel(NEROIPoolingLayerKernel &&) = default;
- /** Allow instances of this class to be moved */
- NEROIPoolingLayerKernel &operator=(NEROIPoolingLayerKernel &&) = default;
- /** Default destructor */
- ~NEROIPoolingLayerKernel() = default;
-
- /** Set the input and output tensors.
- *
- * @param[in] input Source tensor. Data types supported: F32.
- * @param[in]  rois      ROIs tensor, a 2D tensor of size [5, N] (where N is the number of ROIs) holding each ROI's batch index and its
- *                       top-left and bottom-right corner coordinates in the image, as [ batch_id, x1, y1, x2, y2 ]. Data types supported: U16
- * @param[out] output Destination tensor. Data types supported: Same as @p input.
- * @param[in] pool_info Contains pooling operation information described in @ref ROIPoolingLayerInfo.
- *
- * @note The x and y dimensions of @p output tensor must be the same as that specified by @p pool_info 's pooled
- * width and pooled height.
- * @note The z dimensions of @p output tensor and @p input tensor must be the same.
- * @note The fourth dimension of @p output tensor must be the same as the number of elements in @p rois tensor.
- */
- void configure(const ITensor *input, const ITensor *rois, ITensor *output, const ROIPoolingLayerInfo &pool_info);
-
- // Inherited methods overridden:
- void run(const Window &window, const ThreadInfo &info) override;
-
-private:
- const ITensor *_input;
- const ITensor *_rois;
- ITensor *_output;
- ROIPoolingLayerInfo _pool_info;
-};
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_NEROIPOOLINGLAYERKERNEL_H */
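Unlike its siblings, this kernel exposes no static validate(), so the sketch goes straight to configure(). The U16 ROI layout follows the documentation above; shapes and the scheduler split are assumptions.

    // Hypothetical sketch: ROI pooling of 4 U16-encoded ROIs on an F32 feature map.
    #include "arm_compute/core/NEON/kernels/NEROIPoolingLayerKernel.h" // deleted by this change
    #include "arm_compute/runtime/NEON/NEScheduler.h"
    #include "arm_compute/runtime/Tensor.h"

    using namespace arm_compute;

    void roi_pooling_example()
    {
        Tensor feat, rois, out;
        feat.allocator()->init(TensorInfo(TensorShape(16U, 16U, 8U), 1, DataType::F32));
        rois.allocator()->init(TensorInfo(TensorShape(5U, 4U), 1, DataType::U16));
        out.allocator()->init(TensorInfo(TensorShape(7U, 7U, 8U, 4U), 1, DataType::F32));

        NEROIPoolingLayerKernel kernel;
        kernel.configure(&feat, &rois, &out, ROIPoolingLayerInfo(7U, 7U, 1.f)); // assumed ctor
        feat.allocator()->allocate();
        rois.allocator()->allocate();
        out.allocator()->allocate();
        NEScheduler::get().schedule(&kernel, Window::DimX); // split dimension assumed
    }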
diff --git a/arm_compute/core/NEON/kernels/NERangeKernel.h b/arm_compute/core/NEON/kernels/NERangeKernel.h
deleted file mode 100644
index e67a5dc945..0000000000
--- a/arm_compute/core/NEON/kernels/NERangeKernel.h
+++ /dev/null
@@ -1,90 +0,0 @@
-/*
- * Copyright (c) 2018-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_NERANGEKERNEL_H
-#define ARM_COMPUTE_NERANGEKERNEL_H
-
-#include "arm_compute/core/NEON/INEKernel.h"
-#include "arm_compute/core/Types.h"
-
-namespace arm_compute
-{
-class ITensor;
-
-/** Kernel class for Range
- *
- * Range generates a 1-D tensor containing a sequence of numbers that begins at 'start' and extends by increments
- * of 'step' up to but not including 'end'.
- */
-class NERangeKernel : public INEKernel
-{
-public:
- const char *name() const override
- {
- return "NERangeKernel";
- }
- /** Default constructor */
- NERangeKernel();
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- NERangeKernel(const NERangeKernel &) = delete;
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- NERangeKernel &operator=(const NERangeKernel &) = delete;
- /** Allow instances of this class to be moved */
- NERangeKernel(NERangeKernel &&) = default;
- /** Allow instances of this class to be moved */
- NERangeKernel &operator=(NERangeKernel &&) = default;
- /** Default destructor */
- ~NERangeKernel() = default;
- /** Initialize the kernel's output tensor, start, end and step of the sequence.
- *
- * @param[out] output Output tensor. Data types supported: U8/S8/U16/S16/U32/S32/F16/F32.
- * @param[in] start The starting value of the sequence.
- * @param[in]  end    The ending value of the sequence (exclusive).
- * @param[in] step The gap between each pair of values in the sequence.
- */
- void configure(ITensor *output, float start, float end, float step);
- /** Static function to check if given info will lead to a valid configuration of @ref NERangeKernel
- *
- * @param[in] output Output tensor info. Data types supported: U8/S8/U16/S16/U32/S32/F16/F32.
- * @param[in] start The starting value of the sequence.
- * @param[in]  end    The ending value of the sequence (exclusive).
- * @param[in] step The gap between each pair of values in the sequence.
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *output, float start, float end, float step);
-
- // Inherited methods overridden:
- void run(const Window &window, const ThreadInfo &info) override;
-
-private:
- using RangeFunction = void(ITensor *output, float start, float step, const Window &window);
-
- RangeFunction *_func; /**< Range function to be called */
- float _start; /**< Start of sequence */
- float _end; /**< End of sequence */
- float _step; /**< Increment/step value */
- ITensor *_output; /**< Destination tensor */
-};
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_NERANGEKERNEL_H */
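Since the output must be sized by the caller, its length has to match ceil((end - start) / step); for start 0, end 10, step 2 that is 5 elements, as in this hedged sketch.

    // Hypothetical sketch: generate the sequence [0, 2, 4, 6, 8] into a 1-D F32 tensor.
    #include "arm_compute/core/NEON/kernels/NERangeKernel.h" // deleted by this change
    #include "arm_compute/runtime/NEON/NEScheduler.h"
    #include "arm_compute/runtime/Tensor.h"

    using namespace arm_compute;

    void range_example()
    {
        Tensor dst;
        dst.allocator()->init(TensorInfo(TensorShape(5U), 1, DataType::F32)); // ceil((10 - 0) / 2) = 5

        if(NERangeKernel::validate(dst.info(), 0.f, 10.f, 2.f))
        {
            NERangeKernel kernel;
            kernel.configure(&dst, 0.f, 10.f, 2.f);
            dst.allocator()->allocate();
            NEScheduler::get().schedule(&kernel, Window::DimX); // 1-D output, split along x
        }
    }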
diff --git a/arm_compute/core/NEON/kernels/NEReductionOperationKernel.h b/arm_compute/core/NEON/kernels/NEReductionOperationKernel.h
deleted file mode 100644
index 28cca4987b..0000000000
--- a/arm_compute/core/NEON/kernels/NEReductionOperationKernel.h
+++ /dev/null
@@ -1,94 +0,0 @@
-/*
- * Copyright (c) 2017-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_NEREDUCTIONOPERATIONKERNEL_H
-#define ARM_COMPUTE_NEREDUCTIONOPERATIONKERNEL_H
-
-#include "arm_compute/core/NEON/INEKernel.h"
-
-namespace arm_compute
-{
-class ITensor;
-
-/** NEON kernel to perform a reduction operation
- *
- * @note For ARG_MIN/ARG_MAX reduction, the default data type for an uninitialized
- * output tensor is signed 32-bit integer (S32). It is the user's responsibility
- * to check that the results do not overflow because the indices are computed
- * in unsigned 32-bit (U32).
- */
-class NEReductionOperationKernel : public INEKernel
-{
-public:
- const char *name() const override
- {
- return "NEReductionOperationKernel";
- }
- /** Default constructor */
- NEReductionOperationKernel();
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- NEReductionOperationKernel(const NEReductionOperationKernel &) = delete;
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- NEReductionOperationKernel &operator=(const NEReductionOperationKernel &) = delete;
- /** Allow instances of this class to be moved */
- NEReductionOperationKernel(NEReductionOperationKernel &&) = default;
- /** Allow instances of this class to be moved */
- NEReductionOperationKernel &operator=(NEReductionOperationKernel &&) = default;
- /** Default destructor */
- ~NEReductionOperationKernel() = default;
-
- /** Set the source, destination of the kernel
- *
- * @param[in] input Source tensor. Data type supported: QASYMM8_SIGNED/QASYMM8/F16/F32. Data layouts supported: NCHW.
- * @param[out] output Destination tensor. Data types and data layouts supported: same as @p input, S32 for ARG_MIN/ARG_MAX.
- * Output will have the same number of dimensions as input.
- * @param[in] axis Axis along which to reduce. Supported reduction axis: 0
- * @param[in] op Reduction operation to perform.
- */
- void configure(const ITensor *input, ITensor *output, unsigned int axis, ReductionOperation op);
-
- /** Static function to check if given info will lead to a valid configuration of @ref NEReductionOperationKernel.
- *
- * @param[in] input Source tensor info. Data type supported: QASYMM8_SIGNED/QASYMM8/F16/F32. Data layouts supported: NCHW.
- * @param[in] output Destination tensor info. Data types and data layouts supported: same as @p input, S32 for ARG_MIN/ARG_MAX.
- * Output will have the same number of dimensions as input.
- * @param[in] axis Axis along which to reduce. Supported reduction axis: 0
- * @param[in] op Reduction operation to perform.
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *input, const ITensorInfo *output, unsigned int axis, ReductionOperation op);
-
- // Inherited methods overridden:
- void run(const Window &window, const ThreadInfo &info) override;
- BorderSize border_size() const override;
-
-private:
- const ITensor *_input;
- ITensor *_output;
- unsigned int _reduction_axis;
- ReductionOperation _op;
- BorderSize _border_size;
-};
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_NEREDUCTIONOPERATIONKERNEL_H */
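A sketch of how the removed NEReductionOperationKernel was driven, following the validate/configure/run pattern documented above (names and shapes are illustrative):

    #include "arm_compute/core/Error.h"
    #include "arm_compute/core/NEON/kernels/NEReductionOperationKernel.h"
    #include "arm_compute/runtime/NEON/NEScheduler.h"
    #include "arm_compute/runtime/Tensor.h"

    using namespace arm_compute;

    void reduction_example()
    {
        // Reduce 16 values along axis 0 for each of the 4 rows
        Tensor src, dst;
        src.allocator()->init(TensorInfo(TensorShape(16U, 4U), 1, DataType::F32));
        dst.allocator()->init(TensorInfo(TensorShape(1U, 4U), 1, DataType::F32));

        NEReductionOperationKernel reduction;
        ARM_COMPUTE_ERROR_THROW_ON(
            NEReductionOperationKernel::validate(src.info(), dst.info(), 0U, ReductionOperation::SUM));
        reduction.configure(&src, &dst, 0U /* axis */, ReductionOperation::SUM);

        src.allocator()->allocate();
        dst.allocator()->allocate();
        NEScheduler::get().schedule(&reduction, Window::DimY);
    }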
diff --git a/arm_compute/core/NEON/kernels/NERemapKernel.h b/arm_compute/core/NEON/kernels/NERemapKernel.h
deleted file mode 100644
index e929b1c5d4..0000000000
--- a/arm_compute/core/NEON/kernels/NERemapKernel.h
+++ /dev/null
@@ -1,83 +0,0 @@
-/*
- * Copyright (c) 2016-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_NEREMAPKERNEL_H
-#define ARM_COMPUTE_NEREMAPKERNEL_H
-
-#include "arm_compute/core/NEON/INEKernel.h"
-#include "arm_compute/core/Types.h"
-
-namespace arm_compute
-{
-class ITensor;
-
-/** NEON kernel to perform a remap on a tensor */
-class NERemapKernel : public INEKernel
-{
-public:
- const char *name() const override
- {
- return "NERemapKernel";
- }
- /** Default constructor */
- NERemapKernel();
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- NERemapKernel(const NERemapKernel &) = delete;
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- NERemapKernel &operator=(const NERemapKernel &) = delete;
- /** Allow instances of this class to be moved */
- NERemapKernel(NERemapKernel &&) = default;
- /** Allow instances of this class to be moved */
- NERemapKernel &operator=(NERemapKernel &&) = default;
- /** Default destructor */
- ~NERemapKernel() = default;
-
- /** Initialize the kernel's input, output and border mode.
- *
- * @param[in] input Source tensor. Data type supported: U8.
- * @param[in] map_x Map for X coordinates. Data type supported: F32.
- * @param[in] map_y Map for Y coordinates. Data type supported: F32.
- * @param[out] output Destination tensor. Data types supported: U8. All but the lowest two dimensions must be the same size as in the input tensor, i.e. remapping is only performed within the XY-plane.
- * @param[in] policy The interpolation type.
- */
- void configure(const ITensor *input, const ITensor *map_x, const ITensor *map_y, ITensor *output, InterpolationPolicy policy);
-
- // Inherited methods overridden:
- void run(const Window &window, const ThreadInfo &info) override;
- BorderSize border_size() const override;
-
-private:
- /** function to perform nearest interpolation on the given window */
- void remap_nearest(const Window &window);
- /** function to perform bilinear interpolation on the given window */
- void remap_bilinear(const Window &window);
- /** Remap function to use for the particular interpolation type passed to configure() */
- void (NERemapKernel::*_func)(const Window &window);
-
- const ITensor *_input; /**< Input image */
- ITensor *_output; /**< Output image */
- const ITensor *_map_x; /**< Input remap x coordinates */
- const ITensor *_map_y; /**< Input remap y coordinates */
-};
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_NEREMAPKERNEL_H */
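The remap kernel takes per-pixel coordinate maps; a wiring sketch against the removed header (sizes are illustrative, border filling elided):

    #include "arm_compute/core/NEON/kernels/NERemapKernel.h"
    #include "arm_compute/runtime/NEON/NEScheduler.h"
    #include "arm_compute/runtime/Tensor.h"

    using namespace arm_compute;

    void remap_example()
    {
        Tensor src, map_x, map_y, dst;
        src.allocator()->init(TensorInfo(TensorShape(64U, 64U), 1, DataType::U8));
        map_x.allocator()->init(TensorInfo(TensorShape(64U, 64U), 1, DataType::F32));
        map_y.allocator()->init(TensorInfo(TensorShape(64U, 64U), 1, DataType::F32));
        dst.allocator()->init(TensorInfo(TensorShape(64U, 64U), 1, DataType::U8));

        NERemapKernel remap;
        remap.configure(&src, &map_x, &map_y, &dst, InterpolationPolicy::NEAREST_NEIGHBOR);

        // ... allocate the tensors and fill src/map_x/map_y, then:
        NEScheduler::get().schedule(&remap, Window::DimY);
    }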
diff --git a/arm_compute/core/NEON/kernels/NEReorgLayerKernel.h b/arm_compute/core/NEON/kernels/NEReorgLayerKernel.h
deleted file mode 100644
index 9277ddbe47..0000000000
--- a/arm_compute/core/NEON/kernels/NEReorgLayerKernel.h
+++ /dev/null
@@ -1,83 +0,0 @@
-/*
- * Copyright (c) 2018-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_NEREORGLAYERKERNEL_H
-#define ARM_COMPUTE_NEREORGLAYERKERNEL_H
-
-#include "arm_compute/core/NEON/INEKernel.h"
-
-namespace arm_compute
-{
-// Forward declarations
-class ITensor;
-
-/** Interface for the kernel to perform tensor re-organization */
-class NEReorgLayerKernel : public INEKernel
-{
-public:
- const char *name() const override
- {
- return "NEReorgLayerKernel";
- }
- /** Default constructor */
- NEReorgLayerKernel();
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- NEReorgLayerKernel(const NEReorgLayerKernel &) = delete;
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- NEReorgLayerKernel &operator=(const NEReorgLayerKernel &) = delete;
- /** Default Move Constructor. */
- NEReorgLayerKernel(NEReorgLayerKernel &&) = default;
- /** Default move assignment operator */
- NEReorgLayerKernel &operator=(NEReorgLayerKernel &&) = default;
- /** Default destructor */
- ~NEReorgLayerKernel() = default;
- /** Set the input and output of the kernel
- *
- * @param[in] input Source tensor. Data type supported: All
- * @param[out] output Destination tensor. Data type supported: Same as @p input
- * @param[in] stride Stride to be used during data re-organization.
- * It defines the spatial distance between 2 consecutive pixels in the x and y direction
- */
- void configure(const ITensor *input, ITensor *output, int32_t stride);
-
- /** Static function to check if given info will lead to a valid configuration of @ref NEReorgLayerKernel
- *
- * @param[in] input Source tensor info. Data type supported: All
- * @param[in] output Destination tensor info. Data type supported: Same as @p input
- * @param[in] stride Stride to be used during data re-organization
- * It defines the spatial distance between 2 consecutive pixels in the x and y direction
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *input, const ITensorInfo *output, int32_t stride);
-
- // Inherited methods overridden:
- void run(const Window &window, const ThreadInfo &info) override;
-
-private:
- const ITensor *_input;
- ITensor *_output;
- int32_t _stride;
-};
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_NEREORGLAYERKERNEL_H */
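The stride fixes the output shape: each stride x stride spatial block is folded into channels, so WxHxC becomes (W/s)x(H/s)x(C*s*s). A sketch with s = 2 (shapes are illustrative):

    #include "arm_compute/core/Error.h"
    #include "arm_compute/core/NEON/kernels/NEReorgLayerKernel.h"
    #include "arm_compute/runtime/NEON/NEScheduler.h"
    #include "arm_compute/runtime/Tensor.h"

    using namespace arm_compute;

    void reorg_example()
    {
        // 8x8x4 input, stride 2 -> 4x4x16 output
        Tensor src, dst;
        src.allocator()->init(TensorInfo(TensorShape(8U, 8U, 4U), 1, DataType::F32));
        dst.allocator()->init(TensorInfo(TensorShape(4U, 4U, 16U), 1, DataType::F32));

        NEReorgLayerKernel reorg;
        ARM_COMPUTE_ERROR_THROW_ON(NEReorgLayerKernel::validate(src.info(), dst.info(), 2));
        reorg.configure(&src, &dst, 2);

        src.allocator()->allocate();
        dst.allocator()->allocate();
        NEScheduler::get().schedule(&reorg, Window::DimY);
    }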
diff --git a/arm_compute/core/NEON/kernels/NEReshapeLayerKernel.h b/arm_compute/core/NEON/kernels/NEReshapeLayerKernel.h
deleted file mode 100644
index fccf2685a8..0000000000
--- a/arm_compute/core/NEON/kernels/NEReshapeLayerKernel.h
+++ /dev/null
@@ -1,62 +0,0 @@
-/*
- * Copyright (c) 2017-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_NERESHAPELAYERKERNEL_H
-#define ARM_COMPUTE_NERESHAPELAYERKERNEL_H
-
-#include "arm_compute/core/NEON/INESimpleKernel.h"
-
-namespace arm_compute
-{
-// Forward declarations
-class ITensor;
-
-/** Interface for the kernel to perform tensor reshaping */
-class NEReshapeLayerKernel : public INESimpleKernel
-{
-public:
- const char *name() const override
- {
- return "NEReshapeLayerKernel";
- }
- /** Set the input and output of the kernel
- *
- * @param[in] input Source tensor. Data type supported: All
- * @param[out] output Destination tensor. Data type supported: Same as @p input
- */
- void configure(const ITensor *input, ITensor *output);
-
- /** Static function to check if given info will lead to a valid configuration of @ref NEReshapeLayerKernel
- *
- * @param[in] input Source tensor info. Data type supported: All
- * @param[in] output Destination tensor info. Data type supported: Same as @p input
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *input, const ITensorInfo *output);
-
- // Inherited methods overridden:
- void run(const Window &window, const ThreadInfo &info) override;
-};
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_NERESHAPELAYERKERNEL_H */
diff --git a/arm_compute/core/NEON/kernels/NEReverseKernel.h b/arm_compute/core/NEON/kernels/NEReverseKernel.h
deleted file mode 100644
index 516653b70d..0000000000
--- a/arm_compute/core/NEON/kernels/NEReverseKernel.h
+++ /dev/null
@@ -1,80 +0,0 @@
-/*
- * Copyright (c) 2018-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_NEREVERSEKERNEL_H
-#define ARM_COMPUTE_NEREVERSEKERNEL_H
-
-#include "arm_compute/core/NEON/INEKernel.h"
-
-namespace arm_compute
-{
-class ITensor;
-
-/** Interface for the reverse layer kernel. */
-class NEReverseKernel : public INEKernel
-{
-public:
- const char *name() const override
- {
- return "NEReverseKernel";
- }
- /** Default constructor */
- NEReverseKernel();
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- NEReverseKernel(const NEReverseKernel &) = delete;
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- NEReverseKernel &operator=(const NEReverseKernel &) = delete;
- /** Allow instances of this class to be moved */
- NEReverseKernel(NEReverseKernel &&) = default;
- /** Allow instances of this class to be moved */
- NEReverseKernel &operator=(NEReverseKernel &&) = default;
- /** Default destructor */
- ~NEReverseKernel() = default;
- /** Initialise the kernel's inputs and output
- *
- * @param[in] input Input tensor. Data types supported: All
- * @param[out] output Output tensor. Data type supported: Same as @p input
- * @param[in] axis Axis tensor. Contains the indices of the dimensions to reverse. Data type supported: U32
- */
- void configure(const ITensor *input, ITensor *output, const ITensor *axis);
-
- /** Static function to check if given info will lead to a valid configuration of @ref NEReverseKernel
- *
- * @param[in] input Input tensor info. Data types supported: All
- * @param[in] output Output tensor info. Data type supported: Same as @p input
- * @param[in] axis Axis tensor info. Contains the indices of the dimensions to reverse. Data type supported: U32
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *input, const ITensorInfo *output, const ITensorInfo *axis);
-
- // Inherited methods overridden:
- void run(const Window &window, const ThreadInfo &info) override;
-
-private:
- const ITensor *_input;
- ITensor *_output;
- const ITensor *_axis;
-};
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_NEREVERSEKERNEL_H */
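Note that the axis argument is itself a small U32 tensor of dimension indices rather than a scalar; a sketch of the removed interface (names are illustrative):

    #include "arm_compute/core/NEON/kernels/NEReverseKernel.h"
    #include "arm_compute/runtime/NEON/NEScheduler.h"
    #include "arm_compute/runtime/Tensor.h"

    using namespace arm_compute;

    void reverse_example()
    {
        Tensor src, dst, axis;
        src.allocator()->init(TensorInfo(TensorShape(8U, 4U), 1, DataType::F32));
        dst.allocator()->init(TensorInfo(TensorShape(8U, 4U), 1, DataType::F32));
        axis.allocator()->init(TensorInfo(TensorShape(1U), 1, DataType::U32)); // one dimension to reverse

        NEReverseKernel reverse;
        reverse.configure(&src, &dst, &axis);

        // ... allocate the tensors and write the dimension index (e.g. 0) into axis, then:
        NEScheduler::get().schedule(&reverse, Window::DimY);
    }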
diff --git a/arm_compute/core/NEON/kernels/NEScaleKernel.h b/arm_compute/core/NEON/kernels/NEScaleKernel.h
deleted file mode 100644
index 9bc04129e0..0000000000
--- a/arm_compute/core/NEON/kernels/NEScaleKernel.h
+++ /dev/null
@@ -1,117 +0,0 @@
-/*
- * Copyright (c) 2016-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_NESCALEKERNEL_H
-#define ARM_COMPUTE_NESCALEKERNEL_H
-
-#include "arm_compute/core/KernelDescriptors.h"
-#include "arm_compute/core/NEON/INEKernel.h"
-
-namespace arm_compute
-{
-class ITensor;
-
-/** NEON kernel to perform scaling on a tensor */
-class NEScaleKernel : public INEKernel
-{
-public:
- const char *name() const override
- {
- return "NEScaleKernel";
- }
- /** Default constructor */
- NEScaleKernel();
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- NEScaleKernel(const NEScaleKernel &) = delete;
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- NEScaleKernel &operator=(const NEScaleKernel &) = delete;
- /** Allow instances of this class to be moved */
- NEScaleKernel(NEScaleKernel &&) = default;
- /** Allow instances of this class to be moved */
- NEScaleKernel &operator=(NEScaleKernel &&) = default;
- /** Default destructor */
- ~NEScaleKernel() = default;
-
- /** Initialise the kernel's inputs, output and interpolation policy
- *
- * @note dx, dy and offsets have the same dimensions (width and height) as the output tensor
- * @note The Area interpolation policy is only supported for data layout NCHW and input data type U8.
- *
- * @param[in] input Source tensor. Data types supported: QASYMM8/QASYMM8_SIGNED/U8/S16/F16/F32.
- * @param[in] dx Pixel's distance between the X real coordinate and the smallest X following integer. Data type supported: F32
- * @param[in] dy Pixel's distance between the Y real coordinate and the smallest Y following integer. Data type supported: F32
- * @param[in] offsets Offset to access the pixel with NEAREST interpolation or the top-left pixel with BILINEAR interpolation in the input tensor. Data type supported: S32.
- * @param[out] output Destination tensor. Data types supported: Same as @p input. All but the lowest two dimensions must be the same size as in the input tensor, i.e. scaling is only performed within the XY-plane.
- * @param[in] info @ref ScaleKernelInfo to use for configuration
- */
- void configure(const ITensor *input, const ITensor *dx, const ITensor *dy, const ITensor *offsets, ITensor *output,
- const ScaleKernelInfo &info);
- /** Static function to check if given info will lead to a valid configuration of @ref NEScaleKernel
- *
- * @note dx, dy and offsets have the same dimensions (width and height) as the output tensor
- * @note The Area interpolation policy is only supported for data layout NCHW and input data type U8.
- *
- * @param[in] input Source tensor. Data types supported: QASYMM8/QASYMM8_SIGNED/U8/S16/F16/F32.
- * @param[in] dx Pixel's distance between the X real coordinate and the smallest X following integer. Data type supported: F32
- * @param[in] dy Pixel's distance between the Y real coordinate and the smallest Y following integer. Data type supported: F32
- * @param[in] offsets Offset to access the pixel with NEAREST interpolation or the top-left pixel with BILINEAR interpolation in the input tensor. Data type supported: S32.
- * @param[in] output Destination tensor. Data types supported: Same as @p input. All but the lowest two dimensions must be the same size as in the input tensor, i.e. scaling is only performed within the XY-plane.
- * @param[in] info @ref ScaleKernelInfo to use for validation
- */
- static Status validate(const ITensorInfo *input, const ITensorInfo *dx, const ITensorInfo *dy, const ITensorInfo *offsets, ITensorInfo *output,
- const ScaleKernelInfo &info);
-
- // Inherited methods overridden:
- void run(const Window &window, const ThreadInfo &info) override;
- BorderSize border_size() const override;
-
-private:
- /** function to perform scale using nearest interpolation on the given window */
- void scale_nearest_nchw(const Window &window);
- /** function to perform scale using bilinear interpolation on the given window */
- void scale_bilinear_nchw(const Window &window);
- /** function to perform scale using area interpolation on the given window
- *
- * @note Used only in the case of down-sampling.
- */
- void scale_area_nchw(const Window &window);
- /** function to perform scale on the given window */
- void scale_nhwc(const Window &window);
- /** Scale function to use for the particular interpolation type passed to configure() */
- void (NEScaleKernel::*_func)(const Window &window);
-
- const ITensor *_offsets;
- const ITensor *_dx;
- const ITensor *_dy;
- const ITensor *_input;
- ITensor *_output;
- InterpolationPolicy _policy;
- BorderSize _border_size;
- BorderMode _border_mode;
- PixelValue _constant_border_value;
- float _sampling_offset;
- bool _use_padding;
- bool _align_corners;
-};
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_NESCALEKERNEL_H */
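The dx/dy/offsets tensors are precomputed on the host before the kernel runs (the runtime-level scale function handled that), so a direct-use sketch only wires up already-prepared tensors; the ScaleKernelInfo construction below assumes the descriptor's (policy, border mode) constructor at this revision:

    #include "arm_compute/core/KernelDescriptors.h"
    #include "arm_compute/core/NEON/kernels/NEScaleKernel.h"
    #include "arm_compute/runtime/NEON/NEScheduler.h"
    #include "arm_compute/runtime/Tensor.h"

    using namespace arm_compute;

    void scale_example(const Tensor &src, const Tensor &offsets, Tensor &dst)
    {
        // For NEAREST interpolation only offsets (S32, sized like dst) is used;
        // dx/dy (F32) are additionally needed for BILINEAR.
        NEScaleKernel scale;
        scale.configure(&src, nullptr /* dx */, nullptr /* dy */, &offsets, &dst,
                        ScaleKernelInfo{ InterpolationPolicy::NEAREST_NEIGHBOR, BorderMode::REPLICATE });
        NEScheduler::get().schedule(&scale, Window::DimY);
    }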
diff --git a/arm_compute/core/NEON/kernels/NEScharr3x3Kernel.h b/arm_compute/core/NEON/kernels/NEScharr3x3Kernel.h
deleted file mode 100644
index 320b44d307..0000000000
--- a/arm_compute/core/NEON/kernels/NEScharr3x3Kernel.h
+++ /dev/null
@@ -1,86 +0,0 @@
-/*
- * Copyright (c) 2016-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_NESCHARR3x3KERNEL_H
-#define ARM_COMPUTE_NESCHARR3x3KERNEL_H
-
-#include "arm_compute/core/NEON/INEKernel.h"
-
-namespace arm_compute
-{
-class ITensor;
-
-/** Interface for the kernel to run a 3x3 Scharr filter on a tensor.
- *
- * @f[
- * \mathbf{G}_x=\begin{vmatrix}
- * -3 & 0 & +3\\
- * -10 & 0 & +10\\
- * -3 & 0 & +3
- * \end{vmatrix}
- * @f]
- */
-class NEScharr3x3Kernel : public INEKernel
-{
-public:
- const char *name() const override
- {
- return "NEScharr3x3Kernel";
- }
- /** Default constructor */
- NEScharr3x3Kernel();
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- NEScharr3x3Kernel(const NEScharr3x3Kernel &) = delete;
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- NEScharr3x3Kernel &operator=(const NEScharr3x3Kernel &) = delete;
- /** Allow instances of this class to be moved */
- NEScharr3x3Kernel(NEScharr3x3Kernel &&) = default;
- /** Allow instances of this class to be moved */
- NEScharr3x3Kernel &operator=(NEScharr3x3Kernel &&) = default;
- /** Default destructor */
- ~NEScharr3x3Kernel() = default;
-
- /** Initialise the kernel's source, destination and border.
- *
- * @note At least one of output_x or output_y must be set.
- *
- * @param[in] input Source tensor. Data type supported: U8.
- * @param[out] output_x (Optional) Destination tensor for the X gradient. Data type supported: S16.
- * @param[out] output_y (Optional) Destination tensor for the Y gradient. Data type supported: S16.
- * @param[in] border_undefined True if the border mode is undefined. False if it's replicate or constant.
- */
- void configure(const ITensor *input, ITensor *output_x, ITensor *output_y, bool border_undefined);
-
- // Inherited methods overridden:
- void run(const Window &window, const ThreadInfo &info) override;
- BorderSize border_size() const override;
-
-private:
- bool _run_scharr_x; /**< Do we need to run Scharr X ? */
- bool _run_scharr_y; /**< Do we need to run Scharr Y ? */
- const ITensor *_input; /**< Input tensor */
- ITensor *_output_x; /**< Output tensor for scharr X */
- ITensor *_output_y; /**< Output tensor for scharr Y */
-};
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_NESCHARR3x3KERNEL_H */
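Either gradient output may be omitted; the sketch below requests only the X gradient (an illustrative fragment, proper border filling elided):

    #include "arm_compute/core/NEON/kernels/NEScharr3x3Kernel.h"
    #include "arm_compute/runtime/NEON/NEScheduler.h"
    #include "arm_compute/runtime/Tensor.h"

    using namespace arm_compute;

    void scharr_example()
    {
        Tensor src, grad_x;
        src.allocator()->init(TensorInfo(TensorShape(128U, 128U), 1, DataType::U8));
        grad_x.allocator()->init(TensorInfo(TensorShape(128U, 128U), 1, DataType::S16));

        NEScharr3x3Kernel scharr;
        scharr.configure(&src, &grad_x, nullptr /* output_y not requested */, true /* border_undefined */);

        src.allocator()->allocate();
        grad_x.allocator()->allocate();
        NEScheduler::get().schedule(&scharr, Window::DimY);
    }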
diff --git a/arm_compute/core/NEON/kernels/NESelectKernel.h b/arm_compute/core/NEON/kernels/NESelectKernel.h
deleted file mode 100644
index 51c8543ddc..0000000000
--- a/arm_compute/core/NEON/kernels/NESelectKernel.h
+++ /dev/null
@@ -1,103 +0,0 @@
-/*
- * Copyright (c) 2018-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_NESELECTKERNEL_H
-#define ARM_COMPUTE_NESELECTKERNEL_H
-
-#include "arm_compute/core/NEON/INEKernel.h"
-#include "arm_compute/core/Types.h"
-
-namespace arm_compute
-{
-// Forward declarations
-class ITensor;
-
-/** Interface for the select kernel
- *
- * Select is computed by:
- * @f[ output(i) = condition(i) ? x(i) : y(i) @f]
- *
- */
-class NESelectKernel : public INEKernel
-{
-public:
- const char *name() const override
- {
- return "NESelectKernel";
- }
- /** Default constructor */
- NESelectKernel();
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- NESelectKernel(const NESelectKernel &) = delete;
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- NESelectKernel &operator=(const NESelectKernel &) = delete;
- /** Allow instances of this class to be moved */
- NESelectKernel(NESelectKernel &&) = default;
- /** Allow instances of this class to be moved */
- NESelectKernel &operator=(NESelectKernel &&) = default;
- /** Default destructor */
- ~NESelectKernel() = default;
-
- /** Initialise the kernel's inputs and output
- *
- * @param[in] c Condition input tensor. Data types supported: U8.
- * @param[in] x First input tensor. Data types supported: All.
- * @param[in] y Second input tensor. Data types supported: Same as @p x
- * @param[out] output Output tensor. Data types supported: Same as @p x
- */
- void configure(const ITensor *c, const ITensor *x, const ITensor *y, ITensor *output);
-
- /** Validate the argument passed to the kernel
- *
- * @param[in] c Condition input tensor. Data types supported: U8.
- * @param[in] x First input tensor. Data types supported: All.
- * @param[in] y Second input tensor. Data types supported: Same as @p x
- * @param[in] output Output tensor. Data types supported: Same as @p x.
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *c, const ITensorInfo *x, const ITensorInfo *y, const ITensorInfo *output);
-
- // Inherited methods overridden:
- void run(const Window &window, const ThreadInfo &info) override;
-
-private:
- /** Common signature for all the specialised select functions
- *
- * @param[in] c Condition input tensor. Data types supported: U8.
- * @param[in] x First input tensor. Data types supported: All.
- * @param[in] y Second input tensor. Data types supported: Same as @p x
- * @param[out] output Output tensor. Data types supported: Same as @p x.
- */
- using SelectFunction = void(const ITensor *c, const ITensor *x, const ITensor *y, ITensor *output, const Window &window);
-
- /** Select function to use for the particular tensor types passed to configure() */
- SelectFunction *_function;
- const ITensor *_c; /**< Condition tensor */
- const ITensor *_x; /**< Source tensor 1 */
- const ITensor *_y; /**< Source tensor 2 */
- ITensor *_output; /**< Destination tensor */
- bool _has_same_rank; /**< Flag that indicates if condition tensor and other inputs have the same rank */
-};
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_NESELECTKERNEL_H */
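A sketch of the elementwise select (output(i) = condition(i) ? x(i) : y(i)) through the removed header; tensor names and length are illustrative:

    #include "arm_compute/core/Error.h"
    #include "arm_compute/core/NEON/kernels/NESelectKernel.h"
    #include "arm_compute/runtime/NEON/NEScheduler.h"
    #include "arm_compute/runtime/Tensor.h"

    using namespace arm_compute;

    void select_example()
    {
        Tensor c, x, y, out;
        c.allocator()->init(TensorInfo(TensorShape(16U), 1, DataType::U8));
        x.allocator()->init(TensorInfo(TensorShape(16U), 1, DataType::F32));
        y.allocator()->init(TensorInfo(TensorShape(16U), 1, DataType::F32));
        out.allocator()->init(TensorInfo(TensorShape(16U), 1, DataType::F32));

        NESelectKernel select;
        ARM_COMPUTE_ERROR_THROW_ON(NESelectKernel::validate(c.info(), x.info(), y.info(), out.info()));
        select.configure(&c, &x, &y, &out);

        // ... allocate and fill c/x/y, then:
        NEScheduler::get().schedule(&select, Window::DimX);
    }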
diff --git a/arm_compute/core/NEON/kernels/NESobel3x3Kernel.h b/arm_compute/core/NEON/kernels/NESobel3x3Kernel.h
deleted file mode 100644
index ef0db2a428..0000000000
--- a/arm_compute/core/NEON/kernels/NESobel3x3Kernel.h
+++ /dev/null
@@ -1,86 +0,0 @@
-/*
- * Copyright (c) 2016-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_NESOBEL3x3KERNEL_H
-#define ARM_COMPUTE_NESOBEL3x3KERNEL_H
-
-#include "arm_compute/core/NEON/INEKernel.h"
-
-namespace arm_compute
-{
-class ITensor;
-
-/** Interface for the kernel to run a 3x3 Sobel filter on a tensor. The X filter is
- *
- * @f[
- * \mathbf{G}_x=\begin{vmatrix}
- * -1 & 0 & +1\\
- * -2 & 0 & +2\\
- * -1 & 0 & +1
- * \end{vmatrix}
- * @f]
- *
- * and the Y filter is its transpose.
- */
-class NESobel3x3Kernel : public INEKernel
-{
-public:
- const char *name() const override
- {
- return "NESobel3x3Kernel";
- }
- /** Default constructor */
- NESobel3x3Kernel();
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- NESobel3x3Kernel(const NESobel3x3Kernel &) = delete;
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- NESobel3x3Kernel &operator=(const NESobel3x3Kernel &) = delete;
- /** Allow instances of this class to be moved */
- NESobel3x3Kernel(NESobel3x3Kernel &&) = default;
- /** Allow instances of this class to be moved */
- NESobel3x3Kernel &operator=(NESobel3x3Kernel &&) = default;
- /** Default destructor */
- ~NESobel3x3Kernel() = default;
-
- /** Initialise the kernel's source, destination and border mode.
- *
- * @note At least one of output_x or output_y must be set.
- *
- * @param[in] input Source tensor. Data type supported: U8.
- * @param[out] output_x (Optional) Destination tensor for the X gradient. Data type supported: S16.
- * @param[out] output_y (Optional) Destination tensor for the Y gradient. Data type supported: S16.
- * @param[in] border_undefined True if the border mode is undefined. False if it's replicate or constant.
- */
- void configure(const ITensor *input, ITensor *output_x, ITensor *output_y, bool border_undefined);
-
- // Inherited methods overridden:
- void run(const Window &window, const ThreadInfo &info) override;
- BorderSize border_size() const override;
-
-private:
- bool _run_sobel_x; /**< Do we need to run Sobel X ? */
- bool _run_sobel_y; /**< Do we need to run Sobel Y ? */
- const ITensor *_input; /**< Input tensor */
- ITensor *_output_x; /**< Output tensor for sobel X */
- ITensor *_output_y; /**< Output tensor for sobel Y */
-};
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_NESOBEL3x3KERNEL_H */
diff --git a/arm_compute/core/NEON/kernels/NESobel5x5Kernel.h b/arm_compute/core/NEON/kernels/NESobel5x5Kernel.h
deleted file mode 100644
index bc0cfb016e..0000000000
--- a/arm_compute/core/NEON/kernels/NESobel5x5Kernel.h
+++ /dev/null
@@ -1,126 +0,0 @@
-/*
- * Copyright (c) 2016-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_NESOBEL5x5KERNEL_H
-#define ARM_COMPUTE_NESOBEL5x5KERNEL_H
-
-#include "arm_compute/core/NEON/INEKernel.h"
-
-namespace arm_compute
-{
-class ITensor;
-
-/** Interface for the kernel to run the horizontal pass of the 5x5 Sobel filter on a tensor. */
-class NESobel5x5HorKernel : public INEKernel
-{
-public:
- const char *name() const override
- {
- return "NESobel5x5HorKernel";
- }
- /** Default constructor */
- NESobel5x5HorKernel();
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- NESobel5x5HorKernel(const NESobel5x5HorKernel &) = delete;
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- NESobel5x5HorKernel &operator=(const NESobel5x5HorKernel &) = delete;
- /** Allow instances of this class to be moved */
- NESobel5x5HorKernel(NESobel5x5HorKernel &&) = default;
- /** Allow instances of this class to be moved */
- NESobel5x5HorKernel &operator=(NESobel5x5HorKernel &&) = default;
- /** Default destructor */
- ~NESobel5x5HorKernel() = default;
-
- /** Initialise the kernel's source, destination and border mode.
- *
- * @note At least one of output_x or output_y must be set
- *
- * @param[in] input Source tensor. Data type supported: U8.
- * @param[out] output_x (Optional) Destination tensor for the X gradient. Data type supported: S16.
- * @param[out] output_y (Optional) Destination tensor for the Y gradient. Data type supported: S16.
- * @param[in] border_undefined True if the border mode is undefined. False if it's replicate or constant.
- */
- void configure(const ITensor *input, ITensor *output_x, ITensor *output_y, bool border_undefined);
-
- // Inherited methods overridden:
- void run(const Window &window, const ThreadInfo &info) override;
- BorderSize border_size() const override;
-
-private:
- const ITensor *_input; /**< Input tensor */
- ITensor *_output_x; /**< X output of horizontal pass */
- ITensor *_output_y; /**< Y output of horizontal pass */
- bool _run_sobel_x; /**< Do we need to run Sobel X? */
- bool _run_sobel_y; /**< Do we need to run Sobel Y? */
- BorderSize _border_size; /**< Border size */
-};
-
-/** Interface for the kernel to run the vertical pass of the 5x5 Sobel filter on a tensor. */
-class NESobel5x5VertKernel : public INEKernel
-{
-public:
- const char *name() const override
- {
- return "NESobel5x5VertKernel";
- }
- /** Default constructor */
- NESobel5x5VertKernel();
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- NESobel5x5VertKernel(const NESobel5x5VertKernel &) = delete;
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- NESobel5x5VertKernel &operator=(const NESobel5x5VertKernel &) = delete;
- /** Allow instances of this class to be moved */
- NESobel5x5VertKernel(NESobel5x5VertKernel &&) = default;
- /** Allow instances of this class to be moved */
- NESobel5x5VertKernel &operator=(NESobel5x5VertKernel &&) = default;
- /** Default destructor */
- ~NESobel5x5VertKernel() = default;
-
- /** Initialise the kernel's source, destination and border mode.
- *
- * @param[in] input_x Input for X (X output of hor pass). Data type supported: S16.
- * @param[in] input_y Input for Y (Y output of hor pass). Data type supported: S16.
- * @param[out] output_x Destination tensor for the X gradient. Data type supported: S16.
- * @param[out] output_y Destination tensor for the Y gradient. Data type supported: S16.
- * @param[in] border_undefined True if the border mode is undefined. False if it's replicate or constant.
- */
- void configure(ITensor *input_x, ITensor *input_y, ITensor *output_x, ITensor *output_y, bool border_undefined);
-
- // Inherited methods overridden:
- void run(const Window &window, const ThreadInfo &info) override;
- BorderSize border_size() const override;
-
-private:
- ITensor *_input_x; /**< X input (X output of the hor pass) */
- ITensor *_input_y; /**< Y input (Y output of the hor pass) */
- ITensor *_output_x; /**< X output of sobel */
- ITensor *_output_y; /**< Y output of sobel */
- bool _run_sobel_x; /**< Do we need to run sobel X? */
- bool _run_sobel_y; /**< Do we need to run sobel Y? */
-};
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_NESOBEL5x5KERNEL_H */
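The 5x5 filter is separable, so the horizontal and vertical kernels are chained through S16 intermediates; a wiring sketch (illustrative names, border handling elided):

    #include "arm_compute/core/NEON/kernels/NESobel5x5Kernel.h"
    #include "arm_compute/runtime/NEON/NEScheduler.h"
    #include "arm_compute/runtime/Tensor.h"

    using namespace arm_compute;

    void sobel5x5_example()
    {
        const TensorShape shape(128U, 128U);
        Tensor src, tmp_x, tmp_y, grad_x, grad_y;
        src.allocator()->init(TensorInfo(shape, 1, DataType::U8));
        for(Tensor *t : { &tmp_x, &tmp_y, &grad_x, &grad_y })
        {
            t->allocator()->init(TensorInfo(shape, 1, DataType::S16));
        }

        NESobel5x5HorKernel  hor;
        NESobel5x5VertKernel vert;
        hor.configure(&src, &tmp_x, &tmp_y, true /* border_undefined */);
        vert.configure(&tmp_x, &tmp_y, &grad_x, &grad_y, true /* border_undefined */);

        // ... allocate all five tensors, then run the passes in order:
        NEScheduler::get().schedule(&hor, Window::DimY);
        NEScheduler::get().schedule(&vert, Window::DimY);
    }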
diff --git a/arm_compute/core/NEON/kernels/NESobel7x7Kernel.h b/arm_compute/core/NEON/kernels/NESobel7x7Kernel.h
deleted file mode 100644
index 468a94d0d1..0000000000
--- a/arm_compute/core/NEON/kernels/NESobel7x7Kernel.h
+++ /dev/null
@@ -1,130 +0,0 @@
-/*
- * Copyright (c) 2016-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_NESOBEL7x7KERNEL_H
-#define ARM_COMPUTE_NESOBEL7x7KERNEL_H
-
-#include "arm_compute/core/NEON/INEKernel.h"
-
-namespace arm_compute
-{
-class ITensor;
-
-/** Interface for the kernel to run the horizontal pass of the 7x7 Sobel filter on a tensor. */
-class NESobel7x7HorKernel : public INEKernel
-{
-public:
- const char *name() const override
- {
- return "NESobel7x7HorKernel";
- }
- /** Default constructor */
- NESobel7x7HorKernel();
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- NESobel7x7HorKernel(const NESobel7x7HorKernel &) = delete;
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- NESobel7x7HorKernel &operator=(const NESobel7x7HorKernel &) = delete;
- /** Allow instances of this class to be moved */
- NESobel7x7HorKernel(NESobel7x7HorKernel &&) = default;
- /** Allow instances of this class to be moved */
- NESobel7x7HorKernel &operator=(NESobel7x7HorKernel &&) = default;
- /** Default destructor */
- ~NESobel7x7HorKernel() = default;
-
- /** Initialise the kernel's source, destination and border mode.
- *
- * @note At least one of output_x or output_y must be set.
- *
- * @param[in] input Source tensor. Data type supported: U8.
- * @param[out] output_x (Optional) Destination tensor for the X gradient. Data type supported: S32.
- * @param[out] output_y (Optional) Destination tensor for the Y gradient. Data type supported: S32.
- * @param[in] border_undefined True if the border mode is undefined. False if it's replicate or constant.
- */
- void configure(const ITensor *input, ITensor *output_x, ITensor *output_y, bool border_undefined);
-
- // Inherited methods overridden:
- void run(const Window &window, const ThreadInfo &info) override;
- BorderSize border_size() const override;
-
-private:
- const ITensor *_input; /**< Input tensor */
- ITensor *_output_x; /**< X output of horizontal pass */
- ITensor *_output_y; /**< Y output of horizontal pass */
- bool _run_sobel_x; /**< Do we need to run Sobel X? */
- bool _run_sobel_y; /**< Do we need to run Sobel Y? */
- BorderSize _border_size; /**< Border size */
-};
-
-/** Interface for the kernel to run the vertical pass of the 7x7 Sobel filter on a tensor. */
-class NESobel7x7VertKernel : public INEKernel
-{
-public:
- const char *name() const override
- {
- return "NESobel7x7VertKernel";
- }
- /** Default constructor */
- NESobel7x7VertKernel();
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- NESobel7x7VertKernel(const NESobel7x7VertKernel &) = delete;
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- NESobel7x7VertKernel &operator=(const NESobel7x7VertKernel &) = delete;
- /** Allow instances of this class to be moved */
- NESobel7x7VertKernel(NESobel7x7VertKernel &&) = default;
- /** Allow instances of this class to be moved */
- NESobel7x7VertKernel &operator=(NESobel7x7VertKernel &&) = default;
- /** Default destructor */
- ~NESobel7x7VertKernel() = default;
-
- /** Initialise the kernel's source, destination and border mode.
- *
- * @note At least one of output_x or output_y must be set
- * @note If output_x is set then input_x must be set too
- * @note If output_y is set then input_y must be set too
- *
- * @param[in] input_x (Optional) Input for X (X output of hor pass). Data type supported: S32.
- * @param[in] input_y (Optional) Input for Y (Y output of hor pass). Data type supported: S32.
- * @param[out] output_x (Optional) Destination tensor for the X gradient. Data type supported: S32.
- * @param[out] output_y (Optional) Destination tensor for the Y gradient. Data type supported: S32.
- * @param[in] border_undefined True if the border mode is undefined. False if it's replicate or constant.
- */
- void configure(const ITensor *input_x, const ITensor *input_y, ITensor *output_x, ITensor *output_y, bool border_undefined);
-
- // Inherited methods overridden:
- void run(const Window &window, const ThreadInfo &info) override;
- BorderSize border_size() const override;
-
-private:
- const ITensor *_input_x; /**< X input (X output of the hor pass) */
- const ITensor *_input_y; /**< Y input (Y output of the hor pass) */
- ITensor *_output_x; /**< X output of sobel */
- ITensor *_output_y; /**< Y output of sobel */
- bool _run_sobel_x; /**< Do we need to run sobel X? */
- bool _run_sobel_y; /**< Do we need to run sobel Y? */
-};
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_NESOBEL7x7KERNEL_H */
diff --git a/arm_compute/core/NEON/kernels/NESoftmaxLayerKernel.h b/arm_compute/core/NEON/kernels/NESoftmaxLayerKernel.h
deleted file mode 100644
index 0e0be7936b..0000000000
--- a/arm_compute/core/NEON/kernels/NESoftmaxLayerKernel.h
+++ /dev/null
@@ -1,139 +0,0 @@
-/*
- * Copyright (c) 2017-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_NESOFTMAXLAYERKERNEL_H
-#define ARM_COMPUTE_NESOFTMAXLAYERKERNEL_H
-
-#include "arm_compute/core/NEON/INEKernel.h"
-#include "arm_compute/core/NEON/INESimpleKernel.h"
-
-namespace arm_compute
-{
-class ITensor;
-
-/** Interface for identifying the max value of 1D logits */
-class NELogits1DMaxKernel : public INESimpleKernel
-{
-public:
- const char *name() const override
- {
- return "NELogits1DMaxKernel";
- }
- /** Default constructor */
- NELogits1DMaxKernel();
- /** Set the input and output tensors.
- *
- * @param[in] input Source tensor. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32.
- * @param[out] output Destination tensor. Data types supported: same as @p input
- */
- void configure(const ITensor *input, ITensor *output);
- /** Static function to check if given info will lead to a valid configuration of @ref NELogits1DMaxKernel
- *
- * @param[in] input Source tensor. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32.
- * @param[in] output Destination tensor. Data types supported: same as @p input
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *input, const ITensorInfo *output);
-
- // Inherited methods overridden:
- void run(const Window &window, const ThreadInfo &info) override;
- BorderSize border_size() const override;
-
-private:
- using Logits1DMaxFunction = void(const ITensor &in, ITensor &out, const Window &window);
-
-private:
- Logits1DMaxFunction *_func;
- BorderSize _border_size;
-};
-
-/** Interface for the softmax computation with a pre-computed max. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32. */
-template <bool IS_LOG = false>
-class NELogits1DSoftmaxKernel : public INEKernel
-{
-public:
- const char *name() const override
- {
- if(IS_LOG)
- {
- return "NELogits1DLogSoftmaxKernel";
- }
- else
- {
- return "NELogits1DSoftmaxKernel";
- }
- }
- /** Default constructor */
- NELogits1DSoftmaxKernel();
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- NELogits1DSoftmaxKernel(const NELogits1DSoftmaxKernel &) = delete;
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- NELogits1DSoftmaxKernel &operator=(const NELogits1DSoftmaxKernel &) = delete;
- /** Allow instances of this class to be moved */
- NELogits1DSoftmaxKernel(NELogits1DSoftmaxKernel &&) = default;
- /** Allow instances of this class to be moved */
- NELogits1DSoftmaxKernel &operator=(NELogits1DSoftmaxKernel &&) = default;
- /** Default destructor */
- ~NELogits1DSoftmaxKernel() = default;
- /** Set the input and output tensors.
- *
- * @param[in] input Source tensor. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32.
- * @param[in] max Max values tensor. Same shape as input with dimension 0 set to 1.
- * Data types supported: same as @p input.
- * @param[out] output Destination tensor. Data types supported: same as @p input.
- * @param[in] beta A scaling factor for the exponent.
- * @param[in] tmp Auxiliary tensor. Must be of type F32 and have the same shape as the input.
- */
- void configure(const ITensor *input, const ITensor *max, ITensor *output, const float beta, ITensor *tmp);
- /** Static function to check if given info will lead to a valid configuration of @ref NELogits1DSoftmaxKernel
- *
- * @param[in] input Source tensor info. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32.
- * @param[in] max Max values tensor info. Same shape as input with dimension 0 set to 1.
- * Data types supported: same as @p input.
- * @param[in] output Destination tensor info. Data types supported: same as @p input.
- * @param[in] beta A scaling factor for the exponent.
- * @param[in] tmp Tensor info of auxiliary. Must be type F32 and same shape as the input.
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *input, const ITensorInfo *max,
- const ITensorInfo *output, const float beta, const ITensorInfo *tmp);
-
- // Inherited methods overridden:
- void run(const Window &window, const ThreadInfo &info) override;
-
-private:
- using LogitsSoftmaxFunction = void(const ITensor &in, const ITensor &max, void *const tmp, ITensor &out, const float beta,
- const Window &window);
-
- LogitsSoftmaxFunction *_func;
- const ITensor *_input;
- const ITensor *_max;
- ITensor *_output;
- float _beta;
- ITensor *_tmp; //Temporary. Used internally
-};
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_NESOFTMAXLAYERKERNEL_H */
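Softmax is a two-kernel pipeline here: the row max is computed first, then the (log-)softmax kernel consumes it together with an F32 scratch tensor. A sketch (illustrative shapes, beta = 1):

    #include "arm_compute/core/NEON/kernels/NESoftmaxLayerKernel.h"
    #include "arm_compute/runtime/NEON/NEScheduler.h"
    #include "arm_compute/runtime/Tensor.h"

    using namespace arm_compute;

    void softmax_example()
    {
        Tensor src, max, dst, tmp;
        src.allocator()->init(TensorInfo(TensorShape(10U, 4U), 1, DataType::F32));
        max.allocator()->init(TensorInfo(TensorShape(1U, 4U), 1, DataType::F32));  // dimension 0 collapsed to 1
        dst.allocator()->init(TensorInfo(TensorShape(10U, 4U), 1, DataType::F32));
        tmp.allocator()->init(TensorInfo(TensorShape(10U, 4U), 1, DataType::F32)); // F32 scratch

        NELogits1DMaxKernel            max_kernel;
        NELogits1DSoftmaxKernel<false> softmax_kernel; // IS_LOG = false -> plain softmax
        max_kernel.configure(&src, &max);
        softmax_kernel.configure(&src, &max, &dst, 1.0f /* beta */, &tmp);

        // ... allocate and fill src, then:
        NEScheduler::get().schedule(&max_kernel, Window::DimY);
        NEScheduler::get().schedule(&softmax_kernel, Window::DimY);
    }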
diff --git a/arm_compute/core/NEON/kernels/NESpaceToBatchLayerKernel.h b/arm_compute/core/NEON/kernels/NESpaceToBatchLayerKernel.h
deleted file mode 100644
index 532fbb2852..0000000000
--- a/arm_compute/core/NEON/kernels/NESpaceToBatchLayerKernel.h
+++ /dev/null
@@ -1,111 +0,0 @@
-/*
- * Copyright (c) 2019-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_NESPACETOBATCHLAYERKERNEL_H
-#define ARM_COMPUTE_NESPACETOBATCHLAYERKERNEL_H
-
-#include "arm_compute/core/NEON/INEKernel.h"
-#include "arm_compute/core/Types.h"
-
-namespace arm_compute
-{
-// Forward declaration
-class ITensor;
-
-/** Interface for the space to batch kernel */
-class NESpaceToBatchLayerKernel : public INEKernel
-{
-public:
- const char *name() const override
- {
- return "NESpaceToBatchLayerKernel";
- }
- /** Default constructor */
- NESpaceToBatchLayerKernel();
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- NESpaceToBatchLayerKernel(const NESpaceToBatchLayerKernel &) = delete;
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- NESpaceToBatchLayerKernel &operator=(const NESpaceToBatchLayerKernel &) = delete;
- /** Allow instances of this class to be moved */
- NESpaceToBatchLayerKernel(NESpaceToBatchLayerKernel &&) = default;
- /** Allow instances of this class to be moved */
- NESpaceToBatchLayerKernel &operator=(NESpaceToBatchLayerKernel &&) = default;
- /** Default destructor */
- ~NESpaceToBatchLayerKernel() = default;
- /** Initialise the kernel's inputs and output.
- *
- * @param[in] input Tensor input. Supported tensor rank: 4. Data types supported: All.
- * @param[in] block_shape 1-D tensor with shape [M]. Data types supported: S32
- * @param[in] paddings 2-D tensor with shape [2, M]. Data types supported: S32
- * @param[out] output Tensor output. Data types supported: same as @p input
- */
- void configure(const ITensor *input, const ITensor *block_shape, const ITensor *paddings, ITensor *output);
- /** Initialise the kernel's input and output. (Static block shape and paddings)
- *
- * @param[in] input Tensor input. Supported tensor rank: 4. Data types supported: All.
- * @param[in] block_shape_x Block shape x value.
- * @param[in] block_shape_y Block shape y value.
- * @param[in] padding_left The left padding of the output tensor.
- * @param[in] padding_right The right padding of the output tensor.
- * @param[out] output Tensor output. Data types supported: same as @p input
- */
- void configure(const ITensor *input, const int block_shape_x, const int block_shape_y, const Size2D &padding_left, const Size2D &padding_right, ITensor *output);
- /** Static function to check if given info will lead to a valid configuration of @ref NESpaceToBatchLayerKernel
- *
- * @param[in] input Tensor input. Supported tensor rank: 4. Data types supported: All.
- * @param[in] block_shape 1-D tensor with shape [M]. Data types supported: S32
- * @param[in] paddings 2-D tensor with shape [2, M]. Data types supported: S32
- * @param[in] output Tensor output. Data types supported: same as @p input
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *input, const ITensorInfo *block_shape, const ITensorInfo *paddings, const ITensorInfo *output);
- /** Static function to check if given info will lead to a valid configuration of @ref NESpaceToBatchLayerKernel (Static block shape and paddings)
- *
- * @param[in] input Tensor input. Supported tensor rank: 4. Data types supported: All.
- * @param[in] block_shape_x Block shape x value.
- * @param[in] block_shape_y Block shape y value.
- * @param[in] padding_left The left padding of the output tensor.
- * @param[in] padding_right The right padding of the output tensor.
- * @param[in] output Tensor output. Data types supported: same as @p input
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *input, const int block_shape_x, const int block_shape_y, const Size2D &padding_left, const Size2D &padding_right, const ITensorInfo *output);
-
- // Inherited methods overridden:
- void run(const Window &window, const ThreadInfo &info) override;
-
-private:
- const ITensor *_input; /**< Source tensor */
- const ITensor *_block_shape; /**< Block shape tensor */
- const ITensor *_paddings; /**< Paddings tensor */
- ITensor *_output; /**< Destination tensor */
- DataLayout _data_layout; /**< Data layout to be used at run-time */
-
- Size2D _padding_left;
- int _block_shape_x;
- int _block_shape_y;
-};
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_NESPACETOBATCHLAYERKERNEL_H */
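Before this removal, kernel headers like the one above were consumed directly by user and runtime code: construct the kernel, call configure(), then hand it to the scheduler. A minimal sketch using the static block-shape overload follows; the function name is invented for illustration, tensor infos and allocations are assumed to be prepared by the caller, the include paths are the pre-removal ones shown in this diff, and the DimY split dimension is merely a common choice.

    // Hypothetical usage sketch for the removed NESpaceToBatchLayerKernel:
    // 2x2 block shape, no padding (static overload).
    #include "arm_compute/core/NEON/kernels/NESpaceToBatchLayerKernel.h"
    #include "arm_compute/runtime/NEON/NEScheduler.h"

    void space_to_batch_example(arm_compute::ITensor &src, arm_compute::ITensor &dst)
    {
        arm_compute::NESpaceToBatchLayerKernel kernel;
        kernel.configure(&src, 2, 2, arm_compute::Size2D(0, 0), arm_compute::Size2D(0, 0), &dst);
        arm_compute::NEScheduler::get().schedule(&kernel, arm_compute::Window::DimY);
    }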
diff --git a/arm_compute/core/NEON/kernels/NESpaceToDepthLayerKernel.h b/arm_compute/core/NEON/kernels/NESpaceToDepthLayerKernel.h
deleted file mode 100644
index e0c22e65fb..0000000000
--- a/arm_compute/core/NEON/kernels/NESpaceToDepthLayerKernel.h
+++ /dev/null
@@ -1,81 +0,0 @@
-/*
- * Copyright (c) 2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_NESPACETODEPTHLAYERKERNEL_H
-#define ARM_COMPUTE_NESPACETODEPTHLAYERKERNEL_H
-
-#include "arm_compute/core/NEON/INEKernel.h"
-#include "arm_compute/core/Types.h"
-
-namespace arm_compute
-{
-class ITensor;
-
-/** Interface for the space to depth kernel */
-class NESpaceToDepthLayerKernel : public INEKernel
-{
-public:
- const char *name() const override
- {
- return "NESpaceToDepthLayerKernel";
- }
- /** Default constructor */
- NESpaceToDepthLayerKernel();
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- NESpaceToDepthLayerKernel(const NESpaceToDepthLayerKernel &) = delete;
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- NESpaceToDepthLayerKernel &operator=(const NESpaceToDepthLayerKernel &) = delete;
- /** Allow instances of this class to be moved */
- NESpaceToDepthLayerKernel(NESpaceToDepthLayerKernel &&) = default;
- /** Allow instances of this class to be moved */
- NESpaceToDepthLayerKernel &operator=(NESpaceToDepthLayerKernel &&) = default;
- /** Default destructor */
- ~NESpaceToDepthLayerKernel() = default;
- /** Initialise the kernel's inputs and output.
- *
- * @param[in] input Tensor input. Supported tensor rank: 4. Data types supported: All.
- * @param[out] output Tensor output. Data types supported: same as @p input
- * @param[in] block_shape Block shape value
- */
- void configure(const ITensor *input, ITensor *output, int32_t block_shape);
- /** Static function to check if given info will lead to a valid configuration of @ref NESpaceToDepthLayerKernel
- *
- * @param[in] input Tensor input info. Supported tensor rank: 4. Data types supported: All.
- * @param[in] output Tensor output info. Data types supported: same as @p input
- * @param[in] block_shape Block shape value
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *input, const ITensorInfo *output, int32_t block_shape);
-
- // Inherited methods overridden:
- void run(const Window &window, const ThreadInfo &info) override;
-
-private:
- const ITensor *_input; /**< Source tensor */
- ITensor *_output; /**< Destination tensor */
- int32_t _block_shape; /**< Block shape */
- DataLayout _data_layout; /**< Data layout of the operation */
-};
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_NESPACETODEPTHLAYERKERNEL_H */
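The same pattern for the space-to-depth kernel, this time running the static validate() on the tensor infos before configuring (same assumptions as the previous sketch):

    // Sketch: validate, then run NESpaceToDepthLayerKernel with block_shape = 2.
    #include "arm_compute/core/NEON/kernels/NESpaceToDepthLayerKernel.h"
    #include "arm_compute/runtime/NEON/NEScheduler.h"

    arm_compute::Status space_to_depth_example(arm_compute::ITensor &src, arm_compute::ITensor &dst)
    {
        const arm_compute::Status status = arm_compute::NESpaceToDepthLayerKernel::validate(src.info(), dst.info(), 2);
        if(status.error_code() != arm_compute::ErrorCode::OK)
        {
            return status; // shape/data-type combination rejected
        }
        arm_compute::NESpaceToDepthLayerKernel kernel;
        kernel.configure(&src, &dst, 2);
        arm_compute::NEScheduler::get().schedule(&kernel, arm_compute::Window::DimY);
        return status;
    }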
diff --git a/arm_compute/core/NEON/kernels/NEStackLayerKernel.h b/arm_compute/core/NEON/kernels/NEStackLayerKernel.h
deleted file mode 100644
index c4dc53eac6..0000000000
--- a/arm_compute/core/NEON/kernels/NEStackLayerKernel.h
+++ /dev/null
@@ -1,93 +0,0 @@
-/*
- * Copyright (c) 2018-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#ifndef ARM_COMPUTE_NESTACKLAYERKERNEL_H
-#define ARM_COMPUTE_NESTACKLAYERKERNEL_H
-
-#include "arm_compute/core/NEON/INEKernel.h"
-#include "arm_compute/core/Types.h"
-
-namespace arm_compute
-{
-class ITensor;
-
-/** NEON kernel to stack a rank-R tensor into one with rank-(R+1) along the axis dimension. */
-class NEStackLayerKernel : public INEKernel
-{
-public:
- const char *name() const override
- {
- return "NEStackLayerKernel";
- }
- /** Default constructor */
- NEStackLayerKernel();
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- NEStackLayerKernel(const NEStackLayerKernel &) = delete;
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- NEStackLayerKernel &operator=(const NEStackLayerKernel &) = delete;
- /** Allow instances of this class to be moved */
- NEStackLayerKernel(NEStackLayerKernel &&) = default;
- /** Allow instances of this class to be moved */
- NEStackLayerKernel &operator=(NEStackLayerKernel &&) = default;
- /** Default destructor */
- ~NEStackLayerKernel() = default;
- /** Initialise the kernel's inputs and output
- *
- * @note Supported input tensor rank: up to 4
- *
- * @param[in] input Input tensor. Data types supported: All
- * @param[in] axis The dimension to stack the tensors along. It must be smaller than the number of input dimensions.
- * @param[in] idx_input Index of the input tensor in the list of tensors to stack.
- * All tensors in the list must have the same shape
- * @param[in] num_tensors Number of tensors to stack
- * @param[out] output Output tensor. Data types supported: Same as @p input.
- *
- */
- void configure(const ITensor *input, unsigned int axis, unsigned int idx_input, unsigned int num_tensors, ITensor *output);
- /** Static function to check if given info will lead to a valid configuration of @ref NEStackLayerKernel
- *
- * @note Supported input tensor rank: up to 4
- *
- * @param[in] input Input tensor info. Data types supported: All
- * @param[in] axis The dimension to stack the tensors along. It must be smaller than the number of input dimensions.
- * @param[in] idx_input Index of the input tensor in the list of tensors to stack
- * All tensors in the list must have the same shape
- * @param[in] num_tensors Number of tensors to stack
- * @param[in] output Output tensor info. Data types supported: Same as @p input.
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *input, unsigned int axis, unsigned int idx_input, unsigned int num_tensors, const ITensorInfo *output);
-
- // Inherited methods overridden
- void run(const Window &window, const ThreadInfo &info) override;
-
-private:
- const ITensor *_input;
- ITensor *_output;
- unsigned int _axis;
- unsigned int _idx_input;
-};
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_NESTACKLAYERKERNEL_H */
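Because configure() takes idx_input and num_tensors, stacking N tensors means configuring one kernel per input slot against the shared output. A sketch for two equally shaped inputs stacked along axis 0 (assumptions as in the first sketch):

    // Sketch: stack two tensors with one NEStackLayerKernel per input slot.
    #include "arm_compute/core/NEON/kernels/NEStackLayerKernel.h"
    #include "arm_compute/runtime/NEON/NEScheduler.h"

    void stack_example(arm_compute::ITensor &in0, arm_compute::ITensor &in1, arm_compute::ITensor &out)
    {
        arm_compute::NEStackLayerKernel k0, k1;
        k0.configure(&in0, /*axis=*/0, /*idx_input=*/0, /*num_tensors=*/2, &out);
        k1.configure(&in1, /*axis=*/0, /*idx_input=*/1, /*num_tensors=*/2, &out);
        arm_compute::NEScheduler::get().schedule(&k0, arm_compute::Window::DimY);
        arm_compute::NEScheduler::get().schedule(&k1, arm_compute::Window::DimY);
    }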
diff --git a/arm_compute/core/NEON/kernels/NEStridedSliceKernel.h b/arm_compute/core/NEON/kernels/NEStridedSliceKernel.h
deleted file mode 100644
index 6709619a62..0000000000
--- a/arm_compute/core/NEON/kernels/NEStridedSliceKernel.h
+++ /dev/null
@@ -1,104 +0,0 @@
-/*
- * Copyright (c) 2018-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_NE_STRIDED_SLICE_KERNEL_H
-#define ARM_COMPUTE_NE_STRIDED_SLICE_KERNEL_H
-
-#include "arm_compute/core/NEON/INEKernel.h"
-#include "arm_compute/core/Types.h"
-
-#include <cstdint>
-
-namespace arm_compute
-{
-// Forward declarations
-class ITensor;
-
-/** Interface for the kernel to perform tensor strided slicing */
-class NEStridedSliceKernel : public INEKernel
-{
-public:
- const char *name() const override
- {
- return "NEStridedSliceKernel";
- }
- /** Default constructor */
- NEStridedSliceKernel();
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- NEStridedSliceKernel(const NEStridedSliceKernel &) = delete;
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- NEStridedSliceKernel &operator=(const NEStridedSliceKernel &) = delete;
- /** Allow instances of this class to be moved */
- NEStridedSliceKernel(NEStridedSliceKernel &&) = default;
- /** Allow instances of this class to be moved */
- NEStridedSliceKernel &operator=(NEStridedSliceKernel &&) = default;
- /** Default destructor */
- ~NEStridedSliceKernel() = default;
- /** Configure kernel
- *
- * @note Supported tensor rank: up to 4
- *
- * @param[in] input Source tensor. Data type supported: All
- * @param[out] output Destination tensor. Data type supported: Same as @p input
- * @param[in] starts The starts of the dimensions of the input tensor to be sliced. The length must match the rank of @p input.
- * @param[in] ends The ends of the dimensions of the input tensor to be sliced. The length must match the rank of @p input.
- * @param[in] strides The strides of the dimensions of the input tensor to be sliced. The length must match the rank of @p input.
- * @param[in] begin_mask If the ith bit of begin_mask is set, starts[i] is ignored and the fullest possible range in that dimension is used instead.
- * @param[in] end_mask If the ith bit of end_mask is set, ends[i] is ignored and the fullest possible range in that dimension is used instead.
- * @param[in] shrink_axis_mask If the ith bit of shrink_axis_mask is set, it implies that the ith specification shrinks the dimensionality by 1.
- * A slice of size 1 starting from starts[i] in the dimension must be preserved.
- */
- void configure(const ITensor *input, ITensor *output,
- const Coordinates &starts, const Coordinates &ends, const BiStrides &strides,
- int32_t begin_mask, int32_t end_mask, int32_t shrink_axis_mask);
-
- /** Static function to check if given info will lead to a valid configuration of @ref NEStridedSliceKernel
- *
- * @note Supported tensor rank: up to 4
- *
- * @param[in] input Source tensor info. Data type supported: All
- * @param[in] output Destination tensor info. Data type supported: Same as @p input
- * @param[in] starts The starts of the dimensions of the input tensor to be sliced. The length must match the rank of @p input.
- * @param[in] ends The ends of the dimensions of the input tensor to be sliced. The length must match the rank of @p input.
- * @param[in] strides The strides of the dimensions of the input tensor to be sliced. The length must match the rank of @p input.
- * @param[in] begin_mask If the ith bit of begin_mask is set, starts[i] is ignored and the fullest possible range in that dimension is used instead.
- * @param[in] end_mask If the ith bit of end_mask is set, ends[i] is ignored and the fullest possible range in that dimension is used instead.
- * @param[in] shrink_axis_mask If the ith bit of shrink_axis_mask is set, it implies that the ith specification shrinks the dimensionality by 1.
- * A slice of size 1 starting from starts[i] in the dimension must be preserved.
- */
- static Status validate(const ITensorInfo *input, const ITensorInfo *output,
- const Coordinates &starts, const Coordinates &ends, const BiStrides &strides,
- int32_t begin_mask, int32_t end_mask, int32_t shrink_axis_mask);
-
- // Inherited methods overridden:
- void run(const Window &window, const ThreadInfo &info) override;
-
-private:
- const ITensor *_input; /**< Source tensor */
- ITensor *_output; /**< Destination tensor */
- Coordinates _starts_abs; /**< Absolute start coordinates */
- Coordinates _final_strides; /**< Final strides */
- int32_t _shrink_mask; /**< Shrink axis mask */
-};
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_NE_STRIDED_SLICE_KERNEL_H */
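A sketch of the contract documented above: take every other element of the first dimension, assuming a 1-D input with at least eight elements and all masks left at zero (other assumptions as in the first sketch):

    // Sketch: strided slice [0:8:2] over dimension 0 of a 1-D tensor.
    #include "arm_compute/core/NEON/kernels/NEStridedSliceKernel.h"
    #include "arm_compute/runtime/NEON/NEScheduler.h"

    void strided_slice_example(arm_compute::ITensor &src, arm_compute::ITensor &dst)
    {
        arm_compute::NEStridedSliceKernel kernel;
        kernel.configure(&src, &dst,
                         arm_compute::Coordinates(0), arm_compute::Coordinates(8), arm_compute::BiStrides(2),
                         /*begin_mask=*/0, /*end_mask=*/0, /*shrink_axis_mask=*/0);
        arm_compute::NEScheduler::get().schedule(&kernel, arm_compute::Window::DimY);
    }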
diff --git a/arm_compute/core/NEON/kernels/NETableLookupKernel.h b/arm_compute/core/NEON/kernels/NETableLookupKernel.h
deleted file mode 100644
index 13a76cb40e..0000000000
--- a/arm_compute/core/NEON/kernels/NETableLookupKernel.h
+++ /dev/null
@@ -1,80 +0,0 @@
-/*
- * Copyright (c) 2016-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_NETABLELOOKUPKERNEL_H
-#define ARM_COMPUTE_NETABLELOOKUPKERNEL_H
-
-#include "arm_compute/core/NEON/INESimpleKernel.h"
-
-namespace arm_compute
-{
-class ITensor;
-class ILut;
-
-/** Interface for the kernel to perform table lookup calculations. */
-class NETableLookupKernel : public INESimpleKernel
-{
-public:
- const char *name() const override
- {
- return "NETableLookupKernel";
- }
- /** Default constructor */
- NETableLookupKernel();
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- NETableLookupKernel(const NETableLookupKernel &) = delete;
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- NETableLookupKernel &operator=(const NETableLookupKernel &) = delete;
- /** Allow instances of this class to be moved */
- NETableLookupKernel(NETableLookupKernel &&) = default;
- /** Allow instances of this class to be moved */
- NETableLookupKernel &operator=(NETableLookupKernel &&) = default;
- /** Initialise the kernel's input, lut and output.
- *
- * @param[in] input An input tensor. Data types supported: U8/S16.
- * @param[in] lut The input LUT.
- * @param[out] output The output tensor. Data types supported: same as @p input
- */
- void configure(const ITensor *input, const ILut *lut, ITensor *output);
-
- // Inherited methods overridden:
- void run(const Window &window, const ThreadInfo &info) override;
-
-private:
- /** Perform table lookup on a given window.
- *
- * @param[in] window Region on which to execute the kernel.
- */
- template <class T>
- void tableLookup(const Window &window);
- /** Common signature for all the specialised lut functions
- *
- * @param[in] window Region on which to execute the kernel.
- */
- using TableLookupFunction = void (NETableLookupKernel::*)(const Window &window);
- /** Sub function to use for the particular tensor types passed to configure() */
- TableLookupFunction _func;
- const ILut *_lut;
-};
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_NETABLELOOKUPKERNEL_H */
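A sketch for the table-lookup kernel; the 256-entry U8 ILut is assumed to be created and filled by the caller, since only the lookup itself is of interest here (other assumptions as in the first sketch):

    // Sketch: remap a U8 tensor through a caller-provided lookup table.
    #include "arm_compute/core/NEON/kernels/NETableLookupKernel.h"
    #include "arm_compute/runtime/NEON/NEScheduler.h"

    void table_lookup_example(arm_compute::ITensor &src, const arm_compute::ILut &lut, arm_compute::ITensor &dst)
    {
        arm_compute::NETableLookupKernel kernel;
        kernel.configure(&src, &lut, &dst);
        arm_compute::NEScheduler::get().schedule(&kernel, arm_compute::Window::DimY);
    }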
diff --git a/arm_compute/core/NEON/kernels/NEThresholdKernel.h b/arm_compute/core/NEON/kernels/NEThresholdKernel.h
deleted file mode 100644
index a6d1e9071c..0000000000
--- a/arm_compute/core/NEON/kernels/NEThresholdKernel.h
+++ /dev/null
@@ -1,85 +0,0 @@
-/*
- * Copyright (c) 2016-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_NETHRESHOLDKERNEL_H
-#define ARM_COMPUTE_NETHRESHOLDKERNEL_H
-
-#include "arm_compute/core/NEON/INEKernel.h"
-#include "arm_compute/core/Types.h"
-
-#include <cstdint>
-
-namespace arm_compute
-{
-class ITensor;
-
-/** Interface for the thresholding kernel
- *
- */
-class NEThresholdKernel : public INEKernel
-{
-public:
- const char *name() const override
- {
- return "NEThresholdKernel";
- }
- /** Constructor
- * Initialize all the pointers to nullptr and parameters to zero.
- */
- NEThresholdKernel();
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- NEThresholdKernel(const NEThresholdKernel &) = delete;
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- NEThresholdKernel &operator=(const NEThresholdKernel &) = delete;
- /** Initialise the kernel's input, output and threshold parameters.
- *
- * @param[in] input An input tensor. Data type supported: U8
- * @param[out] output The output tensor. Data type supported: U8.
- * @param[in] threshold Threshold. When the threshold type is RANGE, this is used as the lower threshold.
- * @param[in] false_value Value to set when the condition is not respected.
- * @param[in] true_value Value to set when the condition is respected.
- * @param[in] type Thresholding type. Either RANGE or BINARY.
- * @param[in] upper Upper threshold. Only used when the thresholding type is RANGE.
- */
- void configure(const ITensor *input, ITensor *output, uint8_t threshold, uint8_t false_value, uint8_t true_value, ThresholdType type, uint8_t upper);
-
- // Inherited methods overridden:
- void run(const Window &window, const ThreadInfo &info) override;
-
-private:
- /** run binary thresholding on the given window */
- void run_binary(const Window &window);
- /** run range thresholding on the given window */
- void run_range(const Window &window);
-
- void (NEThresholdKernel::*_func)(const Window &window);
-
- const ITensor *_input; /**< Input */
- ITensor *_output; /**< Output */
- uint8_t _threshold;
- uint8_t _false_value;
- uint8_t _true_value;
- uint8_t _upper;
-};
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_NETHRESHOLDKERNEL_H */
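A sketch of binary thresholding at 128 on a U8 tensor; per the documentation above, the upper threshold is only consulted for RANGE, so it is left at zero here (other assumptions as in the first sketch):

    // Sketch: pixels above the threshold become 255, all others 0.
    #include "arm_compute/core/NEON/kernels/NEThresholdKernel.h"
    #include "arm_compute/runtime/NEON/NEScheduler.h"

    void threshold_example(arm_compute::ITensor &src, arm_compute::ITensor &dst)
    {
        arm_compute::NEThresholdKernel kernel;
        kernel.configure(&src, &dst, /*threshold=*/128, /*false_value=*/0, /*true_value=*/255,
                         arm_compute::ThresholdType::BINARY, /*upper=*/0);
        arm_compute::NEScheduler::get().schedule(&kernel, arm_compute::Window::DimY);
    }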
diff --git a/arm_compute/core/NEON/kernels/NETileKernel.h b/arm_compute/core/NEON/kernels/NETileKernel.h
deleted file mode 100644
index a64470ffd0..0000000000
--- a/arm_compute/core/NEON/kernels/NETileKernel.h
+++ /dev/null
@@ -1,76 +0,0 @@
-/*
- * Copyright (c) 2018-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_NETILEKERNEL_H
-#define ARM_COMPUTE_NETILEKERNEL_H
-
-#include "arm_compute/core/NEON/INEKernel.h"
-
-namespace arm_compute
-{
-class ITensor;
-
-/** NEON kernel to perform a tile operation */
-class NETileKernel : public INEKernel
-{
-public:
- /** Default constructor */
- NETileKernel();
- /** Prevent instances of this class from being copied (As this class contains pointers). */
- NETileKernel(const NETileKernel &) = delete;
- /** Prevent instances of this class from being copied (As this class contains pointers). */
- NETileKernel &operator=(const NETileKernel &) = delete;
- /** Allow instances of this class to be moved */
- NETileKernel(NETileKernel &&) = default;
- /** Allow instances of this class to be moved */
- NETileKernel &operator=(NETileKernel &&) = default;
- const char *name() const override
- {
- return "NETileKernel";
- }
- /** Set the source and destination of the kernel
- *
- * @param[in] input Source tensor. Data type supported: All.
- * @param[out] output Destination tensor. Same as @p input
- * @param[in] multiples Contains the number of times the input tensor should be replicated on the given dimension.
- */
- void configure(const ITensor *input, ITensor *output, const Multiples &multiples);
- /** Static function to check if given info will lead to a valid configuration of @ref NETileKernel
- *
- * @param[in] input Source tensor info. Data type supported: All.
- * @param[in] output Destination tensor info. Same as @p input
- * @param[in] multiples Contains the number of times the input tensor should be replicated on the given dimension.
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *input, const ITensorInfo *output, const Multiples &multiples);
-
- // Inherited methods overridden:
- void run(const Window &window, const ThreadInfo &info) override;
-
-private:
- const ITensor *_input;
- ITensor *_output;
-};
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_NETILEKERNEL_H */
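A sketch of tiling: Multiples carries one replication factor per dimension, so { 2 } doubles the tensor along dimension 0 and leaves the rest untouched (assumptions as in the first sketch):

    // Sketch: replicate the input twice along dimension 0.
    #include "arm_compute/core/NEON/kernels/NETileKernel.h"
    #include "arm_compute/core/Types.h"
    #include "arm_compute/runtime/NEON/NEScheduler.h"

    void tile_example(arm_compute::ITensor &src, arm_compute::ITensor &dst)
    {
        const arm_compute::Multiples multiples = { 2 };
        arm_compute::NETileKernel kernel;
        kernel.configure(&src, &dst, multiples);
        arm_compute::NEScheduler::get().schedule(&kernel, arm_compute::Window::DimY);
    }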
diff --git a/arm_compute/core/NEON/kernels/NETransposeKernel.h b/arm_compute/core/NEON/kernels/NETransposeKernel.h
deleted file mode 100644
index a14dece0d6..0000000000
--- a/arm_compute/core/NEON/kernels/NETransposeKernel.h
+++ /dev/null
@@ -1,90 +0,0 @@
-/*
- * Copyright (c) 2017-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_NETRANSPOSEKERNEL_H
-#define ARM_COMPUTE_NETRANSPOSEKERNEL_H
-
-#include "arm_compute/core/NEON/INEKernel.h"
-
-namespace arm_compute
-{
-class ITensor;
-
-/** NEON kernel which transposes the elements of a matrix.
- *
- * [width, height, batch] -> [height, width, batch]
- *
- */
-class NETransposeKernel : public INEKernel
-{
-public:
- const char *name() const override
- {
- return "NETransposeKernel";
- }
- /** Default constructor */
- NETransposeKernel();
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- NETransposeKernel(const NETransposeKernel &) = delete;
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- NETransposeKernel &operator=(const NETransposeKernel &) = delete;
- /** Allow instances of this class to be moved */
- NETransposeKernel(NETransposeKernel &&) = default;
- /** Allow instances of this class to be moved */
- NETransposeKernel &operator=(NETransposeKernel &&) = default;
- /** Default destructor */
- ~NETransposeKernel() = default;
-
- /** Initialise the kernel's input and output.
- *
- * @param[in] input Input tensor. Data types supported: All
- * @param[out] output Output tensor. Data type supported: Same as @p input
- */
- void configure(const ITensor *input, ITensor *output);
- /** Static function to check if given info will lead to a valid configuration of @ref NETransposeKernel
- *
- * @param[in] input Input tensor. Data types supported: All
- * @param[in] output Output tensor. Data type supported: Same as @p input
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *input, const ITensorInfo *output);
-
- // Inherited methods overridden:
- void run(const Window &window, const ThreadInfo &info) override;
-
-private:
- /** Common signature for all the transpose functions
- *
- * @param[in] input An input tensor. Data types supported: All
- * @param[out] output The output tensor. Data type supported: same as @p input
- * @param[in] window Region on which to execute the kernel.
- */
- using TransposeFunction = void(const ITensor *input, ITensor *output, const Window &window);
- /** Transpose function to use for the particular tensor types passed to configure() */
- TransposeFunction *_func;
- const ITensor *_input;
- ITensor *_output;
-};
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_NETRANSPOSEKERNEL_H */
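A sketch for the transpose kernel, guarding configure() behind the static validate() as the documentation above suggests (assumptions as in the first sketch):

    // Sketch: transpose a matrix after checking the tensor infos are acceptable.
    #include "arm_compute/core/NEON/kernels/NETransposeKernel.h"
    #include "arm_compute/runtime/NEON/NEScheduler.h"

    void transpose_example(arm_compute::ITensor &src, arm_compute::ITensor &dst)
    {
        if(arm_compute::NETransposeKernel::validate(src.info(), dst.info()).error_code() != arm_compute::ErrorCode::OK)
        {
            return; // invalid configuration; nothing to run
        }
        arm_compute::NETransposeKernel kernel;
        kernel.configure(&src, &dst);
        arm_compute::NEScheduler::get().schedule(&kernel, arm_compute::Window::DimY);
    }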
diff --git a/arm_compute/core/NEON/kernels/NEUpsampleLayerKernel.h b/arm_compute/core/NEON/kernels/NEUpsampleLayerKernel.h
deleted file mode 100644
index 1ea3f974e7..0000000000
--- a/arm_compute/core/NEON/kernels/NEUpsampleLayerKernel.h
+++ /dev/null
@@ -1,100 +0,0 @@
-/*
- * Copyright (c) 2018-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_NEUPSAMPLELAYERKERNEL_H
-#define ARM_COMPUTE_NEUPSAMPLELAYERKERNEL_H
-
-#include "arm_compute/core/NEON/INEKernel.h"
-
-namespace arm_compute
-{
-class ITensor;
-
-/** Interface for the Upsample layer kernel.*/
-class NEUpsampleLayerKernel : public INEKernel
-{
-public:
- const char *name() const override
- {
- return "NEUpsampleLayerKernel";
- }
- /** Default constructor */
- NEUpsampleLayerKernel();
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- NEUpsampleLayerKernel(const NEUpsampleLayerKernel &) = delete;
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- NEUpsampleLayerKernel &operator=(const NEUpsampleLayerKernel &) = delete;
- /** Default Move Constructor. */
- NEUpsampleLayerKernel(NEUpsampleLayerKernel &&) = default;
- /** Default move assignment operator */
- NEUpsampleLayerKernel &operator=(NEUpsampleLayerKernel &&) = default;
- /** Default destructor */
- ~NEUpsampleLayerKernel() = default;
- /** Set the input and output tensors.
- *
- * @param[in] input Source tensor. Data types supported: QASYMM8/F16/F32.
- * @param[out] output Destination tensor. Data types supported: same as @p input.
- * @param[in] info Contains stride information described in @ref Size2D.
- * @param[in] policy Defines the policy to fill the intermediate pixels.
- *
- */
- void configure(const ITensor *input, ITensor *output, const Size2D &info, const InterpolationPolicy policy);
- /** Static function to check if given info will lead to a valid configuration of @ref NEUpsampleLayerKernel
- *
- * @param[in] input Source tensor info. Data types supported: QASYMM8/F16/F32.
- * @param[in] output Destination tensor info. Data types supported: same as @p input.
- * @param[in] info Contains stride information described in @ref Size2D.
- * @param[in] policy Defines the policy to fill the intermediate pixels.
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *input, const ITensorInfo *output, const Size2D &info, const InterpolationPolicy policy);
-
- // Inherited methods overridden:
- void run(const Window &window, const ThreadInfo &info) override;
-
-private:
- /** Function to run upsample layer (NCHW)
- *
- * @param[in] window Region on which to execute the kernel. (Must be a valid region of the window returned by window()).
- */
- template <typename T, int S>
- void upsample_nchw(const Window &window);
- /** Function to run upsample layer (NHWC)
- *
- * @param[in] window Region on which to execute the kernel. (Must be a valid region of the window returned by window()).
- */
- template <typename T, int S>
- void upsample_nhwc(const Window &window);
-
- using UpsampleFunctionPtr = void (NEUpsampleLayerKernel::*)(const Window &window);
-
-private:
- UpsampleFunctionPtr _func;
- const ITensor *_input;
- ITensor *_output;
- Size2D _info;
- unsigned int _num_elems_processed_per_iteration_x;
-};
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_NEUPSAMPLELAYERKERNEL_H */
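A sketch of a 2x nearest-neighbour upsample, matching the configure() contract above (assumptions as in the first sketch):

    // Sketch: double the spatial size, filling intermediate pixels by nearest neighbour.
    #include "arm_compute/core/NEON/kernels/NEUpsampleLayerKernel.h"
    #include "arm_compute/core/Size2D.h"
    #include "arm_compute/runtime/NEON/NEScheduler.h"

    void upsample_example(arm_compute::ITensor &src, arm_compute::ITensor &dst)
    {
        arm_compute::NEUpsampleLayerKernel kernel;
        kernel.configure(&src, &dst, arm_compute::Size2D(2, 2), arm_compute::InterpolationPolicy::NEAREST_NEIGHBOR);
        arm_compute::NEScheduler::get().schedule(&kernel, arm_compute::Window::DimY);
    }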
diff --git a/arm_compute/core/NEON/kernels/NEWarpKernel.h b/arm_compute/core/NEON/kernels/NEWarpKernel.h
deleted file mode 100644
index 61ca21eb48..0000000000
--- a/arm_compute/core/NEON/kernels/NEWarpKernel.h
+++ /dev/null
@@ -1,129 +0,0 @@
-/*
- * Copyright (c) 2016-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_NEWARPKERNEL_H
-#define ARM_COMPUTE_NEWARPKERNEL_H
-
-#include "arm_compute/core/NEON/INEKernel.h"
-#include "arm_compute/core/Types.h"
-
-#include <array>
-#include <cstdint>
-namespace arm_compute
-{
-class ITensor;
-
-/** Common interface for warp affine and warp perspective */
-class INEWarpKernel : public INEKernel
-{
-public:
- /** Default constructor */
- INEWarpKernel();
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- INEWarpKernel(const INEWarpKernel &) = delete;
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- INEWarpKernel &operator=(const INEWarpKernel &) = delete;
- /** Allow instances of this class to be moved */
- INEWarpKernel(INEWarpKernel &&) = default;
- /** Allow instances of this class to be moved */
- INEWarpKernel &operator=(INEWarpKernel &&) = default;
- /** Initialise the kernel's input, output and border mode.
- *
- * @param[in] input Source tensor. Data type supported: U8.
- * @param[out] output Destination tensor. Data type supported: U8.
- * @param[in] matrix The perspective or affine matrix of floats to use. Must be 2x3 for affine and 3x3 for perspective.
- * The matrix argument requires 9 values; for the affine case the last 3 values are ignored.
- * @param[in] border_mode Strategy to use for borders
- * @param[in] constant_border_value Constant value used for filling the border.
- */
- virtual void configure(const ITensor *input, ITensor *output, const std::array<float, 9> &matrix, BorderMode border_mode, uint8_t constant_border_value);
-
- // Inherited methods overridden:
- void run(const Window &window, const ThreadInfo &info) override;
-
- // Inherited methods overridden:
- BorderSize border_size() const override;
-
-protected:
- /** function to perform warp affine or warp perspective on the given window when border mode == UNDEFINED
- *
- * @param[in] window Region on which to execute the kernel
- */
- virtual void warp_undefined(const Window &window) = 0;
- /** function to perform warp affine or warp perspective on the given window when border mode == CONSTANT
- *
- * @param[in] window Region on which to execute the kernel
- */
- virtual void warp_constant(const Window &window) = 0;
- /** function to perform warp affine or warp perspective on the given window when border mode == REPLICATE
- *
- * @param[in] window Region on which to execute the kernel
- */
- virtual void warp_replicate(const Window &window) = 0;
- /** Common signature for all the specialised warp functions
- *
- * @param[in] window Region on which to execute the kernel.
- */
- void (INEWarpKernel::*_func)(const Window &window);
-
- const ITensor *_input; /**< Input Tensor */
- ITensor *_output; /**< Output Tensor */
- uint8_t _constant_border_value; /**< Constant value used for filling the border. This value is used for those pixels out of the ROI when the border mode is CONSTANT */
- std::array<float, 9> _matrix; /**< The affine or perspective matrix. Must be 2x3 for warp affine or 3x3 for warp perspective of type float. */
-};
-
-/** Template interface for the kernel to compute warp affine
- *
- */
-template <InterpolationPolicy interpolation>
-class NEWarpAffineKernel : public INEWarpKernel
-{
-private:
- const char *name() const override
- {
- return "NEWarpAffineKernel";
- }
- // Inherited methods overridden:
- void warp_undefined(const Window &window) override;
- void warp_constant(const Window &window) override;
- void warp_replicate(const Window &window) override;
-};
-
-/** Template interface for the kernel to compute warp perspective
- *
- */
-template <InterpolationPolicy interpolation>
-class NEWarpPerspectiveKernel : public INEWarpKernel
-{
-private:
- const char *name() const override
- {
- return "NEWarpPerspectiveKernel";
- }
- // Inherited methods overridden:
- void warp_undefined(const Window &window) override;
- void warp_constant(const Window &window) override;
- void warp_replicate(const Window &window) override;
-};
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_NEWARPKERNEL_H */
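A sketch for the affine warp. The nine matrix values are taken as a parameter here because their in-memory layout is the kernel's own convention (2x3 coefficients for affine, the last three of the nine ignored, as documented above); other assumptions as in the first sketch:

    // Sketch: nearest-neighbour affine warp with a caller-supplied matrix and a
    // constant black border.
    #include "arm_compute/core/NEON/kernels/NEWarpKernel.h"
    #include "arm_compute/runtime/NEON/NEScheduler.h"
    #include <array>

    void warp_affine_example(arm_compute::ITensor &src, arm_compute::ITensor &dst,
                             const std::array<float, 9> &matrix)
    {
        arm_compute::NEWarpAffineKernel<arm_compute::InterpolationPolicy::NEAREST_NEIGHBOR> kernel;
        kernel.configure(&src, &dst, matrix, arm_compute::BorderMode::CONSTANT, /*constant_border_value=*/0);
        arm_compute::NEScheduler::get().schedule(&kernel, arm_compute::Window::DimY);
    }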
diff --git a/arm_compute/core/NEON/kernels/NEWeightsReshapeKernel.h b/arm_compute/core/NEON/kernels/NEWeightsReshapeKernel.h
deleted file mode 100644
index b68cb50c7b..0000000000
--- a/arm_compute/core/NEON/kernels/NEWeightsReshapeKernel.h
+++ /dev/null
@@ -1,109 +0,0 @@
-/*
- * Copyright (c) 2017-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_NEWEIGHTSRESHAPEKERNEL_H
-#define ARM_COMPUTE_NEWEIGHTSRESHAPEKERNEL_H
-
-#include "arm_compute/core/NEON/INEKernel.h"
-
-namespace arm_compute
-{
-class ITensor;
-
-/** NEON kernel to perform reshaping on the weights used by convolution and locally connected layers
- *
- * Rearranges each 3-dimensional kernel to a single row, leading to a matrix with linearized kernels.
- * In combination with the @ref NEIm2ColKernel, this can transform a convolution to a matrix multiplication.
- *
- * For example, assuming a 3D weight kernel of 3x3 dimensions and a depth of 2, we have:
- * @f[
- * \left( \begin{array}{ccc}
- * a000 & a001 & a002 \\
- * a010 & a011 & a012 \\
- * a020 & a021 & a022 \\
- * \end{array} \right)
- * \left( \begin{array}{ccc}
- * a100 & a101 & a102 \\
- * a110 & a111 & a112 \\
- * a120 & a121 & a122 \\
- * \end{array} \right)
- * \rightarrow
- * \left( \begin{array}{ccccccccc}
- * a000 & a001 & a002 & a010 & a011 & a012 & a020 & a021 & a022 & a100 & a101 & a102 & a110 & a111 & a112 & a120 & a121 & a122 \\
- * \end{array} \right)
- * @f]
- */
-class NEWeightsReshapeKernel : public INEKernel
-{
-public:
- const char *name() const override
- {
- return "NEWeightsReshapeKernel";
- }
- /** Constructor.*/
- NEWeightsReshapeKernel();
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- NEWeightsReshapeKernel(const NEWeightsReshapeKernel &) = delete;
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- NEWeightsReshapeKernel &operator=(const NEWeightsReshapeKernel &) = delete;
- /** Allow instances of this class to be moved */
- NEWeightsReshapeKernel(NEWeightsReshapeKernel &&) = default;
- /** Allow instances of this class to be moved */
- NEWeightsReshapeKernel &operator=(NEWeightsReshapeKernel &&) = default;
- /** Default destructor */
- ~NEWeightsReshapeKernel() = default;
- /** Set the input and output of the kernel.
- *
- * @param[in] input The input tensor to convert. Weights are 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM] if shared,
- * and 5D tensor with dimensions [kernel_x, kernel_y, IFM, OFM, num_patches] if unshared.
- * Data types supported: QASYMM8/QASYMM8_SIGNED/QSYMM8_PER_CHANNEL/BFLOAT16/FP16/F32
- * @param[in] bias The shared biases tensor to append. Bias is 1D tensor with dimensions [OFM] if shared and 2D tensor with
- * dimensions [OFM, num_patches] if unshared. Data types supported: Same as @p input
- * @warning Appending biases to weights reshaped matrix is not supported for quantized asymmetric types.
- * @param[out] output The output tensor. Data types supported: Same as @p input
- */
- void configure(const ITensor *input, const ITensor *bias, ITensor *output);
- /** Static function to check if given info will lead to a valid configuration of @ref NEWeightsReshapeKernel
- *
- * @param[in] input The input tensor to convert. Weights are 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM] if shared,
- * and 5D tensor with dimensions [kernel_x, kernel_y, IFM, OFM, num_patches] if unshared.
- * Data types supported: QASYMM8/QASYMM8_SIGNED/QSYMM8_PER_CHANNEL/BFLOAT16/F16/F32
- * @param[in] biases The shared biases tensor to append. Bias is 1D tensor with dimensions [OFM] if shared and 2D tensor with
- * dimensions [OFM, num_patches] if unshared. Data types supported: Same as @p input
- * @warning Appending biases to weights reshaped matrix is not supported for quantized asymmetric types.
- * @param[in] output The output tensor. Should be a 2D Tensor. Data types supported: Same as @p input
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *input, const ITensorInfo *biases, const ITensorInfo *output);
-
- // Inherited methods overridden:
- void run(const Window &window, const ThreadInfo &info) override;
-
-private:
- const ITensor *_input;
- const ITensor *_bias;
- ITensor *_output;
-};
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_NEWEIGHTSRESHAPEKERNEL_H */
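A sketch for the weights-reshape kernel; passing nullptr for the bias skips the append step described above (assumptions as in the first sketch):

    // Sketch: linearise 4-D convolution weights into rows, without appending biases.
    #include "arm_compute/core/NEON/kernels/NEWeightsReshapeKernel.h"
    #include "arm_compute/runtime/NEON/NEScheduler.h"

    void weights_reshape_example(arm_compute::ITensor &weights, arm_compute::ITensor &reshaped)
    {
        arm_compute::NEWeightsReshapeKernel kernel;
        kernel.configure(&weights, /*bias=*/nullptr, &reshaped);
        arm_compute::NEScheduler::get().schedule(&kernel, arm_compute::Window::DimY);
    }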
diff --git a/arm_compute/core/NEON/kernels/NEWidthConcatenateLayerKernel.h b/arm_compute/core/NEON/kernels/NEWidthConcatenateLayerKernel.h
deleted file mode 100644
index f22f18f09f..0000000000
--- a/arm_compute/core/NEON/kernels/NEWidthConcatenateLayerKernel.h
+++ /dev/null
@@ -1,85 +0,0 @@
-/*
- * Copyright (c) 2018-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#ifndef ARM_COMPUTE_NEWIDTHCONCATENATELAYERKERNEL_H
-#define ARM_COMPUTE_NEWIDTHCONCATENATELAYERKERNEL_H
-
-#include "arm_compute/core/NEON/INEKernel.h"
-#include "arm_compute/core/Types.h"
-
-namespace arm_compute
-{
-// Forward declarations
-class ITensor;
-
-/** Interface for the width concatenate kernel.
- * The input tensor will be concatenated into the output tensor.
- */
-class NEWidthConcatenateLayerKernel : public INEKernel
-{
-public:
- const char *name() const override
- {
- return "NEWidthConcatenateLayerKernel";
- }
- /** Default constructor */
- NEWidthConcatenateLayerKernel();
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- NEWidthConcatenateLayerKernel(const NEWidthConcatenateLayerKernel &) = delete;
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- NEWidthConcatenateLayerKernel &operator=(const NEWidthConcatenateLayerKernel &) = delete;
- /** Allow instances of this class to be moved */
- NEWidthConcatenateLayerKernel(NEWidthConcatenateLayerKernel &&) = default;
- /** Allow instances of this class to be moved */
- NEWidthConcatenateLayerKernel &operator=(NEWidthConcatenateLayerKernel &&) = default;
- /** Default destructor */
- ~NEWidthConcatenateLayerKernel() = default;
- /** Initialise the kernel's inputs and output
- *
- * @param[in] input Input tensor. Data types supported: All
- * @param[in] width_offset The offset on the X axis.
- * @param[in,out] output Output tensor. Data types supported: Same as @p input.
- *
- */
- void configure(const ITensor *input, unsigned int width_offset, ITensor *output);
- /** Static function to check if given info will lead to a valid configuration of @ref NEWidthConcatenateLayerKernel
- *
- * @param[in] input Input tensor info. Data types supported: All
- * @param[in] width_offset The offset on the X axis.
- * @param[in] output Output tensor info. Data types supported: Same as @p input.
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *input, unsigned int width_offset, const ITensorInfo *output);
-
- // Inherited methods overridden:
- void run(const Window &window, const ThreadInfo &info) override;
-
-private:
- const ITensor *_input;
- ITensor *_output;
- unsigned int _width_offset;
-};
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_NEWIDTHCONCATENATELAYERKERNEL_H */
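A sketch for width concatenation: one kernel per input, each configured with the running width offset into the shared output (assumptions as in the first sketch):

    // Sketch: concatenate two tensors along x into a pre-sized output.
    #include "arm_compute/core/NEON/kernels/NEWidthConcatenateLayerKernel.h"
    #include "arm_compute/runtime/NEON/NEScheduler.h"

    void width_concat_example(arm_compute::ITensor &in0, arm_compute::ITensor &in1, arm_compute::ITensor &out)
    {
        arm_compute::NEWidthConcatenateLayerKernel k0, k1;
        k0.configure(&in0, /*width_offset=*/0, &out);
        k1.configure(&in1, /*width_offset=*/static_cast<unsigned int>(in0.info()->dimension(0)), &out);
        arm_compute::NEScheduler::get().schedule(&k0, arm_compute::Window::DimY);
        arm_compute::NEScheduler::get().schedule(&k1, arm_compute::Window::DimY);
    }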
diff --git a/arm_compute/core/NEON/kernels/NEWinogradConvolutionLayerKernel.h b/arm_compute/core/NEON/kernels/NEWinogradConvolutionLayerKernel.h
deleted file mode 100644
index 1740df0312..0000000000
--- a/arm_compute/core/NEON/kernels/NEWinogradConvolutionLayerKernel.h
+++ /dev/null
@@ -1,596 +0,0 @@
-/*
- * Copyright (c) 2017-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_NEGEMMWINOGRADCONVOLUTIONLAYERKERNEL_H
-#define ARM_COMPUTE_NEGEMMWINOGRADCONVOLUTIONLAYERKERNEL_H
-
-#include "arm_compute/core/NEON/INEKernel.h"
-#include "arm_compute/core/NEON/kernels/convolution/common/convolution.hpp"
-#include "arm_compute/core/NEON/kernels/convolution/common/tensor.hpp"
-#include "arm_compute/core/NEON/kernels/convolution/winograd/winograd_layer.hpp"
-
-namespace arm_compute
-{
-// Forward declarations
-class ITensor;
-
-/** Interface for the NEON kernel to perform Winograd input transform. */
-class INEWinogradLayerTransformInputKernel : public INEKernel
-{
-public:
- /** Get the working space required to perform the transformation.
- *
- * Note, the working space is only required when performing the
- * transformation - hence it can be reused whenever the transformation is
- * not running.
- *
- * @param[in] num_threads The greatest number of threads that will be used to execute the transform.
- * @return Size of working space required in bytes.
- */
- virtual unsigned int get_working_space_size(unsigned int num_threads) const = 0;
-
- /** Determine how much memory (in units of TIn) to allocate for the
- * transformed input.
- *
- * @param[in] num_batches Number of batches in the input tensor.
- * @param[in] num_channels Number of feature maps in the input tensor.
- * @param[in] num_rows Number of rows in each feature map.
- * @param[in] num_cols Number of columns in each feature map.
- * @param[in] same_padding Use "SAME" padding, otherwise use "VALID".
- *
- * @return Storage size (in units of TIn) required.
- */
- virtual unsigned int get_input_storage_size(int num_batches, int num_channels, int num_rows, int num_cols, bool same_padding) const = 0;
-
- /** Gets the stride between matrices in the input workspace
- *
- * @param[in] num_batches Number of batches in the input tensor.
- * @param[in] num_channels Number of feature maps in the input tensor.
- * @param[in] num_rows Number of rows in each feature map.
- * @param[in] num_cols Number of columns in each feature map.
- * @param[in] same_padding Use "SAME" padding, otherwise use "VALID".
- *
- * @return Stride expressed in bytes.
- */
- virtual int get_matrix_stride(int num_batches, int num_channels, int num_rows, int num_cols, bool same_padding) const = 0;
-
- /** Configure the input transform kernel.
- *
- * @param[in] input_nhwc Input tensor in NHWC data layout format.
- * @param[in] num_batches Number of batches in input tensor.
- * @param[in] num_rows Number of rows in input tensor.
- * @param[in] num_cols Number of columns in input tensor.
- * @param[in] num_channels Number of channels in input tensor.
- * @param[in] padding Padding type.
- * @param[out] output Base of output matrices.
- * @param[in] matrix_stride Stride between output matrices.
- * @param[in] workspace Tensor to be used as the working space during the computation.
- */
- virtual void configure(const ITensor *input_nhwc, const int num_batches, const int num_rows, const int num_cols, const int num_channels,
- const PaddingType padding, ITensor *output, const int matrix_stride, ITensor *workspace) = 0;
-
- /** Destructor */
- virtual ~INEWinogradLayerTransformInputKernel()
- {
- }
-};
-
-/** NEON kernel to perform Winograd input transform. */
-template <typename T, int OutputTileRows, int OutputTileCols, int KernelRows, int KernelCols>
-class NEWinogradLayerTransformInputKernel : public INEWinogradLayerTransformInputKernel
-{
-public:
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- NEWinogradLayerTransformInputKernel(const NEWinogradLayerTransformInputKernel &) = delete;
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- NEWinogradLayerTransformInputKernel &operator=(const NEWinogradLayerTransformInputKernel &) = delete;
- /** Allow instances of this class to be moved */
- NEWinogradLayerTransformInputKernel(NEWinogradLayerTransformInputKernel &&) = default;
- /** Allow instances of this class to be moved */
- NEWinogradLayerTransformInputKernel &operator=(NEWinogradLayerTransformInputKernel &&) = default;
- /** Default destructor */
- ~NEWinogradLayerTransformInputKernel() = default;
-
- /** Determine how much memory (in units of TIn) to allocate for the
- * transformed input.
- *
- * @param[in] num_batches Number of batches in the input tensor.
- * @param[in] num_channels Number of feature maps in the input tensor.
- * @param[in] num_rows Number of rows in each feature map.
- * @param[in] num_cols Number of columns in each feature map.
- * @param[in] same_padding Use "SAME" padding, otherwise use "VALID".
- *
- * @return Storage size (in units of TIn) required.
- */
- unsigned int get_input_storage_size(
- int num_batches,
- int num_channels,
- int num_rows,
- int num_cols,
- bool same_padding) const override;
-
- /** Get the working space required to perform the transformation.
- *
- * Note, the working space is only required when performing the
- * transformation - hence it can be reused whenever the transformation is
- * not running.
- *
- * @param[in] num_threads The greatest number of threads that will be used to execute the transform.
- *
- * @return Size of working space required in bytes.
- */
- unsigned int get_working_space_size(unsigned int num_threads) const override;
-
- /** Gets the stride between matrices in the input workspace
- *
- * @param[in] num_batches Number of batches in the input tensor.
- * @param[in] num_channels Number of feature maps in the input tensor.
- * @param[in] num_rows Number of rows in each feature map.
- * @param[in] num_cols Number of columns in each feature map.
- * @param[in] same_padding Use "SAME" padding, otherwise use "VALID".
- *
- * @return Stride expressed in bytes.
- */
- int get_matrix_stride(
- int num_batches,
- int num_channels,
- int num_rows,
- int num_cols,
- bool same_padding) const override;
-
- /** Default constructor */
- NEWinogradLayerTransformInputKernel();
-
- const char *name() const override
- {
- return "NEWinogradLayerTransformInputKernel";
- }
-
- /** Configure the input transform kernel.
- *
- * @param[in] input_nhwc Input tensor. Data types supported: F16/F32. Layout supported: NHWC.
- * @param[in] num_batches Number of batches in input tensor.
- * @param[in] num_rows Number of rows in input tensor.
- * @param[in] num_cols Number of columns in input tensor.
- * @param[in] num_channels Number of channels in input tensor.
- * @param[in] padding Padding type.
- * @param[out] output Base of output matrices.
- * @param[in] matrix_stride Stride between output matrices.
- * @param[in] workspace Tensor to be used as the working space during the computation.
- */
- void configure(
- const ITensor *input_nhwc,
- const int num_batches,
- const int num_rows,
- const int num_cols,
- const int num_channels,
- const PaddingType padding,
- ITensor *output,
- const int matrix_stride,
- ITensor *workspace) override;
-
- // Inherited methods overridden:
- void run(const Window &window, const ThreadInfo &info) override;
-
- /** Winograd base kernel */
- using WinogradBase = winograd::WinogradGEMM<OutputTileRows, OutputTileCols, KernelRows, KernelCols, winograd::WinogradRoots::Integers>;
- /** Winograd convolution kernel */
- using WinogradConv = typename WinogradBase::template Convolution<T, T>;
-
- /** Static function to check if given info will lead to a valid configuration of @ref NEWinogradLayerTransformInputKernel
- *
- * @param[in] input First tensor input info. Data types supported: F16/F32.
- * @param[in] output Output tensor info. Data types supported: same as @p input.
- * @param[in] winograd_info Contains Winograd's information described in @ref WinogradInfo
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *input, const ITensorInfo *output, const WinogradInfo &winograd_info);
-
-private:
- using InputTransform = typename WinogradBase::template InputTransform<T, T>;
-
- std::unique_ptr<InputTransform> _transform{ nullptr };
- const ITensor *_input_nhwc;
- int _num_batches; /**< Number of batches in input tensor. */
- int _num_rows; /**< Number of rows in input tensor. */
- int _num_cols; /**< Number of columns in input tensor. */
- int _num_channels; /**< Number of channels in input tensor. */
- PaddingType _padding; /**< Padding type. */
- ITensor *_output; /**< Base of output matrices. */
- int _matrix_stride; /**< Stride between output matrices. */
- int _padding_top; /**< Padding to apply to the top of the image. */
- int _padding_left; /**< Padding to apply to the left of the image. */
- int _padding_right; /**< Padding to apply to the right of the image. */
- int _padding_bottom; /**< Padding to apply to the bottom of the image. */
- ITensor *_workspace;
-};
-
-/** Interface for the NEON kernel to perform Winograd output transform. */
-class INEWinogradLayerTransformOutputKernel : public INEKernel
-{
-public:
- /** Get the working space required to perform the transformation.
- *
- * Note, the working space is only required when performing the
- * transformation - hence it can be reused whenever the transformation is
- * not running.
- *
- * @param[in] num_threads The greatest number of threads that will be used to execute the transform.
- *
- * @return Size of working space required in bytes.
- */
- virtual unsigned int get_working_space_size(unsigned int num_threads) const = 0;
-
- /** Determine how much memory (in units of TOut) to allocate for the
- * (Winograd domain) output.
- *
- * @param[in] num_batches Number of batches in the output tensor.
- * @param[in] num_rows Number of rows in each feature map of the input tensor.
- * @param[in] num_cols Number of columns in each feature map of the input tensor.
- * @param[in] num_output_channels Number of feature maps in the output tensor.
- *
- * @return Storage size (in units of TOut) required.
- */
- virtual unsigned int get_output_storage_size(int num_batches, int num_rows, int num_cols, int num_output_channels) const = 0;
-
-    /** Gets the stride between matrices in the output workspace
- *
- * @param[in] num_batches Number of batches in the output tensor.
- * @param[in] num_rows Number of rows in each feature map of the input tensor.
- * @param[in] num_cols Number of columns in each feature map of the input tensor.
- * @param[in] num_output_channels Number of feature maps in the output tensor.
- *
- * @return Stride expressed in bytes.
- */
- virtual int get_matrix_stride(int num_batches, int num_rows, int num_cols, int num_output_channels) const = 0;
-
- /** Get the output shape of a convolution.
- *
- * @param[in] num_rows Number of rows in each feature map of the input tensor.
- * @param[in] num_cols Number of columns in each feature map of the input tensor.
- * @param[in] padding_same True if padding is SAME, false otherwise
- *
- * @return Shape of the output tensor
- */
- virtual std::pair<unsigned int, unsigned int> get_output_shape(
- int num_rows, /* Number of rows in each feature map of the input tensor. */
- int num_cols, /* Number of columns in each feature map of the input tensor. */
- bool padding_same /* True if padding is SAME, false otherwise */
- ) const = 0;
-
- /** Configure the output transform kernel.
- *
- * @param[in] biases Pointer to the biases tensor.
- * @param[in] transformed_output Pointer to working space for the output tensor in the Winograd domain.
- * @param[in] matrix_stride Output matrix stride, can be computed with winograd::WinogradGEMM<2, 2, 3, 3>::Convolution<float, float>::get_output_matrix_stride()
- * @param[out] output_nhwc Pointer to a tensor in NHWC data layout ordered output tensor, in the spatial domain.
- * @param[in] num_batches Number of batches in the input tensor.
- * @param[in] num_rows Number of rows in output tensor.
- * @param[in] num_cols Number of columns in output tensor.
- * @param[in] num_channels Number of feature maps in the output tensor.
- * @param[in] workspace Tensor to be used as the working space during the computation.
- * @param[in] activation Activation to be used
- */
- virtual void configure(
- const ITensor *biases,
- const ITensor *transformed_output,
- const int matrix_stride,
- ITensor *output_nhwc,
- const int num_batches,
- const int num_rows,
- const int num_cols,
- const int num_channels,
- ITensor *workspace,
- const arm_gemm::Activation &activation) = 0;
-
- virtual ~INEWinogradLayerTransformOutputKernel()
- {
- }
-};
-
-/** NEON kernel to perform Winograd output transform. */
-template <typename T, int OutputTileRows, int OutputTileCols, int KernelRows, int KernelCols>
-class NEWinogradLayerTransformOutputKernel : public INEWinogradLayerTransformOutputKernel
-{
-public:
- const char *name() const override
- {
- return "NEWinogradLayerTransformOutputKernel";
- }
- /** Constructor */
- NEWinogradLayerTransformOutputKernel();
-
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- NEWinogradLayerTransformOutputKernel(const NEWinogradLayerTransformOutputKernel &) = delete;
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- NEWinogradLayerTransformOutputKernel &operator=(const NEWinogradLayerTransformOutputKernel &) = delete;
- /** Allow instances of this class to be moved */
- NEWinogradLayerTransformOutputKernel(NEWinogradLayerTransformOutputKernel &&) = default;
- /** Allow instances of this class to be moved */
- NEWinogradLayerTransformOutputKernel &operator=(NEWinogradLayerTransformOutputKernel &&) = default;
- /** Default destructor */
- ~NEWinogradLayerTransformOutputKernel() = default;
-
- // Inherited methods overridden:
- /** Determine how much memory (in units of TOut) to allocate for the
- * (Winograd domain) output.
- *
- * @param[in] num_batches Number of batches in the output tensor.
- * @param[in] num_rows Number of rows in each feature map of the input tensor.
- * @param[in] num_cols Number of columns in each feature map of the input tensor.
- * @param[in] num_output_channels Number of feature maps in the output tensor.
- *
- * @return Storage size (in units of TOut) required.
- */
- unsigned int get_output_storage_size(int num_batches, int num_rows, int num_cols, int num_output_channels) const override;
-
-    /** Gets the stride between matrices in the output workspace
- *
- * @param[in] num_batches Number of batches in the output tensor.
- * @param[in] num_rows Number of rows in each feature map of the input tensor.
- * @param[in] num_cols Number of columns in each feature map of the input tensor.
- * @param[in] num_output_channels Number of feature maps in the output tensor.
- *
- * @return Stride expressed in bytes.
- */
- int get_matrix_stride(int num_batches, int num_rows, int num_cols, int num_output_channels) const override;
- /** Get the output shape of a convolution.
- *
- * @param[in] num_rows Number of rows in each feature map of the input tensor.
- * @param[in] num_cols Number of columns in each feature map of the input tensor.
- * @param[in] padding_same True if padding is SAME, false otherwise
- *
- * @return Shape of the output tensor
- */
- std::pair<unsigned int, unsigned int> get_output_shape(
- int num_rows, /* Number of rows in each feature map of the input tensor. */
- int num_cols, /* Number of columns in each feature map of the input tensor. */
- bool padding_same) const override;
-
- /** Get the working space required to perform the transformation.
- *
- * Note, the working space is only required when performing the
- * transformation - hence it can be reused whenever the transformation is
- * not running.
- *
- * @param[in] num_threads The greatest number of threads that will be used to execute the transform.
- *
- * @return Size of working space required in bytes.
- */
- unsigned int get_working_space_size(unsigned int num_threads) const override;
-
- /** Configure the output transform kernel.
- *
- * @param[in] biases Pointer to the biases tensor.
- * @param[in] transformed_output Pointer to working space for the output tensor in the Winograd domain.
- * @param[in] matrix_stride Output matrix stride, can be computed with winograd::WinogradGEMM<2, 2, 3, 3>::Convolution<float, float>::get_output_matrix_stride()
- * @param[out] output_nhwc Pointer to a tensor with NHWC data layout, in the spatial domain.
- * @param[in] num_batches Number of batches in the input tensor.
- * @param[in] num_rows Number of rows in output tensor.
- * @param[in] num_cols Number of columns in output tensor.
- * @param[in] num_channels Number of feature maps in the output tensor.
- * @param[in] workspace Tensor to be used as the working space during the computation.
- * @param[in] activation Activation to be used
- */
- void configure(
- const ITensor *biases,
- const ITensor *transformed_output,
- const int matrix_stride,
- ITensor *output_nhwc,
- const int num_batches,
- const int num_rows,
- const int num_cols,
- const int num_channels,
- ITensor *workspace,
- const arm_gemm::Activation &activation) override;
-
- void run(const Window &window, const ThreadInfo &info) override;
-
- /** Static function to check if given info will lead to a valid configuration of @ref NEWinogradLayerTransformOutputKernel
- *
- * @param[in] input Source tensor info with shape [C, N, 16, batches] or [C, N, 36, batches]. Data types supported: F16/F32.
- * @param[in] bias Biases tensor info. Shared biases supported. Biases are 1D tensor with dimensions [OFM]. It can be a nullptr. Data type supported: as @p input
- * @param[in] output Destination tensor info with shape [output_convolved_dims.width, output_convolved_dims.height, C, batches]. Data type supported: same as @p input
- * @param[in] winograd_info Contains Winograd's information described in @ref WinogradInfo
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output, const WinogradInfo &winograd_info);
-
-private:
- using WinogradBase = winograd::WinogradGEMM<OutputTileRows, OutputTileCols, KernelRows, KernelCols, winograd::WinogradRoots::Integers>;
- using WinogradConv = typename WinogradBase::template Convolution<T, T>;
- using OutputTransform = typename WinogradBase::template OutputTransform<T, T>;
-
- std::unique_ptr<OutputTransform> _transform{ nullptr };
- const ITensor *_biases;
- const ITensor *_transformed_output;
- ITensor *_workspace;
- int _matrix_stride;
- int _matrix_row_stride;
- ITensor *_output_nhwc;
- int _num_batches;
- int _num_rows;
- int _num_cols;
- int _num_channels;
-};
-
-/** Interface for the NEON kernel to perform Winograd weights transform. */
-class INEWinogradLayerTransformWeightsKernel : public INEKernel
-{
-public:
-    /** Default copy constructor */
-    INEWinogradLayerTransformWeightsKernel(const INEWinogradLayerTransformWeightsKernel &) = default;
-    /** Default copy assignment operator */
-    INEWinogradLayerTransformWeightsKernel &operator=(const INEWinogradLayerTransformWeightsKernel &) = default;
- /** Allow instances of this class to be moved */
- INEWinogradLayerTransformWeightsKernel(INEWinogradLayerTransformWeightsKernel &&) = default;
- /** Allow instances of this class to be moved */
- INEWinogradLayerTransformWeightsKernel &operator=(INEWinogradLayerTransformWeightsKernel &&) = default;
-
- INEWinogradLayerTransformWeightsKernel()
- {
- }
- virtual ~INEWinogradLayerTransformWeightsKernel()
- {
- }
- /** Determine how much memory (in units of T) to allocate for the
- * transformed weights.
- *
- * @param[in] num_output_channels Number of output feature maps.
- * @param[in] num_input_channels Number of input feature maps.
- *
- * @return Storage size (in units of T) required.
- */
- virtual unsigned int get_weight_storage_size(int num_output_channels, int num_input_channels) const = 0;
-    /** Gets the stride between matrices in the kernel workspace
- *
- * @param[in] num_output_channels Number of output feature maps.
- * @param[in] num_input_channels Number of input feature maps.
- *
- * @return Stride expressed in bytes.
- */
- virtual int get_matrix_stride(int num_output_channels, int num_input_channels) const = 0;
-
- /** Configure the weights transform kernel.
- *
- * @param[in] weights_hwio Pointer to the weights tensor
- * @param[out] output Pointer to working space for the output tensor in the Winograd domain.
- * @param[in] matrix_stride Stride across matrices in the output workspace.
- * @param[in] num_output_channels Number of filters.
- * @param[in] num_input_channels Number of channels in each filter.
- */
-    virtual void configure(const ITensor *weights_hwio, ITensor *output, const int matrix_stride, const int num_output_channels, const int num_input_channels) = 0;
-
- /** Static function to check if given info will lead to a valid configuration of @ref NEWinogradLayerTransformWeightsKernel
- *
- * @param[in] input First tensor input info. Data types supported: F16/F32.
- * @param[in] weights Weights tensor info. Data types supported: same as @p input.
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *input, const ITensorInfo *weights);
-};
-
-/** NEON kernel to perform Winograd weights transform. */
-template <typename T, int OutputTileRows, int OutputTileCols, int KernelRows, int KernelCols>
-class NEWinogradLayerTransformWeightsKernel final : public INEWinogradLayerTransformWeightsKernel
-{
-public:
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- NEWinogradLayerTransformWeightsKernel(const NEWinogradLayerTransformWeightsKernel &) = delete;
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- NEWinogradLayerTransformWeightsKernel &operator=(const NEWinogradLayerTransformWeightsKernel &) = delete;
- /** Allow instances of this class to be moved */
- NEWinogradLayerTransformWeightsKernel(NEWinogradLayerTransformWeightsKernel &&) = default;
- /** Allow instances of this class to be moved */
- NEWinogradLayerTransformWeightsKernel &operator=(NEWinogradLayerTransformWeightsKernel &&) = default;
- /** Default destructor */
- ~NEWinogradLayerTransformWeightsKernel() = default;
-
- /** Default constructor. */
- NEWinogradLayerTransformWeightsKernel();
- const char *name() const override
- {
- return "NEWinogradLayerTransformWeightsKernel";
- }
-
- /** Static function to check if given info will lead to a valid configuration of @ref NEWinogradLayerTransformWeightsKernel
- *
- * @param[in] input Source tensor info. The input is a 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM] (NCHW data layout).
- * kernel_x must be 3 and equal to kernel_y. Data types supported: F16/F32.
- * @param[in] output Destination tensor info. The output is a 3D tensor with dimensions [OFM, IFM, 16] or [OFM, IFM, 36]. Data type supported: same as @p input
- * @param[in] winograd_info Contains Winograd's information described in @ref WinogradInfo
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *input, const ITensorInfo *output, const WinogradInfo &winograd_info);
-
- // Inherited methods overridden:
-
-#ifndef DOXYGEN_SKIP_THIS
- /** Configure the weights transform kernel.
- *
- * @param[in] weights_hwio Pointer to the weights tensor
- * @param[out] output Pointer to working space for the output tensor in the Winograd domain.
- * @param[in] matrix_stride Stride across matrices in the output workspace.
- * @param[in] num_output_channels Number of filters.
- * @param[in] num_input_channels Number of channels in each filter.
- */
- void configure(const ITensor *weights_hwio, ITensor *output, const int matrix_stride, const int num_output_channels, const int num_input_channels) override;
-#endif /* DOXYGEN_SKIP_THIS */
-
- /** Determine how much memory (in units of T) to allocate for the
- * transformed weights.
- *
- * @param[in] num_output_channels Number of output feature maps.
- * @param[in] num_input_channels Number of input feature maps.
- *
- * @return Storage size (in units of T) required.
- */
- unsigned int get_weight_storage_size(int num_output_channels, int num_input_channels) const override;
-
-    /** Gets the stride between matrices in the kernel workspace
- *
- * @param[in] num_output_channels Number of output feature maps.
- * @param[in] num_input_channels Number of input feature maps.
- *
- * @return Stride expressed in bytes.
- */
- int get_matrix_stride(int num_output_channels, int num_input_channels) const override;
- void run(const Window &window, const ThreadInfo &info) override;
- bool is_parallelisable() const override;
-
-private:
- using WinogradBase = winograd::WinogradGEMM<OutputTileRows, OutputTileCols, KernelRows, KernelCols, winograd::WinogradRoots::Integers>;
- using WinogradConv = typename WinogradBase::template Convolution<T, T>;
- using WeightsTransform = typename WinogradBase::template WeightsTransform<T, T>;
-
- std::unique_ptr<WeightsTransform> _transform{ nullptr };
- const ITensor *_weights_hwio;
- ITensor *_output;
- int _matrix_stride;
- int _num_output_channels;
- int _num_input_channels;
-};
-
-/** Groups the matching set of NEON Winograd transform kernel types for one configuration. */
-template <typename TIn, typename TOut, int OutputTileRows, int OutputTileCols, int KernelRows, int KernelCols>
-class NEWinogradLayerConfiguration
-{
-public:
- /** Winograd base kernel */
- using WinogradBase = winograd::WinogradGEMM<OutputTileRows, OutputTileCols, KernelRows, KernelCols, winograd::WinogradRoots::Integers>;
-    /** Winograd convolution kernel */
-    using WinogradConv = typename WinogradBase::template Convolution<TIn, TOut>;
-
- using TransformInputKernel = NEWinogradLayerTransformInputKernel<TIn, OutputTileRows, OutputTileCols, KernelRows, KernelCols>;
- using TransformWeightsKernel = NEWinogradLayerTransformWeightsKernel<TIn, OutputTileRows, OutputTileCols, KernelRows, KernelCols>;
- using TransformOutputKernel = NEWinogradLayerTransformOutputKernel<TOut, OutputTileRows, OutputTileCols, KernelRows, KernelCols>;
-};
-
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_NEGEMMWINOGRADCONVOLUTIONLAYERKERNEL_H*/
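For context on how the kernels removed above fitted together: callers sized the Winograd-domain buffers up front via the storage-size queries, then configured the three transforms around a GEMM. A minimal sketch of the sizing step, using one common configuration (F32, 2x2 output tile, 3x3 kernel); the helper name and the assumption that the removed header is still on the include path are hypothetical:

    #include "arm_compute/core/NEON/kernels/NEGEMMWinogradConvolutionLayerKernel.h" // removed header
    #include <cstdio>

    // Hypothetical sizing helper; the shape arguments describe an NHWC input
    // and its HWIO weights. Sizes come back in elements (units of TIn/TOut),
    // not bytes.
    void print_winograd_storage(int num_batches, int num_rows, int num_cols,
                                int num_channels, int num_filters, bool same_padding)
    {
        using Cfg = arm_compute::NEWinogradLayerConfiguration<float, float, 2, 2, 3, 3>;

        Cfg::TransformInputKernel   in_kernel;
        Cfg::TransformWeightsKernel w_kernel;
        Cfg::TransformOutputKernel  out_kernel;

        std::printf("input  : %u elements\n",
                    in_kernel.get_input_storage_size(num_batches, num_channels,
                                                     num_rows, num_cols, same_padding));
        std::printf("weights: %u elements\n",
                    w_kernel.get_weight_storage_size(num_filters, num_channels));
        std::printf("output : %u elements\n",
                    out_kernel.get_output_storage_size(num_batches, num_rows,
                                                       num_cols, num_filters));
    }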
diff --git a/arm_compute/core/NEON/kernels/NEYOLOLayerKernel.h b/arm_compute/core/NEON/kernels/NEYOLOLayerKernel.h
deleted file mode 100644
index 0fd3f8ce67..0000000000
--- a/arm_compute/core/NEON/kernels/NEYOLOLayerKernel.h
+++ /dev/null
@@ -1,106 +0,0 @@
-/*
- * Copyright (c) 2018-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_NEYOLOLAYERKERNEL_H
-#define ARM_COMPUTE_NEYOLOLAYERKERNEL_H
-
-#include "arm_compute/core/NEON/INEKernel.h"
-
-namespace arm_compute
-{
-class ITensor;
-
-/** Interface for the YOLO layer kernel. */
-class NEYOLOLayerKernel : public INEKernel
-{
-public:
- const char *name() const override
- {
- return "NEYOLOLayerKernel";
- }
- /** Constructor */
- NEYOLOLayerKernel();
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- NEYOLOLayerKernel(const NEYOLOLayerKernel &) = delete;
- /** Default move constructor */
- NEYOLOLayerKernel(NEYOLOLayerKernel &&) = default;
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- NEYOLOLayerKernel &operator=(const NEYOLOLayerKernel &) = delete;
- /** Default move assignment operator */
- NEYOLOLayerKernel &operator=(NEYOLOLayerKernel &&) = default;
- /** Default destructor */
- ~NEYOLOLayerKernel() = default;
- /** Set the input and output tensor.
- *
- * @note If the output tensor is a nullptr or is equal to the input, the activation function will be performed in-place
- *
- * @param[in, out] input Source tensor. In case of @p output tensor = nullptr, this tensor will store the result
- * of the activation function. Data types supported: F16/F32.
- * @param[out] output Destination tensor. Data type supported: same as @p input
- * @param[in] act_info Activation layer parameters.
-     * @param[in]      num_classes Number of classes to activate (must be a submultiple of @p input channels)
- */
- void configure(ITensor *input, ITensor *output, const ActivationLayerInfo &act_info, int32_t num_classes);
- /** Static function to check if given info will lead to a valid configuration of @ref NEYOLOLayerKernel
- *
- * @param[in] input Source tensor info. In case of @p output tensor info = nullptr, this tensor will store the result
- * of the activation function. Data types supported: F16/F32.
- * @param[in] output Destination tensor info. Data type supported: same as @p input
- * @param[in] act_info Activation layer information.
-     * @param[in] num_classes Number of classes to activate (must be a submultiple of @p input channels)
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *input, const ITensorInfo *output, const ActivationLayerInfo &act_info, int32_t num_classes);
-
- // Inherited methods overridden:
- void run(const Window &window, const ThreadInfo &info) override;
-
-private:
- /** Function to run YOLO layer
- *
- * @param[in] window Region on which to execute the kernel.
- */
- template <typename T, int S>
- void yolo_layer_nchw(const Window &window);
- /** Function to run YOLO layer on tensors with NHWC format
- *
- * @param[in] window Region on which to execute the kernel.
- */
- template <typename T>
- void yolo_layer_nhwc(const Window &window);
- /** Common signature for all the yolo layer functions
- *
- * @param[in] window Region on which to execute the kernel.
- */
- using YOLOFunctionPtr = void (NEYOLOLayerKernel::*)(const Window &window);
-
-private:
- YOLOFunctionPtr _func;
- ITensor *_input;
- ITensor *_output;
- ActivationLayerInfo _act_info;
- int32_t _num_classes;
-};
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_NEYOLOLAYERKERNEL_H */
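The removed YOLO kernel followed ACL's usual validate-then-configure pattern; a hedged sketch (the wrapper function is hypothetical, the tensor is assumed to be allocated elsewhere, and a nullptr output requests in-place execution as the @note above describes):

    // Illustrative only: `input` is an already-allocated F16/F32 tensor whose
    // channel count is a multiple of num_classes.
    void configure_yolo_inplace(arm_compute::NEYOLOLayerKernel &yolo, arm_compute::ITensor &input)
    {
        const arm_compute::ActivationLayerInfo act_info(
            arm_compute::ActivationLayerInfo::ActivationFunction::LOGISTIC);
        const int32_t num_classes = 80; // e.g. COCO

        const arm_compute::Status status = arm_compute::NEYOLOLayerKernel::validate(
            input.info(), nullptr, act_info, num_classes);
        if(status.error_code() == arm_compute::ErrorCode::OK)
        {
            yolo.configure(&input, nullptr, act_info, num_classes); // in-place
        }
    }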
diff --git a/arm_compute/core/NEON/kernels/arm_gemm/ndrange.hpp b/arm_compute/core/NEON/kernels/arm_gemm/ndrange.hpp
deleted file mode 100644
index 4ff83fbc51..0000000000
--- a/arm_compute/core/NEON/kernels/arm_gemm/ndrange.hpp
+++ /dev/null
@@ -1,185 +0,0 @@
-/*
- * Copyright (c) 2019-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#pragma once
-
-#include <array>
-#include <algorithm>
-#include <initializer_list>
-
-#include <cassert>
-
-namespace arm_gemm {
-
-template<unsigned int D>
-class NDRange {
-private:
- std::array<unsigned int, D> m_sizes {};
- std::array<unsigned int, D> m_totalsizes {};
-
- class NDRangeIterator {
- private:
- const NDRange &m_parent;
- unsigned int m_pos = 0;
- unsigned int m_end = 0;
-
- public:
- NDRangeIterator(const NDRange &p, unsigned int s, unsigned int e) : m_parent(p), m_pos(s), m_end(e) { }
-
- bool done() const {
- return (m_pos >= m_end);
- }
-
- unsigned int dim(unsigned int d) const {
- unsigned int r = m_pos;
-
- if (d < (D - 1)) {
- r %= m_parent.m_totalsizes[d];
- }
-
- if (d > 0) {
- r /= m_parent.m_totalsizes[d-1];
- }
-
- return r;
- }
-
- bool next_dim0() {
- m_pos++;
-
- return !done();
- }
-
- bool next_dim1() {
- m_pos += m_parent.m_sizes[0] - dim(0);
-
- return !done();
- }
-
- unsigned int dim0_max() const {
- unsigned int offset = std::min(m_end - m_pos, m_parent.m_sizes[0] - dim(0));
-
- return dim(0) + offset;
- }
- };
-
-public:
- NDRange& operator=(const NDRange& rhs)=default;
- NDRange(const NDRange& rhs) =default;
-
- template <typename... T>
- NDRange(T... ts)
- : m_sizes{ts...}
- {
- unsigned int t=1;
-
- for (unsigned int i=0; i<D; i++) {
- t *= m_sizes[i];
-
- m_totalsizes[i] = t;
- }
- }
-
- NDRange(const std::array<unsigned int, D>& n)
- : m_sizes(n)
- {
- unsigned int t=1;
-
- for (unsigned int i=0; i<D; i++) {
- t *= m_sizes[i];
-
- m_totalsizes[i] = t;
- }
- }
-
- NDRangeIterator iterator(unsigned int start, unsigned int end) const {
- return NDRangeIterator(*this, start, end);
- }
-
- unsigned int total_size() const {
- return m_totalsizes[D - 1];
- }
-
- unsigned int get_size(unsigned int v) const {
- return m_sizes[v];
- }
-};
-
-/** NDCoordinate builds upon a range, but specifies a starting position
- * in addition to a size which it inherits from NDRange
- */
-template<unsigned int N>
-class NDCoordinate : public NDRange<N> {
- using int_t =unsigned int;
- using ndrange_t = NDRange<N>;
-
- std::array<int_t, N> m_positions {};
-public:
- NDCoordinate& operator=(const NDCoordinate& rhs)=default;
- NDCoordinate(const NDCoordinate& rhs) =default;
- NDCoordinate(const std::initializer_list<std::pair<int_t, int_t>>& list)
- {
- std::array<int_t, N> sizes{};
-
- std::size_t i = 0;
- for(auto& p : list) {
- m_positions[i]= p.first;
- sizes[i++] = p.second;
- }
-
-        //update the parent's sizes
- static_cast<ndrange_t&>(*this) = ndrange_t(sizes);
- }
-
- int_t get_position(int_t d) const {
- assert(d < m_positions.size());
- return m_positions[d];
- }
-
- void set_position(int_t d, int_t v) {
-        assert(d < m_positions.size());
- assert(v < ndrange_t::get_size(d));
-
- m_positions[d] = v;
- }
-
- int_t get_position_end(int_t d) const {
- return get_position(d) + NDRange<N>::get_size(d);
- }
-}; //class NDCoordinate
-
-/** @returns the number of dimensions in the NDRange which have non-1 values,
- * i.e. the dimensions in which there is actual work that can be broken up
- */
-template<unsigned int N>
-std::size_t ndrange_popcount(const NDRange<N>& ndr) {
- std::size_t count = 0;
-
- for(unsigned int d = 0; d != N; ++d) {
- if(ndr.get_size(d) != 1)
- ++count;
- }
- return count;
-}
-
-} // namespace arm_gemm
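Since NDRange and NDCoordinate are self-contained templates, their intended use can be shown with a small runnable example; the iteration pattern below is inferred from the iterator interface rather than lifted from ACL code:

    #include "arm_compute/core/NEON/kernels/arm_gemm/ndrange.hpp" // removed path
    #include <cstdio>

    int main()
    {
        // A 4x3 range; linear positions 0..11 would be split among workers by
        // handing each a [start, end) slice of total_size().
        arm_gemm::NDRange<2> range(4u, 3u);

        auto it = range.iterator(0, range.total_size());
        while(!it.done())
        {
            std::printf("dim0=%u dim1=%u\n", it.dim(0), it.dim(1));
            it.next_dim0();
        }
        return 0;
    }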
diff --git a/arm_compute/core/NEON/kernels/assembly/Helpers.h b/arm_compute/core/NEON/kernels/assembly/Helpers.h
deleted file mode 100644
index 9372e05295..0000000000
--- a/arm_compute/core/NEON/kernels/assembly/Helpers.h
+++ /dev/null
@@ -1,122 +0,0 @@
-/*
- * Copyright (c) 2018-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_ASSEMBLY_HELPERS_H
-#define ARM_COMPUTE_ASSEMBLY_HELPERS_H
-
-#include "arm_compute/core/CPP/CPPTypes.h"
-#include "arm_compute/core/Utils.h"
-
-#include "arm_compute/core/NEON/kernels/assembly/INEGEMMWrapperKernel.h"
-#include "arm_compute/core/NEON/kernels/assembly/arm_gemm.hpp"
-
-namespace arm_compute
-{
-/** Block sizes to use to break up the M, N, K dimensions */
-struct BlockSizes
-{
-    unsigned int k_block{ 0 };             /**< Block size along the K dimension */
- unsigned int x_block{ 0 }; /**< Block size along the N (x) dimension */
- unsigned int m_round{ 0 }; /**< Block size along the M dimension (Must be a multiple of strategy_out_height) */
- unsigned int strategy_out_height{ 0 }; /**< Number of rows (M) processed by the selected strategy */
-};
-
-/** Extracts the description of the kernel selected by the GEMM backend heuristics
- *
- * @param[in] input_type Data type of the input tensor.
- * @param[in] ci CPU information.
- * @param[in] num_threads Maximum number of threads that might be used for the calculations.
- * @param[in] p M, N, K sizes.
- * @param[in] activation Activation struct
- * @param[in] pretranspose_hint Whether B has been pretransposed.
- *
- * @return Kernel description that the assembly heuristics picked for the given configuration
- */
-arm_gemm::KernelDescription get_gemm_info(DataType input_type,
- const CPUInfo &ci,
- const unsigned int num_threads,
- const INEGEMMWrapperKernel::Params &p,
- arm_gemm::Activation activation,
- bool pretranspose_hint);
-
-/** Calculate the recommended block sizes to use based on the CPU cache sizes and the strategy which will be used
- *
- * @param[in] ci CPU information.
- * @param[in] M M dimension.
- * @param[in] N N dimension.
- * @param[in] K K dimension.
- *
- * @return Recommended block sizes to use for the given M, N, K dimensions.
- */
-template <typename strategy>
-BlockSizes calculate_block_sizes(const CPUInfo &ci, unsigned int M, unsigned int N, unsigned int K)
-{
- BlockSizes bs;
-
- using Toi = typename strategy::operand_type;
-
- const unsigned int L1_size = ci.get_L1_cache_size();
- const unsigned int L2_size = ci.get_L2_cache_size();
-
- // Work out blocking parameters
-
- // k_block: Find out how much of the larger array can be loaded into half the cache.
- // This should account for associative caches.
- bs.k_block = (L1_size / 2) / (sizeof(Toi) * (std::max(strategy::out_width(), strategy::out_height())));
-
- // Needs to be (at least a single) multiple of the K unroll level.
- bs.k_block /= strategy::k_unroll();
- bs.k_block = std::max(bs.k_block, 1U) * strategy::k_unroll();
-
- // Now tune to presented problem size; this is how many blocks we need.
- int num_k_blocks = DIV_CEIL(K, bs.k_block);
-
- // So divide the space equally into that many blocks.
- bs.k_block = DIV_CEIL(K, num_k_blocks);
-
- // And round UP to the K unroll level required.
- bs.k_block = ceil_to_multiple(bs.k_block, strategy::k_unroll());
-
- // x_block: Work out how many rows (of length k_block) will fit in the L2
- // Don't allocate more than 90% of the L2 to allow for overheads, and subtract off the L1 contents.
- bs.x_block = (((L2_size * 9) / 10) - (bs.k_block * sizeof(Toi) * (strategy::out_width() + strategy::out_height()))) / (sizeof(Toi) * bs.k_block);
-
- // Needs to be (at least a single) multiple of the kernel output width.
- bs.x_block /= strategy::out_width();
- bs.x_block = std::max(bs.x_block, 1U) * strategy::out_width();
-
- // And tune to the presented problem size.
- int num_x_blocks = DIV_CEIL(N, bs.x_block);
- bs.x_block = DIV_CEIL(N, num_x_blocks);
-
- bs.x_block = ceil_to_multiple(bs.x_block, strategy::out_width());
-
- // Work out the rounded size of M - needed for some buffers.
- bs.m_round = ceil_to_multiple(M, strategy::out_height());
- bs.strategy_out_height = strategy::out_height();
-
- return bs;
-}
-
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_ASSEMBLY_HELPERS_H */
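To make the k_block heuristic above concrete with made-up numbers (32 KB L1, float operands, out_width 12, out_height 8, k_unroll 1, K = 1000): half the L1 divided by the widest kernel edge gives (32768 / 2) / (4 * 12) = 341 elements; covering K then takes DIV_CEIL(1000, 341) = 3 blocks, so the work is levelled out to k_block = DIV_CEIL(1000, 3) = 334, which is already a multiple of the unroll of 1. The same split-evenly-then-round-up pattern is applied to x_block against 90% of the L2.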
diff --git a/arm_compute/core/NEON/kernels/assembly/INEGEMMWrapperKernel.h b/arm_compute/core/NEON/kernels/assembly/INEGEMMWrapperKernel.h
deleted file mode 100644
index f152ab5f61..0000000000
--- a/arm_compute/core/NEON/kernels/assembly/INEGEMMWrapperKernel.h
+++ /dev/null
@@ -1,108 +0,0 @@
-/*
- * Copyright (c) 2018-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_INEGEMMWRAPPERKERNEL_H
-#define ARM_COMPUTE_INEGEMMWRAPPERKERNEL_H
-
-#include "arm_compute/core/NEON/INEKernel.h"
-
-namespace arm_compute
-{
-class ITensor;
-
-/** Common interface for all the arm_gemm Gemms
- */
-class INEGEMMWrapperKernel : public INEKernel
-{
-public:
- /** Parameters defining the dimensions of the matrices being multiplied */
- struct Params
- {
- unsigned int M{ 0 }; /**< Rows in output matrix C (and input matrix A). */
- unsigned int N{ 0 }; /**< Columns in output matrix C (and input matrix B). */
- unsigned int K{ 0 }; /**< Columns of input matrix A (= rows of input matrix B). */
- unsigned int batches{ 0 }; /**< Number of "batched" GEMMs (unique A and C, shared B). */
- unsigned int multis{ 0 }; /**< Number of "multi" GEMMs (unique A, B and C). */
- };
-
- static Params extract_parameters(const ITensor *a, const ITensor *b, const ITensor *c, const GEMMInfo &gemm_info);
-
- /** Constructor */
- INEGEMMWrapperKernel();
- /** Prevent instances of this class from being copied */
- INEGEMMWrapperKernel(const INEGEMMWrapperKernel &) = delete;
- /** Prevent instances of this class from being copied */
- INEGEMMWrapperKernel &operator=(const INEGEMMWrapperKernel &) = delete;
- /** Allow instances of this class to be moved */
- INEGEMMWrapperKernel(INEGEMMWrapperKernel &&) = default;
- /** Allow instances of this class to be moved */
- INEGEMMWrapperKernel &operator=(INEGEMMWrapperKernel &&) = default;
- /** Initialise the kernel's input and output.
- *
-     * @note The matrix dimensions must describe a valid GEMM, i.e. A is MxK, B is KxN and C is MxN (see @ref Params)
- *
- * @param[in] a Input tensor (Matrix A)
- * @param[in] b Input tensor (Matrix B)
-     * @param[out] c         Output tensor to store the result of matrix multiplication. Data type supported: same as @p a.
- * @param[in] alpha Scalar multiplier to apply to AB matrix product.
- * @param[in] beta Scalar multiplier to apply to input C matrix before adding product.
- * @param[in] gemm_info GEMM meta-data
- */
- void configure(const ITensor *a, const ITensor *b, ITensor *c, float alpha, float beta, const GEMMInfo &gemm_info);
-
- // Inherited methods overridden:
- void run(const Window &window, const ThreadInfo &info) override;
-
-protected:
- /** Called as part of configure() after _a, _b, _c and _params have been set.
- *
- * @param[in] alpha Scalar multiplier to apply to AB matrix product.
- * @param[in] beta Scalar multiplier to apply to input C matrix before adding product.
- *
- * @return A 3D execution window.
- */
- virtual Window configure_internal(float alpha, float beta) = 0;
-
- /** Run the kernel from the start to the end offset in window.
- *
- * @param[in] window Window to use for the iteration
- * @param[in] start_offset Where to start iterating from (In Window coordinates)
- * @param[in] end_offset Where to stop iterating (In Window coordinates).
- * @param[in] info Info about executing thread and CPU.
- */
- virtual void run_internal(const Window &window, const Coordinates &start_offset, const Coordinates &end_offset, const ThreadInfo &info) = 0;
-
- const ITensor *_a;
- const ITensor *_b;
- ITensor *_c;
- Params _params;
- GEMMInfo _gemm_info;
-
-private:
- Window _window3d;
- TensorShape _window_shape;
-};
-
-} // namespace arm_compute
-
-#endif /* ARM_COMPUTE_INEGEMMWRAPPERKERNEL_H */
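Params above is a plain description of the GEMM problem; filling one by hand (arbitrary sizes) shows what each field means:

    arm_compute::INEGEMMWrapperKernel::Params p{};
    p.M       = 128; // rows of C (and of A)
    p.N       = 256; // columns of C (and of B)
    p.K       = 64;  // columns of A = rows of B
    p.batches = 4;   // four GEMMs with unique A and C, sharing one B
    p.multis  = 1;   // one fully independent set of A, B and C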
diff --git a/arm_compute/core/NEON/kernels/assembly/NEDepthwiseConvolutionAssemblyKernelWrapper.h b/arm_compute/core/NEON/kernels/assembly/NEDepthwiseConvolutionAssemblyKernelWrapper.h
deleted file mode 100644
index 8a9fb82b4a..0000000000
--- a/arm_compute/core/NEON/kernels/assembly/NEDepthwiseConvolutionAssemblyKernelWrapper.h
+++ /dev/null
@@ -1,88 +0,0 @@
-/*
- * Copyright (c) 2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_ASSEMBLY_DEPTHWISE_CONVOLUTION_ASSEMBLY_WRAPPER_KERNEL_H
-#define ARM_COMPUTE_ASSEMBLY_DEPTHWISE_CONVOLUTION_ASSEMBLY_WRAPPER_KERNEL_H
-
-#include "arm_compute/core/NEON/INEKernel.h"
-#include "arm_compute/core/Utils.h"
-#include "arm_compute/core/Validate.h"
-
-#include "arm_compute/core/NEON/kernels/convolution/depthwise/depthwise.hpp"
-
-namespace arm_compute
-{
-// Forward declarations
-class ITensor;
-
-/** This class is a wrapper for the depthwise convolution assembly kernels. */
-class NEDepthwiseConvolutionAssemblyKernelWrapper final : public INEKernel
-{
-public:
- const char *name() const override
- {
- return "NEDepthwiseConvolutionAssemblyKernelWrapper";
- }
-
- /** Default constructor */
- NEDepthwiseConvolutionAssemblyKernelWrapper()
- : _kernel(nullptr)
- {
- }
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- NEDepthwiseConvolutionAssemblyKernelWrapper(const NEDepthwiseConvolutionAssemblyKernelWrapper &) = delete;
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- NEDepthwiseConvolutionAssemblyKernelWrapper &operator=(const NEDepthwiseConvolutionAssemblyKernelWrapper &) = delete;
- /** Default Move Constructor. */
- NEDepthwiseConvolutionAssemblyKernelWrapper(NEDepthwiseConvolutionAssemblyKernelWrapper &&) = default;
- /** Default move assignment operator */
- NEDepthwiseConvolutionAssemblyKernelWrapper &operator=(NEDepthwiseConvolutionAssemblyKernelWrapper &&) = default;
-
- /** Initialise the kernel's input and output.
- *
- * @param[in] kernel Pointer to an assembly kernel implementation.
- */
- void configure(depthwise::IDepthwiseConvolution *kernel)
- {
- ARM_COMPUTE_ERROR_ON_NULLPTR((reinterpret_cast<void *>(kernel)));
- _kernel = kernel;
- Window win;
- win.set(Window::DimX, Window::Dimension(0, _kernel->get_window(), 1));
- INEKernel::configure(win);
- }
-
- // Inherited methods overridden:
- void run(const Window &window, const ThreadInfo &info) override
- {
- ARM_COMPUTE_ERROR_ON_NULLPTR((reinterpret_cast<void *>(_kernel)));
- ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
- auto first = window.x().start();
- auto last = window.x().end();
- _kernel->run(first, last, info.thread_id);
- }
-
-private:
- depthwise::IDepthwiseConvolution *_kernel;
-};
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_ASSEMBLY_DEPTHWISE_CONVOLUTION_ASSEMBLY_WRAPPER_KERNEL_H */
diff --git a/arm_compute/core/NEON/kernels/assembly/NEGEMMAssemblyWrapperKernel.h b/arm_compute/core/NEON/kernels/assembly/NEGEMMAssemblyWrapperKernel.h
deleted file mode 100644
index 0e3dd74577..0000000000
--- a/arm_compute/core/NEON/kernels/assembly/NEGEMMAssemblyWrapperKernel.h
+++ /dev/null
@@ -1,121 +0,0 @@
-/*
- * Copyright (c) 2018-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_ASSEMBLY_GEMM_KERNEL_WRAPPER_KERNEL_H
-#define ARM_COMPUTE_ASSEMBLY_GEMM_KERNEL_WRAPPER_KERNEL_H
-
-#include "arm_compute/core/NEON/kernels/assembly/arm_gemm_compute_iface.hpp"
-#include "arm_compute/core/NEON/INEKernel.h"
-#include "arm_compute/core/Utils.h"
-#include "arm_compute/core/Validate.h"
-
-#include "gemm_common.hpp"
-
-namespace arm_compute
-{
-class ITensor;
-
-/** This class is a wrapper for the assembly kernels.
- *
- * Some kernels were written in assembly and highly optimised for specific CPUs like A53 or A55.
- * This class works as a wrapper for these assembly kernels. The Arm Compute Library creates an instance
- * of NEGEMMAssemblyWrapperKernel and other auxiliary data structures to execute a single assembly kernel
- * in the context of an NEFunction.
- *
- * The wrapped kernel is the actual assembly implementation, of type
- * template<typename To, typename Tr> class GemmCommon.
- */
-template <typename TypeInput, typename TypeOutput>
-class NEGEMMAssemblyWrapperKernel final : public INEKernel
-{
-public:
- /** Constructor
- */
- NEGEMMAssemblyWrapperKernel()
- : _kernel(nullptr), _name("NEGEMMAssemblyWrapperKernel")
- {
- }
-
- NEGEMMAssemblyWrapperKernel(NEGEMMAssemblyWrapperKernel &) = delete;
- NEGEMMAssemblyWrapperKernel(NEGEMMAssemblyWrapperKernel &&) = default;
- NEGEMMAssemblyWrapperKernel &operator=(NEGEMMAssemblyWrapperKernel &) = delete;
-
- const char *name() const override
- {
- return _name.c_str();
- }
-
-
- void run(const Window &window, const ThreadInfo &info) override
- {
- ARM_COMPUTE_ERROR_ON_NULLPTR((reinterpret_cast<void *>(_kernel)));
- ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
-
- auto win=arm_gemm::to_ndcoord(window);
-
- arm_gemm::ndcoord_t thread_locator { };
-
- _kernel->execute(win, thread_locator, info.thread_id);
- }
-
- // Inherited methods overridden:
- void run_nd(const Window &window, const ThreadInfo &info, const Window &thread_locator) override
- {
- ARM_COMPUTE_ERROR_ON_NULLPTR((reinterpret_cast<void *>(_kernel)));
- ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
-
- //convert between arm_compute and arm_gemm types
- auto ndc_win = arm_gemm::to_ndcoord(window);
- auto ndc_tlc = arm_gemm::to_ndcoord(thread_locator);
-
- _kernel->execute(ndc_win, ndc_tlc, info.thread_id);
- }
-
-    /** Initialise the kernel's input and output.
-     *
-     * @param[in] kernel          Pointer to an assembly kernel implementation.
-     * @param[in] kernel_name_tag Tag to be appended to the kernel's name.
-     */
-    void configure(arm_gemm::GemmCommon<TypeInput, TypeOutput> *kernel, std::string kernel_name_tag)
- {
- ARM_COMPUTE_ERROR_ON_NULLPTR((reinterpret_cast<void *>(kernel)));
- _kernel = kernel;
-
- Window win = to_window(kernel->get_window_size());
-
- INEKernel::configure(win);
-
- if(!kernel_name_tag.empty())
- {
- _name += "/" + kernel_name_tag;
- }
- }
-
-private:
- arm_gemm::GemmCommon<TypeInput, TypeOutput> *_kernel;
- std::string _name;
-};
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_ASSEMBLY_GEMM_KERNEL_WRAPPER_KERNEL_H */
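A hedged sketch of feeding the wrapper an arm_gemm kernel; the kernel pointer would come from arm_gemm::gemm<>() (see the next header), and the name tag here is made up:

    // `gemm_kernel` is assumed to be a live arm_gemm::GemmCommon<float, float>*
    // owned by the enclosing function.
    arm_compute::NEGEMMAssemblyWrapperKernel<float, float> wrapper;
    wrapper.configure(gemm_kernel, "hypothetical_sgemm_tag"); // tag is appended to name()

    // A scheduler thread would later drive it with:
    //   wrapper.run(wrapper.window(), thread_info);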
diff --git a/arm_compute/core/NEON/kernels/assembly/arm_gemm.hpp b/arm_compute/core/NEON/kernels/assembly/arm_gemm.hpp
deleted file mode 100644
index 7723224ec8..0000000000
--- a/arm_compute/core/NEON/kernels/assembly/arm_gemm.hpp
+++ /dev/null
@@ -1,176 +0,0 @@
-/*
- * Copyright (c) 2018-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#pragma once
-
-#include <memory>
-#include <cstring>
-
-#include "arm_gemm_local.hpp"
-#include "gemm_common.hpp"
-
-namespace arm_gemm {
-
-enum class GemmMethod
-{
- DEFAULT,
- GEMV_BATCHED,
- GEMV_PRETRANSPOSED,
- GEMV_NATIVE_TRANSPOSED,
- GEMM_NATIVE,
- GEMM_HYBRID,
- GEMM_INTERLEAVED,
- GEMM_INTERLEAVED_2D,
- QUANTIZE_WRAPPER,
- GEMM_HYBRID_QUANTIZED
-};
-
-struct KernelDescription
-{
- GemmMethod method = GemmMethod::DEFAULT;
- std::string name = "";
- bool is_default = false;
-
- KernelDescription(GemmMethod m, std::string n, bool d=false) : method(m), name(n), is_default(d) { }
- KernelDescription() noexcept { }
-};
-
-struct GemmConfig
-{
- GemmMethod method = GemmMethod::DEFAULT;
- std::string filter = "";
- unsigned int inner_block_size = 0;
- unsigned int outer_block_size = 0;
-
- GemmConfig(GemmMethod method) : method(method) { }
- GemmConfig() { }
-};
-
-struct Activation
-{
- enum class Type {
- None,
- ReLU,
- BoundedReLU
- };
-
- Type type;
- float param1;
- float param2;
-
- Activation(Type type=Type::None, float p1=0.0f, float p2=0.0f) : type(type), param1(p1), param2(p2) { }
-};
-
-struct GemmArgs
-{
-public:
- const CPUInfo *_ci;
- unsigned int _Msize;
- unsigned int _Nsize;
- unsigned int _Ksize;
- unsigned int _nbatches;
- unsigned int _nmulti;
- bool _trA;
- bool _trB;
- Activation _act;
- int _maxthreads;
- bool _pretransposed_hint;
- const GemmConfig *_cfg;
-
- GemmArgs(const CPUInfo *ci, const unsigned int M, const unsigned int N,
- const unsigned int K, const unsigned int nbatches,
- const unsigned int nmulti, const bool trA, const bool trB,
- Activation act, const int maxthreads,
- const bool pretransposed_hint, const GemmConfig *cfg=nullptr ) :
- _ci(ci), _Msize(M), _Nsize(N), _Ksize(K), _nbatches(nbatches), _nmulti(nmulti),
- _trA(trA), _trB(trB), _act(act), _maxthreads(maxthreads),
- _pretransposed_hint(pretransposed_hint), _cfg(cfg)
- {
- }
-};
-
-struct Requantize32
-{
-public:
- const int32_t *bias = nullptr;
- size_t bias_multi_stride = 0;
- int32_t a_offset = 0;
- int32_t b_offset = 0;
- int32_t c_offset = 0;
- bool per_channel_requant = false;
- int32_t per_layer_shift = 0;
- int32_t per_layer_mul = 0;
- const int32_t *per_channel_shifts = nullptr;
- const int32_t *per_channel_muls = nullptr;
- int32_t minval = 0;
- int32_t maxval = 0;
-
- Requantize32() = default;
-
- // Constructor for per-tensor quantization
- Requantize32(const int32_t *bias, size_t bias_multi_stride,
- int32_t a_offset, int32_t b_offset, int32_t c_offset,
- int32_t requant_shift, int32_t requant_mul,
- int32_t minv, int32_t maxv) :
- bias(bias), bias_multi_stride(bias_multi_stride),
- a_offset(a_offset), b_offset(b_offset), c_offset(c_offset),
- per_channel_requant(false), per_layer_shift(requant_shift), per_layer_mul(requant_mul),
- minval(minv), maxval(maxv)
- {
- }
-
- // Constructor for per-channel quantization
- Requantize32(const int32_t *bias, size_t bias_multi_stride,
- int32_t a_offset, int32_t b_offset, int32_t c_offset,
- const int32_t *requant_shifts, const int32_t *requant_muls,
- int32_t minv, int32_t maxv) :
- bias(bias), bias_multi_stride(bias_multi_stride),
- a_offset(a_offset), b_offset(b_offset), c_offset(c_offset),
- per_channel_requant(true), per_channel_shifts(requant_shifts), per_channel_muls(requant_muls),
- minval(minv), maxval(maxv)
- {
- }
-};
-
-struct Nothing
-{
-};
-
-template<typename Top, typename Tret>
-using UniqueGemmCommon = std::unique_ptr<GemmCommon<Top, Tret> >;
-
-/* Low level API calls.
- * These are implemented as 'GemmArgs' versions, or with the arguments explicitly listed. */
-
-/* get_gemm_method(): Given the templated types and provided parameters,
- * which is the preferred method to implement this GEMM? */
-template<typename Top, typename Tret, class OutputStage = Nothing>
-KernelDescription get_gemm_method(const GemmArgs &args, const OutputStage & ={});
-
-template<typename Top, typename Tret, class OutputStage = Nothing>
-UniqueGemmCommon<Top, Tret> gemm(const GemmArgs &args, const OutputStage & ={});
-
-template<typename Top, typename Tret, class OutputStage = Nothing>
-std::vector<KernelDescription> get_compatible_kernels(const GemmArgs &args, const OutputStage & ={});
-
-} // namespace arm_gemm
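A sketch of the low-level API above: build a GemmArgs, ask the heuristics which kernel they would pick, then materialise it. `ci` is assumed to point at a valid CPUInfo obtained from the ACL runtime:

    arm_gemm::Activation relu(arm_gemm::Activation::Type::ReLU);
    arm_gemm::GemmArgs args(ci, /*M=*/128, /*N=*/128, /*K=*/128,
                            /*nbatches=*/1, /*nmulti=*/1,
                            /*trA=*/false, /*trB=*/false,
                            relu, /*maxthreads=*/4,
                            /*pretransposed_hint=*/true);

    // Which kernel would the heuristics pick, and what is its name?
    arm_gemm::KernelDescription desc = arm_gemm::get_gemm_method<float, float>(args);

    // Materialise the chosen GEMM as an owning pointer.
    arm_gemm::UniqueGemmCommon<float, float> gemm = arm_gemm::gemm<float, float>(args);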
diff --git a/arm_compute/core/NEON/kernels/assembly/arm_gemm_compute_iface.hpp b/arm_compute/core/NEON/kernels/assembly/arm_gemm_compute_iface.hpp
deleted file mode 100644
index 6f345c1721..0000000000
--- a/arm_compute/core/NEON/kernels/assembly/arm_gemm_compute_iface.hpp
+++ /dev/null
@@ -1,121 +0,0 @@
-/*
- * Copyright (c) 2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#pragma once
-
-#include "arm_compute/core/Window.h"
-#include "arm_compute/core/Dimensions.h"
-#include "arm_compute/core/NEON/kernels/arm_gemm/ndrange.hpp"
-
-#include <cassert>
-
-/* This file contains the mapping between the integral types used in arm_compute and arm_gemm.
- * These two codebases both require a degree of separation for the sake of modularity,
- * so each maintains its own types to represent similar information.
- */
-
-namespace arm_gemm {
-
-//we want to unify the maximum number of dimensions used between arm_gemm and the Arm Compute Library
-constexpr std::size_t ndrange_max =
- arm_compute::Dimensions<unsigned int>::num_max_dimensions;
-
-using ndrange_t=NDRange<ndrange_max>;
-using ndcoord_t=NDCoordinate<ndrange_max>;
-
-/* Converts an `arm_gemm::ndrange_t` to a `arm_compute::Window`
- *
- * As `NDRange<T>` does not encode start positions, we specify
- * the start to be zero in the produced `arm_compute::Window`
- *
- * @param [ndr] the `arm_gemm::ndrange_t` we wish to convert into a `arm_compute::Window`
- * @returns an `arm_compute::Window` representing the same dimensional ranges as `ndr`
- */
-inline arm_compute::Window to_window(const ndrange_t& ndr) {
- arm_compute::Window win;
-
- for(unsigned int i = 0; i!=ndrange_max; ++i) {
- //populate the window with the dimensions of the NDRange
- win.set(i, arm_compute::Window::Dimension(0, ndr.get_size(i)));
- }
-
- return win;
-}
-
-/*
- * Converts an `arm_gemm::ndcoord_t` to a `arm_compute::Window`
- *
- * @param [ndc] the `arm_gemm::ndcoord_t` we wish to convert into a `arm_compute::Window`
- * @returns an `arm_compute::Window` representing the same dimensional ranges as `ndc`
- */
-inline arm_compute::Window to_window(const ndcoord_t& ndc) {
- arm_compute::Window win;
-
- for(unsigned int i = 0; i!=ndrange_max; ++i) {
- const auto start = ndc.get_position(i);
- const auto size = ndc.get_size(i);
- const auto stop = start + size;
-
-        //populate the window with the start positions and sizes of the NDCoordinate
- win.set(i, arm_compute::Window::Dimension(start, stop));
- }
-
- return win;
-}
-
-/** Convert an `arm_compute::Window` to an `arm_gemm::NDRange` of the same max dimensions
- *
- * It should be noted that `arm_compute::Window` specifies a `start()` and an `end()`,
- * whereas `arm_gemm::ndrange_t` only has a size; as a result we store the difference between the two
- *
- * @param [win] the `arm_compute::Window` we want to convert to `arm_gemm::ndrange_t`
- * @return the resultant ndrange_t
- */
-inline ndrange_t to_ndrange(const arm_compute::Window& win) {
- return {
- static_cast<unsigned int>(win[0].end() - win[0].start()),
- static_cast<unsigned int>(win[1].end() - win[1].start()),
- static_cast<unsigned int>(win[2].end() - win[2].start()),
- static_cast<unsigned int>(win[3].end() - win[3].start()),
- static_cast<unsigned int>(win[4].end() - win[4].start()),
- static_cast<unsigned int>(win[5].end() - win[5].start())
- };
-}
-
-/** Convert an `arm_compute::Window` to an `arm_gemm::NDCoordinate` of the same max dimensions
- *
- * @param[in] win The `arm_compute::Window` we want to convert to `arm_gemm::ndcoord_t`
- * @return The resultant ndcoord_t
- */
-inline ndcoord_t to_ndcoord(const arm_compute::Window& win) {
- return {
- { static_cast<unsigned int>(win[0].start()), static_cast<unsigned int>(win[0].end() - win[0].start()) },
- { static_cast<unsigned int>(win[1].start()), static_cast<unsigned int>(win[1].end() - win[1].start()) },
- { static_cast<unsigned int>(win[2].start()), static_cast<unsigned int>(win[2].end() - win[2].start()) },
- { static_cast<unsigned int>(win[3].start()), static_cast<unsigned int>(win[3].end() - win[3].start()) },
- { static_cast<unsigned int>(win[4].start()), static_cast<unsigned int>(win[4].end() - win[4].start()) },
- { static_cast<unsigned int>(win[5].start()), static_cast<unsigned int>(win[5].end() - win[5].start()) }
- };
-}
-
-} // namespace arm_gemm
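
As a quick illustration of how these helpers compose (a hedged sketch: `win` stands in for any already-configured `arm_compute::Window`), a round trip through `ndcoord_t` preserves each dimension's start and end:

    // Round trip: Window -> ndcoord_t -> Window.
    // NDCoordinate stores a start position and a size per dimension, so the
    // reconstructed window has the same start()/end() values as the input.
    const arm_gemm::ndcoord_t    coords   = arm_gemm::to_ndcoord(win);
    const arm_compute::Window    restored = arm_gemm::to_window(coords);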
diff --git a/arm_compute/core/NEON/kernels/assembly/arm_gemm_local.hpp b/arm_compute/core/NEON/kernels/assembly/arm_gemm_local.hpp
deleted file mode 100644
index 8d3db4adf2..0000000000
--- a/arm_compute/core/NEON/kernels/assembly/arm_gemm_local.hpp
+++ /dev/null
@@ -1,37 +0,0 @@
-/*
- * Copyright (c) 2018 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#pragma once
-
-/* This file is used to configure integration-specific aspects of arm_gemm into ACL */
-
-#include "arm_compute/core/CPP/CPPTypes.h"
-
-namespace arm_gemm
-{
-using CPUModel = arm_compute::CPUModel;
-using CPUInfo = arm_compute::CPUInfo;
-} // namespace arm_gemm
-
-
-
diff --git a/arm_compute/core/NEON/kernels/assembly/gemm_common.hpp b/arm_compute/core/NEON/kernels/assembly/gemm_common.hpp
deleted file mode 100644
index ea9b524e15..0000000000
--- a/arm_compute/core/NEON/kernels/assembly/gemm_common.hpp
+++ /dev/null
@@ -1,201 +0,0 @@
-/*
- * Copyright (c) 2017-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#pragma once
-
-#include "arm_compute/core/NEON/kernels/assembly/arm_gemm_compute_iface.hpp"
-
-#include <cstddef>
-#include <cassert>
-
-#define UNUSED(x) (void)(x)
-
-namespace arm_gemm {
-
-// Abstract class for the GEMM/GEMV functions.
-//
-// GEMM implementations may be "native" (never require any input
-// permutation), "pretransposed" (require permutation up-front) or require
-// working space (permute as they go along). This interface should support
-// all of them.
-
-// The real GemmCommon class is templated based on the operand and return
-// type. This is an interface class which is independent of those types.
-class IGemmCommon {
-public:
- /* Pass in the pointers to the arrays to be operated on and their
- * strides. This "generic" version uses void *s, the preferred version
- * is the one provided by templated GemmCommon (below) which takes
- * appropriately typed pointers. If B is pretransposed (see below) then
- * the settings for B here are ignored.
- */
- virtual void set_arrays_generic(const void *A, const int lda, const int A_batch_stride, const int A_multi_stride,
- const void *B, const int ldb, /* batches share B */ const int B_multi_stride,
- void *C, const int ldc, const int C_batch_stride, const int C_multi_stride,
- const void *bias, /* no row or batch stride needed */ const int bias_multi_stride) = 0;
-
- /** @returns an ndrange containing ranges of the compute space which can be
- * broken up and parallelised over
- */
- virtual ndrange_t get_window_size() const = 0;
-
- /* The maximum thread count is specified when the GEMM is created. Some
- * implementations need to know how many threads will actually run in
- * order to work properly.
- *
- * In some cases, after creating the GEMM the number of threads needs to
- * be reduced (e.g. there is not enough work to split across all the
- * threads). This method allows the actual number of threads that will
- * run to be set (it must be equal to or lower than the maximum).
- *
- * This has an empty default implementation, as GEMMs which don't care
- * about thread count can safely ignore this.
- */
- virtual void set_nthreads(int) { };
-
- /* Whether this GEMM can be dynamically scheduled or not. */
- virtual bool supports_dynamic_scheduling() const { return false; }
-
- /** Main execute member function
- * @param[in] work_range Specifies the range of work we want computed; the total range is defined by get_window_size()
- * @param[in] thread_locator Where we are within the thread space
- * @param[in] threadid A unique thread id
- */
- virtual void execute(const ndcoord_t& work_range, const ndcoord_t& thread_locator, int threadid) = 0;
-
- /*** Working space interface (optional) ***/
- /* Total number of bytes of temporary working space needed. If zero, it's not necessary to call set_working_space(). */
- virtual size_t get_working_size() const { return 0; }
- /* Provide working space buffer - the void * passed in must remain allocated for the duration of any execute calls. */
- virtual void set_working_space(void *) { };
-
- /*** "Pretransposed" interface (optional) ***/
- /* Is this object set up for pretranspose? If so, pretranspose_B_array() needs to be called before execute(). */
- virtual bool B_is_pretransposed() const { return false; }
- /* Does pretranspose still need to be done? */
- virtual bool B_pretranspose_required() const { return false; }
- /* Total number of bytes of space needed for pretransposed arrays. */
- virtual size_t get_B_pretransposed_array_size() const { return 0; }
- /* Perform pretranspose - arguments are output, input, input row stride and input multi stride. */
- /* The "real" version of this depends on the templated operand type (see below). */
- virtual void pretranspose_B_array_generic(void *, const void *, const int, const int) = 0;
- /* Set pretransposed data - the void * passed in must previously have been passed to pretranspose_B_array() for the same or a similar GEMM. */
- virtual void set_pretransposed_B_data(void *) { }
-
- /*** "Quantized bias" interface (optional) ***/
- /* Set the bias vector for quantized GEMMs */
- virtual void set_quantized_bias(const int32_t *bias, size_t bias_multi_stride)
- {
- UNUSED(bias);
- UNUSED(bias_multi_stride);
- }
-
- // Destructor
- virtual ~IGemmCommon() { }
-};
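
To make the calling sequence concrete, here is a minimal single-threaded driver sketch. It is hedged: `run_single_threaded` is a hypothetical helper, it assumes `set_arrays_generic()` (and any required pretranspose) has already been performed on `gemm`, and it assumes the thread locator collapses to the full space when only one thread runs.

    // Hypothetical driver: run the whole compute space on one thread.
    void run_single_threaded(arm_gemm::IGemmCommon &gemm)
    {
        // The full work range: start 0, extent as reported by the kernel.
        const auto range = arm_gemm::to_ndcoord(arm_gemm::to_window(gemm.get_window_size()));

        gemm.set_nthreads(1);            // only one thread will ever call execute()
        gemm.execute(range, range, 0);   // work range, thread locator, thread id
    }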
-
-/* "Real" GemmCommon class which is templated on the operand and return types.
- *
- * In addition to correctly typed versions of the functions that operate on
- * operand and return data, this class provides a default implementation of
- * 'set_arrays' to capture the provided arguments in protected class
- * members, as essentially any implementation will need these.
- */
-template<typename To, typename Tr>
-class GemmCommon : public IGemmCommon {
-protected:
- const To *_Aptr=nullptr;
- int _lda=0;
- int _A_batch_stride=0;
- int _A_multi_stride=0;
- const To *_Bptr=nullptr;
- int _ldb=0;
- int _B_multi_stride=0;
- Tr *_Cptr=nullptr;
- int _ldc=0;
- int _C_batch_stride=0;
- int _C_multi_stride=0;
- const Tr *_bias=nullptr;
- int _bias_multi_stride=0;
-
-public:
- /* Pass in the pointers to the arrays to be operated on and their
- * strides (templated version with appropriate types). */
- virtual void set_arrays(const To *A, const int lda, const int A_batch_stride, const int A_multi_stride,
- const To *B, const int ldb, /* batches share B */ const int B_multi_stride,
- Tr *C, const int ldc, const int C_batch_stride, const int C_multi_stride,
- const Tr *bias, /* no row or batch stride needed */ const int bias_multi_stride) {
- _Aptr = A;
- _lda = lda;
- _A_batch_stride = A_batch_stride;
- _A_multi_stride = A_multi_stride;
- _Bptr = B;
- _ldb = ldb;
- _B_multi_stride = B_multi_stride;
- _Cptr = C;
- _ldc = ldc;
- _C_batch_stride = C_batch_stride;
- _C_multi_stride = C_multi_stride;
- _bias = bias;
- _bias_multi_stride = bias_multi_stride;
- }
-
- /* Implementation of the void * overload which casts its arguments to the appropriate type. */
- void set_arrays_generic(const void *A, const int lda, const int A_batch_stride, const int A_multi_stride,
- const void *B, const int ldb, /* batches share B */ const int B_multi_stride,
- void *C, const int ldc, const int C_batch_stride, const int C_multi_stride,
- const void *bias, /* no row or batch stride needed */ const int bias_multi_stride) override {
- set_arrays(static_cast<const To *>(A), lda, A_batch_stride, A_multi_stride,
- static_cast<const To *>(B), ldb, B_multi_stride,
- static_cast<Tr *>(C), ldc, C_batch_stride, C_multi_stride,
- static_cast<const Tr *>(bias), bias_multi_stride);
- }
-
- /*** "Pretransposed" interface ***/
-
- /* Perform pretranspose - the void * passed in must remain allocated for the duration of any execute calls. */
- /* Arguments are: output buffer pointer, source pointer, source row stride, source multi stride */
- virtual void pretranspose_B_array(void *, const To *, const int, const int) { };
-
- /* Implementation of the void * overload which casts its arguments to the appropriate type. */
- void pretranspose_B_array_generic(void *out, const void *in, const int row_stride, const int multi_stride) override {
- pretranspose_B_array(out, static_cast<const To *>(in), row_stride, multi_stride);
- }
-};
-
-template<typename GemmKernel>
-inline
-unsigned int get_total_window_size(const GemmKernel& kernel)
-{
- auto window = kernel.get_window_size();
-
- unsigned int total = 1;
- for(unsigned i = 0; i != arm_gemm::ndrange_max; ++i)
- {
- total *= window.get_size(i);
- }
-
- return total;
-}
-
-} // namespace arm_gemm
diff --git a/arm_compute/core/NEON/kernels/convolution/common/alloc.hpp b/arm_compute/core/NEON/kernels/convolution/common/alloc.hpp
deleted file mode 100644
index 799e95d3e6..0000000000
--- a/arm_compute/core/NEON/kernels/convolution/common/alloc.hpp
+++ /dev/null
@@ -1,31 +0,0 @@
-/*
- * Copyright (c) 2017 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#pragma once
-
-#ifdef ALLOC_ALIGN
-#define ALLOCATE(x) aligned_alloc(ALLOC_ALIGN, x)
-#else
-#define ALLOCATE(x) malloc(x)
-#endif
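
A brief usage sketch of the macro (an assumption-laden example: when `ALLOC_ALIGN` is defined, C11 `aligned_alloc` additionally requires the requested size to be a multiple of the alignment):

    #include <cstdlib>

    // Memory from either branch is released with free(), since
    // aligned_alloc() allocations are also free()-compatible.
    float *buffer = static_cast<float *>(ALLOCATE(1024 * sizeof(float)));
    /* ... use buffer ... */
    free(buffer);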
diff --git a/arm_compute/core/NEON/kernels/convolution/common/arm.hpp b/arm_compute/core/NEON/kernels/convolution/common/arm.hpp
deleted file mode 100644
index 90e7828553..0000000000
--- a/arm_compute/core/NEON/kernels/convolution/common/arm.hpp
+++ /dev/null
@@ -1,39 +0,0 @@
-/*
- * Copyright (c) 2017 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-/** Sets the macro __arm_any__ if compiling for AArch32 or AArch64.
- * Includes `arm_neon.h` if compiling for either architecture.
- */
-
-#ifdef __arm__
-#define __arm_any__
-#endif // __arm__
-
-#ifdef __aarch64__
-#define __arm_any__
-#endif // __aarch64__
-
-#ifdef __arm_any__
-#include <arm_neon.h>
-#endif // __arm_any__
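
The macro then serves as a single guard for NEON code paths, as the re-order shims later in this diff do. A hedged sketch using intrinsics available on both AArch32 and AArch64:

    // Sum four floats: NEON when available, scalar fallback otherwise.
    inline float sum4(const float *p)
    {
    #ifdef __arm_any__
        const float32x4_t v = vld1q_f32(p);
        return vgetq_lane_f32(v, 0) + vgetq_lane_f32(v, 1) +
               vgetq_lane_f32(v, 2) + vgetq_lane_f32(v, 3);
    #else
        return p[0] + p[1] + p[2] + p[3];
    #endif
    }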
diff --git a/arm_compute/core/NEON/kernels/convolution/common/convolution.hpp b/arm_compute/core/NEON/kernels/convolution/common/convolution.hpp
deleted file mode 100644
index 2ab2597785..0000000000
--- a/arm_compute/core/NEON/kernels/convolution/common/convolution.hpp
+++ /dev/null
@@ -1,29 +0,0 @@
-/*
- * Copyright (c) 2017 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#pragma once
-
-enum PaddingType {
- PADDING_SAME, PADDING_VALID
-};
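
For reference, these are the usual TensorFlow-style padding semantics; a hedged sketch of the output-size arithmetic they imply (the convolution kernels in this codebase compute this internally):

    // Output spatial extent per padding mode (standard definitions).
    inline int output_size(PaddingType padding, int in, int kernel, int stride)
    {
        if (padding == PADDING_SAME)
        {
            return (in + stride - 1) / stride;      // ceil(in / stride)
        }
        return (in - kernel + stride) / stride;     // ceil((in - kernel + 1) / stride)
    }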
diff --git a/arm_compute/core/NEON/kernels/convolution/common/padding.hpp b/arm_compute/core/NEON/kernels/convolution/common/padding.hpp
deleted file mode 100644
index 97b21e0ff5..0000000000
--- a/arm_compute/core/NEON/kernels/convolution/common/padding.hpp
+++ /dev/null
@@ -1,91 +0,0 @@
-/*
- * Copyright (c) 2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#pragma once
-
-#include <cstddef>
-
-// Utilities for copying tensor tiles and adding/removing padding.
-namespace padding
-{
-
-/** Copy a tile and apply padding to the output copy.
- */
-template <typename T>
-void copy_and_pad_tile(
- unsigned int tile_rows,
- unsigned int tile_cols,
- unsigned int n_channels,
- const T *inptr,
- unsigned int in_row_stride,
- unsigned int in_col_stride,
- T* outptr,
- unsigned int out_row_stride,
- unsigned int out_col_stride,
- unsigned int pad_top,
- unsigned int pad_left,
- unsigned int pad_bottom,
- unsigned int pad_right,
- T pad_value=static_cast<T>(0)
-);
-
-/** Copy a tile and remove padding elements in the output.
- */
-template <unsigned int TileRows, unsigned int TileCols>
-class CopyCropped
-{
- public:
- static void execute(
- size_t size, // Amount of data to copy
- const void *inptr,
- size_t in_row_stride,
- size_t in_col_stride,
- void *outptr,
- size_t out_row_stride,
- size_t out_col_stride,
- unsigned int pad_top,
- unsigned int pad_left,
- unsigned int pad_bottom,
- unsigned int pad_right
- );
-};
-
-template <typename T>
-void crop_and_copy_tile(
- unsigned int tile_rows,
- unsigned int tile_cols,
- unsigned int n_channels,
- const T *inptr,
- unsigned int in_row_stride,
- unsigned int in_col_stride,
- T *outptr,
- unsigned int out_row_stride,
- unsigned int out_col_stride,
- unsigned int crop_top,
- unsigned int crop_left,
- unsigned int crop_bottom,
- unsigned int crop_right
-);
-
-} // namespace padding
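
A hedged usage sketch of `copy_and_pad_tile` (the templates above are only declared here, with definitions in the implementation files; this assumes the output pointer addresses the full padded tile):

    // Pad a 2x2 single-channel tile out to 4x4 with a one-element zero border.
    float src[2 * 2] = {1.f, 2.f, 3.f, 4.f};
    float dst[4 * 4] = {};

    padding::copy_and_pad_tile(
        2, 2, 1,      // tile rows, tile cols, channels
        src, 2, 1,    // input pointer, row stride, column stride
        dst, 4, 1,    // output pointer, row stride, column stride
        1, 1, 1, 1    // pad top, left, bottom, right (pad_value defaults to 0)
    );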
diff --git a/arm_compute/core/NEON/kernels/convolution/common/perf.h b/arm_compute/core/NEON/kernels/convolution/common/perf.h
deleted file mode 100644
index 3c0d36646d..0000000000
--- a/arm_compute/core/NEON/kernels/convolution/common/perf.h
+++ /dev/null
@@ -1,32 +0,0 @@
-/*
- * Copyright (c) 2018 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#pragma once
-
-/* Prototypes from perf.c */
-
-void start_counter(int fd);
-long long get_counter(int fd);
-long long stop_counter(int fd);
-int open_instruction_counter(void);
-int open_cycle_counter(void);
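
Going purely by the prototypes above (the definitions live in `perf.c`), a plausible measurement sketch:

    // Count cycles around a region of interest.
    int fd = open_cycle_counter();
    start_counter(fd);
    /* ... code under test ... */
    long long cycles = stop_counter(fd);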
diff --git a/arm_compute/core/NEON/kernels/convolution/common/qasymm8.hpp b/arm_compute/core/NEON/kernels/convolution/common/qasymm8.hpp
deleted file mode 100644
index 6029cb67e3..0000000000
--- a/arm_compute/core/NEON/kernels/convolution/common/qasymm8.hpp
+++ /dev/null
@@ -1,54 +0,0 @@
-/*
- * Copyright (c) 2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#pragma once
-#include <cstdint>
-
-namespace qasymm8
-{
-
-struct QAsymm8Params
-{
- uint8_t quantize(float value) const;
- float dequantize(uint8_t value) const;
-
- uint8_t offset;
- float scale;
-};
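
`quantize()` and `dequantize()` are only declared here; a sketch of the standard asymmetric mapping such parameters represent (an assumption about the implementation, not taken from this codebase) is q = clamp(round(value / scale) + offset, 0, 255) and value = scale * (q - offset):

    #include <algorithm>
    #include <cmath>
    #include <cstdint>

    // Standard 8-bit asymmetric quantisation maths (sketch).
    inline uint8_t quantize_sketch(float value, float scale, uint8_t offset)
    {
        const int32_t q = static_cast<int32_t>(std::lround(value / scale)) + offset;
        return static_cast<uint8_t>(std::min(255, std::max(0, q)));
    }

    inline float dequantize_sketch(uint8_t q, float scale, uint8_t offset)
    {
        return scale * (static_cast<int32_t>(q) - offset);
    }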
-
-struct QAsymm8RescaleParams
-{
- static QAsymm8RescaleParams make_rescale_params(
- const QAsymm8Params& weight_quant,
- const QAsymm8Params& input_quant,
- const QAsymm8Params& output_quant
- );
-
- QAsymm8RescaleParams(int32_t shift, int32_t multiplier, float rescale);
-
- const int32_t shift, multiplier;
- const float rescale;
-};
-
-} // namespace qasymm8
diff --git a/arm_compute/core/NEON/kernels/convolution/common/qsymm8.hpp b/arm_compute/core/NEON/kernels/convolution/common/qsymm8.hpp
deleted file mode 100644
index 41bfbe4d8a..0000000000
--- a/arm_compute/core/NEON/kernels/convolution/common/qsymm8.hpp
+++ /dev/null
@@ -1,76 +0,0 @@
-/*
- * Copyright (c) 2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#pragma once
-#include <cstdint>
-#include <vector>
-#include "qasymm8.hpp"
-
-
-namespace qsymm8 {
-
-struct QSymm8Params {
- int8_t quantize(float value) const;
- float dequantize(int8_t value) const;
-
- float scale;
-};
-
-struct QSymm8RescaleParams {
- static QSymm8RescaleParams
- make_rescale_params(const QSymm8Params &weight_quant,
- const QSymm8Params &input_quant,
- const QSymm8Params &output_quant);
-
- QSymm8RescaleParams(int32_t shift, int32_t multiplier, float rescale);
-
- const int32_t shift, multiplier;
- const float rescale;
-};
-
-struct QSymm8PerChannelParams {
- int8_t quantize(float value, float scale) const;
- float dequantize(int8_t value, float scale) const;
-
- std::vector<float> scales;
-};
-
-struct QSymm8PerChannelRescaleParams {
- static QSymm8PerChannelRescaleParams
- make_rescale_params(const QSymm8PerChannelParams &weight_quant,
- const QSymm8PerChannelParams &input_quant,
- const QSymm8PerChannelParams &output_quant);
-
- static QSymm8PerChannelRescaleParams
- make_rescale_params(const QSymm8PerChannelParams &weight_quant,
- const qasymm8::QAsymm8Params &input_quant,
- const qasymm8::QAsymm8Params &output_quant);
-
- QSymm8PerChannelRescaleParams(std::vector<int32_t>& shift, std::vector<int32_t>& multiplier, std::vector<float>& rescale);
-
- std::vector<int32_t> shifts, multipliers;
- std::vector<float> rescales;
-};
-
-} // namespace qsymm8
diff --git a/arm_compute/core/NEON/kernels/convolution/common/shims.hpp b/arm_compute/core/NEON/kernels/convolution/common/shims.hpp
deleted file mode 100644
index 243d305e19..0000000000
--- a/arm_compute/core/NEON/kernels/convolution/common/shims.hpp
+++ /dev/null
@@ -1,749 +0,0 @@
-/*
- * Copyright (c) 2017 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#pragma once
-#ifndef DOXYGEN_SKIP_THIS
-#include <cstdint>
-#endif /* DOXYGEN_SKIP_THIS */
-#include "arm.hpp"
-
-namespace reorder {
-/** Re-order a tensor from NCHW format to NHWC.
- *
- * @note The stride parameters are optional and are provided to allow padding in either input or output tensors.
- *
- * @param[in] in Input tensor in NCHW format.
- * @param[out] out Output tensor, to be written in NHWC format.
- * @param n_batches Number of batches in the tensors.
- * @param n_channels Number of channels in the tensors.
- * @param n_rows Height of the tensor.
- * @param n_cols Width of the tensor.
- * @param in_batch_stride Stride over batches in the input tensor. If `0` defaults to `n_channels * in_channel_stride`.
- * @param in_channel_stride Stride over channels in the input tensor. If `0` defaults to `n_rows * in_row_stride`.
- * @param in_row_stride Stride over rows in the input tensor. If `0` defaults to `n_cols`.
- * @param out_batch_stride Stride over batches in the output tensor. If `0` defaults to `n_rows * out_row_stride`.
- * @param out_row_stride Stride over rows in the output tensor. If `0` defaults to `n_cols * out_col_stride`.
- * @param out_col_stride Stride over columns in the output tensor. If `0` defaults to `n_channels`.
- */
-template <typename T>
-inline void nchw_to_nhwc(
- const T* const in,
- T* const out,
- const int n_batches,
- const int n_channels,
- const int n_rows,
- const int n_cols,
- int in_batch_stride=0,
- int in_channel_stride=0,
- int in_row_stride=0,
- int out_batch_stride=0,
- int out_row_stride=0,
- int out_col_stride=0
-);
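
A hedged usage sketch of the declaration above, relying on the documented defaults (passing `0` for every stride derives dense strides from the dimensions):

    #include <vector>

    // Re-order a dense 1x3x4x5 NCHW tensor into NHWC with default strides.
    const int n_batches = 1, n_channels = 3, n_rows = 4, n_cols = 5;
    std::vector<float> nchw(n_batches * n_channels * n_rows * n_cols);
    std::vector<float> nhwc(nchw.size());

    reorder::nchw_to_nhwc(nchw.data(), nhwc.data(), n_batches, n_channels, n_rows, n_cols);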
-
-/** Re-order a tensor from NHWC format to NCHW.
- *
- * @note The stride parameters are optional and are provided to allow padding in either input or output tensors.
- *
- * @param[in] in Input tensor in NHWC format.
- * @param[out] out Output tensor, to be written in NCHW format.
- * @param n_batches Number of batches in the tensors.
- * @param n_rows Height of the tensor.
- * @param n_cols Width of the tensor.
- * @param n_channels Number of channels in the tensors.
- * @param in_batch_stride Stride over batches in the input tensor. If `0` defaults to `n_rows * in_row_stride`.
- * @param in_row_stride Stride over rows in the input tensor. If `0` defaults to `n_cols * in_col_stride`.
- * @param in_col_stride Stride over columns in the input tensor. If `0` defaults to `n_channels`.
- * @param out_batch_stride Stride over batches in the output tensor. If `0` defaults to `n_channels * out_channel_stride`.
- * @param out_channel_stride Stride over channels in the output tensor. If `0` defaults to `n_rows * out_row_stride`.
- * @param out_row_stride Stride over rows in the output tensor. If `0` defaults to `n_cols`.
- */
-template <typename T>
-inline void nhwc_to_nchw(
- const T* const in, // Input data in NHWC form
- T* const out, // Output data in NCHW form
- const int n_batches,
- const int n_rows,
- const int n_cols,
- const int n_channels,
- int in_batch_stride=0,
- int in_row_stride=0,
- int in_col_stride=0,
- int out_batch_stride=0,
- int out_channel_stride=0,
- int out_row_stride=0
-);
-
-/** Re-order a weight tensor from [Output feature map x Input feature map x
- * Height x Width] format to [Height x Width x Input feature map x Output
- * feature map] format.
- */
-template <typename T>
-inline void ofm_ifm_h_w_to_h_w_ifm_ofm(
- const T* const in, // Input in [Output x Input x Height x Width] form
- T* const out, // Output in [Height x Width x Input x Output] form
- const int n_output_feature_maps,
- const int n_input_feature_maps,
- const int n_rows,
- const int n_cols,
- int in_output_feature_map_stride=0,
- int in_input_feature_map_stride=0,
- int in_row_stride=0,
- int out_row_stride=0,
- int out_col_stride=0,
- int out_input_feature_map_stride=0
-);
-
-/** Re-order a weight tensor from [Height x Width x Input feature map x Output
- * feature map] format to [Output feature map x Input feature map x Height x
- * Width] format.
- */
-template <typename T>
-inline void h_w_ifm_ofm_to_ofm_ifm_h_w(
- const T* const in, // Input in [Height x Width x Input x Output] form
- T* const out, // Output in [Output x Input x Height x Width] form
- const int n_rows,
- const int n_cols,
- const int n_input_feature_maps,
- const int n_output_feature_maps,
- int in_row_stride=0,
- int in_col_stride=0,
- int in_input_feature_map_stride=0,
- int out_output_feature_map_stride=0,
- int out_input_feature_map_stride=0,
- int out_row_stride=0
-);
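
The two weight re-orders are inverses of one another: an element at (ofm, ifm, row, col) in OIHW order lands at (row, col, ifm, ofm) in HWIO order. A hedged round-trip sketch with default (dense) strides:

    #include <vector>

    const int n_ofm = 2, n_ifm = 3, n_rows = 3, n_cols = 3;
    std::vector<float> oihw(n_ofm * n_ifm * n_rows * n_cols), hwio(oihw.size()), back(oihw.size());

    reorder::ofm_ifm_h_w_to_h_w_ifm_ofm(oihw.data(), hwio.data(), n_ofm, n_ifm, n_rows, n_cols);
    reorder::h_w_ifm_ofm_to_ofm_ifm_h_w(hwio.data(), back.data(), n_rows, n_cols, n_ifm, n_ofm);
    // 'back' now matches 'oihw' element for element.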
-
-/*****************************************************************************/
-/* 32-bit implementation : NCHW -> NHWC
- */
-template <>
-inline void nchw_to_nhwc(
- const int32_t* const in,
- int32_t* const out,
- const int n_batches,
- const int n_channels,
- const int n_rows,
- const int n_cols,
- int in_batch_stride,
- int in_channel_stride,
- int in_row_stride,
- int out_batch_stride,
- int out_row_stride,
- int out_col_stride
-)
-{
- typedef int32_t T;
-
- // Fill in the stride values
- in_row_stride = (in_row_stride) ? in_row_stride : n_cols;
- in_channel_stride = (in_channel_stride) ? in_channel_stride
- : n_rows * in_row_stride;
- in_batch_stride = (in_batch_stride) ? in_batch_stride
- : n_channels * in_channel_stride;
-
- out_col_stride = (out_col_stride) ? out_col_stride : n_channels;
- out_row_stride = (out_row_stride) ? out_row_stride : n_cols * out_col_stride;
- out_batch_stride = (out_batch_stride) ? out_batch_stride
- : n_rows * out_row_stride;
-
- // Perform the re-ordering
- for (int n = 0; n < n_batches; n++)
- {
- const T* const in_batch = in + n*in_batch_stride;
- T* const out_batch = out + n*out_batch_stride;
-
- for (int i = 0; i < n_rows; i++)
- {
- const T* const in_row = in_batch + i*in_row_stride;
- T* const out_row = out_batch + i*out_row_stride;
-
- int j = 0, j_remaining = n_cols;
-#ifdef __arm_any__
- for (; j_remaining >= 4; j += 4, j_remaining -= 4)
- {
- int c = 0, c_remaining = n_channels;
- for (; c_remaining >= 4; c += 4, c_remaining -= 4)
- {
- // Read 4 channels worth of 4 columns, then zip to produce 4 columns
- // worth of 4 channels.
- int32x4_t channel_pixels[4];
- channel_pixels[0] = vld1q_s32(in_row + (c + 0)*in_channel_stride + j);
- channel_pixels[1] = vld1q_s32(in_row + (c + 1)*in_channel_stride + j);
- channel_pixels[2] = vld1q_s32(in_row + (c + 2)*in_channel_stride + j);
- channel_pixels[3] = vld1q_s32(in_row + (c + 3)*in_channel_stride + j);
-
- const auto zip1 = vzipq_s32(channel_pixels[0], channel_pixels[2]);
- const auto zip2 = vzipq_s32(channel_pixels[1], channel_pixels[3]);
- const auto out_0 = vzipq_s32(zip1.val[0], zip2.val[0]);
- const auto out_1 = vzipq_s32(zip1.val[1], zip2.val[1]);
-
- vst1q_s32(out_row + (j + 0)*out_col_stride + c, out_0.val[0]);
- vst1q_s32(out_row + (j + 1)*out_col_stride + c, out_0.val[1]);
- vst1q_s32(out_row + (j + 2)*out_col_stride + c, out_1.val[0]);
- vst1q_s32(out_row + (j + 3)*out_col_stride + c, out_1.val[1]);
- }
- for (; c_remaining; c++, c_remaining--)
- {
- for (int _j = 0; _j < 4; _j++)
- {
- const T* const in_col = in_row + j + _j;
- T* const out_col = out_row + (j + _j)*out_col_stride;
- const T* const in_channel = in_col + c*in_channel_stride;
- out_col[c] = *(in_channel);
- }
- }
- }
- for (; j_remaining >= 2; j += 2, j_remaining -= 2)
- {
- int c = 0, c_remaining = n_channels;
- for (; c_remaining >= 2; c += 2, c_remaining -= 2)
- {
- // Read 2 channels worth of 2 columns, then zip to produce 2 columns
- // worth of 2 channels.
- int32x2_t channel_pixels[2];
- channel_pixels[0] = vld1_s32(in_row + (c + 0)*in_channel_stride + j);
- channel_pixels[1] = vld1_s32(in_row + (c + 1)*in_channel_stride + j);
-
- const auto output = vzip_s32(channel_pixels[0], channel_pixels[1]);
-
- vst1_s32(out_row + (j + 0)*out_col_stride + c, output.val[0]);
- vst1_s32(out_row + (j + 1)*out_col_stride + c, output.val[1]);
- }
- for (; c_remaining; c++, c_remaining--)
- {
- for (int _j = 0; _j < 2; _j++)
- {
- const T* const in_col = in_row + j + _j;
- T* const out_col = out_row + (j + _j)*out_col_stride;
- const T* const in_channel = in_col + c*in_channel_stride;
- out_col[c] = *(in_channel);
- }
- }
- }
-#endif // __arm_any__
- for (; j_remaining; j++, j_remaining--)
- {
- const T* const in_col = in_row + j;
- T* const out_col = out_row + j*out_col_stride;
-
- for (int c = 0; c < n_channels; c++)
- {
- const T* const in_channel = in_col + c*in_channel_stride;
- out_col[c] = *(in_channel);
- }
- }
- }
- }
-}
-
-template <>
-inline void nchw_to_nhwc(
- const uint32_t* const in,
- uint32_t* const out,
- const int n_batches,
- const int n_channels,
- const int n_rows,
- const int n_cols,
- int in_batch_stride,
- int in_channel_stride,
- int in_row_stride,
- int out_batch_stride,
- int out_row_stride,
- int out_col_stride
-)
-{
- nchw_to_nhwc(
- reinterpret_cast<const int32_t*>(in),
- reinterpret_cast<int32_t*>(out),
- n_batches, n_channels, n_rows, n_cols,
- in_batch_stride, in_channel_stride, in_row_stride,
- out_batch_stride, out_row_stride, out_col_stride
- );
-}
-
-template <>
-inline void nchw_to_nhwc(
- const float* const in,
- float* const out,
- const int n_batches,
- const int n_channels,
- const int n_rows,
- const int n_cols,
- int in_batch_stride,
- int in_channel_stride,
- int in_row_stride,
- int out_batch_stride,
- int out_row_stride,
- int out_col_stride
-)
-{
- nchw_to_nhwc(
- reinterpret_cast<const int32_t*>(in),
- reinterpret_cast<int32_t*>(out),
- n_batches, n_channels, n_rows, n_cols,
- in_batch_stride, in_channel_stride, in_row_stride,
- out_batch_stride, out_row_stride, out_col_stride
- );
-}
-
-/*****************************************************************************/
-/* Generic implementation : NCHW -> NHWC
- */
-template <typename T>
-inline void nchw_to_nhwc(
- const T* const in,
- T* const out,
- const int n_batches,
- const int n_channels,
- const int n_rows,
- const int n_cols,
- int in_batch_stride,
- int in_channel_stride,
- int in_row_stride,
- int out_batch_stride,
- int out_row_stride,
- int out_col_stride
-)
-{
- // Fill in the stride values
- in_row_stride = (in_row_stride) ? in_row_stride : n_cols;
- in_channel_stride = (in_channel_stride) ? in_channel_stride
- : n_rows * in_row_stride;
- in_batch_stride = (in_batch_stride) ? in_batch_stride
- : n_channels * in_channel_stride;
-
- out_col_stride = (out_col_stride) ? out_col_stride : n_channels;
- out_row_stride = (out_row_stride) ? out_row_stride : n_cols * out_col_stride;
- out_batch_stride = (out_batch_stride) ? out_batch_stride
- : n_rows * out_row_stride;
-
- // Perform the re-ordering
- for (int n = 0; n < n_batches; n++)
- {
- const T* const in_batch = in + n*in_batch_stride;
- T* const out_batch = out + n*out_batch_stride;
-
- for (int i = 0; i < n_rows; i++)
- {
- const T* const in_row = in_batch + i*in_row_stride;
- T* const out_row = out_batch + i*out_row_stride;
-
- for (int j = 0; j < n_cols; j++)
- {
- const T* const in_col = in_row + j;
- T* const out_col = out_row + j*out_col_stride;
-
- for (int c = 0; c < n_channels; c++)
- {
- const T* const in_channel = in_col + c*in_channel_stride;
- out_col[c] = *(in_channel);
- }
- }
- }
- }
-}
-
-/*****************************************************************************/
-/* 32-bit implementation : NHWC -> NCHW
- */
-template <>
-inline void nhwc_to_nchw(
- const int32_t* const in, // Input data in NHWC form
- int32_t* const out, // Output data in NCHW form
- const int n_batches,
- const int n_rows,
- const int n_cols,
- const int n_channels,
- int in_batch_stride,
- int in_row_stride,
- int in_col_stride,
- int out_batch_stride,
- int out_channel_stride,
- int out_row_stride
-)
-{
- typedef int32_t T;
-
- // Fill in stride values
- in_col_stride = (in_col_stride) ? in_col_stride : n_channels;
- in_row_stride = (in_row_stride) ? in_row_stride : n_cols * in_col_stride;
- in_batch_stride = (in_batch_stride) ? in_batch_stride
- : n_rows * in_row_stride;
-
- out_row_stride = (out_row_stride) ? out_row_stride : n_cols;
- out_channel_stride = (out_channel_stride) ? out_channel_stride
- : n_rows * out_row_stride;
- out_batch_stride = (out_batch_stride) ? out_batch_stride
- : n_channels * out_channel_stride;
-
- // Perform the re-ordering
- // For every batch
- for (int n = 0; n < n_batches; n++)
- {
- const T* const in_batch = in + n*in_batch_stride;
- T* const out_batch = out + n*out_batch_stride;
-
- // For every row
- for (int i = 0; i < n_rows; i++)
- {
- const T* const in_i = in_batch + i*in_row_stride;
- T* const out_i = out_batch + i*out_row_stride;
-
- // For every column, beginning with chunks of 4
- int j = 0, j_remaining = n_cols;
-#ifdef __arm_any__
- for (; j_remaining >= 4; j += 4, j_remaining -=4)
- {
- // For every channel, beginning with chunks of 4
- int c = 0, c_remaining = n_channels;
- for (; c_remaining >= 4; c += 4, c_remaining -= 4)
- {
- // Read 4 columns worth of 4 channels then zip to produce 4 channels
- // worth of 4 columns.
- int32x4_t pixel_channels[4];
- pixel_channels[0] = vld1q_s32(in_i + (j + 0)*in_col_stride + c);
- pixel_channels[1] = vld1q_s32(in_i + (j + 1)*in_col_stride + c);
- pixel_channels[2] = vld1q_s32(in_i + (j + 2)*in_col_stride + c);
- pixel_channels[3] = vld1q_s32(in_i + (j + 3)*in_col_stride + c);
-
- const auto zip1 = vzipq_s32(pixel_channels[0], pixel_channels[2]);
- const auto zip2 = vzipq_s32(pixel_channels[1], pixel_channels[3]);
- const auto out_0 = vzipq_s32(zip1.val[0], zip2.val[0]);
- const auto out_1 = vzipq_s32(zip1.val[1], zip2.val[1]);
-
- vst1q_s32(out_i + j + (c + 0)*out_channel_stride, out_0.val[0]);
- vst1q_s32(out_i + j + (c + 1)*out_channel_stride, out_0.val[1]);
- vst1q_s32(out_i + j + (c + 2)*out_channel_stride, out_1.val[0]);
- vst1q_s32(out_i + j + (c + 3)*out_channel_stride, out_1.val[1]);
- }
- for (; c_remaining; c++, c_remaining--)
- {
- for (int _j = 0; _j < 4; _j++)
- {
- const T* const in_j = in_i + (j + _j)*in_col_stride;
- T* const out_j = out_i + (j + _j);
-
- const T* const in_channel = in_j + c;
- T* const out_channel = out_j + c*out_channel_stride;
- *(out_channel) = *(in_channel);
- }
- }
- }
- for (; j_remaining >= 2; j += 2, j_remaining -=2)
- {
- int c = 0, c_remaining = n_channels;
- for (; c_remaining >= 2; c += 2, c_remaining -= 2)
- {
- // Read 2 columns worth of 2 channels then zip to produce 2 channels
- // worth of 2 columns.
- int32x2_t pixel_channels[2];
- pixel_channels[0] = vld1_s32(in_i + (j + 0)*in_col_stride + c);
- pixel_channels[1] = vld1_s32(in_i + (j + 1)*in_col_stride + c);
-
- const auto output = vzip_s32(pixel_channels[0], pixel_channels[1]);
-
- vst1_s32(out_i + j + (c + 0)*out_channel_stride, output.val[0]);
- vst1_s32(out_i + j + (c + 1)*out_channel_stride, output.val[1]);
- }
- for (; c_remaining; c++, c_remaining--)
- {
- for (int _j = 0; _j < 2; _j++)
- {
- const T* const in_j = in_i + (j + _j)*in_col_stride;
- T* const out_j = out_i + (j + _j);
-
- const T* const in_channel = in_j + c;
- T* const out_channel = out_j + c*out_channel_stride;
- *(out_channel) = *(in_channel);
- }
- }
- }
-#endif // __arm_any__
- for (; j_remaining; j++, j_remaining--)
- {
- const T* const in_j = in_i + j*in_col_stride;
- T* const out_j = out_i + j;
-
- // For every channel
- for (int c = 0; c < n_channels; c++)
- {
- const T* const in_channel = in_j + c;
- T* const out_channel = out_j + c*out_channel_stride;
- *(out_channel) = *(in_channel);
- }
- }
- }
- }
-}
-
-template <>
-inline void nhwc_to_nchw(
- const uint32_t* const in, // Input data in NHWC form
- uint32_t* const out, // Output data in NCHW form
- const int n_batches,
- const int n_rows,
- const int n_cols,
- const int n_channels,
- int in_batch_stride,
- int in_row_stride,
- int in_col_stride,
- int out_batch_stride,
- int out_channel_stride,
- int out_row_stride
-)
-{
- // Redirect to generic 32-bit implementation
- nhwc_to_nchw(
- reinterpret_cast<const int32_t*>(in),
- reinterpret_cast<int32_t*>(out),
- n_batches, n_rows, n_cols, n_channels,
- in_batch_stride, in_row_stride, in_col_stride,
- out_batch_stride, out_channel_stride, out_row_stride
- );
-}
-
-template <>
-inline void nhwc_to_nchw(
- const float* const in, // Input data in NHWC form
- float* const out, // Output data in NCHW form
- const int n_batches,
- const int n_rows,
- const int n_cols,
- const int n_channels,
- int in_batch_stride,
- int in_row_stride,
- int in_col_stride,
- int out_batch_stride,
- int out_channel_stride,
- int out_row_stride
-)
-{
- // Redirect to generic 32-bit implementation
- nhwc_to_nchw(
- reinterpret_cast<const int32_t*>(in),
- reinterpret_cast<int32_t*>(out),
- n_batches, n_rows, n_cols, n_channels,
- in_batch_stride, in_row_stride, in_col_stride,
- out_batch_stride, out_channel_stride, out_row_stride
- );
-}
-
-/*****************************************************************************/
-/* Generic implementation : NHWC -> NCHW
- */
-template <typename T>
-inline void nhwc_to_nchw(
- const T* const in, // Input data in NHWC form
- T* const out, // Output data in NCHW form
- const int n_batches,
- const int n_rows,
- const int n_cols,
- const int n_channels,
- int in_batch_stride,
- int in_row_stride,
- int in_col_stride,
- int out_batch_stride,
- int out_channel_stride,
- int out_row_stride
-)
-{
- // Fill in stride values
- in_col_stride = (in_col_stride) ? in_col_stride : n_channels;
- in_row_stride = (in_row_stride) ? in_row_stride : n_cols * in_col_stride;
- in_batch_stride = (in_batch_stride) ? in_batch_stride
- : n_rows * in_row_stride;
-
- out_row_stride = (out_row_stride) ? out_row_stride : n_cols;
- out_channel_stride = (out_channel_stride) ? out_channel_stride
- : n_rows * out_row_stride;
- out_batch_stride = (out_batch_stride) ? out_batch_stride
- : n_channels * out_channel_stride;
-
- // Perform the re-ordering
- // For every batch
- for (int n = 0; n < n_batches; n++)
- {
- const T* const in_batch = in + n*in_batch_stride;
- T* const out_batch = out + n*out_batch_stride;
-
- // For every row
- for (int i = 0; i < n_rows; i++)
- {
- const T* const in_i = in_batch + i*in_row_stride;
- T* const out_i = out_batch + i*out_row_stride;
-
- // For every column
- for (int j = 0; j < n_cols; j++)
- {
- const T* const in_j = in_i + j*in_col_stride;
- T* const out_j = out_i + j;
-
- // For every channel
- for (int c = 0; c < n_channels; c++)
- {
- const T* const in_channel = in_j + c;
- T* const out_channel = out_j + c*out_channel_stride;
- *(out_channel) = *(in_channel);
- }
- }
- }
- }
-}
-
-/*****************************************************************************/
-/* Generic weight re-order implementation.
- */
-template <typename T>
-inline void ofm_ifm_h_w_to_h_w_ifm_ofm(
- const T* const in, // Input in [Output x Input x Height x Width] form
- T* const out, // Output in [Height x Width x Input x Output] form
- const int n_output_feature_maps,
- const int n_input_feature_maps,
- const int n_rows,
- const int n_cols,
- int in_output_feature_map_stride,
- int in_input_feature_map_stride,
- int in_row_stride,
- int out_row_stride,
- int out_col_stride,
- int out_input_feature_map_stride
-)
-{
- // Fill in stride values
- in_row_stride = (in_row_stride)
- ? in_row_stride
- : n_cols;
- in_input_feature_map_stride = (in_input_feature_map_stride)
- ? in_input_feature_map_stride
- : n_rows * in_row_stride;
- in_output_feature_map_stride = (in_output_feature_map_stride)
- ? in_output_feature_map_stride
- : n_input_feature_maps * in_input_feature_map_stride;
-
- out_input_feature_map_stride = (out_input_feature_map_stride)
- ? out_input_feature_map_stride
- : n_output_feature_maps;
- out_col_stride = (out_col_stride)
- ? out_col_stride
- : n_input_feature_maps * out_input_feature_map_stride;
- out_row_stride = (out_row_stride)
- ? out_row_stride
- : n_cols * out_col_stride;
-
- // Perform the re-ordering
- for (int i = 0; i < n_rows; i++)
- {
- const T* const in_row = in + i * in_row_stride;
- T* out_row = out + i * out_row_stride;
-
- for (int j = 0; j < n_cols; j++)
- {
- const T* const in_col = in_row + j;
- T* const out_col = out_row + j * out_col_stride;
-
- for (int ifm = 0; ifm < n_input_feature_maps; ifm++)
- {
- const T* const in_ifm = in_col + ifm * in_input_feature_map_stride;
- T* const out_ifm = out_col + ifm * out_input_feature_map_stride;
-
- for (int ofm = 0; ofm < n_output_feature_maps; ofm++)
- {
- const T* const in_ofm = in_ifm + ofm * in_output_feature_map_stride;
- T* const out_ofm = out_ifm + ofm;
- *(out_ofm) = *(in_ofm);
- }
- }
- }
- }
-}
-
-/*****************************************************************************/
-/* Generic weight re-order implementation.
- */
-template <typename T>
-inline void h_w_ifm_ofm_to_ofm_ifm_h_w(
- const T* const in, // Input in [Height x Width x Input x Output] form
- T* const out, // Output in [Output x Input x Height x Width] form
- const int n_rows,
- const int n_cols,
- const int n_input_feature_maps,
- const int n_output_feature_maps,
- int in_row_stride,
- int in_col_stride,
- int in_input_feature_map_stride,
- int out_output_feature_map_stride,
- int out_input_feature_map_stride,
- int out_row_stride
-)
-{
- // Fill in the stride values
- in_input_feature_map_stride = (in_input_feature_map_stride)
- ? in_input_feature_map_stride
- : n_output_feature_maps;
- in_col_stride = (in_col_stride)
- ? in_col_stride
- : n_input_feature_maps * in_input_feature_map_stride;
- in_row_stride = (in_row_stride)
- ? in_row_stride
- : n_cols * in_col_stride;
-
- out_row_stride = (out_row_stride)
- ? out_row_stride
- : n_cols;
- out_input_feature_map_stride = (out_input_feature_map_stride)
- ? out_input_feature_map_stride
- : n_rows * out_row_stride;
- out_output_feature_map_stride = (out_output_feature_map_stride)
- ? out_output_feature_map_stride
- : n_input_feature_maps * out_input_feature_map_stride;
-
- // Perform the re-ordering
- for (int i = 0; i < n_rows; i++)
- {
- const T* const in_row = in + i * in_row_stride;
- T* const out_row = out + i * out_row_stride;
-
- for (int j = 0; j < n_cols; j++)
- {
- const T* const in_col = in_row + j * in_col_stride;
- T* const out_col = out_row + j;
-
- for (int ifm = 0; ifm < n_input_feature_maps; ifm++)
- {
- const T* const in_ifm = in_col + ifm * in_input_feature_map_stride;
- T* const out_ifm = out_col + ifm * out_input_feature_map_stride;
-
- for (int ofm = 0; ofm < n_output_feature_maps; ofm++)
- {
- const T* const in_ofm = in_ifm + ofm;
- T* const out_ofm = out_ifm + ofm * out_output_feature_map_stride;
- *(out_ofm) = *(in_ofm);
- }
- }
- }
- }
-}
-
-} // namespace reorder
diff --git a/arm_compute/core/NEON/kernels/convolution/common/tensor.hpp b/arm_compute/core/NEON/kernels/convolution/common/tensor.hpp
deleted file mode 100644
index ad0a677a8f..0000000000
--- a/arm_compute/core/NEON/kernels/convolution/common/tensor.hpp
+++ /dev/null
@@ -1,178 +0,0 @@
-/*
- * Copyright (c) 2017-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#pragma once
-#include <cstdlib>
-#include <random>
-
-#include "alloc.hpp"
-
-enum TensorOrder
-{
- NHWC, ///< [Batch x Height x Width x Channels]
- NCHW, ///< [Batch x Channels x Height x Width]
-};
-
-struct Tensor4DShape
-{
- int n_batches, n_rows, n_cols, n_channels;
- TensorOrder ordering;
-
- // Create a new tensor with the default (NHWC) ordering
- inline Tensor4DShape(
- const int n_batches,
- const int n_rows,
- const int n_cols,
- const int n_channels,
- const TensorOrder ordering=NHWC
- ) : n_batches(n_batches),
- n_rows(n_rows),
- n_cols(n_cols),
- n_channels(n_channels),
- ordering(ordering)
- {
- }
-
- inline int index(const int n, const int i, const int j, const int c) const
- {
- if (this->ordering == NHWC)
- {
- return ((n*this->n_rows + i)*this->n_cols + j)*this->n_channels + c;
- }
- else // NCHW
- {
- return ((n*this->n_channels + c)*this->n_rows + i)*this->n_cols + j;
- }
- }
-
- inline int size() const
- {
- return n_batches * n_rows * n_cols * n_channels;
- }
-
- inline bool TestEq(const Tensor4DShape& other) const
- {
- return (n_batches == other.n_batches &&
- n_rows == other.n_rows &&
- n_cols == other.n_cols &&
- n_channels == other.n_channels);
- }
-};
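
With the default NHWC ordering, `index()` unrolls batch-major, then rows, columns, and channels; a quick worked example:

    Tensor4DShape shape(2, 4, 5, 3); // batches, rows, cols, channels (NHWC)

    // ((n*n_rows + i)*n_cols + j)*n_channels + c
    const int idx = shape.index(1, 2, 3, 1); // ((1*4 + 2)*5 + 3)*3 + 1 == 100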
-
-
-enum WeightOrder
-{
- HWIO, ///< [Height x Width x Input channels x Output channels]
- OIHW, ///< [Output channels x Input channels x Height x Width]
-};
-
-struct KernelShape
-{
- int n_output_channels, n_rows, n_cols, n_input_channels;
- WeightOrder ordering;
-
- inline KernelShape(
- const int n_output_channels,
- const int n_rows,
- const int n_cols,
- const int n_input_channels,
- const WeightOrder ordering=HWIO
- ) : n_output_channels(n_output_channels),
- n_rows(n_rows),
- n_cols(n_cols),
- n_input_channels(n_input_channels),
- ordering(ordering)
- {
- }
-
- inline int index(int oc, int i, int j, int ic) const
- {
- if (this->ordering == HWIO)
- {
- return ((i*this->n_cols + j)*this->n_input_channels + ic)*this->n_output_channels + oc;
- }
- else // OIHW
- {
- return ((oc*this->n_input_channels + ic)*this->n_rows + i)*this->n_cols + j;
- }
- }
-
- inline int size(void) const
- {
- return n_output_channels * n_rows * n_cols * n_input_channels;
- }
-};
-
-
-template <typename ShapeT, typename T>
-class Tensor4D final
-{
- public:
- Tensor4D(ShapeT shape) :
- shape(shape),
- _data(reinterpret_cast<T*>(ALLOCATE(size_bytes())))
- {
- Clear();
- }
-
- Tensor4D(const Tensor4D<ShapeT, T>&) = delete;
- Tensor4D operator=(const Tensor4D<ShapeT, T>&) = delete;
-
- ~Tensor4D() {
- free(_data);
- }
-
- inline T* ptr() const {
- return _data;
- }
-
- inline size_t size_bytes() const {
- return shape.size() * sizeof(T);
- }
-
- /* Extract an element of the tensor.
- *
- * If the shape is a Tensor4DShape then the index is given as batch, row,
- * column and channel. If the shape is a KernelShape then the index is
- * given as output channel, row, column and input channel.
- */
- inline T& element(const int a, const int b, const int c, const int d) const
- {
- return _data[shape.index(a, b, c, d)];
- }
-
- inline void Clear() {
- Fill(static_cast<T>(0));
- }
-
- inline void Fill(T val) {
- for (int i = 0; i < shape.size(); i++)
- _data[i] = val;
- }
-
- const ShapeT shape;
-
- private:
- T* const _data;
-};
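
Putting the pieces together, a short usage sketch of the container (allocation goes through the `ALLOCATE` macro from `alloc.hpp`, and the buffer is zeroed on construction):

    // A 1x3x3x2 NHWC tensor, zero-initialised by the constructor.
    Tensor4D<Tensor4DShape, float> tensor(Tensor4DShape(1, 3, 3, 2));

    tensor.Fill(1.0f);                  // set every element
    tensor.element(0, 1, 1, 0) = 5.0f;  // batch 0, row 1, col 1, channel 0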
diff --git a/arm_compute/core/NEON/kernels/convolution/common/tensor_utils.hpp b/arm_compute/core/NEON/kernels/convolution/common/tensor_utils.hpp
deleted file mode 100644
index 0c234431b1..0000000000
--- a/arm_compute/core/NEON/kernels/convolution/common/tensor_utils.hpp
+++ /dev/null
@@ -1,46 +0,0 @@
-/*
- * Copyright (c) 2017 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#pragma once
-#include "tensor.hpp"
-
-// Methods to print tensors and weights
-void PrintTensor(const Tensor4D<Tensor4DShape, float>& tensor);
-void PrintWeights(const Tensor4D<KernelShape, float>& weights);
-
-// Test the equivalence of two tensors
-// Counts the instances that |a - b|/|a| > max_err
-bool CmpTensors(
- const Tensor4D<Tensor4DShape, float>& a,
- const Tensor4D<Tensor4DShape, float>& b,
- const float max_err=0.0f
-);
-
-// Fill the tensor with a test pattern
-void TestPattern(Tensor4D<Tensor4DShape, float>& tensor);
-void TestPattern(Tensor4D<KernelShape, float>& weights);
-
-// Fill the tensor with random values
-void Randomise(Tensor4D<Tensor4DShape, float>& tensor, const int seed=0);
-void Randomise(Tensor4D<KernelShape, float>& weights, const int seed=0);
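
A sketch of the comparison criterion that CmpTensors documents (an illustration, not the library's implementation):

#include <cmath>

// Returns true when the relative error of b against the reference a exceeds
// the tolerance, i.e. |a - b|/|a| > max_err.
bool exceeds_relative_error(float a, float b, float max_err)
{
  // Guard the a == 0 case explicitly (an assumption; the deleted header
  // does not spell out its handling of zero reference values).
  if (a == 0.0f)
  {
    return b != 0.0f;
  }
  return std::fabs(a - b) / std::fabs(a) > max_err;
}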
diff --git a/arm_compute/core/NEON/kernels/convolution/common/utils.hpp b/arm_compute/core/NEON/kernels/convolution/common/utils.hpp
deleted file mode 100644
index 99b2282f7e..0000000000
--- a/arm_compute/core/NEON/kernels/convolution/common/utils.hpp
+++ /dev/null
@@ -1,60 +0,0 @@
-/*
- * Copyright (c) 2017-2018 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#pragma once
-
-#include <limits>
-
-void PrintMatrix(const float *const m, const int M, const int N, const int row_stride);
-
-constexpr inline int iceildiv(const int a, const int b)
-{
- return (a + b - 1) / b;
-}
-
-template <typename T>
-inline T roundup(const T a, const T b)
-{
- return b * iceildiv(a, b);
-}
-
-template<typename T>
-struct TypeBounds
-{
- static constexpr T lower() noexcept { return std::numeric_limits<T>::has_infinity
- ? -std::numeric_limits<T>::infinity()
- : std::numeric_limits<T>::lowest(); };
- static constexpr T upper() noexcept { return std::numeric_limits<T>::has_infinity
- ? std::numeric_limits<T>::infinity()
- : std::numeric_limits<T>::max(); };
-};
-
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-template<>
-struct TypeBounds<__fp16>
-{
- static constexpr __fp16 lower() noexcept { return -std::numeric_limits<float>::infinity(); };
- static constexpr __fp16 upper() noexcept { return std::numeric_limits<float>::infinity(); }
-};
-#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
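
A short usage sketch for the helpers above (iceildiv() is copied verbatim and roundup() is marked constexpr here so the example is self-contained):

#include <cassert>

constexpr int iceildiv(const int a, const int b)
{
  return (a + b - 1) / b;
}

template <typename T>
constexpr T roundup(const T a, const T b)
{
  return b * iceildiv(a, b);
}

int main()
{
  // Processing 10 rows in tiles of 4 requires ceil(10/4) = 3 tiles, i.e. a
  // buffer rounded up to 12 rows.
  assert(iceildiv(10, 4) == 3);
  assert(roundup(10, 4) == 12);
  // Exact multiples are left unchanged.
  assert(iceildiv(8, 4) == 2);
  assert(roundup(8, 4) == 8);
  return 0;
}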
diff --git a/arm_compute/core/NEON/kernels/convolution/depthwise/depthwise.hpp b/arm_compute/core/NEON/kernels/convolution/depthwise/depthwise.hpp
deleted file mode 100644
index a4a833d90a..0000000000
--- a/arm_compute/core/NEON/kernels/convolution/depthwise/depthwise.hpp
+++ /dev/null
@@ -1,551 +0,0 @@
-/*
- * Copyright (c) 2018-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#pragma once
-
-#include <arm_neon.h>
-#include "activation.hpp"
-#include "padding.hpp"
-
-namespace depthwise
-{
-
-namespace nck = neon_convolution_kernels;
-
-class IDepthwiseConvolution
-{
- public:
- virtual ~IDepthwiseConvolution() = default;
-
- virtual int output_size(
- int dim_size,
- unsigned int padding_before,
- unsigned int padding_after
- ) const = 0;
-
- /* Set input tensor and stride. */
- virtual void set_input(const void *inptr) = 0;
- virtual void set_input(const void *inptr, int column_stride) = 0;
- virtual void set_input(const void *inptr, int row_stride, int column_stride) = 0;
- virtual void set_input(const void *inptr, int batch_stride, int row_stride, int column_stride) = 0;
-
- /* Set output tensor and stride. */
- virtual void set_output(void *outptr) = 0;
- virtual void set_output(void *outptr, int column_stride) = 0;
- virtual void set_output(void *outptr, int row_stride, int column_stride) = 0;
- virtual void set_output(void *outptr, int batch_stride, int row_stride, int column_stride) = 0;
-
-  /* Weights and biases are re-ordered to improve memory access patterns. Use
-   * these methods to determine the size of the re-pack buffer, and to set
-   * the buffer's address (which implicitly reorders the weights and biases
-   * into it).
-   */
- virtual size_t get_packed_params_size(void) const = 0;
- virtual void set_packed_params_buffer(void *) = 0;
-
- virtual void pack_params(const void *weights, const void *biases=nullptr) const = 0;
- virtual void pack_params(void *buffer, const void *weights, const void *biases=nullptr) const = 0;
- virtual void pack_params(
- void *buffer,
- const void* weights,
- unsigned int weight_row_stride,
- unsigned int weight_col_stride,
- const void *biases=nullptr
- ) const = 0;
-
-  /* Working space is used to pad tensors on the fly. Before running any
-   * inference, check the amount of space required, allocate it, and pass a
-   * pointer to the convolution engine.
-   */
- virtual size_t get_working_space_size(unsigned int nthreads=1) const = 0;
- virtual void set_working_space(void *) = 0;
-
- virtual unsigned int get_window(void) const = 0;
- virtual void run(
- unsigned int start,
- unsigned int stop,
- unsigned int threadid=0
- ) = 0;
-};
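
The interface above implies a fixed call protocol: pack the parameters, provide working space, then split the window of work between threads. A hedged sketch of one such driver follows (run_convolution, its sequential scheduling, and the ordering of pack_params() and set_packed_params_buffer() are illustrative; they are not part of the library):

#include <vector>

void run_convolution(depthwise::IDepthwiseConvolution &conv,
                     const void *weights, const void *biases,
                     const void *input, void *output,
                     unsigned int n_threads)
{
  // Re-order the weights and biases into a caller-owned buffer.
  std::vector<char> packed(conv.get_packed_params_size());
  conv.pack_params(packed.data(), weights, biases);
  conv.set_packed_params_buffer(packed.data());

  // Working space is used to pad tensors on the fly.
  std::vector<char> scratch(conv.get_working_space_size(n_threads));
  conv.set_working_space(scratch.data());

  conv.set_input(input);
  conv.set_output(output);

  // Divide the window of work evenly between threads (sequentially here).
  const unsigned int window = conv.get_window();
  for (unsigned int t = 0; t < n_threads; t++)
  {
    const unsigned int start = t * window / n_threads;
    const unsigned int stop = (t + 1) * window / n_threads;
    conv.run(start, stop, t);
  }
}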
-
-template <
- unsigned int OutputTileRows, unsigned int OutputTileCols,
- unsigned int KernelRows, unsigned int KernelCols,
- unsigned int StrideRows, unsigned int StrideCols,
- typename TIn, typename TBias, typename TOut,
- typename Derived
->
-class DepthwiseConvolutionBase : public IDepthwiseConvolution
-{
- public:
- // Information about the specific convolution instance
- using InputType = TIn;
- using BiasType = TBias;
- using OutputType = TOut;
- static constexpr int output_tile_rows = OutputTileRows;
- static constexpr int output_tile_cols = OutputTileCols;
- static constexpr int kernel_rows = KernelRows;
- static constexpr int kernel_cols = KernelCols;
- static constexpr int stride_rows = StrideRows;
- static constexpr int stride_cols = StrideCols;
- static constexpr int inner_tile_rows = stride_rows * (output_tile_rows - 1) + kernel_rows;
- static constexpr int inner_tile_cols = stride_cols * (output_tile_cols - 1) + kernel_cols;
-
-    /** Create a new depthwise convolution engine.
-     *
-     * @param[in] n_batches Number of batches in the input and output tensors.
-     * @param[in] n_input_rows Number of rows in the input tensor.
-     * @param[in] n_input_cols Number of columns in the input tensor.
-     * @param[in] n_channels Number of channels in the input and output tensors.
-     * @param[in] activation Activation function to apply to the output.
-     * @param[in] padding_top Padding to apply to the top of the input.
-     * @param[in] padding_left Padding to apply to the left of the input.
-     * @param[in] padding_bottom Padding to apply to the bottom of the input.
-     * @param[in] padding_right Padding to apply to the right of the input.
-     */
- DepthwiseConvolutionBase(
- int n_batches, int n_input_rows, int n_input_cols, int n_channels,
- nck::ActivationFunction activation,
- unsigned int padding_top,
- unsigned int padding_left,
- unsigned int padding_bottom,
- unsigned int padding_right
- );
-
-    /** Create a new depthwise convolution engine with a specified output size.
-     *
-     * @param[in] n_batches Number of batches in the input and output tensors.
-     * @param[in] n_input_rows Number of rows in the input tensor.
-     * @param[in] n_input_cols Number of columns in the input tensor.
-     * @param[in] n_channels Number of channels in the input and output tensors.
-     * @param[in] n_output_rows Number of rows in the output tensor.
-     * @param[in] n_output_cols Number of columns in the output tensor.
-     * @param[in] activation Activation function to apply to the output.
-     * @param[in] padding_top Padding to apply to the top of the input.
-     * @param[in] padding_left Padding to apply to the left of the input.
-     * @param[in] padding_bottom Padding to apply to the bottom of the input.
-     * @param[in] padding_right Padding to apply to the right of the input.
-     */
- DepthwiseConvolutionBase(
- int n_batches, int n_input_rows, int n_input_cols, int n_channels,
- int n_output_rows, int n_output_cols,
- nck::ActivationFunction activation,
- unsigned int padding_top,
- unsigned int padding_left,
- unsigned int padding_bottom,
- unsigned int padding_right
- );
-
- // Cannot copy or move a DepthwiseConvolution.
- DepthwiseConvolutionBase(DepthwiseConvolutionBase&) = delete;
- DepthwiseConvolutionBase operator=(DepthwiseConvolutionBase&) = delete;
-
- /* Set input tensor and stride. */
- void set_input(const void *inptr) override;
- void set_input(const void *inptr, int column_stride) override;
- void set_input(const void *inptr, int row_stride, int column_stride) override;
- void set_input(const void *inptr, int batch_stride, int row_stride, int column_stride) override;
-
- /* Set output tensor and stride. */
- void set_output(void *outptr) override;
- void set_output(void *outptr, int column_stride) override;
- void set_output(void *outptr, int row_stride, int column_stride) override;
- void set_output(void *outptr, int batch_stride, int row_stride, int column_stride) override;
-
-    /** Get the number of output rows/columns.
-     *
-     * @param[in] dim_size Number of elements in the dimension (rows/columns).
-     * @param[in] padding_before Padding applied before the dimension (top or left).
-     * @param[in] padding_after Padding applied after the dimension (bottom or right).
-     */
- static int get_output_size(
- int dim_size, unsigned int padding_before, unsigned int padding_after
- );
-
- int output_size(
- int dim_size, unsigned int padding_before, unsigned int padding_after
- ) const override;
-
- /* Determine how much memory is required to store the packed weights and
- * biases.
- */
- size_t get_packed_params_size(void) const override;
-
- /* Set the buffer for the packed weights and biases, and perform the
- * packing.
- */
- void set_packed_params_buffer(void *buffer) override;
-
- void pack_params(const void *weights, const void *biases=nullptr) const override;
-
- void pack_params(
- void *buffer,
- const void *weights,
- const void *biases=nullptr
- ) const override;
-
- void pack_params(
- void *buffer,
- const void *weights,
- unsigned int weight_row_stride,
- unsigned int weight_col_stride,
- const void *biases=nullptr
- ) const override;
-
-    /** Query the amount of working space required.
-     *
-     * @param[in] n_threads The largest number of threads which will be used
-     *            to execute the kernel.
-     */
- size_t get_working_space_size(unsigned int n_threads=1) const override;
-
- /** Set the working space buffer.
- */
- void set_working_space(void *buffer) override;
-
- /** Get the window of work to be performed by an instance of the operator.
- */
- unsigned int get_window(void) const override;
-
-    /** Perform a portion of the work associated with the operator.
-     *
-     * Will perform the window of work described by [start, stop).
-     *
-     * @param[in] start Start of the window of work to perform.
-     * @param[in] stop End of the window of work to perform.
-     * @param[in] threadid ID of the thread performing the work.
-     */
- void run(
- unsigned int start,
- unsigned int stop,
- unsigned int threadid=0
- ) override;
-
- protected:
- /** Get the value to use to pad the tensor.
- */
- TIn _input_padding_value(void) const;
-
- /** Implementation of the parameter packing.
- */
- void _pack_params(
- void *buffer,
- const void *weights,
- unsigned int weight_row_stride,
- unsigned int weight_col_stride,
- const void *biases=nullptr
- ) const;
-
- /** Process a tile-row of the tensors.
- */
- void process_tile_row(
- unsigned int threadid,
- int n_channels,
- const void* packed_params,
- const InputType* inptr,
- OutputType* outptr,
- int row_pad_in_top,
- int row_pad_in_left,
- int row_pad_in_bottom,
- int row_pad_out_bottom,
- int n_tiles,
- int n_input_cols,
- int n_output_cols
- );
-
- /** Process a single tile of the tensor.
- *
- * This method will apply input/output padding (if required) and call the
- * depthwise tile implementation.
- */
- void process_tile(
- unsigned int threadid,
- int n_channels,
- const void* packed_params,
- const InputType* inptr,
- OutputType* outptr,
- int pad_in_top,
- int pad_in_left,
- int pad_in_bottom,
- int pad_in_right,
- int pad_out_bottom,
- int pad_out_right
- );
-
- /** Perform depthwise convolution on a single tile.
- */
- template <nck::ActivationFunction Activation>
- void execute_tile(
- int n_channels,
- const void* packed_params,
- const InputType* inptr,
- unsigned int in_row_stride,
- unsigned int in_col_stride,
- OutputType* outptr,
- unsigned int out_row_stride,
- unsigned int out_col_stride
- );
-
- template <nck::ActivationFunction Activation>
- void execute_tile(
- int n_channels,
- const void* packed_params,
- const InputType* inptrs[inner_tile_rows][inner_tile_cols],
- OutputType* outptrs[output_tile_rows][output_tile_cols]
- );
-
- int n_channels(void) const;
-
- private:
- // Member variables of instances of a convolution engine.
- const InputType* _input;
- OutputType* _output;
- void* _packed_parameters;
- void* _working_space; // Per-thread working space
- const int _n_batches, _n_input_rows, _n_input_cols, _n_channels,
- _n_output_rows, _n_output_cols, _n_tile_rows, _n_tile_cols;
- const unsigned int _padding_top, _padding_left, _padding_bottom, _padding_right;
- const nck::ActivationFunction _activation;
-
- // Stride information for a convolution instance
- int _input_col_stride, _input_row_stride, _input_batch_stride;
- int _output_col_stride, _output_row_stride, _output_batch_stride;
-
- // Methods for getting access to working space
- size_t _get_input_working_space_size(void) const;
- size_t _get_output_working_space_size(void) const;
-
- void *_get_input_working_space(unsigned int threadid) const;
- void *_get_output_working_space(unsigned int threadid) const;
-};
-
-
-template <
- unsigned int OutputTileRows, unsigned int OutputTileCols,
- unsigned int KernelRows, unsigned int KernelCols,
- unsigned int StrideRows, unsigned int StrideCols,
- typename TIn, typename TBias, typename TOut
->
-class DepthwiseConvolution : public DepthwiseConvolutionBase<
- OutputTileRows, OutputTileCols,
- KernelRows, KernelCols,
- StrideRows, StrideCols,
- TIn, TBias, TOut,
- DepthwiseConvolution<
- OutputTileRows, OutputTileCols,
- KernelRows, KernelCols,
- StrideRows, StrideCols,
- TIn, TBias, TOut
- >
->
-{
- using Base = DepthwiseConvolutionBase<
- OutputTileRows, OutputTileCols,
- KernelRows, KernelCols,
- StrideRows, StrideCols,
- TIn, TBias, TOut,
- DepthwiseConvolution<
- OutputTileRows, OutputTileCols,
- KernelRows, KernelCols,
- StrideRows, StrideCols,
- TIn, TBias, TOut
- > >;
- friend Base;
- using InputType = typename Base::InputType;
- using OutputType = typename Base::OutputType;
-
- public:
- using Base::DepthwiseConvolutionBase;
-
- protected:
- template <nck::ActivationFunction Activation>
- void execute_tile(
- int n_channels,
- const void* packed_params,
- const TIn* inptr,
- unsigned int in_row_stride,
- unsigned int in_col_stride,
- TOut* outptr,
- unsigned int out_row_stride,
- unsigned int out_col_stride
- );
-
- template <nck::ActivationFunction Activation>
- void execute_tile(
- int n_channels,
- const void* packed_params,
- const InputType* inptrs[Base::inner_tile_rows][Base::inner_tile_cols],
- OutputType* outptrs[Base::output_tile_rows][Base::output_tile_cols]
- );
-};
-
-
-template <
- unsigned int OutputTileRows, unsigned int OutputTileCols,
- unsigned int KernelRows, unsigned int KernelCols,
- unsigned int StrideRows, unsigned int StrideCols
->
-class DepthwiseConvolution<
- OutputTileRows, OutputTileCols,
- KernelRows, KernelCols,
- StrideRows, StrideCols,
- float, float, float
-> : public DepthwiseConvolutionBase<
- OutputTileRows, OutputTileCols,
- KernelRows, KernelCols,
- StrideRows, StrideCols,
- float, float, float,
- DepthwiseConvolution<
- OutputTileRows, OutputTileCols,
- KernelRows, KernelCols,
- StrideRows, StrideCols,
- float, float, float
- >
->
-{
- using Base = DepthwiseConvolutionBase<
- OutputTileRows, OutputTileCols,
- KernelRows, KernelCols,
- StrideRows, StrideCols,
- float, float, float,
- DepthwiseConvolution<
- OutputTileRows, OutputTileCols,
- KernelRows, KernelCols,
- StrideRows, StrideCols,
- float, float, float
- > >;
- friend Base;
- using InputType = typename Base::InputType;
- using OutputType = typename Base::OutputType;
-
- public:
- DepthwiseConvolution(
- int n_batches, int n_input_rows, int n_input_cols, int n_channels,
- nck::ActivationFunction activation,
- unsigned int padding_top,
- unsigned int padding_left,
- unsigned int padding_bottom,
- unsigned int padding_right
- );
-
- DepthwiseConvolution(
- int n_batches, int n_input_rows, int n_input_cols, int n_channels,
- int n_output_rows, int n_output_cols,
- nck::ActivationFunction activation,
- unsigned int padding_top,
- unsigned int padding_left,
- unsigned int padding_bottom,
- unsigned int padding_right
- );
-
- protected:
- template <nck::ActivationFunction Activation>
- void execute_tile(
- int n_channels,
- const void* packed_params,
- const float* inptr,
- unsigned int in_row_stride,
- unsigned int in_col_stride,
- float* outptr,
- unsigned int out_row_stride,
- unsigned int out_col_stride
- );
-
- template <nck::ActivationFunction Activation>
- void execute_tile(
- int n_channels,
- const void* packed_params,
- const float* inptrs[Base::inner_tile_rows][Base::inner_tile_cols],
- float* outptrs[Base::output_tile_rows][Base::output_tile_cols]
- );
-};
-
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-template <
- unsigned int OutputTileRows, unsigned int OutputTileCols,
- unsigned int KernelRows, unsigned int KernelCols,
- unsigned int StrideRows, unsigned int StrideCols
->
-class DepthwiseConvolution<
- OutputTileRows, OutputTileCols,
- KernelRows, KernelCols,
- StrideRows, StrideCols,
- float16_t, float16_t, float16_t
-> : public DepthwiseConvolutionBase<
- OutputTileRows, OutputTileCols,
- KernelRows, KernelCols,
- StrideRows, StrideCols,
- float16_t, float16_t, float16_t,
- DepthwiseConvolution<
- OutputTileRows, OutputTileCols,
- KernelRows, KernelCols,
- StrideRows, StrideCols,
- float16_t, float16_t, float16_t
- >
->
-{
- using Base = DepthwiseConvolutionBase<
- OutputTileRows, OutputTileCols,
- KernelRows, KernelCols,
- StrideRows, StrideCols,
- float16_t, float16_t, float16_t,
- DepthwiseConvolution<
- OutputTileRows, OutputTileCols,
- KernelRows, KernelCols,
- StrideRows, StrideCols,
- float16_t, float16_t, float16_t
- > >;
- friend Base;
- using InputType = typename Base::InputType;
- using OutputType = typename Base::OutputType;
-
- public:
- DepthwiseConvolution(
- int n_batches, int n_input_rows, int n_input_cols, int n_channels,
- nck::ActivationFunction activation,
- unsigned int padding_top,
- unsigned int padding_left,
- unsigned int padding_bottom,
- unsigned int padding_right
- );
-
- DepthwiseConvolution(
- int n_batches, int n_input_rows, int n_input_cols, int n_channels,
- int n_output_rows, int n_output_cols,
- nck::ActivationFunction activation,
- unsigned int padding_top,
- unsigned int padding_left,
- unsigned int padding_bottom,
- unsigned int padding_right
- );
-
- protected:
- template <nck::ActivationFunction Activation>
- void execute_tile(
- int n_channels,
- const void* packed_params,
- const float16_t* inptr,
- unsigned int in_row_stride,
- unsigned int in_col_stride,
- float16_t* outptr,
- unsigned int out_row_stride,
- unsigned int out_col_stride
- );
-
- template <nck::ActivationFunction Activation>
- void execute_tile(
- int n_channels,
- const void* packed_params,
- const float16_t* inptrs[Base::inner_tile_rows][Base::inner_tile_cols],
- float16_t* outptrs[Base::output_tile_rows][Base::output_tile_cols]
- );
-};
-#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-
-} // namespace depthwise
diff --git a/arm_compute/core/NEON/kernels/convolution/depthwise/depthwise_dilated.hpp b/arm_compute/core/NEON/kernels/convolution/depthwise/depthwise_dilated.hpp
deleted file mode 100644
index e0d7f0c7f1..0000000000
--- a/arm_compute/core/NEON/kernels/convolution/depthwise/depthwise_dilated.hpp
+++ /dev/null
@@ -1,156 +0,0 @@
-/*
- * Copyright (c) 2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#pragma once
-
-#include <deque>
-#include <functional>
-#include <memory>
-
-#include "depthwise.hpp"
-
-namespace depthwise
-{
-
-template <
- unsigned int OutputTileRows, unsigned int OutputTileCols,
- unsigned int KernelRows, unsigned int KernelCols,
- unsigned int StrideRows, unsigned int StrideCols,
- typename TIn, typename TBias, typename TOut
->
-class DilatedDepthwiseConvolution : public IDepthwiseConvolution
-{
- public:
- /** Create a new dilated depthwise convolution engine.
- */
- DilatedDepthwiseConvolution(
- int n_batches, int n_input_rows, int n_input_cols, int n_channels,
- int dilation_factor,
- nck::ActivationFunction activation,
- unsigned int padding_top,
- unsigned int padding_left,
- unsigned int padding_bottom,
- unsigned int padding_right
- );
-
- /** Create a new dilated depthwise convolution engine.
- */
- DilatedDepthwiseConvolution(
- int n_batches, int n_input_rows, int n_input_cols, int n_channels,
- int dilation_factor, int n_output_rows, int n_output_cols,
- nck::ActivationFunction activation,
- unsigned int padding_top,
- unsigned int padding_left,
- unsigned int padding_bottom,
- unsigned int padding_right
- );
-
- // Cannot copy or move a DilatedDepthwiseConvolution.
- DilatedDepthwiseConvolution(DilatedDepthwiseConvolution&) = delete;
- DilatedDepthwiseConvolution operator=(DilatedDepthwiseConvolution&) = delete;
-
- /* Set input tensor and stride. */
- void set_input(const void *inptr) override;
- void set_input(const void *inptr, int column_stride) override;
- void set_input(const void *inptr, int row_stride, int column_stride) override;
- void set_input(const void *inptr, int batch_stride, int row_stride, int column_stride) override;
-
- /* Set output tensor and stride. */
- void set_output(void *outptr) override;
- void set_output(void *outptr, int column_stride) override;
- void set_output(void *outptr, int row_stride, int column_stride) override;
- void set_output(void *outptr, int batch_stride, int row_stride, int column_stride) override;
-
- static int get_output_size(
- int dim_size,
- unsigned int padding_before,
- unsigned int padding_after,
- int dilation_factor
- );
-
- int output_size(
- int dim_size, unsigned int padding_before, unsigned int padding_after
- ) const override;
-
-  /* Weights and biases are re-ordered to improve memory access patterns. Use
-   * these methods to determine the size of the re-pack buffer, and to set
-   * the buffer's address (which implicitly reorders the weights and biases
-   * into it).
-   */
- size_t get_packed_params_size(void) const override;
- void set_packed_params_buffer(void *) override;
-
- void pack_params(const void *weights, const void *biases=nullptr) const override;
- void pack_params(void *buffer, const void *weights, const void *biases=nullptr) const override;
- void pack_params(
- void *buffer,
- const void* weights,
- unsigned int weight_row_stride,
- unsigned int weight_col_stride,
- const void *biases=nullptr
- ) const override;
-
-  /* Working space is used to pad tensors on the fly. Before running any
-   * inference, check the amount of space required, allocate it, and pass a
-   * pointer to the convolution engine.
-   */
- size_t get_working_space_size(unsigned int nthreads=1) const override;
- void set_working_space(void *) override;
-
- unsigned int get_window(void) const override;
- void run(unsigned int start, unsigned int stop, unsigned int threadid=0) override;
-
- protected:
-  /** Protected constructor which also accepts a function to construct a new
-   * subconvolution.
-   */
- DilatedDepthwiseConvolution(
- int n_batches, int n_input_rows, int n_input_cols, int n_channels,
- int dilation_factor, int n_output_rows, int n_output_cols,
- nck::ActivationFunction activation,
- unsigned int padding_top,
- unsigned int padding_left,
- unsigned int padding_bottom,
- unsigned int padding_right,
- std::function<IDepthwiseConvolution *(int, int, int, int, int, int, nck::ActivationFunction, unsigned int, unsigned int, unsigned int, unsigned int)> subconvfn
- );
-
- const int _dilation_factor;
- const int _n_input_rows, _n_input_cols, _n_channels;
- const int _padding_top, _padding_left;
- const int _n_output_rows, _n_output_cols;
-
-  /* Dilated depthwise convolution is performed through repeated calls to
-   * non-dilated convolutions. If the dilation factor is $n$, the input is
-   * decomposed into $n \times n$ interleaved sub-tensors and we perform
-   * $n^2$ depthwise convolutions, one per sub-tensor.
-   */
- using BaseDepthwise = DepthwiseConvolution<
- OutputTileRows, OutputTileCols,
- KernelRows, KernelCols,
- StrideRows, StrideCols,
- TIn, TBias, TOut
- >;
- std::deque<std::deque<std::unique_ptr<IDepthwiseConvolution>>> _convs;
-};
-
-} // namespace depthwise
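
A minimal sketch of the decomposition described in the comment above: for a dilation factor of n, sub-convolution (i, j) sees only those input pixels whose coordinates are congruent to (i, j) modulo n, so taps that are adjacent within a sub-convolution are exactly n apart in the full input. (SubGrid and the helper names are illustrative, not the library's types.)

#include <cassert>

struct SubGrid
{
  int offset_row, offset_col; // phase (i, j) of this sub-convolution
  int stride;                 // == dilation factor
};

int full_row(const SubGrid &g, int sub_row) { return g.offset_row + sub_row * g.stride; }
int full_col(const SubGrid &g, int sub_col) { return g.offset_col + sub_col * g.stride; }

int main()
{
  const int dilation_factor = 2; // 2 * 2 = 4 sub-convolutions in total
  for (int i = 0; i < dilation_factor; i++)
  {
    for (int j = 0; j < dilation_factor; j++)
    {
      const SubGrid g{i, j, dilation_factor};
      // Adjacent taps of the sub-convolution are dilation_factor apart in
      // the original input, which is precisely a dilated kernel.
      assert(full_row(g, 1) - full_row(g, 0) == dilation_factor);
      assert(full_col(g, 1) - full_col(g, 0) == dilation_factor);
    }
  }
  return 0;
}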
diff --git a/arm_compute/core/NEON/kernels/convolution/depthwise/depthwise_quantized.hpp b/arm_compute/core/NEON/kernels/convolution/depthwise/depthwise_quantized.hpp
deleted file mode 100644
index 37c1f1bc84..0000000000
--- a/arm_compute/core/NEON/kernels/convolution/depthwise/depthwise_quantized.hpp
+++ /dev/null
@@ -1,291 +0,0 @@
-/*
- * Copyright (c) 2018-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#pragma once
-#include "depthwise.hpp"
-#include "qasymm8.hpp"
-#include "qsymm8.hpp"
-
-using namespace neon_convolution_kernels;
-using namespace qasymm8;
-
-inline int32x4_t saturating_doubling_high_mul(const int32x4_t& a, const int32x4_t& b)
-{
- return vqrdmulhq_s32(a, b);
-}
-
-inline int32x4_t saturating_doubling_high_mul(const int32x4_t& a, const int32_t& b)
-{
- return vqrdmulhq_n_s32(a, b);
-}
-
-inline int32_t saturating_doubling_high_mul(const int32_t& a, const int32_t& b)
-{
- return vget_lane_s32(vqrdmulh_n_s32(vdup_n_s32(a), b), 0);
-}
-
-inline int32x4_t rounding_divide_by_exp2(const int32x4_t& x, const int32x4_t shift)
-{
- const int32x4_t fixup = vshrq_n_s32(vandq_s32(x, shift), 31);
- const int32x4_t fixed = vqaddq_s32(x, fixup);
- return vrshlq_s32(fixed, shift);
-}
-
-inline int32x4_t rounding_divide_by_exp2(const int32x4_t& x, const int exponent)
-{
- const int32x4_t shift = vdupq_n_s32(-exponent);
- const int32x4_t fixup = vshrq_n_s32(vandq_s32(x, shift), 31);
- const int32x4_t fixed = vqaddq_s32(x, fixup);
- return vrshlq_s32(fixed, shift);
-}
-
-inline int32x2_t rounding_divide_by_exp2(const int32x2_t& x, const int exponent)
-{
- const int32x2_t shift = vdup_n_s32(-exponent);
- const int32x2_t fixup = vshr_n_s32(vand_s32(x, shift), 31);
- const int32x2_t fixed = vqadd_s32(x, fixup);
- return vrshl_s32(fixed, shift);
-}
-
-inline int32_t rounding_divide_by_exp2(const int32_t& x, const int exponent)
-{
- const int32x2_t xs = vdup_n_s32(x);
- return vget_lane_s32(rounding_divide_by_exp2(xs, exponent), 0);
-}
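
Taken together, the helpers above implement the usual fixed-point requantisation step: multiply an int32 accumulator by a Q0.31 multiplier, then apply a rounding right shift. A scalar sketch of the combination (requantize is an illustrative name; the kernels apply the same pattern to whole vectors):

inline int32_t requantize(const int32_t accumulator,
                          const int32_t multiplier, // Q0.31 fixed-point scale
                          const int exponent)       // right-shift amount
{
  // accumulator * (multiplier / 2^31), rounded and saturated...
  const int32_t scaled = saturating_doubling_high_mul(accumulator, multiplier);
  // ...followed by a rounding division by 2^exponent.
  return rounding_divide_by_exp2(scaled, exponent);
}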
-
-namespace depthwise
-{
-
-namespace nck = neon_convolution_kernels;
-
-template <
- unsigned int OutputTileRows, unsigned int OutputTileCols,
- unsigned int KernelRows, unsigned int KernelCols,
- unsigned int StrideRows, unsigned int StrideCols
->
-class QAsymm8DepthwiseConvolution : public DepthwiseConvolutionBase<
- OutputTileRows, OutputTileCols,
- KernelRows, KernelCols,
- StrideRows, StrideCols,
- uint8_t, int32_t, uint8_t,
- QAsymm8DepthwiseConvolution<OutputTileRows, OutputTileCols, KernelRows, KernelCols, StrideRows, StrideCols>
->
-{
- using Base = DepthwiseConvolutionBase<
- OutputTileRows, OutputTileCols,
- KernelRows, KernelCols,
- StrideRows, StrideCols,
- uint8_t, int32_t, uint8_t,
- QAsymm8DepthwiseConvolution<OutputTileRows, OutputTileCols, KernelRows, KernelCols, StrideRows, StrideCols>
- >;
- friend Base;
- using InputType = typename Base::InputType;
- using OutputType = typename Base::OutputType;
-
- public:
- QAsymm8DepthwiseConvolution(
- int n_batches, int n_input_rows, int n_input_cols, int n_channels,
- nck::ActivationFunction activation,
- const qasymm8::QAsymm8Params& weight_quantisation,
- const qasymm8::QAsymm8Params& input_quantisation,
- const qasymm8::QAsymm8Params& output_quantisation,
- unsigned int padding_top,
- unsigned int padding_left,
- unsigned int padding_bottom,
- unsigned int padding_right
- );
-
- QAsymm8DepthwiseConvolution(
- int n_batches, int n_input_rows, int n_input_cols, int n_channels,
- int n_output_rows, int n_output_cols,
- nck::ActivationFunction activation,
- const qasymm8::QAsymm8Params& weight_quantisation,
- const qasymm8::QAsymm8Params& input_quantisation,
- const qasymm8::QAsymm8Params& output_quantisation,
- unsigned int padding_top,
- unsigned int padding_left,
- unsigned int padding_bottom,
- unsigned int padding_right
- );
-
- QAsymm8DepthwiseConvolution(
- int n_batches, int n_input_rows, int n_input_cols, int n_channels,
- nck::ActivationFunction activation,
- const qasymm8::QAsymm8Params& weight_quantisation,
- const qasymm8::QAsymm8Params& input_quantisation,
- const qasymm8::QAsymm8Params& output_quantisation,
- const qasymm8::QAsymm8RescaleParams& rescale_parameters,
- unsigned int padding_top,
- unsigned int padding_left,
- unsigned int padding_bottom,
- unsigned int padding_right
- );
-
- QAsymm8DepthwiseConvolution(
- int n_batches, int n_input_rows, int n_input_cols, int n_channels,
- int n_output_rows, int n_output_cols,
- nck::ActivationFunction activation,
- const qasymm8::QAsymm8Params& weight_quantisation,
- const qasymm8::QAsymm8Params& input_quantisation,
- const qasymm8::QAsymm8Params& output_quantisation,
- const qasymm8::QAsymm8RescaleParams& rescale_parameters,
- unsigned int padding_top,
- unsigned int padding_left,
- unsigned int padding_bottom,
- unsigned int padding_right
- );
-
- protected:
- uint8_t _input_padding_value(void) const;
-
- void _pack_params(
- void *buffer,
- const void *weights,
- unsigned int weight_row_stride,
- unsigned int weight_col_stride,
- const void *biases=nullptr
- ) const;
-
- template <nck::ActivationFunction Activation>
- void execute_tile(
- int n_channels,
- const void* packed_params,
- const uint8_t* inptr,
- unsigned int in_row_stride,
- unsigned int in_col_stride,
- uint8_t* outptr,
- unsigned int out_row_stride,
- unsigned int out_col_stride
- );
-
- template <nck::ActivationFunction Activation>
- void execute_tile(
- int n_channels,
- const void* packed_params,
- const uint8_t* inptrs[Base::inner_tile_rows][Base::inner_tile_cols],
- uint8_t* outptrs[Base::output_tile_rows][Base::output_tile_cols]
- );
-
- private:
- // Quantization parameters
- const qasymm8::QAsymm8Params _weights_quant, _inputs_quant, _output_quant;
- const qasymm8::QAsymm8RescaleParams rescale_parameters;
-};
-
-template <
- unsigned int OutputTileRows, unsigned int OutputTileCols,
- unsigned int KernelRows, unsigned int KernelCols,
- unsigned int StrideRows, unsigned int StrideCols
->
-class QSymm8HybridPerChannelDepthwiseConvolution : public DepthwiseConvolutionBase<
- OutputTileRows, OutputTileCols,
- KernelRows, KernelCols,
- StrideRows, StrideCols,
- uint8_t, int32_t, uint8_t,
- QSymm8HybridPerChannelDepthwiseConvolution<OutputTileRows, OutputTileCols, KernelRows, KernelCols, StrideRows, StrideCols>
->
-{
- using Base = DepthwiseConvolutionBase<
- OutputTileRows, OutputTileCols,
- KernelRows, KernelCols,
- StrideRows, StrideCols,
- uint8_t, int32_t, uint8_t,
- QSymm8HybridPerChannelDepthwiseConvolution<OutputTileRows, OutputTileCols, KernelRows, KernelCols, StrideRows, StrideCols>
- >;
- friend Base;
- using InputType = typename Base::InputType;
- using OutputType = typename Base::OutputType;
-
- public:
- QSymm8HybridPerChannelDepthwiseConvolution(
- int n_batches, int n_input_rows, int n_input_cols, int n_channels,
- nck::ActivationFunction activation,
- const qsymm8::QSymm8PerChannelParams& weight_quantisation,
- const qasymm8::QAsymm8Params& input_quantisation,
- const qasymm8::QAsymm8Params& output_quantisation,
- unsigned int padding_top,
- unsigned int padding_left,
- unsigned int padding_bottom,
- unsigned int padding_right
- );
-
- QSymm8HybridPerChannelDepthwiseConvolution(
- int n_batches, int n_input_rows, int n_input_cols, int n_channels,
- nck::ActivationFunction activation,
- const qsymm8::QSymm8PerChannelParams& weight_quantisation,
- const qasymm8::QAsymm8Params& input_quantisation,
- const qasymm8::QAsymm8Params& output_quantisation,
- const qsymm8::QSymm8PerChannelRescaleParams& rescale_parameters,
- unsigned int padding_top,
- unsigned int padding_left,
- unsigned int padding_bottom,
- unsigned int padding_right
- );
-
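-  /* Per channel, the packed buffer holds KernelRows x KernelCols int8
-   * weights plus three int32 values (presumably the bias and the
-   * per-channel requantisation multiplier and shift).
-   */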
-  size_t get_packed_params_size(void) const override
-  {
-    return this->n_channels() * (sizeof(int8_t)*KernelRows*KernelCols + 3*sizeof(int32_t));
-  }
-
- protected:
- uint8_t _input_padding_value(void) const;
-
- void _pack_params(
- void *buffer,
- const void *weights,
- unsigned int weight_row_stride,
- unsigned int weight_col_stride,
- const void *biases=nullptr
- ) const;
-
- template <nck::ActivationFunction Activation>
- void execute_tile(
- int n_channels,
- const void* packed_params,
- const uint8_t* inptr,
- unsigned int in_row_stride,
- unsigned int in_col_stride,
- uint8_t* outptr,
- unsigned int out_row_stride,
- unsigned int out_col_stride
- );
-
- template <nck::ActivationFunction Activation>
- void execute_tile(
- int n_channels,
- const void* packed_params,
- const uint8_t* inptrs[Base::inner_tile_rows][Base::inner_tile_cols],
- uint8_t* outptrs[Base::output_tile_rows][Base::output_tile_cols]
- );
-
- private:
- // Quantization parameters
- const qsymm8::QSymm8PerChannelParams _weights_quant;
- const qasymm8::QAsymm8Params _input_quant, _output_quant;
- const qsymm8::QSymm8PerChannelRescaleParams _rescale_parameters;
-};
-
-} // namespace depthwise
diff --git a/arm_compute/core/NEON/kernels/convolution/depthwise/depthwise_quantized_dilated.hpp b/arm_compute/core/NEON/kernels/convolution/depthwise/depthwise_quantized_dilated.hpp
deleted file mode 100644
index cf1c6f581f..0000000000
--- a/arm_compute/core/NEON/kernels/convolution/depthwise/depthwise_quantized_dilated.hpp
+++ /dev/null
@@ -1,88 +0,0 @@
-/*
- * Copyright (c) 2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#pragma once
-#include "depthwise_dilated.hpp"
-#include "depthwise_quantized.hpp"
-
-namespace depthwise {
-
-template <unsigned int OutputTileRows, unsigned int OutputTileCols,
- unsigned int KernelRows, unsigned int KernelCols,
- unsigned int StrideRows, unsigned int StrideCols>
-class QAsymm8DilatedDepthwiseConvolution
- : public DilatedDepthwiseConvolution<
- OutputTileRows, OutputTileCols, KernelRows, KernelCols, StrideRows,
- StrideCols, uint8_t, int32_t, uint8_t> {
-public:
- /** Create a new dilated depthwise convolution engine.
- */
- QAsymm8DilatedDepthwiseConvolution(
- int n_batches, int n_input_rows, int n_input_cols, int n_channels,
- int dilation_factor, nck::ActivationFunction activation,
- const qasymm8::QAsymm8Params &weight_quantisation,
- const qasymm8::QAsymm8Params &input_quantisation,
- const qasymm8::QAsymm8Params &output_quantisation,
- unsigned int padding_top, unsigned int padding_left,
- unsigned int padding_bottom, unsigned int padding_right);
-
- /** Create a new dilated depthwise convolution engine.
- */
- QAsymm8DilatedDepthwiseConvolution(
- int n_batches, int n_input_rows, int n_input_cols, int n_channels,
- int dilation_factor, int n_output_rows, int n_output_cols,
- nck::ActivationFunction activation,
- const qasymm8::QAsymm8Params &weight_quantisation,
- const qasymm8::QAsymm8Params &input_quantisation,
- const qasymm8::QAsymm8Params &output_quantisation,
- unsigned int padding_top, unsigned int padding_left,
- unsigned int padding_bottom, unsigned int padding_right);
-
- /** Create a new dilated depthwise convolution engine.
- */
- QAsymm8DilatedDepthwiseConvolution(
- int n_batches, int n_input_rows, int n_input_cols, int n_channels,
- int dilation_factor, nck::ActivationFunction activation,
- const qasymm8::QAsymm8Params &weight_quantisation,
- const qasymm8::QAsymm8Params &input_quantisation,
- const qasymm8::QAsymm8Params &output_quantisation,
- const qasymm8::QAsymm8RescaleParams &rescale_parameters,
- unsigned int padding_top, unsigned int padding_left,
- unsigned int padding_bottom, unsigned int padding_right);
-
- /** Create a new dilated depthwise convolution engine.
- */
- QAsymm8DilatedDepthwiseConvolution(
- int n_batches, int n_input_rows, int n_input_cols, int n_channels,
- int dilation_factor, int n_output_rows, int n_output_cols,
- nck::ActivationFunction activation,
- const qasymm8::QAsymm8Params &weight_quantisation,
- const qasymm8::QAsymm8Params &input_quantisation,
- const qasymm8::QAsymm8Params &output_quantisation,
- const qasymm8::QAsymm8RescaleParams& rescale_parameters,
- unsigned int padding_top, unsigned int padding_left,
- unsigned int padding_bottom, unsigned int padding_right);
-};
-
-} // namespace depthwise
diff --git a/arm_compute/core/NEON/kernels/convolution/winograd/winograd.hpp b/arm_compute/core/NEON/kernels/convolution/winograd/winograd.hpp
deleted file mode 100644
index bc0d9d4296..0000000000
--- a/arm_compute/core/NEON/kernels/convolution/winograd/winograd.hpp
+++ /dev/null
@@ -1,621 +0,0 @@
-/*
- * Copyright (c) 2017-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#pragma once
-
-#include "arm_compute/core/NEON/kernels/assembly/arm_gemm.hpp"
-
-#include <cstddef>
-#include <utility>
-
-namespace winograd
-{
-
-class ITransform
-{
- public:
- virtual ~ITransform() = default;
-
- /**
- * Get the working space required to perform the transformation.
- *
-     * Note: the working space is only required while the transformation is
-     * running, hence it can be reused whenever the transformation is not.
- *
- * @param nthreads The greatest number of threads that will be used to execute the transform.
- * @return Size of working space required in bytes.
- */
- virtual size_t get_working_space_size(unsigned int nthreads=1) const = 0;
-
- /**
- * Set the working space to be used by the transformation.
- *
-     * Note: the working space is only required while the transformation is
-     * running, hence it can be reused whenever the transformation is not.
- *
-     * @param buffer Pointer to the working space.
- */
- virtual void set_working_space(void *buffer) = 0;
-
- /**
- * Get the window of work a given operator can perform.
- */
- virtual unsigned int get_window() const = 0;
-
- /**
- * Perform work upon a window of the transform.
- */
- virtual void run(unsigned int start, unsigned int stop, unsigned int threadid=0) = 0;
-};
-
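
A hedged sketch of the threading contract described by ITransform: query the window of work once, then hand disjoint [start, stop) sub-ranges to worker threads (the helper name run_transform and the even split are illustrative, not the library's scheduler):

#include <thread>
#include <vector>

void run_transform(winograd::ITransform &transform, unsigned int n_threads)
{
  std::vector<char> scratch(transform.get_working_space_size(n_threads));
  transform.set_working_space(scratch.data());

  const unsigned int window = transform.get_window();
  std::vector<std::thread> workers;
  for (unsigned int t = 0; t < n_threads; t++)
  {
    const unsigned int start = t * window / n_threads;
    const unsigned int stop = (t + 1) * window / n_threads;
    workers.emplace_back([&transform, start, stop, t] { transform.run(start, stop, t); });
  }
  for (auto &w : workers)
  {
    w.join();
  }
}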
-class IInputTransform : public ITransform
-{
- public:
- virtual ~IInputTransform() = default;
-
- /**
- * Set the pointer to the (NHWC-ordered) tensor to be transformed.
- */
- virtual void set_input_tensor(const void *input) = 0;
-
- /**
- * Set the pointer to the (NHWC-ordered) tensor to be transformed.
- * @param col_stride Stride between columns of the tensor, measured in elements (not bytes).
- */
- virtual void set_input_tensor(const void *input, int col_stride) = 0;
-
- /**
- * Set the pointer to the (NHWC-ordered) tensor to be transformed.
- * @param row_stride Stride between rows of the tensor, measured in elements (not bytes).
- * @param col_stride Stride between columns of the tensor, measured in elements (not bytes).
- */
- virtual void set_input_tensor(const void *input, int row_stride, int col_stride) = 0;
-
- /**
- * Set the pointer to the (NHWC-ordered) tensor to be transformed.
- * @param batch_stride Stride between batches of the tensor, measured in elements (not bytes).
- * @param row_stride Stride between rows of the tensor, measured in elements (not bytes).
- * @param col_stride Stride between columns of the tensor, measured in elements (not bytes).
- */
- virtual void set_input_tensor(const void *input, int batch_stride, int row_stride, int col_stride) = 0;
-
- /**
- * Set pointers to the matrices written by the transform.
- * @param matrices Pointer to the start of the first matrix representing the transformed input.
- * @param inter_matrix_stride Stride (in elements) between matrices.
- * @param matrix_row_stride Stride (in elements) between the rows within a single matrix.
- */
- virtual void set_output_matrices(void *matrices, int inter_matrix_stride, int matrix_row_stride) = 0;
-};
-
-class IOutputTransform : public ITransform
-{
- public:
- virtual ~IOutputTransform() = default;
-
- /**
- * Set pointers to the matrices written by the transform.
- * @param matrices Pointer to the start of the first matrix representing the input to the transform.
- * @param inter_matrix_stride Stride (in elements) between matrices.
- * @param matrix_row_stride Stride (in elements) between the rows within a single matrix.
- */
- virtual void set_input_matrices(const void *matrices, int inter_matrix_stride, int matrix_row_stride) = 0;
-
- /**
-     * Set pointer to the bias tensor (can be ignored, or called with nullptr for no bias).
- */
- virtual void set_bias(const void *bias=nullptr) = 0;
-
- /**
- * Set pointer to the output tensor produced by the transform.
- */
- virtual void set_output_tensor(void *output) = 0;
-
- /**
- * Set pointer to the output tensor produced by the transform.
- * @param col_stride Stride between columns of the tensor, measured in elements (not bytes).
- */
- virtual void set_output_tensor(void *output, int col_stride) = 0;
-
- /**
- * Set pointer to the output tensor produced by the transform.
- * @param row_stride Stride between rows of the tensor, measured in elements (not bytes).
- * @param col_stride Stride between columns of the tensor, measured in elements (not bytes).
- */
- virtual void set_output_tensor(void *output, int row_stride, int col_stride) = 0;
-
- /**
- * Set pointer to the output tensor produced by the transform.
- * @param batch_stride Stride between batches of the tensor, measured in elements (not bytes).
- * @param row_stride Stride between rows of the tensor, measured in elements (not bytes).
- * @param col_stride Stride between columns of the tensor, measured in elements (not bytes).
- */
- virtual void set_output_tensor(void *output, int batch_stride, int row_stride, int col_stride) = 0;
-};
-
-class IWeightTransform : public ITransform
-{
- public:
- virtual ~IWeightTransform() = default;
-
- /** Set pointer to the weight tensor read by the transform. */
- virtual void set_weight_tensor(const void *weights) = 0;
-
- /**
- * Set pointers to the matrices written by the transform.
- * @param matrices Pointer to the start of the first matrix representing the transformed input.
- * @param inter_matrix_stride Stride (in elements) between matrices.
- * @param matrix_row_stride Stride (in elements) between the rows within a single matrix.
- */
- virtual void set_output_matrices(void *matrices, int inter_matrix_stride, int matrix_row_stride) = 0;
-};
-
-enum class WinogradRoots
-{
- Integers,
-};
-
-template <int InnerTileRows, int InnerTileCols, typename TIn, typename TOut, WinogradRoots Roots>
-class InputTransform : public IInputTransform
-{
- public:
-    /** Create an InputTransform operator for a given problem size and set of
-     * pointers.
-     */
- InputTransform(
- int kernel_rows, /**< Number of rows in the kernel */
- int kernel_cols, /**< Number of columns in the kernel */
- int n_batches, /**< Number of batches in input tensor. */
- int n_rows, /**< Number of rows in input tensor. */
- int n_cols, /**< Number of columns in input tensor. */
- int n_channels, /**< Number of channels in input tensor. */
- int padding_top, /**< Padding to apply to the top of the image. */
- int padding_left, /**< Padding to apply to the left of the image. */
- int padding_bottom, /**< Padding to apply to the bottom of the image. */
- int padding_right /**< Padding to apply to the right of the image. */
- );
-
- InputTransform(InputTransform&) = delete;
- InputTransform operator=(InputTransform&) = delete;
-
- /** Set pointers to the input tensor read by the transform. */
- void set_input_tensor(const void *input) override;
- void set_input_tensor(const void *input, int col_stride) override;
- void set_input_tensor(const void *input, int row_stride, int col_stride) override;
- void set_input_tensor(const void *input, int batch_stride, int row_stride, int col_stride) override;
-
- /** Set pointers to the matrices written by the transform. */
-    void set_output_matrices(void *matrices, int inter_matrix_stride, int matrix_row_stride) override;
-
- /** Get the working space required to perform the transformation. */
- size_t get_working_space_size(unsigned int nthreads=1) const override;
- void set_working_space(void *buffer) override;
-
- /** Get the window of work a given operator can perform. */
- unsigned int get_window() const override;
- static constexpr unsigned int WINDOW_BLOCK = 16; // Base size of window
-
- /** Perform work upon a window of the input. */
- void run(unsigned int start, unsigned int stop, unsigned int threadid=0) override;
-
- protected:
- const int _n_batches, _n_rows, _n_cols, _n_channels;
-
- private:
- void transform_unpadded_tile(
- unsigned int threadid,
- int n_channels,
- TOut *outptr,
- const TIn *inptr
- );
-
- void transform_padded_tile(
- unsigned int threadid,
- int n_channels,
- TOut *outptr,
- const TIn *inptr,
- int padding_top,
- int padding_left,
- int padding_bottom,
- int padding_right
- );
-
-    /* Tile implementation */
-    static void transform_tile(
-      int n_channels,         /**< Number of channels in the tensor. */
-      const TIn* inptr_base,  /**< Pointer to the base of the input tile. */
-      int input_row_stride,   /**< Stride between rows of the input tensor. */
-      int input_col_stride,   /**< Stride between columns of the input tensor. */
-      TOut* mptr_base,        /**< Base pointer to the transformed input matrices. */
-      int matrix_stride       /**< Stride between matrices in the input space. */
-    );
-
- /** Get the working space for a thread. */
- void * get_working_space(unsigned int threadid) const;
-
- const TIn* _inptr;
- TOut* _outptr;
-
- const int _overlap_rows, _overlap_cols;
- const int _padding_top, _padding_left, _padding_bottom, _padding_right;
- const int _tiles_M, _tiles_N;
- int _matrix_stride, _matrix_row_stride, _matrix_batch_stride;
- int _in_col_stride, _in_row_stride, _in_batch_stride;
-
- const int _working_space_col_stride, _working_space_row_stride;
- TIn *_working_space;
-};
-
-template <int InnerTileRows, typename TIn, typename TOut, WinogradRoots Roots>
-class InputTransform<InnerTileRows, 1, TIn, TOut, Roots> :
- public InputTransform<1, InnerTileRows, TIn, TOut, Roots>
-{
- using Base = InputTransform<1, InnerTileRows, TIn, TOut, Roots>;
-
- public:
- InputTransform(
- int kernel_rows, /**< Number of rows in the kernel. */
- int kernel_cols, /**< Number of columns in the kernel. */
- int n_batches, /**< Number of batches in input tensor. */
- int n_rows, /**< Number of rows in input tensor. */
- int n_cols, /**< Number of columns in input tensor. */
- int n_channels, /**< Number of channels in input tensor. */
- int padding_top, /**< Padding to apply to the top of the image. */
- int padding_left, /**< Padding to apply to the left of the image. */
- int padding_bottom, /**< Padding to apply to the bottom of the image. */
- int padding_right /**< Padding to apply to the right of the image. */
- );
-
- /** Set pointers to the input tensor read by the transform. */
- void set_input_tensor(const void *input) override;
- void set_input_tensor(const void *input, int col_stride) override;
- void set_input_tensor(const void *input, int row_stride, int col_stride) override;
- void set_input_tensor(const void *input, int batch_stride, int row_stride, int col_stride) override;
-};
-
-template <
- int KernelRows, int KernelCols,
- int InnerTileRows, int InnerTileCols,
- typename TIn, typename TOut,
- WinogradRoots Roots
->
-class OutputTransform : public IOutputTransform
-{
- public:
- OutputTransform(
- int n_batches, /**< Number of batches in output tensor. */
- int n_rows, /**< Number of rows in output tensor. */
- int n_cols, /**< Number of columns in output tensor. */
- int n_channels, /**< Number of channels in output tensor. */
- const arm_gemm::Activation &activation
- );
-
- OutputTransform(OutputTransform&) = delete;
- OutputTransform operator=(OutputTransform&) = delete;
-
- /** Set pointers to the matrices read by the transform. */
-    void set_input_matrices(const void *matrices, int inter_matrix_stride, int matrix_row_stride) override;
-
-    /** Set pointer to the bias tensor (can be ignored, or called with nullptr for no bias). */
- void set_bias(const void *bias=nullptr) override;
-
- /** Set pointers to the output tensor written by the transform. */
- void set_output_tensor(void *output) override;
- void set_output_tensor(void *output, int col_stride) override;
- void set_output_tensor(void *output, int row_stride, int col_stride) override;
- void set_output_tensor(void *output, int batch_stride, int row_stride, int col_stride) override;
-
- /** Get the working space required to perform the transformation. */
- size_t get_working_space_size(unsigned int nthreads=1) const override;
- void set_working_space(void *buffer) override;
-
- /** Get the window of work a given operator can perform. */
- unsigned int get_window() const override;
- static constexpr unsigned int WINDOW_BLOCK = 16; // Base size of window
-
- /** Perform work upon a window of the input. */
- void run(unsigned int start, unsigned int stop, unsigned int threadid=0) override;
-
- protected:
- static constexpr int inner_tile_rows = InnerTileRows;
- static constexpr int inner_tile_cols = InnerTileCols;
- static constexpr int output_tile_rows = InnerTileRows - KernelRows + 1;
- static constexpr int output_tile_cols = InnerTileCols - KernelCols + 1;
-
- const int _n_batches, _n_rows, _n_cols, _n_channels;
- const TOut _output_min, _output_max;
-
- private:
- void transform_uncropped_tile(
- unsigned int threadid,
- int n_channels,
- TOut *outptr,
- const TIn *inptr,
- const TOut *biases
- );
-
- void transform_cropped_tile(
- unsigned int threadid,
- int n_channels,
- TOut *outptr,
- const TIn *inptr,
- const TOut *biases,
- int pad_bottom,
- int pad_right
- );
-
- /** Implementation of the tile transformation method. */
- static void transform_tile(
- int n_channels,
- const TIn* matrix_base,
- int matrix_stride,
- const TOut* biases,
- TOut* output,
- int output_row_stride,
- int output_col_stride,
- TOut output_min,
- TOut output_max
- );
-
- /** Get the working space for a thread. */
- void * get_working_space(unsigned int threadid) const;
-
- const TIn* _matrix_base;
- const TOut* _biases;
- int _matrix_stride, _matrix_row_stride, _matrix_batch_stride;
- TOut* _outptr;
- const int _tiles_M, _tiles_N;
- int _out_col_stride, _out_row_stride, _out_batch_stride;
-
- const int _working_space_col_stride, _working_space_row_stride;
- TOut *_working_space;
-};
-
-template <
- int KernelRows,
- int InnerTileRows,
- typename TIn, typename TOut,
- WinogradRoots Roots
->
-class OutputTransform<KernelRows, 1, InnerTileRows, 1, TIn, TOut, Roots> :
- public OutputTransform<1, KernelRows, 1, InnerTileRows, TIn, TOut, Roots>
-{
- using Base = OutputTransform<1, KernelRows, 1, InnerTileRows, TIn, TOut, Roots>;
-
- public:
- OutputTransform(
- int n_batches, /**< Number of batches in output tensor. */
- int n_rows, /**< Number of rows in output tensor. */
- int n_cols, /**< Number of columns in output tensor. */
- int n_channels, /**< Number of channels in output tensor. */
- const arm_gemm::Activation &activation
- );
-
- /** Set pointers to the output tensor written by the transform. */
- void set_output_tensor(void *output) override;
- void set_output_tensor(void *output, int col_stride) override;
- void set_output_tensor(void *output, int row_stride, int col_stride) override;
- void set_output_tensor(void *output, int batch_stride, int row_stride, int col_stride) override;
-};
-
-template <
- int KernelRows, int KernelCols,
- int InnerTileRows, int InnerTileCols,
- typename TIn, typename TOut,
- WinogradRoots Roots
->
-class WeightTransform : public IWeightTransform
-{
- public:
- WeightTransform(
- int n_output_channels, /**< Number of output channels in the kernel. */
- int n_input_channels /**< Number of input channels in the kernel. */
- );
-
-    WeightTransform(const WeightTransform &) = delete;
-    WeightTransform &operator=(const WeightTransform &) = delete;
-
- /** Set pointer to the weight tensor read by the transform. */
- void set_weight_tensor(const void *weights) override;
-
- /** Set pointer to the matrices written by the transform. */
- void set_output_matrices(void *matrices, int inter_matrix_stride, int matrix_row_stride) override;
-
- /** Get the working space required to perform the transformation. */
- size_t get_working_space_size(unsigned int nthreads=1) const override;
- void set_working_space(void *buffer) override;
-
- /** Get the window of work a given operator can perform. */
- unsigned int get_window() const override;
- static constexpr unsigned int WINDOW_BLOCK = 16; // Base size of window
-
- /** Perform work upon a window of the input. */
- void run(unsigned int start, unsigned int stop, unsigned int threadid=0) override;
-
- protected:
- static const int kernel_rows = KernelRows;
- static const int kernel_cols = KernelCols;
- static const int inner_tile_rows = InnerTileRows;
- static const int inner_tile_cols = InnerTileCols;
-
- private:
- /** Apply the transform to a tensor. */
- static void execute(
- int n_output_channels,
- int n_input_channels,
- const TIn* input,
- TOut* output,
- int matrix_stride,
- int matrix_row_stride
- );
-
- const int _n_output_channels, _n_input_channels;
- TOut *_matrices;
- int _matrix_stride, _matrix_row_stride;
- const TIn *_weights;
-};
-
-template <int KernelRows, int InnerTileRows, typename TIn, typename TOut, WinogradRoots Roots>
-class WeightTransform<KernelRows, 1, InnerTileRows, 1, TIn, TOut, Roots> :
- public WeightTransform<1, KernelRows, 1, InnerTileRows, TIn, TOut, Roots>
-{
- public:
- using WeightTransform<1, KernelRows, 1, InnerTileRows, TIn, TOut, Roots>::WeightTransform;
-};
-
-template <int OutputTileRows, int OutputTileCols, int KernelRows, int KernelCols, WinogradRoots Roots>
-class WinogradGEMM
-{
- public:
- // Information about the specific Winograd instance
- static constexpr int output_tile_rows = OutputTileRows;
- static constexpr int output_tile_cols = OutputTileCols;
- static constexpr int kernel_rows = KernelRows;
- static constexpr int kernel_cols = KernelCols;
- static constexpr int inner_tile_rows = output_tile_rows + kernel_rows - 1;
- static constexpr int inner_tile_cols = output_tile_cols + kernel_cols - 1;
- static constexpr int N_GEMMS = inner_tile_rows * inner_tile_cols;
-
- /** Transform weights from the spatial to the Winograd domain. */
- template <typename TIn, typename TOut>
- using WeightsTransform = WeightTransform<
- KernelRows, KernelCols, inner_tile_rows, inner_tile_cols,
- TIn, TOut, Roots
- >;
-
- /** Transform input feature maps from the spatial to the Winograd domain.
- */
- template <typename TIn, typename TOut>
- using InputTransform = InputTransform<
- inner_tile_rows, inner_tile_cols, TIn, TOut, Roots
- >;
-
- /** Transform output feature maps from the Winograd to the spatial domain.
- */
- template <typename TIn, typename TOut>
- using OutputTransform = OutputTransform<
- KernelRows, KernelCols, inner_tile_rows, inner_tile_cols,
- TIn, TOut, Roots
- >;
-
- /** Perform a convolution.
- */
- template <typename TOut, typename TIn, typename TInGEMM=TIn, typename TOutGEMM=TOut>
- class Convolution
- {
- public:
- // Information about the typed Winograd instance
- typedef TOut OutputType;
- typedef TOutGEMM GemmOutputType;
- typedef TInGEMM GemmInputType;
- typedef TIn InputType;
-
- /** Get the output shape of a convolution. */
- static std::pair<unsigned int, unsigned int> get_output_shape(
- const std::pair<unsigned int, unsigned int> input_shape,
- bool padding_same);
-
- /** Get the memory required to store the kernel transformed into the
- * Winograd domain.
- */
- static size_t get_kernel_storage_size(unsigned int n_input_channels,
- unsigned int n_output_channels);
-
- /** Get the memory required to store the input tensor transformed into
- * the Winograd domain.
- */
- static size_t get_input_storage_size(
- unsigned int n_batches, // Number of batches
- unsigned int n_rows, // Number of input rows
- unsigned int n_cols, // Number of input columns
- unsigned int n_channels, // Number of input channels
- bool padding_same);
-
- /** Get the memory required to store the output tensor in the Winograd
- * domain.
- */
- static size_t get_output_storage_size(
- unsigned int n_batches, // Number of batches
- unsigned int n_rows, // Number of output rows
- unsigned int n_cols, // Number of output columns
- unsigned int n_channels // Number of output channels
- );
-
- /** Get the memory required to apply a Winograd operator to some input.
- */
- static size_t get_working_space_size(
- unsigned int n_batches,
- unsigned int n_rows, // Number of input rows
- unsigned int n_cols, // Number of input columns
- unsigned int n_input_channels, // Number of input channels
- unsigned int n_output_channels, // Number of output channels
- bool padding_same);
-
-    /** Get the memory required by a single "input" matrix.
-     */
- static size_t get_input_matrix_size(
- unsigned int n_batches, // Number of batches
- unsigned int n_rows, // Number of input rows
- unsigned int n_cols, // Number of input columns
- unsigned int n_channels, // Number of input channels
- bool padding_same);
-
- static int get_input_matrix_stride(
- unsigned int n_batches, // Number of batches
- unsigned int n_rows, // Number of input rows
- unsigned int n_cols, // Number of input columns
- unsigned int n_channels, // Number of input channels
- bool padding_same);
-
-    /** Get the memory required by a single "output" matrix.
-     */
- static size_t get_output_matrix_size(
- unsigned int n_batches, // Number of batches
- unsigned int n_rows, // Number of output rows
- unsigned int n_cols, // Number of output columns
- unsigned int n_channels // Number of output channels
- );
-
- static int get_output_matrix_stride(
- unsigned int n_batches, // Number of batches
- unsigned int n_rows, // Number of output rows
- unsigned int n_cols, // Number of output columns
- unsigned int n_channels // Number of output channels
- );
-
-    /** Get the memory required by a single "kernel" matrix.
-     */
- static size_t get_kernel_matrix_size(unsigned int n_input_channels,
- unsigned int n_output_channels);
- static int get_kernel_matrix_stride(unsigned int n_input_channels,
- unsigned int n_output_channels);
-
-    static constexpr int M_BLOCK = 4;  /**< Size of block used by GEMM. */
-    static constexpr int N_BLOCK = 16; /**< Size of block used by GEMM. */
- };
-};
-
-} // namespace winograd
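For reference, a minimal sketch of the compile-time tile arithmetic encoded by the WinogradGEMM template above. The WinogradRoots::Integers enumerator is assumed to be the one declared in winograd.hpp; the alias name is illustrative:

// F(2x2, 3x3): the inner tile is (output tile + kernel - 1) on each axis, and
// one GEMM is batched per inner-tile element, so a 4x4 inner tile yields 16 GEMMs.
using WinoF2x2K3x3 = winograd::WinogradGEMM<2, 2, 3, 3, winograd::WinogradRoots::Integers>;
static_assert(WinoF2x2K3x3::inner_tile_rows == 4, "2 + 3 - 1");
static_assert(WinoF2x2K3x3::inner_tile_cols == 4, "2 + 3 - 1");
static_assert(WinoF2x2K3x3::N_GEMMS == 16, "4 * 4 GEMMs per convolution");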
diff --git a/arm_compute/core/NEON/kernels/convolution/winograd/winograd_layer.hpp b/arm_compute/core/NEON/kernels/convolution/winograd/winograd_layer.hpp
deleted file mode 100644
index ed8fede385..0000000000
--- a/arm_compute/core/NEON/kernels/convolution/winograd/winograd_layer.hpp
+++ /dev/null
@@ -1,207 +0,0 @@
-/*
- * Copyright (c) 2017-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#pragma once
-#include "arm_gemm_local.hpp"
-#include "arm_gemm.hpp"
-#include "winograd.hpp"
-
-namespace winograd
-{
-
-class IWinogradConvolutionLayer
-{
- public:
- virtual ~IWinogradConvolutionLayer() = default;
-
- virtual unsigned int weight_transform_get_window(void) const = 0;
- virtual void weight_transform_run(unsigned int start, unsigned int stop) = 0;
-
- virtual IInputTransform& input_transform(void) = 0; // Expose the input transform
- virtual IOutputTransform& output_transform(void) = 0; // Expose the output transform
- virtual arm_gemm::IGemmCommon *gemm(void) = 0; // Expose the underlying GEMM
-};
-
-/** Example of how to construct an ACL-like interface.
- *
- * Use `get_weight_storage_size`, `get_input_storage_size` and
- * `get_output_storage_size` to allocate memory for the convolution engine.
- * Then create a `WinogradConvolutionLayer`.
- *
- * Initialise the weights using `weights_transform.run(...)`.
- *
- * For each inference:
- * 1. Transform the inputs to the Winograd domain using `input_transform.run(...)`
- * 2. Perform a number of GEMMs using `gemms.run(...)`
- * 3. Transform the output to the spatial domain using `output_transform.run(...)`
- */
-template <int OutputTileRows, int OutputTileCols, int KernelRows, int KernelCols,
- typename TIn, typename TInGEMM, typename TOutGEMM, typename TOut,
- WinogradRoots Roots>
-class WinogradConvolutionLayer : public IWinogradConvolutionLayer
-{
- public:
- using WinogradBase = winograd::WinogradGEMM<OutputTileRows, OutputTileCols, KernelRows, KernelCols, Roots>;
- using WeightsTransform = typename WinogradBase::template WeightsTransform<TIn, TInGEMM>;
- using InputTransform = typename WinogradBase::template InputTransform<TIn, TInGEMM>;
- using WinogradConv = typename WinogradBase::template Convolution<TOut, TIn, TInGEMM, TOutGEMM>;
- using OutputTransform = typename WinogradBase::template OutputTransform<TOutGEMM, TOut>;
-
- private:
- static constexpr int InnerTileRows = OutputTileRows + KernelRows - 1;
- static constexpr int InnerTileCols = OutputTileCols + KernelCols - 1;
- static constexpr int N_GEMMS = InnerTileRows * InnerTileCols;
-
- const int _n_output_rows, _n_output_cols;
- const int _kernel_matrix_stride, _kernel_matrix_row_stride;
- const int _input_matrix_stride, _input_matrix_row_stride;
- const int _output_matrix_stride, _output_matrix_row_stride;
- const int _tile_rows, _tile_cols;
- const int _m, _k, _n;
-
-  WeightsTransform weights_transform;  /**< Operator to transform weights to Winograd domain. */
-  InputTransform _input_transform;     /**< Operator to transform input to Winograd domain. */
-  const arm_gemm::GemmArgs gemm_args;
-  arm_gemm::UniqueGemmCommon<TInGEMM, TOutGEMM> gemms;  /**< Operator to perform multiple GEMMs. */
-  OutputTransform _output_transform;   /**< Operator to transform output from Winograd domain. */
-
- public:
-
- /** Determine how much memory (in units of TIn) to allocate for the
- * transformed weights.
- */
-  static unsigned int get_weight_storage_size(
-    const int n_output_channels,  /**< Number of output feature maps. */
-    const int n_input_channels    /**< Number of input feature maps. */
-  );
-
-  static unsigned int get_weight_stride(
-    const int n_output_channels,  /**< Number of output feature maps. */
-    const int n_input_channels    /**< Number of input feature maps. */
-  );
-
-  static unsigned int get_weight_multi_stride(
-    const int n_output_channels,  /**< Number of output feature maps. */
-    const int n_input_channels    /**< Number of input feature maps. */
-  );
-
- /** Determine how much memory (in units of TIn) to allocate for the
- * transformed input.
- */
-  static unsigned int get_input_storage_size(
-    const int n_batches,     /**< Number of batches in the input tensor. */
-    const int n_channels,    /**< Number of feature maps in the input tensor. */
-    const int n_rows,        /**< Number of rows in each feature map. */
-    const int n_cols,        /**< Number of columns in each feature map. */
-    const bool same_padding  /**< Use "SAME" padding, otherwise use "VALID". */
-  );
-
-  /** Get the row stride for the A matrix in the Winograd domain. */
-  static unsigned int get_input_stride(
-    const int n_batches,     /**< Number of batches in the input tensor. */
-    const int n_channels,    /**< Number of feature maps in the input tensor. */
-    const int n_rows,        /**< Number of rows in each feature map. */
-    const int n_cols,        /**< Number of columns in each feature map. */
-    const bool same_padding  /**< Use "SAME" padding, otherwise use "VALID". */
-  );
-
-  /** Get the stride between A matrices in the Winograd domain. */
-  static unsigned int get_input_multi_stride(
-    const int n_batches,     /**< Number of batches in the input tensor. */
-    const int n_channels,    /**< Number of feature maps in the input tensor. */
-    const int n_rows,        /**< Number of rows in each feature map. */
-    const int n_cols,        /**< Number of columns in each feature map. */
-    const bool same_padding  /**< Use "SAME" padding, otherwise use "VALID". */
-  );
-
- /** Determine how much memory (in units of TOut) to allocate for the
- * (Winograd domain) output.
- */
-  static unsigned int get_output_storage_size(
-    const int n_batches,          /**< Number of batches in the output tensor. */
-    const int n_rows,             /**< Number of rows in each feature map of the input tensor. */
-    const int n_cols,             /**< Number of columns in each feature map of the input tensor. */
-    const int n_output_channels,  /**< Number of feature maps in the output tensor. */
-    const bool same_padding       /**< Use "SAME" padding, otherwise use "VALID". */
-  );
-
-  static unsigned int get_output_stride(
-    const int n_batches,          /**< Number of batches in the output tensor. */
-    const int n_rows,             /**< Number of rows in each feature map of the input tensor. */
-    const int n_cols,             /**< Number of columns in each feature map of the input tensor. */
-    const int n_output_channels,  /**< Number of feature maps in the output tensor. */
-    const bool same_padding       /**< Use "SAME" padding, otherwise use "VALID". */
-  );
-
-  static unsigned int get_output_multi_stride(
-    const int n_batches,          /**< Number of batches in the output tensor. */
-    const int n_rows,             /**< Number of rows in each feature map of the input tensor. */
-    const int n_cols,             /**< Number of columns in each feature map of the input tensor. */
-    const int n_output_channels,  /**< Number of feature maps in the output tensor. */
-    const bool same_padding       /**< Use "SAME" padding, otherwise use "VALID". */
-  );
-
-  /** Get the shape (rows, cols) of a feature map of the output tensor. */
-  static std::pair<int, int> get_output_feature_map_shape(
-    const int n_input_rows,  /**< Number of rows in the input feature map. */
-    const int n_input_cols,  /**< Number of columns in the input feature map. */
-    const bool same_padding  /**< Use "SAME" padding, otherwise use "VALID". */
-  );
-
- /** Create a new Winograd convolution layer.
- */
-  WinogradConvolutionLayer(
-    const arm_gemm::CPUInfo &cpuinfo,  /**< Describes CPU properties. */
-    const int n_threads,               /**< Maximum number of threads used to execute the convolution. */
-    const int n_batches,               /**< Number of batches in the input and output tensors. */
-    const int n_input_channels,        /**< Number of feature maps in a batch of the input tensor. */
-    const int n_input_rows,            /**< Number of rows in a feature map of the input tensor. */
-    const int n_input_cols,            /**< Number of columns in a feature map of the input tensor. */
-    const int n_output_channels,       /**< Number of feature maps in the output tensor. */
-    const bool same_padding,           /**< Use "SAME" padding, otherwise use "VALID". */
-    const arm_gemm::Activation &activation,
-    const TIn* const weights,          /**< Pointer to weight tensor in spatial domain. Must be ordered as "Height x Width x Input Feature Maps x Output Feature Maps". */
-    TInGEMM* const weights_storage,    /**< Pointer to storage for weight tensor in the Winograd domain. Must be at least the size returned by `get_weight_storage_size`. */
-    const TIn* const input,            /**< Pointer to NHWC ordered input tensor, in the spatial domain. */
-    TInGEMM* const winograd_input,     /**< Pointer to working space for the input tensor in the Winograd domain. Must be at least the size returned by `get_input_storage_size`. */
-    const TOut* const biases,          /**< Pointer to biases vector. Pass nullptr if no bias is provided. */
-    TOut* const output,                /**< Pointer to NHWC ordered output tensor, in the spatial domain. */
-    TOutGEMM* const winograd_output,   /**< Pointer to working space for the output tensor in the Winograd domain. Must be at least the size returned by `get_output_storage_size`. */
-    const bool pretranspose_B=true,    /**< Hint that the B matrix can be pretransposed. */
-    arm_gemm::GemmConfig *gemm_cfg=nullptr  /**< Pointer to GEMM configuration. */
-  );
-
- /* Utility methods for interacting with the layer. */
- unsigned int weight_transform_get_window(void) const;
- void weight_transform_run(const unsigned int start, const unsigned int stop);
-
- IInputTransform& input_transform(void);
- IOutputTransform& output_transform(void);
-
- /* Get a pointer to the GEMM underlying the Winograd transform. */
- arm_gemm::IGemmCommon *gemm(void);
-};
-
-} // namespace winograd
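A minimal usage sketch of the lifecycle described in the class comment above, assuming an F(2x2, 3x3) float configuration. The function name and its parameters are illustrative placeholders, and the arm_gemm GEMM window arguments are elided rather than guessed:

#include <vector>

void run_winograd_f2x2_k3x3(const arm_gemm::CPUInfo &cpuinfo, int n_threads,
                            int n_batches, int n_input_channels,
                            int n_input_rows, int n_input_cols,
                            int n_output_channels, bool same_padding,
                            const arm_gemm::Activation &activation,
                            const float *weights, const float *input,
                            const float *biases, float *output)
{
    using Layer = winograd::WinogradConvolutionLayer<
        2, 2, 3, 3, float, float, float, float, winograd::WinogradRoots::Integers>;

    // Storage sizes are returned as element counts, so plain vectors suffice here.
    std::vector<float> wino_weights(Layer::get_weight_storage_size(n_output_channels, n_input_channels));
    std::vector<float> wino_input(Layer::get_input_storage_size(n_batches, n_input_channels, n_input_rows, n_input_cols, same_padding));
    std::vector<float> wino_output(Layer::get_output_storage_size(n_batches, n_input_rows, n_input_cols, n_output_channels, same_padding));

    Layer layer(cpuinfo, n_threads, n_batches, n_input_channels, n_input_rows, n_input_cols,
                n_output_channels, same_padding, activation,
                weights, wino_weights.data(), input, wino_input.data(),
                biases, output, wino_output.data());

    // Once, at setup: move the weights into the Winograd domain.
    layer.weight_transform_run(0, layer.weight_transform_get_window());

    // Per inference: spatial -> Winograd, batched GEMMs, Winograd -> spatial.
    layer.input_transform().run(0, layer.input_transform().get_window());
    // layer.gemm()->run(...);  // scheduling of the GEMM window is backend-specific, elided here
    layer.output_transform().run(0, layer.output_transform().get_window());
}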
diff --git a/arm_compute/core/NEON/kernels/detail/NEActivationFunctionDetail.h b/arm_compute/core/NEON/kernels/detail/NEActivationFunctionDetail.h
deleted file mode 100644
index 4861559695..0000000000
--- a/arm_compute/core/NEON/kernels/detail/NEActivationFunctionDetail.h
+++ /dev/null
@@ -1,247 +0,0 @@
-/*
- * Copyright (c) 2018-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_DETAIL_NEACTIVATION_FUNCTION_DETAIL_H
-#define ARM_COMPUTE_DETAIL_NEACTIVATION_FUNCTION_DETAIL_H
-
-#include "arm_compute/core/NEON/wrapper/wrapper.h"
-
-namespace arm_compute
-{
-namespace detail
-{
-/** Dummy activation object */
-template <typename T, int S>
-struct dummy
-{
- /** NEON vector type. */
- using ExactType = typename wrapper::traits::neon_vector<T, S>::type;
-
- /** Construct a dummy activation object.
- *
- * @param[in] act_info Activation layer information.
- */
- explicit dummy(ActivationLayerInfo act_info)
- {
- ARM_COMPUTE_UNUSED(act_info);
- }
- /** Run activation function.
- *
- * @param[in] vval Vector of values.
- */
- void operator()(ExactType &vval)
- {
- ARM_COMPUTE_UNUSED(vval);
- }
-};
-/** Linear activation object */
-template <typename T, int S>
-struct linear
-{
- /** NEON vector type. */
- using ExactType = typename wrapper::traits::neon_vector<T, S>::type;
- /** NEON vector tag type. */
- using ExactTagType = typename wrapper::traits::neon_vector<T, S>::tag_type;
-
- /** Construct a Linear activation object.
- *
- * @param[in] act_info Activation layer information.
- */
- explicit linear(ActivationLayerInfo act_info)
- : valpha(wrapper::vdup_n(static_cast<T>(act_info.a()), ExactTagType{})),
- vbeta(wrapper::vdup_n(static_cast<T>(act_info.b()), ExactTagType{}))
- {
- }
-
- /** Run activation function.
- *
- * @param[in] vval Vector of values.
- */
- void operator()(ExactType &vval)
- {
- vval = wrapper::vmla(vval, valpha, vbeta);
- }
-
- /** Vector of alphas. */
- const ExactType valpha;
- /** Vector of betas. */
- const ExactType vbeta;
-};
-/** Square activation object */
-template <typename T, int S>
-struct square
-{
- /** NEON vector type. */
- using ExactType = typename wrapper::traits::neon_vector<T, S>::type;
- /** NEON vector tag type. */
- using ExactTagType = typename wrapper::traits::neon_vector<T, S>::tag_type;
-
- /** Construct a Square activation object.
- *
- * @param[in] act_info Activation layer information.
- */
- explicit square(ActivationLayerInfo act_info)
- {
- ARM_COMPUTE_UNUSED(act_info);
- }
-
- /** Run activation function.
- *
- * @param[in] vval Vector of values.
- */
- void operator()(ExactType &vval)
- {
- vval = wrapper::vmul(vval, vval);
- }
-};
-/** Logistic activation object */
-template <typename T, int S>
-struct logistic
-{
- /** NEON vector type. */
- using ExactType = typename wrapper::traits::neon_vector<T, S>::type;
- /** NEON vector tag type. */
- using ExactTagType = typename wrapper::traits::neon_vector<T, S>::tag_type;
-
- /** Construct a Logistic activation object.
- *
- * @param[in] act_info Activation layer information.
- */
- explicit logistic(ActivationLayerInfo act_info)
- : vone(wrapper::vdup_n(static_cast<T>(1.f), ExactTagType{}))
- {
- ARM_COMPUTE_UNUSED(act_info);
- }
-
- /** Run activation function.
- *
- * @param[in] vval Vector of values.
- */
- void operator()(ExactType &vval)
- {
- vval = wrapper::vinv(wrapper::vadd(vone, wrapper::vexpq(wrapper::vneg(vval))));
- }
-
- /** Vector of ones. */
- const ExactType vone;
-};
-/** RELU activation object */
-template <typename T, int S>
-struct relu
-{
- /** NEON vector type. */
- using ExactType = typename wrapper::traits::neon_vector<T, S>::type;
- /** NEON vector tag type. */
- using ExactTagType = typename wrapper::traits::neon_vector<T, S>::tag_type;
-
- /** Construct a RELU activation object.
- *
- * @param[in] act_info Activation layer information.
- */
- explicit relu(ActivationLayerInfo act_info)
- : vzero(wrapper::vdup_n(static_cast<T>(0.f), ExactTagType{}))
- {
- ARM_COMPUTE_UNUSED(act_info);
- }
-
- /** Run activation function.
- *
- * @param[in] vval Vector of values.
- */
- void operator()(ExactType &vval)
- {
- vval = wrapper::vmax(vzero, vval);
- }
-
- /** Vector of zeroes. */
- const ExactType vzero;
-};
-/** Bounded RELU activation object */
-template <typename T, int S>
-struct brelu
-{
- /** NEON vector type. */
- using ExactType = typename wrapper::traits::neon_vector<T, S>::type;
- /** NEON vector tag type. */
- using ExactTagType = typename wrapper::traits::neon_vector<T, S>::tag_type;
-
- /** Construct a bounded RELU activation object.
- *
- * @param[in] act_info Activation layer information.
- */
- explicit brelu(ActivationLayerInfo act_info)
- : vzero(wrapper::vdup_n(static_cast<T>(0.f), ExactTagType{})),
- valpha(wrapper::vdup_n(static_cast<T>(act_info.a()), ExactTagType{}))
- {
- }
-
- /** Run activation function.
- *
- * @param[in] vval Vector of values.
- */
- void operator()(ExactType &vval)
- {
- vval = wrapper::vmin(valpha, wrapper::vmax(vzero, vval));
- }
-
- /** Vector of zeroes. */
- const ExactType vzero;
- /** Vector of alphas. */
- const ExactType valpha;
-};
-/** Lower-Upper Bounded RELU activation object */
-template <typename T, int S>
-struct lubrelu
-{
- /** NEON vector type. */
- using ExactType = typename wrapper::traits::neon_vector<T, S>::type;
- /** NEON vector tag type. */
- using ExactTagType = typename wrapper::traits::neon_vector<T, S>::tag_type;
-
- /** Construct a lower-upper bounded RELU activation object.
- *
- * @param[in] act_info Activation layer information.
- */
- explicit lubrelu(ActivationLayerInfo act_info)
- : valpha(wrapper::vdup_n(static_cast<T>(act_info.a()), ExactTagType{})),
- vbeta(wrapper::vdup_n(static_cast<T>(act_info.b()), ExactTagType{}))
- {
- }
-
- /** Run activation function.
- *
- * @param[in] vval Vector of values.
- */
- void operator()(ExactType &vval)
- {
- vval = wrapper::vmin(valpha, wrapper::vmax(vbeta, vval));
- }
-
- /** Vector of alphas. */
- const ExactType valpha;
- /** Vector of betas. */
- const ExactType vbeta;
-};
-} // namespace detail
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_DETAIL_NEACTIVATION_FUNCTION_DETAIL_H */
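The functors in this header all follow one pattern: construct once per kernel from an ActivationLayerInfo, then apply in-place to every vector of the inner loop. A minimal sketch of a caller, using plain NEON float32x4_t loads; the apply_activation helper is an illustrative assumption, not part of the deleted header, and the scalar tail is omitted:

#include <arm_neon.h>

template <typename ActFn>
void apply_activation(float *data, int n, const arm_compute::ActivationLayerInfo &info)
{
    ActFn act(info);  // e.g. arm_compute::detail::relu<float, 4>
    for(int i = 0; i + 4 <= n; i += 4)
    {
        float32x4_t v = vld1q_f32(data + i);
        act(v);                  // the functor mutates the vector in place
        vst1q_f32(data + i, v);
    }
}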
diff --git a/arm_compute/core/NEON/kernels/detail/NEDirectConvolution3x3.h b/arm_compute/core/NEON/kernels/detail/NEDirectConvolution3x3.h
deleted file mode 100644
index d756a9a98f..0000000000
--- a/arm_compute/core/NEON/kernels/detail/NEDirectConvolution3x3.h
+++ /dev/null
@@ -1,170 +0,0 @@
-/*
- * Copyright (c) 2017-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#ifndef ARM_COMPUTE_NECONVOLUTIONKERNEL3x3_H
-#define ARM_COMPUTE_NECONVOLUTIONKERNEL3x3_H
-
-#include <arm_neon.h>
-
-namespace arm_compute
-{
-namespace detail
-{
-inline float32x4x3_t load_matrix_row(const float *ptr)
-{
- const float32x4x3_t r =
- {
- {
- vld1q_dup_f32(ptr),
- vld1q_dup_f32(1 + ptr),
- vld1q_dup_f32(2 + ptr)
- }
- };
- return r;
-}
-
-template <unsigned int stridex>
-float32x4x2_t convolve_3x3(const float *in_top, const float *in_mid, const float *in_low, const float32x4x3_t &m0, const float32x4x3_t &m1, const float32x4x3_t &m2);
-
-template <>
-inline float32x4x2_t convolve_3x3<1>(const float *in_top, const float *in_mid, const float *in_low, const float32x4x3_t &m0, const float32x4x3_t &m1, const float32x4x3_t &m2)
-{
- const float32x4x3_t vtop =
- {
- {
- vld1q_f32(in_top),
- vld1q_f32(in_top + 4),
- vld1q_f32(in_top + 8)
- }
- };
- const float32x4x3_t vmid =
- {
- {
- vld1q_f32(in_mid),
- vld1q_f32(in_mid + 4),
- vld1q_f32(in_mid + 8)
- }
- };
- const float32x4x3_t vlow =
- {
- {
- vld1q_f32(in_low),
- vld1q_f32(in_low + 4),
- vld1q_f32(in_low + 8)
- }
- };
- float32x4x2_t out =
- {
- {
- vmulq_f32(vtop.val[0], m0.val[0]),
- vmulq_f32(vtop.val[1], m0.val[0])
- }
- };
- out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vtop.val[0], vtop.val[1], 1), m0.val[1]);
- out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vtop.val[0], vtop.val[1], 2), m0.val[2]);
-
- out.val[0] = vmlaq_f32(out.val[0], vmid.val[0], m1.val[0]);
- out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vmid.val[0], vmid.val[1], 1), m1.val[1]);
- out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vmid.val[0], vmid.val[1], 2), m1.val[2]);
-
- out.val[0] = vmlaq_f32(out.val[0], vlow.val[0], m2.val[0]);
- out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vlow.val[0], vlow.val[1], 1), m2.val[1]);
- out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vlow.val[0], vlow.val[1], 2), m2.val[2]);
-
- out.val[1] = vmlaq_f32(out.val[1], vextq_f32(vtop.val[1], vtop.val[2], 1), m0.val[1]);
- out.val[1] = vmlaq_f32(out.val[1], vextq_f32(vtop.val[1], vtop.val[2], 2), m0.val[2]);
-
- out.val[1] = vmlaq_f32(out.val[1], vmid.val[1], m1.val[0]);
- out.val[1] = vmlaq_f32(out.val[1], vextq_f32(vmid.val[1], vmid.val[2], 1), m1.val[1]);
- out.val[1] = vmlaq_f32(out.val[1], vextq_f32(vmid.val[1], vmid.val[2], 2), m1.val[2]);
-
- out.val[1] = vmlaq_f32(out.val[1], vlow.val[1], m2.val[0]);
- out.val[1] = vmlaq_f32(out.val[1], vextq_f32(vlow.val[1], vlow.val[2], 1), m2.val[1]);
- out.val[1] = vmlaq_f32(out.val[1], vextq_f32(vlow.val[1], vlow.val[2], 2), m2.val[2]);
- return out;
-}
-
-template <>
-inline float32x4x2_t convolve_3x3<2>(const float *in_top, const float *in_mid, const float *in_low, const float32x4x3_t &m0, const float32x4x3_t &m1, const float32x4x3_t &m2)
-{
- float32x4x2_t out = convolve_3x3<1>(in_top, in_mid, in_low, m0, m1, m2);
- out.val[0] = vsetq_lane_f32(vgetq_lane_f32(out.val[0], 2), out.val[0], 1);
- out.val[0] = vsetq_lane_f32(vgetq_lane_f32(out.val[1], 0), out.val[0], 2);
- out.val[0] = vsetq_lane_f32(vgetq_lane_f32(out.val[1], 2), out.val[0], 3);
- return out;
-}
-
-template <>
-inline float32x4x2_t convolve_3x3<3>(const float *in_top, const float *in_mid, const float *in_low, const float32x4x3_t &m0, const float32x4x3_t &m1, const float32x4x3_t &m2)
-{
- float32x4x2_t out = convolve_3x3<1>(in_top, in_mid, in_low, m0, m1, m2);
- out.val[0] = vsetq_lane_f32(vgetq_lane_f32(out.val[0], 3), out.val[0], 1);
- return out;
-}
-
-template <unsigned int stridex>
-void store_results(float *buffer, const float32x4x2_t &values);
-
-template <>
-void store_results<1>(float *buffer, const float32x4x2_t &values)
-{
- vst1q_f32(buffer, values.val[0]);
- vst1q_f32(buffer + 4, values.val[1]);
-}
-
-template <>
-void store_results<2>(float *buffer, const float32x4x2_t &values)
-{
- vst1q_f32(buffer, values.val[0]);
-}
-
-template <>
-void store_results<3>(float *buffer, const float32x4x2_t &values)
-{
- vst1_f32(buffer, vget_low_f32(values.val[0]));
-}
-
-template <unsigned int stridex>
-int get_input_num_elems_processed(unsigned int num_elems_written_per_iteration);
-
-template <>
-int get_input_num_elems_processed<1>(unsigned int num_elems_written_per_iteration)
-{
- return num_elems_written_per_iteration;
-}
-
-template <>
-int get_input_num_elems_processed<2>(unsigned int num_elems_written_per_iteration)
-{
- return num_elems_written_per_iteration << 1;
-}
-
-template <>
-int get_input_num_elems_processed<3>(unsigned int num_elems_written_per_iteration)
-{
- return num_elems_written_per_iteration * 3;
-}
-} // namespace detail
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_NECONVOLUTIONKERNEL3x3_H */
\ No newline at end of file
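One way a caller steps across an output row with these helpers, as a sketch (kernel, in_top/in_mid/in_low, out_row and out_width are assumed names; row borders are handled elsewhere). With stridex == 1 each call reads twelve consecutive inputs per row and writes eight outputs, so the pointers advance by eight; for strides 2 and 3 store_results shrinks to four and two outputs and get_input_num_elems_processed gives the matching input advance:

// kernel is assumed to be a row-major 3x3 float filter.
const float32x4x3_t m0 = arm_compute::detail::load_matrix_row(kernel);
const float32x4x3_t m1 = arm_compute::detail::load_matrix_row(kernel + 3);
const float32x4x3_t m2 = arm_compute::detail::load_matrix_row(kernel + 6);

for(int x = 0; x + 8 <= out_width; x += 8)  // stridex == 1: 8 outputs per step
{
    const float32x4x2_t vres =
        arm_compute::detail::convolve_3x3<1>(in_top + x, in_mid + x, in_low + x, m0, m1, m2);
    arm_compute::detail::store_results<1>(out_row + x, vres);
}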
diff --git a/arm_compute/core/NEON/kernels/detail/NEDirectConvolutionDetail.h b/arm_compute/core/NEON/kernels/detail/NEDirectConvolutionDetail.h
deleted file mode 100644
index d4cbc7f4af..0000000000
--- a/arm_compute/core/NEON/kernels/detail/NEDirectConvolutionDetail.h
+++ /dev/null
@@ -1,965 +0,0 @@
-/*
- * Copyright (c) 2017-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#ifndef ARM_COMPUTE_NEDIRECTCONVOLUTIONDETAIL_H
-#define ARM_COMPUTE_NEDIRECTCONVOLUTIONDETAIL_H
-
-#include "arm_compute/core/AccessWindowStatic.h"
-#include "arm_compute/core/NEON/NEFixedPoint.h"
-#include "arm_compute/core/NEON/wrapper/wrapper.h"
-#include "arm_compute/core/utils/misc/Requires.h"
-
-#include <arm_neon.h>
-
-namespace arm_compute
-{
-namespace detail
-{
-/** Loads a 3x3 matrix as a row (float).
- *
- * @param[in] ptr Pointer to a float 3x3 matrix.
- * @param[in] weights_offset (Optional) Weights quantization offset.
- *
- * @return The loaded matrix.
- */
-inline float32x4x3_t load_matrix_row(const float *ptr, int weights_offset = 0)
-{
- ARM_COMPUTE_UNUSED(weights_offset);
- const float32x4x3_t r =
- {
- {
- vld1q_dup_f32(ptr),
- vld1q_dup_f32(1 + ptr),
- vld1q_dup_f32(2 + ptr)
- }
- };
- return r;
-}
-
-/** Loads a 3x3 matrix as a row (uint8_t/int8_t).
- *
- * @param[in] ptr Pointer to a uint8_t/int8_t 3x3 matrix.
- * @param[in] weights_offset (Optional) Weights quantization offset.
- *
- * @return The loaded matrix.
- */
-template < typename T, REQUIRES_TA(std::is_same<T, uint8_t>::value || std::is_same<T, int8_t>::value) >
-inline int32x4x3_t load_matrix_row(const T *ptr, int weights_offset = 0)
-{
- const int32x4_t v_weights_offset = vdupq_n_s32(weights_offset);
-
- /* ptr is a pointer to a row in a 3x3 matrix, the function returns 3 vectors holding exactly the same value in all lanes:
- r.val[0] contains the first element, r.val[1] the second element and r.val[2] the third element (in all lanes) */
- int32x4x3_t r =
- {
- {
- vaddq_s32(v_weights_offset, vdupq_n_s32(*ptr)),
- vaddq_s32(v_weights_offset, vdupq_n_s32(*(ptr + 1))),
- vaddq_s32(v_weights_offset, vdupq_n_s32(*(ptr + 2)))
- }
- };
- return r;
-}
-
-/** Stores a float32x4x2_t array into a memory location.
- *
- * @param[in] buffer Pointer to the memory location where the values will be stored.
- * @param[in] values Values that will be stored.
- *
- */
-template <unsigned int stridex>
-void store_results(float *buffer, const float32x4x2_t &values);
-
-template <>
-inline void store_results<1>(float *buffer, const float32x4x2_t &values)
-{
- vst1q_f32(buffer, values.val[0]);
- vst1q_f32(buffer + 4, values.val[1]);
-}
-
-template <>
-inline void store_results<2>(float *buffer, const float32x4x2_t &values)
-{
- vst1q_f32(buffer, values.val[0]);
-}
-
-template <>
-inline void store_results<3>(float *buffer, const float32x4x2_t &values)
-{
- vst1_f32(buffer, vget_low_f32(values.val[0]));
-}
-
-/** Stores an int32x4x2_t array into a memory location.
- *
- * @param[in] buffer Pointer to the memory location where the values will be stored.
- * @param[in] values Values that will be stored.
- *
- */
-template <unsigned int stridex>
-void store_results(int32_t *buffer, const int32x4x2_t &values);
-
-template <>
-inline void store_results<1>(int32_t *buffer, const int32x4x2_t &values)
-{
- vst1q_s32(buffer, values.val[0]);
- vst1q_s32(buffer + 4, values.val[1]);
-}
-
-template <>
-inline void store_results<2>(int32_t *buffer, const int32x4x2_t &values)
-{
- vst1q_s32(buffer, values.val[0]);
-}
-
-template <>
-inline void store_results<3>(int32_t *buffer, const int32x4x2_t &values)
-{
- vst1_s32(buffer, vget_low_s32(values.val[0]));
-}
-
-template <unsigned int stridex>
-inline void accumulate_results(float *buffer, const float32x4x2_t &values);
-
-template <>
-inline void accumulate_results<1>(float *buffer, const float32x4x2_t &values)
-{
- vst1q_f32(buffer, vaddq_f32(vld1q_f32(buffer), values.val[0]));
- vst1q_f32(buffer + 4, vaddq_f32(vld1q_f32(buffer + 4), values.val[1]));
-}
-
-template <>
-inline void accumulate_results<2>(float *buffer, const float32x4x2_t &values)
-{
- vst1q_f32(buffer, vaddq_f32(vld1q_f32(buffer), values.val[0]));
-}
-
-template <>
-inline void accumulate_results<3>(float *buffer, const float32x4x2_t &values)
-{
- vst1_f32(buffer, vadd_f32(vld1_f32(buffer), vget_low_f32(values.val[0])));
-}
-
-template <unsigned int stridex>
-void accumulate_results(int32_t *buffer, const int32x4x2_t &values);
-
-template <>
-inline void accumulate_results<1>(int32_t *buffer, const int32x4x2_t &values)
-{
- vst1q_s32(buffer, vaddq_s32(vld1q_s32(buffer), values.val[0]));
- vst1q_s32(buffer + 4, vaddq_s32(vld1q_s32(buffer + 4), values.val[1]));
-}
-
-template <>
-inline void accumulate_results<2>(int32_t *buffer, const int32x4x2_t &values)
-{
- vst1q_s32(buffer, vaddq_s32(vld1q_s32(buffer), values.val[0]));
-}
-
-template <>
-inline void accumulate_results<3>(int32_t *buffer, const int32x4x2_t &values)
-{
- vst1_s32(buffer, vadd_s32(vld1_s32(buffer), vget_low_s32(values.val[0])));
-}
-
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-/** Stores a float16x8x2_t array into a memory location.
- *
- * @param[in] buffer Pointer to the memory location where the values will be stored.
- * @param[in] values Values that will be stored.
- *
- */
-template <unsigned int stridex>
-void store_results(float16_t *buffer, const float16x8x2_t &values);
-
-template <>
-inline void store_results<1>(float16_t *buffer, const float16x8x2_t &values)
-{
- vst1q_f16(buffer, values.val[0]);
- vst1q_f16(buffer + 8, values.val[1]);
-}
-
-template <>
-inline void store_results<2>(float16_t *buffer, const float16x8x2_t &values)
-{
- vst1q_f16(buffer, values.val[0]);
-}
-
-template <>
-inline void store_results<3>(float16_t *buffer, const float16x8x2_t &values)
-{
- vst1_f16(buffer, vget_low_f16(values.val[0]));
-}
-
-template <unsigned int stridex>
-inline void accumulate_results(float16_t *buffer, const float16x8x2_t &values);
-
-template <>
-inline void accumulate_results<1>(float16_t *buffer, const float16x8x2_t &values)
-{
- vst1q_f16(buffer, vaddq_f16(vld1q_f16(buffer), values.val[0]));
- vst1q_f16(buffer + 8, vaddq_f16(vld1q_f16(buffer + 8), values.val[1]));
-}
-
-template <>
-inline void accumulate_results<2>(float16_t *buffer, const float16x8x2_t &values)
-{
- vst1q_f16(buffer, vaddq_f16(vld1q_f16(buffer), values.val[0]));
-}
-
-template <>
-inline void accumulate_results<3>(float16_t *buffer, const float16x8x2_t &values)
-{
- vst1_f16(buffer, vadd_f16(vld1_f16(buffer), vget_low_f16(values.val[0])));
-}
-#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
-
-/** Perform a 3x3 convolution for 4 consecutive elements on float32 when dilation.x() or dilation.y() is not 1.
- *
- * @param[in] in_top Pointer to the first row of the input.
- * @param[in] in_mid Pointer to the second row of the input.
- * @param[in] in_low Pointer to the third row of the input.
- * @param[in] m0 First row of the filter.
- * @param[in] m1 Second row of the filter.
- * @param[in] m2 Third row of the filter.
- * @param[in] dilation_x Dilation, in elements across x.
- * @param[in] input_offset (Optional) Input quantization offset.
- *
- */
-inline float32x4_t single_convolve_3x3_dilation(const float *in_top, const float *in_mid, const float *in_low,
- const float32x4x3_t &m0, const float32x4x3_t &m1, const float32x4x3_t &m2,
- const size_t dilation_x, int input_offset)
-{
- ARM_COMPUTE_UNUSED(input_offset);
-
- const float32x4x3_t vtop =
- {
- {
- vld1q_f32(in_top),
- vld1q_f32(in_top + dilation_x),
- vld1q_f32(in_top + 2 * dilation_x)
- }
- };
- const float32x4x3_t vmid =
- {
- {
- vld1q_f32(in_mid),
- vld1q_f32(in_mid + dilation_x),
- vld1q_f32(in_mid + 2 * dilation_x)
- }
- };
- const float32x4x3_t vlow =
- {
- {
- vld1q_f32(in_low),
- vld1q_f32(in_low + dilation_x),
- vld1q_f32(in_low + 2 * dilation_x)
- }
- };
- float32x4_t out = vmulq_f32(vtop.val[0], m0.val[0]);
- out = vmlaq_f32(out, vtop.val[1], m0.val[1]);
- out = vmlaq_f32(out, vtop.val[2], m0.val[2]);
-
- out = vmlaq_f32(out, vmid.val[0], m1.val[0]);
- out = vmlaq_f32(out, vmid.val[1], m1.val[1]);
- out = vmlaq_f32(out, vmid.val[2], m1.val[2]);
-
- out = vmlaq_f32(out, vlow.val[0], m2.val[0]);
- out = vmlaq_f32(out, vlow.val[1], m2.val[1]);
- out = vmlaq_f32(out, vlow.val[2], m2.val[2]);
-
- return out;
-}
-
-/** Perform a 3x3 convolution for 8 consecutive elements on float32 when dilation.x() or dilation.y() is not 1.
- *
- * @param[in] in_top Pointer to the first row of the input.
- * @param[in] in_mid Pointer to the second row of the input.
- * @param[in] in_low Pointer to the third row of the input.
- * @param[in] m0 First row of the filter.
- * @param[in] m1 Second row of the filter.
- * @param[in] m2 Third row of the filter.
- * @param[in] dilation_x Dilation, in elements across x.
- * @param[in] stridex Stride value in elements across x.
- * @param[in] input_offset (Optional) Input quantization offset.
- *
- */
-inline float32x4x2_t convolve_3x3_dilation(const float *in_top, const float *in_mid, const float *in_low,
- const float32x4x3_t &m0, const float32x4x3_t &m1, const float32x4x3_t &m2,
- const size_t dilation_x, unsigned int stridex, int input_offset = 0)
-{
- ARM_COMPUTE_ERROR_ON(stridex > 3);
- float32x4x2_t out =
- {
- {
- single_convolve_3x3_dilation(in_top, in_mid, in_low, m0, m1, m2, dilation_x, input_offset),
- single_convolve_3x3_dilation(in_top + 4, in_mid + 4, in_low + 4, m0, m1, m2, dilation_x, input_offset)
- }
- };
-
- if(stridex == 2)
- {
- out.val[0] = vsetq_lane_f32(vgetq_lane_f32(out.val[0], 2), out.val[0], 1);
- out.val[0] = vsetq_lane_f32(vgetq_lane_f32(out.val[1], 0), out.val[0], 2);
- out.val[0] = vsetq_lane_f32(vgetq_lane_f32(out.val[1], 2), out.val[0], 3);
- }
- else if(stridex == 3)
- {
- out.val[0] = vsetq_lane_f32(vgetq_lane_f32(out.val[0], 3), out.val[0], 1);
- }
-
- return out;
-}
-
-/** Perform a convolve3x3 on float32.
- *
- * @param[in] in_top Pointer to the first row of the input.
- * @param[in] in_mid Pointer to the second row of the input.
- * @param[in] in_low Pointer to the third row of the input.
- * @param[out] out_ptr Pointer to the output.
- * @param[in] m0 First row of the filter.
- * @param[in] m1 Second row of the filter.
- * @param[in] m2 Third row of the filter.
- * @param[in] stridex Stride value in elements across x.
- * @param[in] input_offset (Optional) Input quantization offset.
- *
- */
-template <bool accumulate>
-void convolve_3x3(const float *in_top, const float *in_mid, const float *in_low, float *out_ptr,
- const float32x4x3_t &m0, const float32x4x3_t &m1, const float32x4x3_t &m2,
- unsigned int stridex, int input_offset = 0);
-
-template <bool accumulate>
-inline void convolve_3x3(const float *in_top, const float *in_mid, const float *in_low, float *out_ptr,
- const float32x4x3_t &m0, const float32x4x3_t &m1, const float32x4x3_t &m2,
- unsigned int stridex, int input_offset)
-{
- ARM_COMPUTE_UNUSED(input_offset);
- ARM_COMPUTE_ERROR_ON(stridex > 3);
-
- float32x4x2_t out =
- {
- {
- vdupq_n_f32(0.f),
- vdupq_n_f32(0.f)
- }
- };
- if(stridex == 2)
- {
- const float32x4x2_t vtop = vld2q_f32(in_top);
- const float32x4x2_t vmid = vld2q_f32(in_mid);
- const float32x4x2_t vlow = vld2q_f32(in_low);
- const float32x4_t vtop_end = vld1q_f32(in_top + 8);
- const float32x4_t vmid_end = vld1q_f32(in_mid + 8);
- const float32x4_t vlow_end = vld1q_f32(in_low + 8);
-
- out.val[0] = vmulq_f32(vtop.val[0], m0.val[0]);
-
- out.val[0] = vmlaq_f32(out.val[0], vtop.val[1], m0.val[1]);
- out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vtop.val[0], vtop_end, 1), m0.val[2]);
-
- out.val[0] = vmlaq_f32(out.val[0], vmid.val[0], m1.val[0]);
- out.val[0] = vmlaq_f32(out.val[0], vmid.val[1], m1.val[1]);
- out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vmid.val[0], vmid_end, 1), m1.val[2]);
-
- out.val[0] = vmlaq_f32(out.val[0], vlow.val[0], m2.val[0]);
- out.val[0] = vmlaq_f32(out.val[0], vlow.val[1], m2.val[1]);
- out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vlow.val[0], vlow_end, 1), m2.val[2]);
-
- accumulate ? accumulate_results<2>(out_ptr, out) : store_results<2>(out_ptr, out);
- }
- else
- {
- const float32x4x3_t vtop =
- {
- {
- vld1q_f32(in_top),
- vld1q_f32(in_top + 4),
- vld1q_f32(in_top + 8)
- }
- };
- const float32x4x3_t vmid =
- {
- {
- vld1q_f32(in_mid),
- vld1q_f32(in_mid + 4),
- vld1q_f32(in_mid + 8)
- }
- };
- const float32x4x3_t vlow =
- {
- {
- vld1q_f32(in_low),
- vld1q_f32(in_low + 4),
- vld1q_f32(in_low + 8)
- }
- };
- out.val[0] = vmulq_f32(vtop.val[0], m0.val[0]);
- out.val[1] = vmulq_f32(vtop.val[1], m0.val[0]);
-
- out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vtop.val[0], vtop.val[1], 1), m0.val[1]);
- out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vtop.val[0], vtop.val[1], 2), m0.val[2]);
-
- out.val[0] = vmlaq_f32(out.val[0], vmid.val[0], m1.val[0]);
- out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vmid.val[0], vmid.val[1], 1), m1.val[1]);
- out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vmid.val[0], vmid.val[1], 2), m1.val[2]);
-
- out.val[0] = vmlaq_f32(out.val[0], vlow.val[0], m2.val[0]);
- out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vlow.val[0], vlow.val[1], 1), m2.val[1]);
- out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vlow.val[0], vlow.val[1], 2), m2.val[2]);
-
- out.val[1] = vmlaq_f32(out.val[1], vextq_f32(vtop.val[1], vtop.val[2], 1), m0.val[1]);
- out.val[1] = vmlaq_f32(out.val[1], vextq_f32(vtop.val[1], vtop.val[2], 2), m0.val[2]);
-
- out.val[1] = vmlaq_f32(out.val[1], vmid.val[1], m1.val[0]);
- out.val[1] = vmlaq_f32(out.val[1], vextq_f32(vmid.val[1], vmid.val[2], 1), m1.val[1]);
- out.val[1] = vmlaq_f32(out.val[1], vextq_f32(vmid.val[1], vmid.val[2], 2), m1.val[2]);
-
- out.val[1] = vmlaq_f32(out.val[1], vlow.val[1], m2.val[0]);
- out.val[1] = vmlaq_f32(out.val[1], vextq_f32(vlow.val[1], vlow.val[2], 1), m2.val[1]);
- out.val[1] = vmlaq_f32(out.val[1], vextq_f32(vlow.val[1], vlow.val[2], 2), m2.val[2]);
-
- if(stridex == 3)
- {
- out.val[0] = vsetq_lane_f32(vgetq_lane_f32(out.val[0], 3), out.val[0], 1);
- accumulate ? accumulate_results<3>(out_ptr, out) : store_results<3>(out_ptr, out);
- }
- else
- {
- accumulate ? accumulate_results<1>(out_ptr, out) : store_results<1>(out_ptr, out);
- }
- }
-}
-
-/** Perform a 3x3 convolution for 4 consecutive 8-bit elements when dilation.x() or dilation.y() is not 1.
- *
- * @param[in] in_top Pointer to the first row of the input.
- * @param[in] in_mid Pointer to the second row of the input.
- * @param[in] in_low Pointer to the third row of the input.
- * @param[in] m0 First row of the filter.
- * @param[in] m1 Second row of the filter.
- * @param[in] m2 Third row of the filter.
- * @param[in] dilation_x Dilation, in elements across x.
- * @param[in] input_offset Input quantization offset.
- *
- */
-template < typename T, REQUIRES_TA(std::is_same<T, uint8_t>::value || std::is_same<T, int8_t>::value) >
-inline int32x4_t single_convolve_3x3_dilation(const T *in_top, const T *in_mid, const T *in_low,
- const int32x4x3_t &m0, const int32x4x3_t &m1, const int32x4x3_t &m2,
- size_t dilation_x, int32_t input_offset)
-{
- using VectorType = typename std::conditional<std::is_same<T, uint8_t>::value, uint8x8x3_t, int8x8x3_t>::type;
- using OutputTagType = typename wrapper::traits::neon_bitvector_tag_t<int32_t, wrapper::traits::BitWidth::W128>;
-
- const int32x4_t v_input_offset = wrapper::vdup_n(input_offset, OutputTagType{});
-
- const VectorType vtop =
- {
- {
- wrapper::vload(in_top),
- wrapper::vload(in_top + dilation_x),
- wrapper::vload(in_top + 2 * dilation_x)
- }
- };
- const VectorType vmid =
- {
- {
- wrapper::vload(in_mid),
- wrapper::vload(in_mid + dilation_x),
- wrapper::vload(in_mid + 2 * dilation_x)
- }
- };
- const VectorType vlow =
- {
- {
- wrapper::vload(in_low),
- wrapper::vload(in_low + dilation_x),
- wrapper::vload(in_low + 2 * dilation_x)
- }
- };
-
- const int32x4x3_t vtop_s32 =
- {
- {
- wrapper::vaddw(v_input_offset, wrapper::vreinterpret(wrapper::vgetlow(wrapper::vmovl(vtop.val[0])))),
- wrapper::vaddw(v_input_offset, wrapper::vreinterpret(wrapper::vgetlow(wrapper::vmovl(vtop.val[1])))),
- wrapper::vaddw(v_input_offset, wrapper::vreinterpret(wrapper::vgetlow(wrapper::vmovl(vtop.val[2])))),
- }
- };
- const int32x4x3_t vmid_s32 =
- {
- {
- wrapper::vaddw(v_input_offset, wrapper::vreinterpret(wrapper::vgetlow(wrapper::vmovl(vmid.val[0])))),
- wrapper::vaddw(v_input_offset, wrapper::vreinterpret(wrapper::vgetlow(wrapper::vmovl(vmid.val[1])))),
- wrapper::vaddw(v_input_offset, wrapper::vreinterpret(wrapper::vgetlow(wrapper::vmovl(vmid.val[2])))),
- }
- };
- const int32x4x3_t vlow_s32 =
- {
- {
- wrapper::vaddw(v_input_offset, wrapper::vreinterpret(wrapper::vgetlow(wrapper::vmovl(vlow.val[0])))),
- wrapper::vaddw(v_input_offset, wrapper::vreinterpret(wrapper::vgetlow(wrapper::vmovl(vlow.val[1])))),
- wrapper::vaddw(v_input_offset, wrapper::vreinterpret(wrapper::vgetlow(wrapper::vmovl(vlow.val[2])))),
- }
- };
-
- int32x4_t out = wrapper::vmul(vtop_s32.val[0], m0.val[0]);
- out = wrapper::vmla(out, vtop_s32.val[1], m0.val[1]);
- out = wrapper::vmla(out, vtop_s32.val[2], m0.val[2]);
-
- out = wrapper::vmla(out, vmid_s32.val[0], m1.val[0]);
- out = wrapper::vmla(out, vmid_s32.val[1], m1.val[1]);
- out = wrapper::vmla(out, vmid_s32.val[2], m1.val[2]);
-
- out = wrapper::vmla(out, vlow_s32.val[0], m2.val[0]);
- out = wrapper::vmla(out, vlow_s32.val[1], m2.val[1]);
- out = wrapper::vmla(out, vlow_s32.val[2], m2.val[2]);
-
- return out;
-}
-
-/** Perform a 3x3 convolution for 8 consecutive 8-bit elements when dilation.x() or dilation.y() is not 1.
- *
- * @param[in] in_top Pointer to the first row of the input.
- * @param[in] in_mid Pointer to the second row of the input.
- * @param[in] in_low Pointer to the third row of the input.
- * @param[in] m0 First row of the filter.
- * @param[in] m1 Second row of the filter.
- * @param[in] m2 Third row of the filter.
- * @param[in] dilation_x Dilation, in elements across x.
- * @param[in] stridex Stride value in elements across x.
- * @param[in] input_offset Input quantization offset.
- *
- */
-template < typename T, REQUIRES_TA(std::is_same<T, uint8_t>::value || std::is_same<T, int8_t>::value) >
-inline int32x4x2_t convolve_3x3_dilation(const T *in_top, const T *in_mid, const T *in_low, const int32x4x3_t &m0, const int32x4x3_t &m1, const int32x4x3_t &m2,
- const size_t dilation_x, unsigned int stridex, int input_offset)
-{
- ARM_COMPUTE_ERROR_ON(stridex > 3);
- int32x4x2_t out =
- {
- {
- single_convolve_3x3_dilation(in_top, in_mid, in_low, m0, m1, m2, dilation_x, input_offset),
- single_convolve_3x3_dilation(in_top + 4, in_mid + 4, in_low + 4, m0, m1, m2, dilation_x, input_offset)
- }
- };
-
- if(stridex == 2)
- {
- out.val[0] = wrapper::vsetlane(wrapper::vgetlane(out.val[0], 2), out.val[0], 1);
- out.val[0] = wrapper::vsetlane(wrapper::vgetlane(out.val[1], 0), out.val[0], 2);
- out.val[0] = wrapper::vsetlane(wrapper::vgetlane(out.val[1], 2), out.val[0], 3);
- }
- else if(stridex == 3)
- {
- out.val[0] = wrapper::vsetlane(wrapper::vgetlane(out.val[0], 3), out.val[0], 1);
- }
- return out;
-}
-
-/** Perform a convolve3x3 on 8-bit elements.
- *
- * @param[in] in_top Pointer to the first row of the input.
- * @param[in] in_mid Pointer to the second row of the input.
- * @param[in] in_low Pointer to the third row of the input.
- * @param[out] out_ptr Pointer to the output.
- * @param[in] m0 First row of the filter.
- * @param[in] m1 Second row of the filter.
- * @param[in] m2 Third row of the filter.
- * @param[in] stridex Stride value in elements across x.
- * @param[in] input_offset Input quantization offset.
- *
- */
-template < bool accumulate, typename T1, typename T2, REQUIRES_TA(std::is_same<T1, uint8_t>::value || std::is_same<T1, int8_t>::value) >
-void convolve_3x3(const T1 *in_top, const T1 *in_mid, const T1 *in_low, T2 *out_ptr,
- const int32x4x3_t &m0, const int32x4x3_t &m1, const int32x4x3_t &m2,
- unsigned int stridex, int32_t input_offset)
-{
- ARM_COMPUTE_ERROR_ON(stridex > 3);
- using VectorType = typename std::conditional<std::is_same<T1, uint8_t>::value, uint8x8x2_t, int8x8x2_t>::type;
- using OutputTagType = typename wrapper::traits::neon_bitvector_tag_t<int32_t, wrapper::traits::BitWidth::W128>;
-
- const int32x4_t v_input_offset = wrapper::vdup_n(input_offset, OutputTagType{});
-
- const VectorType vtop =
- {
- {
- wrapper::vload(in_top),
- wrapper::vload(in_top + 8)
- }
- };
- const VectorType vmid =
- {
- {
- wrapper::vload(in_mid),
- wrapper::vload(in_mid + 8)
- }
- };
- const VectorType vlow =
- {
- {
- wrapper::vload(in_low),
- wrapper::vload(in_low + 8)
- }
- };
-
- const int32x4x3_t vtop_s32 =
- {
- {
- wrapper::vaddw(v_input_offset, wrapper::vreinterpret(wrapper::vgetlow(wrapper::vmovl(vtop.val[0])))),
- wrapper::vaddw(v_input_offset, wrapper::vreinterpret(wrapper::vgethigh(wrapper::vmovl(vtop.val[0])))),
- wrapper::vaddw(v_input_offset, wrapper::vreinterpret(wrapper::vgetlow(wrapper::vmovl(vtop.val[1])))),
- }
- };
- const int32x4x3_t vmid_s32 =
- {
- {
- wrapper::vaddw(v_input_offset, wrapper::vreinterpret(wrapper::vgetlow(wrapper::vmovl(vmid.val[0])))),
- wrapper::vaddw(v_input_offset, wrapper::vreinterpret(wrapper::vgethigh(wrapper::vmovl(vmid.val[0])))),
- wrapper::vaddw(v_input_offset, wrapper::vreinterpret(wrapper::vgetlow(wrapper::vmovl(vmid.val[1])))),
- }
- };
- const int32x4x3_t vlow_s32 =
- {
- {
- wrapper::vaddw(v_input_offset, wrapper::vreinterpret(wrapper::vgetlow(wrapper::vmovl(vlow.val[0])))),
- wrapper::vaddw(v_input_offset, wrapper::vreinterpret(wrapper::vgethigh(wrapper::vmovl(vlow.val[0])))),
- wrapper::vaddw(v_input_offset, wrapper::vreinterpret(wrapper::vgetlow(wrapper::vmovl(vlow.val[1])))),
- }
- };
-
- int32x4x2_t out
- {
- {
- wrapper::vdup_n(static_cast<int32_t>(0), OutputTagType{}),
- wrapper::vdup_n(static_cast<int32_t>(0), OutputTagType{}),
- }
- };
-
- // 0
- out.val[0] = wrapper::vmla(out.val[0], vtop_s32.val[0], m0.val[0]);
- out.val[0] = wrapper::vmla(out.val[0], wrapper::vext_1(vtop_s32.val[0], vtop_s32.val[1]), m0.val[1]);
- out.val[0] = wrapper::vmla(out.val[0], wrapper::vext_2(vtop_s32.val[0], vtop_s32.val[1]), m0.val[2]);
-
- out.val[0] = wrapper::vmla(out.val[0], vmid_s32.val[0], m1.val[0]);
- out.val[0] = wrapper::vmla(out.val[0], wrapper::vext_1(vmid_s32.val[0], vmid_s32.val[1]), m1.val[1]);
- out.val[0] = wrapper::vmla(out.val[0], wrapper::vext_2(vmid_s32.val[0], vmid_s32.val[1]), m1.val[2]);
-
- out.val[0] = wrapper::vmla(out.val[0], vlow_s32.val[0], m2.val[0]);
- out.val[0] = wrapper::vmla(out.val[0], wrapper::vext_1(vlow_s32.val[0], vlow_s32.val[1]), m2.val[1]);
- out.val[0] = wrapper::vmla(out.val[0], wrapper::vext_2(vlow_s32.val[0], vlow_s32.val[1]), m2.val[2]);
-
- // 1
- out.val[1] = wrapper::vmla(out.val[1], vtop_s32.val[1], m0.val[0]);
- out.val[1] = wrapper::vmla(out.val[1], wrapper::vext_1(vtop_s32.val[1], vtop_s32.val[2]), m0.val[1]);
- out.val[1] = wrapper::vmla(out.val[1], wrapper::vext_2(vtop_s32.val[1], vtop_s32.val[2]), m0.val[2]);
-
- out.val[1] = wrapper::vmla(out.val[1], vmid_s32.val[1], m1.val[0]);
- out.val[1] = wrapper::vmla(out.val[1], wrapper::vext_1(vmid_s32.val[1], vmid_s32.val[2]), m1.val[1]);
- out.val[1] = wrapper::vmla(out.val[1], wrapper::vext_2(vmid_s32.val[1], vmid_s32.val[2]), m1.val[2]);
-
- out.val[1] = wrapper::vmla(out.val[1], vlow_s32.val[1], m2.val[0]);
- out.val[1] = wrapper::vmla(out.val[1], wrapper::vext_1(vlow_s32.val[1], vlow_s32.val[2]), m2.val[1]);
- out.val[1] = wrapper::vmla(out.val[1], wrapper::vext_2(vlow_s32.val[1], vlow_s32.val[2]), m2.val[2]);
-
- if(stridex == 1)
- {
- accumulate ? accumulate_results<1>(out_ptr, out) : store_results<1>(out_ptr, out);
- }
- else if(stridex == 2)
- {
- out.val[0] = wrapper::vsetlane(wrapper::vgetlane(out.val[0], 2), out.val[0], 1);
- out.val[0] = wrapper::vsetlane(wrapper::vgetlane(out.val[1], 0), out.val[0], 2);
- out.val[0] = wrapper::vsetlane(wrapper::vgetlane(out.val[1], 2), out.val[0], 3);
-
- accumulate ? accumulate_results<2>(out_ptr, out) : store_results<2>(out_ptr, out);
- }
- else if(stridex == 3)
- {
- out.val[0] = wrapper::vsetlane(wrapper::vgetlane(out.val[0], 3), out.val[0], 1);
- accumulate ? accumulate_results<3>(out_ptr, out) : store_results<3>(out_ptr, out);
- }
-}
-
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-/** Loads a 3x3 matrix as a row (float16_t).
- *
- * @param[in] ptr            Pointer to a float16_t 3x3 matrix.
- * @param[in] weights_offset (Optional) Weights quantization offset.
- *
- * @return The loaded matrix.
- */
-inline float16x8x3_t load_matrix_row(const float16_t *ptr, int weights_offset = 0)
-{
- ARM_COMPUTE_UNUSED(weights_offset);
- /* ptr is a pointer to a row in a 3x3 matrix, the function returns 3 vectors holding exactly the same value in all lanes:
- r.val[0] contains the first element, r.val[1] the second element and r.val[2] the third element (in all lanes) */
- const float16x8x3_t r =
- {
- {
- vld1q_dup_f16(ptr),
- vld1q_dup_f16(1 + ptr),
- vld1q_dup_f16(2 + ptr)
- }
- };
- return r;
-}
-
-/** Perform a 3x3 convolution for 8 consecutive elements on float16 when dilation.x() or dilation.y() is not 1.
- *
- * @param[in] in_top Pointer to the first row of the input.
- * @param[in] in_mid Pointer to the second row of the input.
- * @param[in] in_low Pointer to the third row of the input.
- * @param[in] m0 First row of the filter.
- * @param[in] m1 Second row of the filter.
- * @param[in] m2 Third row of the filter.
- * @param[in] dilation_x Dilation, in elements across x.
- * @param[in] input_offset (Optional) Input quantization offset.
- *
- * @return The convolved 8 elements as a float16x8_t.
- */
-inline float16x8_t single_convolve_3x3_dilation(const float16_t *in_top, const float16_t *in_mid, const float16_t *in_low,
- const float16x8x3_t &m0, const float16x8x3_t &m1, const float16x8x3_t &m2,
- const size_t dilation_x, int input_offset = 0)
-{
- ARM_COMPUTE_UNUSED(input_offset);
- const float16x8x3_t vtop =
- {
- {
- vld1q_f16(in_top),
- vld1q_f16(in_top + dilation_x),
- vld1q_f16(in_top + 2 * dilation_x)
- }
- };
- const float16x8x3_t vmid =
- {
- {
- vld1q_f16(in_mid),
- vld1q_f16(in_mid + dilation_x),
- vld1q_f16(in_mid + 2 * dilation_x)
- }
- };
- const float16x8x3_t vlow =
- {
- {
- vld1q_f16(in_low),
- vld1q_f16(in_low + dilation_x),
- vld1q_f16(in_low + 2 * dilation_x)
- }
- };
- float16x8_t out = vmulq_f16(vtop.val[0], m0.val[0]);
- out = vaddq_f16(out, vmulq_f16(vtop.val[1], m0.val[1]));
- out = vaddq_f16(out, vmulq_f16(vtop.val[2], m0.val[2]));
-
- out = vaddq_f16(out, vmulq_f16(vmid.val[0], m1.val[0]));
- out = vaddq_f16(out, vmulq_f16(vmid.val[1], m1.val[1]));
- out = vaddq_f16(out, vmulq_f16(vmid.val[2], m1.val[2]));
-
- out = vaddq_f16(out, vmulq_f16(vlow.val[0], m2.val[0]));
- out = vaddq_f16(out, vmulq_f16(vlow.val[1], m2.val[1]));
- out = vaddq_f16(out, vmulq_f16(vlow.val[2], m2.val[2]));
-
- return out;
-}
-
-/** Perform a 3x3 convolution for 16 consecutive elements on float16 when dilation.x() or dilation.y() is not 1.
- *
- * @param[in] in_top Pointer to the first row of the input.
- * @param[in] in_mid Pointer to the second row of the input.
- * @param[in] in_low Pointer to the third row of the input.
- * @param[in] m0 First row of the filter.
- * @param[in] m1 Second row of the filter.
- * @param[in] m2 Third row of the filter.
- * @param[in] dilation_x Dilation, in elements across x.
- * @param[in] stridex Stride value in elements across x.
- * @param[in] input_offset (Optional) Input quantization offset.
- *
- * @return The convolved 16 elements as a float16x8x2_t.
- */
-inline float16x8x2_t convolve_3x3_dilation(const float16_t *in_top, const float16_t *in_mid, const float16_t *in_low,
- const float16x8x3_t &m0, const float16x8x3_t &m1, const float16x8x3_t &m2,
- const size_t dilation_x, unsigned int stridex, int input_offset = 0)
-{
- float16x8x2_t out =
- {
- {
- single_convolve_3x3_dilation(in_top, in_mid, in_low, m0, m1, m2, dilation_x, input_offset),
- single_convolve_3x3_dilation(in_top + 8, in_mid + 8, in_low + 8, m0, m1, m2, dilation_x, input_offset)
- }
- };
-
- if(stridex == 2)
- {
- out.val[0] = vsetq_lane_f16(vgetq_lane_f16(out.val[0], 2), out.val[0], 1);
- out.val[0] = vsetq_lane_f16(vgetq_lane_f16(out.val[0], 4), out.val[0], 2);
- out.val[0] = vsetq_lane_f16(vgetq_lane_f16(out.val[0], 6), out.val[0], 3);
- out.val[0] = vsetq_lane_f16(vgetq_lane_f16(out.val[1], 0), out.val[0], 4);
- out.val[0] = vsetq_lane_f16(vgetq_lane_f16(out.val[1], 2), out.val[0], 5);
- out.val[0] = vsetq_lane_f16(vgetq_lane_f16(out.val[1], 4), out.val[0], 6);
- out.val[0] = vsetq_lane_f16(vgetq_lane_f16(out.val[1], 6), out.val[0], 7);
- }
- else if(stridex == 3)
- {
- out.val[0] = vsetq_lane_f16(vgetq_lane_f16(out.val[0], 3), out.val[0], 1);
- out.val[0] = vsetq_lane_f16(vgetq_lane_f16(out.val[0], 6), out.val[0], 2);
- out.val[0] = vsetq_lane_f16(vgetq_lane_f16(out.val[1], 1), out.val[0], 3);
- }
-
- return out;
-}
-
-/** Perform a 3x3 convolution on float16.
- *
- * @param[in] in_top Pointer to the first row of the input.
- * @param[in] in_mid Pointer to the second row of the input.
- * @param[in] in_low Pointer to the third row of the input.
- * @param[out] out_ptr Pointer to the output.
- * @param[in] m0 First row of the filter.
- * @param[in] m1 Second row of the filter.
- * @param[in] m2 Third row of the filter.
- * @param[in] stridex Stride value in elements across x.
- * @param[in] input_offset (Optional) Input quantization offset.
- *
- */
-template <bool accumulate>
-inline void convolve_3x3(const float16_t *in_top, const float16_t *in_mid, const float16_t *in_low, float16_t *out_ptr,
- const float16x8x3_t &m0, const float16x8x3_t &m1, const float16x8x3_t &m2,
- unsigned int stridex, int input_offset = 0)
-{
- ARM_COMPUTE_UNUSED(input_offset);
-
- float16x8x2_t out =
- {
- {
- vdupq_n_f16(0),
- vdupq_n_f16(0)
- }
- };
- if(stridex == 2)
- {
- const float16x8x2_t vtop = vld2q_f16(in_top);
- const float16x8x2_t vmid = vld2q_f16(in_mid);
- const float16x8x2_t vlow = vld2q_f16(in_low);
- const float16x8_t vtop_end = vld1q_f16(in_top + 16);
- const float16x8_t vmid_end = vld1q_f16(in_mid + 16);
- const float16x8_t vlow_end = vld1q_f16(in_low + 16);
-
- out.val[0] = vmulq_f16(vtop.val[0], m0.val[0]);
-
- out.val[0] = vaddq_f16(out.val[0], vmulq_f16(vtop.val[1], m0.val[1]));
- out.val[0] = vaddq_f16(out.val[0], vmulq_f16(vextq_f16(vtop.val[0], vtop_end, 1), m0.val[2]));
-
- out.val[0] = vaddq_f16(out.val[0], vmulq_f16(vmid.val[0], m1.val[0]));
- out.val[0] = vaddq_f16(out.val[0], vmulq_f16(vmid.val[1], m1.val[1]));
- out.val[0] = vaddq_f16(out.val[0], vmulq_f16(vextq_f16(vmid.val[0], vmid_end, 1), m1.val[2]));
-
- out.val[0] = vaddq_f16(out.val[0], vmulq_f16(vlow.val[0], m2.val[0]));
- out.val[0] = vaddq_f16(out.val[0], vmulq_f16(vlow.val[1], m2.val[1]));
- out.val[0] = vaddq_f16(out.val[0], vmulq_f16(vextq_f16(vlow.val[0], vlow_end, 1), m2.val[2]));
-
- accumulate ? accumulate_results<2>(out_ptr, out) : store_results<2>(out_ptr, out);
- }
- else
- {
- const float16x8x3_t vtop =
- {
- {
- vld1q_f16(in_top),
- vld1q_f16(in_top + 8),
- vld1q_f16(in_top + 16)
- }
- };
- const float16x8x3_t vmid =
- {
- {
- vld1q_f16(in_mid),
- vld1q_f16(in_mid + 8),
- vld1q_f16(in_mid + 16)
- }
- };
- const float16x8x3_t vlow =
- {
- {
- vld1q_f16(in_low),
- vld1q_f16(in_low + 8),
- vld1q_f16(in_low + 16)
- }
- };
- out.val[0] = vmulq_f16(vtop.val[0], m0.val[0]);
- out.val[1] = vmulq_f16(vtop.val[1], m0.val[0]);
-
- out.val[0] = vaddq_f16(out.val[0], vmulq_f16(vextq_f16(vtop.val[0], vtop.val[1], 1), m0.val[1]));
- out.val[0] = vaddq_f16(out.val[0], vmulq_f16(vextq_f16(vtop.val[0], vtop.val[1], 2), m0.val[2]));
- out.val[0] = vaddq_f16(out.val[0], vmulq_f16(vmid.val[0], m1.val[0]));
- out.val[0] = vaddq_f16(out.val[0], vmulq_f16(vextq_f16(vmid.val[0], vmid.val[1], 1), m1.val[1]));
- out.val[0] = vaddq_f16(out.val[0], vmulq_f16(vextq_f16(vmid.val[0], vmid.val[1], 2), m1.val[2]));
- out.val[0] = vaddq_f16(out.val[0], vmulq_f16(vlow.val[0], m2.val[0]));
- out.val[0] = vaddq_f16(out.val[0], vmulq_f16(vextq_f16(vlow.val[0], vlow.val[1], 1), m2.val[1]));
- out.val[0] = vaddq_f16(out.val[0], vmulq_f16(vextq_f16(vlow.val[0], vlow.val[1], 2), m2.val[2]));
- out.val[1] = vaddq_f16(out.val[1], vmulq_f16(vextq_f16(vtop.val[1], vtop.val[2], 1), m0.val[1]));
- out.val[1] = vaddq_f16(out.val[1], vmulq_f16(vextq_f16(vtop.val[1], vtop.val[2], 2), m0.val[2]));
- out.val[1] = vaddq_f16(out.val[1], vmulq_f16(vmid.val[1], m1.val[0]));
- out.val[1] = vaddq_f16(out.val[1], vmulq_f16(vextq_f16(vmid.val[1], vmid.val[2], 1), m1.val[1]));
- out.val[1] = vaddq_f16(out.val[1], vmulq_f16(vextq_f16(vmid.val[1], vmid.val[2], 2), m1.val[2]));
- out.val[1] = vaddq_f16(out.val[1], vmulq_f16(vlow.val[1], m2.val[0]));
- out.val[1] = vaddq_f16(out.val[1], vmulq_f16(vextq_f16(vlow.val[1], vlow.val[2], 1), m2.val[1]));
- out.val[1] = vaddq_f16(out.val[1], vmulq_f16(vextq_f16(vlow.val[1], vlow.val[2], 2), m2.val[2]));
-
- if(stridex == 3)
- {
- out.val[0] = vsetq_lane_f16(vgetq_lane_f16(out.val[0], 3), out.val[0], 1);
- out.val[0] = vsetq_lane_f16(vgetq_lane_f16(out.val[0], 6), out.val[0], 2);
- out.val[0] = vsetq_lane_f16(vgetq_lane_f16(out.val[1], 1), out.val[0], 3);
-
- accumulate ? accumulate_results<3>(out_ptr, out) : store_results<3>(out_ptr, out);
- }
- else
- {
- accumulate ? accumulate_results<1>(out_ptr, out) : store_results<1>(out_ptr, out);
- }
- }
-}
-#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
-
-/** Get the number of input elements processed per iteration of a 3x3 convolution.
- *
- * @param[in] num_elems_written_per_iteration Number of output elements written per iteration of the 3x3 convolution.
- * @param[in] stridex Stride value in elements across x.
- *
- * @return The number of input elements processed per iteration.
- */
-inline int get_input_num_elems_processed(unsigned int num_elems_written_per_iteration, unsigned int stridex)
-{
- switch(stridex)
- {
- case 1:
- return num_elems_written_per_iteration;
- case 2:
- return num_elems_written_per_iteration * 2;
- case 3:
- return num_elems_written_per_iteration * 3;
- default:
- ARM_COMPUTE_ERROR("stridex not supported");
- return 0;
- }
-}
-} // namespace detail
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_NEDIRECTCONVOLUTIONDETAIL_H */
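For orientation, a minimal caller sketch for the detail functions deleted above (not from the library sources; out_width, the row pointers and the m0..m2 filter rows are assumed to come from the kernel's window setup, and the iteration width is illustrative):

    const unsigned int stridex                         = 2;
    const unsigned int num_elems_written_per_iteration = 8;
    const int          num_elems_read                  = get_input_num_elems_processed(num_elems_written_per_iteration, stridex); // 16 for stride 2
    for(unsigned int x = 0; x < out_width; x += num_elems_written_per_iteration)
    {
        convolve_3x3<false>(in_top, in_mid, in_low, out_ptr, m0, m1, m2, stridex);
        in_top  += num_elems_read;
        in_mid  += num_elems_read;
        in_low  += num_elems_read;
        out_ptr += num_elems_written_per_iteration;
    }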
diff --git a/arm_compute/core/NEON/wrapper/intrinsics/abs.h b/arm_compute/core/NEON/wrapper/intrinsics/abs.h
deleted file mode 100644
index aff18166f5..0000000000
--- a/arm_compute/core/NEON/wrapper/intrinsics/abs.h
+++ /dev/null
@@ -1,75 +0,0 @@
-/*
- * Copyright (c) 2018-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_WRAPPER_ABS_H
-#define ARM_COMPUTE_WRAPPER_ABS_H
-
-#include <arm_neon.h>
-
-namespace arm_compute
-{
-namespace wrapper
-{
-#define VABS_IMPL(stype, vtype, prefix, postfix) \
- inline vtype vabs(const vtype &a) \
- { \
- return prefix##_##postfix(a); \
- }
-
-#define VQABS_IMPL(stype, vtype, prefix, postfix) \
- inline vtype vqabs(const vtype &a) \
- { \
- return prefix##_##postfix(a); \
- }
-
-// Absolute: vabs{q}_<type>. Vd[i] = |Va[i]|
-VABS_IMPL(int8x8_t, int8x8_t, vabs, s8)
-VABS_IMPL(int16x4_t, int16x4_t, vabs, s16)
-VABS_IMPL(int32x2_t, int32x2_t, vabs, s32)
-VABS_IMPL(float32x2_t, float32x2_t, vabs, f32)
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-VABS_IMPL(float16x4_t, float16x4_t, vabs, f16)
-#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-
-VABS_IMPL(int8x16_t, int8x16_t, vabsq, s8)
-VABS_IMPL(int16x8_t, int16x8_t, vabsq, s16)
-VABS_IMPL(int32x4_t, int32x4_t, vabsq, s32)
-VABS_IMPL(float32x4_t, float32x4_t, vabsq, f32)
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-VABS_IMPL(float16x8_t, float16x8_t, vabsq, f16)
-#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-
-// Saturating absolute: vqabs{q}_<type>. Vd[i] = sat(|Va[i]|)
-VQABS_IMPL(int8x8_t, int8x8_t, vqabs, s8)
-VQABS_IMPL(int16x4_t, int16x4_t, vqabs, s16)
-VQABS_IMPL(int32x2_t, int32x2_t, vqabs, s32)
-
-VQABS_IMPL(int8x16_t, int8x16_t, vqabsq, s8)
-VQABS_IMPL(int16x8_t, int16x8_t, vqabsq, s16)
-VQABS_IMPL(int32x4_t, int32x4_t, vqabsq, s32)
-
-#undef VABS_IMPL
-#undef VQABS_IMPL
-} // namespace wrapper
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_WRAPPER_ABS_H */
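The header deleted above relies on C++ overloading so that a single wrapper name expands to the right NEON intrinsic for the argument type. A small sketch of what the generated overloads allow (types chosen arbitrarily):

    #include <arm_neon.h>
    using namespace arm_compute::wrapper;

    int16x8_t   vq = vdupq_n_s16(-5);
    float32x2_t vf = vdup_n_f32(-1.5f);
    int16x8_t   a1 = vabs(vq);  // expands to vabsq_s16
    float32x2_t a2 = vabs(vf);  // expands to vabs_f32
    int16x8_t   a3 = vqabs(vq); // saturating: INT16_MIN maps to INT16_MAX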
diff --git a/arm_compute/core/NEON/wrapper/intrinsics/add.h b/arm_compute/core/NEON/wrapper/intrinsics/add.h
deleted file mode 100644
index 776e136a56..0000000000
--- a/arm_compute/core/NEON/wrapper/intrinsics/add.h
+++ /dev/null
@@ -1,201 +0,0 @@
-/*
- * Copyright (c) 2018-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_WRAPPER_ADD_H
-#define ARM_COMPUTE_WRAPPER_ADD_H
-
-#include <arm_neon.h>
-
-namespace arm_compute
-{
-namespace wrapper
-{
-#define VADD_IMPL(stype, vtype, prefix, postfix) \
- inline vtype vadd(const vtype &a, const vtype &b) \
- { \
- return prefix##_##postfix(a, b); \
- }
-
-VADD_IMPL(uint8x8_t, uint8x8_t, vadd, u8)
-VADD_IMPL(int8x8_t, int8x8_t, vadd, s8)
-VADD_IMPL(uint16x4_t, uint16x4_t, vadd, u16)
-VADD_IMPL(int16x4_t, int16x4_t, vadd, s16)
-VADD_IMPL(uint32x2_t, uint32x2_t, vadd, u32)
-VADD_IMPL(int32x2_t, int32x2_t, vadd, s32)
-VADD_IMPL(uint64x1_t, uint64x1_t, vadd, u64)
-VADD_IMPL(int64x1_t, int64x1_t, vadd, s64)
-VADD_IMPL(float32x2_t, float32x2_t, vadd, f32)
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-VADD_IMPL(float16x4_t, float16x4_t, vadd, f16)
-#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-
-VADD_IMPL(uint8x16_t, uint8x16_t, vaddq, u8)
-VADD_IMPL(int8x16_t, int8x16_t, vaddq, s8)
-VADD_IMPL(uint16x8_t, uint16x8_t, vaddq, u16)
-VADD_IMPL(int16x8_t, int16x8_t, vaddq, s16)
-VADD_IMPL(uint32x4_t, uint32x4_t, vaddq, u32)
-VADD_IMPL(int32x4_t, int32x4_t, vaddq, s32)
-VADD_IMPL(uint64x2_t, uint64x2_t, vaddq, u64)
-VADD_IMPL(int64x2_t, int64x2_t, vaddq, s64)
-VADD_IMPL(float32x4_t, float32x4_t, vaddq, f32)
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-VADD_IMPL(float16x8_t, float16x8_t, vaddq, f16)
-#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-#undef VADD_IMPL
-
-// VQADD: Vector saturating add (No notion of saturation for floating point)
-#define VQADD_IMPL(stype, vtype, prefix, postfix) \
- inline vtype vqadd(const vtype &a, const vtype &b) \
- { \
- return prefix##_##postfix(a, b); \
- }
-
-VQADD_IMPL(uint8x8_t, uint8x8_t, vqadd, u8)
-VQADD_IMPL(int8x8_t, int8x8_t, vqadd, s8)
-VQADD_IMPL(uint16x4_t, uint16x4_t, vqadd, u16)
-VQADD_IMPL(int16x4_t, int16x4_t, vqadd, s16)
-VQADD_IMPL(uint32x2_t, uint32x2_t, vqadd, u32)
-VQADD_IMPL(int32x2_t, int32x2_t, vqadd, s32)
-VQADD_IMPL(uint64x1_t, uint64x1_t, vqadd, u64)
-VQADD_IMPL(int64x1_t, int64x1_t, vqadd, s64)
-VQADD_IMPL(float32x2_t, float32x2_t, vadd, f32)
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-VQADD_IMPL(float16x4_t, float16x4_t, vadd, f16)
-#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-
-VQADD_IMPL(uint8x16_t, uint8x16_t, vqaddq, u8)
-VQADD_IMPL(int8x16_t, int8x16_t, vqaddq, s8)
-VQADD_IMPL(uint16x8_t, uint16x8_t, vqaddq, u16)
-VQADD_IMPL(int16x8_t, int16x8_t, vqaddq, s16)
-VQADD_IMPL(uint32x4_t, uint32x4_t, vqaddq, u32)
-VQADD_IMPL(int32x4_t, int32x4_t, vqaddq, s32)
-VQADD_IMPL(uint64x2_t, uint64x2_t, vqaddq, u64)
-VQADD_IMPL(int64x2_t, int64x2_t, vqaddq, s64)
-VQADD_IMPL(float32x4_t, float32x4_t, vaddq, f32)
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-VQADD_IMPL(float16x8_t, float16x8_t, vaddq, f16)
-#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-#undef VQADD_IMPL
-
-// VADDW: Vector widening add
-#define VADDW_IMPL(wtype, vtype, prefix, postfix) \
- inline wtype vaddw(const wtype &a, const vtype &b) \
- { \
- return prefix##_##postfix(a, b); \
- }
-
-VADDW_IMPL(uint16x8_t, uint8x8_t, vaddw, u8)
-VADDW_IMPL(int16x8_t, int8x8_t, vaddw, s8)
-VADDW_IMPL(uint32x4_t, uint16x4_t, vaddw, u16)
-VADDW_IMPL(int32x4_t, int16x4_t, vaddw, s16)
-VADDW_IMPL(uint64x2_t, uint32x2_t, vaddw, u32)
-VADDW_IMPL(int64x2_t, int32x2_t, vaddw, s32)
-#undef VADDW_IMPL
-
-// VADDL: Vector long add
-#define VADDL_IMPL(wtype, vtype, prefix, postfix) \
- inline wtype vaddl(const vtype &a, const vtype &b) \
- { \
- return prefix##_##postfix(a, b); \
- }
-
-VADDL_IMPL(uint16x8_t, uint8x8_t, vaddl, u8)
-VADDL_IMPL(int16x8_t, int8x8_t, vaddl, s8)
-VADDL_IMPL(uint32x4_t, uint16x4_t, vaddl, u16)
-VADDL_IMPL(int32x4_t, int16x4_t, vaddl, s16)
-VADDL_IMPL(uint64x2_t, uint32x2_t, vaddl, u32)
-VADDL_IMPL(int64x2_t, int32x2_t, vaddl, s32)
-#undef VADDL_IMPL
-
-#if defined(__aarch64__)
-// VADDV: Across vector add
-#define VADDV_IMPL(stype, vtype, prefix, postfix) \
- inline stype vaddv(const vtype &a) \
- { \
- return prefix##_##postfix(a); \
- }
-
-VADDV_IMPL(uint8_t, uint8x8_t, vaddv, u8)
-VADDV_IMPL(int8_t, int8x8_t, vaddv, s8)
-VADDV_IMPL(uint16_t, uint16x4_t, vaddv, u16)
-VADDV_IMPL(int16_t, int16x4_t, vaddv, s16)
-VADDV_IMPL(uint32_t, uint32x2_t, vaddv, u32)
-VADDV_IMPL(int32_t, int32x2_t, vaddv, s32)
-VADDV_IMPL(float, float32x2_t, vaddv, f32)
-
-VADDV_IMPL(uint8_t, uint8x16_t, vaddvq, u8)
-VADDV_IMPL(int8_t, int8x16_t, vaddvq, s8)
-VADDV_IMPL(uint16_t, uint16x8_t, vaddvq, u16)
-VADDV_IMPL(int16_t, int16x8_t, vaddvq, s16)
-VADDV_IMPL(uint32_t, uint32x4_t, vaddvq, u32)
-VADDV_IMPL(int32_t, int32x4_t, vaddvq, s32)
-VADDV_IMPL(uint64_t, uint64x2_t, vaddvq, u64)
-VADDV_IMPL(int64_t, int64x2_t, vaddvq, s64)
-VADDV_IMPL(float, float32x4_t, vaddvq, f32)
-#undef VADDV_IMPL
-#endif // defined(__aarch64__)
-
-// VPADDL: Signed add long pairwise
-#define VPADDL_IMPL(ltype, vtype, prefix, postfix) \
- inline ltype vpaddl(const vtype &a) \
- { \
- return prefix##_##postfix(a); \
- }
-
-VPADDL_IMPL(uint16x4_t, uint8x8_t, vpaddl, u8)
-VPADDL_IMPL(int16x4_t, int8x8_t, vpaddl, s8)
-VPADDL_IMPL(uint32x2_t, uint16x4_t, vpaddl, u16)
-VPADDL_IMPL(int32x2_t, int16x4_t, vpaddl, s16)
-VPADDL_IMPL(uint64x1_t, uint32x2_t, vpaddl, u32)
-VPADDL_IMPL(int64x1_t, int32x2_t, vpaddl, s32)
-
-VPADDL_IMPL(uint16x8_t, uint8x16_t, vpaddlq, u8)
-VPADDL_IMPL(int16x8_t, int8x16_t, vpaddlq, s8)
-VPADDL_IMPL(uint32x4_t, uint16x8_t, vpaddlq, u16)
-VPADDL_IMPL(int32x4_t, int16x8_t, vpaddlq, s16)
-VPADDL_IMPL(uint64x2_t, uint32x4_t, vpaddlq, u32)
-VPADDL_IMPL(int64x2_t, int32x4_t, vpaddlq, s32)
-#undef VPADDL_IMPL
-
-// VPADD: Add pairwise
-#define VPADD_IMPL(stype, vtype, prefix, postfix) \
- inline vtype vpadd(const vtype &a, const vtype &b) \
- { \
- return prefix##_##postfix(a, b); \
- }
-
-VPADD_IMPL(uint8x8_t, uint8x8_t, vpadd, u8)
-VPADD_IMPL(int8x8_t, int8x8_t, vpadd, s8)
-VPADD_IMPL(uint16x4_t, uint16x4_t, vpadd, u16)
-VPADD_IMPL(int16x4_t, int16x4_t, vpadd, s16)
-VPADD_IMPL(uint32x2_t, uint32x2_t, vpadd, u32)
-VPADD_IMPL(int32x2_t, int32x2_t, vpadd, s32)
-VPADD_IMPL(float32x2_t, float32x2_t, vpadd, f32)
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-VPADD_IMPL(float16x4_t, float16x4_t, vpadd, f16)
-#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-
-#undef VPADD_IMPL
-} // namespace wrapper
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_WRAPPER_ADD_H */
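vaddw/vaddl are the widening variants that the quantized convolution code earlier in this diff uses to add the input offset while promoting 8-bit data. An illustrative sketch (values arbitrary):

    uint8x8_t  pixels = vdup_n_u8(200);
    uint16x8_t offset = arm_compute::wrapper::vaddw(vdupq_n_u16(12), pixels); // vaddw_u8: widen u8, add to u16
    uint16x8_t sum    = arm_compute::wrapper::vaddl(pixels, pixels);          // vaddl_u8: 400 fits in u16, no overflow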
diff --git a/arm_compute/core/NEON/wrapper/intrinsics/and.h b/arm_compute/core/NEON/wrapper/intrinsics/and.h
deleted file mode 100644
index 1973c5593d..0000000000
--- a/arm_compute/core/NEON/wrapper/intrinsics/and.h
+++ /dev/null
@@ -1,60 +0,0 @@
-/*
- * Copyright (c) 2018-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_WRAPPER_AND_H
-#define ARM_COMPUTE_WRAPPER_AND_H
-
-#include <arm_neon.h>
-
-namespace arm_compute
-{
-namespace wrapper
-{
-#define VAND_IMPL(stype, vtype, prefix, postfix) \
- inline vtype vand(const vtype &a, const vtype &b) \
- { \
- return prefix##_##postfix(a, b); \
- }
-
-VAND_IMPL(uint8_t, uint8x8_t, vand, u8)
-VAND_IMPL(int8_t, int8x8_t, vand, s8)
-VAND_IMPL(uint16_t, uint16x4_t, vand, u16)
-VAND_IMPL(int16_t, int16x4_t, vand, s16)
-VAND_IMPL(uint32_t, uint32x2_t, vand, u32)
-VAND_IMPL(int32_t, int32x2_t, vand, s32)
-VAND_IMPL(uint64_t, uint64x1_t, vand, u64)
-VAND_IMPL(int64_t, int64x1_t, vand, s64)
-
-VAND_IMPL(uint8_t, uint8x16_t, vandq, u8)
-VAND_IMPL(int8_t, int8x16_t, vandq, s8)
-VAND_IMPL(uint16_t, uint16x8_t, vandq, u16)
-VAND_IMPL(int16_t, int16x8_t, vandq, s16)
-VAND_IMPL(uint32_t, uint32x4_t, vandq, u32)
-VAND_IMPL(int32_t, int32x4_t, vandq, s32)
-VAND_IMPL(uint64_t, uint64x2_t, vandq, u64)
-VAND_IMPL(int64_t, int64x2_t, vandq, s64)
-
-#undef VAND_IMPL
-} // namespace wrapper
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_WRAPPER_AND_H */
diff --git a/arm_compute/core/NEON/wrapper/intrinsics/bsl.h b/arm_compute/core/NEON/wrapper/intrinsics/bsl.h
deleted file mode 100644
index 3c26a9c786..0000000000
--- a/arm_compute/core/NEON/wrapper/intrinsics/bsl.h
+++ /dev/null
@@ -1,64 +0,0 @@
-/*
- * Copyright (c) 2018-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_WRAPPER_BSL_H
-#define ARM_COMPUTE_WRAPPER_BSL_H
-
-#include <arm_neon.h>
-
-namespace arm_compute
-{
-namespace wrapper
-{
-#define VBSL_IMPL(stype, vtype, ctype, prefix, postfix) \
- inline vtype vbsl(const ctype &a, const vtype &b, const vtype &c) \
- { \
- return prefix##_##postfix(a, b, c); \
- }
-
-VBSL_IMPL(uint8_t, uint8x8_t, uint8x8_t, vbsl, u8)
-VBSL_IMPL(int8_t, int8x8_t, uint8x8_t, vbsl, s8)
-VBSL_IMPL(uint16_t, uint16x4_t, uint16x4_t, vbsl, u16)
-VBSL_IMPL(int16_t, int16x4_t, uint16x4_t, vbsl, s16)
-VBSL_IMPL(uint32_t, uint32x2_t, uint32x2_t, vbsl, u32)
-VBSL_IMPL(int32_t, int32x2_t, uint32x2_t, vbsl, s32)
-VBSL_IMPL(float32x2_t, float32x2_t, uint32x2_t, vbsl, f32)
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-VBSL_IMPL(float16x4_t, float16x4_t, uint16x4_t, vbsl, f16)
-#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-
-VBSL_IMPL(uint8_t, uint8x16_t, uint8x16_t, vbslq, u8)
-VBSL_IMPL(int8_t, int8x16_t, uint8x16_t, vbslq, s8)
-VBSL_IMPL(uint16_t, uint16x8_t, uint16x8_t, vbslq, u16)
-VBSL_IMPL(int16_t, int16x8_t, uint16x8_t, vbslq, s16)
-VBSL_IMPL(uint32_t, uint32x4_t, uint32x4_t, vbslq, u32)
-VBSL_IMPL(int32_t, int32x4_t, uint32x4_t, vbslq, s32)
-VBSL_IMPL(float32x4_t, float32x4_t, uint32x4_t, vbslq, f32)
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-VBSL_IMPL(float16x8_t, float16x8_t, uint16x8_t, vbslq, f16)
-#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-
-#undef VBSL_IMPL
-} // namespace wrapper
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_WRAPPER_BSL_H */
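vbsl is a branchless per-lane select: wherever the mask bits are set the first value is taken, elsewhere the second. A minimal sketch computing a per-lane max(a, b), with the mask built from the plain NEON compare intrinsic:

    float32x4_t a    = vdupq_n_f32(1.0f);
    float32x4_t b    = vdupq_n_f32(2.0f);
    uint32x4_t  mask = vcgtq_f32(a, b);                        // all-ones lanes where a > b
    float32x4_t mx   = arm_compute::wrapper::vbsl(mask, a, b); // expands to vbslq_f32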
diff --git a/arm_compute/core/NEON/wrapper/intrinsics/ceq.h b/arm_compute/core/NEON/wrapper/intrinsics/ceq.h
deleted file mode 100644
index f8a8f91f73..0000000000
--- a/arm_compute/core/NEON/wrapper/intrinsics/ceq.h
+++ /dev/null
@@ -1,64 +0,0 @@
-/*
- * Copyright (c) 2018-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_WRAPPER_CEQ_H
-#define ARM_COMPUTE_WRAPPER_CEQ_H
-
-#include <arm_neon.h>
-
-namespace arm_compute
-{
-namespace wrapper
-{
-#define VCEQ_IMPL(votype, vtype, prefix, postfix) \
- inline votype vceq(const vtype &a, const vtype &b) \
- { \
- return prefix##_##postfix(a, b); \
- }
-
-VCEQ_IMPL(uint8x8_t, uint8x8_t, vceq, u8)
-VCEQ_IMPL(uint8x8_t, int8x8_t, vceq, s8)
-VCEQ_IMPL(uint16x4_t, uint16x4_t, vceq, u16)
-VCEQ_IMPL(uint16x4_t, int16x4_t, vceq, s16)
-VCEQ_IMPL(uint32x2_t, uint32x2_t, vceq, u32)
-VCEQ_IMPL(uint32x2_t, int32x2_t, vceq, s32)
-VCEQ_IMPL(uint32x2_t, float32x2_t, vceq, f32)
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-VCEQ_IMPL(uint16x4_t, float16x4_t, vceq, f16)
-#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-
-VCEQ_IMPL(uint8x16_t, uint8x16_t, vceqq, u8)
-VCEQ_IMPL(uint8x16_t, int8x16_t, vceqq, s8)
-VCEQ_IMPL(uint16x8_t, uint16x8_t, vceqq, u16)
-VCEQ_IMPL(uint16x8_t, int16x8_t, vceqq, s16)
-VCEQ_IMPL(uint32x4_t, uint32x4_t, vceqq, u32)
-VCEQ_IMPL(uint32x4_t, int32x4_t, vceqq, s32)
-VCEQ_IMPL(uint32x4_t, float32x4_t, vceqq, f32)
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-VCEQ_IMPL(uint16x8_t, float16x8_t, vceqq, f16)
-#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-
-#undef VCEQ_IMPL
-} // namespace wrapper
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_WRAPPER_CEQ_H */
diff --git a/arm_compute/core/NEON/wrapper/intrinsics/cge.h b/arm_compute/core/NEON/wrapper/intrinsics/cge.h
deleted file mode 100644
index bf231b8b46..0000000000
--- a/arm_compute/core/NEON/wrapper/intrinsics/cge.h
+++ /dev/null
@@ -1,64 +0,0 @@
-/*
- * Copyright (c) 2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_WRAPPER_CGE_H
-#define ARM_COMPUTE_WRAPPER_CGE_H
-
-#include <arm_neon.h>
-
-namespace arm_compute
-{
-namespace wrapper
-{
-#define VCGE_IMPL(stype, vtype, rtype, prefix, postfix) \
- inline rtype vcge(const vtype &a, const vtype &b) \
- { \
- return prefix##_##postfix(a, b); \
- }
-
-VCGE_IMPL(uint8_t, uint8x8_t, uint8x8_t, vcge, u8)
-VCGE_IMPL(int8_t, int8x8_t, uint8x8_t, vcge, s8)
-VCGE_IMPL(uint16_t, uint16x4_t, uint16x4_t, vcge, u16)
-VCGE_IMPL(int16_t, int16x4_t, uint16x4_t, vcge, s16)
-VCGE_IMPL(uint32_t, uint32x2_t, uint32x2_t, vcge, u32)
-VCGE_IMPL(int32_t, int32x2_t, uint32x2_t, vcge, s32)
-VCGE_IMPL(float32x2_t, float32x2_t, uint32x2_t, vcge, f32)
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-VCGE_IMPL(float16x4_t, float16x4_t, uint16x4_t, vcge, f16)
-#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-
-VCGE_IMPL(uint8_t, uint8x16_t, uint8x16_t, vcgeq, u8)
-VCGE_IMPL(int8_t, int8x16_t, uint8x16_t, vcgeq, s8)
-VCGE_IMPL(uint16_t, uint16x8_t, uint16x8_t, vcgeq, u16)
-VCGE_IMPL(int16_t, int16x8_t, uint16x8_t, vcgeq, s16)
-VCGE_IMPL(uint32_t, uint32x4_t, uint32x4_t, vcgeq, u32)
-VCGE_IMPL(int32_t, int32x4_t, uint32x4_t, vcgeq, s32)
-VCGE_IMPL(float32x4_t, float32x4_t, uint32x4_t, vcgeq, f32)
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-VCGE_IMPL(float16x8_t, float16x8_t, uint16x8_t, vcgeq, f16)
-#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-
-#undef VCGE_IMPL
-} // namespace wrapper
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_WRAPPER_CGE_H */
diff --git a/arm_compute/core/NEON/wrapper/intrinsics/cgt.h b/arm_compute/core/NEON/wrapper/intrinsics/cgt.h
deleted file mode 100644
index 5202a5b21d..0000000000
--- a/arm_compute/core/NEON/wrapper/intrinsics/cgt.h
+++ /dev/null
@@ -1,64 +0,0 @@
-/*
- * Copyright (c) 2018-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_WRAPPER_CGT_H
-#define ARM_COMPUTE_WRAPPER_CGT_H
-
-#include <arm_neon.h>
-
-namespace arm_compute
-{
-namespace wrapper
-{
-#define VCGT_IMPL(rtype, vtype, prefix, postfix) \
- inline rtype vcgt(const vtype &a, const vtype &b) \
- { \
- return prefix##_##postfix(a, b); \
- }
-
-VCGT_IMPL(uint8x8_t, uint8x8_t, vcgt, u8)
-VCGT_IMPL(uint8x8_t, int8x8_t, vcgt, s8)
-VCGT_IMPL(uint16x4_t, uint16x4_t, vcgt, u16)
-VCGT_IMPL(uint16x4_t, int16x4_t, vcgt, s16)
-VCGT_IMPL(uint32x2_t, uint32x2_t, vcgt, u32)
-VCGT_IMPL(uint32x2_t, int32x2_t, vcgt, s32)
-VCGT_IMPL(uint32x2_t, float32x2_t, vcgt, f32)
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-VCGT_IMPL(uint16x4_t, float16x4_t, vcgt, f16)
-#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-
-VCGT_IMPL(uint8x16_t, uint8x16_t, vcgtq, u8)
-VCGT_IMPL(uint8x16_t, int8x16_t, vcgtq, s8)
-VCGT_IMPL(uint16x8_t, uint16x8_t, vcgtq, u16)
-VCGT_IMPL(uint16x8_t, int16x8_t, vcgtq, s16)
-VCGT_IMPL(uint32x4_t, uint32x4_t, vcgtq, u32)
-VCGT_IMPL(uint32x4_t, int32x4_t, vcgtq, s32)
-VCGT_IMPL(uint32x4_t, float32x4_t, vcgtq, f32)
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-VCGT_IMPL(uint16x8_t, float16x8_t, vcgtq, f16)
-#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-
-#undef VCGT_IMPL
-} // namespace wrapper
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_WRAPPER_CGT_H */
diff --git a/arm_compute/core/NEON/wrapper/intrinsics/clt.h b/arm_compute/core/NEON/wrapper/intrinsics/clt.h
deleted file mode 100644
index 4701ab7026..0000000000
--- a/arm_compute/core/NEON/wrapper/intrinsics/clt.h
+++ /dev/null
@@ -1,64 +0,0 @@
-/*
- * Copyright (c) 2018-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_WRAPPER_CLT_H
-#define ARM_COMPUTE_WRAPPER_CLT_H
-
-#include <arm_neon.h>
-
-namespace arm_compute
-{
-namespace wrapper
-{
-#define VCLT_IMPL(votype, vtype, prefix, postfix) \
- inline votype vclt(const vtype &a, const vtype &b) \
- { \
- return prefix##_##postfix(a, b); \
- }
-
-VCLT_IMPL(uint8x8_t, uint8x8_t, vclt, u8)
-VCLT_IMPL(uint8x8_t, int8x8_t, vclt, s8)
-VCLT_IMPL(uint16x4_t, uint16x4_t, vclt, u16)
-VCLT_IMPL(uint16x4_t, int16x4_t, vclt, s16)
-VCLT_IMPL(uint32x2_t, uint32x2_t, vclt, u32)
-VCLT_IMPL(uint32x2_t, int32x2_t, vclt, s32)
-VCLT_IMPL(uint32x2_t, float32x2_t, vclt, f32)
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-VCLT_IMPL(uint16x4_t, float16x4_t, vclt, f16)
-#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-
-VCLT_IMPL(uint8x16_t, uint8x16_t, vcltq, u8)
-VCLT_IMPL(uint8x16_t, int8x16_t, vcltq, s8)
-VCLT_IMPL(uint16x8_t, uint16x8_t, vcltq, u16)
-VCLT_IMPL(uint16x8_t, int16x8_t, vcltq, s16)
-VCLT_IMPL(uint32x4_t, uint32x4_t, vcltq, u32)
-VCLT_IMPL(uint32x4_t, int32x4_t, vcltq, s32)
-VCLT_IMPL(uint32x4_t, float32x4_t, vcltq, f32)
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-VCLT_IMPL(uint16x8_t, float16x8_t, vcltq, f16)
-#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-
-#undef VCLT_IMPL
-} // namespace wrapper
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_WRAPPER_CLT_H */
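The four comparison headers removed above (vceq, vcge, vcgt, vclt) share one shape: the result is an unsigned mask of the same lane width as the inputs, ready to drive wrapper::vbsl. An illustrative per-lane min:

    int32x4_t  a  = vdupq_n_s32(3);
    int32x4_t  b  = vdupq_n_s32(7);
    uint32x4_t lt = arm_compute::wrapper::vclt(a, b);     // 0xFFFFFFFF in every lane here
    int32x4_t  mn = arm_compute::wrapper::vbsl(lt, a, b); // selects a where a < b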
diff --git a/arm_compute/core/NEON/wrapper/intrinsics/combine.h b/arm_compute/core/NEON/wrapper/intrinsics/combine.h
deleted file mode 100644
index 9099e28fc4..0000000000
--- a/arm_compute/core/NEON/wrapper/intrinsics/combine.h
+++ /dev/null
@@ -1,53 +0,0 @@
-/*
- * Copyright (c) 2018-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_WRAPPER_COMBINE_H
-#define ARM_COMPUTE_WRAPPER_COMBINE_H
-
-#include <arm_neon.h>
-
-namespace arm_compute
-{
-namespace wrapper
-{
-#define VCOMBINE_IMPL(rtype, vtype, prefix, postfix) \
- inline rtype vcombine(const vtype &a, const vtype &b) \
- { \
- return prefix##_##postfix(a, b); \
- }
-
-VCOMBINE_IMPL(uint8x16_t, uint8x8_t, vcombine, u8)
-VCOMBINE_IMPL(int8x16_t, int8x8_t, vcombine, s8)
-VCOMBINE_IMPL(uint16x8_t, uint16x4_t, vcombine, u16)
-VCOMBINE_IMPL(int16x8_t, int16x4_t, vcombine, s16)
-VCOMBINE_IMPL(uint32x4_t, uint32x2_t, vcombine, u32)
-VCOMBINE_IMPL(int32x4_t, int32x2_t, vcombine, s32)
-VCOMBINE_IMPL(float32x4_t, float32x2_t, vcombine, f32)
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-VCOMBINE_IMPL(float16x8_t, float16x4_t, vcombine, f16)
-#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-
-#undef VCOMBINE_IMPL
-} // namespace wrapper
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_WRAPPER_COMBINE_H */
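vcombine is the inverse of the vget_low/vget_high pair: it fuses two 64-bit halves back into one 128-bit register. The wrapper's vgetlow/vgethigh live in sibling headers, so this sketch uses the raw intrinsics for the split:

    uint8x16_t v  = vdupq_n_u8(42);
    uint8x8_t  lo = vget_low_u8(v);
    uint8x8_t  hi = vget_high_u8(v);
    uint8x16_t r  = arm_compute::wrapper::vcombine(lo, hi); // expands to vcombine_u8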
diff --git a/arm_compute/core/NEON/wrapper/intrinsics/cvt.h b/arm_compute/core/NEON/wrapper/intrinsics/cvt.h
deleted file mode 100644
index 5ea9a5dedd..0000000000
--- a/arm_compute/core/NEON/wrapper/intrinsics/cvt.h
+++ /dev/null
@@ -1,80 +0,0 @@
-/*
- * Copyright (c) 2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_WRAPPER_CVT_H
-#define ARM_COMPUTE_WRAPPER_CVT_H
-
-#include <arm_neon.h>
-
-namespace arm_compute
-{
-namespace wrapper
-{
-#define VCVT_TO_F32_IMPL(ptype, vtype, prefix, postfix1, postfix2) \
- template <typename T> \
- inline typename std::enable_if<std::is_same<T, float>::value, float32x4_t>::type \
- vcvt(const vtype &a) \
- { \
- return prefix##_##postfix1##_##postfix2(a); \
- }
-
-VCVT_TO_F32_IMPL(float32x4_t, uint32x4_t, vcvtq, f32, u32)
-VCVT_TO_F32_IMPL(float32x4_t, int32x4_t, vcvtq, f32, s32)
-#undef VCVT_TO_F32_IMPL
-
-template <typename T>
-inline typename std::enable_if<std::is_same<T, uint8_t>::value, uint32x4_t>::type
-vcvt(const float32x4_t &a)
-{
- return vcvtq_u32_f32(a);
-}
-
-template <typename T>
-inline typename std::enable_if<std::is_same<T, int8_t>::value, int32x4_t>::type
-vcvt(const float32x4_t &a)
-{
- return vcvtq_s32_f32(a);
-}
-
-#if defined(__ARM_FEATURE_BF16_VECTOR_ARITHMETIC) || defined(ARM_COMPUTE_FORCE_BF16)
-/** Convert 2x128-bit floating point vectors into 1x128-bit bfloat16 vector
- *
- * @param[in] inptr Pointer to the input memory to load values from
- * @param[in,out] outptr Pointer to the output memory to store values to
- */
-inline void vcvt_bf16_f32(const float *inptr, uint16_t *outptr)
-{
- __asm __volatile(
- "ldp q0, q1, [%[inptr]]\n"
- ".inst 0xea16800\n" // BFCVTN v0, v0
- ".inst 0x4ea16820\n" // BFCVTN2 v0, v1
- "str q0, [%[outptr]]\n"
- : [inptr] "+r"(inptr)
- : [outptr] "r"(outptr)
- : "v0", "v1", "memory");
-}
-#endif /* defined(__ARM_FEATURE_BF16_VECTOR_ARITHMETIC) || defined(ARM_COMPUTE_FORCE_BF16) */
-
-} // namespace wrapper
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_WRAPPER_CVT_H */
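Because the destination element type cannot be deduced from a float32x4_t argument, vcvt takes it as an explicit template parameter; note that the uint8_t/int8_t tags still yield 32-bit-lane vectors, which the quantized kernels narrow later. A sketch:

    float32x4_t f = vdupq_n_f32(3.7f);
    uint32x4_t  u = arm_compute::wrapper::vcvt<uint8_t>(f); // vcvtq_u32_f32, truncates toward zero
    int32x4_t   s = arm_compute::wrapper::vcvt<int8_t>(f);  // vcvtq_s32_f32
    float32x4_t g = arm_compute::wrapper::vcvt<float>(u);   // vcvtq_f32_u32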
diff --git a/arm_compute/core/NEON/wrapper/intrinsics/div.h b/arm_compute/core/NEON/wrapper/intrinsics/div.h
deleted file mode 100644
index d49a9113b0..0000000000
--- a/arm_compute/core/NEON/wrapper/intrinsics/div.h
+++ /dev/null
@@ -1,73 +0,0 @@
-/*
- * Copyright (c) 2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_WRAPPER_DIV_H
-#define ARM_COMPUTE_WRAPPER_DIV_H
-
-#include "arm_compute/core/NEON/NEMath.h"
-#include <arm_neon.h>
-
-namespace arm_compute
-{
-namespace wrapper
-{
-#ifdef __aarch64__
-
-#define VDIV_IMPL(stype, vtype, prefix, postfix) \
- inline vtype vdiv(const vtype &a, const vtype &b) \
- { \
- return prefix##_##postfix(a, b); \
- }
-VDIV_IMPL(float32x2_t, float32x2_t, vdiv, f32)
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-VDIV_IMPL(float16x4_t, float16x4_t, vdiv, f16)
-#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-
-VDIV_IMPL(float32x4_t, float32x4_t, vdivq, f32)
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-VDIV_IMPL(float16x8_t, float16x8_t, vdivq, f16)
-#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-
-#else // __aarch64__
-
-#define VDIV_IMPL(stype, vtype, mul_prefix, inv_prefix, postfix) \
- inline vtype vdiv(const vtype &a, const vtype &b) \
- { \
- return mul_prefix##_##postfix(a, inv_prefix##_##postfix(b)); \
- }
-VDIV_IMPL(float32x2_t, float32x2_t, vmul, vinv, f32)
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-VDIV_IMPL(float16x4_t, float16x4_t, vmul, vinv, f16)
-#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-
-VDIV_IMPL(float32x4_t, float32x4_t, vmulq, vinvq, f32)
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-VDIV_IMPL(float16x8_t, float16x8_t, vmulq, vinvq, f16)
-#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-
-#endif // __aarch64__
-
-#undef VDIV_IMPL
-} // namespace wrapper
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_WRAPPER_DIV_H */
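On AArch64 vdiv maps to the hardware divide; on 32-bit targets it falls back to multiplying by the Newton-refined reciprocal estimate vinv from NEMath.h, so results there are close to, but not bit-exact with, a true divide:

    float32x4_t num = vdupq_n_f32(1.0f);
    float32x4_t den = vdupq_n_f32(3.0f);
    float32x4_t q   = arm_compute::wrapper::vdiv(num, den); // vdivq_f32, or vmulq(num, vinvq(den)) on armv7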
diff --git a/arm_compute/core/NEON/wrapper/intrinsics/dup_n.h b/arm_compute/core/NEON/wrapper/intrinsics/dup_n.h
deleted file mode 100644
index ffbfde72c5..0000000000
--- a/arm_compute/core/NEON/wrapper/intrinsics/dup_n.h
+++ /dev/null
@@ -1,66 +0,0 @@
-/*
- * Copyright (c) 2018-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_WRAPPER_DUP_N_H
-#define ARM_COMPUTE_WRAPPER_DUP_N_H
-
-#include "arm_compute/core/NEON/wrapper/traits.h"
-
-#include <arm_neon.h>
-
-namespace arm_compute
-{
-namespace wrapper
-{
-#define VDUP_N_IMPL(stype, vtype, prefix, postfix, tag) \
- inline vtype vdup_n(stype value, tag) \
- { \
- return prefix##_##postfix(value); \
- }
-
-VDUP_N_IMPL(uint8_t, uint8x8_t, vdup_n, u8, traits::vector_64_tag)
-VDUP_N_IMPL(int8_t, int8x8_t, vdup_n, s8, traits::vector_64_tag)
-VDUP_N_IMPL(uint16_t, uint16x4_t, vdup_n, u16, traits::vector_64_tag)
-VDUP_N_IMPL(int16_t, int16x4_t, vdup_n, s16, traits::vector_64_tag)
-VDUP_N_IMPL(uint32_t, uint32x2_t, vdup_n, u32, traits::vector_64_tag)
-VDUP_N_IMPL(int32_t, int32x2_t, vdup_n, s32, traits::vector_64_tag)
-VDUP_N_IMPL(float, float32x2_t, vdup_n, f32, traits::vector_64_tag)
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-VDUP_N_IMPL(float16_t, float16x4_t, vdup_n, f16, traits::vector_64_tag)
-#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-
-VDUP_N_IMPL(uint8_t, uint8x16_t, vdupq_n, u8, traits::vector_128_tag)
-VDUP_N_IMPL(int8_t, int8x16_t, vdupq_n, s8, traits::vector_128_tag)
-VDUP_N_IMPL(uint16_t, uint16x8_t, vdupq_n, u16, traits::vector_128_tag)
-VDUP_N_IMPL(int16_t, int16x8_t, vdupq_n, s16, traits::vector_128_tag)
-VDUP_N_IMPL(uint32_t, uint32x4_t, vdupq_n, u32, traits::vector_128_tag)
-VDUP_N_IMPL(int32_t, int32x4_t, vdupq_n, s32, traits::vector_128_tag)
-VDUP_N_IMPL(float, float32x4_t, vdupq_n, f32, traits::vector_128_tag)
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-VDUP_N_IMPL(float16_t, float16x8_t, vdupq_n, f16, traits::vector_128_tag)
-#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-
-#undef VDUP_N_IMPL
-} // namespace wrapper
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_WRAPPER_DUP_N_H */
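The trailing tag argument selects the register width at compile time, letting generic kernels pick D- or Q-register variants from a traits type instead of hard-coding the intrinsic name:

    using namespace arm_compute::wrapper;
    float32x2_t d = vdup_n(1.f, traits::vector_64_tag{});  // vdup_n_f32
    float32x4_t q = vdup_n(1.f, traits::vector_128_tag{}); // vdupq_n_f32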
diff --git a/arm_compute/core/NEON/wrapper/intrinsics/eor.h b/arm_compute/core/NEON/wrapper/intrinsics/eor.h
deleted file mode 100644
index a0e7b681ab..0000000000
--- a/arm_compute/core/NEON/wrapper/intrinsics/eor.h
+++ /dev/null
@@ -1,56 +0,0 @@
-/*
- * Copyright (c) 2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_WRAPPER_EOR_H
-#define ARM_COMPUTE_WRAPPER_EOR_H
-
-#include <arm_neon.h>
-
-namespace arm_compute
-{
-namespace wrapper
-{
-#define VEOR_IMPL(vtype, prefix, postfix) \
- inline vtype veor(const vtype &a, const vtype &b) \
- { \
- return prefix##_##postfix(a, b); \
- }
-
-VEOR_IMPL(uint8x8_t, veor, u8)
-VEOR_IMPL(int8x8_t, veor, s8)
-VEOR_IMPL(uint16x4_t, veor, u16)
-VEOR_IMPL(int16x4_t, veor, s16)
-VEOR_IMPL(uint32x2_t, veor, u32)
-VEOR_IMPL(int32x2_t, veor, s32)
-
-VEOR_IMPL(uint8x16_t, veorq, u8)
-VEOR_IMPL(int8x16_t, veorq, s8)
-VEOR_IMPL(uint16x8_t, veorq, u16)
-VEOR_IMPL(int16x8_t, veorq, s16)
-VEOR_IMPL(uint32x4_t, veorq, u32)
-VEOR_IMPL(int32x4_t, veorq, s32)
-
-#undef VEOR_IMPL
-} // namespace wrapper
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_WRAPPER_EOR_H */
diff --git a/arm_compute/core/NEON/wrapper/intrinsics/exp.h b/arm_compute/core/NEON/wrapper/intrinsics/exp.h
deleted file mode 100644
index 4b17ebd93f..0000000000
--- a/arm_compute/core/NEON/wrapper/intrinsics/exp.h
+++ /dev/null
@@ -1,56 +0,0 @@
-/*
- * Copyright (c) 2018-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_WRAPPER_EXP_H
-#define ARM_COMPUTE_WRAPPER_EXP_H
-
-#include "arm_compute/core/NEON/NEMath.h"
-#include <arm_neon.h>
-
-namespace arm_compute
-{
-namespace wrapper
-{
-#define VEXPQ_IMPL(vtype, postfix) \
- inline vtype vexpq(const vtype &a) \
- { \
- return vexpq_##postfix(a); \
- }
-
-#define VEXPQ_IMPL_INT(vtype, postfix) \
- inline vtype vexpq(const vtype &a) \
- { \
- ARM_COMPUTE_UNUSED(a); \
- ARM_COMPUTE_ERROR("Not supported"); \
- }
-
-VEXPQ_IMPL(float32x4_t, f32)
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-VEXPQ_IMPL(float16x8_t, f16)
-#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-VEXPQ_IMPL_INT(int32x4_t, s32)
-#undef VEXPQ_IMPL
-#undef VEXPQ_IMPL_INT
-
-} // namespace wrapper
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_WRAPPER_EXP_H */
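The s32 overload above exists so fully generic code still instantiates for integer types; it raises a runtime error if ever executed rather than failing to compile. Illustration:

    float32x4_t x = vdupq_n_f32(0.f);
    float32x4_t e = arm_compute::wrapper::vexpq(x);  // ~1.0f per lane (polynomial approximation from NEMath.h)
    // arm_compute::wrapper::vexpq(vdupq_n_s32(0));  // compiles, but aborts with "Not supported"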
diff --git a/arm_compute/core/NEON/wrapper/intrinsics/ext.h b/arm_compute/core/NEON/wrapper/intrinsics/ext.h
deleted file mode 100644
index f2c3dcc901..0000000000
--- a/arm_compute/core/NEON/wrapper/intrinsics/ext.h
+++ /dev/null
@@ -1,62 +0,0 @@
-/*
- * Copyright (c) 2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_WRAPPER_EXT_H
-#define ARM_COMPUTE_WRAPPER_EXT_H
-
-#include <arm_neon.h>
-
-namespace arm_compute
-{
-namespace wrapper
-{
-#define VEXT_IMPL(vtype, prefix, postfix, size) \
- inline vtype vext_##size(vtype value_a, vtype value_b) \
- { \
- return prefix##_##postfix(value_a, value_b, size); \
- }
-
-VEXT_IMPL(uint8x8_t, vext, u8, 1)
-VEXT_IMPL(uint8x8_t, vext, u8, 2)
-VEXT_IMPL(int8x8_t, vext, s8, 1)
-VEXT_IMPL(int8x8_t, vext, s8, 2)
-VEXT_IMPL(uint16x4_t, vext, u16, 1)
-VEXT_IMPL(uint16x4_t, vext, u16, 2)
-VEXT_IMPL(int16x4_t, vext, s16, 1)
-VEXT_IMPL(int16x4_t, vext, s16, 2)
-
-VEXT_IMPL(uint8x16_t, vextq, u8, 1)
-VEXT_IMPL(uint8x16_t, vextq, u8, 2)
-VEXT_IMPL(int8x16_t, vextq, s8, 1)
-VEXT_IMPL(int8x16_t, vextq, s8, 2)
-VEXT_IMPL(uint16x8_t, vextq, u16, 1)
-VEXT_IMPL(uint16x8_t, vextq, u16, 2)
-VEXT_IMPL(int16x8_t, vextq, s16, 1)
-VEXT_IMPL(int16x8_t, vextq, s16, 2)
-VEXT_IMPL(int32x4_t, vextq, s32, 1)
-VEXT_IMPL(int32x4_t, vextq, s32, 2)
-
-#undef VEXT_IMPL
-} // namespace wrapper
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_WRAPPER_EXT_H */
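vext_1/vext_2 are the one- and two-element shifts that the 3x3 convolutions earlier in this diff use to slide the input window across two loaded registers:

    const int16_t lo[8] = { 0, 1, 2, 3, 4, 5, 6, 7 };
    const int16_t hi[8] = { 8, 9, 10, 11, 12, 13, 14, 15 };
    const int16x8_t a  = vld1q_s16(lo);
    const int16x8_t b  = vld1q_s16(hi);
    int16x8_t       s1 = arm_compute::wrapper::vext_1(a, b); // { 1, 2, ..., 8 }
    int16x8_t       s2 = arm_compute::wrapper::vext_2(a, b); // { 2, 3, ..., 9 }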
diff --git a/arm_compute/core/NEON/wrapper/intrinsics/gethigh.h b/arm_compute/core/NEON/wrapper/intrinsics/gethigh.h
deleted file mode 100644
index 13d29677a6..0000000000
--- a/arm_compute/core/NEON/wrapper/intrinsics/gethigh.h
+++ /dev/null
@@ -1,53 +0,0 @@
-/*
- * Copyright (c) 2018-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_WRAPPER_GET_HIGH_H
-#define ARM_COMPUTE_WRAPPER_GET_HIGH_H
-
-#include <arm_neon.h>
-
-namespace arm_compute
-{
-namespace wrapper
-{
-#define VGETHIGH_IMPL(half_vtype, vtype, postfix) \
- inline half_vtype vgethigh(const vtype val) \
- { \
- return vget_high_##postfix(val); \
- }
-
-VGETHIGH_IMPL(uint8x8_t, uint8x16_t, u8)
-VGETHIGH_IMPL(int8x8_t, int8x16_t, s8)
-VGETHIGH_IMPL(uint16x4_t, uint16x8_t, u16)
-VGETHIGH_IMPL(int16x4_t, int16x8_t, s16)
-VGETHIGH_IMPL(uint32x2_t, uint32x4_t, u32)
-VGETHIGH_IMPL(int32x2_t, int32x4_t, s32)
-VGETHIGH_IMPL(float32x2_t, float32x4_t, f32)
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-VGETHIGH_IMPL(float16x4_t, float16x8_t, f16)
-#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-
-#undef VGETHIGH_IMPL
-} // namespace wrapper
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_WRAPPER_GET_HIGH_H */
diff --git a/arm_compute/core/NEON/wrapper/intrinsics/getlane.h b/arm_compute/core/NEON/wrapper/intrinsics/getlane.h
deleted file mode 100644
index 533bf63603..0000000000
--- a/arm_compute/core/NEON/wrapper/intrinsics/getlane.h
+++ /dev/null
@@ -1,223 +0,0 @@
-/*
- * Copyright (c) 2018-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_WRAPPER_GET_LANE_H
-#define ARM_COMPUTE_WRAPPER_GET_LANE_H
-
-#include <arm_neon.h>
-
-namespace arm_compute
-{
-namespace wrapper
-{
-#define VGETLANE_IMPL_8(stype, vtype, postfix) \
- inline stype vgetlane(const vtype vector, const unsigned int lane) \
- { \
- switch(lane) \
- { \
- case 0: \
- return vget_lane_##postfix(vector, 0); \
- case 1: \
- return vget_lane_##postfix(vector, 1); \
- case 2: \
- return vget_lane_##postfix(vector, 2); \
- case 3: \
- return vget_lane_##postfix(vector, 3); \
- case 4: \
- return vget_lane_##postfix(vector, 4); \
- case 5: \
- return vget_lane_##postfix(vector, 5); \
- case 6: \
- return vget_lane_##postfix(vector, 6); \
- case 7: \
- return vget_lane_##postfix(vector, 7); \
- default: \
- ARM_COMPUTE_ERROR("Invalid lane"); \
- } \
- }
-
-#define VGETLANE_IMPL_4(stype, vtype, postfix) \
- inline stype vgetlane(const vtype vector, const unsigned int lane) \
- { \
- switch(lane) \
- { \
- case 0: \
- return vget_lane_##postfix(vector, 0); \
- case 1: \
- return vget_lane_##postfix(vector, 1); \
- case 2: \
- return vget_lane_##postfix(vector, 2); \
- case 3: \
- return vget_lane_##postfix(vector, 3); \
- default: \
- ARM_COMPUTE_ERROR("Invalid lane"); \
- } \
- }
-
-#define VGETLANE_IMPL_2(stype, vtype, postfix) \
- inline stype vgetlane(const vtype vector, const unsigned int lane) \
- { \
- switch(lane) \
- { \
- case 0: \
- return vget_lane_##postfix(vector, 0); \
- case 1: \
- return vget_lane_##postfix(vector, 1); \
- default: \
- ARM_COMPUTE_ERROR("Invalid lane"); \
- } \
- }
-
-VGETLANE_IMPL_8(uint8_t, uint8x8_t, u8)
-VGETLANE_IMPL_8(int8_t, int8x8_t, s8)
-VGETLANE_IMPL_4(uint16_t, uint16x4_t, u16)
-VGETLANE_IMPL_4(int16_t, int16x4_t, s16)
-VGETLANE_IMPL_2(uint32_t, uint32x2_t, u32)
-VGETLANE_IMPL_2(int32_t, int32x2_t, s32)
-VGETLANE_IMPL_2(float, float32x2_t, f32)
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-VGETLANE_IMPL_4(float16_t, float16x4_t, f16)
-#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-
-#define VGETQLANE_IMPL_16(stype, vtype, postfix) \
- inline stype vgetlane(const vtype vector, const unsigned int lane) \
- { \
- switch(lane) \
- { \
- case 0: \
- return vgetq_lane_##postfix(vector, 0); \
- case 1: \
- return vgetq_lane_##postfix(vector, 1); \
- case 2: \
- return vgetq_lane_##postfix(vector, 2); \
- case 3: \
- return vgetq_lane_##postfix(vector, 3); \
- case 4: \
- return vgetq_lane_##postfix(vector, 4); \
- case 5: \
- return vgetq_lane_##postfix(vector, 5); \
- case 6: \
- return vgetq_lane_##postfix(vector, 6); \
- case 7: \
- return vgetq_lane_##postfix(vector, 7); \
- case 8: \
- return vgetq_lane_##postfix(vector, 8); \
- case 9: \
- return vgetq_lane_##postfix(vector, 9); \
- case 10: \
- return vgetq_lane_##postfix(vector, 10); \
- case 11: \
- return vgetq_lane_##postfix(vector, 11); \
- case 12: \
- return vgetq_lane_##postfix(vector, 12); \
- case 13: \
- return vgetq_lane_##postfix(vector, 13); \
- case 14: \
- return vgetq_lane_##postfix(vector, 14); \
- case 15: \
- return vgetq_lane_##postfix(vector, 15); \
- default: \
- ARM_COMPUTE_ERROR("Invalid lane"); \
- } \
- }
-
-#define VGETQLANE_IMPL_8(stype, vtype, postfix) \
- inline stype vgetlane(const vtype vector, const unsigned int lane) \
- { \
- switch(lane) \
- { \
- case 0: \
- return vgetq_lane_##postfix(vector, 0); \
- case 1: \
- return vgetq_lane_##postfix(vector, 1); \
- case 2: \
- return vgetq_lane_##postfix(vector, 2); \
- case 3: \
- return vgetq_lane_##postfix(vector, 3); \
- case 4: \
- return vgetq_lane_##postfix(vector, 4); \
- case 5: \
- return vgetq_lane_##postfix(vector, 5); \
- case 6: \
- return vgetq_lane_##postfix(vector, 6); \
- case 7: \
- return vgetq_lane_##postfix(vector, 7); \
- default: \
- ARM_COMPUTE_ERROR("Invalid lane"); \
- } \
- }
-
-#define VGETQLANE_IMPL_4(stype, vtype, postfix) \
- inline stype vgetlane(const vtype vector, const unsigned int lane) \
- { \
- switch(lane) \
- { \
- case 0: \
- return vgetq_lane_##postfix(vector, 0); \
- case 1: \
- return vgetq_lane_##postfix(vector, 1); \
- case 2: \
- return vgetq_lane_##postfix(vector, 2); \
- case 3: \
- return vgetq_lane_##postfix(vector, 3); \
- default: \
- ARM_COMPUTE_ERROR("Invalid lane"); \
- } \
- }
-
-#define VGETQLANE_IMPL_2(stype, vtype, postfix) \
- inline stype vgetlane(const vtype vector, const unsigned int lane) \
- { \
- switch(lane) \
- { \
- case 0: \
- return vgetq_lane_##postfix(vector, 0); \
- case 1: \
- return vgetq_lane_##postfix(vector, 1); \
- default: \
- ARM_COMPUTE_ERROR("Invalid lane"); \
- } \
- }
-
-VGETQLANE_IMPL_16(uint8_t, uint8x16_t, u8)
-VGETQLANE_IMPL_16(int8_t, int8x16_t, s8)
-VGETQLANE_IMPL_8(uint16_t, uint16x8_t, u16)
-VGETQLANE_IMPL_8(int16_t, int16x8_t, s16)
-VGETQLANE_IMPL_4(uint32_t, uint32x4_t, u32)
-VGETQLANE_IMPL_4(int32_t, int32x4_t, s32)
-VGETQLANE_IMPL_4(float, float32x4_t, f32)
-VGETQLANE_IMPL_2(int64_t, int64x2_t, s64)
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-VGETQLANE_IMPL_8(float16_t, float16x8_t, f16)
-#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-
-#undef VGETLANE_IMPL_8
-#undef VGETLANE_IMPL_4
-#undef VGETLANE_IMPL_2
-
-#undef VGETQLANE_IMPL_16
-#undef VGETQLANE_IMPL_8
-#undef VGETQLANE_IMPL_4
-} // namespace wrapper
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_WRAPPER_GET_LANE_H */
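The vget_lane_*/vgetq_lane_* intrinsics only accept compile-time lane immediates, so these macros expand to a switch over every legal lane to support a runtime index; note the trailing #undef list misses VGETQLANE_IMPL_2. A minimal usage sketch (lane_at is a hypothetical helper):

    #include "arm_compute/core/NEON/wrapper/intrinsics/intrinsics.h"
    // Runtime-indexed lane read; raw vgetq_lane_f32 would need a literal.
    inline float lane_at(float32x4_t v, unsigned int i)
    {
        return arm_compute::wrapper::vgetlane(v, i); // errors if i > 3
    }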
diff --git a/arm_compute/core/NEON/wrapper/intrinsics/getlow.h b/arm_compute/core/NEON/wrapper/intrinsics/getlow.h
deleted file mode 100644
index dbc3d869e1..0000000000
--- a/arm_compute/core/NEON/wrapper/intrinsics/getlow.h
+++ /dev/null
@@ -1,53 +0,0 @@
-/*
- * Copyright (c) 2018-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_WRAPPER_GET_LOW_H
-#define ARM_COMPUTE_WRAPPER_GET_LOW_H
-
-#include <arm_neon.h>
-
-namespace arm_compute
-{
-namespace wrapper
-{
-#define VGETLOW_IMPL(half_vtype, vtype, postfix) \
- inline half_vtype vgetlow(const vtype val) \
- { \
- return vget_low_##postfix(val); \
- }
-
-VGETLOW_IMPL(uint8x8_t, uint8x16_t, u8)
-VGETLOW_IMPL(int8x8_t, int8x16_t, s8)
-VGETLOW_IMPL(uint16x4_t, uint16x8_t, u16)
-VGETLOW_IMPL(int16x4_t, int16x8_t, s16)
-VGETLOW_IMPL(uint32x2_t, uint32x4_t, u32)
-VGETLOW_IMPL(int32x2_t, int32x4_t, s32)
-VGETLOW_IMPL(float32x2_t, float32x4_t, f32)
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-VGETLOW_IMPL(float16x4_t, float16x8_t, f16)
-#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-
-#undef VGETLOW_IMPL
-} // namespace wrapper
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_WRAPPER_GET_LOW_H */
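vgetlow and vgethigh split a 128-bit Q register into its 64-bit halves, the usual first step before widening or pairwise reduction. A sketch combining them with the vmovl wrapper deleted further down in this diff (widen_u8 is a hypothetical helper):

    #include "arm_compute/core/NEON/wrapper/intrinsics/intrinsics.h"
    // Widen 16 x u8 into two vectors of 8 x u16.
    inline void widen_u8(uint8x16_t v, uint16x8_t &lo, uint16x8_t &hi)
    {
        using namespace arm_compute::wrapper;
        lo = vmovl(vgetlow(v));   // vmovl_u8(vget_low_u8(v))
        hi = vmovl(vgethigh(v));  // vmovl_u8(vget_high_u8(v))
    }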
diff --git a/arm_compute/core/NEON/wrapper/intrinsics/intrinsics.h b/arm_compute/core/NEON/wrapper/intrinsics/intrinsics.h
deleted file mode 100644
index 1150daa073..0000000000
--- a/arm_compute/core/NEON/wrapper/intrinsics/intrinsics.h
+++ /dev/null
@@ -1,73 +0,0 @@
-/*
- * Copyright (c) 2018-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_WRAPPER_INTRINSICS_H
-#define ARM_COMPUTE_WRAPPER_INTRINSICS_H
-
-#include "arm_compute/core/NEON/wrapper/intrinsics/abs.h"
-#include "arm_compute/core/NEON/wrapper/intrinsics/add.h"
-#include "arm_compute/core/NEON/wrapper/intrinsics/and.h"
-#include "arm_compute/core/NEON/wrapper/intrinsics/bsl.h"
-#include "arm_compute/core/NEON/wrapper/intrinsics/ceq.h"
-#include "arm_compute/core/NEON/wrapper/intrinsics/cge.h"
-#include "arm_compute/core/NEON/wrapper/intrinsics/cgt.h"
-#include "arm_compute/core/NEON/wrapper/intrinsics/clt.h"
-#include "arm_compute/core/NEON/wrapper/intrinsics/combine.h"
-#include "arm_compute/core/NEON/wrapper/intrinsics/cvt.h"
-#include "arm_compute/core/NEON/wrapper/intrinsics/div.h"
-#include "arm_compute/core/NEON/wrapper/intrinsics/dup_n.h"
-#include "arm_compute/core/NEON/wrapper/intrinsics/eor.h"
-#include "arm_compute/core/NEON/wrapper/intrinsics/exp.h"
-#include "arm_compute/core/NEON/wrapper/intrinsics/ext.h"
-#include "arm_compute/core/NEON/wrapper/intrinsics/gethigh.h"
-#include "arm_compute/core/NEON/wrapper/intrinsics/getlane.h"
-#include "arm_compute/core/NEON/wrapper/intrinsics/getlow.h"
-#include "arm_compute/core/NEON/wrapper/intrinsics/inv.h"
-#include "arm_compute/core/NEON/wrapper/intrinsics/invsqrt.h"
-#include "arm_compute/core/NEON/wrapper/intrinsics/load.h"
-#include "arm_compute/core/NEON/wrapper/intrinsics/log.h"
-#include "arm_compute/core/NEON/wrapper/intrinsics/max.h"
-#include "arm_compute/core/NEON/wrapper/intrinsics/min.h"
-#include "arm_compute/core/NEON/wrapper/intrinsics/mla.h"
-#include "arm_compute/core/NEON/wrapper/intrinsics/movl.h"
-#include "arm_compute/core/NEON/wrapper/intrinsics/movn.h"
-#include "arm_compute/core/NEON/wrapper/intrinsics/mul.h"
-#include "arm_compute/core/NEON/wrapper/intrinsics/neg.h"
-#include "arm_compute/core/NEON/wrapper/intrinsics/not.h"
-#include "arm_compute/core/NEON/wrapper/intrinsics/orr.h"
-#include "arm_compute/core/NEON/wrapper/intrinsics/pmax.h"
-#include "arm_compute/core/NEON/wrapper/intrinsics/pmin.h"
-#include "arm_compute/core/NEON/wrapper/intrinsics/pow.h"
-#include "arm_compute/core/NEON/wrapper/intrinsics/qmov.h"
-#include "arm_compute/core/NEON/wrapper/intrinsics/qmovun.h"
-#include "arm_compute/core/NEON/wrapper/intrinsics/reinterpret.h"
-#include "arm_compute/core/NEON/wrapper/intrinsics/rev64.h"
-#include "arm_compute/core/NEON/wrapper/intrinsics/round.h"
-#include "arm_compute/core/NEON/wrapper/intrinsics/setlane.h"
-#include "arm_compute/core/NEON/wrapper/intrinsics/sin.h"
-#include "arm_compute/core/NEON/wrapper/intrinsics/store.h"
-#include "arm_compute/core/NEON/wrapper/intrinsics/sub.h"
-#include "arm_compute/core/NEON/wrapper/intrinsics/tanh.h"
-#include "arm_compute/core/NEON/wrapper/intrinsics/tbl.h"
-
-#endif /* ARM_COMPUTE_WRAPPER_INTRINSICS_H */
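This umbrella header is what let NEON kernels be written once as templates, with overload resolution picking the right intrinsic per element type. A sketch of that pattern, assuming store.h (listed above) provides matching vstore overloads (max_tile is a hypothetical helper):

    #include "arm_compute/core/NEON/wrapper/intrinsics/intrinsics.h"
    // Elementwise max of two 128-bit tiles; instantiates for any T with
    // vloadq/vmax/vstore overloads (uint8_t, int16_t, float, ...).
    template <typename T>
    inline void max_tile(const T *a, const T *b, T *dst)
    {
        using namespace arm_compute::wrapper;
        vstore(dst, vmax(vloadq(a), vloadq(b)));
    }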
diff --git a/arm_compute/core/NEON/wrapper/intrinsics/inv.h b/arm_compute/core/NEON/wrapper/intrinsics/inv.h
deleted file mode 100644
index 9da66baffa..0000000000
--- a/arm_compute/core/NEON/wrapper/intrinsics/inv.h
+++ /dev/null
@@ -1,62 +0,0 @@
-/*
- * Copyright (c) 2018-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_WRAPPER_INV_H
-#define ARM_COMPUTE_WRAPPER_INV_H
-
-#include "arm_compute/core/NEON/NEMath.h"
-#include <arm_neon.h>
-
-namespace arm_compute
-{
-namespace wrapper
-{
-#define VINV_IMPL(vtype, prefix, postfix) \
- inline vtype vinv(const vtype &a) \
- { \
- return prefix##_##postfix(a); \
- }
-
-#define VINV_IMPL_INT(vtype, prefix, postfix) \
- inline vtype vinv(const vtype &a) \
- { \
- ARM_COMPUTE_UNUSED(a); \
- ARM_COMPUTE_ERROR("Not supported"); \
- }
-
-VINV_IMPL(float32x2_t, vinv, f32)
-VINV_IMPL_INT(int32x2_t, vinv, s32)
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-VINV_IMPL(float16x4_t, vinv, f16)
-#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-
-VINV_IMPL(float32x4_t, vinvq, f32)
-VINV_IMPL_INT(int32x4_t, vinvq, s32)
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-VINV_IMPL(float16x8_t, vinvq, f16)
-#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-
-#undef VINV_IMPL
-} // namespace wrapper
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_WRAPPER_INV_H */
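NEON has no integer reciprocal, so VINV_IMPL_INT turns an s32 call into a runtime "Not supported" error rather than a compile failure, which keeps type-generic templates instantiable; note only VINV_IMPL gets an #undef, so VINV_IMPL_INT leaks past this header. Usage sketch, assuming NEMath.h supplies vinvq_f32 (recip is a hypothetical helper):

    #include "arm_compute/core/NEON/wrapper/intrinsics/intrinsics.h"
    // Elementwise reciprocal of four f32 lanes.
    inline float32x4_t recip(float32x4_t x)
    {
        return arm_compute::wrapper::vinv(x); // forwards to vinvq_f32
    }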
diff --git a/arm_compute/core/NEON/wrapper/intrinsics/invsqrt.h b/arm_compute/core/NEON/wrapper/intrinsics/invsqrt.h
deleted file mode 100644
index 77adcf7b8c..0000000000
--- a/arm_compute/core/NEON/wrapper/intrinsics/invsqrt.h
+++ /dev/null
@@ -1,61 +0,0 @@
-/*
- * Copyright (c) 2018-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_WRAPPER_INVSQRT_H
-#define ARM_COMPUTE_WRAPPER_INVSQRT_H
-
-#include "arm_compute/core/NEON/NEMath.h"
-#include <arm_neon.h>
-
-namespace arm_compute
-{
-namespace wrapper
-{
-#define VINVSQRT_IMPL(stype, vtype, prefix, postfix) \
- inline vtype vinvsqrt(const vtype &a) \
- { \
- return prefix##_##postfix(a); \
- }
-
-#define VINVSQRT_IMPL_INT(stype, vtype, prefix, postfix) \
- inline vtype vinvsqrt(const vtype &a) \
- { \
- ARM_COMPUTE_UNUSED(a); \
- ARM_COMPUTE_ERROR("Not supported"); \
- }
-
-VINVSQRT_IMPL(float, float32x2_t, vinvsqrt, f32)
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-VINVSQRT_IMPL(float16_t, float16x4_t, vinvsqrt, f16)
-#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-VINVSQRT_IMPL_INT(int, int32x4_t, vinvsqrt, s32)
-
-VINVSQRT_IMPL(float, float32x4_t, vinvsqrtq, f32)
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-VINVSQRT_IMPL(float16_t, float16x8_t, vinvsqrtq, f16)
-#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-
-#undef VINVSQRT_IMPL
-} // namespace wrapper
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_WRAPPER_INVSQRT_H */
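vinvsqrt follows the same float-only pattern and forwards to NEMath.h, which by its naming computes a reciprocal square root (presumably vrsqrte plus refinement steps, an assumption here). A typical normalisation sketch using the vmul wrapper from mul.h below (normalise is a hypothetical helper):

    #include "arm_compute/core/NEON/wrapper/intrinsics/intrinsics.h"
    // Scale four lanes by 1/sqrt(ss), e.g. for L2 normalisation.
    inline float32x4_t normalise(float32x4_t v, float32x4_t ss)
    {
        using namespace arm_compute::wrapper;
        return vmul(v, vinvsqrt(ss));
    }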
diff --git a/arm_compute/core/NEON/wrapper/intrinsics/load.h b/arm_compute/core/NEON/wrapper/intrinsics/load.h
deleted file mode 100644
index d38350f05b..0000000000
--- a/arm_compute/core/NEON/wrapper/intrinsics/load.h
+++ /dev/null
@@ -1,73 +0,0 @@
-/*
- * Copyright (c) 2018-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_WRAPPER_LOAD_H
-#define ARM_COMPUTE_WRAPPER_LOAD_H
-
-#include <arm_neon.h>
-
-namespace arm_compute
-{
-namespace wrapper
-{
-#define VLOAD_IMPL(stype, vtype, postfix) \
- inline vtype vload(const stype *ptr) \
- { \
- return vld1_##postfix(ptr); \
- }
-
-VLOAD_IMPL(uint8_t, uint8x8_t, u8)
-VLOAD_IMPL(int8_t, int8x8_t, s8)
-VLOAD_IMPL(uint16_t, uint16x4_t, u16)
-VLOAD_IMPL(int16_t, int16x4_t, s16)
-VLOAD_IMPL(uint32_t, uint32x2_t, u32)
-VLOAD_IMPL(int32_t, int32x2_t, s32)
-//VLOAD_IMPL(uint64_t, uint64x1_t, u64)
-//VLOAD_IMPL(int64_t, int64x1_t, s64)
-VLOAD_IMPL(float, float32x2_t, f32)
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-VLOAD_IMPL(float16_t, float16x4_t, f16)
-#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-
-#define VLOADQ_IMPL(stype, vtype, postfix) \
- inline vtype vloadq(const stype *ptr) \
- { \
- return vld1q_##postfix(ptr); \
- }
-
-VLOADQ_IMPL(uint8_t, uint8x16_t, u8)
-VLOADQ_IMPL(int8_t, int8x16_t, s8)
-VLOADQ_IMPL(uint16_t, uint16x8_t, u16)
-VLOADQ_IMPL(int16_t, int16x8_t, s16)
-VLOADQ_IMPL(uint32_t, uint32x4_t, u32)
-VLOADQ_IMPL(int32_t, int32x4_t, s32)
-//VLOAD_IMPL(uint64_t, uint64x1_t, u64)
-//VLOAD_IMPL(int64_t, int64x1_t, s64)
-VLOADQ_IMPL(float, float32x4_t, f32)
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-VLOADQ_IMPL(float16_t, float16x8_t, f16)
-#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-#undef VLOAD_IMPL
-} // namespace wrapper
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_WRAPPER_LOAD_H */
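vload/vloadq mirror vld1/vld1q for D and Q registers respectively; the commented-out 64-bit integer lines were never enabled, and only VLOAD_IMPL is #undef'd, so VLOADQ_IMPL leaks past the header. Sketch (load_both is a hypothetical helper):

    #include "arm_compute/core/NEON/wrapper/intrinsics/intrinsics.h"
    inline void load_both(const uint8_t *ptr, uint8x8_t &d, uint8x16_t &q)
    {
        d = arm_compute::wrapper::vload(ptr);  // vld1_u8,  8 lanes
        q = arm_compute::wrapper::vloadq(ptr); // vld1q_u8, 16 lanes
    }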
diff --git a/arm_compute/core/NEON/wrapper/intrinsics/log.h b/arm_compute/core/NEON/wrapper/intrinsics/log.h
deleted file mode 100644
index 682830c122..0000000000
--- a/arm_compute/core/NEON/wrapper/intrinsics/log.h
+++ /dev/null
@@ -1,56 +0,0 @@
-/*
- * Copyright (c) 2018-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_WRAPPER_LOG_H
-#define ARM_COMPUTE_WRAPPER_LOG_H
-
-#include "arm_compute/core/NEON/NEMath.h"
-#include <arm_neon.h>
-
-namespace arm_compute
-{
-namespace wrapper
-{
-#define VLOG_IMPL(vtype, prefix, postfix) \
- inline vtype vlog(const vtype &a) \
- { \
- return prefix##_##postfix(a); \
- }
-
-#define VLOG_IMPL_INT(vtype, prefix, postfix) \
- inline vtype vlog(const vtype &a) \
- { \
- ARM_COMPUTE_UNUSED(a); \
- ARM_COMPUTE_ERROR("Not supported"); \
- }
-
-VLOG_IMPL(float32x4_t, vlogq, f32)
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-VLOG_IMPL(float16x8_t, vlogq, f16)
-#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-VLOG_IMPL_INT(int32x4_t, vlogq, s32)
-
-#undef VLOG_IMPL
-} // namespace wrapper
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_WRAPPER_LOG_H */
diff --git a/arm_compute/core/NEON/wrapper/intrinsics/max.h b/arm_compute/core/NEON/wrapper/intrinsics/max.h
deleted file mode 100644
index a87b7a32b5..0000000000
--- a/arm_compute/core/NEON/wrapper/intrinsics/max.h
+++ /dev/null
@@ -1,64 +0,0 @@
-/*
- * Copyright (c) 2018-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_WRAPPER_MAX_H
-#define ARM_COMPUTE_WRAPPER_MAX_H
-
-#include <arm_neon.h>
-
-namespace arm_compute
-{
-namespace wrapper
-{
-#define VMAX_IMPL(stype, vtype, prefix, postfix) \
- inline vtype vmax(const vtype &a, const vtype &b) \
- { \
- return prefix##_##postfix(a, b); \
- }
-
-VMAX_IMPL(uint8_t, uint8x8_t, vmax, u8)
-VMAX_IMPL(int8_t, int8x8_t, vmax, s8)
-VMAX_IMPL(uint16_t, uint16x4_t, vmax, u16)
-VMAX_IMPL(int16_t, int16x4_t, vmax, s16)
-VMAX_IMPL(uint32_t, uint32x2_t, vmax, u32)
-VMAX_IMPL(int32_t, int32x2_t, vmax, s32)
-VMAX_IMPL(float, float32x2_t, vmax, f32)
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-VMAX_IMPL(float16_t, float16x4_t, vmax, f16)
-#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-
-VMAX_IMPL(uint8_t, uint8x16_t, vmaxq, u8)
-VMAX_IMPL(int8_t, int8x16_t, vmaxq, s8)
-VMAX_IMPL(uint16_t, uint16x8_t, vmaxq, u16)
-VMAX_IMPL(int16_t, int16x8_t, vmaxq, s16)
-VMAX_IMPL(uint32_t, uint32x4_t, vmaxq, u32)
-VMAX_IMPL(int32_t, int32x4_t, vmaxq, s32)
-VMAX_IMPL(float, float32x4_t, vmaxq, f32)
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-VMAX_IMPL(float16_t, float16x8_t, vmaxq, f16)
-#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-
-#undef VMAX_IMPL
-} // namespace wrapper
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_WRAPPER_MAX_H */
diff --git a/arm_compute/core/NEON/wrapper/intrinsics/min.h b/arm_compute/core/NEON/wrapper/intrinsics/min.h
deleted file mode 100644
index dc8a127e82..0000000000
--- a/arm_compute/core/NEON/wrapper/intrinsics/min.h
+++ /dev/null
@@ -1,64 +0,0 @@
-/*
- * Copyright (c) 2018-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_WRAPPER_MIN_H
-#define ARM_COMPUTE_WRAPPER_MIN_H
-
-#include <arm_neon.h>
-
-namespace arm_compute
-{
-namespace wrapper
-{
-#define VMIN_IMPL(stype, vtype, prefix, postfix) \
- inline vtype vmin(const vtype &a, const vtype &b) \
- { \
- return prefix##_##postfix(a, b); \
- }
-
-VMIN_IMPL(uint8_t, uint8x8_t, vmin, u8)
-VMIN_IMPL(int8_t, int8x8_t, vmin, s8)
-VMIN_IMPL(uint16_t, uint16x4_t, vmin, u16)
-VMIN_IMPL(int16_t, int16x4_t, vmin, s16)
-VMIN_IMPL(uint32_t, uint32x2_t, vmin, u32)
-VMIN_IMPL(int32_t, int32x2_t, vmin, s32)
-VMIN_IMPL(float, float32x2_t, vmin, f32)
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-VMIN_IMPL(float16_t, float16x4_t, vmin, f16)
-#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-
-VMIN_IMPL(uint8_t, uint8x16_t, vminq, u8)
-VMIN_IMPL(int8_t, int8x16_t, vminq, s8)
-VMIN_IMPL(uint16_t, uint16x8_t, vminq, u16)
-VMIN_IMPL(int16_t, int16x8_t, vminq, s16)
-VMIN_IMPL(uint32_t, uint32x4_t, vminq, u32)
-VMIN_IMPL(int32_t, int32x4_t, vminq, s32)
-VMIN_IMPL(float, float32x4_t, vminq, f32)
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-VMIN_IMPL(float16_t, float16x8_t, vminq, f16)
-#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-
-#undef VMIN_IMPL
-} // namespace wrapper
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_WRAPPER_MIN_H */
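Composed with the vmax wrapper above, this yields the standard branch-free clamp idiom. Sketch (clamp is a hypothetical helper):

    #include "arm_compute/core/NEON/wrapper/intrinsics/intrinsics.h"
    // Clamp each lane of x into [lo, hi].
    inline float32x4_t clamp(float32x4_t x, float32x4_t lo, float32x4_t hi)
    {
        using namespace arm_compute::wrapper;
        return vmin(vmax(x, lo), hi);
    }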
diff --git a/arm_compute/core/NEON/wrapper/intrinsics/mla.h b/arm_compute/core/NEON/wrapper/intrinsics/mla.h
deleted file mode 100644
index dd2f0c0d9d..0000000000
--- a/arm_compute/core/NEON/wrapper/intrinsics/mla.h
+++ /dev/null
@@ -1,71 +0,0 @@
-/*
- * Copyright (c) 2018-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_WRAPPER_MLA_H
-#define ARM_COMPUTE_WRAPPER_MLA_H
-
-#include <arm_neon.h>
-
-namespace arm_compute
-{
-namespace wrapper
-{
-#define VMLA_IMPL(stype, vtype, prefix, postfix) \
- inline vtype vmla(const vtype &a, const vtype &b, const vtype &c) \
- { \
- return prefix##_##postfix(a, b, c); \
- }
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-#define VMLA_IMPL2(stype, vtype, prefix1, prefix2, postfix) \
- inline vtype vmla(const vtype &a, const vtype &b, const vtype &c) \
- { \
- return prefix1##_##postfix(a, prefix2##_##postfix(b, c)); \
- }
-#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-
-VMLA_IMPL(uint8x8_t, uint8x8_t, vmla, u8)
-VMLA_IMPL(int8x8_t, int8x8_t, vmla, s8)
-VMLA_IMPL(uint16x4_t, uint16x4_t, vmla, u16)
-VMLA_IMPL(int16x4_t, int16x4_t, vmla, s16)
-VMLA_IMPL(uint32x2_t, uint32x2_t, vmla, u32)
-VMLA_IMPL(int32x2_t, int32x2_t, vmla, s32)
-VMLA_IMPL(float32x2_t, float32x2_t, vmla, f32)
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-VMLA_IMPL2(float16x4_t, float16x4_t, vadd, vmul, f16)
-#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-
-VMLA_IMPL(uint8x16_t, uint8x16_t, vmlaq, u8)
-VMLA_IMPL(int8x16_t, int8x16_t, vmlaq, s8)
-VMLA_IMPL(uint16x8_t, uint16x8_t, vmlaq, u16)
-VMLA_IMPL(int16x8_t, int16x8_t, vmlaq, s16)
-VMLA_IMPL(uint32x4_t, uint32x4_t, vmlaq, u32)
-VMLA_IMPL(int32x4_t, int32x4_t, vmlaq, s32)
-VMLA_IMPL(float32x4_t, float32x4_t, vmlaq, f32)
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-VMLA_IMPL2(float16x8_t, float16x8_t, vaddq, vmulq, f16)
-#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-
-#undef VMLA_IMPL
-} // namespace wrapper
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_WRAPPER_MLA_H */
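VMLA_IMPL2 exists because NEON provides no vmla for f16, so the wrapper lowers the f16 multiply-accumulate to vadd(a, vmul(b, c)); every other type maps to a native vmla/vmlaq. Sketch (axpy_step is a hypothetical helper):

    #include "arm_compute/core/NEON/wrapper/intrinsics/intrinsics.h"
    // acc + x*y per lane; dispatches to vmlaq_f32 here.
    inline float32x4_t axpy_step(float32x4_t acc, float32x4_t x, float32x4_t y)
    {
        return arm_compute::wrapper::vmla(acc, x, y);
    }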
diff --git a/arm_compute/core/NEON/wrapper/intrinsics/movl.h b/arm_compute/core/NEON/wrapper/intrinsics/movl.h
deleted file mode 100644
index 982a795924..0000000000
--- a/arm_compute/core/NEON/wrapper/intrinsics/movl.h
+++ /dev/null
@@ -1,49 +0,0 @@
-/*
- * Copyright (c) 2018-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_WRAPPER_MOVL_H
-#define ARM_COMPUTE_WRAPPER_MOVL_H
-
-#include <arm_neon.h>
-
-namespace arm_compute
-{
-namespace wrapper
-{
-#define VMOVL_IMPL(ptype, vtype, prefix, postfix) \
- inline ptype vmovl(const vtype &a) \
- { \
- return prefix##_##postfix(a); \
- }
-
-VMOVL_IMPL(uint16x8_t, uint8x8_t, vmovl, u8)
-VMOVL_IMPL(int16x8_t, int8x8_t, vmovl, s8)
-VMOVL_IMPL(uint32x4_t, uint16x4_t, vmovl, u16)
-VMOVL_IMPL(int32x4_t, int16x4_t, vmovl, s16)
-VMOVL_IMPL(uint64x2_t, uint32x2_t, vmovl, u32)
-VMOVL_IMPL(int64x2_t, int32x2_t, vmovl, s32)
-
-#undef VMOVL_IMPL
-} // namespace wrapper
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_WRAPPER_MOVL_H */
diff --git a/arm_compute/core/NEON/wrapper/intrinsics/movn.h b/arm_compute/core/NEON/wrapper/intrinsics/movn.h
deleted file mode 100644
index 23360e2597..0000000000
--- a/arm_compute/core/NEON/wrapper/intrinsics/movn.h
+++ /dev/null
@@ -1,62 +0,0 @@
-/*
- * Copyright (c) 2018-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_WRAPPER_MOVN_H
-#define ARM_COMPUTE_WRAPPER_MOVN_H
-
-#include <arm_neon.h>
-
-namespace arm_compute
-{
-namespace wrapper
-{
-#define VMOVN_IMPL(dtype, vtype, prefix, postfix) \
- inline dtype vmovn(const vtype &a) \
- { \
- return prefix##_##postfix(a); \
- }
-
-VMOVN_IMPL(uint32x2_t, uint64x2_t, vmovn, u64)
-VMOVN_IMPL(int32x2_t, int64x2_t, vmovn, s64)
-VMOVN_IMPL(uint16x4_t, uint32x4_t, vmovn, u32)
-VMOVN_IMPL(int16x4_t, int32x4_t, vmovn, s32)
-VMOVN_IMPL(uint8x8_t, uint16x8_t, vmovn, u16)
-VMOVN_IMPL(int8x8_t, int16x8_t, vmovn, s16)
-
-#define VQMOVN_IMPL(dtype, vtype, prefix, postfix) \
- inline dtype vqmovn(const vtype &a) \
- { \
- return prefix##_##postfix(a); \
- }
-
-VQMOVN_IMPL(uint32x2_t, uint64x2_t, vqmovn, u64)
-VQMOVN_IMPL(int32x2_t, int64x2_t, vqmovn, s64)
-VQMOVN_IMPL(uint16x4_t, uint32x4_t, vqmovn, u32)
-VQMOVN_IMPL(int16x4_t, int32x4_t, vqmovn, s32)
-VQMOVN_IMPL(uint8x8_t, uint16x8_t, vqmovn, u16)
-VQMOVN_IMPL(int8x8_t, int16x8_t, vqmovn, s16)
-
-#undef VMOVN_IMPL
-} // namespace wrapper
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_WRAPPER_MOVN_H */
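The two families differ only in overflow behaviour: vmovn keeps the low half of each widened lane (wrapping), while vqmovn saturates to the narrow type's range; as in load.h, only VMOVN_IMPL is #undef'd. Sketch (both helpers hypothetical):

    #include "arm_compute/core/NEON/wrapper/intrinsics/intrinsics.h"
    // s32 -> s16 narrowing: truncating vs. saturating.
    inline int16x4_t narrow_trunc(int32x4_t v) { return arm_compute::wrapper::vmovn(v); }
    inline int16x4_t narrow_sat(int32x4_t v)   { return arm_compute::wrapper::vqmovn(v); }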
diff --git a/arm_compute/core/NEON/wrapper/intrinsics/mul.h b/arm_compute/core/NEON/wrapper/intrinsics/mul.h
deleted file mode 100644
index bbf70abac9..0000000000
--- a/arm_compute/core/NEON/wrapper/intrinsics/mul.h
+++ /dev/null
@@ -1,64 +0,0 @@
-/*
- * Copyright (c) 2018-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_WRAPPER_MUL_H
-#define ARM_COMPUTE_WRAPPER_MUL_H
-
-#include <arm_neon.h>
-
-namespace arm_compute
-{
-namespace wrapper
-{
-#define VMUL_IMPL(stype, vtype, prefix, postfix) \
- inline vtype vmul(const vtype &a, const vtype &b) \
- { \
- return prefix##_##postfix(a, b); \
- }
-
-VMUL_IMPL(uint8x8_t, uint8x8_t, vmul, u8)
-VMUL_IMPL(int8x8_t, int8x8_t, vmul, s8)
-VMUL_IMPL(uint16x4_t, uint16x4_t, vmul, u16)
-VMUL_IMPL(int16x4_t, int16x4_t, vmul, s16)
-VMUL_IMPL(uint32x2_t, uint32x2_t, vmul, u32)
-VMUL_IMPL(int32x2_t, int32x2_t, vmul, s32)
-VMUL_IMPL(float32x2_t, float32x2_t, vmul, f32)
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-VMUL_IMPL(float16_t, float16x4_t, vmul, f16)
-#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-
-VMUL_IMPL(uint8_t, uint8x16_t, vmulq, u8)
-VMUL_IMPL(int8_t, int8x16_t, vmulq, s8)
-VMUL_IMPL(uint16_t, uint16x8_t, vmulq, u16)
-VMUL_IMPL(int16_t, int16x8_t, vmulq, s16)
-VMUL_IMPL(uint32_t, uint32x4_t, vmulq, u32)
-VMUL_IMPL(int32_t, int32x4_t, vmulq, s32)
-VMUL_IMPL(float32x4_t, float32x4_t, vmulq, f32)
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-VMUL_IMPL(float16_t, float16x8_t, vmulq, f16)
-#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-
-#undef VMUL_IMPL
-} // namespace wrapper
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_WRAPPER_MUL_H */
diff --git a/arm_compute/core/NEON/wrapper/intrinsics/neg.h b/arm_compute/core/NEON/wrapper/intrinsics/neg.h
deleted file mode 100644
index da2f285eca..0000000000
--- a/arm_compute/core/NEON/wrapper/intrinsics/neg.h
+++ /dev/null
@@ -1,58 +0,0 @@
-/*
- * Copyright (c) 2018-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_WRAPPER_NEG_H
-#define ARM_COMPUTE_WRAPPER_NEG_H
-
-#include <arm_neon.h>
-
-namespace arm_compute
-{
-namespace wrapper
-{
-#define VNEG_IMPL(vtype, prefix, postfix) \
- inline vtype vneg(const vtype &a) \
- { \
- return prefix##_##postfix(a); \
- }
-
-VNEG_IMPL(int8x8_t, vneg, s8)
-VNEG_IMPL(int16x4_t, vneg, s16)
-VNEG_IMPL(int32x2_t, vneg, s32)
-VNEG_IMPL(float32x2_t, vneg, f32)
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-VNEG_IMPL(float16x4_t, vneg, f16)
-#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-
-VNEG_IMPL(int8x16_t, vnegq, s8)
-VNEG_IMPL(int16x8_t, vnegq, s16)
-VNEG_IMPL(int32x4_t, vnegq, s32)
-VNEG_IMPL(float32x4_t, vnegq, f32)
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-VNEG_IMPL(float16x8_t, vnegq, f16)
-#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-
-#undef VNEG_IMPL
-} // namespace wrapper
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_WRAPPER_NEG_H */
diff --git a/arm_compute/core/NEON/wrapper/intrinsics/not.h b/arm_compute/core/NEON/wrapper/intrinsics/not.h
deleted file mode 100644
index 5b1e4056ca..0000000000
--- a/arm_compute/core/NEON/wrapper/intrinsics/not.h
+++ /dev/null
@@ -1,64 +0,0 @@
-/*
- * Copyright (c) 2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_WRAPPER_NOT_H
-#define ARM_COMPUTE_WRAPPER_NOT_H
-
-#include <arm_neon.h>
-
-namespace arm_compute
-{
-namespace wrapper
-{
-#define VNOT_IMPL(stype, vtype, prefix, postfix) \
- inline vtype vnot(const vtype &a) \
- { \
- return prefix##_##postfix(a); \
- }
-
-VNOT_IMPL(uint8_t, uint8x8_t, vmvn, u8)
-VNOT_IMPL(int8_t, int8x8_t, vmvn, s8)
-VNOT_IMPL(uint16_t, uint16x4_t, vmvn, u16)
-VNOT_IMPL(int16_t, int16x4_t, vmvn, s16)
-VNOT_IMPL(uint32_t, uint32x2_t, vmvn, u32)
-VNOT_IMPL(int32_t, int32x2_t, vmvn, s32)
-VNOT_IMPL(float32x2_t, float32x2_t, vinv, f32)
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-VNOT_IMPL(float16x4_t, float16x4_t, vinv, f16)
-#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-
-VNOT_IMPL(uint8_t, uint8x16_t, vmvnq, u8)
-VNOT_IMPL(int8_t, int8x16_t, vmvnq, s8)
-VNOT_IMPL(uint16_t, uint16x8_t, vmvnq, u16)
-VNOT_IMPL(int16_t, int16x8_t, vmvnq, s16)
-VNOT_IMPL(uint32_t, uint32x4_t, vmvnq, u32)
-VNOT_IMPL(int32_t, int32x4_t, vmvnq, s32)
-VNOT_IMPL(float32x4_t, float32x4_t, vinvq, f32)
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-VNOT_IMPL(float16x8_t, float16x8_t, vinvq, f16)
-#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-
-#undef VNOT_IMPL
-} // namespace wrapper
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_WRAPPER_NOT_H */
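For the integer types vnot maps onto vmvn, a genuine bitwise NOT; the float overloads are instead wired to the vinv/vinvq reciprocal from inv.h, so callers should not expect bit manipulation on floating-point vectors. Sketch (invert_mask is a hypothetical helper):

    #include "arm_compute/core/NEON/wrapper/intrinsics/intrinsics.h"
    // Flip every bit of a 16-byte predicate mask.
    inline uint8x16_t invert_mask(uint8x16_t m)
    {
        return arm_compute::wrapper::vnot(m); // vmvnq_u8
    }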
diff --git a/arm_compute/core/NEON/wrapper/intrinsics/orr.h b/arm_compute/core/NEON/wrapper/intrinsics/orr.h
deleted file mode 100644
index 0fbdd44c76..0000000000
--- a/arm_compute/core/NEON/wrapper/intrinsics/orr.h
+++ /dev/null
@@ -1,60 +0,0 @@
-/*
- * Copyright (c) 2018-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_WRAPPER_ORR_H
-#define ARM_COMPUTE_WRAPPER_ORR_H
-
-#include <arm_neon.h>
-
-namespace arm_compute
-{
-namespace wrapper
-{
-#define VORR_IMPL(stype, vtype, prefix, postfix) \
- inline vtype vorr(const vtype &a, const vtype &b) \
- { \
- return prefix##_##postfix(a, b); \
- }
-
-VORR_IMPL(uint8_t, uint8x8_t, vorr, u8)
-VORR_IMPL(int8_t, int8x8_t, vorr, s8)
-VORR_IMPL(uint16_t, uint16x4_t, vorr, u16)
-VORR_IMPL(int16_t, int16x4_t, vorr, s16)
-VORR_IMPL(uint32_t, uint32x2_t, vorr, u32)
-VORR_IMPL(int32_t, int32x2_t, vorr, s32)
-VORR_IMPL(uint64_t, uint64x1_t, vorr, u64)
-VORR_IMPL(int64_t, int64x1_t, vorr, s64)
-
-VORR_IMPL(uint8_t, uint8x16_t, vorrq, u8)
-VORR_IMPL(int8_t, int8x16_t, vorrq, s8)
-VORR_IMPL(uint16_t, uint16x8_t, vorrq, u16)
-VORR_IMPL(int16_t, int16x8_t, vorrq, s16)
-VORR_IMPL(uint32_t, uint32x4_t, vorrq, u32)
-VORR_IMPL(int32_t, int32x4_t, vorrq, s32)
-VORR_IMPL(uint64_t, uint64x2_t, vorrq, u64)
-VORR_IMPL(int64_t, int64x2_t, vorrq, s64)
-
-#undef VORR_IMPL
-} // namespace wrapper
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_WRAPPER_ORR_H */
diff --git a/arm_compute/core/NEON/wrapper/intrinsics/pmax.h b/arm_compute/core/NEON/wrapper/intrinsics/pmax.h
deleted file mode 100644
index afad27f1e4..0000000000
--- a/arm_compute/core/NEON/wrapper/intrinsics/pmax.h
+++ /dev/null
@@ -1,53 +0,0 @@
-/*
- * Copyright (c) 2018-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_WRAPPER_PMAX_H
-#define ARM_COMPUTE_WRAPPER_PMAX_H
-
-#include <arm_neon.h>
-
-namespace arm_compute
-{
-namespace wrapper
-{
-#define VPMAX_IMPL(stype, vtype, prefix, postfix) \
- inline vtype vpmax(const vtype &a, const vtype &b) \
- { \
- return prefix##_##postfix(a, b); \
- }
-
-VPMAX_IMPL(uint8_t, uint8x8_t, vpmax, u8)
-VPMAX_IMPL(int8_t, int8x8_t, vpmax, s8)
-VPMAX_IMPL(uint16_t, uint16x4_t, vpmax, u16)
-VPMAX_IMPL(int16_t, int16x4_t, vpmax, s16)
-VPMAX_IMPL(uint32_t, uint32x2_t, vpmax, u32)
-VPMAX_IMPL(int32_t, int32x2_t, vpmax, s32)
-VPMAX_IMPL(float, float32x2_t, vpmax, f32)
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-VPMAX_IMPL(float16_t, float16x4_t, vpmax, f16)
-#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-
-#undef VPMAX_IMPL
-} // namespace wrapper
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_WRAPPER_PMAX_H */
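vpmax reduces adjacent lane pairs, so chaining it gives a full horizontal reduction in log2(lanes) steps. Sketch combining it with the gethigh/getlow/getlane wrappers above (hmax is a hypothetical helper):

    #include "arm_compute/core/NEON/wrapper/intrinsics/intrinsics.h"
    // Horizontal max of four f32 lanes.
    inline float hmax(float32x4_t v)
    {
        using namespace arm_compute::wrapper;
        float32x2_t m = vpmax(vgetlow(v), vgethigh(v)); // {max(v0,v1), max(v2,v3)}
        m             = vpmax(m, m);                    // both lanes = overall max
        return vgetlane(m, 0);
    }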
diff --git a/arm_compute/core/NEON/wrapper/intrinsics/pmin.h b/arm_compute/core/NEON/wrapper/intrinsics/pmin.h
deleted file mode 100644
index 77c5cf61ba..0000000000
--- a/arm_compute/core/NEON/wrapper/intrinsics/pmin.h
+++ /dev/null
@@ -1,53 +0,0 @@
-/*
- * Copyright (c) 2018-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_WRAPPER_PMIN_H
-#define ARM_COMPUTE_WRAPPER_PMIN_H
-
-#include <arm_neon.h>
-
-namespace arm_compute
-{
-namespace wrapper
-{
-#define VPMIN_IMPL(stype, vtype, prefix, postfix) \
- inline vtype vpmin(const vtype &a, const vtype &b) \
- { \
- return prefix##_##postfix(a, b); \
- }
-
-VPMIN_IMPL(uint8_t, uint8x8_t, vpmin, u8)
-VPMIN_IMPL(int8_t, int8x8_t, vpmin, s8)
-VPMIN_IMPL(uint16_t, uint16x4_t, vpmin, u16)
-VPMIN_IMPL(int16_t, int16x4_t, vpmin, s16)
-VPMIN_IMPL(uint32_t, uint32x2_t, vpmin, u32)
-VPMIN_IMPL(int32_t, int32x2_t, vpmin, s32)
-VPMIN_IMPL(float, float32x2_t, vpmin, f32)
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-VPMIN_IMPL(float16_t, float16x4_t, vpmin, f16)
-#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-
-#undef VPMIN_IMPL
-} // namespace wrapper
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_WRAPPER_PMIN_H */
diff --git a/arm_compute/core/NEON/wrapper/intrinsics/pow.h b/arm_compute/core/NEON/wrapper/intrinsics/pow.h
deleted file mode 100644
index 1b5d62df5e..0000000000
--- a/arm_compute/core/NEON/wrapper/intrinsics/pow.h
+++ /dev/null
@@ -1,48 +0,0 @@
-/*
- * Copyright (c) 2018-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_WRAPPER_POW_H
-#define ARM_COMPUTE_WRAPPER_POW_H
-
-#include "arm_compute/core/NEON/NEMath.h"
-#include <arm_neon.h>
-
-namespace arm_compute
-{
-namespace wrapper
-{
-#define VPOW_IMPL(vtype, prefix, postfix) \
- inline vtype vpow(const vtype &a, const vtype &b) \
- { \
- return prefix##_##postfix(a, b); \
- }
-
-VPOW_IMPL(float32x4_t, vpowq, f32)
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-VPOW_IMPL(float16x8_t, vpowq, f16)
-#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-
-#undef VPOW_IMPL
-} // namespace wrapper
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_WRAPPER_POW_H */
diff --git a/arm_compute/core/NEON/wrapper/intrinsics/qmovun.h b/arm_compute/core/NEON/wrapper/intrinsics/qmovun.h
deleted file mode 100644
index a0347020db..0000000000
--- a/arm_compute/core/NEON/wrapper/intrinsics/qmovun.h
+++ /dev/null
@@ -1,46 +0,0 @@
-/*
- * Copyright (c) 2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_WRAPPER_QMOVUN_H
-#define ARM_COMPUTE_WRAPPER_QMOVUN_H
-
-#include <arm_neon.h>
-
-namespace arm_compute
-{
-namespace wrapper
-{
-#define VQMOVUN_IMPL(dtype, vtype, prefix, postfix) \
- inline dtype vqmovun(const vtype &a) \
- { \
- return prefix##_##postfix(a); \
- }
-
-VQMOVUN_IMPL(uint32x2_t, int64x2_t, vqmovun, s64)
-VQMOVUN_IMPL(uint16x4_t, int32x4_t, vqmovun, s32)
-VQMOVUN_IMPL(uint8x8_t, int16x8_t, vqmovun, s16)
-
-#undef VQMOVUN_IMPL
-} // namespace wrapper
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_WRAPPER_QMOVUN_H */
diff --git a/arm_compute/core/NEON/wrapper/intrinsics/reinterpret.h b/arm_compute/core/NEON/wrapper/intrinsics/reinterpret.h
deleted file mode 100644
index 579da344a7..0000000000
--- a/arm_compute/core/NEON/wrapper/intrinsics/reinterpret.h
+++ /dev/null
@@ -1,49 +0,0 @@
-/*
- * Copyright (c) 2019-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_WRAPPER_REINTERPRET_H
-#define ARM_COMPUTE_WRAPPER_REINTERPRET_H
-
-#include <arm_neon.h>
-
-namespace arm_compute
-{
-namespace wrapper
-{
-#define VREINTERPRET_IMPL(ptype, vtype, prefix, postfix1, postfix2) \
- inline ptype vreinterpret(const vtype &a) \
- { \
- return prefix##_##postfix1##_##postfix2(a); \
- } \
- \
- inline ptype vreinterpret(const ptype &a) \
- { \
- return a; \
- }
-
-VREINTERPRET_IMPL(int16x4_t, uint16x4_t, vreinterpret, s16, u16)
-
-VREINTERPRET_IMPL(int32x4_t, uint32x4_t, vreinterpretq, s32, u32)
-} // namespace wrapper
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_WRAPPER_REINTERPRET_H */
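A hedged usage sketch of the removed vreinterpret overloads (illustrative name). The identity overload exists so that generic code compiles unchanged when source and destination types already match:

    #include "arm_compute/core/NEON/wrapper/intrinsics/reinterpret.h"

    // Bit-pattern cast, no value conversion (vreinterpret_s16_u16).
    int16x4_t as_s16(uint16x4_t v)
    {
        return arm_compute::wrapper::vreinterpret(v);
    }
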
diff --git a/arm_compute/core/NEON/wrapper/intrinsics/rev64.h b/arm_compute/core/NEON/wrapper/intrinsics/rev64.h
deleted file mode 100644
index 0385704f3f..0000000000
--- a/arm_compute/core/NEON/wrapper/intrinsics/rev64.h
+++ /dev/null
@@ -1,64 +0,0 @@
-/*
- * Copyright (c) 2018-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_WRAPPER_REV64_H
-#define ARM_COMPUTE_WRAPPER_REV64_H
-
-#include <arm_neon.h>
-
-namespace arm_compute
-{
-namespace wrapper
-{
-#define VREV64_IMPL(vtype, prefix, postfix) \
- inline vtype vrev64(const vtype &a) \
- { \
- return prefix##_##postfix(a); \
- }
-
-VREV64_IMPL(uint8x8_t, vrev64, u8)
-VREV64_IMPL(int8x8_t, vrev64, s8)
-VREV64_IMPL(uint16x4_t, vrev64, u16)
-VREV64_IMPL(int16x4_t, vrev64, s16)
-VREV64_IMPL(uint32x2_t, vrev64, u32)
-VREV64_IMPL(int32x2_t, vrev64, s32)
-VREV64_IMPL(float32x2_t, vrev64, f32)
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-VREV64_IMPL(float16x4_t, vrev64, f16)
-#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-
-VREV64_IMPL(uint8x16_t, vrev64q, u8)
-VREV64_IMPL(int8x16_t, vrev64q, s8)
-VREV64_IMPL(uint16x8_t, vrev64q, u16)
-VREV64_IMPL(int16x8_t, vrev64q, s16)
-VREV64_IMPL(uint32x4_t, vrev64q, u32)
-VREV64_IMPL(int32x4_t, vrev64q, s32)
-VREV64_IMPL(float32x4_t, vrev64q, f32)
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-VREV64_IMPL(float16x8_t, vrev64q, f16)
-#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-
-#undef VREV64_IMPL
-} // namespace wrapper
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_WRAPPER_REV64_H */
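Usage sketch for the removed vrev64 overloads (illustrative name):

    #include "arm_compute/core/NEON/wrapper/intrinsics/rev64.h"

    // Reverses element order within each 64-bit half of the register:
    // u16 lanes {0,1,2,3,4,5,6,7} become {3,2,1,0,7,6,5,4} (vrev64q_u16).
    uint16x8_t rev_within_doublewords(uint16x8_t v)
    {
        return arm_compute::wrapper::vrev64(v);
    }
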
diff --git a/arm_compute/core/NEON/wrapper/intrinsics/round.h b/arm_compute/core/NEON/wrapper/intrinsics/round.h
deleted file mode 100644
index d6f5a88689..0000000000
--- a/arm_compute/core/NEON/wrapper/intrinsics/round.h
+++ /dev/null
@@ -1,56 +0,0 @@
-/*
- * Copyright (c) 2019-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_WRAPPER_ROUND_H
-#define ARM_COMPUTE_WRAPPER_ROUND_H
-
-#include "arm_compute/core/NEON/NEMath.h"
-#include <arm_neon.h>
-
-namespace arm_compute
-{
-namespace wrapper
-{
-#define VROUNDQ_IMPL(vtype, postfix) \
- inline vtype vround(const vtype &a) \
- { \
- return vroundq_rte_##postfix(a); \
- }
-
-#define VROUNDQ_IMPL_INT(vtype, postfix) \
- inline vtype vround(const vtype &a) \
- { \
- ARM_COMPUTE_UNUSED(a); \
- ARM_COMPUTE_ERROR("Not supported"); \
- }
-
-VROUNDQ_IMPL(float32x4_t, f32)
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-VROUNDQ_IMPL(float16x8_t, f16)
-#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-VROUNDQ_IMPL_INT(int32x4_t, s32)
-#undef VROUNDQ_IMPL
-#undef VROUNDQ_IMPL_INT
-
-} // namespace wrapper
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_WRAPPER_ROUND_H */
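Usage sketch for the removed vround overload (illustrative name). Note that the integer specialization above deliberately raises an error at run time, which lets templated kernels instantiate for integer types they never execute:

    #include "arm_compute/core/NEON/wrapper/intrinsics/round.h"

    // Round each lane to nearest, ties to even (vroundq_rte_f32 from NEMath.h).
    float32x4_t round_rte(float32x4_t v)
    {
        return arm_compute::wrapper::vround(v);
    }
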
diff --git a/arm_compute/core/NEON/wrapper/intrinsics/setlane.h b/arm_compute/core/NEON/wrapper/intrinsics/setlane.h
deleted file mode 100644
index 6332f3025e..0000000000
--- a/arm_compute/core/NEON/wrapper/intrinsics/setlane.h
+++ /dev/null
@@ -1,208 +0,0 @@
-/*
- * Copyright (c) 2018-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_WRAPPER_SET_LANE_H
-#define ARM_COMPUTE_WRAPPER_SET_LANE_H
-
-#include <arm_neon.h>
-
-namespace arm_compute
-{
-namespace wrapper
-{
-#define VSETLANE_IMPL_8(stype, atype, vtype, postfix) \
- inline stype vsetlane(const atype value, const vtype vector, const unsigned int lane) \
- { \
- switch(lane) \
- { \
- case 0: \
- return vset_lane_##postfix(value, vector, 0); \
- case 1: \
- return vset_lane_##postfix(value, vector, 1); \
- case 2: \
- return vset_lane_##postfix(value, vector, 2); \
- case 3: \
- return vset_lane_##postfix(value, vector, 3); \
- case 4: \
- return vset_lane_##postfix(value, vector, 4); \
- case 5: \
- return vset_lane_##postfix(value, vector, 5); \
- case 6: \
- return vset_lane_##postfix(value, vector, 6); \
- case 7: \
- return vset_lane_##postfix(value, vector, 7); \
- default: \
- ARM_COMPUTE_ERROR("Invalid lane"); \
- } \
- }
-
-#define VSETLANE_IMPL_4(stype, atype, vtype, postfix) \
- inline stype vsetlane(const atype value, const vtype vector, const unsigned int lane) \
- { \
- switch(lane) \
- { \
- case 0: \
- return vset_lane_##postfix(value, vector, 0); \
- case 1: \
- return vset_lane_##postfix(value, vector, 1); \
- case 2: \
- return vset_lane_##postfix(value, vector, 2); \
- case 3: \
- return vset_lane_##postfix(value, vector, 3); \
- default: \
- ARM_COMPUTE_ERROR("Invalid lane"); \
- } \
- }
-
-#define VSETLANE_IMPL_2(stype, atype, vtype, postfix) \
- inline stype vsetlane(const atype value, const vtype vector, const unsigned int lane) \
- { \
- switch(lane) \
- { \
- case 0: \
- return vset_lane_##postfix(value, vector, 0); \
- case 1: \
- return vset_lane_##postfix(value, vector, 1); \
- default: \
- ARM_COMPUTE_ERROR("Invalid lane"); \
- } \
- }
-
-VSETLANE_IMPL_8(uint8x8_t, uint8_t, uint8x8_t, u8)
-VSETLANE_IMPL_8(int8x8_t, int8_t, int8x8_t, s8)
-VSETLANE_IMPL_4(uint16x4_t, uint16_t, uint16x4_t, u16)
-VSETLANE_IMPL_4(int16x4_t, int16_t, int16x4_t, s16)
-VSETLANE_IMPL_2(uint32x2_t, uint32_t, uint32x2_t, u32)
-VSETLANE_IMPL_2(int32x2_t, int32_t, int32x2_t, s32)
-VSETLANE_IMPL_2(float32x2_t, float, float32x2_t, f32)
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-VSETLANE_IMPL_4(float16x4_t, float16_t, float16x4_t, f16)
-#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-
-#define VSETQLANE_IMPL_16(stype, atype, vtype, postfix) \
- inline stype vsetlane(const atype value, const vtype vector, const unsigned int lane) \
- { \
- switch(lane) \
- { \
- case 0: \
- return vsetq_lane_##postfix(value, vector, 0); \
- case 1: \
- return vsetq_lane_##postfix(value, vector, 1); \
- case 2: \
- return vsetq_lane_##postfix(value, vector, 2); \
- case 3: \
- return vsetq_lane_##postfix(value, vector, 3); \
- case 4: \
- return vsetq_lane_##postfix(value, vector, 4); \
- case 5: \
- return vsetq_lane_##postfix(value, vector, 5); \
- case 6: \
- return vsetq_lane_##postfix(value, vector, 6); \
- case 7: \
- return vsetq_lane_##postfix(value, vector, 7); \
- case 8: \
- return vsetq_lane_##postfix(value, vector, 8); \
- case 9: \
- return vsetq_lane_##postfix(value, vector, 9); \
- case 10: \
- return vsetq_lane_##postfix(value, vector, 10); \
- case 11: \
- return vsetq_lane_##postfix(value, vector, 11); \
- case 12: \
- return vsetq_lane_##postfix(value, vector, 12); \
- case 13: \
- return vsetq_lane_##postfix(value, vector, 13); \
- case 14: \
- return vsetq_lane_##postfix(value, vector, 14); \
- case 15: \
- return vsetq_lane_##postfix(value, vector, 15); \
- default: \
- ARM_COMPUTE_ERROR("Invalid lane"); \
- } \
- }
-
-#define VSETQLANE_IMPL_8(stype, atype, vtype, postfix) \
- inline stype vsetlane(const atype value, const vtype vector, const unsigned int lane) \
- { \
- switch(lane) \
- { \
- case 0: \
- return vsetq_lane_##postfix(value, vector, 0); \
- case 1: \
- return vsetq_lane_##postfix(value, vector, 1); \
- case 2: \
- return vsetq_lane_##postfix(value, vector, 2); \
- case 3: \
- return vsetq_lane_##postfix(value, vector, 3); \
- case 4: \
- return vsetq_lane_##postfix(value, vector, 4); \
- case 5: \
- return vsetq_lane_##postfix(value, vector, 5); \
- case 6: \
- return vsetq_lane_##postfix(value, vector, 6); \
- case 7: \
- return vsetq_lane_##postfix(value, vector, 7); \
- default: \
- ARM_COMPUTE_ERROR("Invalid lane"); \
- } \
- }
-
-#define VSETQLANE_IMPL_4(stype, atype, vtype, postfix) \
- inline stype vsetlane(const atype value, const vtype vector, const unsigned int lane) \
- { \
- switch(lane) \
- { \
- case 0: \
- return vsetq_lane_##postfix(value, vector, 0); \
- case 1: \
- return vsetq_lane_##postfix(value, vector, 1); \
- case 2: \
- return vsetq_lane_##postfix(value, vector, 2); \
- case 3: \
- return vsetq_lane_##postfix(value, vector, 3); \
- default: \
- ARM_COMPUTE_ERROR("Invalid lane"); \
- } \
- }
-
-VSETQLANE_IMPL_16(uint8x16_t, uint8_t, uint8x16_t, u8)
-VSETQLANE_IMPL_16(int8x16_t, int8_t, int8x16_t, s8)
-VSETQLANE_IMPL_8(uint16x8_t, uint16_t, uint16x8_t, u16)
-VSETQLANE_IMPL_8(int16x8_t, int16_t, int16x8_t, s16)
-VSETQLANE_IMPL_4(uint32x4_t, uint32_t, uint32x4_t, u32)
-VSETQLANE_IMPL_4(int32x4_t, int32_t, int32x4_t, s32)
-VSETQLANE_IMPL_4(float32x4_t, float, float32x4_t, f32)
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-VSETQLANE_IMPL_8(float16x8_t, float16_t, float16x8_t, f16)
-#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-
-#undef VSETLANE_IMPL_8
-#undef VSETLANE_IMPL_4
-#undef VSETLANE_IMPL_2
-
-#undef VSETQLANE_IMPL_16
-#undef VSETQLANE_IMPL_8
-#undef VSETQLANE_IMPL_4
-} // namespace wrapper
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_WRAPPER_SET_LANE_H */
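Usage sketch for the removed vsetlane overloads (illustrative name). NEON's vset(q)_lane_* intrinsics require a compile-time lane index, so the switch above trades a small branch for a runtime index:

    #include "arm_compute/core/NEON/wrapper/intrinsics/setlane.h"

    // Writes x into the requested lane; dispatches to vsetq_lane_f32.
    float32x4_t set_lane(float32x4_t v, float x, unsigned int lane)
    {
        return arm_compute::wrapper::vsetlane(x, v, lane);
    }
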
diff --git a/arm_compute/core/NEON/wrapper/intrinsics/sin.h b/arm_compute/core/NEON/wrapper/intrinsics/sin.h
deleted file mode 100644
index bca72db38a..0000000000
--- a/arm_compute/core/NEON/wrapper/intrinsics/sin.h
+++ /dev/null
@@ -1,57 +0,0 @@
-/*
- * Copyright (c) 2019-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_WRAPPER_SIN_H
-#define ARM_COMPUTE_WRAPPER_SIN_H
-
-#include "arm_compute/core/NEON/NEMath.h"
-#include <arm_neon.h>
-
-namespace arm_compute
-{
-namespace wrapper
-{
-#define VSIN_IMPL(vtype, prefix, postfix) \
- inline vtype vsin(const vtype &a) \
- { \
- return prefix##_##postfix(a); \
- }
-
-#define VSIN_IMPL_INT(vtype, prefix, postfix) \
- inline vtype vsin(const vtype &a) \
- { \
- ARM_COMPUTE_UNUSED(a); \
- ARM_COMPUTE_ERROR("Not supported"); \
- }
-
-VSIN_IMPL(float32x4_t, vsinq, f32)
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-VSIN_IMPL(float16x8_t, vsinq, f16)
-#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-
-VSIN_IMPL_INT(int32x4_t, vsinq, s32)
-
-#undef VSIN_IMPL
-#undef VSIN_IMPL_INT
-} // namespace wrapper
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_WRAPPER_SIN_H */
\ No newline at end of file
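Usage sketch for the removed vsin overload (illustrative name):

    #include "arm_compute/core/NEON/wrapper/intrinsics/sin.h"

    // Element-wise sine; vsin resolves to vsinq_f32 from NEMath.h.
    float32x4_t elementwise_sin(float32x4_t v)
    {
        return arm_compute::wrapper::vsin(v);
    }
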
diff --git a/arm_compute/core/NEON/wrapper/intrinsics/store.h b/arm_compute/core/NEON/wrapper/intrinsics/store.h
deleted file mode 100644
index eb2ae6a5e1..0000000000
--- a/arm_compute/core/NEON/wrapper/intrinsics/store.h
+++ /dev/null
@@ -1,70 +0,0 @@
-/*
- * Copyright (c) 2018-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_WRAPPER_STORE_H
-#define ARM_COMPUTE_WRAPPER_STORE_H
-
-#include <arm_neon.h>
-
-namespace arm_compute
-{
-namespace wrapper
-{
-#define VSTORE_IMPL(stype, vtype, prefix, postfix) \
- inline void vstore(stype *ptr, vtype val) \
- { \
- prefix##_##postfix(ptr, val); \
- }
-
-VSTORE_IMPL(uint8_t, uint8x8_t, vst1, u8)
-VSTORE_IMPL(uint8_t, uint8x8x2_t, vst2, u8)
-VSTORE_IMPL(int8_t, int8x8_t, vst1, s8)
-VSTORE_IMPL(int8_t, int8x8x2_t, vst2, s8)
-VSTORE_IMPL(uint16_t, uint16x4_t, vst1, u16)
-VSTORE_IMPL(int16_t, int16x4_t, vst1, s16)
-VSTORE_IMPL(uint32_t, uint32x2_t, vst1, u32)
-VSTORE_IMPL(int32_t, int32x2_t, vst1, s32)
-//VSTORE_IMPL(uint64_t, 1, vst1, u64)
-//VSTORE_IMPL(int64_t, 1, vst1, s64)
-VSTORE_IMPL(float, float32x2_t, vst1, f32)
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-VSTORE_IMPL(float16_t, float16x4_t, vst1, f16)
-#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-
-VSTORE_IMPL(uint8_t, uint8x16_t, vst1q, u8)
-VSTORE_IMPL(int8_t, int8x16_t, vst1q, s8)
-VSTORE_IMPL(uint16_t, uint16x8_t, vst1q, u16)
-VSTORE_IMPL(int16_t, int16x8_t, vst1q, s16)
-VSTORE_IMPL(uint32_t, uint32x4_t, vst1q, u32)
-VSTORE_IMPL(int32_t, int32x4_t, vst1q, s32)
-//VSTORE_IMPL(uint64_t, 2, vst1q, u64)
-//VSTORE_IMPL(int64_t, 2, vst1q, s64)
-VSTORE_IMPL(float, float32x4_t, vst1q, f32)
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-VSTORE_IMPL(float16_t, float16x8_t, vst1q, f16)
-#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-
-#undef VSTORE_IMPL
-} // namespace wrapper
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_WRAPPER_STORE_H */
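Usage sketch for the removed vstore overloads (illustrative name). The overload set picks the st1 variant from the vector type, so callers never spell out the width:

    #include "arm_compute/core/NEON/wrapper/intrinsics/store.h"

    // Stores four floats to dst (vst1q_f32).
    void store4(float *dst, float32x4_t v)
    {
        arm_compute::wrapper::vstore(dst, v);
    }
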
diff --git a/arm_compute/core/NEON/wrapper/intrinsics/sub.h b/arm_compute/core/NEON/wrapper/intrinsics/sub.h
deleted file mode 100644
index f46b57c815..0000000000
--- a/arm_compute/core/NEON/wrapper/intrinsics/sub.h
+++ /dev/null
@@ -1,103 +0,0 @@
-/*
- * Copyright (c) 2018-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_WRAPPER_SUB_H
-#define ARM_COMPUTE_WRAPPER_SUB_H
-
-#include <arm_neon.h>
-
-namespace arm_compute
-{
-namespace wrapper
-{
-#define VSUB_IMPL(stype, vtype, prefix, postfix) \
- inline vtype vsub(const vtype &a, const vtype &b) \
- { \
- return prefix##_##postfix(a, b); \
- }
-
-VSUB_IMPL(uint8x8_t, uint8x8_t, vsub, u8)
-VSUB_IMPL(int8x8_t, int8x8_t, vsub, s8)
-VSUB_IMPL(uint16x4_t, uint16x4_t, vsub, u16)
-VSUB_IMPL(int16x4_t, int16x4_t, vsub, s16)
-VSUB_IMPL(uint32x2_t, uint32x2_t, vsub, u32)
-VSUB_IMPL(int32x2_t, int32x2_t, vsub, s32)
-VSUB_IMPL(uint64x1_t, uint64x1_t, vsub, u64)
-VSUB_IMPL(int64x1_t, int64x1_t, vsub, s64)
-VSUB_IMPL(float32x2_t, float32x2_t, vsub, f32)
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-VSUB_IMPL(float16x4_t, float16x4_t, vsub, f16)
-#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-
-VSUB_IMPL(uint8x16_t, uint8x16_t, vsubq, u8)
-VSUB_IMPL(int8x16_t, int8x16_t, vsubq, s8)
-VSUB_IMPL(uint16x8_t, uint16x8_t, vsubq, u16)
-VSUB_IMPL(int16x8_t, int16x8_t, vsubq, s16)
-VSUB_IMPL(uint32x4_t, uint32x4_t, vsubq, u32)
-VSUB_IMPL(int32x4_t, int32x4_t, vsubq, s32)
-VSUB_IMPL(uint64x2_t, uint64x2_t, vsubq, u64)
-VSUB_IMPL(int64x2_t, int64x2_t, vsubq, s64)
-VSUB_IMPL(float32x4_t, float32x4_t, vsubq, f32)
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-VSUB_IMPL(float16x8_t, float16x8_t, vsubq, f16)
-#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-
-#undef VSUB_IMPL
-
-// VQSUB: Vector saturating sub (No notion of saturation for floating point)
-#define VQSUB_IMPL(stype, vtype, prefix, postfix) \
- inline vtype vqsub(const vtype &a, const vtype &b) \
- { \
- return prefix##_##postfix(a, b); \
- }
-
-VQSUB_IMPL(uint8x8_t, uint8x8_t, vqsub, u8)
-VQSUB_IMPL(int8x8_t, int8x8_t, vqsub, s8)
-VQSUB_IMPL(uint16x4_t, uint16x4_t, vqsub, u16)
-VQSUB_IMPL(int16x4_t, int16x4_t, vqsub, s16)
-VQSUB_IMPL(uint32x2_t, uint32x2_t, vqsub, u32)
-VQSUB_IMPL(int32x2_t, int32x2_t, vqsub, s32)
-VQSUB_IMPL(uint64x1_t, uint64x1_t, vqsub, u64)
-VQSUB_IMPL(int64x1_t, int64x1_t, vqsub, s64)
-VQSUB_IMPL(float32x2_t, float32x2_t, vsub, f32)
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-VQSUB_IMPL(float16x4_t, float16x4_t, vsub, f16)
-#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-
-VQSUB_IMPL(uint8x16_t, uint8x16_t, vqsubq, u8)
-VQSUB_IMPL(int8x16_t, int8x16_t, vqsubq, s8)
-VQSUB_IMPL(uint16x8_t, uint16x8_t, vqsubq, u16)
-VQSUB_IMPL(int16x8_t, int16x8_t, vqsubq, s16)
-VQSUB_IMPL(uint32x4_t, uint32x4_t, vqsubq, u32)
-VQSUB_IMPL(int32x4_t, int32x4_t, vqsubq, s32)
-VQSUB_IMPL(uint64x2_t, uint64x2_t, vqsubq, u64)
-VQSUB_IMPL(int64x2_t, int64x2_t, vqsubq, s64)
-VQSUB_IMPL(float32x4_t, float32x4_t, vsubq, f32)
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-VQSUB_IMPL(float16x8_t, float16x8_t, vsubq, f16)
-#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-#undef VQSUB_IMPL
-
-} // namespace wrapper
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_WRAPPER_SUB_H */
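Usage sketch for the removed vsub/vqsub overloads (illustrative name):

    #include "arm_compute/core/NEON/wrapper/intrinsics/sub.h"

    // Saturating subtract on unsigned bytes: results clamp at 0 instead of
    // wrapping (vqsubq_u8). For float types vqsub falls back to plain vsub,
    // as the comment above notes.
    uint8x16_t clamped_sub(uint8x16_t a, uint8x16_t b)
    {
        return arm_compute::wrapper::vqsub(a, b);
    }
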
diff --git a/arm_compute/core/NEON/wrapper/intrinsics/tanh.h b/arm_compute/core/NEON/wrapper/intrinsics/tanh.h
deleted file mode 100644
index 648a001ca7..0000000000
--- a/arm_compute/core/NEON/wrapper/intrinsics/tanh.h
+++ /dev/null
@@ -1,47 +0,0 @@
-/*
- * Copyright (c) 2018-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_WRAPPER_TANH_H
-#define ARM_COMPUTE_WRAPPER_TANH_H
-
-#include "arm_compute/core/NEON/NEMath.h"
-#include <arm_neon.h>
-
-namespace arm_compute
-{
-namespace wrapper
-{
-#define VTANH_IMPL(vtype, prefix, postfix) \
- inline vtype vtanh(const vtype &a) \
- { \
- return prefix##_##postfix(a); \
- }
-
-VTANH_IMPL(float32x4_t, vtanhq, f32)
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-VTANH_IMPL(float16x8_t, vtanhq, f16)
-#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-#undef VTANH_IMPL
-} // namespace wrapper
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_WRAPPER_TANH_H */
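Usage sketch for the removed vtanh overload (illustrative name):

    #include "arm_compute/core/NEON/wrapper/intrinsics/tanh.h"

    // Element-wise tanh, as used by the TANH activation (vtanhq_f32 from NEMath.h).
    float32x4_t activation_tanh(float32x4_t v)
    {
        return arm_compute::wrapper::vtanh(v);
    }
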
diff --git a/arm_compute/core/NEON/wrapper/intrinsics/tbl.h b/arm_compute/core/NEON/wrapper/intrinsics/tbl.h
deleted file mode 100644
index d3d6b72e6a..0000000000
--- a/arm_compute/core/NEON/wrapper/intrinsics/tbl.h
+++ /dev/null
@@ -1,45 +0,0 @@
-/*
- * Copyright (c) 2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_WRAPPER_TBL_H
-#define ARM_COMPUTE_WRAPPER_TBL_H
-
-#include <arm_neon.h>
-
-namespace arm_compute
-{
-namespace wrapper
-{
-#define VTBL_IMPL(stype, vtype, prefix, postfix) \
- inline vtype vtbl(const stype &a, const vtype &b) \
- { \
- return prefix##_##postfix(a, b); \
- }
-
-VTBL_IMPL(uint8x8x2_t, uint8x8_t, vtbl2, u8)
-VTBL_IMPL(int8x8x2_t, int8x8_t, vtbl2, s8)
-
-#undef VTBL_IMPL
-} // namespace wrapper
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_WRAPPER_TBL_H */
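Usage sketch for the removed vtbl overloads (illustrative name):

    #include "arm_compute/core/NEON/wrapper/intrinsics/tbl.h"

    // Byte table lookup: each byte of idx selects one of the 16 bytes in the
    // table pair; out-of-range indices produce 0 (vtbl2_u8 semantics).
    uint8x8_t shuffle(uint8x8x2_t table, uint8x8_t idx)
    {
        return arm_compute::wrapper::vtbl(table, idx);
    }
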
diff --git a/arm_compute/core/NEON/wrapper/scalar/add.h b/arm_compute/core/NEON/wrapper/scalar/add.h
deleted file mode 100644
index 5a04fe20fa..0000000000
--- a/arm_compute/core/NEON/wrapper/scalar/add.h
+++ /dev/null
@@ -1,62 +0,0 @@
-/*
- * Copyright (c) 2018-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_WRAPPER_SCALAR_ADD_H
-#define ARM_COMPUTE_WRAPPER_SCALAR_ADD_H
-
-#include <arm_neon.h>
-
-namespace arm_compute
-{
-namespace wrapper
-{
-inline uint8_t add_sat(const uint8_t &a, const uint8_t &b)
-{
- const uint8x8_t va = { a, 0, 0, 0, 0, 0, 0, 0 };
- const uint8x8_t vb = { b, 0, 0, 0, 0, 0, 0, 0 };
- return vget_lane_u8(vqadd_u8(va, vb), 0);
-}
-
-inline int16_t add_sat(const int16_t &a, const int16_t &b)
-{
- const int16x4_t va = { a, 0, 0, 0 };
- const int16x4_t vb = { b, 0, 0, 0 };
- return vget_lane_s16(vqadd_s16(va, vb), 0);
-}
-
-inline float add_sat(const float &a, const float &b)
-{
- // No notion of saturation exists in floating point
- return a + b;
-}
-
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-inline float16_t add_sat(const float16_t &a, const float16_t &b)
-{
- // No notion of saturation exists in floating point
- return a + b;
-}
-#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-} // namespace wrapper
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_WRAPPER_SCALAR_ADD_H */
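Usage sketch for the removed scalar add_sat helpers (illustrative name). As the implementation shows, the scalars ride in lane 0 of a vector so the vector vqadd does the saturation:

    #include "arm_compute/core/NEON/wrapper/scalar/add.h"

    uint8_t saturated_sum()
    {
        return arm_compute::wrapper::add_sat(uint8_t{200}, uint8_t{100}); // 255, clamped
    }
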
diff --git a/arm_compute/core/NEON/wrapper/scalar/scalar.h b/arm_compute/core/NEON/wrapper/scalar/scalar.h
deleted file mode 100644
index ff2d807c0e..0000000000
--- a/arm_compute/core/NEON/wrapper/scalar/scalar.h
+++ /dev/null
@@ -1,30 +0,0 @@
-/*
- * Copyright (c) 2018-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_WRAPPER_SCALAR_H
-#define ARM_COMPUTE_WRAPPER_SCALAR_H
-
-#include "arm_compute/core/NEON/wrapper/scalar/add.h"
-#include "arm_compute/core/NEON/wrapper/scalar/sub.h"
-
-#endif /* ARM_COMPUTE_WRAPPER_SCALAR_H */
diff --git a/arm_compute/core/NEON/wrapper/scalar/sub.h b/arm_compute/core/NEON/wrapper/scalar/sub.h
deleted file mode 100644
index 5b4cab93d3..0000000000
--- a/arm_compute/core/NEON/wrapper/scalar/sub.h
+++ /dev/null
@@ -1,62 +0,0 @@
-/*
- * Copyright (c) 2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_WRAPPER_SCALAR_SUB_H
-#define ARM_COMPUTE_WRAPPER_SCALAR_SUB_H
-
-#include <arm_neon.h>
-
-namespace arm_compute
-{
-namespace wrapper
-{
-inline uint8_t sub_sat(const uint8_t &a, const uint8_t &b)
-{
- const uint8x8_t va = { a, 0, 0, 0, 0, 0, 0, 0 };
- const uint8x8_t vb = { b, 0, 0, 0, 0, 0, 0, 0 };
- return vget_lane_u8(vqsub_u8(va, vb), 0);
-}
-
-inline int16_t sub_sat(const int16_t &a, const int16_t &b)
-{
- const int16x4_t va = { a, 0, 0, 0 };
- const int16x4_t vb = { b, 0, 0, 0 };
- return vget_lane_s16(vqsub_s16(va, vb), 0);
-}
-
-inline float sub_sat(const float &a, const float &b)
-{
- // No notion of saturation exists in floating point
- return a - b;
-}
-
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-inline float16_t sub_sat(const float16_t &a, const float16_t &b)
-{
- // No notion of saturation exists in floating point
- return a - b;
-}
-#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-} // namespace wrapper
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_WRAPPER_SCALAR_SUB_H */
diff --git a/arm_compute/core/NEON/wrapper/traits.h b/arm_compute/core/NEON/wrapper/traits.h
deleted file mode 100644
index ae77d2778c..0000000000
--- a/arm_compute/core/NEON/wrapper/traits.h
+++ /dev/null
@@ -1,140 +0,0 @@
-/*
- * Copyright (c) 2018-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_WRAPPER_TRAITS_H
-#define ARM_COMPUTE_WRAPPER_TRAITS_H
-
-#include <arm_neon.h>
-
-namespace arm_compute
-{
-namespace wrapper
-{
-namespace traits
-{
-// *INDENT-OFF*
-// clang-format off
-
-/** 64-bit vector tag */
-struct vector_64_tag {};
-/** 128-bit vector tag */
-struct vector_128_tag {};
-
-/** Create the appropriate NEON vector given its type and size in terms of elements */
-template <typename T, int S> struct neon_vector;
-
-// Specializations
-#ifndef DOXYGEN_SKIP_THIS
-template <> struct neon_vector<uint8_t, 8>{ using scalar_type = uint8_t; using type = uint8x8_t; using tag_type = vector_64_tag; };
-template <> struct neon_vector<int8_t, 8>{ using scalar_type = int8_t; using type = int8x8_t; using tag_type = vector_64_tag; };
-template <> struct neon_vector<uint8_t, 16>{ using scalar_type = uint8_t; using type = uint8x16_t; using tag_type = vector_128_tag; };
-template <> struct neon_vector<int8_t, 16>{ using scalar_type = int8_t; using type = int8x16_t; using tag_type = vector_128_tag; };
-template <> struct neon_vector<uint16_t, 4>{ using scalar_type = uint16_t; using type = uint16x4_t; using tag_type = vector_64_tag; };
-template <> struct neon_vector<int16_t, 4>{ using scalar_type = int16_t; using type = int16x4_t; using tag_type = vector_64_tag; };
-template <> struct neon_vector<uint16_t, 8>{ using scalar_type = uint16_t; using type = uint16x8_t; using tag_type = vector_128_tag; };
-template <> struct neon_vector<uint16_t, 16>{ using scalar_type = uint16_t; using type = uint16x8x2_t; };
-template <> struct neon_vector<int16_t, 8>{ using scalar_type = int16_t; using type = int16x8_t; using tag_type = vector_128_tag; };
-template <> struct neon_vector<int16_t, 16>{ using scalar_type = int16_t; using type = int16x8x2_t; };
-template <> struct neon_vector<uint32_t, 2>{ using scalar_type = uint32_t; using type = uint32x2_t; using tag_type = vector_64_tag; };
-template <> struct neon_vector<int32_t, 2>{ using scalar_type = int32_t; using type = int32x2_t; using tag_type = vector_64_tag; };
-template <> struct neon_vector<uint32_t, 4>{ using scalar_type = uint32_t; using type = uint32x4_t; using tag_type = vector_128_tag; };
-template <> struct neon_vector<int32_t, 4>{ using scalar_type = int32_t; using type = int32x4_t; using tag_type = vector_128_tag; };
-template <> struct neon_vector<uint64_t, 1>{ using scalar_type = uint64_t;using type = uint64x1_t; using tag_type = vector_64_tag; };
-template <> struct neon_vector<int64_t, 1>{ using scalar_type = int64_t; using type = int64x1_t; using tag_type = vector_64_tag; };
-template <> struct neon_vector<uint64_t, 2>{ using scalar_type = uint64_t; using type = uint64x2_t; using tag_type = vector_128_tag; };
-template <> struct neon_vector<int64_t, 2>{ using scalar_type = int64_t; using type = int64x2_t; using tag_type = vector_128_tag; };
-template <> struct neon_vector<float_t, 2>{ using scalar_type = float_t; using type = float32x2_t; using tag_type = vector_64_tag; };
-template <> struct neon_vector<float_t, 4>{ using scalar_type = float_t; using type = float32x4_t; using tag_type = vector_128_tag; };
-
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-template <> struct neon_vector<float16_t, 4>{ using scalar_type = float16_t; using type = float16x4_t; using tag_type = vector_64_tag; };
-template <> struct neon_vector<float16_t, 8>{ using scalar_type = float16_t; using type = float16x8_t; using tag_type = vector_128_tag; };
-#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-#endif /* DOXYGEN_SKIP_THIS */
-
-/** Helper type template to get the type of a neon vector */
-template <typename T, int S> using neon_vector_t = typename neon_vector<T, S>::type;
-/** Helper type template to get the tag type of a neon vector */
-template <typename T, int S> using neon_vector_tag_t = typename neon_vector<T, S>::tag_type;
-
-/** Vector bit-width enum class */
-enum class BitWidth
-{
- W64, /**< 64-bit width */
- W128, /**< 128-bit width */
-};
-
-/** Create the appropriate NEON vector given its type and size in terms of bits */
-template <typename T, BitWidth BW> struct neon_bitvector;
-// Specializations
-#ifndef DOXYGEN_SKIP_THIS
-template <> struct neon_bitvector<uint8_t, BitWidth::W64>{ using type = uint8x8_t; using tag_type = vector_64_tag; };
-template <> struct neon_bitvector<int8_t, BitWidth::W64>{ using type = int8x8_t; using tag_type = vector_64_tag; };
-template <> struct neon_bitvector<uint8_t, BitWidth::W128>{ using type = uint8x16_t; using tag_type = vector_128_tag; };
-template <> struct neon_bitvector<int8_t, BitWidth::W128>{ using type = int8x16_t; using tag_type = vector_128_tag; };
-template <> struct neon_bitvector<uint16_t, BitWidth::W64>{ using type = uint16x4_t; using tag_type = vector_64_tag; };
-template <> struct neon_bitvector<int16_t, BitWidth::W64>{ using type = int16x4_t; using tag_type = vector_64_tag; };
-template <> struct neon_bitvector<uint16_t, BitWidth::W128>{ using type = uint16x8_t; using tag_type = vector_128_tag; };
-template <> struct neon_bitvector<int16_t, BitWidth::W128>{ using type = int16x8_t; using tag_type = vector_128_tag; };
-template <> struct neon_bitvector<uint32_t, BitWidth::W64>{ using type = uint32x2_t; using tag_type = vector_64_tag; };
-template <> struct neon_bitvector<int32_t, BitWidth::W64>{ using type = int32x2_t; using tag_type = vector_64_tag; };
-template <> struct neon_bitvector<uint32_t, BitWidth::W128>{ using type = uint32x4_t; using tag_type = vector_128_tag; };
-template <> struct neon_bitvector<int32_t, BitWidth::W128>{ using type = int32x4_t; using tag_type = vector_128_tag; };
-template <> struct neon_bitvector<uint64_t, BitWidth::W64>{ using type = uint64x1_t; using tag_type = vector_64_tag; };
-template <> struct neon_bitvector<int64_t, BitWidth::W64>{ using type = int64x1_t; using tag_type = vector_64_tag; };
-template <> struct neon_bitvector<uint64_t, BitWidth::W128>{ using type = uint64x2_t; using tag_type = vector_128_tag; };
-template <> struct neon_bitvector<int64_t, BitWidth::W128>{ using type = int64x2_t; using tag_type = vector_128_tag; };
-template <> struct neon_bitvector<float_t, BitWidth::W64>{ using type = float32x2_t; using tag_type = vector_64_tag; };
-template <> struct neon_bitvector<float_t, BitWidth::W128>{ using type = float32x4_t; using tag_type = vector_128_tag; };
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-template <> struct neon_bitvector<float16_t, BitWidth::W64>{ using type = float16x4_t; using tag_type = vector_64_tag; };
-template <> struct neon_bitvector<float16_t, BitWidth::W128>{ using type = float16x8_t; using tag_type = vector_128_tag; };
-#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-#endif /* DOXYGEN_SKIP_THIS */
-
-/** Helper type template to get the type of a neon vector */
-template <typename T, BitWidth BW> using neon_bitvector_t = typename neon_bitvector<T, BW>::type;
-/** Helper type template to get the tag type of a neon vector */
-template <typename T, BitWidth BW> using neon_bitvector_tag_t = typename neon_bitvector<T, BW>::tag_type;
-
-/** Promote a type */
-template <typename T> struct promote { };
-template <> struct promote<uint8_t> { using type = uint16_t; };
-template <> struct promote<int8_t> { using type = int16_t; };
-template <> struct promote<uint16_t> { using type = uint32_t; };
-template <> struct promote<int16_t> { using type = int32_t; };
-template <> struct promote<uint32_t> { using type = uint64_t; };
-template <> struct promote<int32_t> { using type = int64_t; };
-template <> struct promote<float> { using type = float; };
-template <> struct promote<half> { using type = half; };
-
-/** Get promoted type */
-template <typename T>
-using promote_t = typename promote<T>::type;
-
-// clang-format on
-// *INDENT-ON*
-} // namespace traits
-} // namespace wrapper
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_WRAPPER_TRAITS_H */
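A short sketch of what the removed traits provided (assertions grounded in the specializations above):

    #include "arm_compute/core/NEON/wrapper/traits.h"
    #include <type_traits>

    // Map element type + lane count to the matching NEON vector type...
    using u8x16 = arm_compute::wrapper::traits::neon_vector_t<uint8_t, 16>;
    static_assert(std::is_same<u8x16, uint8x16_t>::value, "resolves to uint8x16_t");

    // ...and widen accumulator types without spelling them out per kernel.
    static_assert(std::is_same<arm_compute::wrapper::traits::promote_t<uint8_t>, uint16_t>::value,
                  "u8 promotes to u16");
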
diff --git a/arm_compute/core/NEON/wrapper/wrapper.h b/arm_compute/core/NEON/wrapper/wrapper.h
deleted file mode 100644
index 99a5909e8b..0000000000
--- a/arm_compute/core/NEON/wrapper/wrapper.h
+++ /dev/null
@@ -1,34 +0,0 @@
-/*
- * Copyright (c) 2018-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_WRAPPER_H
-#define ARM_COMPUTE_WRAPPER_H
-
-// Traits
-#include "arm_compute/core/NEON/wrapper/traits.h"
-
-// Intrinsics Overloads
-#include "arm_compute/core/NEON/wrapper/intrinsics/intrinsics.h"
-#include "arm_compute/core/NEON/wrapper/scalar/scalar.h"
-
-#endif /* ARM_COMPUTE_WRAPPER_H */
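The umbrella header enabled type-generic kernels like the following sketch (vloadq comes from the intrinsics overload set in load.h, which is not shown in this excerpt; the function name is illustrative):

    #include "arm_compute/core/NEON/wrapper/wrapper.h"

    // One template instantiates per element type; overload resolution picks
    // the right ld1/sub/st1 variants from T.
    template <typename T>
    void sub_one_register(const T *a, const T *b, T *out)
    {
        const auto va = arm_compute::wrapper::vloadq(a);
        const auto vb = arm_compute::wrapper::vloadq(b);
        arm_compute::wrapper::vstore(out, arm_compute::wrapper::vsub(va, vb));
    }
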
diff --git a/arm_compute/core/PixelValue.h b/arm_compute/core/PixelValue.h
index 337ccbc3f7..0b4df4f2e2 100644
--- a/arm_compute/core/PixelValue.h
+++ b/arm_compute/core/PixelValue.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2016-2020 ARM Limited.
+ * Copyright (c) 2016-2021, 2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -24,6 +24,7 @@
#ifndef ARM_COMPUTE_PIXELVALUE_H
#define ARM_COMPUTE_PIXELVALUE_H
+#include "arm_compute/core/QuantizationInfo.h"
#include "arm_compute/core/Types.h"
#include <cstdint>
@@ -35,8 +36,7 @@ class PixelValue
{
public:
/** Default constructor: value initialized to 0 */
- PixelValue()
- : value{ int64_t(0) }
+ PixelValue() noexcept : value{int64_t(0)}
{
}
/** Initialize the union with a pixel value of chosen datatype
@@ -45,10 +45,9 @@ public:
* @param[in] datatype DataType in which @p v has to be stored
* @param[in] qinfo (Optional) QuantizationInfo to apply in case of quantized data types to @p v
*/
- PixelValue(double v, DataType datatype, QuantizationInfo qinfo = QuantizationInfo())
- : PixelValue()
+ PixelValue(double v, DataType datatype, QuantizationInfo qinfo = QuantizationInfo()) : PixelValue()
{
- switch(datatype)
+ switch (datatype)
{
case DataType::U8:
value.u8 = static_cast<uint8_t>(v);
@@ -108,8 +107,7 @@ public:
*
* @param[in] v S8 value.
*/
- PixelValue(int8_t v)
- : PixelValue()
+ PixelValue(int8_t v) : PixelValue()
{
value.s8 = v;
}
@@ -117,8 +115,7 @@ public:
*
* @param[in] v U8 value.
*/
- PixelValue(uint8_t v)
- : PixelValue()
+ PixelValue(uint8_t v) : PixelValue()
{
value.u8 = v;
}
@@ -126,8 +123,7 @@ public:
*
* @param[in] v U16 value.
*/
- PixelValue(uint16_t v)
- : PixelValue()
+ PixelValue(uint16_t v) : PixelValue()
{
value.u16 = v;
}
@@ -135,8 +131,7 @@ public:
*
* @param[in] v S16 value.
*/
- PixelValue(int16_t v)
- : PixelValue()
+ PixelValue(int16_t v) : PixelValue()
{
value.s16 = v;
}
@@ -144,8 +139,7 @@ public:
*
* @param[in] v U32 value.
*/
- PixelValue(uint32_t v)
- : PixelValue()
+ PixelValue(uint32_t v) : PixelValue()
{
value.u32 = v;
}
@@ -153,8 +147,7 @@ public:
*
* @param[in] v S32 value.
*/
- PixelValue(int32_t v)
- : PixelValue()
+ PixelValue(int32_t v) : PixelValue()
{
value.s32 = v;
}
@@ -163,8 +156,7 @@ public:
*
* @param[in] v U64 value.
*/
- PixelValue(uint64_t v)
- : PixelValue()
+ PixelValue(uint64_t v) : PixelValue()
{
value.u64 = v;
}
@@ -172,8 +164,7 @@ public:
*
* @param[in] v S64 value.
*/
- PixelValue(int64_t v)
- : PixelValue()
+ PixelValue(int64_t v) : PixelValue()
{
value.s64 = v;
}
@@ -181,8 +172,7 @@ public:
*
* @param[in] v BF16 value.
*/
- PixelValue(bfloat16 v)
- : PixelValue()
+ PixelValue(bfloat16 v) : PixelValue()
{
value.bf16 = v;
}
@@ -190,8 +180,7 @@ public:
*
* @param[in] v F16 value.
*/
- PixelValue(half v)
- : PixelValue()
+ PixelValue(half v) : PixelValue()
{
value.f16 = v;
}
@@ -199,8 +188,7 @@ public:
*
* @param[in] v F32 value.
*/
- PixelValue(float v)
- : PixelValue()
+ PixelValue(float v) : PixelValue()
{
value.f32 = v;
}
@@ -208,8 +196,7 @@ public:
*
* @param[in] v F64 value.
*/
- PixelValue(double v)
- : PixelValue()
+ PixelValue(double v) : PixelValue()
{
value.f64 = v;
}
@@ -217,23 +204,23 @@ public:
* Use the field corresponding to the image format
*/
union
- {
- uint64_t u64; /**< Single channel U64 */
- int64_t s64; /**< Single channel S64 */
- uint8_t rgb[3]; /**< 3 channels: RGB888 */
- uint8_t yuv[3]; /**< 3 channels: Any YUV format */
- uint8_t rgbx[4]; /**< 4 channels: RGBX8888 */
- double f64; /**< Single channel double */
- float f32; /**< Single channel float 32 */
- half f16; /**< Single channel F16 */
- bfloat16 bf16; /**< Single channel brain floating-point number */
- uint8_t u8; /**< Single channel U8 */
- int8_t s8; /**< Single channel S8 */
- uint16_t u16; /**< Single channel U16 */
- int16_t s16; /**< Single channel S16 */
- uint32_t u32; /**< Single channel U32 */
- int32_t s32; /**< Single channel S32 */
- } value;
+ {
+ uint64_t u64; /**< Single channel U64 */
+ int64_t s64; /**< Single channel S64 */
+ uint8_t rgb[3]; /**< 3 channels: RGB888 */
+ uint8_t yuv[3]; /**< 3 channels: Any YUV format */
+ uint8_t rgbx[4]; /**< 4 channels: RGBX8888 */
+ double f64; /**< Single channel double */
+ float f32; /**< Single channel float 32 */
+ half f16; /**< Single channel F16 */
+ bfloat16 bf16; /**< Single channel brain floating-point number */
+ uint8_t u8; /**< Single channel U8 */
+ int8_t s8; /**< Single channel S8 */
+ uint16_t u16; /**< Single channel U16 */
+ int16_t s16; /**< Single channel S16 */
+ uint32_t u32; /**< Single channel U32 */
+ int32_t s32; /**< Single channel S32 */
+ } value;
/** Interpret the pixel value as a U8
*
* @param[out] v Returned value
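A minimal sketch of the constructors and accessor documented above (illustrative names):

    #include "arm_compute/core/PixelValue.h"

    uint8_t pixel_demo()
    {
        // U8 branch of the quantizing constructor: v is cast straight to uint8_t.
        arm_compute::PixelValue p(200.0, arm_compute::DataType::U8);
        uint8_t v = 0;
        p.get(v); // interpret the union as U8
        return v; // 200
    }
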
diff --git a/arm_compute/core/PyramidInfo.h b/arm_compute/core/PyramidInfo.h
deleted file mode 100644
index e8cbe3488a..0000000000
--- a/arm_compute/core/PyramidInfo.h
+++ /dev/null
@@ -1,131 +0,0 @@
-/*
- * Copyright (c) 2016-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_PYRAMIDINFO_H
-#define ARM_COMPUTE_PYRAMIDINFO_H
-
-#include "arm_compute/core/TensorShape.h"
-#include "arm_compute/core/Types.h"
-
-#include <cstddef>
-
-namespace arm_compute
-{
-/** Store the Pyramid's metadata */
-class PyramidInfo
-{
-public:
- /** Default constructor */
- PyramidInfo();
- /** Default destructor */
- virtual ~PyramidInfo() = default;
- /** Allow instances of this class to be copy constructed */
- PyramidInfo(const PyramidInfo &) = default;
- /** Allow instances of this class to be copied */
- PyramidInfo &operator=(const PyramidInfo &) = default;
- /** Allow instances of this class to be move constructed */
- PyramidInfo(PyramidInfo &&) = default;
- /** Allow instances of this class to be moved */
- PyramidInfo &operator=(PyramidInfo &&) = default;
-
- /** Create pyramid info for 2D tensors
- *
- * @param[in] num_levels The number of pyramid levels. This is required to be a non-zero value
- * @param[in] scale Used to indicate the scale between the pyramid levels.
- * This is required to be a non-zero positive value.
- * @param[in] width The width of the 2D tensor at 0th pyramid level
- * @param[in] height The height of the 2D tensor at 0th pyramid level
- * @param[in] format The format of all 2D tensors in the pyramid
- * NV12, NV21, IYUV, UYVY and YUYV formats are not supported.
- */
- PyramidInfo(size_t num_levels, float scale, size_t width, size_t height, Format format);
-
- /** Create pyramid info using TensorShape
- *
- * @param[in] num_levels The number of pyramid levels. This is required to be a non-zero value
- * @param[in] scale Used to indicate the scale between the pyramid levels.
- * This is required to be a non-zero positive value.
- * @param[in] tensor_shape It specifies the size for each dimension of the tensor at the 0th pyramid level, in number of elements
- * @param[in] format The format of all tensors in the pyramid
- */
- PyramidInfo(size_t num_levels, float scale, const TensorShape &tensor_shape, Format format);
-
- /** Initialize pyramid's metadata for 2D tensors
- *
- * @param[in] num_levels The number of pyramid levels. This is required to be a non-zero value
- * @param[in] scale Used to indicate the scale between the pyramid levels.
- * This is required to be a non-zero positive value.
- * @param[in] width The width of the 2D tensor at 0th pyramid level
- * @param[in] height The height of the 2D tensor at 0th pyramid level
- * @param[in] format The format of all 2D tensors in the pyramid
- * NV12, NV21, IYUV, UYVY and YUYV formats are not supported.
- */
- void init(size_t num_levels, float scale, size_t width, size_t height, Format format);
- /** Initialize pyramid's metadata using TensorShape
- *
- * @param[in] num_levels The number of pyramid levels. This is required to be a non-zero value
- * @param[in] scale Used to indicate the scale between the pyramid levels.
- * This is required to be a non-zero positive value.
- * @param[in] tensor_shape It specifies the size for each dimension of the tensor at the 0th pyramid level, in number of elements
- * @param[in] format The format of all tensors in the pyramid
- */
- void init(size_t num_levels, float scale, const TensorShape &tensor_shape, Format format);
- /** Return the number of the pyramid levels
- *
- * @return The number of the pyramid levels
- */
- size_t num_levels() const;
- /** Return the width of the 0th level tensor
- *
- * @return The width of the 0th level tensor
- */
- size_t width() const;
- /** Return the height of the 0th level tensor
- *
- * @return The height of the 0th level tensor
- */
- size_t height() const;
- /** Return the TensorShape of the 0th level tensor
- *
- * @return The TensorShape of the 0th level tensor
- */
- const TensorShape &tensor_shape() const;
- /** Return the image format of all tensors in the pyramid
- *
- * @return The image format
- */
- Format format() const;
- /** Return the scale factor of the pyramid
- *
- * @return The scale factor of the pyramid
- */
- float scale() const;
-
-private:
- size_t _num_levels;
- TensorShape _tensor_shape;
- Format _format;
- float _scale;
-};
-}
-#endif /*ARM_COMPUTE_PYRAMIDINFO_H */
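For reference, constructing the removed PyramidInfo looked like this (values are illustrative):

    #include "arm_compute/core/PyramidInfo.h"

    // Four levels at half scale: 640x480, 320x240, 160x120 and 80x60, all U8.
    arm_compute::PyramidInfo info(4, 0.5f, 640, 480, arm_compute::Format::U8);
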
diff --git a/arm_compute/core/QuantizationInfo.h b/arm_compute/core/QuantizationInfo.h
index 52ef149e9b..aecba3712e 100644
--- a/arm_compute/core/QuantizationInfo.h
+++ b/arm_compute/core/QuantizationInfo.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019-2020 ARM Limited.
+ * Copyright (c) 2019-2024 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -21,15 +21,14 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
-#ifndef ARM_COMPUTE_QUANTIZATION_INFO_H
-#define ARM_COMPUTE_QUANTIZATION_INFO_H
+#ifndef ACL_ARM_COMPUTE_CORE_QUANTIZATIONINFO_H
+#define ACL_ARM_COMPUTE_CORE_QUANTIZATIONINFO_H
#include "arm_compute/core/Rounding.h"
-#include "utils/misc/Utility.h"
-#include "arm_compute/core/Error.h"
+#include "arm_compute/core/utils/misc/Utility.h"
+
+#include "support/ToolchainSupport.h"
-#include <cstddef>
-#include <type_traits>
#include <vector>
namespace arm_compute
@@ -43,8 +42,7 @@ using qasymm16_t = uint16_t; /**< 16 bit quantized asymmetric scalar value
struct UniformQuantizationInfo
{
/** Default constructor */
- UniformQuantizationInfo()
- : scale(0.f), offset(0)
+ UniformQuantizationInfo() : scale(0.f), offset(0)
{
}
/** Constructor
@@ -52,8 +50,7 @@ struct UniformQuantizationInfo
* @param[in] scale Quantization scale
* @param[in] offset Quantization offset
*/
- UniformQuantizationInfo(float scale, int32_t offset)
- : scale(scale), offset(offset)
+ UniformQuantizationInfo(float scale, int32_t offset) : scale(scale), offset(offset)
{
}
/** Checks if the scale and offset are both zero */
@@ -71,9 +68,7 @@ class QuantizationInfo
{
public:
/** Default constructor */
- QuantizationInfo() noexcept
- : _scale(),
- _offset()
+ QuantizationInfo() noexcept : _scale(), _offset()
{
}
/** Construct quantization info.
@@ -82,19 +77,19 @@ public:
*
* @param[in] scale Scale.
*/
- QuantizationInfo(float scale)
- : _scale(1, scale), _offset()
+ QuantizationInfo(float scale) : _scale(1, scale), _offset()
{
}
/** Construct quantization info.
*
* @note Used for asymmetric quantization
*
- * @param[in] scale Scale.
- * @param[in] offset Offset.
+ * @param[in] scale Scale.
+ * @param[in] offset Offset.
+ * @param[in] is_dynamic Whether this QuantizationInfo is dynamic, i.e. the scale and offset may change.
*/
- QuantizationInfo(float scale, int offset)
- : _scale(1, scale), _offset(1, offset)
+ QuantizationInfo(float scale, int offset, bool is_dynamic = false)
+ : _scale(1, scale), _offset(1, offset), _is_dynamic(is_dynamic)
{
}
/** Construct quantization info.
@@ -103,19 +98,19 @@ public:
*
* @param[in] scale Scale.
*/
- QuantizationInfo(std::vector<float> scale)
- : _scale(scale), _offset()
+ QuantizationInfo(std::vector<float> scale) : _scale(scale), _offset()
{
}
/** Construct quantization info.
*
* @note Used for asymmetric per channel quantization
*
- * @param[in] scale Scale.
- * @param[in] offset Offset.
+ * @param[in] scale Scale.
+ * @param[in] offset Offset.
+ * @param[in] is_dynamic Whether this QuantizationInfo is dynamic, i.e. the scale and offset may change.
*/
- QuantizationInfo(std::vector<float> scale, std::vector<int32_t> offset)
- : _scale(scale), _offset(offset)
+ QuantizationInfo(std::vector<float> scale, std::vector<int32_t> offset, bool is_dynamic = false)
+ : _scale(scale), _offset(offset), _is_dynamic(is_dynamic)
{
}
/** Scale vector accessor
@@ -134,6 +129,14 @@ public:
{
return _offset;
}
+ /** is_dynamic accessor
+ *
+ * @return If true, the scale and offset may change, so operators will need to re-read them on every run
+ */
+ bool is_dynamic() const
+ {
+ return _is_dynamic;
+ }
/** Indicates whether this QuantizationInfo has valid settings or not
*
* @return True if this QuantizationInfo has invalid settings.
@@ -158,6 +161,8 @@ public:
private:
std::vector<float> _scale; /**< Vector containing scaling factors */
std::vector<int32_t> _offset; /**< Vector containing zero offsets */
+ bool _is_dynamic =
+ false; /**< If true, the scale and offset may change, so operators will need to re-read them on every run */
};
/** Check whether two quantization info are equal.
@@ -210,20 +215,39 @@ inline bool operator!=(const UniformQuantizationInfo &lhs, const UniformQuantiza
template <typename QUANTIZED_TYPE = uint8_t>
struct Qasymm8QuantizationHelper
{
- static_assert(std::is_same<QUANTIZED_TYPE, uint8_t>::value
- || std::is_same<QUANTIZED_TYPE, int8_t>::value,
+ static_assert(std::is_same<QUANTIZED_TYPE, uint8_t>::value || std::is_same<QUANTIZED_TYPE, int8_t>::value,
"quantized type should be either uint8_t or int8_t.");
/** Quantize a value given a 8-bit asymmetric quantization scheme
*
+ * @param[in] value Value to quantize
+ * @param[in] qinfo Quantization information to use for quantizing
+ *
+ * @return Quantized value
+ */
+ static inline QUANTIZED_TYPE quantize(float value, const UniformQuantizationInfo &qinfo)
+ {
+ ARM_COMPUTE_ERROR_ON(qinfo.scale == 0);
+ const int quantized = support::cpp11::lround(value / qinfo.scale) + qinfo.offset;
+ return static_cast<QUANTIZED_TYPE>(arm_compute::utility::clamp<decltype(quantized), QUANTIZED_TYPE>(quantized));
+ }
+
+ /** Quantize a value given a 8-bit asymmetric quantization scheme using a specific rounding policy
+ *
* @param[in] value Value to quantize
* @param[in] qinfo Quantization information to use for quantizing
- * @param[in] rounding_policy (Optional) Rounding policy to use. Default: nearest up
+ * @param[in] rounding_policy Rounding policy to use
*
* @return Quantized value
*/
- static inline QUANTIZED_TYPE quantize(float value, const UniformQuantizationInfo &qinfo, RoundingPolicy rounding_policy = RoundingPolicy::TO_NEAREST_UP)
+ static inline QUANTIZED_TYPE
+ quantize(float value, const UniformQuantizationInfo &qinfo, RoundingPolicy rounding_policy)
{
+ if (rounding_policy == RoundingPolicy::TO_NEAREST_UP)
+ {
+ return quantize(value, qinfo);
+ }
+
ARM_COMPUTE_ERROR_ON(qinfo.scale == 0);
const int quantized = arm_compute::round(value / qinfo.scale, rounding_policy) + qinfo.offset;
return static_cast<QUANTIZED_TYPE>(arm_compute::utility::clamp<decltype(quantized), QUANTIZED_TYPE>(quantized));
@@ -237,7 +261,8 @@ struct Qasymm8QuantizationHelper
*
* @return Quantized value
*/
- static inline QUANTIZED_TYPE quantize(float value, const QuantizationInfo &qinfo, RoundingPolicy rounding_policy = RoundingPolicy::TO_NEAREST_UP)
+ static inline QUANTIZED_TYPE
+ quantize(float value, const QuantizationInfo &qinfo, RoundingPolicy rounding_policy = RoundingPolicy::TO_NEAREST_UP)
{
const UniformQuantizationInfo uqinfo = qinfo.uniform();
ARM_COMPUTE_ERROR_ON(uqinfo.scale == 0);
@@ -280,7 +305,8 @@ struct Qasymm8QuantizationHelper
* @return Quantized value
*/
template <typename INFO_TYPE>
-inline uint8_t quantize_qasymm8(float value, const INFO_TYPE &qinfo, RoundingPolicy rounding_policy = RoundingPolicy::TO_NEAREST_UP)
+inline uint8_t
+quantize_qasymm8(float value, const INFO_TYPE &qinfo, RoundingPolicy rounding_policy = RoundingPolicy::TO_NEAREST_UP)
{
return Qasymm8QuantizationHelper<uint8_t>::quantize(value, qinfo, rounding_policy);
}
@@ -294,7 +320,9 @@ inline uint8_t quantize_qasymm8(float value, const INFO_TYPE &qinfo, RoundingPol
* @return Quantized value
*/
template <typename INFO_TYPE>
-inline int8_t quantize_qasymm8_signed(float value, const INFO_TYPE &qinfo, RoundingPolicy rounding_policy = RoundingPolicy::TO_NEAREST_UP)
+inline int8_t quantize_qasymm8_signed(float value,
+ const INFO_TYPE &qinfo,
+ RoundingPolicy rounding_policy = RoundingPolicy::TO_NEAREST_UP)
{
return Qasymm8QuantizationHelper<int8_t>::quantize(value, qinfo, rounding_policy);
}
@@ -416,6 +444,19 @@ inline float dequantize(uint16_t value, float scale, int32_t offset)
return (static_cast<int>(value) - offset) * scale;
}
+/** Dequantize a value given a 32-bit asymmetric quantization scheme
+ *
+ * @param[in] value Value to dequantize
+ * @param[in] scale Scale to use for dequantization
+ * @param[in] offset Zero-offset to use for dequantization
+ *
+ * @return Dequantized value
+ */
+inline float dequantize(int32_t value, float scale, int32_t offset)
+{
+ return (static_cast<int>(value) - offset) * scale;
+}
+
/** Quantize a value given a 16-bit symmetric quantization scheme
*
* @param[in] value Value to quantize
@@ -424,7 +465,9 @@ inline float dequantize(uint16_t value, float scale, int32_t offset)
*
* @return Quantized value
*/
-inline int16_t quantize_qsymm16(float value, const UniformQuantizationInfo &qinfo, RoundingPolicy rounding_policy = RoundingPolicy::TO_NEAREST_UP)
+inline int16_t quantize_qsymm16(float value,
+ const UniformQuantizationInfo &qinfo,
+ RoundingPolicy rounding_policy = RoundingPolicy::TO_NEAREST_UP)
{
int quantized = arm_compute::round(value / qinfo.scale, rounding_policy);
quantized = arm_compute::utility::clamp<int, int16_t>(quantized);
@@ -475,7 +518,9 @@ inline float dequantize_qsymm16(int16_t value, const QuantizationInfo &qinfo)
*
* @return Quantized value
*/
-inline uint16_t quantize_qasymm16(float value, const UniformQuantizationInfo &qinfo, RoundingPolicy rounding_policy = RoundingPolicy::TO_NEAREST_UP)
+inline uint16_t quantize_qasymm16(float value,
+ const UniformQuantizationInfo &qinfo,
+ RoundingPolicy rounding_policy = RoundingPolicy::TO_NEAREST_UP)
{
int quantized = arm_compute::round(value / qinfo.scale, rounding_policy) + qinfo.offset;
quantized = arm_compute::utility::clamp<int, uint16_t>(quantized);
@@ -518,6 +563,31 @@ inline float dequantize_qasymm16(uint16_t value, const QuantizationInfo &qinfo)
return dequantize_qasymm16(value, qinfo.uniform());
}
+/** Dequantize a value given a 32-bit asymmetric quantization scheme
+ *
+ * @param[in] value Value to dequantize
+ * @param[in] qinfo Quantization information to use for dequantizing
+ *
+ * @return Dequantized value
+ */
+inline float dequantize_s32(int32_t value, const UniformQuantizationInfo &qinfo)
+{
+ return (static_cast<int>(value) - qinfo.offset) * qinfo.scale;
+}
+
+/** Dequantize a value given a 32-bit asymmetric quantization scheme
+ *
+ * @param[in] value Value to dequantize
+ * @param[in] qinfo Quantization information to use for dequantizing
+ *
+ * @return Dequantized value
+ */
+inline float dequantize_s32(int32_t value, const QuantizationInfo &qinfo)
+{
+ return dequantize_s32(value, qinfo.uniform());
+}
+
/*
* In case of requantization of a quantized input tensor to an output tensor with another quantization,
* instead of applying a dequantization and then a quantization function, we just compute new scale and
@@ -548,7 +618,8 @@ inline float dequantize_qasymm16(uint16_t value, const QuantizationInfo &qinfo)
* z_n = - z_i * s_i / s_o + z_o
*
*/
-inline UniformQuantizationInfo compute_requantization_scale_offset(const UniformQuantizationInfo &uqinfo_in, const UniformQuantizationInfo &uqinfo_out)
+inline UniformQuantizationInfo compute_requantization_scale_offset(const UniformQuantizationInfo &uqinfo_in,
+ const UniformQuantizationInfo &uqinfo_out)
{
float scale_to_apply = uqinfo_out.scale;
int32_t offset_to_apply = uqinfo_out.offset;
@@ -562,4 +633,4 @@ inline UniformQuantizationInfo compute_requantization_scale_offset(const Uniform
}
} // namespace arm_compute
-#endif /* ARM_COMPUTE_QUANTIZATION_INFO_H */
+#endif // ACL_ARM_COMPUTE_CORE_QUANTIZATIONINFO_H
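For reference, a minimal sketch of how the reworked quantization helpers above fit together. It uses only declarations visible in this header; the arithmetic in the comments is worked by hand, and the requantization identity restates the derivation quoted in the header.

    #include "arm_compute/core/QuantizationInfo.h"

    using namespace arm_compute;

    void quantization_sketch()
    {
        // Dynamic quantization: scale/offset may change between runs, so
        // operators re-read them on every run instead of caching them.
        QuantizationInfo qinfo(0.5f, 10, /* is_dynamic */ true);

        // QASYMM8: q = round(value / scale) + offset, clamped to [0, 255].
        uint8_t q = quantize_qasymm8(3.0f, qinfo); // round(3.0 / 0.5) + 10 = 16

        // S32 dequantization added above: x = (q - offset) * scale.
        float x = dequantize_s32(16, qinfo); // (16 - 10) * 0.5 = 3.0

        // Requantization folds dequantize-then-quantize into one affine map:
        // s_n = s_i / s_o, z_n = z_o - z_i * s_i / s_o.
        UniformQuantizationInfo in  = qinfo.uniform();
        UniformQuantizationInfo out(0.25f, 0);
        UniformQuantizationInfo req = compute_requantization_scale_offset(in, out);
        (void)q; (void)x; (void)req;
    }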
diff --git a/arm_compute/core/Rounding.h b/arm_compute/core/Rounding.h
index 68d742907b..30a5a0fe9d 100644
--- a/arm_compute/core/Rounding.h
+++ b/arm_compute/core/Rounding.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2019 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -42,5 +42,5 @@ enum class RoundingPolicy
* @return Rounded value of the argument x.
*/
int round(float x, RoundingPolicy rounding_policy);
-}
+} // namespace arm_compute
#endif /*ARM_COMPUTE_ROUNDING_H */
diff --git a/arm_compute/core/Size2D.h b/arm_compute/core/Size2D.h
index 722d7450f6..672b392050 100644
--- a/arm_compute/core/Size2D.h
+++ b/arm_compute/core/Size2D.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2016-2020 ARM Limited.
+ * Copyright (c) 2016-2021 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -41,8 +41,7 @@ public:
* @param[in] w Width of the image or rectangle
* @param[in] h Height of the image or rectangle
*/
- Size2D(size_t w, size_t h)
- : width(w), height(h)
+ Size2D(size_t w, size_t h) noexcept : width(w), height(h)
{
}
/** The area of the image or rectangle calculated as (width * height)
@@ -89,5 +88,5 @@ public:
size_t width = {}; /**< Width of the image region or rectangle */
size_t height = {}; /**< Height of the image region or rectangle */
};
-}
+} // namespace arm_compute
#endif /*ARM_COMPUTE_SIZE2D_H */
diff --git a/arm_compute/core/Size3D.h b/arm_compute/core/Size3D.h
new file mode 100644
index 0000000000..e2dc6fe012
--- /dev/null
+++ b/arm_compute/core/Size3D.h
@@ -0,0 +1,97 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_SIZE3D_H
+#define ARM_COMPUTE_SIZE3D_H
+
+#include <string>
+
+namespace arm_compute
+{
+/** Class for specifying the size of a 3D shape or object */
+class Size3D
+{
+public:
+ /** Default constructor */
+ Size3D() = default;
+ /** Constructor. Initializes "width", "height" and "depth" respectively with "w", "h" and "d"
+ *
+ * @param[in] w Width of the 3D shape or object
+ * @param[in] h Height of the 3D shape or object
+ * @param[in] d Depth of the 3D shape or object
+ */
+ Size3D(size_t w, size_t h, size_t d) noexcept : width(w), height(h), depth(d)
+ {
+ }
+
+ /** Convert the values stored to string
+ *
+ * @return string of (width x height x depth).
+ */
+ std::string to_string() const;
+
+ /** Semantic accessor for width as x.
+ *
+ * @return x.
+ */
+ size_t x() const
+ {
+ return width;
+ }
+
+ /** Semantic accessor for height as y.
+ *
+ * @return y.
+ */
+ size_t y() const
+ {
+ return height;
+ }
+
+ /** Semantic accessor for depth as z.
+ *
+ * @return z.
+ */
+ size_t z() const
+ {
+ return depth;
+ }
+
+ bool operator!=(const Size3D &other) const
+ {
+ return !(*this == other);
+ }
+
+ bool operator==(const Size3D &other) const
+ {
+ return (width == other.width) && (height == other.height) && (depth == other.depth);
+ }
+
+public:
+ size_t width = {}; /**< Width of the 3D shape or object */
+ size_t height = {}; /**< Height of the 3D shape or object */
+ size_t depth = {}; /**< Depth of the 3D shape or object */
+};
+
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_SIZE3D_H */
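A short sketch of the intended use of the new class, restricted to members declared in this file (to_string() is defined out of line):

    #include "arm_compute/core/Size3D.h"

    void size3d_sketch()
    {
        arm_compute::Size3D kernel(3U, 5U, 7U); // width, height, depth

        // Semantic accessors map width/height/depth onto x/y/z.
        size_t volume = kernel.x() * kernel.y() * kernel.z(); // 3 * 5 * 7 = 105

        // Member-wise comparison via the new operators.
        bool same = (kernel == arm_compute::Size3D(3U, 5U, 7U)); // true
        (void)volume; (void)same;
    }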
diff --git a/arm_compute/core/Steps.h b/arm_compute/core/Steps.h
index 6c89185a1f..6b261becc0 100644
--- a/arm_compute/core/Steps.h
+++ b/arm_compute/core/Steps.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2019 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -45,8 +45,7 @@ public:
* @param[in] steps Values to initialize the steps.
*/
template <typename... Ts>
- Steps(Ts... steps)
- : Dimensions{ steps... }
+ Steps(Ts... steps) : Dimensions{steps...}
{
// Initialize empty dimensions to 1
std::fill(_id.begin() + _num_dimensions, _id.end(), 1);
@@ -62,5 +61,5 @@ public:
/** Default destructor */
~Steps() = default;
};
-}
+} // namespace arm_compute
#endif /*ARM_COMPUTE_STEPS_H*/
diff --git a/arm_compute/core/Strides.h b/arm_compute/core/Strides.h
index a2a73377ea..627b219987 100644
--- a/arm_compute/core/Strides.h
+++ b/arm_compute/core/Strides.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2019, 2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -30,6 +30,7 @@
#include <algorithm>
#include <array>
#include <cstddef>
+#include <cstdint>
namespace arm_compute
{
@@ -42,8 +43,7 @@ public:
* @param[in] strides Values to initialize the strides.
*/
template <typename... Ts>
- constexpr Strides(Ts... strides)
- : Dimensions{ strides... }
+ constexpr Strides(Ts... strides) : Dimensions{strides...}
{
}
/** Allow instances of this class to be copy constructed */
diff --git a/arm_compute/core/SubTensorInfo.h b/arm_compute/core/SubTensorInfo.h
index bcb570ae7f..7a3ee2cfd0 100644
--- a/arm_compute/core/SubTensorInfo.h
+++ b/arm_compute/core/SubTensorInfo.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -24,14 +24,12 @@
#ifndef ARM_COMPUTE_SUBTENSORINFO_H
#define ARM_COMPUTE_SUBTENSORINFO_H
-#include "arm_compute/core/ITensorInfo.h"
-
#include "arm_compute/core/Coordinates.h"
#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/ITensorInfo.h"
#include "arm_compute/core/Strides.h"
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/TensorShape.h"
-#include "arm_compute/core/Validate.h"
#include <cstddef>
#include <memory>
@@ -74,7 +72,7 @@ public:
// Inherited methods overridden:
std::unique_ptr<ITensorInfo> clone() const override;
- ITensorInfo &set_data_type(DataType data_type) override
+ ITensorInfo &set_data_type(DataType data_type) override
{
ARM_COMPUTE_ERROR_ON(_parent == nullptr);
_parent->set_data_type(data_type);
@@ -99,6 +97,7 @@ public:
return *this;
};
ITensorInfo &set_tensor_shape(const TensorShape &shape) override;
+ ITensorInfo &set_tensor_dims_state(const TensorDimsState &state) override;
ITensorInfo &set_quantization_info(const QuantizationInfo &quantization_info) override
{
ARM_COMPUTE_ERROR_ON(_parent == nullptr);
@@ -116,7 +115,13 @@ public:
ARM_COMPUTE_ERROR_ON(_parent == nullptr);
return _parent->auto_padding();
};
+
+ ITensorInfo &set_lock_paddings(bool flag) override;
+
+ bool lock_paddings() const override;
+
bool extend_padding(const PaddingSize &padding) override;
+
size_t dimension(size_t index) const override
{
return _tensor_shape[index];
@@ -137,7 +142,7 @@ public:
return _parent->offset_element_in_bytes(_coords);
}
int32_t offset_element_in_bytes(const Coordinates &pos) const override;
- size_t element_size() const override
+ size_t element_size() const override
{
ARM_COMPUTE_ERROR_ON(_parent == nullptr);
return _parent->element_size();
@@ -156,6 +161,11 @@ public:
ARM_COMPUTE_ERROR_ON(_parent == nullptr);
return _tensor_shape;
}
+ const TensorDimsState &tensor_dims_state() const override
+ {
+ ARM_COMPUTE_ERROR_ON(_parent == nullptr);
+ return _dims_state;
+ }
DataType data_type() const override
{
ARM_COMPUTE_ERROR_ON(_parent == nullptr);
@@ -191,16 +201,21 @@ public:
ARM_COMPUTE_ERROR_ON(_parent == nullptr);
return _parent->is_dynamic();
}
+ bool are_values_constant() const override
+ {
+ ARM_COMPUTE_ERROR_ON(_parent == nullptr);
+ return _parent->are_values_constant();
+ }
ITensorInfo &set_is_resizable(bool is_resizable) override
{
ARM_COMPUTE_ERROR_ON(_parent == nullptr);
_parent->set_is_resizable(is_resizable);
return *this;
}
- ITensorInfo &set_is_dynamic(bool is_dynamic) override
+ ITensorInfo &set_are_values_constant(bool are_values_constant) override
{
ARM_COMPUTE_ERROR_ON(_parent == nullptr);
- _parent->set_is_dynamic(is_dynamic);
+ _parent->set_are_values_constant(are_values_constant);
return *this;
}
ValidRegion valid_region() const override
@@ -211,7 +226,7 @@ public:
{
ARM_COMPUTE_ERROR_ON(_parent == nullptr);
// Check if subtensor is valid if parent is configured
- if(_parent->tensor_shape().total_size() != 0)
+ if (_parent->tensor_shape().total_size() != 0)
{
ARM_COMPUTE_ERROR_ON_INVALID_SUBTENSOR_VALID_REGION(_parent->valid_region(), valid_region);
}
@@ -227,13 +242,26 @@ public:
ARM_COMPUTE_ERROR_ON(_parent == nullptr);
return _parent->data_layout();
}
+ ITensorInfo::Id id() const override
+ {
+ ARM_COMPUTE_ERROR_ON(_parent == nullptr);
+ return _parent->id();
+ }
+ ITensorInfo &set_id(ITensorInfo::Id id) override
+ {
+ ARM_COMPUTE_ERROR_ON(_parent == nullptr);
+ _parent->set_id(id);
+ return *this;
+ }
private:
- ITensorInfo *_parent;
- TensorShape _tensor_shape;
- Coordinates _coords;
- ValidRegion _valid_region;
- bool _extend_parent;
+ ITensorInfo *_parent;
+ TensorShape _tensor_shape;
+ TensorDimsState _dims_state;
+ Coordinates _coords;
+ ValidRegion _valid_region;
+ bool _extend_parent;
+ bool _lock_paddings;
};
} // namespace arm_compute
#endif /*ARM_COMPUTE_SUBTENSORINFO_H */
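To illustrate the delegation pattern above, a hypothetical sketch; the SubTensorInfo constructor and the three-argument TensorInfo constructor are assumed here, as neither signature appears in this hunk:

    #include "arm_compute/core/SubTensorInfo.h"
    #include "arm_compute/core/TensorInfo.h"

    using namespace arm_compute;

    void subtensor_sketch()
    {
        // Assumed constructors (not shown in this diff).
        TensorInfo parent(TensorShape(16U, 16U), 1, DataType::F32);
        SubTensorInfo sub(&parent, TensorShape(8U, 8U), Coordinates(0, 0));

        // Most metadata queries forward to the parent and assert if it is
        // null, including the accessors added above:
        bool constant = sub.are_values_constant(); // parent.are_values_constant()
        ITensorInfo::Id id = sub.id();             // parent.id()
        (void)constant; (void)id;
    }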
diff --git a/arm_compute/core/TensorInfo.h b/arm_compute/core/TensorInfo.h
index 68570d58db..b18f750427 100644
--- a/arm_compute/core/TensorInfo.h
+++ b/arm_compute/core/TensorInfo.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2016-2019 ARM Limited.
+ * Copyright (c) 2016-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -24,23 +24,19 @@
#ifndef ARM_COMPUTE_TENSORINFO_H
#define ARM_COMPUTE_TENSORINFO_H
-#include "arm_compute/core/ITensorInfo.h"
-
-#include "ITensorInfo.h"
#include "arm_compute/core/Coordinates.h"
#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/ITensorInfo.h"
#include "arm_compute/core/Strides.h"
#include "arm_compute/core/TensorShape.h"
#include "arm_compute/core/Types.h"
-#include "arm_compute/core/Utils.h"
+#include "ITensorInfo.h"
#include <cstddef>
#include <memory>
namespace arm_compute
{
-class HOGInfo;
-
/** Store the tensor's metadata */
class TensorInfo final : public ITensorInfo
{
@@ -52,7 +48,7 @@ public:
/** Allow instances of this class to be copy constructed */
TensorInfo(const ITensorInfo &info);
/** Allow instances of this class to be copy constructed */
- TensorInfo(const TensorInfo &) = default;
+ TensorInfo(const TensorInfo &);
/** Allow instances of this class to be copied */
TensorInfo &operator=(const TensorInfo &) = default;
/** Allow instances of this class to be move constructed */
@@ -115,15 +111,10 @@ public:
* @param[in] data_type Data type to use for each tensor element
* @param[in] quantization_info The quantization settings for the tensor data.
*/
- TensorInfo(const TensorShape &tensor_shape, size_t num_channels, DataType data_type, QuantizationInfo quantization_info);
-
- /** Constructor
- *
- * @param[in] hog_info HOG's metadata used to allocate normalized HOG space
- * @param[in] width Width of the 2D tensor where the HOG descriptor will be computed on
- * @param[in] height Height of the 2D tensor where the HOG descriptor will be computed on
- */
- TensorInfo(const HOGInfo &hog_info, unsigned int width, unsigned int height);
+ TensorInfo(const TensorShape &tensor_shape,
+ size_t num_channels,
+ DataType data_type,
+ QuantizationInfo quantization_info);
/** Initialize the tensor info with just a format.
*
@@ -147,7 +138,11 @@ public:
* @param[in] offset_first_element_in_bytes Offset in bytes from the beginning of memory allocation to access the first element.
* @param[in] total_size_in_bytes Size in bytes of the memory allocation (including the offset to the first element).
*/
- void init(const TensorShape &tensor_shape, Format format, const Strides &strides_in_bytes, size_t offset_first_element_in_bytes, size_t total_size_in_bytes);
+ void init(const TensorShape &tensor_shape,
+ Format format,
+ const Strides &strides_in_bytes,
+ size_t offset_first_element_in_bytes,
+ size_t total_size_in_bytes);
/** Initialize the tensor info with just a format.
*
@@ -175,15 +170,12 @@ public:
* @param[in] offset_first_element_in_bytes Offset in bytes from the beginning of memory allocation to access the first element.
* @param[in] total_size_in_bytes Size in bytes of the memory allocation (including the offset to the first element).
*/
- void init(const TensorShape &tensor_shape, size_t num_channels, DataType data_type, const Strides &strides_in_bytes, size_t offset_first_element_in_bytes,
- size_t total_size_in_bytes);
- /** Initialize the metadata structure for the given HOG's metadata
- *
- * @param[in] hog_info HOG's metadata used to allocate normalized HOG space
- * @param[in] width Width of the 2D tensor where the HOG descriptor will be computed on
- * @param[in] height Height of the 2D tensor where the HOG descriptor will be computed on
- */
- void init(const HOGInfo &hog_info, unsigned int width, unsigned int height);
+ void init(const TensorShape &tensor_shape,
+ size_t num_channels,
+ DataType data_type,
+ const Strides &strides_in_bytes,
+ size_t offset_first_element_in_bytes,
+ size_t total_size_in_bytes);
/** Initialize the metadata structure for the given tensor shape and single-plane format, (Padding is automatically calculated)
*
* @note The padding used by this method is really conservative so that the tensor can be used for most functions.
@@ -206,30 +198,22 @@ public:
* @return Total allocation size including padding in bytes.
*/
size_t init_auto_padding(const TensorShape &tensor_shape, size_t num_channels, DataType data_type);
- /** Initialize the metadata structure for the given HOG's metadata
- *
- * @note init_auto_padding will be used for the tensor initialization.
- *
- * @param[in] hog_info HOG's metadata used to allocate normalized HOG space
- * @param[in] width Width of the 2D tensor where the HOG descriptor will be computed on
- * @param[in] height Height of the 2D tensor where the HOG descriptor will be computed on
- *
- * @return Total allocation size including padding in bytes.
- */
- size_t init_auto_padding(const HOGInfo &hog_info, unsigned int width, unsigned int height);
// Inherited methods overridden:
std::unique_ptr<ITensorInfo> clone() const override;
- ITensorInfo &set_data_type(DataType data_type) override;
- ITensorInfo &set_num_channels(int num_channels) override;
- ITensorInfo &set_format(Format format) override;
- ITensorInfo &set_tensor_shape(const TensorShape &shape) override;
- ITensorInfo &set_quantization_info(const QuantizationInfo &quantization_info) override;
- ITensorInfo &set_data_layout(const DataLayout &data_layout) override;
- ITensorInfo &reset_padding() override;
- bool auto_padding() override;
- bool extend_padding(const PaddingSize &padding) override;
- size_t dimension(size_t index) const override
+ ITensorInfo &set_data_type(DataType data_type) override;
+ ITensorInfo &set_num_channels(int num_channels) override;
+ ITensorInfo &set_format(Format format) override;
+ ITensorInfo &set_tensor_shape(const TensorShape &shape) override;
+ ITensorInfo &set_tensor_dims_state(const TensorDimsState &state) override;
+ ITensorInfo &set_quantization_info(const QuantizationInfo &quantization_info) override;
+ ITensorInfo &set_data_layout(const DataLayout &data_layout) override;
+ ITensorInfo &reset_padding() override;
+ bool auto_padding() override;
+ ITensorInfo &set_lock_paddings(bool flag) override;
+ bool lock_paddings() const override;
+ bool extend_padding(const PaddingSize &padding) override;
+ size_t dimension(size_t index) const override
{
return _tensor_shape[index];
}
@@ -246,7 +230,7 @@ public:
return _offset_first_element_in_bytes;
}
int32_t offset_element_in_bytes(const Coordinates &pos) const override;
- size_t element_size() const override
+ size_t element_size() const override
{
return data_size_from_type(_data_type) * _num_channels;
}
@@ -262,6 +246,10 @@ public:
{
return _tensor_shape;
}
+ const TensorDimsState &tensor_dims_state() const override
+ {
+ return _dims_state;
+ }
DataType data_type() const override
{
return _data_type;
@@ -288,16 +276,16 @@ public:
}
bool is_dynamic() const override
{
- return _is_dynamic;
+ return std::find(std::cbegin(_dims_state), std::cend(_dims_state), get_dynamic_state_value()) !=
+ std::cend(_dims_state);
}
- ITensorInfo &set_is_resizable(bool is_resizable) override
+ bool are_values_constant() const override
{
- _is_resizable = is_resizable;
- return *this;
+ return _are_values_constant;
}
- ITensorInfo &set_is_dynamic(bool is_dynamic) override
+ ITensorInfo &set_is_resizable(bool is_resizable) override
{
- _is_dynamic = is_dynamic;
+ _is_resizable = is_resizable;
return *this;
}
ValidRegion valid_region() const override
@@ -316,6 +304,21 @@ public:
{
return _data_layout;
}
+ ITensorInfo &set_are_values_constant(bool are_values_constant) override
+ {
+ _are_values_constant = are_values_constant;
+ return *this;
+ }
+ ITensorInfo::Id id() const override
+ {
+ return _id;
+ }
+ ITensorInfo &set_id(ITensorInfo::Id id) override
+ {
+ _id = id;
+ return *this;
+ }
+ inline friend bool operator==(const TensorInfo &lhs, const TensorInfo &rhs);
private:
/** Calculates strides, offset and total size resulting from the specified padding around the XY plane.
@@ -329,14 +332,37 @@ private:
Strides _strides_in_bytes;
size_t _num_channels;
TensorShape _tensor_shape;
+ TensorDimsState _dims_state;
DataType _data_type;
Format _format;
bool _is_resizable;
- bool _is_dynamic;
ValidRegion _valid_region;
PaddingSize _padding;
QuantizationInfo _quantization_info;
DataLayout _data_layout;
+ bool _are_values_constant;
+ ITensorInfo::Id _id;
+ bool _lock_paddings;
};
+
+/** Check whether two tensor info are equal.
+ *
+ * @param[in] lhs LHS tensor info.
+ * @param[in] rhs RHS tensor info.
+ *
+ * @return True if the given tensor infos are the same.
+ */
+inline bool operator==(const TensorInfo &lhs, const TensorInfo &rhs)
+{
+ return (lhs._total_size == rhs._total_size) &&
+ (lhs._offset_first_element_in_bytes == rhs._offset_first_element_in_bytes) &&
+ (lhs._strides_in_bytes == rhs._strides_in_bytes) && (lhs._num_channels == rhs._num_channels) &&
+ (lhs._tensor_shape == rhs._tensor_shape) && (lhs._dims_state == rhs._dims_state) &&
+ (lhs._data_type == rhs._data_type) && (lhs._format == rhs._format) &&
+ (lhs._is_resizable == rhs._is_resizable) && (lhs._valid_region == rhs._valid_region) &&
+ (lhs._padding == rhs._padding) && (lhs._quantization_info == rhs._quantization_info) &&
+ (lhs._data_layout == rhs._data_layout) && (lhs._are_values_constant == rhs._are_values_constant) &&
+ (lhs._id == rhs._id);
+}
} // namespace arm_compute
#endif /*ARM_COMPUTE_TENSORINFO_H */
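A sketch of the dims-state mechanism that replaces the old _is_dynamic flag. ITensorInfo::TensorDimsState and get_dynamic_state_value() live in ITensorInfo.h; their exact definitions, and the three-argument TensorInfo constructor, are assumptions here:

    #include "arm_compute/core/TensorInfo.h"

    using namespace arm_compute;

    void dynamic_shape_sketch()
    {
        TensorInfo info(TensorShape(32U, 32U), 1, DataType::F32);

        // Mark dimension 0 as dynamic: is_dynamic() now scans _dims_state
        // for the sentinel instead of consulting a standalone boolean.
        ITensorInfo::TensorDimsState state(info.num_dimensions(), 0);
        state[0] = ITensorInfo::get_dynamic_state_value();
        info.set_tensor_dims_state(state);

        bool dyn = info.is_dynamic(); // true

        // The new operator== compares every field, _dims_state included.
        TensorInfo copy = info;
        bool same = (info == copy); // true
        (void)dyn; (void)same;
    }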
diff --git a/arm_compute/core/TensorShape.h b/arm_compute/core/TensorShape.h
index 57d8f6cf63..c1707e262f 100644
--- a/arm_compute/core/TensorShape.h
+++ b/arm_compute/core/TensorShape.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2016-2019 ARM Limited.
+ * Copyright (c) 2016-2021, 2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -36,7 +36,7 @@
namespace arm_compute
{
/** Shape of a tensor */
-class TensorShape : public Dimensions<uint32_t>
+class TensorShape : public Dimensions<size_t>
{
public:
/** Constructor to initialize the tensor shape.
@@ -44,11 +44,10 @@ public:
* @param[in] dims Values to initialize the dimensions.
*/
template <typename... Ts>
- TensorShape(Ts... dims)
- : Dimensions{ dims... }
+ TensorShape(Ts... dims) : Dimensions{dims...}
{
// Initialize unspecified dimensions to 1
- if(_num_dimensions > 0)
+ if (_num_dimensions > 0)
{
std::fill(_id.begin() + _num_dimensions, _id.end(), 1);
}
@@ -71,14 +70,15 @@ public:
*
* @param[in] dimension Dimension for which the value is set.
* @param[in] value Value to be set for the dimension.
- * @param[in] apply_dim_correction Flag to state whether apply dimension correction after setting one dimension. E.g. when permuting NCHW -> NHWC, 1x1x2 would become 2x1x1, but _num_dimensions should be 3 rather than 1.
+ * @param[in] apply_dim_correction (Optional) Flag to state whether apply dimension correction after setting one dimension. E.g. when permuting NCHW -> NHWC, 1x1x2 would become 2x1x1, but _num_dimensions should be 3 rather than 1.
+ * @param[in] increase_dim_unit (Optional) Set to true if new unit dimensions increase the number of dimensions of the shape.
*
* @return *this.
*/
- TensorShape &set(size_t dimension, size_t value, bool apply_dim_correction = true)
+ TensorShape &set(size_t dimension, size_t value, bool apply_dim_correction = true, bool increase_dim_unit = true)
{
// Clear entire shape if one dimension is zero
- if(value == 0)
+ if (value == 0)
{
_num_dimensions = 0;
std::fill(_id.begin(), _id.end(), 0);
@@ -90,10 +90,10 @@ public:
// Set the specified dimension and increase the number of dimensions if
// necessary
- Dimensions::set(dimension, value);
+ Dimensions::set(dimension, value, increase_dim_unit);
// Correct number dimensions to ignore trailing dimensions of size 1
- if(apply_dim_correction)
+ if (apply_dim_correction)
{
apply_dimension_correction();
}
@@ -105,9 +105,10 @@ public:
*
* @note The upper dimensions of the tensor shape will be shifted down by 1
*
- * @param[in] n Dimension to remove
+ * @param[in] n Dimension to remove
+ * @param[in] apply_dim_correction (Optional) Flag to state whether apply dimension correction (removing trailing dimensions with size of 1) after removing a dimension.
*/
- void remove_dimension(size_t n)
+ void remove_dimension(size_t n, bool apply_dim_correction = true)
{
ARM_COMPUTE_ERROR_ON(_num_dimensions < 1);
ARM_COMPUTE_ERROR_ON(n >= _num_dimensions);
@@ -121,7 +122,10 @@ public:
std::fill(_id.begin() + _num_dimensions, _id.end(), 1);
// Correct number dimensions to ignore trailing dimensions of size 1
- apply_dimension_correction();
+ if (apply_dim_correction)
+ {
+ apply_dimension_correction();
+ }
}
/** Collapse the first n dimensions.
@@ -207,26 +211,26 @@ public:
* @return The broadcasted shape or an empty shape if the shapes are not broadcast compatible.
*/
template <typename... Shapes>
- static TensorShape broadcast_shape(const Shapes &... shapes)
+ static TensorShape broadcast_shape(const Shapes &...shapes)
{
TensorShape bc_shape;
- auto broadcast = [&bc_shape](const TensorShape & other)
+ auto broadcast = [&bc_shape](const TensorShape &other)
{
- if(bc_shape.num_dimensions() == 0)
+ if (bc_shape.num_dimensions() == 0)
{
bc_shape = other;
}
- else if(other.num_dimensions() != 0)
+ else if (other.num_dimensions() != 0)
{
- for(size_t d = 0; d < TensorShape::num_max_dimensions; ++d)
+ for (size_t d = 0; d < TensorShape::num_max_dimensions; ++d)
{
const size_t dim_min = std::min(bc_shape[d], other[d]);
const size_t dim_max = std::max(bc_shape[d], other[d]);
- if((dim_min != 1) && (dim_min != dim_max))
+ if ((dim_min != 1) && (dim_min != dim_max))
{
- bc_shape = TensorShape{ 0U };
+ bc_shape = TensorShape{0U};
break;
}
@@ -244,9 +248,9 @@ private:
/** Remove trailing dimensions of size 1 from the reported number of dimensions. */
void apply_dimension_correction()
{
- for(int i = static_cast<int>(_num_dimensions) - 1; i > 0; --i)
+ for (int i = static_cast<int>(_num_dimensions) - 1; i > 0; --i)
{
- if(_id[i] == 1)
+ if (_id[i] == 1)
{
--_num_dimensions;
}
@@ -257,5 +261,5 @@ private:
}
}
};
-}
+} // namespace arm_compute
#endif /*ARM_COMPUTE_TENSORSHAPE_H*/
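A sketch of the updated shape API above; the broadcast result follows the compatibility rule visible in the loop (per dimension, sizes must match or one of them must be 1):

    #include "arm_compute/core/TensorShape.h"

    using namespace arm_compute;

    void tensor_shape_sketch()
    {
        TensorShape a(8U, 1U, 3U);
        TensorShape b(1U, 4U, 3U);

        // Incompatible shapes yield an empty shape (TensorShape{0U}).
        TensorShape bc = TensorShape::broadcast_shape(a, b); // 8x4x3

        // New flag: keep trailing unit dimensions instead of collapsing them.
        TensorShape c(2U, 1U, 1U); // num_dimensions() == 3
        c.remove_dimension(0, /* apply_dim_correction */ false);
        // c is now 1x1 with num_dimensions() == 2, not corrected down to 1.
        (void)bc;
    }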
diff --git a/arm_compute/core/TracePoint.h b/arm_compute/core/TracePoint.h
deleted file mode 100644
index 6951d6d5ef..0000000000
--- a/arm_compute/core/TracePoint.h
+++ /dev/null
@@ -1,134 +0,0 @@
-/*
- * Copyright (c) 2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_TRACEPOINT_H
-#define ARM_COMPUTE_TRACEPOINT_H
-
-#include <string>
-#include <type_traits>
-#include <vector>
-
-namespace arm_compute
-{
-#ifdef ARM_COMPUTE_TRACING_ENABLED
-#define CREATE_TRACEPOINT(...) TracePoint __tp(__VA_ARGS__)
-
-/** Class used to dump configuration values in functions and kernels */
-class TracePoint final
-{
-public:
- /** Layer types */
- enum class Layer
- {
- CORE,
- RUNTIME
- };
- /** struct describing the arguments for a tracepoint */
- struct Args final
- {
- std::vector<std::string> args{};
- };
- /** Constructor
- *
- * @param[in] source type of layer for the tracepoint
- * @param[in] class_name the name of the class creating the tracepoint
- * @param[in] object a pointer to the actual object owning the tracepoint
- * @param[in] args a struct describing all the arguments used in the call to the configure() method
- *
- */
- TracePoint(Layer source, const std::string &class_name, void *object, Args &&args);
- /** Destructor */
- ~TracePoint();
-
-private:
- static int g_depth; /**< current depth */
- int _depth; /**< tracepoint depth */
-};
-
-/** Operator to write an argument to a @ref TracePoint
- *
- * @param[in] tp Tracepoint to be used for writing
- * @param[in] arg Argument to be written in the tracepoint
- *
- * @return A reference to the updated tracepoint

- */
-template <typename T>
-TracePoint::Args &&operator<<(typename std::enable_if < !std::is_pointer<T>::value, TracePoint::Args >::type &&tp, const T &arg);
-template <typename T>
-TracePoint::Args &&operator<<(TracePoint::Args &&tp, const T *arg);
-
-#define CONST_REF_CLASS(type) \
- template <> \
- TracePoint::Args &&operator<<(TracePoint::Args &&tp, const type &arg) \
- { \
- ARM_COMPUTE_UNUSED(tp); \
- tp.args.push_back(#type "(" + to_string(arg) + ")"); \
- return std::move(tp); \
- }
-
-#define CONST_PTR_ADDRESS(type) \
- template <> \
- TracePoint::Args &&operator<<(TracePoint::Args &&tp, const type *arg) \
- { \
- ARM_COMPUTE_UNUSED(tp); \
- tp.args.push_back(#type "*(" + to_ptr_string(arg) + ")"); \
- return std::move(tp); \
- }
-#define CONST_PTR_CLASS(type) \
- template <> \
- TracePoint::Args &&operator<<(TracePoint::Args &&tp, const type *arg) \
- { \
- ARM_COMPUTE_UNUSED(tp); \
- if(arg) \
- tp.args.push_back(#type "(" + to_string(*arg) + ")"); \
- else \
- tp.args.push_back(#type "( nullptr )"); \
- return std::move(tp); \
- }
-
-#define CONST_REF_SIMPLE(type) \
- template <> \
- TracePoint::Args &&operator<<(TracePoint::Args &&tp, const type &arg) \
- { \
- ARM_COMPUTE_UNUSED(tp); \
- tp.args.push_back(#type "(" + support::cpp11::to_string(arg) + ")"); \
- return std::move(tp); \
- }
-
-#define TRACE_TO_STRING(type) \
- std::string to_string(const type &arg) \
- { \
- ARM_COMPUTE_UNUSED(arg); \
- return ""; \
- }
-#else /* ARM_COMPUTE_TRACING_ENABLED */
-#define CREATE_TRACEPOINT(...)
-#define CONST_REF_CLASS(type)
-#define CONST_PTR_ADDRESS(type)
-#define CONST_PTR_CLASS(type)
-#define CONST_REF_SIMPLE(type)
-#define TRACE_TO_STRING(type)
-#endif /* ARM_COMPUTE_TRACING_ENABLED */
-} //namespace arm_compute
-
-#endif /* ARM_COMPUTE_TRACEPOINT_H */
diff --git a/arm_compute/core/Types.h b/arm_compute/core/Types.h
index 4e73edba4b..f2f60c150e 100644
--- a/arm_compute/core/Types.h
+++ b/arm_compute/core/Types.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2016-2020 ARM Limited.
+ * Copyright (c) 2016-2024 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -21,17 +21,52 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
-#ifndef ARM_COMPUTE_TYPES_H
-#define ARM_COMPUTE_TYPES_H
-
+#ifndef ACL_ARM_COMPUTE_CORE_TYPES_H
+#define ACL_ARM_COMPUTE_CORE_TYPES_H
+
+/** The following symbols have been moved to:
+ * half
+ * PermutationVector
+ * Format
+ * DataType
+ * DataLayout
+ * DataLayoutDimension
+ * PadStrideInfo
+ * WeightFormat
+ * Channel
+ * DimensionRoundingType
+ */
+#include "arm_compute/core/CoreTypes.h"
+/** The following symbols have been moved to:
+ * ActivationFunction
+ * ActivationLayerInfo
+ */
+#include "arm_compute/function_info/ActivationLayerInfo.h"
+/** The following symbols have been moved to:
+ * ConvolutionInfo
+ */
+#include "arm_compute/function_info/ConvolutionInfo.h"
+/** The following symbols have been moved to:
+ * FullyConnectedLayerInfo
+ */
+#include "arm_compute/function_info/FullyConnectedLayerInfo.h"
+/** The following symbols have been moved to:
+ * GEMMLowpOutputStageType
+ * GEMMLowpOutputStageInfo
+ * GEMMInfo
+ */
+#include "arm_compute/function_info/GEMMInfo.h"
+/** The following symbols have been moved to:
+ * MatMulInfo
+ */
#include "arm_compute/core/Coordinates.h"
-#include "arm_compute/core/QuantizationInfo.h"
#include "arm_compute/core/Size2D.h"
-#include "arm_compute/core/Strides.h"
+#include "arm_compute/core/Size3D.h"
#include "arm_compute/core/TensorShape.h"
#include "arm_compute/core/utils/misc/Macros.h"
+#include "arm_compute/function_info/MatMulInfo.h"
+
#include "support/Bfloat16.h"
-#include "support/Half.h"
#include <cmath>
#include <cstddef>
@@ -42,62 +77,9 @@
namespace arm_compute
{
-/** 16-bit floating point type */
-using half = half_float::half;
-
-/** Permutation vector */
-using PermutationVector = Strides;
/** Bidirectional strides */
using BiStrides = Coordinates;
-/** Image colour formats */
-enum class Format
-{
- UNKNOWN, /**< Unknown image format */
- U8, /**< 1 channel, 1 U8 per channel */
- S16, /**< 1 channel, 1 S16 per channel */
- U16, /**< 1 channel, 1 U16 per channel */
- S32, /**< 1 channel, 1 S32 per channel */
- U32, /**< 1 channel, 1 U32 per channel */
- BFLOAT16, /**< 16-bit brain floating-point number */
- F16, /**< 1 channel, 1 F16 per channel */
- F32, /**< 1 channel, 1 F32 per channel */
- UV88, /**< 2 channel, 1 U8 per channel */
- RGB888, /**< 3 channels, 1 U8 per channel */
- RGBA8888, /**< 4 channels, 1 U8 per channel */
- YUV444, /**< A 3 plane of 8 bit 4:4:4 sampled Y, U, V planes */
- YUYV422, /**< A single plane of 32-bit macro pixel of Y0, U0, Y1, V0 bytes */
- NV12, /**< A 2 plane YUV format of Luma (Y) and interleaved UV data at 4:2:0 sampling */
- NV21, /**< A 2 plane YUV format of Luma (Y) and interleaved VU data at 4:2:0 sampling */
- IYUV, /**< A 3 plane of 8-bit 4:2:0 sampled Y, U, V planes */
- UYVY422 /**< A single plane of 32-bit macro pixel of U0, Y0, V0, Y1 byte */
-};
-
-/** Available data types */
-enum class DataType
-{
- UNKNOWN, /**< Unknown data type */
- U8, /**< unsigned 8-bit number */
- S8, /**< signed 8-bit number */
- QSYMM8, /**< quantized, symmetric fixed-point 8-bit number */
- QASYMM8, /**< quantized, asymmetric fixed-point 8-bit number unsigned */
- QASYMM8_SIGNED, /**< quantized, asymmetric fixed-point 8-bit number signed */
- QSYMM8_PER_CHANNEL, /**< quantized, symmetric per channel fixed-point 8-bit number */
- U16, /**< unsigned 16-bit number */
- S16, /**< signed 16-bit number */
- QSYMM16, /**< quantized, symmetric fixed-point 16-bit number */
- QASYMM16, /**< quantized, asymmetric fixed-point 16-bit number */
- U32, /**< unsigned 32-bit number */
- S32, /**< signed 32-bit number */
- U64, /**< unsigned 64-bit number */
- S64, /**< signed 64-bit number */
- BFLOAT16, /**< 16-bit brain floating-point number */
- F16, /**< 16-bit floating-point number */
- F32, /**< 32-bit floating-point number */
- F64, /**< 64-bit floating-point number */
- SIZET /**< size_t */
-};
-
/** Available Sampling Policies */
enum class SamplingPolicy
{
@@ -105,42 +87,15 @@ enum class SamplingPolicy
TOP_LEFT /**< Samples are taken at pixel top left corner */
};
-/** Constant value of the border pixels when using BorderMode::CONSTANT */
-constexpr uint8_t CONSTANT_BORDER_VALUE = 199;
-
-/** Constant value used to indicate a half-scale pyramid */
-constexpr float SCALE_PYRAMID_HALF = 0.5f;
-
-/** Constant value used to indicate an ORB scaled pyramid */
-constexpr float SCALE_PYRAMID_ORB = 8.408964152537146130583778358414e-01;
-
-/** [DataLayout enum definition] **/
-
-/** Supported tensor data layouts */
-enum class DataLayout
-{
- UNKNOWN, /**< Unknown data layout */
- NCHW, /**< Num samples, channels, height, width */
- NHWC /**< Num samples, height, width, channels */
-};
-/** [DataLayout enum definition] **/
-
-/** Supported tensor data layout dimensions */
-enum class DataLayoutDimension
-{
- CHANNEL, /**< channel */
- HEIGHT, /**< height */
- WIDTH, /**< width */
- BATCHES /**< batches */
-};
-
/** Available ConvolutionMethod*/
enum class ConvolutionMethod
{
- GEMM, /**< Convolution using GEMM */
- DIRECT, /**< Direct convolution */
- WINOGRAD, /**< Convolution using Winograd */
- FFT /**< Convolution using FFT */
+ GEMM, /**< Convolution using GEMM */
+ GEMM_CONV2D, /**< Direct 2D GEMM convolution */
+ DIRECT, /**< Direct convolution */
+ INDIRECT, /**< Indirect convolution */
+ WINOGRAD, /**< Convolution using Winograd */
+ FFT /**< Convolution using FFT */
};
/** Available DepthwiseConvolutionFunction*/
@@ -153,8 +108,9 @@ enum class DepthwiseConvolutionFunction
/** Available DeconvolutionMethod*/
enum class DeconvolutionMethod
{
- GEMM, /**< Deconvolution using GEMM */
- DIRECT, /**< Direct deconvolution */
+ GEMM, /**< Deconvolution using GEMM */
+ DIRECT, /**< Direct deconvolution */
+ UPSCALE_CONV2D /**< Deconvolution with Upscaling */
};
/** Available FuseBatchNormalizationType*/
@@ -187,8 +143,7 @@ enum class ComparisonOperation
struct ValidRegion
{
/** Default constructor */
- ValidRegion()
- : anchor{}, shape{}
+ ValidRegion() : anchor{}, shape{}
{
}
@@ -209,8 +164,7 @@ struct ValidRegion
* @param[in] a_shape Shape of the valid region.
*
*/
- ValidRegion(const Coordinates &an_anchor, const TensorShape &a_shape)
- : anchor{ an_anchor }, shape{ a_shape }
+ ValidRegion(const Coordinates &an_anchor, const TensorShape &a_shape) : anchor{an_anchor}, shape{a_shape}
{
anchor.set_num_dimensions(std::max(anchor.num_dimensions(), shape.num_dimensions()));
}
@@ -223,7 +177,7 @@ struct ValidRegion
*
*/
ValidRegion(const Coordinates &an_anchor, const TensorShape &a_shape, size_t num_dimensions)
- : anchor{ an_anchor }, shape{ a_shape }
+ : anchor{an_anchor}, shape{a_shape}
{
ARM_COMPUTE_ERROR_ON(num_dimensions < std::max(anchor.num_dimensions(), shape.num_dimensions()));
anchor.set_num_dimensions(num_dimensions);
@@ -256,9 +210,22 @@ struct ValidRegion
return *this;
}
+ /** Check whether two valid regions are equal.
+ *
+ * @param[in] lhs LHS valid region
+ * @param[in] rhs RHS valid region
+ *
+ * @return True if the valid regions are the same.
+ */
+ inline friend bool operator==(const ValidRegion &lhs, const ValidRegion &rhs);
+
Coordinates anchor; /**< Anchor for the start of the valid region. */
TensorShape shape; /**< Shape of the valid region. */
};
+inline bool operator==(const ValidRegion &lhs, const ValidRegion &rhs)
+{
+ return (lhs.anchor == rhs.anchor) && (lhs.shape == rhs.shape);
+}
/** Methods available to handle borders */
enum class BorderMode
@@ -272,26 +239,24 @@ enum class BorderMode
struct BorderSize
{
/** Empty border, i.e. no border */
- constexpr BorderSize()
- : top{ 0 }, right{ 0 }, bottom{ 0 }, left{ 0 }
+ constexpr BorderSize() noexcept : top{0}, right{0}, bottom{0}, left{0}
{
}
/** Border with equal size around the 2D plane */
- explicit constexpr BorderSize(unsigned int size)
- : top{ size }, right{ size }, bottom{ size }, left{ size }
+ explicit constexpr BorderSize(unsigned int size) noexcept : top{size}, right{size}, bottom{size}, left{size}
{
}
/** Border with same size for top/bottom and left/right */
constexpr BorderSize(unsigned int top_bottom, unsigned int left_right)
- : top{ top_bottom }, right{ left_right }, bottom{ top_bottom }, left{ left_right }
+ : top{top_bottom}, right{left_right}, bottom{top_bottom}, left{left_right}
{
}
/** Border with different sizes */
constexpr BorderSize(unsigned int top, unsigned int right, unsigned int bottom, unsigned int left)
- : top{ top }, right{ right }, bottom{ bottom }, left{ left }
+ : top{top}, right{right}, bottom{bottom}, left{left}
{
}
@@ -337,6 +302,28 @@ struct BorderSize
return size;
}
+ /** Check equality with another BorderSize struct
+ *
+ * @param[in] rhs other struct to check against
+ *
+ * @return true if they are equal
+ */
+ bool operator==(const BorderSize &rhs) const
+ {
+ return (top == rhs.top) && (right == rhs.right) && (bottom == rhs.bottom) && (left == rhs.left);
+ }
+
+ /** Check non-equality with another BorderSize struct
+ *
+ * @param[in] rhs other struct to check against
+ *
+ * @return true if they are different
+ */
+ bool operator!=(const BorderSize &rhs) const
+ {
+ return !(*this == rhs);
+ }
+
/** Limit this border size.
*
* @param[in] limit Border size to limit this border size to.
@@ -358,7 +345,11 @@ struct BorderSize
/** Container for 2D padding size */
using PaddingSize = BorderSize;
-/** Policy to handle overflow */
+/** Policy to handle integer overflow
+ * @note: This is ignored by floating point operations where the overflow behavior adheres to the IEEE-754 standard
+ * which states that in case of overflow ±infinity is returned for the round-to-nearest modes (and follows the
+ * rounding rules for the directed rounding modes) by default.
+ */
enum class ConvertPolicy
{
WRAP, /**< Wrap around */
@@ -370,7 +361,7 @@ enum class InterpolationPolicy
{
NEAREST_NEIGHBOR, /**< Output values are defined to match the source pixel whose center is nearest to the sample position */
BILINEAR, /**< Output values are defined by bilinear interpolation between the pixels */
- AREA, /**< Output values are determined by averaging the source pixels whose areas fall under the area of the destination pixel, projected onto the source image */
+ AREA, /**< Output values are determined by averaging the source pixels whose areas fall under the area of the destination pixel, projected onto the source image */
};
/** Bilinear Interpolation method used by LKTracker */
@@ -380,53 +371,6 @@ enum class BilinearInterpolation
BILINEAR_SCHARR /**< Scharr method */
};
-/** Threshold mode */
-enum class ThresholdType
-{
- BINARY, /**< Threshold with one value */
- RANGE /**< Threshold with two values*/
-};
-
-/** Termination criteria */
-enum class Termination
-{
- TERM_CRITERIA_EPSILON, /**< Terminate when within epsilon of a threshold */
- TERM_CRITERIA_ITERATIONS, /**< Terminate after a maximum number of iterations */
- TERM_CRITERIA_BOTH /**< Terminate on whichever of the other conditions occurs first */
-};
-
-/** Magnitude calculation type. */
-enum class MagnitudeType
-{
- L1NORM, /**< L1 normalization type */
- L2NORM /**< L2 normalization type */
-};
-
-/** Phase calculation type.
- *
- * @note When PhaseType == SIGNED, each angle is mapped to the range 0 to 255 inclusive otherwise angles between 0 and 180
- */
-enum class PhaseType
-{
- SIGNED, /**< Angle range: [0, 360] */
- UNSIGNED /**< Angle range: [0, 180] */
-};
-
-/** Keypoint type */
-struct KeyPoint
-{
- int32_t x{ 0 }; /**< X coordinates */
- int32_t y{ 0 }; /**< Y coordinates */
- float strength{ 0.f }; /**< Strength of the point */
- float scale{ 0.f }; /**< Scale initialized to 0 by the corner detector */
- float orientation{ 0.f }; /**< Orientation initialized to 0 by the corner detector */
- int32_t tracking_status{ 0 }; /**< Status initialized to 1 by the corner detector, set to 0 when the point is lost */
- float error{ 0.f }; /**< Tracking error initialized to 0 by the corner detector */
-};
-
-/** Internal key point */
-using InternalKeypoint = std::tuple<float, float, float>; /* x,y,strength */
-
/** Rectangle type */
struct Rectangle
{
@@ -460,40 +404,6 @@ using PaddingList = std::vector<PaddingInfo>;
/** Information to produce a tiled version of a Tensor */
using Multiples = std::vector<uint32_t>;
-/** Available channels */
-enum class Channel
-{
- UNKNOWN, /** Unknown channel format */
- C0, /**< First channel (used by formats with unknown channel types). */
- C1, /**< Second channel (used by formats with unknown channel types). */
- C2, /**< Third channel (used by formats with unknown channel types). */
- C3, /**< Fourth channel (used by formats with unknown channel types). */
- R, /**< Red channel. */
- G, /**< Green channel. */
- B, /**< Blue channel. */
- A, /**< Alpha channel. */
- Y, /**< Luma channel. */
- U, /**< Cb/U channel. */
- V /**< Cr/V/Value channel. */
-};
-
-/** Available matrix patterns */
-enum class MatrixPattern
-{
- BOX, /**< Box pattern matrix. */
- CROSS, /**< Cross pattern matrix. */
- DISK, /**< Disk pattern matrix. */
- OTHER /**< Any other matrix pattern. */
-};
-
-/** Available non linear functions. */
-enum class NonLinearFilterFunction : unsigned
-{
- MEDIAN = 0, /**< Non linear median filter. */
- MIN = 1, /**< Non linear erode. */
- MAX = 2, /**< Non linear dilate. */
-};
-
/** Available reduction operations */
enum class ReductionOperation
{
@@ -523,13 +433,23 @@ enum class ArithmeticOperation
/** Available element wise unary operations */
enum class ElementWiseUnary
{
- RSQRT, /**< Reverse square root */
- EXP, /**< Exponential */
- NEG, /**< Negate */
- LOG, /**< Natural Logarithm */
- ABS, /**< Absolute value */
- SIN, /**< Sine */
- ROUND, /**< Round */
+ RSQRT, /**< Reverse square root */
+ EXP, /**< Exponential */
+ NEG, /**< Negate */
+ LOG, /**< Natural Logarithm */
+ ABS, /**< Absolute value */
+ SIN, /**< Sine */
+ ROUND, /**< Round */
+ LOGICAL_NOT, /**< Logical Not */
+};
+
+/** Available bitwise operations */
+enum class BitwiseOperation
+{
+ AND, /**< Bitwise AND operation */
+ NOT, /**< Bitwise NOT operation */
+ OR, /**< Bitwise OR operation */
+ XOR, /**< Bitwise XOR operation */
};
/** The normalization type used for the normalization layer */
@@ -540,14 +460,6 @@ enum class NormType
CROSS_MAP /**< Normalization applied cross maps */
};
-/** Normalization type for Histogram of Oriented Gradients (HOG) */
-enum class HOGNormType
-{
- L2_NORM = 1, /**< L2-norm */
- L2HYS_NORM = 2, /**< L2-norm followed by clipping */
- L1_NORM = 3 /**< L1 norm */
-};
-
/** Detection window used for the object detection. The detection window keeps the following information:
*
* -# Geometry of the rectangular window (x/y of top-left corner and width/height)
@@ -556,21 +468,12 @@ enum class HOGNormType
*/
struct DetectionWindow
{
- uint16_t x{ 0 }; /**< Top-left x coordinate */
- uint16_t y{ 0 }; /**< Top-left y coordinate */
- uint16_t width{ 0 }; /**< Width of the detection window */
- uint16_t height{ 0 }; /**< Height of the detection window */
- uint16_t idx_class{ 0 }; /**< Index of the class */
- float score{ 0.f }; /**< Confidence value for the detection window */
-};
-
-/** Dimension rounding type when down-scaling on CNNs
- * @note Used in pooling and convolution layer
- */
-enum class DimensionRoundingType
-{
- FLOOR, /**< Floor rounding */
- CEIL /**< Ceil rounding */
+ uint16_t x{0}; /**< Top-left x coordinate */
+ uint16_t y{0}; /**< Top-left y coordinate */
+ uint16_t width{0}; /**< Width of the detection window */
+ uint16_t height{0}; /**< Height of the detection window */
+ uint16_t idx_class{0}; /**< Index of the class */
+ float score{0.f}; /**< Confidence value for the detection window */
};
/** Available pooling types */
@@ -607,12 +510,28 @@ public:
* @param[in] im_width (Optional) Boxes whose centers (on the x axis) is beyond im_width will be filtered. Defaults to 1
* @param[in] im_height (Optional) Boxes whose centers (on the y axis) is beyond im_height will be filtered. Defaults to 1
*/
- BoxNMSLimitInfo(float score_thresh = 0.05f, float nms = 0.3f,
- int detections = 100, bool soft_nms_enabled = false,
- NMSType soft_nms_method = NMSType::LINEAR,
- float soft_nms_sigma = 0.5f, float soft_nms_min_score_thres = 0.001f, bool suppress_size = false, float min_size = 1.0f, float im_width = 1.0f, float im_height = 1.0f)
- : _score_thresh(score_thresh), _nms(nms), _detections_per_im(detections), _soft_nms_enabled(soft_nms_enabled), _soft_nms_method(soft_nms_method), _soft_nms_sigma(soft_nms_sigma),
- _soft_nms_min_score_thres(soft_nms_min_score_thres), _suppress_size(suppress_size), _min_size(min_size), _im_width(im_width), _im_height(im_height)
+ BoxNMSLimitInfo(float score_thresh = 0.05f,
+ float nms = 0.3f,
+ int detections = 100,
+ bool soft_nms_enabled = false,
+ NMSType soft_nms_method = NMSType::LINEAR,
+ float soft_nms_sigma = 0.5f,
+ float soft_nms_min_score_thres = 0.001f,
+ bool suppress_size = false,
+ float min_size = 1.0f,
+ float im_width = 1.0f,
+ float im_height = 1.0f)
+ : _score_thresh(score_thresh),
+ _nms(nms),
+ _detections_per_im(detections),
+ _soft_nms_enabled(soft_nms_enabled),
+ _soft_nms_method(soft_nms_method),
+ _soft_nms_sigma(soft_nms_sigma),
+ _soft_nms_min_score_thres(soft_nms_min_score_thres),
+ _suppress_size(suppress_size),
+ _min_size(min_size),
+ _im_width(im_width),
+ _im_height(im_height)
{
}
/** Get the score threshold */
@@ -686,120 +605,42 @@ private:
};
/** Padding and stride information class */
-class PadStrideInfo
+/** Padding information for 2D operations like Conv2d */
+struct Padding2D
{
-public:
- /** Constructor
- *
- * @param[in] stride_x (Optional) Stride, in elements, across x. Defaults to 1.
- * @param[in] stride_y (Optional) Stride, in elements, across y. Defaults to 1.
- * @param[in] pad_x (Optional) Padding, in elements, across x. Defaults to 0.
- * @param[in] pad_y (Optional) Padding, in elements, across y. Defaults to 0.
- * @param[in] round (Optional) Dimensions rounding. Defaults to @ref FLOOR.
- */
- PadStrideInfo(unsigned int stride_x = 1, unsigned int stride_y = 1,
- unsigned int pad_x = 0, unsigned int pad_y = 0,
- DimensionRoundingType round = DimensionRoundingType::FLOOR)
- : _stride(std::make_pair(stride_x, stride_y)),
- _pad_left(pad_x),
- _pad_top(pad_y),
- _pad_right(pad_x),
- _pad_bottom(pad_y),
- _round_type(round)
+ Padding2D() = default;
+ Padding2D(size_t left, size_t right, size_t top, size_t bottom) : left(left), right(right), top(top), bottom(bottom)
{
}
- /** Constructor
- *
- * @param[in] stride_x Stride, in elements, across x.
- * @param[in] stride_y Stride, in elements, across y.
- * @param[in] pad_left Padding across x on the left, in elements.
- * @param[in] pad_top Padding across y on the top, in elements.
- * @param[in] pad_right Padding across x on the right, in elements.
- * @param[in] pad_bottom Padding across y on the bottom, in elements.
- * @param[in] round Dimensions rounding.
- */
- PadStrideInfo(unsigned int stride_x, unsigned int stride_y,
- unsigned int pad_left, unsigned int pad_right,
- unsigned int pad_top, unsigned int pad_bottom,
- DimensionRoundingType round)
- : _stride(std::make_pair(stride_x, stride_y)),
- _pad_left(pad_left),
- _pad_top(pad_top),
- _pad_right(pad_right),
- _pad_bottom(pad_bottom),
- _round_type(round)
- {
- }
- /** Get the stride.
- *
- * @return a pair: stride x, stride y.
- */
- std::pair<unsigned int, unsigned int> stride() const
- {
- return _stride;
- }
- /** Check whether the padding is symmetric.
- *
- * @return True if the padding is symmetric.
- */
- bool padding_is_symmetric() const
- {
- return (_pad_left == _pad_right) && (_pad_top == _pad_bottom);
- }
- /** Get the padding.
- *
- * @note This should only be used when the padding is symmetric.
- *
- * @return a pair: padding left/right, padding top/bottom
- */
- std::pair<unsigned int, unsigned int> pad() const
- {
- //this accessor should be used only when padding is symmetric
- ARM_COMPUTE_ERROR_ON(!padding_is_symmetric());
- return std::make_pair(_pad_left, _pad_top);
- }
+ size_t left = {0}; /**< Padding across the width dimension on the left, in elements. */
+ size_t right = {0}; /**< Padding across the width dimension on the right, in elements. */
+ size_t top = {0}; /**< Padding across the height dimension on the top, in elements. */
+ size_t bottom = {0}; /**< Padding across the height dimension on the bottom, in elements. */
+};
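
Since Padding2D is a plain aggregate, both the explicit constructor and direct member access are available. A brief sketch with illustrative values:

    // One element of padding on every border, as a 3x3 convolution with
    // stride 1 and same-sized output would require.
    Padding2D same_pad(1 /* left */, 1 /* right */, 1 /* top */, 1 /* bottom */);
    const size_t extra_width = same_pad.left + same_pad.right; // 2 elements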
- /** Get the left padding */
- unsigned int pad_left() const
- {
- return _pad_left;
- }
- /** Get the right padding */
- unsigned int pad_right() const
- {
- return _pad_right;
- }
- /** Get the top padding */
- unsigned int pad_top() const
- {
- return _pad_top;
- }
- /** Get the bottom padding */
- unsigned int pad_bottom() const
+/** Padding information for 3D operations like Conv3d */
+struct Padding3D
+{
+ Padding3D() noexcept
{
- return _pad_bottom;
}
- /** Get the rounding type */
- DimensionRoundingType round() const
+ Padding3D(size_t pad_x, size_t pad_y, size_t pad_z)
+ : left(pad_x), right(pad_x), top(pad_y), bottom(pad_y), front(pad_z), back(pad_z)
{
- return _round_type;
}
- /** Check whether this has any padding */
- bool has_padding() const
+ Padding3D(size_t left, size_t right, size_t top, size_t bottom, size_t front, size_t back)
+ : left(left), right(right), top(top), bottom(bottom), front(front), back(back)
{
- return (_pad_left != 0 || _pad_top != 0 || _pad_right != 0 || _pad_bottom != 0);
}
-private:
- std::pair<unsigned int, unsigned int> _stride;
- unsigned int _pad_left;
- unsigned int _pad_top;
- unsigned int _pad_right;
- unsigned int _pad_bottom;
-
- DimensionRoundingType _round_type;
+    size_t left = {0};   /**< Padding across the width dimension on the left, in elements. */
+    size_t right = {0};  /**< Padding across the width dimension on the right, in elements. */
+    size_t top = {0};    /**< Padding across the height dimension on the top, in elements. */
+    size_t bottom = {0}; /**< Padding across the height dimension on the bottom, in elements. */
+    size_t front = {0};  /**< Padding across the depth dimension on the front, in elements. */
+    size_t back = {0};   /**< Padding across the depth dimension on the back, in elements. */
};
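
The two non-default constructors differ in symmetry: the three-argument form mirrors each value onto both sides of its axis, while the six-argument form sets every side independently. A brief sketch:

    Padding3D sym(1, 2, 3);           // left = right = 1, top = bottom = 2, front = back = 3
    Padding3D asym(0, 1, 0, 1, 0, 1); // pad only the right, bottom and back sides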
/** PriorBox layer info */
@@ -831,9 +672,15 @@ public:
* @param[in] img_size (Optional) Image size.
* @param[in] steps (Optional) Step values.
*/
- PriorBoxLayerInfo(const std::vector<float> &min_sizes, const std::vector<float> &variances, float offset, bool flip = true, bool clip = false,
- const std::vector<float> &max_sizes = {}, const std::vector<float> &aspect_ratios = {},
- const Coordinates2D &img_size = Coordinates2D{ 0, 0 }, const std::array<float, 2> &steps = { { 0.f, 0.f } })
+ PriorBoxLayerInfo(const std::vector<float> &min_sizes,
+ const std::vector<float> &variances,
+ float offset,
+ bool flip = true,
+ bool clip = false,
+ const std::vector<float> &max_sizes = {},
+ const std::vector<float> &aspect_ratios = {},
+ const Coordinates2D &img_size = Coordinates2D{0, 0},
+ const std::array<float, 2> &steps = {{0.f, 0.f}})
: _min_sizes(min_sizes),
_variances(variances),
_offset(offset),
@@ -845,22 +692,22 @@ public:
_steps(steps)
{
_aspect_ratios.push_back(1.);
- for(unsigned int i = 0; i < aspect_ratios.size(); ++i)
+ for (unsigned int i = 0; i < aspect_ratios.size(); ++i)
{
float ar = aspect_ratios[i];
bool already_exist = false;
- for(auto ar_new : _aspect_ratios)
+ for (auto ar_new : _aspect_ratios)
{
- if(fabs(ar - ar_new) < 1e-6)
+ if (fabs(ar - ar_new) < 1e-6)
{
already_exist = true;
break;
}
}
- if(!already_exist)
+ if (!already_exist)
{
_aspect_ratios.push_back(ar);
- if(flip)
+ if (flip)
{
_aspect_ratios.push_back(1.f / ar);
}
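
The loop above deduplicates aspect ratios with a 1e-6 tolerance and, when flip is set, also records each reciprocal. A worked example with hypothetical inputs:

    // With aspect_ratios = {2.f, 2.f, 1.f} and flip == true:
    //  - _aspect_ratios starts as {1.f};
    //  - the first 2.f is new, so 2.f and its reciprocal 0.5f are appended;
    //  - the second 2.f and the explicit 1.f match existing entries within
    //    1e-6 and are skipped;
    // leaving _aspect_ratios == {1.f, 2.f, 0.5f}.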
@@ -914,14 +761,14 @@ public:
}
private:
- std::vector<float> _min_sizes;
- std::vector<float> _variances;
- float _offset;
- bool _flip;
- bool _clip;
- std::vector<float> _max_sizes;
- std::vector<float> _aspect_ratios;
- Coordinates2D _img_size;
+ std::vector<float> _min_sizes;
+ std::vector<float> _variances;
+ float _offset;
+ bool _flip;
+ bool _clip;
+ std::vector<float> _max_sizes;
+ std::vector<float> _aspect_ratios;
+ Coordinates2D _img_size;
std::array<float, 2> _steps;
};
@@ -972,8 +819,16 @@ public:
     * @param[in]  variance_encoded_in_target (Optional) If true, variance is encoded in target. Otherwise we need to adjust the predicted offset accordingly. Defaults to false.
* @param[in] eta (Optional) Eta.
*/
- DetectionOutputLayerInfo(int num_classes, bool share_location, DetectionOutputLayerCodeType code_type, int keep_top_k, float nms_threshold, int top_k = -1, int background_label_id = -1,
- float confidence_threshold = std::numeric_limits<float>::lowest(), bool variance_encoded_in_target = false, float eta = 1)
+ DetectionOutputLayerInfo(int num_classes,
+ bool share_location,
+ DetectionOutputLayerCodeType code_type,
+ int keep_top_k,
+ float nms_threshold,
+ int top_k = -1,
+ int background_label_id = -1,
+ float confidence_threshold = std::numeric_limits<float>::lowest(),
+ bool variance_encoded_in_target = false,
+ float eta = 1)
: _num_classes(num_classes),
_share_location(share_location),
_code_type(code_type),
@@ -1087,8 +942,15 @@ public:
* @param[in] detection_per_class (Optional) Number of detection per class. Used in the Regular Non-Max-Suppression. Defaults to 100.
* @param[in] dequantize_scores (Optional) If the scores need to be dequantized. Defaults to true.
*/
- DetectionPostProcessLayerInfo(unsigned int max_detections, unsigned int max_classes_per_detection, float nms_score_threshold, float iou_threshold, unsigned int num_classes,
- std::array<float, 4> scales_values, bool use_regular_nms = false, unsigned int detection_per_class = 100, bool dequantize_scores = true)
+ DetectionPostProcessLayerInfo(unsigned int max_detections,
+ unsigned int max_classes_per_detection,
+ float nms_score_threshold,
+ float iou_threshold,
+ unsigned int num_classes,
+ std::array<float, 4> scales_values,
+ bool use_regular_nms = false,
+ unsigned int detection_per_class = 100,
+ bool dequantize_scores = true)
: _max_detections(max_detections),
_max_classes_per_detection(max_classes_per_detection),
_nms_score_threshold(nms_score_threshold),
@@ -1166,15 +1028,15 @@ public:
}
private:
- unsigned int _max_detections;
- unsigned int _max_classes_per_detection;
- float _nms_score_threshold;
- float _iou_threshold;
- unsigned int _num_classes;
+ unsigned int _max_detections;
+ unsigned int _max_classes_per_detection;
+ float _nms_score_threshold;
+ float _iou_threshold;
+ unsigned int _num_classes;
std::array<float, 4> _scales_values;
- bool _use_regular_nms;
- unsigned int _detection_per_class;
- bool _dequantize_scores;
+ bool _use_regular_nms;
+ unsigned int _detection_per_class;
+ bool _dequantize_scores;
};
 /** Pooling Layer Information struct */
@@ -1188,7 +1050,9 @@ struct PoolingLayerInfo
pad_stride_info(PadStrideInfo()),
exclude_padding(false),
is_global_pooling(false),
- fp_mixed_precision(false)
+ fp_mixed_precision(false),
+ use_inf_as_limit(true),
+ use_kernel_indices(false)
{
}
/** Constructor
@@ -1201,20 +1065,26 @@ struct PoolingLayerInfo
* True will exclude padding while false will not (Used in AVG/L2 pooling to determine the pooling area).
     *                                     Defaults to false.
* @param[in] fp_mixed_precision (Optional) Use wider accumulators (32 bit instead of 16 for FP16) to improve accuracy.
+     * @param[in] use_inf_as_limit   (Optional) Use infinity to represent the limits of the data type's range, instead of its "lowest" property.
+     * @param[in] use_kernel_indices (Optional) Use kernel indices instead of source indices when computing the indices tensor.
*/
explicit PoolingLayerInfo(PoolingType pool_type,
unsigned int pool_size,
DataLayout data_layout,
PadStrideInfo pad_stride_info = PadStrideInfo(),
bool exclude_padding = false,
- bool fp_mixed_precision = false)
+ bool fp_mixed_precision = false,
+ bool use_inf_as_limit = true,
+ bool use_kernel_indices = false)
: pool_type(pool_type),
pool_size(Size2D(pool_size, pool_size)),
data_layout(data_layout),
pad_stride_info(pad_stride_info),
exclude_padding(exclude_padding),
is_global_pooling(false),
- fp_mixed_precision(fp_mixed_precision)
+ fp_mixed_precision(fp_mixed_precision),
+ use_inf_as_limit(use_inf_as_limit),
+ use_kernel_indices(use_kernel_indices)
{
}
@@ -1228,20 +1098,26 @@ struct PoolingLayerInfo
* True will exclude padding while false will not (Used in AVG/L2 pooling to determine the pooling area).
     *                                     Defaults to false.
* @param[in] fp_mixed_precision (Optional) Use wider accumulators (32 bit instead of 16 for FP16) to improve accuracy.
+     * @param[in] use_inf_as_limit   (Optional) Use infinity to represent the limits of the data type's range, instead of its "lowest" property.
+     * @param[in] use_kernel_indices (Optional) Use kernel indices instead of source indices when computing the indices tensor.
*/
explicit PoolingLayerInfo(PoolingType pool_type,
Size2D pool_size,
DataLayout data_layout,
PadStrideInfo pad_stride_info = PadStrideInfo(),
bool exclude_padding = false,
- bool fp_mixed_precision = false)
+ bool fp_mixed_precision = false,
+ bool use_inf_as_limit = true,
+ bool use_kernel_indices = false)
: pool_type(pool_type),
pool_size(pool_size),
data_layout(data_layout),
pad_stride_info(pad_stride_info),
exclude_padding(exclude_padding),
is_global_pooling(false),
- fp_mixed_precision(fp_mixed_precision)
+ fp_mixed_precision(fp_mixed_precision),
+ use_inf_as_limit(use_inf_as_limit),
+ use_kernel_indices(use_kernel_indices)
{
}
@@ -1259,7 +1135,9 @@ struct PoolingLayerInfo
pad_stride_info(PadStrideInfo(1, 1, 0, 0)),
exclude_padding(false),
is_global_pooling(true),
- fp_mixed_precision(false)
+ fp_mixed_precision(false),
+ use_inf_as_limit(true),
+ use_kernel_indices(false)
{
}
@@ -1270,6 +1148,111 @@ struct PoolingLayerInfo
bool exclude_padding;
bool is_global_pooling;
bool fp_mixed_precision;
+ bool use_inf_as_limit;
+ bool use_kernel_indices;
+};
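
Combining the two new flags, a sketch of a configuration that returns kernel-local indices, e.g. for a subsequent max-unpooling stage (values are illustrative):

    PoolingLayerInfo pool_info(PoolingType::MAX, 2 /* pool_size */, DataLayout::NHWC,
                               PadStrideInfo(2, 2, 0, 0) /* stride 2, no padding */,
                               false /* exclude_padding */,
                               false /* fp_mixed_precision */,
                               true /* use_inf_as_limit */,
                               true /* use_kernel_indices */);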
+
+/** Pooling 3D Layer Information struct */
+struct Pooling3dLayerInfo
+{
+ /** Default Constructor */
+ Pooling3dLayerInfo() noexcept
+ : pool_type(PoolingType::MAX),
+ pool_size(Size3D()),
+ stride(Size3D()),
+ padding(Padding3D()),
+ exclude_padding(false),
+ is_global_pooling(false),
+ fp_mixed_precision(false),
+ round_type(DimensionRoundingType::FLOOR)
+ {
+ }
+ /** Constructor
+ *
+ * @param[in] pool_type Pooling type @ref PoolingType.
+ * @param[in] pool_size Pooling size, in elements, across x, y and z.
+     * @param[in] stride             (Optional) Stride information @ref Size3D. Defaults to unit strides.
+     * @param[in] padding            (Optional) Padding information @ref Padding3D. Defaults to no padding.
+     * @param[in] exclude_padding    (Optional) Strategy when accounting padding in calculations.
+     *                               True will exclude padding while false will not (Used in AVG/L2 pooling to determine the pooling area).
+     *                               Defaults to false.
+     * @param[in] fp_mixed_precision (Optional) Use wider accumulators (32 bit instead of 16 for FP16) to improve accuracy.
+     * @param[in] round_type         (Optional) Dimensions rounding. Defaults to @ref DimensionRoundingType::FLOOR.
+ */
+ explicit Pooling3dLayerInfo(PoolingType pool_type,
+ unsigned int pool_size,
+ Size3D stride = Size3D(1U, 1U, 1U),
+ Padding3D padding = Padding3D(),
+ bool exclude_padding = false,
+ bool fp_mixed_precision = false,
+ DimensionRoundingType round_type = DimensionRoundingType::FLOOR)
+ : pool_type(pool_type),
+ pool_size(Size3D(pool_size, pool_size, pool_size)),
+ stride(stride),
+ padding(padding),
+ exclude_padding(exclude_padding),
+ is_global_pooling(false),
+ fp_mixed_precision(fp_mixed_precision),
+ round_type(round_type)
+ {
+ }
+
+ /** Constructor
+ *
+ * @param[in] pool_type Pooling type @ref PoolingType.
+ * @param[in] pool_size Pooling size, in elements, across x, y and z.
+     * @param[in] stride             (Optional) Stride information @ref Size3D. Defaults to unit strides.
+     * @param[in] padding            (Optional) Padding information @ref Padding3D. Defaults to no padding.
+     * @param[in] exclude_padding    (Optional) Strategy when accounting padding in calculations.
+     *                               True will exclude padding while false will not (Used in AVG/L2 pooling to determine the pooling area).
+     *                               Defaults to false.
+     * @param[in] fp_mixed_precision (Optional) Use wider accumulators (32 bit instead of 16 for FP16) to improve accuracy.
+     * @param[in] round_type         (Optional) Dimensions rounding. Defaults to @ref DimensionRoundingType::FLOOR.
+ */
+ explicit Pooling3dLayerInfo(PoolingType pool_type,
+ Size3D pool_size,
+ Size3D stride = Size3D(1U, 1U, 1U),
+ Padding3D padding = Padding3D(),
+ bool exclude_padding = false,
+ bool fp_mixed_precision = false,
+ DimensionRoundingType round_type = DimensionRoundingType::FLOOR)
+ : pool_type(pool_type),
+ pool_size(pool_size),
+ stride(stride),
+ padding(padding),
+ exclude_padding(exclude_padding),
+ is_global_pooling(false),
+ fp_mixed_precision(fp_mixed_precision),
+ round_type(round_type)
+ {
+ }
+
+ /** Constructor
+ *
+ * @note This constructor is used for global pooling
+ *
+ * @param[in] pool_type Pooling type @ref PoolingType.
+ */
+ explicit Pooling3dLayerInfo(PoolingType pool_type)
+ : pool_type(pool_type),
+ pool_size(Size3D()),
+ stride(Size3D(1U, 1U, 1U)),
+ padding(Padding3D(0, 0, 0)),
+ exclude_padding(false),
+ is_global_pooling(true),
+ fp_mixed_precision(false),
+ round_type(DimensionRoundingType::FLOOR)
+ {
+ }
+
+ PoolingType pool_type;
+ Size3D pool_size;
+ Size3D stride;
+ Padding3D padding;
+ bool exclude_padding;
+ bool is_global_pooling;
+ bool fp_mixed_precision;
+ DimensionRoundingType round_type;
};
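
A sketch contrasting an explicit 3D pooling window with the single-argument global-pooling overload (illustrative values):

    // 2x2x2 average pooling with stride 2 and no padding.
    Pooling3dLayerInfo avg_info(PoolingType::AVG, Size3D(2U, 2U, 2U), Size3D(2U, 2U, 2U));
    // Global max pooling: pool_size is ignored and the full W x H x D extent
    // of the input is reduced instead.
    Pooling3dLayerInfo global_info(PoolingType::MAX);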
/** ROI Pooling Layer Information class */
@@ -1283,8 +1266,14 @@ public:
* @param[in] spatial_scale Spatial scale to be applied to the ROI coordinates and dimensions.
     * @param[in] sampling_ratio Number of samples to include in each pooling region (if set to zero, an adaptive ceil(roi_dims/pooling_dims) is used)
*/
- ROIPoolingLayerInfo(unsigned int pooled_width, unsigned int pooled_height, float spatial_scale, unsigned int sampling_ratio = 0)
- : _pooled_width(pooled_width), _pooled_height(pooled_height), _spatial_scale(spatial_scale), _sampling_ratio(sampling_ratio)
+ ROIPoolingLayerInfo(unsigned int pooled_width,
+ unsigned int pooled_height,
+ float spatial_scale,
+ unsigned int sampling_ratio = 0)
+ : _pooled_width(pooled_width),
+ _pooled_height(pooled_height),
+ _spatial_scale(spatial_scale),
+ _sampling_ratio(sampling_ratio)
{
}
/** Get the pooled width of the layer */
@@ -1331,10 +1320,24 @@ public:
     * @param[in] min_size       (Optional) Size used to validate the anchors produced. Defaults to 16.
     * @param[in] values_per_roi (Optional) Values used to represent a ROI (Region of Interest). Defaults to 4.
*/
- GenerateProposalsInfo(float im_width, float im_height, float im_scale, float spatial_scale = 1.0, int pre_nms_topN = 6000, int post_nms_topN = 300, float nms_thres = 0.7, float min_size = 16.0,
+ GenerateProposalsInfo(float im_width,
+ float im_height,
+ float im_scale,
+ float spatial_scale = 1.0,
+ int pre_nms_topN = 6000,
+ int post_nms_topN = 300,
+ float nms_thres = 0.7,
+ float min_size = 16.0,
size_t values_per_roi = 4)
- : _im_height(im_height), _im_width(im_width), _im_scale(im_scale), _spatial_scale(spatial_scale), _pre_nms_topN(pre_nms_topN), _post_nms_topN(post_nms_topN), _nms_thres(nms_thres),
- _min_size(min_size), _values_per_roi(values_per_roi)
+ : _im_height(im_height),
+ _im_width(im_width),
+ _im_scale(im_scale),
+ _spatial_scale(spatial_scale),
+ _pre_nms_topN(pre_nms_topN),
+ _post_nms_topN(post_nms_topN),
+ _nms_thres(nms_thres),
+ _min_size(min_size),
+ _values_per_roi(values_per_roi)
{
}
@@ -1460,11 +1463,20 @@ public:
     * @param[in] correct_transform_coords (Optional) Correct bounding box transform coordinates. Defaults to false.
     * @param[in] bbox_xform_clip          (Optional) Minimum bounding box width and height after bounding box transformation in log-space. Defaults to log(1000/16).
*/
- BoundingBoxTransformInfo(float img_width, float img_height, float scale, bool apply_scale = false, const std::array<float, 4> weights = { { 1.f, 1.f, 1.f, 1.f } }, bool correct_transform_coords =
- false,
- float bbox_xform_clip =
- 4.135166556742356f)
- : _img_width(img_width), _img_height(img_height), _scale(scale), _apply_scale(apply_scale), _correct_transform_coords(correct_transform_coords), _weights(weights), _bbox_xform_clip(bbox_xform_clip)
+ BoundingBoxTransformInfo(float img_width,
+ float img_height,
+ float scale,
+ bool apply_scale = false,
+ const std::array<float, 4> weights = {{1.f, 1.f, 1.f, 1.f}},
+ bool correct_transform_coords = false,
+ float bbox_xform_clip = 4.135166556742356f)
+ : _img_width(img_width),
+ _img_height(img_height),
+ _scale(scale),
+ _apply_scale(apply_scale),
+ _correct_transform_coords(correct_transform_coords),
+ _weights(weights),
+ _bbox_xform_clip(bbox_xform_clip)
{
}
@@ -1504,110 +1516,13 @@ public:
}
private:
- float _img_width;
- float _img_height;
- float _scale;
- bool _apply_scale;
- bool _correct_transform_coords;
+ float _img_width;
+ float _img_height;
+ float _scale;
+ bool _apply_scale;
+ bool _correct_transform_coords;
std::array<float, 4> _weights;
- float _bbox_xform_clip;
-};
-
-/** Activation Layer Information class */
-class ActivationLayerInfo
-{
-public:
- /** Available activation functions */
- enum class ActivationFunction
- {
- LOGISTIC, /**< Logistic ( \f$ f(x) = \frac{1}{1 + e^{-x}} \f$ ) */
- TANH, /**< Hyperbolic tangent ( \f$ f(x) = a \cdot tanh(b \cdot x) \f$ ) */
- RELU, /**< Rectifier ( \f$ f(x) = max(0,x) \f$ ) */
- BOUNDED_RELU, /**< Upper Bounded Rectifier ( \f$ f(x) = min(a, max(0,x)) \f$ ) */
- LU_BOUNDED_RELU, /**< Lower and Upper Bounded Rectifier ( \f$ f(x) = min(a, max(b,x)) \f$ ) */
- LEAKY_RELU, /**< Leaky Rectifier ( \f$ f(x) = \begin{cases} \alpha x & \quad \text{if } x \text{ < 0}\\ x & \quad \text{if } x \geq \text{ 0 } \end{cases} \f$ ) */
- SOFT_RELU, /**< Soft Rectifier ( \f$ f(x)= log(1+e^x) \f$ ) */
- ELU, /**< Exponential Linear Unit ( \f$ f(x) = \begin{cases} \alpha (exp(x) - 1) & \quad \text{if } x \text{ < 0}\\ x & \quad \text{if } x \geq \text{ 0 } \end{cases} \f$ ) */
- ABS, /**< Absolute ( \f$ f(x)= |x| \f$ ) */
- SQUARE, /**< Square ( \f$ f(x)= x^2 \f$ )*/
- SQRT, /**< Square root ( \f$ f(x) = \sqrt{x} \f$ )*/
- LINEAR, /**< Linear ( \f$ f(x)= ax + b \f$ ) */
- IDENTITY, /**< Identity ( \f$ f(x)= x \f$ ) */
- HARD_SWISH /**< Hard-swish ( \f$ f(x) = (x * relu6(x+3))/6 \f$ ) */
- };
-
- ActivationLayerInfo() = default;
- /** Default Constructor
- *
- * @param[in] f The activation function to use.
- * @param[in] a (Optional) The alpha parameter used by some activation functions
- * (@ref ActivationFunction::BOUNDED_RELU, @ref ActivationFunction::LU_BOUNDED_RELU, @ref ActivationFunction::LINEAR, @ref ActivationFunction::TANH).
- * @param[in] b (Optional) The beta parameter used by some activation functions (@ref ActivationFunction::LINEAR, @ref ActivationFunction::LU_BOUNDED_RELU, @ref ActivationFunction::TANH).
- */
- ActivationLayerInfo(ActivationFunction f, float a = 0.0f, float b = 0.0f)
- : _act(f), _a(a), _b(b), _enabled(true)
- {
- }
- /** Get the type of activation function */
- ActivationFunction activation() const
- {
- return _act;
- }
- /** Get the alpha value */
- float a() const
- {
- return _a;
- }
- /** Get the beta value */
- float b() const
- {
- return _b;
- }
- /** Check if initialised */
- bool enabled() const
- {
- return _enabled;
- }
-
-private:
- ActivationFunction _act = { ActivationLayerInfo::ActivationFunction::IDENTITY };
- float _a = {};
- float _b = {};
- bool _enabled = { false };
-};
-
-/** Fully connected layer info */
-struct FullyConnectedLayerInfo
-{
- DataLayout weights_trained_layout{ DataLayout::NCHW }; /**< Layout that the weights have been trained with. */
- bool transpose_weights{ true }; /**< Transpose weights if true. */
- bool are_weights_reshaped{ false }; /**< Reshape the weights tensor if false. */
- bool retain_internal_weights{ false }; /**< Retain internal reshaped weights. */
- bool fp_mixed_precision{ false }; /**< Use wider accumulators (32 bit instead of 16 for FP16) to improve accuracy. */
- ActivationLayerInfo activation_info{}; /**< Fused activation to apply after the matrix multiplication. */
-
- /** Sets the weights trained data layout
- *
- * @param[in] layout Data layout that the weights were trained with
- *
- * @return Updated object
- */
- FullyConnectedLayerInfo &set_weights_trained_layout(DataLayout layout)
- {
- weights_trained_layout = layout;
- return *this;
- }
- /** Sets the transpose weights flag
- *
- * @param[in] should_transpose_weights Boolean flag indicating if weights should be transposed
- *
- * @return Updated object
- */
- FullyConnectedLayerInfo &set_transpose_weights(bool should_transpose_weights)
- {
- transpose_weights = should_transpose_weights;
- return *this;
- }
+ float _bbox_xform_clip;
};
/** Normalization Layer Information class */
@@ -1624,7 +1539,12 @@ public:
* @param[in] is_scaled (Optional) Boolean that specifies if alpha will be scaled by the normalization size or not.
     *                      Should be false to follow [Krizhevsky 2012].
*/
- NormalizationLayerInfo(NormType type, uint32_t norm_size = 5, float alpha = 0.0001f, float beta = 0.5f, float kappa = 1.f, bool is_scaled = true)
+ NormalizationLayerInfo(NormType type,
+ uint32_t norm_size = 5,
+ float alpha = 0.0001f,
+ float beta = 0.5f,
+ float kappa = 1.f,
+ bool is_scaled = true)
: _type(type), _norm_size(norm_size), _alpha(alpha), _beta(beta), _kappa(kappa), _is_scaled(is_scaled)
{
}
@@ -1690,13 +1610,74 @@ private:
bool _is_scaled;
};
+class StridedSliceLayerInfo
+{
+public:
+ /** Default Constructor
+ *
+ * @param[in] begin_mask (Optional) If the ith bit of begin_mask is set, starts[i] is ignored and the fullest possible range in that dimension is used instead.
+ * @param[in] end_mask (Optional) If the ith bit of end_mask is set, ends[i] is ignored and the fullest possible range in that dimension is used instead.
+ * @param[in] shrink_axis_mask (Optional) If the ith bit of shrink_axis_mask is set, it implies that the ith specification shrinks the dimensionality by 1.
+ */
+ StridedSliceLayerInfo(int32_t begin_mask = 0, int32_t end_mask = 0, int32_t shrink_axis_mask = 0)
+ : _begin_mask(begin_mask), _end_mask(end_mask), _shrink_axis_mask(shrink_axis_mask)
+ {
+ }
+
+    /** Get the begin mask value */
+ int32_t begin_mask() const
+ {
+ return _begin_mask;
+ }
+
+    /** Get the end mask value */
+ int32_t end_mask() const
+ {
+ return _end_mask;
+ }
+
+    /** Get the shrink axis mask value */
+ int32_t shrink_axis_mask() const
+ {
+ return _shrink_axis_mask;
+ }
+
+private:
+ int32_t _begin_mask;
+ int32_t _end_mask;
+ int32_t _shrink_axis_mask;
+};
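
The masks are plain per-dimension bit fields. A brief sketch of their meaning (hypothetical slice specification):

    // Bit 0 set in begin_mask and end_mask: starts[0] and ends[0] are ignored
    // and dimension 0 keeps its full range. A shrink_axis_mask of 0b100 would
    // instead drop dimension 2 from the output shape.
    StridedSliceLayerInfo slice_info(0b001 /* begin_mask */, 0b001 /* end_mask */, 0b000 /* shrink_axis_mask */);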
+
+// Decode helpers for the OHWIo<interleave_by>i<block_by> fixed-format weight
+// encoding: bits 8..19 hold interleave_by, bits 20..23 hold block_by, and
+// bit 4 marks the fast-math variants.
+inline int interleave_by(const WeightFormat wf)
+{
+ return (static_cast<int>(wf) >> 8) & 0xFFF;
+}
+inline int block_by(const WeightFormat wf)
+{
+ return (static_cast<int>(wf) >> 20) & 0xF;
+}
+inline bool is_fixed_format(const WeightFormat &wf)
+{
+ return wf != WeightFormat::UNSPECIFIED && wf != WeightFormat::ANY;
+}
+inline bool is_fixed_format_fast_math(const WeightFormat &wf)
+{
+ return (static_cast<int>(wf) >> 4) & 0x1;
+}
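
Given the shifts and masks above, a worked decode, assuming an enumerator such as WeightFormat::OHWIo4i2 encoded with this packing:

    const arm_compute::WeightFormat wf = arm_compute::WeightFormat::OHWIo4i2; // hypothetical value
    // interleave_by(wf) == 4               : output channels laid out in groups of 4
    // block_by(wf) == 2                    : input channels blocked in pairs
    // is_fixed_format(wf) == true          : neither UNSPECIFIED nor ANY
    // is_fixed_format_fast_math(wf) == false : bit 4 is only set on fast-math variants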
+
/** Convolution Layer Weights Information class. This class stores the necessary information to compute convolution layer when the weights are already reshaped */
class WeightsInfo
{
public:
/** Default constructor */
WeightsInfo()
- : _are_reshaped(false), _kernel_width(0), _kernel_height(0), _num_kernels(0), _retain_internal_weights(false)
+ : _are_reshaped(false),
+ _kernel_width(0),
+ _kernel_height(0),
+ _num_kernels(0),
+ _retain_internal_weights(false),
+ _weight_format(arm_compute::WeightFormat::UNSPECIFIED)
{
}
/** Constructor
@@ -1706,9 +1687,20 @@ public:
* @param[in] kernel_height Kernel height.
* @param[in] num_kernels Number of convolution kernels.
* @param[in] retain_internal_weights (Optional) True if internal reshaped weights must be retained. Used for reconfiguration purposes. Default is false.
+     * @param[in] weight_format           (Optional) arm_compute::WeightFormat enumeration requested by the user. Defaults to arm_compute::WeightFormat::UNSPECIFIED.
*/
- WeightsInfo(bool are_reshaped, unsigned int kernel_width, unsigned int kernel_height, unsigned int num_kernels, bool retain_internal_weights = false)
- : _are_reshaped(are_reshaped), _kernel_width(kernel_width), _kernel_height(kernel_height), _num_kernels(num_kernels), _retain_internal_weights(retain_internal_weights)
+ WeightsInfo(bool are_reshaped,
+ unsigned int kernel_width,
+ unsigned int kernel_height,
+ unsigned int num_kernels,
+ bool retain_internal_weights = false,
+ arm_compute::WeightFormat weight_format = arm_compute::WeightFormat::UNSPECIFIED)
+ : _are_reshaped(are_reshaped),
+ _kernel_width(kernel_width),
+ _kernel_height(kernel_height),
+ _num_kernels(num_kernels),
+ _retain_internal_weights(retain_internal_weights),
+ _weight_format(weight_format)
{
}
/** Flag which specifies if the weights tensor has been reshaped.
@@ -1739,22 +1731,40 @@ public:
{
return _retain_internal_weights;
}
+ arm_compute::WeightFormat weight_format() const
+ {
+ return _weight_format;
+ }
+ void set_weight_format(arm_compute::WeightFormat weight_format)
+ {
+ _weight_format = weight_format;
+ }
+
+ unsigned int kernel_width() const
+ {
+ return _kernel_width;
+ }
+ unsigned int kernel_height() const
+ {
+ return _kernel_height;
+ }
private:
- const bool _are_reshaped;
- const unsigned int _kernel_width;
- const unsigned int _kernel_height;
- const unsigned int _num_kernels;
- const bool _retain_internal_weights;
+ bool _are_reshaped;
+ unsigned int _kernel_width;
+ unsigned int _kernel_height;
+ unsigned int _num_kernels;
+ bool _retain_internal_weights;
+ arm_compute::WeightFormat _weight_format;
};
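
A sketch of the intended fixed-format flow: request any layout, let the backend negotiate one, then query the chosen format (WeightFormat::OHWIo4 is a hypothetical concrete layout):

    WeightsInfo weights_info(false /* are_reshaped */, 3U, 3U, 64U,
                             false /* retain_internal_weights */,
                             arm_compute::WeightFormat::ANY);
    // ... once the backend has selected a concrete layout, record it:
    weights_info.set_weight_format(arm_compute::WeightFormat::OHWIo4);
    if (is_fixed_format(weights_info.weight_format()))
    {
        const int iby = interleave_by(weights_info.weight_format()); // e.g. 4
        // re-order the OHWI weight tensor in blocks of iby output channels
    }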
/** GEMM reshape information class. This class stores the necessary information about matrix A and matrix B reshape.
*
- * The matrix A can only be reshaped through @ref CLGEMMReshapeLHSMatrixKernel or @ref NEGEMMInterleave4x4Kernel or @ref GCGEMMInterleave4x4Kernel
- * Note: Optionally just for @ref CLGEMMReshapeLHSMatrixKernel is it possible to set mult_interleave4x4_height, the multiplication factor for the height of the 4x4 interleaved block
+ * The matrix A can only be reshaped through @ref opencl::kernels::ClGemmReshapeLhsMatrixKernel or @ref cpu::kernels::CpuGemmInterleave4x4Kernel
+ * Note: Only for @ref opencl::kernels::ClGemmReshapeLhsMatrixKernel is it possible to set mult_interleave4x4_height, the multiplication factor for the height of the 4x4 interleaved block.
*
- * The matrix B can only be reshaped through @ref CLGEMMReshapeRHSMatrixKernel or @ref NEGEMMTranspose1xWKernel or @ref GCGEMMTranspose1xWKernel
- * Note: Optionally just for @ref CLGEMMReshapeRHSMatrixKernel is it possible to set mult_transpose1xW_width, the multiplication factor for the width of the 1xW transposed block
+ * The matrix B can only be reshaped through @ref opencl::kernels::ClGemmReshapeRhsMatrixKernel or @ref cpu::kernels::CpuGemmTranspose1xWKernel
+ * Note: Only for @ref opencl::kernels::ClGemmReshapeRhsMatrixKernel is it possible to set mult_transpose1xW_width, the multiplication factor for the width of the 1xW transposed block.
*
*/
class GEMMReshapeInfo final
@@ -1762,7 +1772,14 @@ class GEMMReshapeInfo final
public:
/** Default constructor */
GEMMReshapeInfo()
- : _m(1), _n(1), _k(1), _mult_transpose1xW_width(1), _mult_interleave4x4_height(1), _depth_output_gemm3d(0), _reinterpret_input_as_3d(false), _broadcast_bias(false)
+ : _m(1),
+ _n(1),
+ _k(1),
+ _mult_transpose1xW_width(1),
+ _mult_interleave4x4_height(1),
+ _depth_output_gemm3d(0),
+ _reinterpret_input_as_3d(false),
+ _broadcast_bias(false)
{
}
/** Constructor
@@ -1778,9 +1795,22 @@ public:
* to perform 1x1 convolutions with the NHWC data layout)
* @param[in] broadcast_bias (Optional) Broadcast the shape of the bias tensor from a vector to a matrix.
*/
- GEMMReshapeInfo(int m, int n, int k, int mult_transpose1xW_width = 1, int mult_interleave4x4_height = 1, int depth_output_gemm3d = 0, bool reinterpret_input_as_3d = false, bool broadcast_bias = false)
- : _m(m), _n(n), _k(k), _mult_transpose1xW_width(mult_transpose1xW_width), _mult_interleave4x4_height(mult_interleave4x4_height), _depth_output_gemm3d(depth_output_gemm3d),
- _reinterpret_input_as_3d(reinterpret_input_as_3d), _broadcast_bias(broadcast_bias)
+ GEMMReshapeInfo(int m,
+ int n,
+ int k,
+ int mult_transpose1xW_width = 1,
+ int mult_interleave4x4_height = 1,
+ int depth_output_gemm3d = 0,
+ bool reinterpret_input_as_3d = false,
+ bool broadcast_bias = false)
+ : _m(m),
+ _n(n),
+ _k(k),
+ _mult_transpose1xW_width(mult_transpose1xW_width),
+ _mult_interleave4x4_height(mult_interleave4x4_height),
+ _depth_output_gemm3d(depth_output_gemm3d),
+ _reinterpret_input_as_3d(reinterpret_input_as_3d),
+ _broadcast_bias(broadcast_bias)
{
}
/** Number of matrix A rows
@@ -1852,45 +1882,14 @@ public:
};
private:
- const int _m;
- const int _n;
- const int _k;
- const int _mult_transpose1xW_width;
- const int _mult_interleave4x4_height;
- const int _depth_output_gemm3d;
- const bool _reinterpret_input_as_3d;
- const bool _broadcast_bias;
-};
-
-struct DepthwiseConvolutionReshapeInfo
-{
- unsigned int c0{ 1 }; /**< Number of channels processed by the depth-wise convolution */
- bool transpose{ false }; /**< True if the block MxC0 (where M is the area of the filter i.e. KwxKh) has to be transposed */
-};
-
-/** GEMMLowp output stage type */
-enum class GEMMLowpOutputStageType
-{
- NONE, /**< No quantization */
- QUANTIZE_DOWN, /**< Quantize using an integer multiplication */
- QUANTIZE_DOWN_FIXEDPOINT, /**< Quantize using a fixed point multiplication */
- QUANTIZE_DOWN_FLOAT /**< Quantize using a floating point multiplication */
-};
-
-/** GEMMLowp output stage info */
-struct GEMMLowpOutputStageInfo
-{
- GEMMLowpOutputStageType type{ GEMMLowpOutputStageType::NONE }; /**< GEMMLowp output stage type */
- int32_t gemmlowp_offset{ 0 }; /**< GEMMLowp output stage offset used for quantizing to QASYMM8 */
- int32_t gemmlowp_multiplier{ 0 }; /**< GEMMLowp output stage multiplier used for quantizing to QASYMM8 */
- int32_t gemmlowp_shift{ 0 }; /**< GEMMLowp output stage shift used for quantizing to uint8 */
- int32_t gemmlowp_min_bound{ std::numeric_limits<int32_t>::lowest() }; /**< GEMMLowp min value used to saturate down the output result before converting back to QASYMM8 */
- int32_t gemmlowp_max_bound{ std::numeric_limits<int32_t>::max() }; /**< GEMMLowp max value used to saturate down the output result before converting back to QASYMM8 */
- std::vector<int32_t> gemmlowp_multipliers{}; /**< GEMMLowp output stage multiplier used for quantizing to QASYMM8 */
- std::vector<int32_t> gemmlowp_shifts{}; /**< GEMMLowp output stage multiplier used for quantizing to QASYMM8 */
- float gemmlowp_real_multiplier{ 0 }; /**< GEMMLowp output stage real multiplier used for quantizing to QASYMM8 */
- bool is_quantized_per_channel{ false }; /**< GEMMLowp quantized per-channel flag */
- DataType output_data_type{ DataType::UNKNOWN }; /**< Output tensor data type to use if the output is not initialized */
+ int _m;
+ int _n;
+ int _k;
+ int _mult_transpose1xW_width;
+ int _mult_interleave4x4_height;
+ int _depth_output_gemm3d;
+ bool _reinterpret_input_as_3d;
+ bool _broadcast_bias;
};
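
A sketch of a typical instantiation, e.g. when GEMM backs a 1x1 NHWC convolution and the output is reinterpreted as 3D (dimensions are illustrative):

    // M = 64 rows, N = 128 columns, K = 32 accumulations per output element,
    // default reshape multipliers, output depth 4, input reinterpreted as 3D.
    GEMMReshapeInfo reshape_info(64, 128, 32, 1, 1, 4 /* depth_output_gemm3d */,
                                 true /* reinterpret_input_as_3d */);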
/** GEMM LHS (Left Hand Side) matrix information */
@@ -1901,211 +1900,31 @@ struct GEMMLHSMatrixInfo
: m0(m), k0(k), v0(v), transpose(trans), interleave(inter)
{
}
- unsigned int m0{ 1 }; /**< Number of rows processed by the matrix multiplication */
- unsigned int k0{ 1 }; /**< Number of partial accumulations performed by the matrix multiplication */
- unsigned int v0{ 1 }; /**< Number of vertical blocks of size (m0xk0) stored on the same output row */
- bool transpose{ true }; /**< True if the (m0xk0) block has to be transposed before been stored */
- bool interleave{ true }; /**< True if the v0 (m0xk0) blocks have to be interleaved in the output row */
+ unsigned int m0{1}; /**< Number of rows processed by the matrix multiplication */
+ unsigned int k0{1}; /**< Number of partial accumulations performed by the matrix multiplication */
+ unsigned int v0{1}; /**< Number of vertical blocks of size (m0xk0) stored on the same output row */
+ bool transpose{true}; /**< True if the (m0xk0) block has to be transposed before been stored */
+ bool interleave{true}; /**< True if the v0 (m0xk0) blocks have to be interleaved in the output row */
};
/** GEMM RHS (Right Hand Side) matrix information */
struct GEMMRHSMatrixInfo
{
GEMMRHSMatrixInfo() = default;
- GEMMRHSMatrixInfo(unsigned int n, unsigned int k, unsigned int h, bool trans, bool inter)
- : n0(n), k0(k), h0(h), transpose(trans), interleave(inter)
+ GEMMRHSMatrixInfo(unsigned int n, unsigned int k, unsigned int h, bool trans, bool inter, bool export_to_cl_img)
+ : n0(n), k0(k), h0(h), transpose(trans), interleave(inter), export_to_cl_image(export_to_cl_img)
{
}
- unsigned int n0{ 1 }; /**< Number of columns processed by the matrix multiplication */
- unsigned int k0{ 1 }; /**< Number of partial accumulations performed by the matrix multiplication */
- unsigned int h0{ 1 }; /**< Number of horizontal blocks of size (k0xn0) stored on the same output row */
- bool transpose{ true }; /**< True if the (k0xn0) block has to be transposed before been stored */
- bool interleave{ true }; /**< True if the h0 (k0xn0) blocks have to be interleaved in the output row */
- bool export_to_cl_image{ false }; /**< True if the reshaped rhs has to be exported to cl_image. n0 must be equal to 4 */
+ unsigned int n0{1}; /**< Number of columns processed by the matrix multiplication */
+ unsigned int k0{1}; /**< Number of partial accumulations performed by the matrix multiplication */
+ unsigned int h0{1}; /**< Number of horizontal blocks of size (k0xn0) stored on the same output row */
+ bool transpose{true}; /**< True if the (k0xn0) block has to be transposed before been stored */
+ bool interleave{true}; /**< True if the h0 (k0xn0) blocks have to be interleaved in the output row */
+    bool export_to_cl_image{false}; /**< True if the reshaped rhs has to be exported to cl_image. n0 must be equal to 4 */
};
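
A sketch of the new six-argument constructor; per the comment above, export_to_cl_image requires n0 == 4 (block sizes are illustrative):

    GEMMRHSMatrixInfo rhs_info(4 /* n0 */, 4 /* k0 */, 8 /* h0 */,
                               false /* transpose */, true /* interleave */,
                               true /* export_to_cl_image, valid since n0 == 4 */);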
-/** GEMM information class. This class stores the necessary information to compute GEMM functions
- *
- * This object also contains the information about how matrix A and matrix B have been reshaped
- *
- */
-class GEMMInfo
-{
-public:
- /** Default constructor */
- GEMMInfo() noexcept
- : _is_a_reshaped(false),
- _is_b_reshaped(false),
- _reshape_b_only_on_first_run(true),
- _depth_output_gemm3d(0),
- _reinterpret_input_as_3d(false),
- _retain_internal_weights(false),
- _gemmlowp_output_stage(),
- _fp_mixed_precision(false),
- _broadcast_bias(false),
- _pretranpose_B(true),
- _activation_info()
- {
- }
- /** Constructor
- *
- * @param[in] is_a_reshaped True if the matrix A has been reshaped
- * @param[in] is_b_reshaped True if the matrix B has been reshaped
- * @param[in] reshape_b_only_on_first_run Reshape matrix B only for the first run
- * @param[in] depth_output_gemm3d (Optional) Depth (third dimension) of the output tensor to be used with the GEMM3D kernel
- * If 0 the output will not be reinterpreted as 3D. Default 0
- * @param[in] reinterpret_input_as_3d (Optional) Reinterpret the input as 3D tensor. (i.e. this flag should be set to true when GEMM is used
- * to perform 1x1 convolutions with the NHWC data layout)
- * @param[in] retain_internal_weights (Optional) Retain the weights tensor from previous run
- * @param[in] gemmlowp_output_stage (Optional) GEMMLowp Output stage info
- * @param[in] fp_mixed_precision (Optional) Use wider accumulators (32 bit instead of 16 for FP16) to improve accuracy.
- * @param[in] broadcast_bias (Optional) Broadcast the shape of the bias tensor from a vector to a matrix.
- * @param[in] activation_info (Optional) Activation to apply after the matrix multiplication
- */
- GEMMInfo(bool is_a_reshaped, bool is_b_reshaped, bool reshape_b_only_on_first_run, int depth_output_gemm3d = 0, bool reinterpret_input_as_3d = false, bool retain_internal_weights = false,
- GEMMLowpOutputStageInfo gemmlowp_output_stage = GEMMLowpOutputStageInfo(), bool fp_mixed_precision = false, bool broadcast_bias = false,
- const ActivationLayerInfo &activation_info = ActivationLayerInfo()) noexcept
- : _is_a_reshaped(is_a_reshaped),
- _is_b_reshaped(is_b_reshaped),
- _reshape_b_only_on_first_run(reshape_b_only_on_first_run),
- _depth_output_gemm3d(depth_output_gemm3d),
- _reinterpret_input_as_3d(reinterpret_input_as_3d),
- _retain_internal_weights(retain_internal_weights),
- _gemmlowp_output_stage(gemmlowp_output_stage),
- _fp_mixed_precision(fp_mixed_precision),
- _broadcast_bias(broadcast_bias),
- _pretranpose_B(reshape_b_only_on_first_run),
- _activation_info(activation_info)
- {
- }
- /** Flag which specifies if the matrix A has been reshaped
- *
- * @return True if the matrix A has been reshaped
- */
- bool is_a_reshaped() const
- {
- return _is_a_reshaped;
- };
- /** Flag which specifies if the matrix B has been reshaped
- *
- * @return True if the matrix B has been reshaped
- */
- bool is_b_reshaped() const
- {
- return _is_b_reshaped;
- };
- /** Flag which specifies if the reshape of matrix B should executed only for the first
- *
- * @note This flag could be set to TRUE when GEMM is used to accelerate convolution layer
- *
- * @return True if the reshaped of matrix B happens only for the first run
- */
- bool reshape_b_only_on_first_run() const
- {
- return _reshape_b_only_on_first_run;
- };
- /** Depth of the output when GEMM output is reinterpreted as 3D tensor
- *
- * @return the depth of the output tensor
- */
- int depth_output_gemm3d() const
- {
- return _depth_output_gemm3d;
- };
- /** Flag which specifies if the input tensor has to be reinterpreted as 3D
- *
- * @return True if the input tensor has to be reinterpreted as 3D tensor
- */
- bool reinterpret_input_as_3d() const
- {
- return _reinterpret_input_as_3d;
- };
- /** Flag which specifies if the weights tensor has to be retained from previous run
- *
- * @return True if the weights tensor has to be retained
- */
- bool retain_internal_weights() const
- {
- return _retain_internal_weights;
- };
- /** GEMMLowp output stage
- *
- * @return the GEMMLowp output stage info
- */
- GEMMLowpOutputStageInfo gemmlowp_output_stage() const
- {
- return _gemmlowp_output_stage;
- };
- /** Sets GEMMLowp output stage
- *
- * @param[in] output_stage Output stage to set
- */
- void set_gemmlowp_output_stage(GEMMLowpOutputStageInfo &output_stage)
- {
- _gemmlowp_output_stage = output_stage;
- };
- /** Flag which specifies if a wider accumulator should be used.
- *
- * @return True if a wider accumulator has to be used
- */
- bool fp_mixed_precision() const
- {
- return _fp_mixed_precision;
- };
- /** Flag which specifies whether to broadcast the shape of the bias tensor.
- *
- * @return True if the shape of the bias tensor is to be broadcasted.
- */
- bool broadcast_bias() const
- {
- return _broadcast_bias;
- };
- /** Flag which specifies whether b should be pre-transposed if supported.
- *
- * @return True if b should be pre-transposed else false.
- */
- bool pretranpose_B() const
- {
- return _pretranpose_B;
- };
- /** Set pre-transpose b flag
- *
- * @param[in] flag Flag to set
- */
- void set_pretranpose_B(bool flag)
- {
- _pretranpose_B = flag;
- }
- /** Activation layer to apply after the matrix multiplication
- *
- * @return ActivationLayerInfo object
- */
- ActivationLayerInfo activation_info() const
- {
- return _activation_info;
- }
- /** Set activation layer info
- *
- * @param[in] activation_info ActivationLayerInfo object to set
- */
- void set_activation_info(const ActivationLayerInfo &activation_info)
- {
- _activation_info = activation_info;
- }
-
-private:
- bool _is_a_reshaped;
- bool _is_b_reshaped;
- bool _reshape_b_only_on_first_run;
- int _depth_output_gemm3d;
- bool _reinterpret_input_as_3d;
- bool _retain_internal_weights;
- GEMMLowpOutputStageInfo _gemmlowp_output_stage;
- bool _fp_mixed_precision;
- bool _broadcast_bias;
- bool _pretranpose_B;
- ActivationLayerInfo _activation_info;
-};
+class ITensorInfo;
/** Winograd information */
struct WinogradInfo
@@ -2118,16 +1937,23 @@ struct WinogradInfo
* @param[in] conv_info Convolution info (Pads, strides)
* @param[in] data_layout Data layout to use for the output tensor once the convolution has been applied
*/
- WinogradInfo(Size2D output_tile_sz, Size2D kernel_sz, Size2D input_dims, PadStrideInfo conv_info, DataLayout data_layout)
- : output_tile_size(output_tile_sz), kernel_size(kernel_sz), input_dimensions(input_dims), convolution_info(conv_info), output_data_layout(data_layout)
- {
- }
-
- Size2D output_tile_size{}; /**< Width and height of the output tile */
- Size2D kernel_size{}; /**< Width and height of the kernel*/
- Size2D input_dimensions{}; /**< Width and height of the input tensor before the convolution is applied */
- PadStrideInfo convolution_info{}; /**< Convolution info (Pads, strides,...) */
- DataLayout output_data_layout{ DataLayout::NCHW }; /**< Data layout to use for the output tensor once the convolution has been applied (NCHW or NHWC) */
+ WinogradInfo(
+ Size2D output_tile_sz, Size2D kernel_sz, Size2D input_dims, PadStrideInfo conv_info, DataLayout data_layout)
+ : output_tile_size(output_tile_sz),
+ kernel_size(kernel_sz),
+ input_dimensions(input_dims),
+ convolution_info(conv_info),
+ output_data_layout(data_layout)
+ {
+ }
+
+ Size2D output_tile_size{}; /**< Width and height of the output tile */
+ Size2D kernel_size{}; /**< Width and height of the kernel*/
+ Size2D input_dimensions{}; /**< Width and height of the input tensor before the convolution is applied */
+ PadStrideInfo convolution_info{}; /**< Convolution info (Pads, strides,...) */
+    DataLayout output_data_layout{DataLayout::NCHW}; /**< Data layout to use for the output tensor once the convolution has been applied (NCHW or NHWC) */
};
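
A sketch of a common configuration, Winograd F(4x4, 3x3) over a 224x224 input with unit stride and one element of padding (values are illustrative):

    WinogradInfo wino_info(Size2D(4U, 4U) /* output tile */,
                           Size2D(3U, 3U) /* kernel */,
                           Size2D(224U, 224U) /* input W x H */,
                           PadStrideInfo(1, 1, 1, 1) /* stride 1, pad 1 */,
                           DataLayout::NHWC);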
 /** IO formatting information class */
@@ -2186,5 +2012,8 @@ struct IOFormatInfo
/** Align columns */
bool align_columns;
};
+
+/** Type alias for holding information related to cropping */
+using CropInfo = Padding2D;
} // namespace arm_compute
-#endif /* ARM_COMPUTE_TYPES_H */
+#endif // ACL_ARM_COMPUTE_CORE_TYPES_H
diff --git a/arm_compute/core/Utils.h b/arm_compute/core/Utils.h
index eff6157b1f..a2146522f7 100644
--- a/arm_compute/core/Utils.h
+++ b/arm_compute/core/Utils.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2016-2020 ARM Limited.
+ * Copyright (c) 2016-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -26,63 +26,29 @@
#include "arm_compute/core/Error.h"
#include "arm_compute/core/PixelValue.h"
-#include "arm_compute/core/Rounding.h"
#include "arm_compute/core/Types.h"
-#include "arm_compute/core/Version.h"
-#include <algorithm>
-#include <cstdint>
-#include <cstdlib>
-#include <iomanip>
+#include <cmath>
#include <numeric>
#include <sstream>
#include <string>
#include <type_traits>
+#include <unordered_map>
#include <utility>
-#include <vector>
-namespace arm_compute
-{
-/** Calculate the rounded up quotient of val / m.
- *
- * @param[in] val Value to divide and round up.
- * @param[in] m Value to divide by.
- *
- * @return the result.
- */
-template <typename S, typename T>
-constexpr auto DIV_CEIL(S val, T m) -> decltype((val + m - 1) / m)
-{
- return (val + m - 1) / m;
-}
-
-/** Computes the smallest number larger or equal to value that is a multiple of divisor.
- *
- * @param[in] value Lower bound value
- * @param[in] divisor Value to compute multiple of.
- *
- * @return the result.
- */
-template <typename S, typename T>
-inline auto ceil_to_multiple(S value, T divisor) -> decltype(((value + divisor - 1) / divisor) * divisor)
-{
- ARM_COMPUTE_ERROR_ON(value < 0 || divisor <= 0);
- return DIV_CEIL(value, divisor) * divisor;
-}
+/* Convenience / backwards compatibility includes */
+#include "arm_compute/core/utils/ActivationFunctionUtils.h"
+#include "arm_compute/core/utils/DataLayoutUtils.h"
+#include "arm_compute/core/utils/DataTypeUtils.h"
+#include "arm_compute/core/utils/FormatUtils.h"
+#include "arm_compute/core/utils/InterpolationPolicyUtils.h"
+#include "arm_compute/core/utils/StringUtils.h"
-/** Computes the largest number smaller or equal to value that is a multiple of divisor.
- *
- * @param[in] value Upper bound value
- * @param[in] divisor Value to compute multiple of.
- *
- * @return the result.
- */
-template <typename S, typename T>
-inline auto floor_to_multiple(S value, T divisor) -> decltype((value / divisor) * divisor)
+namespace arm_compute
{
- ARM_COMPUTE_ERROR_ON(value < 0 || divisor <= 0);
- return (value / divisor) * divisor;
-}
+class ITensor;
+class ITensorInfo;
+class ActivationLayerInfo;
/** Load an entire file in memory
*
@@ -93,814 +59,6 @@ inline auto floor_to_multiple(S value, T divisor) -> decltype((value / divisor)
*/
std::string read_file(const std::string &filename, bool binary);
-/** The size in bytes of the data type
- *
- * @param[in] data_type Input data type
- *
- * @return The size in bytes of the data type
- */
-inline size_t data_size_from_type(DataType data_type)
-{
- switch(data_type)
- {
- case DataType::U8:
- case DataType::S8:
- case DataType::QSYMM8:
- case DataType::QASYMM8:
- case DataType::QASYMM8_SIGNED:
- case DataType::QSYMM8_PER_CHANNEL:
- return 1;
- case DataType::U16:
- case DataType::S16:
- case DataType::QSYMM16:
- case DataType::QASYMM16:
- case DataType::BFLOAT16:
- case DataType::F16:
- return 2;
- case DataType::F32:
- case DataType::U32:
- case DataType::S32:
- return 4;
- case DataType::F64:
- case DataType::U64:
- case DataType::S64:
- return 8;
- case DataType::SIZET:
- return sizeof(size_t);
- default:
- ARM_COMPUTE_ERROR("Invalid data type");
- return 0;
- }
-}
-
-/** The size in bytes of the pixel format
- *
- * @param[in] format Input format
- *
- * @return The size in bytes of the pixel format
- */
-inline size_t pixel_size_from_format(Format format)
-{
- switch(format)
- {
- case Format::U8:
- return 1;
- case Format::U16:
- case Format::S16:
- case Format::BFLOAT16:
- case Format::F16:
- case Format::UV88:
- case Format::YUYV422:
- case Format::UYVY422:
- return 2;
- case Format::RGB888:
- return 3;
- case Format::RGBA8888:
- return 4;
- case Format::U32:
- case Format::S32:
- case Format::F32:
- return 4;
- //Doesn't make sense for planar formats:
- case Format::NV12:
- case Format::NV21:
- case Format::IYUV:
- case Format::YUV444:
- default:
- ARM_COMPUTE_ERROR("Undefined pixel size for given format");
- return 0;
- }
-}
-
-/** The size in bytes of the data type
- *
- * @param[in] dt Input data type
- *
- * @return The size in bytes of the data type
- */
-inline size_t element_size_from_data_type(DataType dt)
-{
- switch(dt)
- {
- case DataType::S8:
- case DataType::U8:
- case DataType::QSYMM8:
- case DataType::QASYMM8:
- case DataType::QASYMM8_SIGNED:
- case DataType::QSYMM8_PER_CHANNEL:
- return 1;
- case DataType::U16:
- case DataType::S16:
- case DataType::QSYMM16:
- case DataType::QASYMM16:
- case DataType::BFLOAT16:
- case DataType::F16:
- return 2;
- case DataType::U32:
- case DataType::S32:
- case DataType::F32:
- return 4;
- default:
- ARM_COMPUTE_ERROR("Undefined element size for given data type");
- return 0;
- }
-}
-
-/** Return the data type used by a given single-planar pixel format
- *
- * @param[in] format Input format
- *
- * @return The size in bytes of the pixel format
- */
-inline DataType data_type_from_format(Format format)
-{
- switch(format)
- {
- case Format::U8:
- case Format::UV88:
- case Format::RGB888:
- case Format::RGBA8888:
- case Format::YUYV422:
- case Format::UYVY422:
- return DataType::U8;
- case Format::U16:
- return DataType::U16;
- case Format::S16:
- return DataType::S16;
- case Format::U32:
- return DataType::U32;
- case Format::S32:
- return DataType::S32;
- case Format::BFLOAT16:
- return DataType::BFLOAT16;
- case Format::F16:
- return DataType::F16;
- case Format::F32:
- return DataType::F32;
- //Doesn't make sense for planar formats:
- case Format::NV12:
- case Format::NV21:
- case Format::IYUV:
- case Format::YUV444:
- default:
- ARM_COMPUTE_ERROR("Not supported data_type for given format");
- return DataType::UNKNOWN;
- }
-}
-
-/** Return the plane index of a given channel given an input format.
- *
- * @param[in] format Input format
- * @param[in] channel Input channel
- *
- * @return The plane index of the specific channel of the specific format
- */
-inline int plane_idx_from_channel(Format format, Channel channel)
-{
- switch(format)
- {
- // Single planar formats have a single plane
- case Format::U8:
- case Format::U16:
- case Format::S16:
- case Format::U32:
- case Format::S32:
- case Format::BFLOAT16:
- case Format::F16:
- case Format::F32:
- case Format::UV88:
- case Format::RGB888:
- case Format::RGBA8888:
- case Format::YUYV422:
- case Format::UYVY422:
- return 0;
- // Multi planar formats
- case Format::NV12:
- case Format::NV21:
- {
- // Channel U and V share the same plane of format UV88
- switch(channel)
- {
- case Channel::Y:
- return 0;
- case Channel::U:
- case Channel::V:
- return 1;
- default:
- ARM_COMPUTE_ERROR("Not supported channel");
- return 0;
- }
- }
- case Format::IYUV:
- case Format::YUV444:
- {
- switch(channel)
- {
- case Channel::Y:
- return 0;
- case Channel::U:
- return 1;
- case Channel::V:
- return 2;
- default:
- ARM_COMPUTE_ERROR("Not supported channel");
- return 0;
- }
- }
- default:
- ARM_COMPUTE_ERROR("Not supported format");
- return 0;
- }
-}
-
-/** Return the channel index of a given channel given an input format.
- *
- * @param[in] format Input format
- * @param[in] channel Input channel
- *
- * @return The channel index of the specific channel of the specific format
- */
-inline int channel_idx_from_format(Format format, Channel channel)
-{
- switch(format)
- {
- case Format::RGB888:
- {
- switch(channel)
- {
- case Channel::R:
- return 0;
- case Channel::G:
- return 1;
- case Channel::B:
- return 2;
- default:
- ARM_COMPUTE_ERROR("Not supported channel");
- return 0;
- }
- }
- case Format::RGBA8888:
- {
- switch(channel)
- {
- case Channel::R:
- return 0;
- case Channel::G:
- return 1;
- case Channel::B:
- return 2;
- case Channel::A:
- return 3;
- default:
- ARM_COMPUTE_ERROR("Not supported channel");
- return 0;
- }
- }
- case Format::YUYV422:
- {
- switch(channel)
- {
- case Channel::Y:
- return 0;
- case Channel::U:
- return 1;
- case Channel::V:
- return 3;
- default:
- ARM_COMPUTE_ERROR("Not supported channel");
- return 0;
- }
- }
- case Format::UYVY422:
- {
- switch(channel)
- {
- case Channel::Y:
- return 1;
- case Channel::U:
- return 0;
- case Channel::V:
- return 2;
- default:
- ARM_COMPUTE_ERROR("Not supported channel");
- return 0;
- }
- }
- case Format::NV12:
- {
- switch(channel)
- {
- case Channel::Y:
- return 0;
- case Channel::U:
- return 0;
- case Channel::V:
- return 1;
- default:
- ARM_COMPUTE_ERROR("Not supported channel");
- return 0;
- }
- }
- case Format::NV21:
- {
- switch(channel)
- {
- case Channel::Y:
- return 0;
- case Channel::U:
- return 1;
- case Channel::V:
- return 0;
- default:
- ARM_COMPUTE_ERROR("Not supported channel");
- return 0;
- }
- }
- case Format::YUV444:
- case Format::IYUV:
- {
- switch(channel)
- {
- case Channel::Y:
- return 0;
- case Channel::U:
- return 0;
- case Channel::V:
- return 0;
- default:
- ARM_COMPUTE_ERROR("Not supported channel");
- return 0;
- }
- }
- default:
- ARM_COMPUTE_ERROR("Not supported format");
- return 0;
- }
-}
-
-/** Return the number of planes for a given format
- *
- * @param[in] format Input format
- *
- * @return The number of planes for a given image format.
- */
-inline size_t num_planes_from_format(Format format)
-{
- switch(format)
- {
- case Format::U8:
- case Format::S16:
- case Format::U16:
- case Format::S32:
- case Format::U32:
- case Format::BFLOAT16:
- case Format::F16:
- case Format::F32:
- case Format::RGB888:
- case Format::RGBA8888:
- case Format::YUYV422:
- case Format::UYVY422:
- return 1;
- case Format::NV12:
- case Format::NV21:
- return 2;
- case Format::IYUV:
- case Format::YUV444:
- return 3;
- default:
- ARM_COMPUTE_ERROR("Not supported format");
- return 0;
- }
-}
-
-/** Return the number of channels for a given single-planar pixel format
- *
- * @param[in] format Input format
- *
- * @return The number of channels for a given image format.
- */
-inline size_t num_channels_from_format(Format format)
-{
- switch(format)
- {
- case Format::U8:
- case Format::U16:
- case Format::S16:
- case Format::U32:
- case Format::S32:
- case Format::BFLOAT16:
- case Format::F16:
- case Format::F32:
- return 1;
- // Because the U and V channels are subsampled
- // these formats appear like having only 2 channels:
- case Format::YUYV422:
- case Format::UYVY422:
- return 2;
- case Format::UV88:
- return 2;
- case Format::RGB888:
- return 3;
- case Format::RGBA8888:
- return 4;
- //Doesn't make sense for planar formats:
- case Format::NV12:
- case Format::NV21:
- case Format::IYUV:
- case Format::YUV444:
- default:
- return 0;
- }
-}
-
-/** Return the promoted data type of a given data type.
- *
- * @note If promoted data type is not supported an error will be thrown
- *
- * @param[in] dt Data type to get the promoted type of.
- *
- * @return Promoted data type
- */
-inline DataType get_promoted_data_type(DataType dt)
-{
- switch(dt)
- {
- case DataType::U8:
- return DataType::U16;
- case DataType::S8:
- return DataType::S16;
- case DataType::U16:
- return DataType::U32;
- case DataType::S16:
- return DataType::S32;
- case DataType::QSYMM8:
- case DataType::QASYMM8:
- case DataType::QASYMM8_SIGNED:
- case DataType::QSYMM8_PER_CHANNEL:
- case DataType::QSYMM16:
- case DataType::QASYMM16:
- case DataType::BFLOAT16:
- case DataType::F16:
- case DataType::U32:
- case DataType::S32:
- case DataType::F32:
- ARM_COMPUTE_ERROR("Unsupported data type promotions!");
- default:
- ARM_COMPUTE_ERROR("Undefined data type!");
- }
- return DataType::UNKNOWN;
-}
-
-/** Compute the mininum and maximum values a data type can take
- *
- * @param[in] dt Data type to get the min/max bounds of
- *
- * @return A tuple (min,max) with the minimum and maximum values respectively wrapped in PixelValue.
- */
-inline std::tuple<PixelValue, PixelValue> get_min_max(DataType dt)
-{
- PixelValue min{};
- PixelValue max{};
- switch(dt)
- {
- case DataType::U8:
- case DataType::QASYMM8:
- {
- min = PixelValue(static_cast<int32_t>(std::numeric_limits<uint8_t>::lowest()));
- max = PixelValue(static_cast<int32_t>(std::numeric_limits<uint8_t>::max()));
- break;
- }
- case DataType::S8:
- case DataType::QSYMM8:
- case DataType::QASYMM8_SIGNED:
- case DataType::QSYMM8_PER_CHANNEL:
- {
- min = PixelValue(static_cast<int32_t>(std::numeric_limits<int8_t>::lowest()));
- max = PixelValue(static_cast<int32_t>(std::numeric_limits<int8_t>::max()));
- break;
- }
- case DataType::U16:
- case DataType::QASYMM16:
- {
- min = PixelValue(static_cast<int32_t>(std::numeric_limits<uint16_t>::lowest()));
- max = PixelValue(static_cast<int32_t>(std::numeric_limits<uint16_t>::max()));
- break;
- }
- case DataType::S16:
- case DataType::QSYMM16:
- {
- min = PixelValue(static_cast<int32_t>(std::numeric_limits<int16_t>::lowest()));
- max = PixelValue(static_cast<int32_t>(std::numeric_limits<int16_t>::max()));
- break;
- }
- case DataType::U32:
- {
- min = PixelValue(std::numeric_limits<uint32_t>::lowest());
- max = PixelValue(std::numeric_limits<uint32_t>::max());
- break;
- }
- case DataType::S32:
- {
- min = PixelValue(std::numeric_limits<int32_t>::lowest());
- max = PixelValue(std::numeric_limits<int32_t>::max());
- break;
- }
- case DataType::BFLOAT16:
- {
- min = PixelValue(bfloat16::lowest());
- max = PixelValue(bfloat16::max());
- break;
- }
- case DataType::F16:
- {
- min = PixelValue(std::numeric_limits<half>::lowest());
- max = PixelValue(std::numeric_limits<half>::max());
- break;
- }
- case DataType::F32:
- {
- min = PixelValue(std::numeric_limits<float>::lowest());
- max = PixelValue(std::numeric_limits<float>::max());
- break;
- }
- default:
- ARM_COMPUTE_ERROR("Undefined data type!");
- }
- return std::make_tuple(min, max);
-}
-
-/** Return true if the given format has horizontal subsampling.
- *
- * @param[in] format Format to determine subsampling.
- *
- * @return True if the format can be subsampled horizontaly.
- */
-inline bool has_format_horizontal_subsampling(Format format)
-{
- return (format == Format::YUYV422 || format == Format::UYVY422 || format == Format::NV12 || format == Format::NV21 || format == Format::IYUV || format == Format::UV88) ? true : false;
-}
-
-/** Return true if the given format has vertical subsampling.
- *
- * @param[in] format Format to determine subsampling.
- *
- * @return True if the format can be subsampled vertically.
- */
-inline bool has_format_vertical_subsampling(Format format)
-{
- return (format == Format::NV12 || format == Format::NV21 || format == Format::IYUV || format == Format::UV88) ? true : false;
-}
-
-/** Separate a 2D convolution into two 1D convolutions
- *
- * @param[in] conv 2D convolution
- * @param[out] conv_col 1D vertical convolution
- * @param[out] conv_row 1D horizontal convolution
- * @param[in] size Size of the 2D convolution
- *
- * @return true if the separation was successful
- */
-inline bool separate_matrix(const int16_t *conv, int16_t *conv_col, int16_t *conv_row, uint8_t size)
-{
- int32_t min_col = -1;
- int16_t min_col_val = -1;
-
- for(int32_t i = 0; i < size; ++i)
- {
- if(conv[i] != 0 && (min_col < 0 || abs(min_col_val) > abs(conv[i])))
- {
- min_col = i;
- min_col_val = conv[i];
- }
- }
-
- if(min_col < 0)
- {
- return false;
- }
-
- for(uint32_t j = 0; j < size; ++j)
- {
- conv_col[j] = conv[min_col + j * size];
- }
-
- for(uint32_t i = 0; i < size; i++)
- {
- if(static_cast<int>(i) == min_col)
- {
- conv_row[i] = 1;
- }
- else
- {
- int16_t coeff = conv[i] / conv[min_col];
-
- for(uint32_t j = 1; j < size; ++j)
- {
- if(conv[i + j * size] != (conv_col[j] * coeff))
- {
- return false;
- }
- }
-
- conv_row[i] = coeff;
- }
- }
-
- return true;
-}
-
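A hypothetical sketch: the 3x3 Gaussian kernel is separable, so the call succeeds and fills the column/row vectors.

    const int16_t gauss[9] = { 1, 2, 1,
                               2, 4, 2,
                               1, 2, 1 };
    int16_t col[3]{};
    int16_t row[3]{};
    const bool ok = separate_matrix(gauss, col, row, 3); // ok == true; col = {1,2,1}, row = {1,2,1}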
-/** Calculate the scale of the given square matrix
- *
- * The scale is the absolute value of the sum of all the coefficients in the matrix.
- *
- * @note If the coefficients add up to 0 then the scale is set to 1.
- *
- * @param[in] matrix Matrix coefficients
- * @param[in] matrix_size Number of elements per side of the square matrix. (Number of coefficients = matrix_size * matrix_size).
- *
- * @return The absolute value of the sum of the coefficients if they don't add up to 0, otherwise 1.
- */
-inline uint32_t calculate_matrix_scale(const int16_t *matrix, unsigned int matrix_size)
-{
- const size_t size = matrix_size * matrix_size;
-
- return std::max(1, std::abs(std::accumulate(matrix, matrix + size, 0)));
-}
-
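Continuing the hypothetical sketch above with the same gauss kernel: its coefficients sum to 16, whereas a zero-sum kernel (e.g. a Laplacian) would yield 1.

    const uint32_t scale = calculate_matrix_scale(gauss, 3); // |1+2+1+2+4+2+1+2+1| = 16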
-/** Adjust tensor shape size if width or height are odd for a given multi-planar format. No modification is done for other formats.
- *
- * @note A few links discussing the odd-size issue, all sharing the same solution:
- * <a href="https://android.googlesource.com/platform/frameworks/base/+/refs/heads/master/graphics/java/android/graphics/YuvImage.java">Android Source</a>
- * <a href="https://groups.google.com/a/webmproject.org/forum/#!topic/webm-discuss/LaCKpqiDTXM">WebM</a>
- * <a href="https://bugs.chromium.org/p/libyuv/issues/detail?id=198&amp;can=1&amp;q=odd%20width">libYUV</a>
- * <a href="https://sourceforge.net/p/raw-yuvplayer/bugs/1/">YUVPlayer</a> *
- *
- * @param[in, out] shape Tensor shape of 2D size
- * @param[in] format Format of the tensor
- *
- * @return The adjusted tensor shape.
- */
-inline TensorShape adjust_odd_shape(const TensorShape &shape, Format format)
-{
- TensorShape output{ shape };
-
- // Force width to be even for formats which require subsampling of the U and V channels
- if(has_format_horizontal_subsampling(format))
- {
- output.set(0, output.x() & ~1U);
- }
-
- // Force height to be even for formats which require subsampling of the U and V channels
- if(has_format_vertical_subsampling(format))
- {
- output.set(1, output.y() & ~1U);
- }
-
- return output;
-}
-
-/** Calculate subsampled shape for a given format and channel
- *
- * @param[in] shape Shape of the tensor to calculate the extracted channel.
- * @param[in] format Format of the tensor.
- * @param[in] channel Channel to create tensor shape to be extracted.
- *
- * @return The subsampled tensor shape.
- */
-inline TensorShape calculate_subsampled_shape(const TensorShape &shape, Format format, Channel channel = Channel::UNKNOWN)
-{
- TensorShape output{ shape };
-
- // Subsample shape only for U or V channel
- if(Channel::U == channel || Channel::V == channel || Channel::UNKNOWN == channel)
- {
- // Subsample width for the tensor shape when channel is U or V
- if(has_format_horizontal_subsampling(format))
- {
- output.set(0, output.x() / 2U);
- }
-
- // Subsample height for the tensor shape when channel is U or V
- if(has_format_vertical_subsampling(format))
- {
- output.set(1, output.y() / 2U);
- }
- }
-
- return output;
-}
-
-/** Calculate the accuracy required by the horizontal and vertical convolution computations
- *
- * @param[in] conv_col Pointer to the vertical vector of the separated convolution filter
- * @param[in] conv_row Pointer to the horizontal vector of the convolution filter
- * @param[in] size Number of elements per vector of the separated matrix
- *
- * @return The return type is a pair. The first element of the pair is the biggest data type needed for the first stage. The second
- * element of the pair is the biggest data type needed for the second stage.
- */
-inline std::pair<DataType, DataType> data_type_for_convolution(const int16_t *conv_col, const int16_t *conv_row, size_t size)
-{
- DataType first_stage = DataType::UNKNOWN;
- DataType second_stage = DataType::UNKNOWN;
-
- auto gez = [](const int16_t &v)
- {
- return v >= 0;
- };
-
- auto accu_neg = [](const int &first, const int &second)
- {
- return first + (second < 0 ? second : 0);
- };
-
- auto accu_pos = [](const int &first, const int &second)
- {
- return first + (second > 0 ? second : 0);
- };
-
- const bool only_positive_coefficients = std::all_of(conv_row, conv_row + size, gez) && std::all_of(conv_col, conv_col + size, gez);
-
- if(only_positive_coefficients)
- {
- const int max_row_value = std::accumulate(conv_row, conv_row + size, 0) * UINT8_MAX;
- const int max_value = std::accumulate(conv_col, conv_col + size, 0) * max_row_value;
-
- first_stage = (max_row_value <= UINT16_MAX) ? DataType::U16 : DataType::S32;
-
- second_stage = (max_value <= UINT16_MAX) ? DataType::U16 : DataType::S32;
- }
- else
- {
- const int min_row_value = std::accumulate(conv_row, conv_row + size, 0, accu_neg) * UINT8_MAX;
- const int max_row_value = std::accumulate(conv_row, conv_row + size, 0, accu_pos) * UINT8_MAX;
- const int neg_coeffs_sum = std::accumulate(conv_col, conv_col + size, 0, accu_neg);
- const int pos_coeffs_sum = std::accumulate(conv_col, conv_col + size, 0, accu_pos);
- const int min_value = neg_coeffs_sum * max_row_value + pos_coeffs_sum * min_row_value;
- const int max_value = neg_coeffs_sum * min_row_value + pos_coeffs_sum * max_row_value;
-
- first_stage = ((INT16_MIN <= min_row_value) && (max_row_value <= INT16_MAX)) ? DataType::S16 : DataType::S32;
-
- second_stage = ((INT16_MIN <= min_value) && (max_value <= INT16_MAX)) ? DataType::S16 : DataType::S32;
- }
-
- return std::make_pair(first_stage, second_stage);
-}
-
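A hypothetical sketch: a separated 3x3 box filter has only positive coefficients, so both stages fit in U16 for U8 input.

    const int16_t box_col[3] = { 1, 1, 1 };
    const int16_t box_row[3] = { 1, 1, 1 };
    const auto stages = data_type_for_convolution(box_col, box_row, 3);
    // First stage: 3 * 255 = 765 <= UINT16_MAX -> U16; second stage: 3 * 765 = 2295 -> U16.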
-/** Calculate the accuracy required by the square convolution calculation.
- *
- * @param[in] conv Pointer to the square convolution matrix
- * @param[in] size The total size of the convolution matrix
- *
- * @return The biggest data type needed to perform the convolution
- */
-inline DataType data_type_for_convolution_matrix(const int16_t *conv, size_t size)
-{
- auto gez = [](const int16_t v)
- {
- return v >= 0;
- };
-
- const bool only_positive_coefficients = std::all_of(conv, conv + size, gez);
-
- if(only_positive_coefficients)
- {
- const int max_conv_value = std::accumulate(conv, conv + size, 0) * UINT8_MAX;
- if(max_conv_value <= UINT16_MAX)
- {
- return DataType::U16;
- }
- else
- {
- return DataType::S32;
- }
- }
- else
- {
- const int min_value = std::accumulate(conv, conv + size, 0, [](int a, int b)
- {
- return b < 0 ? a + b : a;
- })
- * UINT8_MAX;
-
- const int max_value = std::accumulate(conv, conv + size, 0, [](int a, int b)
- {
- return b > 0 ? a + b : a;
- })
- * UINT8_MAX;
-
- if((INT16_MIN <= min_value) && (INT16_MAX >= max_value))
- {
- return DataType::S16;
- }
- else
- {
- return DataType::S32;
- }
- }
-}
-
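A hypothetical sketch: a 3x3 Laplacian has mixed signs, so the min/max bounds are checked against the signed 16-bit range.

    const int16_t laplacian[9] = { 0,  1, 0,
                                   1, -4, 1,
                                   0,  1, 0 };
    const DataType dt = data_type_for_convolution_matrix(laplacian, 9);
    // min = -4 * 255 = -1020 and max = 4 * 255 = 1020 both fit in [INT16_MIN, INT16_MAX] -> DataType::S16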
/** Permutes the given dimensions according the permutation vector
*
* @param[in,out] dimensions Dimensions to be permuted.
@@ -911,7 +69,7 @@ template <typename T>
inline void permute_strides(Dimensions<T> &dimensions, const PermutationVector &perm)
{
const auto old_dim = utility::make_array<Dimensions<T>::num_max_dimensions>(dimensions.begin(), dimensions.end());
- for(unsigned int i = 0; i < perm.num_dimensions(); ++i)
+ for (unsigned int i = 0; i < perm.num_dimensions(); ++i)
{
T dimension_val = old_dim[i];
dimensions.set(perm[i], dimension_val);
@@ -929,7 +87,11 @@ inline void permute_strides(Dimensions<T> &dimensions, const PermutationVector &
*
* @return PadStrideInfo for SAME padding
*/
-PadStrideInfo calculate_same_pad(TensorShape input_shape, TensorShape weights_shape, PadStrideInfo conv_info, DataLayout data_layout = DataLayout::NCHW, const Size2D &dilation = Size2D(1u, 1u),
+PadStrideInfo calculate_same_pad(TensorShape input_shape,
+ TensorShape weights_shape,
+ PadStrideInfo conv_info,
+ DataLayout data_layout = DataLayout::NCHW,
+ const Size2D &dilation = Size2D(1u, 1u),
const DimensionRoundingType &rounding_type = DimensionRoundingType::FLOOR);
/** Returns expected width and height of the deconvolution's output tensor.
@@ -942,8 +104,10 @@ PadStrideInfo calculate_same_pad(TensorShape input_shape, TensorShape weights_sh
*
* @return A pair with the new width in the first position and the new height in the second.
*/
-std::pair<unsigned int, unsigned int> deconvolution_output_dimensions(unsigned int in_width, unsigned int in_height,
- unsigned int kernel_width, unsigned int kernel_height,
+std::pair<unsigned int, unsigned int> deconvolution_output_dimensions(unsigned int in_width,
+ unsigned int in_height,
+ unsigned int kernel_width,
+ unsigned int kernel_height,
const PadStrideInfo &pad_stride_info);
/** Returns expected width and height of output scaled tensor depending on dimensions rounding mode.
@@ -957,11 +121,47 @@ std::pair<unsigned int, unsigned int> deconvolution_output_dimensions(unsigned i
*
* @return A pair with the new width in the first position and the new height in the second.
*/
-std::pair<unsigned int, unsigned int> scaled_dimensions(int width, int height,
- int kernel_width, int kernel_height,
+std::pair<unsigned int, unsigned int> scaled_dimensions(int width,
+ int height,
+ int kernel_width,
+ int kernel_height,
const PadStrideInfo &pad_stride_info,
const Size2D &dilation = Size2D(1U, 1U));
+/** Returns calculated width and height of output scaled tensor depending on dimensions rounding mode.
+ *
+ * @param[in] width Width of input tensor (Number of columns)
+ * @param[in] height Height of input tensor (Number of rows)
+ * @param[in] kernel_width Kernel width.
+ * @param[in] kernel_height Kernel height.
+ * @param[in] pad_stride_info Pad and stride information.
+ *
+ * @return A pair with the new width in the first position and the new height in the second; returned values can be < 1
+ */
+std::pair<int, int> scaled_dimensions_signed(
+ int width, int height, int kernel_width, int kernel_height, const PadStrideInfo &pad_stride_info);
+
+/** Returns calculated width, height and depth of output scaled tensor depending on dimensions rounding mode.
+ *
+ * @param[in] width Width of input tensor
+ * @param[in] height Height of input tensor
+ * @param[in] depth Depth of input tensor
+ * @param[in] kernel_width Kernel width.
+ * @param[in] kernel_height Kernel height.
+ * @param[in] kernel_depth Kernel depth.
+ * @param[in] pool3d_info Pad and stride and round information for 3d pooling
+ *
+ * @return A tuple with the new width in the first position, the new height in the second, and the new depth in the third.
+ * Returned values can be < 1
+ */
+std::tuple<int, int, int> scaled_3d_dimensions_signed(int width,
+ int height,
+ int depth,
+ int kernel_width,
+ int kernel_height,
+ int kernel_depth,
+ const Pooling3dLayerInfo &pool3d_info);
+
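A hypothetical call sketch, assuming the usual PadStrideInfo(stride_x, stride_y, pad_x, pad_y) constructor: a 7x7 input with a 3x3 kernel at stride 2 and no padding.

    const auto out = scaled_dimensions_signed(7, 7, 3, 3, PadStrideInfo(2, 2, 0, 0));
    // With FLOOR rounding: (7 - 3) / 2 + 1 = 3 in each dimension; a value < 1 would signal an empty output.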
/** Check if the given reduction operation should be handled in a serial way.
*
* @param[in] op Reduction operation to perform
@@ -981,16 +181,6 @@ bool needs_serialized_reduction(ReductionOperation op, DataType dt, unsigned int
*/
QuantizationInfo get_softmax_output_quantization_info(DataType input_type, bool is_log);
-/** Returns resize ratio between input and output with consideration of aligned corners
- *
- * @param[in] input_size The input size
- * @param[in] output_size the output size
- * @param[in] align_corners True to align corners of input and output. Defaults to false.
- *
- * @return The ratio between input and output (i.e., the input size divided by the output size)
- */
-float calculate_resize_ratio(size_t input_size, size_t output_size, bool align_corners = false);
-
/** Returns a pair of minimum and maximum values for a quantized activation
*
* @param[in] act_info The information for activation
@@ -999,15 +189,9 @@ float calculate_resize_ratio(size_t input_size, size_t output_size, bool align_c
*
* @return The pair with minimum and maximum values
*/
-std::pair<int32_t, int32_t> get_quantized_activation_min_max(ActivationLayerInfo act_info, DataType data_type, UniformQuantizationInfo oq_info);
-
-/** Convert a tensor format into a string.
- *
- * @param[in] format @ref Format to be translated to string.
- *
- * @return The string describing the format.
- */
-const std::string &string_from_format(Format format);
+std::pair<int32_t, int32_t> get_quantized_activation_min_max(const ActivationLayerInfo &act_info,
+ DataType data_type,
+ UniformQuantizationInfo oq_info);
/** Convert a channel identity into a string.
*
@@ -1016,48 +200,7 @@ const std::string &string_from_format(Format format);
* @return The string describing the channel.
*/
const std::string &string_from_channel(Channel channel);
-/** Convert a data layout identity into a string.
- *
- * @param[in] dl @ref DataLayout to be translated to string.
- *
- * @return The string describing the data layout.
- */
-const std::string &string_from_data_layout(DataLayout dl);
-/** Convert a data type identity into a string.
- *
- * @param[in] dt @ref DataType to be translated to string.
- *
- * @return The string describing the data type.
- */
-const std::string &string_from_data_type(DataType dt);
-/** Convert a matrix pattern into a string.
- *
- * @param[in] pattern @ref MatrixPattern to be translated to string.
- *
- * @return The string describing the matrix pattern.
- */
-const std::string &string_from_matrix_pattern(MatrixPattern pattern);
-/** Translates a given activation function to a string.
- *
- * @param[in] act @ref ActivationLayerInfo::ActivationFunction to be translated to string.
- *
- * @return The string describing the activation function.
- */
-const std::string &string_from_activation_func(ActivationLayerInfo::ActivationFunction act);
-/** Translates a given non linear function to a string.
- *
- * @param[in] function @ref NonLinearFilterFunction to be translated to string.
- *
- * @return The string describing the non linear function.
- */
-const std::string &string_from_non_linear_filter_function(NonLinearFilterFunction function);
-/** Translates a given interpolation policy to a string.
- *
- * @param[in] policy @ref InterpolationPolicy to be translated to string.
- *
- * @return The string describing the interpolation policy.
- */
-const std::string &string_from_interpolation_policy(InterpolationPolicy policy);
+
/** Translates a given border mode policy to a string.
*
* @param[in] border_mode @ref BorderMode to be translated to string.
@@ -1079,162 +222,67 @@ const std::string &string_from_norm_type(NormType type);
* @return The string describing the pooling type.
*/
const std::string &string_from_pooling_type(PoolingType type);
-/** Translates a given GEMMLowp output stage to a string.
- *
- * @param[in] output_stage @ref GEMMLowpOutputStageInfo to be translated to string.
- *
- * @return The string describing the GEMMLowp output stage
- */
-const std::string &string_from_gemmlowp_output_stage(GEMMLowpOutputStageType output_stage);
-/** Convert a PixelValue to a string, represented through the specific data type
- *
- * @param[in] value The PixelValue to convert
- * @param[in] data_type The type to be used to convert the @p value
+/** Check if the pool region is entirely outside the input tensor
*
- * @return String representation of the PixelValue through the given data type.
- */
-std::string string_from_pixel_value(const PixelValue &value, const DataType data_type);
-/** Lower a given string.
+ * @param[in] info @ref PoolingLayerInfo to be checked.
*
- * @param[in] val Given string to lower.
- *
- * @return The lowered string
+ * @return True if the pool region is entirely outside the input tensor, False otherwise.
*/
-std::string lower_string(const std::string &val);
-
-/** Check if a given data type is of floating point type
+bool is_pool_region_entirely_outside_input(const PoolingLayerInfo &info);
+/** Check if the 3d pool region is entirely outside the input tensor
*
- * @param[in] dt Input data type.
+ * @param[in] info @ref Pooling3dLayerInfo to be checked.
*
- * @return True if data type is of floating point type, else false.
+ * @return True if the pool region is entirely outside the input tensor, False otherwise.
*/
-inline bool is_data_type_float(DataType dt)
-{
- switch(dt)
- {
- case DataType::F16:
- case DataType::F32:
- return true;
- default:
- return false;
- }
-}
-
-/** Check if a given data type is of quantized type
- *
- * @note Quantized is considered a super-set of fixed-point and asymmetric data types.
+bool is_pool_3d_region_entirely_outside_input(const Pooling3dLayerInfo &info);
+/** Check if the 3D padding is symmetric, i.e. the padding on each pair of opposite sides is equal (left=right, top=bottom and front=back)
*
- * @param[in] dt Input data type.
+ * @param[in] info The @ref Padding3D object to check for symmetry
*
- * @return True if data type is of quantized type, else false.
+ * @return True if padding is symmetric
*/
-inline bool is_data_type_quantized(DataType dt)
+inline bool is_symmetric(const Padding3D &info)
{
- switch(dt)
- {
- case DataType::QSYMM8:
- case DataType::QASYMM8:
- case DataType::QASYMM8_SIGNED:
- case DataType::QSYMM8_PER_CHANNEL:
- case DataType::QSYMM16:
- case DataType::QASYMM16:
- return true;
- default:
- return false;
- }
+ return ((info.left == info.right) && (info.top == info.bottom) && (info.front == info.back));
}
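A hypothetical sketch, assuming Padding3D exposes the public left/right/top/bottom/front/back fields the check reads:

    Padding3D pad{};
    pad.left = pad.right = 1; // equal padding on opposite sides of the width
    pad.top = pad.bottom = 2; // and of the height
    pad.front = pad.back = 0; // and of the depth
    const bool sym = is_symmetric(pad); // true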
-
-/** Check if a given data type is of asymmetric quantized type
+/** Translates a given GEMMLowp output stage to a string.
*
- * @param[in] dt Input data type.
+ * @param[in] output_stage @ref GEMMLowpOutputStageInfo to be translated to string.
*
- * @return True if data type is of asymmetric quantized type, else false.
+ * @return The string describing the GEMMLowp output stage
*/
-inline bool is_data_type_quantized_asymmetric(DataType dt)
-{
- switch(dt)
- {
- case DataType::QASYMM8:
- case DataType::QASYMM8_SIGNED:
- case DataType::QASYMM16:
- return true;
- default:
- return false;
- }
-}
-
-/** Check if a given data type is of asymmetric quantized signed type
+const std::string &string_from_gemmlowp_output_stage(GEMMLowpOutputStageType output_stage);
+/** Convert a PixelValue to a string, represented through the specific data type
*
- * @param[in] dt Input data type.
+ * @param[in] value The PixelValue to convert
+ * @param[in] data_type The type to be used to convert the @p value
*
- * @return True if data type is of asymmetric quantized signed type, else false.
+ * @return String representation of the PixelValue through the given data type.
*/
-inline bool is_data_type_quantized_asymmetric_signed(DataType dt)
-{
- switch(dt)
- {
- case DataType::QASYMM8_SIGNED:
- return true;
- default:
- return false;
- }
-}
+std::string string_from_pixel_value(const PixelValue &value, const DataType data_type);
-/** Check if a given data type is of symmetric quantized type
+/** Stores padding information before configuring a kernel
*
- * @param[in] dt Input data type.
+ * @param[in] infos list of tensor infos to store the padding info for
*
- * @return True if data type is of symmetric quantized type, else false.
+ * @return An unordered map where each tensor info pointer is paired with its original padding info
*/
-inline bool is_data_type_quantized_symmetric(DataType dt)
-{
- switch(dt)
- {
- case DataType::QSYMM8:
- case DataType::QSYMM8_PER_CHANNEL:
- case DataType::QSYMM16:
- return true;
- default:
- return false;
- }
-}
-
-/** Check if a given data type is of per channel type
+std::unordered_map<const ITensorInfo *, PaddingSize> get_padding_info(std::initializer_list<const ITensorInfo *> infos);
+/** Stores padding information before configuring a kernel
*
- * @param[in] dt Input data type.
+ * @param[in] tensors list of tensors to store the padding info for
*
- * @return True if data type is of per channel type, else false.
+ * @return An unordered map where each tensor info pointer is paired with its original padding info
*/
-inline bool is_data_type_quantized_per_channel(DataType dt)
-{
- switch(dt)
- {
- case DataType::QSYMM8_PER_CHANNEL:
- return true;
- default:
- return false;
- }
-}
-
-/** Create a string with the float in full precision.
+std::unordered_map<const ITensorInfo *, PaddingSize> get_padding_info(std::initializer_list<const ITensor *> tensors);
+/** Check if the previously stored padding info has changed after configuring a kernel
*
- * @param val Floating point value
+ * @param[in] padding_map an unordered map where each tensor info pointer is paired with its original padding info
*
- * @return String with the floating point value.
+ * @return true if any of the tensor infos has changed its padding
*/
-inline std::string float_to_string_with_full_precision(float val)
-{
- std::stringstream ss;
- ss.precision(std::numeric_limits<float>::max_digits10);
- ss << val;
-
- if(val != static_cast<int>(val))
- {
- ss << "f";
- }
-
- return ss.str();
-}
+bool has_padding_changed(const std::unordered_map<const ITensorInfo *, PaddingSize> &padding_map);
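The intended configure-time pattern, as a hypothetical sketch (src and dst stand for ITensorInfo pointers of the kernel being configured):

    auto padding_map = get_padding_info({ src, dst });
    // ... configure the kernel; a padding-free kernel must not grow any tensor padding ...
    ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_map));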
/** Returns the number of elements required to go from start to end with the wanted step
*
@@ -1250,67 +298,6 @@ inline size_t num_of_elements_in_range(const float start, const float end, const
return size_t(std::ceil((end - start) / step));
}
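A hypothetical sketch: the half-open range [0, 10) stepped by 2.5 holds ceil(10 / 2.5) = 4 elements.

    const size_t n = num_of_elements_in_range(0.f, 10.f, 2.5f); // 4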
-/** Returns true if the value can be represented by the given data type
- *
- * @param[in] val value to be checked
- * @param[in] dt data type that is checked
- * @param[in] qinfo (Optional) quantization info if the data type is QASYMM8
- *
- * @return true if the data type can hold the value.
- */
-template <typename T>
-bool check_value_range(T val, DataType dt, QuantizationInfo qinfo = QuantizationInfo())
-{
- switch(dt)
- {
- case DataType::U8:
- {
- const auto val_u8 = static_cast<uint8_t>(val);
- return ((val_u8 == val) && val_u8 >= std::numeric_limits<uint8_t>::lowest() && val_u8 <= std::numeric_limits<uint8_t>::max());
- }
- case DataType::QASYMM8:
- {
- double min = static_cast<double>(dequantize_qasymm8(0, qinfo));
- double max = static_cast<double>(dequantize_qasymm8(std::numeric_limits<uint8_t>::max(), qinfo));
- return ((double)val >= min && (double)val <= max);
- }
- case DataType::S8:
- {
- const auto val_s8 = static_cast<int8_t>(val);
- return ((val_s8 == val) && val_s8 >= std::numeric_limits<int8_t>::lowest() && val_s8 <= std::numeric_limits<int8_t>::max());
- }
- case DataType::U16:
- {
- const auto val_u16 = static_cast<uint16_t>(val);
- return ((val_u16 == val) && val_u16 >= std::numeric_limits<uint16_t>::lowest() && val_u16 <= std::numeric_limits<uint16_t>::max());
- }
- case DataType::S16:
- {
- const auto val_s16 = static_cast<int16_t>(val);
- return ((val_s16 == val) && val_s16 >= std::numeric_limits<int16_t>::lowest() && val_s16 <= std::numeric_limits<int16_t>::max());
- }
- case DataType::U32:
- {
- const auto val_u32 = static_cast<uint32_t>(val);
- return ((val_u32 == val) && val_u32 >= std::numeric_limits<uint32_t>::lowest() && val_u32 <= std::numeric_limits<uint32_t>::max());
- }
- case DataType::S32:
- {
- const auto val_s32 = static_cast<int32_t>(val);
- return ((val_s32 == val) && val_s32 >= std::numeric_limits<int32_t>::lowest() && val_s32 <= std::numeric_limits<int32_t>::max());
- }
- case DataType::BFLOAT16:
- return (val >= bfloat16::lowest() && val <= bfloat16::max());
- case DataType::F16:
- return (val >= std::numeric_limits<half>::lowest() && val <= std::numeric_limits<half>::max());
- case DataType::F32:
- return (val >= std::numeric_limits<float>::lowest() && val <= std::numeric_limits<float>::max());
- default:
- ARM_COMPUTE_ERROR("Data type not supported");
- return false;
- }
-}
-
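A hypothetical sketch: 300 is truncated by the cast to uint8_t and rejected, while 250 fits.

    check_value_range(300, DataType::U8); // false: static_cast<uint8_t>(300) == 44 != 300
    check_value_range(250, DataType::U8); // true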
#ifdef ARM_COMPUTE_ASSERTS_ENABLED
/** Print consecutive elements to an output stream.
*
@@ -1321,26 +308,27 @@ bool check_value_range(T val, DataType dt, QuantizationInfo qinfo = Quantization
 * @param[in] element_delim (Optional) Delimiter among the consecutive elements. Defaults to a space delimiter
*/
template <typename T>
-void print_consecutive_elements_impl(std::ostream &s, const T *ptr, unsigned int n, int stream_width = 0, const std::string &element_delim = " ")
+void print_consecutive_elements_impl(
+ std::ostream &s, const T *ptr, unsigned int n, int stream_width = 0, const std::string &element_delim = " ")
{
using print_type = typename std::conditional<std::is_floating_point<T>::value, T, int>::type;
std::ios stream_status(nullptr);
stream_status.copyfmt(s);
- for(unsigned int i = 0; i < n; ++i)
+ for (unsigned int i = 0; i < n; ++i)
{
// Set stream width as it is not a "sticky" stream manipulator
- if(stream_width != 0)
+ if (stream_width != 0)
{
s.width(stream_width);
}
- if(std::is_same<typename std::decay<T>::type, half>::value)
+ if (std::is_same<typename std::decay<T>::type, half>::value)
{
        // We use T instead of print_type here because std::is_floating_point<half> returns false and print_type would then become int.
s << std::right << static_cast<T>(ptr[i]) << element_delim;
}
- else if(std::is_same<typename std::decay<T>::type, bfloat16>::value)
+ else if (std::is_same<typename std::decay<T>::type, bfloat16>::value)
{
        // We use T instead of print_type here because std::is_floating_point<bfloat16> returns false and print_type would then become int.
s << std::right << float(ptr[i]) << element_delim;
@@ -1369,17 +357,17 @@ int max_consecutive_elements_display_width_impl(std::ostream &s, const T *ptr, u
using print_type = typename std::conditional<std::is_floating_point<T>::value, T, int>::type;
int max_width = -1;
- for(unsigned int i = 0; i < n; ++i)
+ for (unsigned int i = 0; i < n; ++i)
{
std::stringstream ss;
ss.copyfmt(s);
- if(std::is_same<typename std::decay<T>::type, half>::value)
+ if (std::is_same<typename std::decay<T>::type, half>::value)
{
        // We use T instead of print_type here because std::is_floating_point<half> returns false and print_type would then become int.
ss << static_cast<T>(ptr[i]);
}
- else if(std::is_same<typename std::decay<T>::type, bfloat16>::value)
+ else if (std::is_same<typename std::decay<T>::type, bfloat16>::value)
{
        // We use T instead of print_type here because std::is_floating_point<bfloat16> returns false and print_type would then become int.
ss << float(ptr[i]);
@@ -1403,7 +391,12 @@ int max_consecutive_elements_display_width_impl(std::ostream &s, const T *ptr, u
* @param[in] stream_width (Optional) Width of the stream. If set to 0 the element's width is used. Defaults to 0.
 * @param[in] element_delim (Optional) Delimiter among the consecutive elements. Defaults to a space delimiter
*/
-void print_consecutive_elements(std::ostream &s, DataType dt, const uint8_t *ptr, unsigned int n, int stream_width, const std::string &element_delim = " ");
+void print_consecutive_elements(std::ostream &s,
+ DataType dt,
+ const uint8_t *ptr,
+ unsigned int n,
+ int stream_width,
+ const std::string &element_delim = " ");
/** Identify the maximum width of n consecutive elements.
*
@@ -1416,5 +409,5 @@ void print_consecutive_elements(std::ostream &s, DataType dt, const uint8_t *ptr
*/
int max_consecutive_elements_display_width(std::ostream &s, DataType dt, const uint8_t *ptr, unsigned int n);
#endif /* ARM_COMPUTE_ASSERTS_ENABLED */
-}
+} // namespace arm_compute
#endif /*ARM_COMPUTE_UTILS_H */
diff --git a/arm_compute/core/Validate.h b/arm_compute/core/Validate.h
index bbea5e5575..5550560aff 100644
--- a/arm_compute/core/Validate.h
+++ b/arm_compute/core/Validate.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2016-2019 ARM Limited.
+ * Copyright (c) 2016-2021, 2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -25,12 +25,12 @@
#define ARM_COMPUTE_VALIDATE_H
#include "arm_compute/core/Error.h"
-#include "arm_compute/core/HOGInfo.h"
#include "arm_compute/core/IKernel.h"
-#include "arm_compute/core/IMultiHOG.h"
-#include "arm_compute/core/IMultiImage.h"
#include "arm_compute/core/ITensor.h"
-#include "arm_compute/core/MultiImageInfo.h"
+#include "arm_compute/core/QuantizationInfo.h"
+#include "arm_compute/core/utils/DataLayoutUtils.h"
+#include "arm_compute/core/utils/DataTypeUtils.h"
+#include "arm_compute/core/utils/FormatUtils.h"
#include "arm_compute/core/Window.h"
#include <algorithm>
@@ -50,9 +50,9 @@ namespace detail
template <typename T>
inline bool have_different_dimensions(const Dimensions<T> &dim1, const Dimensions<T> &dim2, unsigned int upper_dim)
{
- for(unsigned int i = upper_dim; i < arm_compute::Dimensions<T>::num_max_dimensions; ++i)
+ for (unsigned int i = upper_dim; i < arm_compute::Dimensions<T>::num_max_dimensions; ++i)
{
- if(dim1[i] != dim2[i])
+ if (dim1[i] != dim2[i])
{
return true;
}
@@ -80,7 +80,7 @@ public:
* @param[in] line Source code line. Used for error reporting.
*/
compare_dimension(const Dimensions<T> &dim, const char *function, const char *file, int line)
- : _dim{ dim }, _function{ function }, _file{ file }, _line{ line }
+ : _dim{dim}, _function{function}, _file{file}, _line{line}
{
}
@@ -111,7 +111,7 @@ inline arm_compute::Status for_each_error(F &&)
}
template <typename F, typename T, typename... Ts>
-inline arm_compute::Status for_each_error(F &&func, T &&arg, Ts &&... args)
+inline arm_compute::Status for_each_error(F &&func, T &&arg, Ts &&...args)
{
ARM_COMPUTE_RETURN_ON_ERROR(func(arg));
ARM_COMPUTE_RETURN_ON_ERROR(for_each_error(func, args...));
@@ -148,13 +148,11 @@ struct get_tensor_info_t<ITensorInfo *>
* @return Status
*/
template <typename... Ts>
-inline arm_compute::Status error_on_nullptr(const char *function, const char *file, const int line, Ts &&... pointers)
+inline arm_compute::Status error_on_nullptr(const char *function, const char *file, const int line, Ts &&...pointers)
{
- const std::array<const void *, sizeof...(Ts)> pointers_array{ { std::forward<Ts>(pointers)... } };
- bool has_nullptr = std::any_of(pointers_array.begin(), pointers_array.end(), [&](const void *ptr)
- {
- return (ptr == nullptr);
- });
+ const std::array<const void *, sizeof...(Ts)> pointers_array{{std::forward<Ts>(pointers)...}};
+ bool has_nullptr =
+ std::any_of(pointers_array.begin(), pointers_array.end(), [&](const void *ptr) { return (ptr == nullptr); });
ARM_COMPUTE_RETURN_ERROR_ON_LOC_MSG(has_nullptr, function, file, line, "Nullptr object!");
return arm_compute::Status{};
}
@@ -178,8 +176,8 @@ inline arm_compute::Status error_on_nullptr(const char *function, const char *fi
*
* @return Status
*/
-arm_compute::Status error_on_mismatching_windows(const char *function, const char *file, const int line,
- const Window &full, const Window &win);
+arm_compute::Status error_on_mismatching_windows(
+ const char *function, const char *file, const int line, const Window &full, const Window &win);
#define ARM_COMPUTE_ERROR_ON_MISMATCHING_WINDOWS(f, w) \
ARM_COMPUTE_ERROR_THROW_ON(::arm_compute::error_on_mismatching_windows(__func__, __FILE__, __LINE__, f, w))
#define ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_WINDOWS(f, w) \
@@ -200,8 +198,8 @@ arm_compute::Status error_on_mismatching_windows(const char *function, const cha
*
* @return Status
*/
-arm_compute::Status error_on_invalid_subwindow(const char *function, const char *file, const int line,
- const Window &full, const Window &sub);
+arm_compute::Status error_on_invalid_subwindow(
+ const char *function, const char *file, const int line, const Window &full, const Window &sub);
#define ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(f, s) \
ARM_COMPUTE_ERROR_THROW_ON(::arm_compute::error_on_invalid_subwindow(__func__, __FILE__, __LINE__, f, s))
#define ARM_COMPUTE_RETURN_ERROR_ON_INVALID_SUBWINDOW(f, s) \
@@ -220,12 +218,14 @@ arm_compute::Status error_on_invalid_subwindow(const char *function, const char
*
* @return Status
*/
-arm_compute::Status error_on_window_not_collapsable_at_dimension(const char *function, const char *file, const int line,
- const Window &full, const Window &window, const int dim);
+arm_compute::Status error_on_window_not_collapsable_at_dimension(
+ const char *function, const char *file, const int line, const Window &full, const Window &window, const int dim);
#define ARM_COMPUTE_ERROR_ON_WINDOW_NOT_COLLAPSABLE_AT_DIMENSION(f, w, d) \
- ARM_COMPUTE_ERROR_THROW_ON(::arm_compute::error_on_window_not_collapsable_at_dimension(__func__, __FILE__, __LINE__, f, w, d))
+ ARM_COMPUTE_ERROR_THROW_ON( \
+ ::arm_compute::error_on_window_not_collapsable_at_dimension(__func__, __FILE__, __LINE__, f, w, d))
#define ARM_COMPUTE_RETURN_ERROR_ON_WINDOW_NOT_COLLAPSABLE_AT_DIMENSION(f, w, d) \
- ARM_COMPUTE_RETURN_ON_ERROR(::arm_compute::error_on_window_not_collapsable_at_dimension(__func__, __FILE__, __LINE__, f, w, d))
+ ARM_COMPUTE_RETURN_ON_ERROR( \
+ ::arm_compute::error_on_window_not_collapsable_at_dimension(__func__, __FILE__, __LINE__, f, w, d))
/** Return an error if the passed coordinates have too many dimensions.
*
@@ -239,8 +239,8 @@ arm_compute::Status error_on_window_not_collapsable_at_dimension(const char *fun
*
* @return Status
*/
-arm_compute::Status error_on_coordinates_dimensions_gte(const char *function, const char *file, const int line,
- const Coordinates &pos, unsigned int max_dim);
+arm_compute::Status error_on_coordinates_dimensions_gte(
+ const char *function, const char *file, const int line, const Coordinates &pos, unsigned int max_dim);
#define ARM_COMPUTE_ERROR_ON_COORDINATES_DIMENSIONS_GTE(p, md) \
ARM_COMPUTE_ERROR_THROW_ON(::arm_compute::error_on_coordinates_dimensions_gte(__func__, __FILE__, __LINE__, p, md))
#define ARM_COMPUTE_RETURN_ERROR_ON_COORDINATES_DIMENSIONS_GTE(p, md) \
@@ -258,8 +258,8 @@ arm_compute::Status error_on_coordinates_dimensions_gte(const char *function, co
*
* @return Status
*/
-arm_compute::Status error_on_window_dimensions_gte(const char *function, const char *file, const int line,
- const Window &win, unsigned int max_dim);
+arm_compute::Status error_on_window_dimensions_gte(
+ const char *function, const char *file, const int line, const Window &win, unsigned int max_dim);
#define ARM_COMPUTE_ERROR_ON_WINDOW_DIMENSIONS_GTE(w, md) \
ARM_COMPUTE_ERROR_THROW_ON(::arm_compute::error_on_window_dimensions_gte(__func__, __FILE__, __LINE__, w, md))
#define ARM_COMPUTE_RETURN_ERROR_ON_WINDOW_DIMENSIONS_GTE(w, md) \
@@ -277,16 +277,82 @@ arm_compute::Status error_on_window_dimensions_gte(const char *function, const c
* @return Status
*/
template <typename T, typename... Ts>
-arm_compute::Status error_on_mismatching_dimensions(const char *function, const char *file, int line,
- const Dimensions<T> &dim1, const Dimensions<T> &dim2, Ts &&... dims)
+arm_compute::Status error_on_mismatching_dimensions(const char *function,
+ const char *file,
+ int line,
+ const Dimensions<T> &dim1,
+ const Dimensions<T> &dim2,
+ Ts &&...dims)
{
- ARM_COMPUTE_RETURN_ON_ERROR(detail::for_each_error(detail::compare_dimension<T>(dim1, function, file, line), dim2, std::forward<Ts>(dims)...));
+ ARM_COMPUTE_RETURN_ON_ERROR(detail::for_each_error(detail::compare_dimension<T>(dim1, function, file, line), dim2,
+ std::forward<Ts>(dims)...));
return arm_compute::Status{};
}
#define ARM_COMPUTE_ERROR_ON_MISMATCHING_DIMENSIONS(...) \
- ARM_COMPUTE_ERROR_THROW_ON(::arm_compute::error_on_mismatching_dimensions(__func__, __FILE__, __LINE__, __VA_ARGS__))
+ ARM_COMPUTE_ERROR_THROW_ON( \
+ ::arm_compute::error_on_mismatching_dimensions(__func__, __FILE__, __LINE__, __VA_ARGS__))
#define ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(...) \
- ARM_COMPUTE_RETURN_ON_ERROR(::arm_compute::error_on_mismatching_dimensions(__func__, __FILE__, __LINE__, __VA_ARGS__))
+ ARM_COMPUTE_RETURN_ON_ERROR( \
+ ::arm_compute::error_on_mismatching_dimensions(__func__, __FILE__, __LINE__, __VA_ARGS__))
+
+/** Return true if the given format has horizontal subsampling.
+ *
+ * @param[in] format Format to determine subsampling.
+ *
+ * @return True if the format can be subsampled horizontally.
+ */
+inline bool has_format_horizontal_subsampling(Format format)
+{
+ return (format == Format::YUYV422 || format == Format::UYVY422 || format == Format::NV12 ||
+ format == Format::NV21 || format == Format::IYUV || format == Format::UV88)
+ ? true
+ : false;
+}
+
+/** Return true if the given format has vertical subsampling.
+ *
+ * @param[in] format Format to determine subsampling.
+ *
+ * @return True if the format can be subsampled vertically.
+ */
+inline bool has_format_vertical_subsampling(Format format)
+{
+ return (format == Format::NV12 || format == Format::NV21 || format == Format::IYUV || format == Format::UV88)
+ ? true
+ : false;
+}
+
+/** Adjust tensor shape size if width or height are odd for a given multi-planar format. No modification is done for other formats.
+ *
+ * @note A few links discussing the odd-size issue, all sharing the same solution:
+ * <a href="https://android.googlesource.com/platform/frameworks/base/+/refs/heads/master/graphics/java/android/graphics/YuvImage.java">Android Source</a>
+ * <a href="https://groups.google.com/a/webmproject.org/forum/#!topic/webm-discuss/LaCKpqiDTXM">WebM</a>
+ * <a href="https://bugs.chromium.org/p/libyuv/issues/detail?id=198&amp;can=1&amp;q=odd%20width">libYUV</a>
+ * <a href="https://sourceforge.net/p/raw-yuvplayer/bugs/1/">YUVPlayer</a> *
+ *
+ * @param[in, out] shape Tensor shape of 2D size
+ * @param[in] format Format of the tensor
+ *
+ * @return The adjusted tensor shape.
+ */
+inline TensorShape adjust_odd_shape(const TensorShape &shape, Format format)
+{
+ TensorShape output{shape};
+
+ // Force width to be even for formats which require subsampling of the U and V channels
+ if (has_format_horizontal_subsampling(format))
+ {
+ output.set(0, (output.x() + 1) & ~1U);
+ }
+
+ // Force height to be even for formats which require subsampling of the U and V channels
+ if (has_format_vertical_subsampling(format))
+ {
+ output.set(1, (output.y() + 1) & ~1U);
+ }
+
+ return output;
+}
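A hypothetical sketch: with the rounding introduced here, odd NV12 dimensions are rounded up to even instead of truncated.

    const TensorShape even = adjust_odd_shape(TensorShape(639U, 479U), Format::NV12);
    // even is 640x480; the variant removed from Utils.h above would have produced 638x478.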
/** Return an error if the passed tensor objects are not even.
*
@@ -300,18 +366,20 @@ arm_compute::Status error_on_mismatching_dimensions(const char *function, const
* @return Status
*/
template <typename... Ts>
-arm_compute::Status error_on_tensors_not_even(const char *function, const char *file, int line,
- const Format &format, const ITensor *tensor1, Ts... tensors)
+arm_compute::Status error_on_tensors_not_even(
+ const char *function, const char *file, int line, const Format &format, const ITensor *tensor1, Ts... tensors)
{
ARM_COMPUTE_RETURN_ERROR_ON_LOC(tensor1 == nullptr, function, file, line);
ARM_COMPUTE_RETURN_ON_ERROR(::arm_compute::error_on_nullptr(function, file, line, std::forward<Ts>(tensors)...));
- const std::array < const ITensor *, 1 + sizeof...(Ts) > tensors_info_array{ { tensor1, std::forward<Ts>(tensors)... } };
- ARM_COMPUTE_RETURN_ERROR_ON_LOC_MSG(std::any_of(tensors_info_array.cbegin(), tensors_info_array.cend(), [&](const ITensor * tensor)
- {
- const TensorShape correct_shape = adjust_odd_shape(tensor->info()->tensor_shape(), format);
- return detail::have_different_dimensions(tensor->info()->tensor_shape(), correct_shape, 2);
- }),
- function, file, line, "Tensor shape has odd dimensions");
+ const std::array<const ITensor *, 1 + sizeof...(Ts)> tensors_info_array{{tensor1, std::forward<Ts>(tensors)...}};
+ ARM_COMPUTE_RETURN_ERROR_ON_LOC_MSG(
+ std::any_of(tensors_info_array.cbegin(), tensors_info_array.cend(),
+ [&](const ITensor *tensor)
+ {
+ const TensorShape correct_shape = adjust_odd_shape(tensor->info()->tensor_shape(), format);
+ return detail::have_different_dimensions(tensor->info()->tensor_shape(), correct_shape, 2);
+ }),
+ function, file, line, "Tensor shape has odd dimensions");
return arm_compute::Status{};
}
@@ -320,6 +388,38 @@ arm_compute::Status error_on_tensors_not_even(const char *function, const char *
#define ARM_COMPUTE_RETURN_ERROR_ON_TENSORS_NOT_EVEN(...) \
ARM_COMPUTE_RETURN_ON_ERROR(::arm_compute::error_on_tensors_not_even(__func__, __FILE__, __LINE__, __VA_ARGS__))
+/** Calculate subsampled shape for a given format and channel
+ *
+ * @param[in] shape Shape of the tensor to calculate the extracted channel.
+ * @param[in] format Format of the tensor.
+ * @param[in] channel Channel to create tensor shape to be extracted.
+ *
+ * @return The subsampled tensor shape.
+ */
+inline TensorShape
+calculate_subsampled_shape(const TensorShape &shape, Format format, Channel channel = Channel::UNKNOWN)
+{
+ TensorShape output{shape};
+
+ // Subsample shape only for U or V channel
+ if (Channel::U == channel || Channel::V == channel || Channel::UNKNOWN == channel)
+ {
+ // Subsample width for the tensor shape when channel is U or V
+ if (has_format_horizontal_subsampling(format))
+ {
+ output.set(0, output.x() / 2U);
+ }
+
+ // Subsample height for the tensor shape when channel is U or V
+ if (has_format_vertical_subsampling(format))
+ {
+ output.set(1, output.y() / 2U);
+ }
+ }
+
+ return output;
+}
+
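A hypothetical sketch: the U plane of a 640x480 IYUV image is subsampled in both dimensions.

    const TensorShape uv = calculate_subsampled_shape(TensorShape(640U, 480U), Format::IYUV, Channel::U);
    // uv is 320x240; for Channel::Y the shape is returned unchanged.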
/** Return an error if the passed tensor objects are not sub-sampled.
*
* @param[in] function Function in which the error occurred.
@@ -333,25 +433,32 @@ arm_compute::Status error_on_tensors_not_even(const char *function, const char *
* @return Status
*/
template <typename... Ts>
-arm_compute::Status error_on_tensors_not_subsampled(const char *function, const char *file, int line,
- const Format &format, const TensorShape &shape, const ITensor *tensor1, Ts... tensors)
+arm_compute::Status error_on_tensors_not_subsampled(const char *function,
+ const char *file,
+ int line,
+ const Format &format,
+ const TensorShape &shape,
+ const ITensor *tensor1,
+ Ts... tensors)
{
ARM_COMPUTE_RETURN_ERROR_ON_LOC(tensor1 == nullptr, function, file, line);
ARM_COMPUTE_RETURN_ON_ERROR(::arm_compute::error_on_nullptr(function, file, line, std::forward<Ts>(tensors)...));
- const TensorShape sub2_shape = calculate_subsampled_shape(shape, format);
- const std::array < const ITensor *, 1 + sizeof...(Ts) > tensors_info_array{ { tensor1, std::forward<Ts>(tensors)... } };
- ARM_COMPUTE_RETURN_ERROR_ON_LOC_MSG(std::any_of(tensors_info_array.cbegin(), tensors_info_array.cend(), [&](const ITensor * tensor)
- {
- return detail::have_different_dimensions(tensor->info()->tensor_shape(), sub2_shape, 2);
- }),
- function, file, line, "Tensor shape has mismatch dimensions for sub-sampling");
+ const TensorShape sub2_shape = calculate_subsampled_shape(shape, format);
+ const std::array<const ITensor *, 1 + sizeof...(Ts)> tensors_info_array{{tensor1, std::forward<Ts>(tensors)...}};
+ ARM_COMPUTE_RETURN_ERROR_ON_LOC_MSG(
+ std::any_of(tensors_info_array.cbegin(), tensors_info_array.cend(),
+ [&](const ITensor *tensor)
+ { return detail::have_different_dimensions(tensor->info()->tensor_shape(), sub2_shape, 2); }),
+ function, file, line, "Tensor shape has mismatch dimensions for sub-sampling");
return arm_compute::Status{};
}
#define ARM_COMPUTE_ERROR_ON_TENSORS_NOT_SUBSAMPLED(...) \
- ARM_COMPUTE_ERROR_THROW_ON(::arm_compute::error_on_tensors_not_subsampled(__func__, __FILE__, __LINE__, __VA_ARGS__))
+ ARM_COMPUTE_ERROR_THROW_ON( \
+ ::arm_compute::error_on_tensors_not_subsampled(__func__, __FILE__, __LINE__, __VA_ARGS__))
#define ARM_COMPUTE_RETURN_ERROR_ON_TENSORS_NOT_SUBSAMPLED(...) \
- ARM_COMPUTE_RETURN_ON_ERROR(::arm_compute::error_on_tensors_not_subsampled(__func__, __FILE__, __LINE__, __VA_ARGS__))
+ ARM_COMPUTE_RETURN_ON_ERROR( \
+ ::arm_compute::error_on_tensors_not_subsampled(__func__, __FILE__, __LINE__, __VA_ARGS__))
/** Return an error if the passed two tensor infos have different shapes from the given dimension
*
@@ -365,10 +472,15 @@ arm_compute::Status error_on_tensors_not_subsampled(const char *function, const
* @return Status
*/
template <typename... Ts>
-inline arm_compute::Status error_on_mismatching_shapes(const char *function, const char *file, const int line,
- const ITensorInfo *tensor_info_1, const ITensorInfo *tensor_info_2, Ts... tensor_infos)
+inline arm_compute::Status error_on_mismatching_shapes(const char *function,
+ const char *file,
+ const int line,
+ const ITensorInfo *tensor_info_1,
+ const ITensorInfo *tensor_info_2,
+ Ts... tensor_infos)
{
- return error_on_mismatching_shapes(function, file, line, 0U, tensor_info_1, tensor_info_2, std::forward<Ts>(tensor_infos)...);
+ return error_on_mismatching_shapes(function, file, line, 0U, tensor_info_1, tensor_info_2,
+ std::forward<Ts>(tensor_infos)...);
}
/** Return an error if the passed two tensors have different shapes from the given dimension
*
@@ -382,8 +494,12 @@ inline arm_compute::Status error_on_mismatching_shapes(const char *function, con
* @return Status
*/
template <typename... Ts>
-inline arm_compute::Status error_on_mismatching_shapes(const char *function, const char *file, const int line,
- const ITensor *tensor_1, const ITensor *tensor_2, Ts... tensors)
+inline arm_compute::Status error_on_mismatching_shapes(const char *function,
+ const char *file,
+ const int line,
+ const ITensor *tensor_1,
+ const ITensor *tensor_2,
+ Ts... tensors)
{
return error_on_mismatching_shapes(function, file, line, 0U, tensor_1, tensor_2, std::forward<Ts>(tensors)...);
}
@@ -400,19 +516,28 @@ inline arm_compute::Status error_on_mismatching_shapes(const char *function, con
* @return Status
*/
template <typename... Ts>
-inline arm_compute::Status error_on_mismatching_shapes(const char *function, const char *file, const int line,
- unsigned int upper_dim, const ITensorInfo *tensor_info_1, const ITensorInfo *tensor_info_2, Ts... tensor_infos)
+inline arm_compute::Status error_on_mismatching_shapes(const char *function,
+ const char *file,
+ const int line,
+ unsigned int upper_dim,
+ const ITensorInfo *tensor_info_1,
+ const ITensorInfo *tensor_info_2,
+ Ts... tensor_infos)
{
ARM_COMPUTE_RETURN_ERROR_ON_LOC(tensor_info_1 == nullptr, function, file, line);
ARM_COMPUTE_RETURN_ERROR_ON_LOC(tensor_info_2 == nullptr, function, file, line);
- ARM_COMPUTE_RETURN_ON_ERROR(::arm_compute::error_on_nullptr(function, file, line, std::forward<Ts>(tensor_infos)...));
+ ARM_COMPUTE_RETURN_ON_ERROR(::arm_compute::error_on_nullptr(function, file, line, tensor_infos...));
- const std::array < const ITensorInfo *, 2 + sizeof...(Ts) > tensors_info_array{ { tensor_info_1, tensor_info_2, std::forward<Ts>(tensor_infos)... } };
- ARM_COMPUTE_RETURN_ERROR_ON_LOC_MSG(std::any_of(std::next(tensors_info_array.cbegin()), tensors_info_array.cend(), [&](const ITensorInfo * tensor_info)
- {
- return detail::have_different_dimensions((*tensors_info_array.cbegin())->tensor_shape(), tensor_info->tensor_shape(), upper_dim);
- }),
- function, file, line, "Tensors have different shapes");
+ const std::array<const ITensorInfo *, 2 + sizeof...(Ts)> tensors_info_array{
+ {tensor_info_1, tensor_info_2, tensor_infos...}};
+ ARM_COMPUTE_RETURN_ERROR_ON_LOC_MSG(std::any_of(std::next(tensors_info_array.cbegin()), tensors_info_array.cend(),
+ [&](const ITensorInfo *tensor_info)
+ {
+ return detail::have_different_dimensions(
+ (*tensors_info_array.cbegin())->tensor_shape(),
+ tensor_info->tensor_shape(), upper_dim);
+ }),
+ function, file, line, "Tensors have different shapes");
return arm_compute::Status{};
}
/** Return an error if the passed two tensors have different shapes from the given dimension
@@ -428,14 +553,20 @@ inline arm_compute::Status error_on_mismatching_shapes(const char *function, con
* @return Status
*/
template <typename... Ts>
-inline arm_compute::Status error_on_mismatching_shapes(const char *function, const char *file, const int line,
- unsigned int upper_dim, const ITensor *tensor_1, const ITensor *tensor_2, Ts... tensors)
+inline arm_compute::Status error_on_mismatching_shapes(const char *function,
+ const char *file,
+ const int line,
+ unsigned int upper_dim,
+ const ITensor *tensor_1,
+ const ITensor *tensor_2,
+ Ts... tensors)
{
ARM_COMPUTE_RETURN_ERROR_ON_LOC(tensor_1 == nullptr, function, file, line);
ARM_COMPUTE_RETURN_ERROR_ON_LOC(tensor_2 == nullptr, function, file, line);
- ARM_COMPUTE_RETURN_ON_ERROR(::arm_compute::error_on_nullptr(function, file, line, std::forward<Ts>(tensors)...));
- ARM_COMPUTE_RETURN_ON_ERROR(::arm_compute::error_on_mismatching_shapes(function, file, line, upper_dim, tensor_1->info(), tensor_2->info(),
- detail::get_tensor_info_t<ITensorInfo *>()(tensors)...));
+ ARM_COMPUTE_RETURN_ON_ERROR(::arm_compute::error_on_nullptr(function, file, line, tensors...));
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ ::arm_compute::error_on_mismatching_shapes(function, file, line, upper_dim, tensor_1->info(), tensor_2->info(),
+ detail::get_tensor_info_t<ITensorInfo *>()(tensors)...));
return arm_compute::Status{};
}
#define ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(...) \
@@ -454,19 +585,18 @@ inline arm_compute::Status error_on_mismatching_shapes(const char *function, con
* @return Status
*/
template <typename... Ts>
-inline arm_compute::Status error_on_mismatching_data_layouts(const char *function, const char *file, const int line,
- const ITensorInfo *tensor_info, Ts... tensor_infos)
+inline arm_compute::Status error_on_mismatching_data_layouts(
+ const char *function, const char *file, const int line, const ITensorInfo *tensor_info, Ts... tensor_infos)
{
ARM_COMPUTE_RETURN_ERROR_ON_LOC(tensor_info == nullptr, function, file, line);
- ARM_COMPUTE_RETURN_ON_ERROR(::arm_compute::error_on_nullptr(function, file, line, std::forward<Ts>(tensor_infos)...));
+ ARM_COMPUTE_RETURN_ON_ERROR(::arm_compute::error_on_nullptr(function, file, line, tensor_infos...));
- DataLayout &&tensor_data_layout = tensor_info->data_layout();
- const std::array<const ITensorInfo *, sizeof...(Ts)> tensors_infos_array{ { std::forward<Ts>(tensor_infos)... } };
- ARM_COMPUTE_RETURN_ERROR_ON_LOC_MSG(std::any_of(tensors_infos_array.begin(), tensors_infos_array.end(), [&](const ITensorInfo * tensor_info_obj)
- {
- return tensor_info_obj->data_layout() != tensor_data_layout;
- }),
- function, file, line, "Tensors have different data layouts");
+ DataLayout &&tensor_data_layout = tensor_info->data_layout();
+ const std::array<const ITensorInfo *, sizeof...(Ts)> tensors_infos_array{{tensor_infos...}};
+ ARM_COMPUTE_RETURN_ERROR_ON_LOC_MSG(std::any_of(tensors_infos_array.begin(), tensors_infos_array.end(),
+ [&](const ITensorInfo *tensor_info_obj)
+ { return tensor_info_obj->data_layout() != tensor_data_layout; }),
+ function, file, line, "Tensors have different data layouts");
return arm_compute::Status{};
}
/** Return an error if the passed tensors have different data layouts
@@ -480,19 +610,21 @@ inline arm_compute::Status error_on_mismatching_data_layouts(const char *functio
* @return Status
*/
template <typename... Ts>
-inline arm_compute::Status error_on_mismatching_data_layouts(const char *function, const char *file, const int line,
- const ITensor *tensor, Ts... tensors)
+inline arm_compute::Status error_on_mismatching_data_layouts(
+ const char *function, const char *file, const int line, const ITensor *tensor, Ts... tensors)
{
ARM_COMPUTE_RETURN_ERROR_ON_LOC(tensor == nullptr, function, file, line);
ARM_COMPUTE_RETURN_ON_ERROR(::arm_compute::error_on_nullptr(function, file, line, std::forward<Ts>(tensors)...));
- ARM_COMPUTE_RETURN_ON_ERROR(::arm_compute::error_on_mismatching_data_layouts(function, file, line, tensor->info(),
- detail::get_tensor_info_t<ITensorInfo *>()(tensors)...));
+ ARM_COMPUTE_RETURN_ON_ERROR(::arm_compute::error_on_mismatching_data_layouts(
+ function, file, line, tensor->info(), detail::get_tensor_info_t<ITensorInfo *>()(tensors)...));
return arm_compute::Status{};
}
#define ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_LAYOUT(...) \
- ARM_COMPUTE_ERROR_THROW_ON(::arm_compute::error_on_mismatching_data_layouts(__func__, __FILE__, __LINE__, __VA_ARGS__))
+ ARM_COMPUTE_ERROR_THROW_ON( \
+ ::arm_compute::error_on_mismatching_data_layouts(__func__, __FILE__, __LINE__, __VA_ARGS__))
#define ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(...) \
- ARM_COMPUTE_RETURN_ON_ERROR(::arm_compute::error_on_mismatching_data_layouts(__func__, __FILE__, __LINE__, __VA_ARGS__))
+ ARM_COMPUTE_RETURN_ON_ERROR( \
+ ::arm_compute::error_on_mismatching_data_layouts(__func__, __FILE__, __LINE__, __VA_ARGS__))
/** Return an error if the passed two tensor infos have different data types
*
@@ -505,19 +637,18 @@ inline arm_compute::Status error_on_mismatching_data_layouts(const char *functio
* @return Status
*/
template <typename... Ts>
-inline arm_compute::Status error_on_mismatching_data_types(const char *function, const char *file, const int line,
- const ITensorInfo *tensor_info, Ts... tensor_infos)
+inline arm_compute::Status error_on_mismatching_data_types(
+ const char *function, const char *file, const int line, const ITensorInfo *tensor_info, Ts... tensor_infos)
{
ARM_COMPUTE_RETURN_ERROR_ON_LOC(tensor_info == nullptr, function, file, line);
- ARM_COMPUTE_RETURN_ON_ERROR(::arm_compute::error_on_nullptr(function, file, line, std::forward<Ts>(tensor_infos)...));
+ ARM_COMPUTE_RETURN_ON_ERROR(::arm_compute::error_on_nullptr(function, file, line, tensor_infos...));
- DataType &&tensor_data_type = tensor_info->data_type();
- const std::array<const ITensorInfo *, sizeof...(Ts)> tensors_infos_array{ { std::forward<Ts>(tensor_infos)... } };
- ARM_COMPUTE_RETURN_ERROR_ON_LOC_MSG(std::any_of(tensors_infos_array.begin(), tensors_infos_array.end(), [&](const ITensorInfo * tensor_info_obj)
- {
- return tensor_info_obj->data_type() != tensor_data_type;
- }),
- function, file, line, "Tensors have different data types");
+ DataType &&tensor_data_type = tensor_info->data_type();
+ const std::array<const ITensorInfo *, sizeof...(Ts)> tensors_infos_array{{tensor_infos...}};
+ ARM_COMPUTE_RETURN_ERROR_ON_LOC_MSG(std::any_of(tensors_infos_array.begin(), tensors_infos_array.end(),
+ [&](const ITensorInfo *tensor_info_obj)
+ { return tensor_info_obj->data_type() != tensor_data_type; }),
+ function, file, line, "Tensors have different data types");
return arm_compute::Status{};
}
/** Return an error if the passed two tensors have different data types
@@ -531,19 +662,21 @@ inline arm_compute::Status error_on_mismatching_data_types(const char *function,
* @return Status
*/
template <typename... Ts>
-inline arm_compute::Status error_on_mismatching_data_types(const char *function, const char *file, const int line,
- const ITensor *tensor, Ts... tensors)
+inline arm_compute::Status error_on_mismatching_data_types(
+ const char *function, const char *file, const int line, const ITensor *tensor, Ts... tensors)
{
ARM_COMPUTE_RETURN_ERROR_ON_LOC(tensor == nullptr, function, file, line);
- ARM_COMPUTE_RETURN_ON_ERROR(::arm_compute::error_on_nullptr(function, file, line, std::forward<Ts>(tensors)...));
- ARM_COMPUTE_RETURN_ON_ERROR(::arm_compute::error_on_mismatching_data_types(function, file, line, tensor->info(),
- detail::get_tensor_info_t<ITensorInfo *>()(tensors)...));
+ ARM_COMPUTE_RETURN_ON_ERROR(::arm_compute::error_on_nullptr(function, file, line, tensors...));
+ ARM_COMPUTE_RETURN_ON_ERROR(::arm_compute::error_on_mismatching_data_types(
+ function, file, line, tensor->info(), detail::get_tensor_info_t<ITensorInfo *>()(tensors)...));
return arm_compute::Status{};
}
#define ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(...) \
- ARM_COMPUTE_ERROR_THROW_ON(::arm_compute::error_on_mismatching_data_types(__func__, __FILE__, __LINE__, __VA_ARGS__))
+ ARM_COMPUTE_ERROR_THROW_ON( \
+ ::arm_compute::error_on_mismatching_data_types(__func__, __FILE__, __LINE__, __VA_ARGS__))
#define ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(...) \
- ARM_COMPUTE_RETURN_ON_ERROR(::arm_compute::error_on_mismatching_data_types(__func__, __FILE__, __LINE__, __VA_ARGS__))
+ ARM_COMPUTE_RETURN_ON_ERROR( \
+ ::arm_compute::error_on_mismatching_data_types(__func__, __FILE__, __LINE__, __VA_ARGS__))
/** Return an error if the passed tensor infos have different asymmetric quantized data types or different quantization info
*
@@ -559,28 +692,32 @@ inline arm_compute::Status error_on_mismatching_data_types(const char *function,
* @return Status
*/
template <typename... Ts>
-inline arm_compute::Status error_on_mismatching_quantization_info(const char *function, const char *file, const int line,
- const ITensorInfo *tensor_info_1, const ITensorInfo *tensor_info_2, Ts... tensor_infos)
+inline arm_compute::Status error_on_mismatching_quantization_info(const char *function,
+ const char *file,
+ const int line,
+ const ITensorInfo *tensor_info_1,
+ const ITensorInfo *tensor_info_2,
+ Ts... tensor_infos)
{
DataType &&first_data_type = tensor_info_1->data_type();
const QuantizationInfo first_quantization_info = tensor_info_1->quantization_info();
- if(!is_data_type_quantized(first_data_type))
+ if (!is_data_type_quantized(first_data_type))
{
return arm_compute::Status{};
}
- const std::array < const ITensorInfo *, 1 + sizeof...(Ts) > tensor_infos_array{ { tensor_info_2, std::forward<Ts>(tensor_infos)... } };
- ARM_COMPUTE_RETURN_ERROR_ON_LOC_MSG(std::any_of(tensor_infos_array.begin(), tensor_infos_array.end(), [&](const ITensorInfo * tensor_info)
- {
- return tensor_info->data_type() != first_data_type;
- }),
- function, file, line, "Tensors have different asymmetric quantized data types");
- ARM_COMPUTE_RETURN_ERROR_ON_LOC_MSG(std::any_of(tensor_infos_array.begin(), tensor_infos_array.end(), [&](const ITensorInfo * tensor_info)
- {
- return tensor_info->quantization_info() != first_quantization_info;
- }),
- function, file, line, "Tensors have different quantization information");
+ const std::array<const ITensorInfo *, 1 + sizeof...(Ts)> tensor_infos_array{
+ {tensor_info_2, std::forward<Ts>(tensor_infos)...}};
+ ARM_COMPUTE_RETURN_ERROR_ON_LOC_MSG(std::any_of(tensor_infos_array.begin(), tensor_infos_array.end(),
+ [&](const ITensorInfo *tensor_info)
+ { return tensor_info->data_type() != first_data_type; }),
+ function, file, line, "Tensors have different asymmetric quantized data types");
+ ARM_COMPUTE_RETURN_ERROR_ON_LOC_MSG(
+ std::any_of(tensor_infos_array.begin(), tensor_infos_array.end(),
+ [&](const ITensorInfo *tensor_info)
+ { return tensor_info->quantization_info() != first_quantization_info; }),
+ function, file, line, "Tensors have different quantization information");
return arm_compute::Status{};
}
@@ -598,17 +735,24 @@ inline arm_compute::Status error_on_mismatching_quantization_info(const char *fu
* @return Status
*/
template <typename... Ts>
-inline arm_compute::Status error_on_mismatching_quantization_info(const char *function, const char *file, const int line,
- const ITensor *tensor_1, const ITensor *tensor_2, Ts... tensors)
+inline arm_compute::Status error_on_mismatching_quantization_info(const char *function,
+ const char *file,
+ const int line,
+ const ITensor *tensor_1,
+ const ITensor *tensor_2,
+ Ts... tensors)
{
- ARM_COMPUTE_RETURN_ON_ERROR(::arm_compute::error_on_mismatching_quantization_info(function, file, line, tensor_1->info(), tensor_2->info(),
- detail::get_tensor_info_t<ITensorInfo *>()(tensors)...));
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ ::arm_compute::error_on_mismatching_quantization_info(function, file, line, tensor_1->info(), tensor_2->info(),
+ detail::get_tensor_info_t<ITensorInfo *>()(tensors)...));
return arm_compute::Status{};
}
#define ARM_COMPUTE_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(...) \
- ARM_COMPUTE_ERROR_THROW_ON(::arm_compute::error_on_mismatching_quantization_info(__func__, __FILE__, __LINE__, __VA_ARGS__))
+ ARM_COMPUTE_ERROR_THROW_ON( \
+ ::arm_compute::error_on_mismatching_quantization_info(__func__, __FILE__, __LINE__, __VA_ARGS__))
#define ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(...) \
- ARM_COMPUTE_RETURN_ON_ERROR(::arm_compute::error_on_mismatching_quantization_info(__func__, __FILE__, __LINE__, __VA_ARGS__))
+ ARM_COMPUTE_RETURN_ON_ERROR( \
+ ::arm_compute::error_on_mismatching_quantization_info(__func__, __FILE__, __LINE__, __VA_ARGS__))
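Usage sketch (hypothetical names): the quantization check above is only meaningful for quantized data types; for float tensors it succeeds immediately because is_data_type_quantized() is false:

    arm_compute::Status validate_quantized_sketch(const arm_compute::ITensorInfo *in, const arm_compute::ITensorInfo *out)
    {
        // For e.g. QASYMM8 tensors this enforces equal data types and equal
        // QuantizationInfo; for F32 tensors it is a no-op.
        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(in, out);
        return arm_compute::Status{};
    }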
/** Throw an error if the format of the passed tensor/multi-image does not match any of the formats provided.
*
@@ -620,8 +764,8 @@ inline arm_compute::Status error_on_mismatching_quantization_info(const char *fu
* @param[in] formats (Optional) Further allowed formats.
*/
template <typename T, typename F, typename... Fs>
-void error_on_format_not_in(const char *function, const char *file, const int line,
- const T *object, F &&format, Fs &&... formats)
+void error_on_format_not_in(
+ const char *function, const char *file, const int line, const T *object, F &&format, Fs &&...formats)
{
ARM_COMPUTE_ERROR_ON_LOC(object == nullptr, function, file, line);
@@ -630,17 +774,17 @@ void error_on_format_not_in(const char *function, const char *file, const int li
ARM_COMPUTE_ERROR_ON_LOC(object_format == Format::UNKNOWN, function, file, line);
- const std::array<F, sizeof...(Fs)> formats_array{ { std::forward<Fs>(formats)... } };
+ const std::array<F, sizeof...(Fs)> formats_array{{std::forward<Fs>(formats)...}};
ARM_COMPUTE_UNUSED(formats_array);
- ARM_COMPUTE_ERROR_ON_LOC_MSG(object_format != format && std::none_of(formats_array.begin(), formats_array.end(), [&](const F & f)
- {
- return f == object_format;
- }),
- function, file, line, "Format %s not supported by this kernel", string_from_format(object_format).c_str());
+ ARM_COMPUTE_ERROR_ON_LOC_MSG(
+ object_format != format &&
+ std::none_of(formats_array.begin(), formats_array.end(), [&](const F &f) { return f == object_format; }),
+ function, file, line, "Format %s not supported by this kernel", string_from_format(object_format).c_str());
ARM_COMPUTE_UNUSED(function, format, file, line);
}
-#define ARM_COMPUTE_ERROR_ON_FORMAT_NOT_IN(t, ...) ::arm_compute::error_on_format_not_in(__func__, __FILE__, __LINE__, t, __VA_ARGS__)
+#define ARM_COMPUTE_ERROR_ON_FORMAT_NOT_IN(t, ...) \
+ ::arm_compute::error_on_format_not_in(__func__, __FILE__, __LINE__, t, __VA_ARGS__)
/** Return an error if the data type of the passed tensor info does not match any of the data types provided.
*
@@ -654,20 +798,19 @@ void error_on_format_not_in(const char *function, const char *file, const int li
* @return Status
*/
template <typename T, typename... Ts>
-inline arm_compute::Status error_on_data_type_not_in(const char *function, const char *file, const int line,
- const ITensorInfo *tensor_info, T &&dt, Ts &&... dts)
+inline arm_compute::Status error_on_data_type_not_in(
+ const char *function, const char *file, const int line, const ITensorInfo *tensor_info, T &&dt, Ts &&...dts)
{
ARM_COMPUTE_RETURN_ERROR_ON_LOC(tensor_info == nullptr, function, file, line);
const DataType &tensor_dt = tensor_info->data_type(); //NOLINT
ARM_COMPUTE_RETURN_ERROR_ON_LOC(tensor_dt == DataType::UNKNOWN, function, file, line);
- const std::array<T, sizeof...(Ts)> dts_array{ { std::forward<Ts>(dts)... } };
- ARM_COMPUTE_RETURN_ERROR_ON_LOC_MSG_VAR(tensor_dt != dt && std::none_of(dts_array.begin(), dts_array.end(), [&](const T & d)
- {
- return d == tensor_dt;
- }),
- function, file, line, "ITensor data type %s not supported by this kernel", string_from_data_type(tensor_dt).c_str());
+ const std::array<T, sizeof...(Ts)> dts_array{{std::forward<Ts>(dts)...}};
+ ARM_COMPUTE_RETURN_ERROR_ON_LOC_MSG_VAR(
+ tensor_dt != dt && std::none_of(dts_array.begin(), dts_array.end(), [&](const T &d) { return d == tensor_dt; }),
+ function, file, line, "ITensor data type %s not supported by this kernel",
+ string_from_data_type(tensor_dt).c_str());
return arm_compute::Status{};
}
/** Return an error if the data type of the passed tensor does not match any of the data types provided.
@@ -682,11 +825,12 @@ inline arm_compute::Status error_on_data_type_not_in(const char *function, const
* @return Status
*/
template <typename T, typename... Ts>
-inline arm_compute::Status error_on_data_type_not_in(const char *function, const char *file, const int line,
- const ITensor *tensor, T &&dt, Ts &&... dts)
+inline arm_compute::Status error_on_data_type_not_in(
+ const char *function, const char *file, const int line, const ITensor *tensor, T &&dt, Ts &&...dts)
{
ARM_COMPUTE_RETURN_ERROR_ON_LOC(tensor == nullptr, function, file, line);
- ARM_COMPUTE_RETURN_ON_ERROR(::arm_compute::error_on_data_type_not_in(function, file, line, tensor->info(), std::forward<T>(dt), std::forward<Ts>(dts)...));
+ ARM_COMPUTE_RETURN_ON_ERROR(::arm_compute::error_on_data_type_not_in(
+ function, file, line, tensor->info(), std::forward<T>(dt), std::forward<Ts>(dts)...));
return arm_compute::Status{};
}
#define ARM_COMPUTE_ERROR_ON_DATA_TYPE_NOT_IN(t, ...) \
@@ -706,20 +850,19 @@ inline arm_compute::Status error_on_data_type_not_in(const char *function, const
* @return Status
*/
template <typename T, typename... Ts>
-inline arm_compute::Status error_on_data_layout_not_in(const char *function, const char *file, const int line,
- const ITensorInfo *tensor_info, T &&dl, Ts &&... dls)
+inline arm_compute::Status error_on_data_layout_not_in(
+ const char *function, const char *file, const int line, const ITensorInfo *tensor_info, T &&dl, Ts &&...dls)
{
ARM_COMPUTE_RETURN_ERROR_ON_LOC(tensor_info == nullptr, function, file, line);
const DataLayout &tensor_dl = tensor_info->data_layout(); //NOLINT
ARM_COMPUTE_RETURN_ERROR_ON_LOC(tensor_dl == DataLayout::UNKNOWN, function, file, line);
- const std::array<T, sizeof...(Ts)> dls_array{ { std::forward<Ts>(dls)... } };
- ARM_COMPUTE_RETURN_ERROR_ON_LOC_MSG_VAR(tensor_dl != dl && std::none_of(dls_array.begin(), dls_array.end(), [&](const T & l)
- {
- return l == tensor_dl;
- }),
- function, file, line, "ITensor data layout %s not supported by this kernel", string_from_data_layout(tensor_dl).c_str());
+ const std::array<T, sizeof...(Ts)> dls_array{{std::forward<Ts>(dls)...}};
+ ARM_COMPUTE_RETURN_ERROR_ON_LOC_MSG_VAR(
+ tensor_dl != dl && std::none_of(dls_array.begin(), dls_array.end(), [&](const T &l) { return l == tensor_dl; }),
+ function, file, line, "ITensor data layout %s not supported by this kernel",
+ string_from_data_layout(tensor_dl).c_str());
return arm_compute::Status{};
}
/** Return an error if the data layout of the passed tensor does not match any of the data layout provided.
@@ -734,17 +877,19 @@ inline arm_compute::Status error_on_data_layout_not_in(const char *function, con
* @return Status
*/
template <typename T, typename... Ts>
-inline arm_compute::Status error_on_data_layout_not_in(const char *function, const char *file, const int line,
- const ITensor *tensor, T &&dl, Ts &&... dls)
+inline arm_compute::Status error_on_data_layout_not_in(
+ const char *function, const char *file, const int line, const ITensor *tensor, T &&dl, Ts &&...dls)
{
ARM_COMPUTE_RETURN_ERROR_ON_LOC(tensor == nullptr, function, file, line);
- ARM_COMPUTE_RETURN_ON_ERROR(::arm_compute::error_on_data_layout_not_in(function, file, line, tensor->info(), std::forward<T>(dl), std::forward<Ts>(dls)...));
+ ARM_COMPUTE_RETURN_ON_ERROR(::arm_compute::error_on_data_layout_not_in(
+ function, file, line, tensor->info(), std::forward<T>(dl), std::forward<Ts>(dls)...));
return arm_compute::Status{};
}
#define ARM_COMPUTE_ERROR_ON_DATA_LAYOUT_NOT_IN(t, ...) \
ARM_COMPUTE_ERROR_THROW_ON(::arm_compute::error_on_data_layout_not_in(__func__, __FILE__, __LINE__, t, __VA_ARGS__))
#define ARM_COMPUTE_RETURN_ERROR_ON_DATA_LAYOUT_NOT_IN(t, ...) \
- ARM_COMPUTE_RETURN_ON_ERROR(::arm_compute::error_on_data_layout_not_in(__func__, __FILE__, __LINE__, t, __VA_ARGS__))
+ ARM_COMPUTE_RETURN_ON_ERROR( \
+ ::arm_compute::error_on_data_layout_not_in(__func__, __FILE__, __LINE__, t, __VA_ARGS__))
/** Return an error if the data type or the number of channels of the passed tensor info does not match any of the data types and number of channels provided.
*
@@ -759,12 +904,20 @@ inline arm_compute::Status error_on_data_layout_not_in(const char *function, con
* @return Status
*/
template <typename T, typename... Ts>
-inline arm_compute::Status error_on_data_type_channel_not_in(const char *function, const char *file, const int line,
- const ITensorInfo *tensor_info, size_t num_channels, T &&dt, Ts &&... dts)
+inline arm_compute::Status error_on_data_type_channel_not_in(const char *function,
+ const char *file,
+ const int line,
+ const ITensorInfo *tensor_info,
+ size_t num_channels,
+ T &&dt,
+ Ts &&...dts)
{
- ARM_COMPUTE_RETURN_ON_ERROR(::arm_compute::error_on_data_type_not_in(function, file, line, tensor_info, std::forward<T>(dt), std::forward<Ts>(dts)...));
+ ARM_COMPUTE_RETURN_ON_ERROR(::arm_compute::error_on_data_type_not_in(
+ function, file, line, tensor_info, std::forward<T>(dt), std::forward<Ts>(dts)...));
const size_t tensor_nc = tensor_info->num_channels();
- ARM_COMPUTE_RETURN_ERROR_ON_LOC_MSG_VAR(tensor_nc != num_channels, function, file, line, "Number of channels %zu. Required number of channels %zu", tensor_nc, num_channels);
+ ARM_COMPUTE_RETURN_ERROR_ON_LOC_MSG_VAR(tensor_nc != num_channels, function, file, line,
+ "Number of channels %zu. Required number of channels %zu", tensor_nc,
+ num_channels);
return arm_compute::Status{};
}
/** Return an error if the data type or the number of channels of the passed tensor does not match any of the data types and number of channels provided.
@@ -780,17 +933,25 @@ inline arm_compute::Status error_on_data_type_channel_not_in(const char *functio
* @return Status
*/
template <typename T, typename... Ts>
-inline arm_compute::Status error_on_data_type_channel_not_in(const char *function, const char *file, const int line,
- const ITensor *tensor, size_t num_channels, T &&dt, Ts &&... dts)
+inline arm_compute::Status error_on_data_type_channel_not_in(const char *function,
+ const char *file,
+ const int line,
+ const ITensor *tensor,
+ size_t num_channels,
+ T &&dt,
+ Ts &&...dts)
{
ARM_COMPUTE_RETURN_ERROR_ON_LOC(tensor == nullptr, function, file, line);
- ARM_COMPUTE_RETURN_ON_ERROR(error_on_data_type_channel_not_in(function, file, line, tensor->info(), num_channels, std::forward<T>(dt), std::forward<Ts>(dts)...));
+ ARM_COMPUTE_RETURN_ON_ERROR(error_on_data_type_channel_not_in(function, file, line, tensor->info(), num_channels,
+ std::forward<T>(dt), std::forward<Ts>(dts)...));
return arm_compute::Status{};
}
#define ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(t, c, ...) \
- ARM_COMPUTE_ERROR_THROW_ON(::arm_compute::error_on_data_type_channel_not_in(__func__, __FILE__, __LINE__, t, c, __VA_ARGS__))
+ ARM_COMPUTE_ERROR_THROW_ON( \
+ ::arm_compute::error_on_data_type_channel_not_in(__func__, __FILE__, __LINE__, t, c, __VA_ARGS__))
#define ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(t, c, ...) \
- ARM_COMPUTE_RETURN_ON_ERROR(::arm_compute::error_on_data_type_channel_not_in(__func__, __FILE__, __LINE__, t, c, __VA_ARGS__))
+ ARM_COMPUTE_RETURN_ON_ERROR( \
+ ::arm_compute::error_on_data_type_channel_not_in(__func__, __FILE__, __LINE__, t, c, __VA_ARGS__))
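Usage sketch (hypothetical names): the channel variant first runs the data-type check and then compares the channel count, so one call covers both constraints:

    arm_compute::Status validate_channels_sketch(const arm_compute::ITensorInfo *src)
    {
        // Require a single-channel F16 or F32 tensor.
        ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, arm_compute::DataType::F16,
                                                             arm_compute::DataType::F32);
        return arm_compute::Status{};
    }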
/** Return an error if the data type of the passed tensor info is FP16 and FP16 extension is not supported by the device.
*
@@ -802,12 +963,12 @@ inline arm_compute::Status error_on_data_type_channel_not_in(const char *functio
*
* @return Status
*/
-inline arm_compute::Status error_on_unsupported_fp16(const char *function, const char *file, const int line,
- const ITensorInfo *tensor_info, bool is_fp16_supported)
+inline arm_compute::Status error_on_unsupported_fp16(
+ const char *function, const char *file, const int line, const ITensorInfo *tensor_info, bool is_fp16_supported)
{
ARM_COMPUTE_RETURN_ERROR_ON_LOC(tensor_info == nullptr, function, file, line);
- ARM_COMPUTE_RETURN_ERROR_ON_LOC_MSG((tensor_info->data_type() == DataType::F16 && !is_fp16_supported),
- function, file, line, "FP16 not supported by the device");
+ ARM_COMPUTE_RETURN_ERROR_ON_LOC_MSG((tensor_info->data_type() == DataType::F16 && !is_fp16_supported), function,
+ file, line, "FP16 not supported by the device");
return arm_compute::Status{};
}
@@ -821,11 +982,12 @@ inline arm_compute::Status error_on_unsupported_fp16(const char *function, const
*
* @return Status
*/
-inline arm_compute::Status error_on_unsupported_fp16(const char *function, const char *file, const int line,
- const ITensor *tensor, bool is_fp16_supported)
+inline arm_compute::Status error_on_unsupported_fp16(
+ const char *function, const char *file, const int line, const ITensor *tensor, bool is_fp16_supported)
{
ARM_COMPUTE_RETURN_ERROR_ON_LOC(tensor == nullptr, function, file, line);
- ARM_COMPUTE_RETURN_ON_ERROR(::arm_compute::error_on_unsupported_fp16(function, file, line, tensor->info(), is_fp16_supported));
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ ::arm_compute::error_on_unsupported_fp16(function, file, line, tensor->info(), is_fp16_supported));
return arm_compute::Status{};
}
@@ -838,8 +1000,8 @@ inline arm_compute::Status error_on_unsupported_fp16(const char *function, const
*
* @return Status
*/
-arm_compute::Status error_on_tensor_not_2d(const char *function, const char *file, const int line,
- const ITensor *tensor);
+arm_compute::Status
+error_on_tensor_not_2d(const char *function, const char *file, const int line, const ITensor *tensor);
/** Return an error if the tensor info is not 2D.
*
@@ -850,8 +1012,8 @@ arm_compute::Status error_on_tensor_not_2d(const char *function, const char *fil
*
* @return Status
*/
-arm_compute::Status error_on_tensor_not_2d(const char *function, const char *file, const int line,
- const ITensorInfo *tensor);
+arm_compute::Status
+error_on_tensor_not_2d(const char *function, const char *file, const int line, const ITensorInfo *tensor);
#define ARM_COMPUTE_ERROR_ON_TENSOR_NOT_2D(t) \
ARM_COMPUTE_ERROR_THROW_ON(::arm_compute::error_on_tensor_not_2d(__func__, __FILE__, __LINE__, t))
@@ -870,17 +1032,15 @@ arm_compute::Status error_on_tensor_not_2d(const char *function, const char *fil
* @return Status
*/
template <typename T, typename... Ts>
-inline arm_compute::Status error_on_channel_not_in(const char *function, const char *file, const int line,
- T cn, T &&channel, Ts &&... channels)
+inline arm_compute::Status
+error_on_channel_not_in(const char *function, const char *file, const int line, T cn, T &&channel, Ts &&...channels)
{
ARM_COMPUTE_RETURN_ERROR_ON_LOC(cn == Channel::UNKNOWN, function, file, line);
- const std::array<T, sizeof...(Ts)> channels_array{ { std::forward<Ts>(channels)... } };
- ARM_COMPUTE_RETURN_ERROR_ON_LOC(channel != cn && std::none_of(channels_array.begin(), channels_array.end(), [&](const T & f)
- {
- return f == cn;
- }),
- function, file, line);
+ const std::array<T, sizeof...(Ts)> channels_array{{std::forward<Ts>(channels)...}};
+ ARM_COMPUTE_RETURN_ERROR_ON_LOC(channel != cn && std::none_of(channels_array.begin(), channels_array.end(),
+ [&](const T &f) { return f == cn; }),
+ function, file, line);
return arm_compute::Status{};
}
#define ARM_COMPUTE_ERROR_ON_CHANNEL_NOT_IN(c, ...) \
@@ -898,35 +1058,13 @@ inline arm_compute::Status error_on_channel_not_in(const char *function, const c
*
* @return Status
*/
-arm_compute::Status error_on_channel_not_in_known_format(const char *function, const char *file, const int line,
- Format fmt, Channel cn);
+arm_compute::Status
+error_on_channel_not_in_known_format(const char *function, const char *file, const int line, Format fmt, Channel cn);
#define ARM_COMPUTE_ERROR_ON_CHANNEL_NOT_IN_KNOWN_FORMAT(f, c) \
ARM_COMPUTE_ERROR_THROW_ON(::arm_compute::error_on_channel_not_in_known_format(__func__, __FILE__, __LINE__, f, c))
#define ARM_COMPUTE_RETURN_ERROR_ON_CHANNEL_NOT_IN_KNOWN_FORMAT(f, c) \
ARM_COMPUTE_RETURN_ON_ERROR(::arm_compute::error_on_channel_not_in_known_format(__func__, __FILE__, __LINE__, f, c))
-/** Return an error if the @ref IMultiHOG container is invalid
- *
- * An @ref IMultiHOG container is invalid if:
- *
- * -# it is a nullptr
- * -# it doesn't contain models
- * -# it doesn't have the HOG data objects with the same phase_type, normalization_type and l2_hyst_threshold (if normalization_type == L2HYS_NORM)
- *
- * @param[in] function Function in which the error occurred.
- * @param[in] file Name of the file where the error occurred.
- * @param[in] line Line on which the error occurred.
- * @param[in] multi_hog IMultiHOG container to validate
- *
- * @return Status
- */
-arm_compute::Status error_on_invalid_multi_hog(const char *function, const char *file, const int line,
- const IMultiHOG *multi_hog);
-#define ARM_COMPUTE_ERROR_ON_INVALID_MULTI_HOG(m) \
- ARM_COMPUTE_ERROR_THROW_ON(::arm_compute::error_on_invalid_multi_hog(__func__, __FILE__, __LINE__, m))
-#define ARM_COMPUTE_RETURN_ERROR_ON_INVALID_MULTI_HOG(m) \
- ARM_COMPUTE_RETURN_ON_ERROR(::arm_compute::error_on_invalid_multi_hog(__func__, __FILE__, __LINE__, m))
-
/** Return an error if the kernel is not configured.
*
* @param[in] function Function in which the error occurred.
@@ -936,8 +1074,8 @@ arm_compute::Status error_on_invalid_multi_hog(const char *function, const char
*
* @return Status
*/
-arm_compute::Status error_on_unconfigured_kernel(const char *function, const char *file, const int line,
- const IKernel *kernel);
+arm_compute::Status
+error_on_unconfigured_kernel(const char *function, const char *file, const int line, const IKernel *kernel);
#define ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(k) \
ARM_COMPUTE_ERROR_THROW_ON(::arm_compute::error_on_unconfigured_kernel(__func__, __FILE__, __LINE__, k))
#define ARM_COMPUTE_RETURN_ERROR_ON_UNCONFIGURED_KERNEL(k) \
@@ -954,8 +1092,12 @@ arm_compute::Status error_on_unconfigured_kernel(const char *function, const cha
*
* @return Status
*/
-arm_compute::Status error_on_invalid_subtensor(const char *function, const char *file, const int line,
- const TensorShape &parent_shape, const Coordinates &coords, const TensorShape &shape);
+arm_compute::Status error_on_invalid_subtensor(const char *function,
+ const char *file,
+ const int line,
+ const TensorShape &parent_shape,
+ const Coordinates &coords,
+ const TensorShape &shape);
#define ARM_COMPUTE_ERROR_ON_INVALID_SUBTENSOR(p, c, s) \
ARM_COMPUTE_ERROR_THROW_ON(::arm_compute::error_on_invalid_subtensor(__func__, __FILE__, __LINE__, p, c, s))
#define ARM_COMPUTE_RETURN_ERROR_ON_INVALID_SUBTENSOR(p, c, s) \
@@ -971,11 +1113,16 @@ arm_compute::Status error_on_invalid_subtensor(const char *function, const char
*
* @return Status
*/
-arm_compute::Status error_on_invalid_subtensor_valid_region(const char *function, const char *file, const int line,
- const ValidRegion &parent_valid_region, const ValidRegion &valid_region);
+arm_compute::Status error_on_invalid_subtensor_valid_region(const char *function,
+ const char *file,
+ const int line,
+ const ValidRegion &parent_valid_region,
+ const ValidRegion &valid_region);
#define ARM_COMPUTE_ERROR_ON_INVALID_SUBTENSOR_VALID_REGION(pv, sv) \
- ARM_COMPUTE_ERROR_THROW_ON(::arm_compute::error_on_invalid_subtensor_valid_region(__func__, __FILE__, __LINE__, pv, sv))
+ ARM_COMPUTE_ERROR_THROW_ON( \
+ ::arm_compute::error_on_invalid_subtensor_valid_region(__func__, __FILE__, __LINE__, pv, sv))
#define ARM_COMPUTE_RETURN_ERROR_ON_INVALID_SUBTENSOR_VALID_REGION(pv, sv) \
- ARM_COMPUTE_RETURN_ON_ERROR(::arm_compute::error_on_invalid_subtensor_valid_region(__func__, __FILE__, __LINE__, pv, sv))
-}
+ ARM_COMPUTE_RETURN_ON_ERROR( \
+ ::arm_compute::error_on_invalid_subtensor_valid_region(__func__, __FILE__, __LINE__, pv, sv))
+} // namespace arm_compute
#endif /* ARM_COMPUTE_VALIDATE_H*/
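For orientation, the two macro families in this header differ only in failure behaviour; a sketch with hypothetical functions (assuming the RETURN variant of the 2D check is defined in the hunk elided above):

    void configure_sketch(const arm_compute::ITensor *t)
    {
        // Configure-time violation: asserts/throws via ARM_COMPUTE_ERROR_THROW_ON.
        ARM_COMPUTE_ERROR_ON_TENSOR_NOT_2D(t);
    }

    arm_compute::Status validate_2d_sketch(const arm_compute::ITensorInfo *t)
    {
        // Validate-time violation: early-returns a Status instead of throwing.
        ARM_COMPUTE_RETURN_ERROR_ON_TENSOR_NOT_2D(t);
        return arm_compute::Status{};
    }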
diff --git a/arm_compute/core/Version.h b/arm_compute/core/Version.h
index be3f0264bb..44d400bad8 100644
--- a/arm_compute/core/Version.h
+++ b/arm_compute/core/Version.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2020 ARM Limited.
+ * Copyright (c) 2020 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -27,12 +27,12 @@
#include <string>
/* Macro utilities */
-#define STRINGIFY2(s) #s
-#define STRINGIFY(s) STRINGIFY2(s)
+#define ARM_COMPUTE_STRINGIFY2(s) #s
+#define ARM_COMPUTE_STRINGIFY(s) ARM_COMPUTE_STRINGIFY2(s)
-#define ARM_COMPUTE_VERSION_STR \
- STRINGIFY(ARM_COMPUTE_VERSION_MAJOR) \
- "." STRINGIFY(ARM_COMPUTE_VERSION_MINOR) "." STRINGIFY(ARM_COMPUTE_VERSION_PATCH)
+#define ARM_COMPUTE_VERSION_STR \
+ ARM_COMPUTE_STRINGIFY(ARM_COMPUTE_VERSION_MAJOR) \
+ "." ARM_COMPUTE_STRINGIFY(ARM_COMPUTE_VERSION_MINOR) "." ARM_COMPUTE_STRINGIFY(ARM_COMPUTE_VERSION_PATCH)
namespace arm_compute
{
@@ -45,4 +45,7 @@ namespace arm_compute
std::string build_information();
} // namespace arm_compute
+#undef ARM_COMPUTE_STRINGIFY
+#undef ARM_COMPUTE_STRINGIFY2
+
#endif /* ARM_COMPUTE_LIBRARY_VERSION_H */
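The renamed stringify macros still need two expansion levels; a self-contained sketch (demo names hypothetical) of why:

    #define DEMO_MAJOR 52
    #define DEMO_STR2(s) #s
    #define DEMO_STR(s) DEMO_STR2(s)
    // The extra level lets DEMO_MAJOR expand before '#' stringifies it:
    static_assert(sizeof(DEMO_STR(DEMO_MAJOR)) == 3, "yields \"52\"");
    static_assert(sizeof(DEMO_STR2(DEMO_MAJOR)) == 11, "yields \"DEMO_MAJOR\"");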
diff --git a/arm_compute/core/Window.h b/arm_compute/core/Window.h
index d6690d484a..e93d2863c9 100644
--- a/arm_compute/core/Window.h
+++ b/arm_compute/core/Window.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2016-2019 ARM Limited.
+ * Copyright (c) 2016-2020, 2022-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -21,17 +21,17 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
-#ifndef ARM_COMPUTE_WINDOW_H
-#define ARM_COMPUTE_WINDOW_H
-
-#include <algorithm>
-#include <array>
-#include <cstddef>
+#ifndef ACL_ARM_COMPUTE_CORE_WINDOW_H
+#define ACL_ARM_COMPUTE_CORE_WINDOW_H
#include "arm_compute/core/Coordinates.h"
#include "arm_compute/core/Error.h"
#include "arm_compute/core/ITensorInfo.h"
-#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/utils/math/Math.h"
+
+#include <algorithm>
+#include <array>
+#include <cstddef>
namespace arm_compute
{
@@ -45,6 +45,10 @@ public:
static constexpr size_t DimY = 1;
/** Alias for dimension 2 also known as Z dimension */
static constexpr size_t DimZ = 2;
+ /** Alias for dimension 3 also known as W dimension */
+ static constexpr size_t DimW = 3;
+ /** Alias for dimension 4 also known as V dimension */
+ static constexpr size_t DimV = 4;
/** Default constructor: create a window containing a single element. */
constexpr Window()
@@ -82,10 +86,10 @@ public:
* @param[in] step Step between two elements of the dimension when iterating.
*
*/
- constexpr Dimension(int start = 0, int end = 1, int step = 1)
- : _start(start), _end(end), _step(step)
+ constexpr Dimension(int start = 0, int end = 1, int step = 1) : _start(start), _end(end), _step(step)
{
}
+ Dimension(const Dimension &d) = default;
/** Default assignment operator to allow dimensions to be copied */
Dimension &operator=(const Dimension &d) = default;
/** Return the start of the dimension */
@@ -119,6 +123,17 @@ public:
{
_end = end;
}
+ /** Check whether two Dimensions are equal.
+ *
+ * @param[in] lhs LHS Dimension
+ * @param[in] rhs RHS Dimension

+ *
+ * @return True if the Dimensions are the same.
+ */
+ friend bool operator==(const Dimension &lhs, const Dimension &rhs)
+ {
+ return (lhs._start == rhs._start) && (lhs._end == rhs._end) && (lhs._step == rhs._step);
+ }
private:
int _start; /**< Start of the dimension */
@@ -198,15 +213,17 @@ public:
*/
void shift(size_t dimension, int shift_value);
- /** Shift down all the dimensions of a window
+ /** Shift down all the dimensions of a window starting from the specified dimension.
*
- * i.e new_dims[n] = old_dims[n+shift_value].
+ * new_dims[i] = old_dims[i] for all i < start_dim.
+ * new_dims[i] = old_dims[i+shift_value] for all i >= start_dim.
*
* @param[in] shift_value Number of dimensions to shift the window by.
+ * @param[in] start_dim The first dimension to shift.
*
* @return The window with the shifted dimensions.
*/
- Window shift_dimensions(unsigned int shift_value) const;
+ Window shift_dimensions(unsigned int shift_value, unsigned int start_dim = 0) const;
/** Adjust the start or end of a given dimension by the given value
*
@@ -346,7 +363,6 @@ public:
{
return slide_window_slice<4>(slice);
}
-
/** Collapse the dimensions between @p first and @p last if possible.
*
* A dimension is collapsable if it starts from 0 and matches the corresponding dimension in the full_window
@@ -358,7 +374,8 @@ public:
*
* @return Collapsed window.
*/
- Window collapse_if_possible(const Window &full_window, size_t first, size_t last, bool *has_collapsed = nullptr) const;
+ Window
+ collapse_if_possible(const Window &full_window, size_t first, size_t last, bool *has_collapsed = nullptr) const;
/** Collapse the dimensions higher than @p first if possible.
*
@@ -411,6 +428,14 @@ public:
* @param[in] rhs Second window to swap.
*/
friend void swap(Window &lhs, Window &rhs);
+ /** Check whether two Windows are equal.
+ *
+ * @param[in] lhs LHS window
+ * @param[in] rhs RHS window
+ *
+ * @return True if the given windows are the same.
+ */
+ friend bool operator==(const Window &lhs, const Window &rhs);
private:
/** First slice of the window
@@ -418,7 +443,7 @@ private:
* @return The first slice of the window.
*/
template <unsigned int window_dimension>
- Window first_slice_window() const;
+ Window first_slice_window() const;
/** Slide the passed window slice.
*
@@ -437,4 +462,4 @@ private:
};
} // namespace arm_compute
#include "Window.inl"
-#endif /*ARM_COMPUTE_WINDOW_H */
+#endif // ACL_ARM_COMPUTE_CORE_WINDOW_H
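A short sketch (illustrative values) of the new Window equality together with the existing setters:

    #include "arm_compute/core/Window.h"

    bool windows_equal_sketch()
    {
        arm_compute::Window win;
        win.set(arm_compute::Window::DimX, arm_compute::Window::Dimension(0, 128, 16));
        win.set(arm_compute::Window::DimY, arm_compute::Window::Dimension(0, 32));
        arm_compute::Window copy(win);
        // operator== compares every Dimension and the broadcast flags.
        return copy == win; // true
    }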
diff --git a/arm_compute/core/Window.inl b/arm_compute/core/Window.inl
index 70c4f80ac2..0f7c4fbdd7 100644
--- a/arm_compute/core/Window.inl
+++ b/arm_compute/core/Window.inl
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2016-2020 ARM Limited.
+ * Copyright (c) 2016-2020, 2022-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -21,12 +21,16 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
+
+#ifndef ACL_ARM_COMPUTE_CORE_WINDOW_INL
+#define ACL_ARM_COMPUTE_CORE_WINDOW_INL
+
namespace arm_compute
{
inline Window::Window(const Window &src)
: _dims(), _is_broadcasted(utility::generate_array<bool, Coordinates::num_max_dimensions, false>::value)
{
- for(size_t i = 0; i < Coordinates::num_max_dimensions; ++i)
+ for (size_t i = 0; i < Coordinates::num_max_dimensions; ++i)
{
set(i, src[i]);
_is_broadcasted[i] = src.is_broadcasted(i);
@@ -65,32 +69,34 @@ inline bool Window::is_broadcasted(size_t dimension) const
return _is_broadcasted[dimension];
}
-inline Window Window::collapse_if_possible(const Window &full_window, const size_t first,
- const size_t last, bool *has_collapsed) const
+inline Window Window::collapse_if_possible(const Window &full_window,
+ const size_t first,
+ const size_t last,
+ bool *has_collapsed) const
{
Window collapsed(*this);
bool is_collapsable = true;
int collapsed_end = _dims[first].end();
- for(size_t d = first + 1; is_collapsable && (d < last); ++d)
+ for (size_t d = first + 1; is_collapsable && (d < last); ++d)
{
// The _dims's dimension must match the full _dims dimension to be collapsable:
- is_collapsable = (_dims[d].start() == 0) && (full_window[d].start() == 0) && (_dims[d].step() <= 1)
- && (full_window[d].end() == _dims[d].end());
+ is_collapsable = (_dims[d].start() == 0) && (full_window[d].start() == 0) && (_dims[d].step() <= 1) &&
+ (full_window[d].end() == _dims[d].end());
collapsed_end *= _dims[d].end();
}
- if(is_collapsable)
+ if (is_collapsable)
{
collapsed._dims.at(first).set_end(collapsed_end);
- for(size_t d = first + 1; is_collapsable && (d < last); ++d)
+ for (size_t d = first + 1; is_collapsable && (d < last); ++d)
{
collapsed.set(d, Dimension());
}
}
- if(has_collapsed != nullptr)
+ if (has_collapsed != nullptr)
{
*has_collapsed = is_collapsable;
}
@@ -98,13 +104,21 @@ inline Window Window::collapse_if_possible(const Window &full_window, const size
return collapsed;
}
-inline Window Window::shift_dimensions(unsigned int shift_value) const
+inline Window Window::shift_dimensions(unsigned int shift_value, unsigned int start_dim) const
{
Window shifted_window;
- for(size_t n = 0; n < (Coordinates::num_max_dimensions - shift_value); n++)
+ size_t n = 0;
+
+ for (; n < start_dim; ++n)
+ {
+ shifted_window.set(n, _dims[n]);
+ }
+
+ for (; n < (Coordinates::num_max_dimensions - shift_value); n++)
{
shifted_window.set(n, _dims[n + shift_value]);
}
+
return shifted_window;
}
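A worked sketch of the new start_dim parameter (assuming six window dimensions; the per-dimension ends are illustrative):

    // Given a window w with ends {2, 3, 4, 5, 1, 1}:
    //   w.shift_dimensions(1)    -> ends {3, 4, 5, 1, 1, 1}  (previous behaviour)
    //   w.shift_dimensions(1, 2) -> ends {2, 3, 5, 1, 1, 1}  (dims 0-1 kept, dim 2 dropped)
    arm_compute::Window drop_dim2_sketch(const arm_compute::Window &w)
    {
        return w.shift_dimensions(1, /* start_dim = */ 2);
    }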
@@ -120,9 +134,9 @@ inline Window Window::collapse(const Window &full_window, const size_t first, co
inline Window Window::broadcast_if_dimension_le_one(const TensorShape &shape) const
{
Window broadcastWin(*this);
- for(size_t d = 0; d < TensorShape::num_max_dimensions; ++d)
+ for (size_t d = 0; d < TensorShape::num_max_dimensions; ++d)
{
- if(shape[d] <= 1)
+ if (shape[d] <= 1)
{
broadcastWin.set_broadcasted(d);
}
@@ -142,7 +156,7 @@ inline void Window::adjust(size_t dimension, int adjust_value, bool is_at_start)
ARM_COMPUTE_ERROR_ON(dimension >= Coordinates::num_max_dimensions);
Window::Dimension &d = _dims[dimension];
- if(is_at_start)
+ if (is_at_start)
{
d = Window::Dimension(d.start() + adjust_value, d.end(), d.step());
}
@@ -172,7 +186,7 @@ inline void Window::set_dimension_step(size_t dimension, int step)
inline void Window::validate() const
{
- for(size_t i = 0; i < Coordinates::num_max_dimensions; ++i)
+ for (size_t i = 0; i < Coordinates::num_max_dimensions; ++i)
{
ARM_COMPUTE_ERROR_ON(_dims[i].end() < _dims[i].start());
ARM_COMPUTE_ERROR_ON((_dims[i].step() != 0) && (((_dims[i].end() - _dims[i].start()) % _dims[i].step()) != 0));
@@ -193,21 +207,21 @@ inline Window Window::split_window(size_t dimension, size_t id, size_t total) co
Window out;
- for(size_t d = 0; d < Coordinates::num_max_dimensions; ++d)
+ for (size_t d = 0; d < Coordinates::num_max_dimensions; ++d)
{
- if(d == dimension)
+ if (d == dimension)
{
- int start = _dims[d].start();
- int end = _dims[d].end();
- const int step = _dims[d].step();
+ int start = _dims[d].start();
+ int end = _dims[d].end();
+ const int step = _dims[d].step();
const int num_it = num_iterations(d);
const int rem = num_it % total;
- int work = num_it / total;
+ int work = num_it / total;
- int it_start = work * id;
+ int it_start = work * id;
- if(int(id) < rem)
+ if (int(id) < rem)
{
++work;
it_start += id;
@@ -234,18 +248,18 @@ inline Window Window::split_window(size_t dimension, size_t id, size_t total) co
template <unsigned int window_dimension>
inline bool Window::slide_window_slice(Window &slice) const
{
- for(unsigned int n = window_dimension; n < Coordinates::num_max_dimensions; ++n)
+ for (unsigned int n = window_dimension; n < Coordinates::num_max_dimensions; ++n)
{
// Did we reach the end of this dimension?
const int v = slice._dims[n].start() + 1;
- if(v < _dims[n].end())
+ if (v < _dims[n].end())
{
// No: increment
slice._dims[n] = Dimension(v, v + 1, 1);
// Reset lower dimensions:
- for(unsigned int lower = window_dimension; lower < n; ++lower)
+ for (unsigned int lower = window_dimension; lower < n; ++lower)
{
slice._dims[lower] = Dimension(_dims[lower].start(), _dims[lower].start() + 1, 1);
}
@@ -258,14 +272,14 @@ inline bool Window::slide_window_slice(Window &slice) const
}
template <unsigned int window_dimension>
-inline Window Window::first_slice_window() const
+inline Window Window::first_slice_window() const
{
Window slice;
std::copy_n(_dims.begin(), window_dimension, slice._dims.begin());
//Initialise higher dimensions to be the first slice.
- for(unsigned int n = window_dimension; n < Coordinates::num_max_dimensions; ++n)
+ for (unsigned int n = window_dimension; n < Coordinates::num_max_dimensions; ++n)
{
slice._dims[n] = Dimension(_dims[n].start(), _dims[n].start() + 1, 1);
}
@@ -275,16 +289,16 @@ inline Window Window::first_slice_window() const
inline void Window::use_tensor_dimensions(const TensorShape &shape, size_t first_dimension)
{
- for(unsigned int n = first_dimension; n < shape.num_dimensions(); ++n)
+ for (unsigned int n = first_dimension; n < shape.num_dimensions(); ++n)
{
- set(n, Window::Dimension(0, std::max(shape[n], static_cast<uint32_t>(1))));
+ set(n, Window::Dimension(0, std::max(shape[n], static_cast<size_t>(1))));
}
}
inline TensorShape Window::shape() const
{
TensorShape shape;
- for(size_t d = 0; d < TensorShape::num_max_dimensions; ++d)
+ for (size_t d = 0; d < TensorShape::num_max_dimensions; ++d)
{
shape.set(d, (_dims[d].end() - _dims[d].start()) / _dims[d].step());
}
@@ -294,7 +308,7 @@ inline TensorShape Window::shape() const
inline size_t Window::num_iterations_total() const
{
size_t total = 1;
- for(size_t d = 0; d < Coordinates::num_max_dimensions; ++d)
+ for (size_t d = 0; d < Coordinates::num_max_dimensions; ++d)
{
total *= num_iterations(d);
}
@@ -305,4 +319,11 @@ inline void swap(Window &lhs, Window &rhs)
{
lhs._dims.swap(rhs._dims);
}
+
+inline bool operator==(const Window &lhs, const Window &rhs)
+{
+ return (lhs._dims == rhs._dims) && (lhs._is_broadcasted == rhs._is_broadcasted);
+}
} // namespace arm_compute
+
+#endif // ACL_ARM_COMPUTE_CORE_WINDOW_INL
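A sketch of how a scheduler typically uses split_window() to divide work across workers (names illustrative):

    #include "arm_compute/core/Window.h"

    void split_sketch(const arm_compute::Window &full, size_t num_threads)
    {
        for (size_t id = 0; id < num_threads; ++id)
        {
            const arm_compute::Window slice = full.split_window(arm_compute::Window::DimY, id, num_threads);
            // Each worker processes `slice`; per the remainder handling above,
            // the first (num_iterations % num_threads) slices get one extra iteration.
            (void)slice;
        }
    }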
diff --git a/arm_compute/core/WindowIterator.h b/arm_compute/core/WindowIterator.h
index e7d5334fa0..29302c410a 100644
--- a/arm_compute/core/WindowIterator.h
+++ b/arm_compute/core/WindowIterator.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018-2019 ARM Limited.
+ * Copyright (c) 2018-2021 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -28,10 +28,6 @@
#include "arm_compute/core/ITensor.h"
#include "arm_compute/core/Window.h"
-//FIXME: Delete the "PRINTF" before the release. In the meantime it's probably going to be useful to debug
-//#define PRINTF printf
-#define PRINTF(...)
-
namespace arm_compute
{
/** Convert an offset in window steps into absolute coordinates.
@@ -44,7 +40,7 @@ namespace arm_compute
inline Coordinates convert_window_coord_to_position(const Window &w, const Coordinates &offset)
{
Coordinates position;
- for(unsigned int i = 0; i < Coordinates::num_max_dimensions; ++i)
+ for (unsigned int i = 0; i < Coordinates::num_max_dimensions; ++i)
{
position.set(i, w[i].start() + offset[i] * w[i].step());
}
@@ -168,16 +164,14 @@ public:
template <typename M>
void iterate_3D(M &&on_new_row_size)
{
- while(_end.z() != _position.z())
+ while (_end.z() != _position.z())
{
- PRINTF("New slice %d\n", _position.z());
iterate_2D_internal(on_new_row_size, _w.x().end() - _w.x().step(), _w.y().end() - _w.y().step());
_position[2] += _w.z().step();
_position[1] = _w.y().start();
_position[0] = _w.x().start();
}
// Left over:
- PRINTF("Left over slice\n");
iterate_2D(on_new_row_size);
}
@@ -217,29 +211,25 @@ private:
void iterate_2D_internal(M &&on_new_row_size, int end_x, int end_y)
{
//Is there more than one row to process ?
- if(end_y == _position.y())
+ if (end_y == _position.y())
{
- // Single row:
- PRINTF("Partial row only\n");
// Both start and end belong to the same row:
iterate_over_dim0(end_x + _w.x().step(), on_new_row_size);
}
else
{
// Do we start from the beginning of the row ?
- if(_w.x().start() != _position.x())
+ if (_w.x().start() != _position.x())
{
//Start in the middle of a row: process left-over X
- PRINTF("Partial row first\n");
iterate_over_dim0(_w.x().end(), on_new_row_size);
_position[1] += _w.y().step();
}
//Middle rows
bool no_leftover = end_x + _w.x().step() == _w.x().end();
- if(no_leftover)
+ if (no_leftover)
{
- PRINTF("no left over\n");
//Switch to full row size:
on_new_row_size(_w[0].start(), _w.x().end());
// Shouldn't be possible to reach that point and not have at least one entire row to process
@@ -249,17 +239,14 @@ private:
}
else
{
- PRINTF("with left over\n");
// Are there full rows to process ?
- if(_position[1] != end_y)
+ if (_position[1] != end_y)
{
- PRINTF("full rows\n");
//Switch to full row size:
on_new_row_size(_w[0].start(), _w.x().end());
iterate_over_dim1(end_y);
}
- PRINTF("Final leftover\n");
//Leftover end x
_position[0] = _w.x().start();
iterate_over_dim0(end_x + _w.x().step(), on_new_row_size);
@@ -273,7 +260,7 @@ private:
*/
void iterate_over_dim1(int end)
{
- for(; _position[1] != end; _position[1] += _w[1].step())
+ for (; _position[1] != end; _position[1] += _w[1].step())
{
_position[0] = _w[0].start();
iterate_over_dim0(_w[0].end());
@@ -298,10 +285,9 @@ private:
*/
void iterate_over_dim0(int end)
{
- PRINTF("X [%d, %d, %d]\n", _position.x(), end, _w[0].step());
// Both start and end belong to the same row:
ARM_COMPUTE_ERROR_ON(_position[0] > end);
- for(; _position.x() < end; _position[0] += _w[0].step())
+ for (; _position.x() < end; _position[0] += _w[0].step())
{
_lambda_function(_position);
}
@@ -323,9 +309,10 @@ private:
* @return A WindowIterator object.
*/
template <typename L>
-WindowIterator<L> create_window_iterator(const Window &w, const Coordinates &start, const Coordinates &end, L &&lambda_function)
+WindowIterator<L>
+create_window_iterator(const Window &w, const Coordinates &start, const Coordinates &end, L &&lambda_function)
{
return WindowIterator<L>(w, start, end, std::move(lambda_function));
}
-}
+} // namespace arm_compute
#endif /*ARM_COMPUTE_WINDOW_ITERATOR_H*/
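A driving sketch for create_window_iterator() (the coordinate conventions follow convert_window_coord_to_position() above; all names are illustrative):

    #include "arm_compute/core/WindowIterator.h"

    void iterate_sketch(const arm_compute::Window &w, const arm_compute::Coordinates &start, const arm_compute::Coordinates &end)
    {
        auto on_element = [](const arm_compute::Coordinates &pos)
        {
            (void)pos; // process the element at this position
        };
        auto it = arm_compute::create_window_iterator(w, start, end, std::move(on_element));
        it.iterate_3D([](int row_start, int row_end)
        {
            (void)row_start; // notified when the effective row extent changes
            (void)row_end;
        });
    }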
diff --git a/arm_compute/core/experimental/Types.h b/arm_compute/core/experimental/Types.h
new file mode 100644
index 0000000000..63a3a1a1ec
--- /dev/null
+++ b/arm_compute/core/experimental/Types.h
@@ -0,0 +1,126 @@
+/*
+ * Copyright (c) 2020-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ACL_ARM_COMPUTE_CORE_EXPERIMENTAL_TYPES_H
+#define ACL_ARM_COMPUTE_CORE_EXPERIMENTAL_TYPES_H
+
+#include "arm_compute/core/ITensorPack.h"
+#include "arm_compute/core/TensorShape.h"
+
+#include <vector>
+
+namespace arm_compute
+{
+// Forward declaration
+class ITensor;
+
+/** Memory type */
+enum TensorType : int32_t
+{
+ ACL_UNKNOWN = -1,
+ ACL_SRC_DST = 0,
+
+ // Src
+ ACL_SRC = 0,
+ ACL_SRC_0 = 0,
+ ACL_SRC_1 = 1,
+ ACL_SRC_2 = 2,
+ ACL_SRC_3 = 3,
+ ACL_SRC_4 = 4,
+ ACL_SRC_5 = 5,
+ ACL_SRC_6 = 6,
+ ACL_SRC_END = 6,
+
+ // Dst
+ ACL_DST = 30,
+ ACL_DST_0 = 30,
+ ACL_DST_1 = 31,
+ ACL_DST_2 = 32,
+ ACL_DST_END = 32,
+
+ // Aux
+ ACL_INT = 50,
+ ACL_INT_0 = 50,
+ ACL_INT_1 = 51,
+ ACL_INT_2 = 52,
+ ACL_INT_3 = 53,
+ ACL_INT_4 = 54,
+ ACL_SRC_VEC = 256,
+ ACL_DST_VEC = 512,
+ ACL_INT_VEC = 1024,
+
+ // Aliasing Types
+ // Conv etc
+ ACL_BIAS = ACL_SRC_2,
+
+ // Gemm
+ ACL_VEC_ROW_SUM = ACL_SRC_3,
+ ACL_VEC_COL_SUM = ACL_SRC_4,
+ ACL_SHIFTS = ACL_SRC_5,
+ ACL_MULTIPLIERS = ACL_SRC_6,
+};
+
+namespace experimental
+{
+enum class MemoryLifetime
+{
+ Temporary = 0,
+ Persistent = 1,
+ Prepare = 2,
+};
+struct MemoryInfo
+{
+ MemoryInfo() = default;
+
+ MemoryInfo(int slot, size_t size, size_t alignment = 0) noexcept : slot(slot), size(size), alignment(alignment)
+ {
+ }
+
+ MemoryInfo(int slot, MemoryLifetime lifetime, size_t size, size_t alignment = 0) noexcept
+ : slot(slot), lifetime(lifetime), size(size), alignment(alignment)
+ {
+ }
+
+ bool merge(int slot, size_t new_size, size_t new_alignment = 0) noexcept
+ {
+ if (slot != this->slot)
+ {
+ return false;
+ }
+
+ size = std::max(size, new_size);
+ alignment = std::max(alignment, new_alignment);
+
+ return true;
+ }
+
+ int slot{ACL_UNKNOWN};
+ MemoryLifetime lifetime{MemoryLifetime::Temporary};
+ size_t size{0};
+ size_t alignment{64};
+};
+
+using MemoryRequirements = std::vector<MemoryInfo>;
+} // namespace experimental
+} // namespace arm_compute
+#endif // ACL_ARM_COMPUTE_CORE_EXPERIMENTAL_TYPES_H
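A workspace-declaration sketch built from the types above (slots and sizes illustrative):

    #include "arm_compute/core/experimental/Types.h"

    arm_compute::experimental::MemoryRequirements workspace_sketch()
    {
        using namespace arm_compute;
        using namespace arm_compute::experimental;
        MemoryRequirements reqs{MemoryInfo(ACL_INT_0, MemoryLifetime::Temporary, 1024, 64),
                                MemoryInfo(ACL_INT_1, MemoryLifetime::Persistent, 4096)};
        // merge() only succeeds on a matching slot and keeps the larger
        // size and alignment: here the first entry's size grows to 2048.
        const bool merged = reqs[0].merge(ACL_INT_0, 2048); // true
        (void)merged;
        return reqs;
    }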
diff --git a/arm_compute/core/NEON/wrapper/intrinsics/qmov.h b/arm_compute/core/utils/ActivationFunctionUtils.h
index bb64bef1e9..c988efa256 100644
--- a/arm_compute/core/NEON/wrapper/intrinsics/qmov.h
+++ b/arm_compute/core/utils/ActivationFunctionUtils.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2020 ARM Limited.
+ * Copyright (c) 2016-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -21,29 +21,21 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
-#ifndef ARM_COMPUTE_WRAPPER_QMOV_H
-#define ARM_COMPUTE_WRAPPER_QMOV_H
+#ifndef ARM_COMPUTE_CORE_UTILS_ACTIVATIONFUNCTIONUTILS_H
+#define ARM_COMPUTE_CORE_UTILS_ACTIVATIONFUNCTIONUTILS_H
-#include <arm_neon.h>
+#include "arm_compute/core/Types.h"
-namespace arm_compute
-{
-namespace wrapper
-{
-template <typename T>
-inline typename std::enable_if<std::is_same<T, uint8_t>::value, uint8x8_t>::type
-vqmov(const int16x8_t &a)
-{
- return vqmovun_s16(a);
-}
+#include <string>
-template <typename T>
-inline typename std::enable_if<std::is_same<T, int8_t>::value, int8x8_t>::type
-vqmov(const int16x8_t &a)
+namespace arm_compute
{
- return vqmovn_s16(a);
-}
-
-} // namespace wrapper
+/** Translates a given activation function to a string.
+ *
+ * @param[in] act @ref ActivationLayerInfo::ActivationFunction to be translated to string.
+ *
+ * @return The string describing the activation function.
+ */
+const std::string &string_from_activation_func(const ActivationFunction &act);
} // namespace arm_compute
-#endif /* ARM_COMPUTE_WRAPPER_QMOV_H */
+#endif /*ARM_COMPUTE_CORE_UTILS_ACTIVATIONFUNCTIONUTILS_H */
diff --git a/arm_compute/core/NEON/NEFixedPoint.inl b/arm_compute/core/utils/DataLayoutUtils.h
index 14e51d825c..61839c9f91 100644
--- a/arm_compute/core/NEON/NEFixedPoint.inl
+++ b/arm_compute/core/utils/DataLayoutUtils.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2016-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -21,23 +21,20 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
-#include <array>
-#include <limits>
+#ifndef ARM_COMPUTE_CORE_UTILS_DATALAYOUTUTILS_H
+#define ARM_COMPUTE_CORE_UTILS_DATALAYOUTUTILS_H
+#include "arm_compute/core/Types.h"
-namespace arm_compute
-{
-#ifndef DOXYGEN_SKIP_THIS
+#include <string>
-inline float32x4x2_t vmax2q_f32(float32x4x2_t a, float32x4x2_t b)
+namespace arm_compute
{
- float32x4x2_t res =
- {
- {
- vmaxq_f32(a.val[0], b.val[0]),
- vmaxq_f32(a.val[1], b.val[1])
- }
- };
- return res;
-}
-#endif /* DOXYGEN_SKIP_THIS */
+/** Convert a data layout identifier into a string.
+ *
+ * @param[in] dl @ref DataLayout to be translated to string.
+ *
+ * @return The string describing the data layout.
+ */
+const std::string &string_from_data_layout(DataLayout dl);
} // namespace arm_compute
+#endif /*ARM_COMPUTE_CORE_UTILS_DATALAYOUTUTILS_H */
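Both string_from_* helpers follow the same pattern; a one-line usage sketch:

    // e.g. for logging (the exact strings come from the library's lookup tables):
    const std::string &layout_name = arm_compute::string_from_data_layout(arm_compute::DataLayout::NHWC);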
diff --git a/arm_compute/core/utils/DataTypeUtils.h b/arm_compute/core/utils/DataTypeUtils.h
new file mode 100644
index 0000000000..6fabb19b64
--- /dev/null
+++ b/arm_compute/core/utils/DataTypeUtils.h
@@ -0,0 +1,549 @@
+/*
+ * Copyright (c) 2016-2024 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ACL_ARM_COMPUTE_CORE_UTILS_DATATYPEUTILS_H
+#define ACL_ARM_COMPUTE_CORE_UTILS_DATATYPEUTILS_H
+
+#include "arm_compute/core/PixelValue.h"
+#include "arm_compute/core/Types.h"
+
+namespace arm_compute
+{
+/** The size in bytes of the data type
+ *
+ * @param[in] data_type Input data type
+ *
+ * @return The size in bytes of the data type
+ */
+inline size_t data_size_from_type(DataType data_type)
+{
+ switch (data_type)
+ {
+ case DataType::U8:
+ case DataType::S8:
+ case DataType::QSYMM8:
+ case DataType::QASYMM8:
+ case DataType::QASYMM8_SIGNED:
+ case DataType::QSYMM8_PER_CHANNEL:
+ return 1;
+ case DataType::U16:
+ case DataType::S16:
+ case DataType::QSYMM16:
+ case DataType::QASYMM16:
+ case DataType::BFLOAT16:
+ case DataType::F16:
+ return 2;
+ case DataType::F32:
+ case DataType::U32:
+ case DataType::S32:
+ return 4;
+ case DataType::F64:
+ case DataType::U64:
+ case DataType::S64:
+ return 8;
+ case DataType::SIZET:
+ return sizeof(size_t);
+ default:
+ ARM_COMPUTE_ERROR("Invalid data type");
+ return 0;
+ }
+}
+
+/** The size in bytes of the data type
+ *
+ * @param[in] dt Input data type
+ *
+ * @return The size in bytes of the data type
+ */
+inline size_t element_size_from_data_type(DataType dt)
+{
+ switch (dt)
+ {
+ case DataType::S8:
+ case DataType::U8:
+ case DataType::QSYMM8:
+ case DataType::QASYMM8:
+ case DataType::QASYMM8_SIGNED:
+ case DataType::QSYMM8_PER_CHANNEL:
+ return 1;
+ case DataType::U16:
+ case DataType::S16:
+ case DataType::QSYMM16:
+ case DataType::QASYMM16:
+ case DataType::BFLOAT16:
+ case DataType::F16:
+ return 2;
+ case DataType::U32:
+ case DataType::S32:
+ case DataType::F32:
+ return 4;
+ case DataType::U64:
+ case DataType::S64:
+ return 8;
+ default:
+ ARM_COMPUTE_ERROR("Undefined element size for given data type");
+ return 0;
+ }
+}
+
+/** Return the data type used by a given single-planar pixel format
+ *
+ * @param[in] format Input format
+ *
+ * @return The data type used by the given single-planar pixel format
+ */
+inline DataType data_type_from_format(Format format)
+{
+ switch (format)
+ {
+ case Format::U8:
+ case Format::UV88:
+ case Format::RGB888:
+ case Format::RGBA8888:
+ case Format::YUYV422:
+ case Format::UYVY422:
+ return DataType::U8;
+ case Format::U16:
+ return DataType::U16;
+ case Format::S16:
+ return DataType::S16;
+ case Format::U32:
+ return DataType::U32;
+ case Format::S32:
+ return DataType::S32;
+ case Format::BFLOAT16:
+ return DataType::BFLOAT16;
+ case Format::F16:
+ return DataType::F16;
+ case Format::F32:
+ return DataType::F32;
+ //Doesn't make sense for planar formats:
+ case Format::NV12:
+ case Format::NV21:
+ case Format::IYUV:
+ case Format::YUV444:
+ default:
+ ARM_COMPUTE_ERROR("Not supported data_type for given format");
+ return DataType::UNKNOWN;
+ }
+}
+
+/** Return the promoted data type of a given data type.
+ *
+ * @note If the promoted data type is not supported, an error is thrown
+ *
+ * @param[in] dt Data type to get the promoted type of.
+ *
+ * @return Promoted data type
+ */
+inline DataType get_promoted_data_type(DataType dt)
+{
+ switch (dt)
+ {
+ case DataType::U8:
+ return DataType::U16;
+ case DataType::S8:
+ return DataType::S16;
+ case DataType::U16:
+ return DataType::U32;
+ case DataType::S16:
+ return DataType::S32;
+ case DataType::QSYMM8:
+ case DataType::QASYMM8:
+ case DataType::QASYMM8_SIGNED:
+ case DataType::QSYMM8_PER_CHANNEL:
+ case DataType::QSYMM16:
+ case DataType::QASYMM16:
+ case DataType::BFLOAT16:
+ case DataType::F16:
+ case DataType::U32:
+ case DataType::S32:
+ case DataType::F32:
+ ARM_COMPUTE_ERROR("Unsupported data type promotions!");
+ default:
+ ARM_COMPUTE_ERROR("Undefined data type!");
+ }
+ return DataType::UNKNOWN;
+}
+
+/** Compute the minimum and maximum values a data type can take
+ *
+ * @param[in] dt Data type to get the min/max bounds of
+ *
+ * @return A tuple (min,max) with the minimum and maximum values respectively wrapped in PixelValue.
+ */
+inline std::tuple<PixelValue, PixelValue> get_min_max(DataType dt)
+{
+ PixelValue min{};
+ PixelValue max{};
+ switch (dt)
+ {
+ case DataType::U8:
+ case DataType::QASYMM8:
+ {
+ min = PixelValue(static_cast<int32_t>(std::numeric_limits<uint8_t>::lowest()));
+ max = PixelValue(static_cast<int32_t>(std::numeric_limits<uint8_t>::max()));
+ break;
+ }
+ case DataType::S8:
+ case DataType::QSYMM8:
+ case DataType::QASYMM8_SIGNED:
+ case DataType::QSYMM8_PER_CHANNEL:
+ {
+ min = PixelValue(static_cast<int32_t>(std::numeric_limits<int8_t>::lowest()));
+ max = PixelValue(static_cast<int32_t>(std::numeric_limits<int8_t>::max()));
+ break;
+ }
+ case DataType::U16:
+ case DataType::QASYMM16:
+ {
+ min = PixelValue(static_cast<int32_t>(std::numeric_limits<uint16_t>::lowest()));
+ max = PixelValue(static_cast<int32_t>(std::numeric_limits<uint16_t>::max()));
+ break;
+ }
+ case DataType::S16:
+ case DataType::QSYMM16:
+ {
+ min = PixelValue(static_cast<int32_t>(std::numeric_limits<int16_t>::lowest()));
+ max = PixelValue(static_cast<int32_t>(std::numeric_limits<int16_t>::max()));
+ break;
+ }
+ case DataType::U32:
+ {
+ min = PixelValue(std::numeric_limits<uint32_t>::lowest());
+ max = PixelValue(std::numeric_limits<uint32_t>::max());
+ break;
+ }
+ case DataType::S32:
+ {
+ min = PixelValue(std::numeric_limits<int32_t>::lowest());
+ max = PixelValue(std::numeric_limits<int32_t>::max());
+ break;
+ }
+ case DataType::BFLOAT16:
+ {
+ min = PixelValue(bfloat16::lowest());
+ max = PixelValue(bfloat16::max());
+ break;
+ }
+ case DataType::F16:
+ {
+ min = PixelValue(std::numeric_limits<half>::lowest());
+ max = PixelValue(std::numeric_limits<half>::max());
+ break;
+ }
+ case DataType::F32:
+ {
+ min = PixelValue(std::numeric_limits<float>::lowest());
+ max = PixelValue(std::numeric_limits<float>::max());
+ break;
+ }
+ default:
+ ARM_COMPUTE_ERROR("Undefined data type!");
+ }
+ return std::make_tuple(min, max);
+}
+
+/** Convert a data type identifier into a string.
+ *
+ * @param[in] dt @ref DataType to be translated to string.
+ *
+ * @return The string describing the data type.
+ */
+const std::string &string_from_data_type(DataType dt);
+
+/** Convert a string to DataType
+ *
+ * @param[in] name The name of the data type
+ *
+ * @return DataType
+ */
+DataType data_type_from_name(const std::string &name);
+
+/** Input Stream operator for @ref DataType
+ *
+ * @param[in] stream Stream to parse
+ * @param[out] data_type Output data type
+ *
+ * @return Updated stream
+ */
+inline ::std::istream &operator>>(::std::istream &stream, DataType &data_type)
+{
+ std::string value;
+ stream >> value;
+ data_type = data_type_from_name(value);
+ return stream;
+}
+
+/** Check if a given data type is of floating point type
+ *
+ * @param[in] dt Input data type.
+ *
+ * @return True if data type is of floating point type, else false.
+ */
+inline bool is_data_type_float(DataType dt)
+{
+ switch (dt)
+ {
+ case DataType::F16:
+ case DataType::F32:
+ return true;
+ default:
+ return false;
+ }
+}
+
+/** Check if a given data type is of quantized type
+ *
+ * @note Quantized is considered a super-set of fixed-point and asymmetric data types.
+ *
+ * @param[in] dt Input data type.
+ *
+ * @return True if data type is of quantized type, else false.
+ */
+inline bool is_data_type_quantized(DataType dt)
+{
+ switch (dt)
+ {
+ case DataType::QSYMM8:
+ case DataType::QASYMM8:
+ case DataType::QASYMM8_SIGNED:
+ case DataType::QSYMM8_PER_CHANNEL:
+ case DataType::QSYMM16:
+ case DataType::QASYMM16:
+ return true;
+ default:
+ return false;
+ }
+}
+
+/** Check if a given data type is of asymmetric quantized type
+ *
+ * @param[in] dt Input data type.
+ *
+ * @return True if data type is of asymmetric quantized type, else false.
+ */
+inline bool is_data_type_quantized_asymmetric(DataType dt)
+{
+ switch (dt)
+ {
+ case DataType::QASYMM8:
+ case DataType::QASYMM8_SIGNED:
+ case DataType::QASYMM16:
+ return true;
+ default:
+ return false;
+ }
+}
+
+/** Check if a given data type is of asymmetric quantized signed type
+ *
+ * @param[in] dt Input data type.
+ *
+ * @return True if data type is of asymmetric quantized signed type, else false.
+ */
+inline bool is_data_type_quantized_asymmetric_signed(DataType dt)
+{
+ switch (dt)
+ {
+ case DataType::QASYMM8_SIGNED:
+ return true;
+ default:
+ return false;
+ }
+}
+
+/** Check if a given data type is of 8-bit asymmetric quantized signed type
+ *
+ * @param[in] dt Input data type.
+ *
+ * @return True if data type is of 8-bit asymmetric quantized signed type, else false.
+ */
+inline bool is_data_type_quantized_asymmetric_char(DataType dt)
+{
+ switch (dt)
+ {
+ case DataType::QASYMM8_SIGNED:
+ case DataType::QASYMM8:
+ return true;
+ default:
+ return false;
+ }
+}
+
+/** Check if a given data type is of symmetric quantized type
+ *
+ * @param[in] dt Input data type.
+ *
+ * @return True if data type is of symmetric quantized type, else false.
+ */
+inline bool is_data_type_quantized_symmetric(DataType dt)
+{
+ switch (dt)
+ {
+ case DataType::QSYMM8:
+ case DataType::QSYMM8_PER_CHANNEL:
+ case DataType::QSYMM16:
+ return true;
+ default:
+ return false;
+ }
+}
+
+/** Check if a given data type is of per channel type
+ *
+ * @param[in] dt Input data type.
+ *
+ * @return True if data type is of per channel type, else false.
+ */
+inline bool is_data_type_quantized_per_channel(DataType dt)
+{
+ switch (dt)
+ {
+ case DataType::QSYMM8_PER_CHANNEL:
+ return true;
+ default:
+ return false;
+ }
+}
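For orientation (editorial sketch, not part of this patch), the predicate family above partitions the quantized data types and composes naturally, for example:

    using namespace arm_compute;

    const char *describe(DataType dt)
    {
        if (is_data_type_quantized_per_channel(dt))
            return "per-channel symmetric quantized";
        if (is_data_type_quantized_asymmetric(dt))
            return "asymmetric quantized (scale + offset)";
        if (is_data_type_quantized_symmetric(dt))
            return "symmetric quantized (scale only)";
        return is_data_type_float(dt) ? "floating point" : "integer/other";
    }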
+
+/** Returns true if the value can be represented by the given data type
+ *
+ * @param[in] val value to be checked
+ * @param[in] dt data type that is checked
+ * @param[in] qinfo (Optional) quantization info if the data type is QASYMM8
+ *
+ * @return true if the data type can hold the value.
+ */
+template <typename T>
+bool check_value_range(T val, DataType dt, QuantizationInfo qinfo = QuantizationInfo())
+{
+ switch (dt)
+ {
+ case DataType::U8:
+ {
+ const auto val_u8 = static_cast<uint8_t>(val);
+ return ((val_u8 == val) && val >= std::numeric_limits<uint8_t>::lowest() &&
+ val <= std::numeric_limits<uint8_t>::max());
+ }
+ case DataType::QASYMM8:
+ {
+ double min = static_cast<double>(dequantize_qasymm8(0, qinfo));
+ double max = static_cast<double>(dequantize_qasymm8(std::numeric_limits<uint8_t>::max(), qinfo));
+ return (static_cast<double>(val) >= min && static_cast<double>(val) <= max);
+ }
+ case DataType::S8:
+ {
+ const auto val_s8 = static_cast<int8_t>(val);
+ return ((val_s8 == val) && val >= std::numeric_limits<int8_t>::lowest() &&
+ val <= std::numeric_limits<int8_t>::max());
+ }
+ case DataType::U16:
+ {
+ const auto val_u16 = static_cast<uint16_t>(val);
+ return ((val_u16 == val) && val >= std::numeric_limits<uint16_t>::lowest() &&
+ val <= std::numeric_limits<uint16_t>::max());
+ }
+ case DataType::S16:
+ {
+ const auto val_s16 = static_cast<int16_t>(val);
+ return ((val_s16 == val) && val >= std::numeric_limits<int16_t>::lowest() &&
+ val <= std::numeric_limits<int16_t>::max());
+ }
+ case DataType::U32:
+ {
+ const auto val_d64 = static_cast<double>(val);
+ const auto val_u32 = static_cast<uint32_t>(val);
+ return ((val_u32 == val_d64) && val_d64 >= std::numeric_limits<uint32_t>::lowest() &&
+ val_d64 <= std::numeric_limits<uint32_t>::max());
+ }
+ case DataType::S32:
+ {
+ const auto val_d64 = static_cast<double>(val);
+ const auto val_s32 = static_cast<int32_t>(val);
+ return ((val_s32 == val_d64) && val_d64 >= std::numeric_limits<int32_t>::lowest() &&
+ val_d64 <= std::numeric_limits<int32_t>::max());
+ }
+ case DataType::BFLOAT16:
+ return (val >= bfloat16::lowest() && val <= bfloat16::max());
+ case DataType::F16:
+ return (val >= std::numeric_limits<half>::lowest() && val <= std::numeric_limits<half>::max());
+ case DataType::F32:
+ return (val >= std::numeric_limits<float>::lowest() && val <= std::numeric_limits<float>::max());
+ default:
+ ARM_COMPUTE_ERROR("Data type not supported");
+ return false;
+ }
+}
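A usage sketch (editorial, not part of this patch): the integral branches reject values that would wrap on conversion, while the quantized branch checks against the dequantized representable range.

    using arm_compute::DataType;

    bool fits_u8  = arm_compute::check_value_range(300, DataType::U8);  // false: 300 wraps to 44 as uint8_t
    bool fits_s16 = arm_compute::check_value_range(300, DataType::S16); // true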
+
+/** Returns the suffix string of CPU kernel implementation names based on the given data type
+ *
+ * @param[in] data_type The data type the CPU kernel implementation uses
+ *
+ * @return the suffix string of CPU kernel implementations
+ */
+inline std::string cpu_impl_dt(const DataType &data_type)
+{
+ std::string ret = "";
+
+ switch (data_type)
+ {
+ case DataType::F32:
+ ret = "fp32";
+ break;
+ case DataType::F16:
+ ret = "fp16";
+ break;
+ case DataType::U8:
+ ret = "u8";
+ break;
+ case DataType::S16:
+ ret = "s16";
+ break;
+ case DataType::S32:
+ ret = "s32";
+ break;
+ case DataType::QASYMM8:
+ ret = "qu8";
+ break;
+ case DataType::QASYMM8_SIGNED:
+ ret = "qs8";
+ break;
+ case DataType::QSYMM16:
+ ret = "qs16";
+ break;
+ case DataType::QSYMM8_PER_CHANNEL:
+ ret = "qp8";
+ break;
+ case DataType::BFLOAT16:
+ ret = "bf16";
+ break;
+ default:
+ ARM_COMPUTE_ERROR("Unsupported.");
+ }
+
+ return ret;
+}
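Illustration (editorial, not part of this patch): the suffix is meant to be spliced into kernel implementation names; the surrounding "neon_..._gemm" scheme below is an assumption, only the suffix comes from this helper.

    std::string name = "neon_" + arm_compute::cpu_impl_dt(arm_compute::DataType::QASYMM8) + "_gemm";
    // name == "neon_qu8_gemm"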
+
+} // namespace arm_compute
+#endif // ACL_ARM_COMPUTE_CORE_UTILS_DATATYPEUTILS_H
diff --git a/arm_compute/core/utils/FormatUtils.h b/arm_compute/core/utils/FormatUtils.h
new file mode 100644
index 0000000000..a8e96bd361
--- /dev/null
+++ b/arm_compute/core/utils/FormatUtils.h
@@ -0,0 +1,344 @@
+/*
+ * Copyright (c) 2016-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CORE_UTILS_FORMATUTILS_H
+#define ARM_COMPUTE_CORE_UTILS_FORMATUTILS_H
+
+#include "arm_compute/core/CoreTypes.h"
+#include "arm_compute/core/Error.h"
+
+namespace arm_compute
+{
+/** The size in bytes of the pixel format
+ *
+ * @param[in] format Input format
+ *
+ * @return The size in bytes of the pixel format
+ */
+inline size_t pixel_size_from_format(Format format)
+{
+ switch (format)
+ {
+ case Format::U8:
+ return 1;
+ case Format::U16:
+ case Format::S16:
+ case Format::BFLOAT16:
+ case Format::F16:
+ case Format::UV88:
+ case Format::YUYV422:
+ case Format::UYVY422:
+ return 2;
+ case Format::RGB888:
+ return 3;
+ case Format::RGBA8888:
+ return 4;
+ case Format::U32:
+ case Format::S32:
+ case Format::F32:
+ return 4;
+ // Doesn't make sense for planar formats:
+ case Format::NV12:
+ case Format::NV21:
+ case Format::IYUV:
+ case Format::YUV444:
+ default:
+ ARM_COMPUTE_ERROR("Undefined pixel size for given format");
+ return 0;
+ }
+}
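Editorial sketch (not part of this patch): for packed single-plane formats the helper gives the per-pixel byte count directly.

    // Tightly packed 640x480 RGB888 frame:
    const size_t bytes = 640u * 480u * arm_compute::pixel_size_from_format(arm_compute::Format::RGB888); // 921600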
+
+/** Return the plane index of a given channel given an input format.
+ *
+ * @param[in] format Input format
+ * @param[in] channel Input channel
+ *
+ * @return The plane index of the specific channel of the specific format
+ */
+inline int plane_idx_from_channel(Format format, Channel channel)
+{
+ switch (format)
+ {
+ // Single planar formats have a single plane
+ case Format::U8:
+ case Format::U16:
+ case Format::S16:
+ case Format::U32:
+ case Format::S32:
+ case Format::BFLOAT16:
+ case Format::F16:
+ case Format::F32:
+ case Format::UV88:
+ case Format::RGB888:
+ case Format::RGBA8888:
+ case Format::YUYV422:
+ case Format::UYVY422:
+ return 0;
+ // Multi planar formats
+ case Format::NV12:
+ case Format::NV21:
+ {
+ // Channel U and V share the same plane of format UV88
+ switch (channel)
+ {
+ case Channel::Y:
+ return 0;
+ case Channel::U:
+ case Channel::V:
+ return 1;
+ default:
+ ARM_COMPUTE_ERROR("Not supported channel");
+ return 0;
+ }
+ }
+ case Format::IYUV:
+ case Format::YUV444:
+ {
+ switch (channel)
+ {
+ case Channel::Y:
+ return 0;
+ case Channel::U:
+ return 1;
+ case Channel::V:
+ return 2;
+ default:
+ ARM_COMPUTE_ERROR("Not supported channel");
+ return 0;
+ }
+ }
+ default:
+ ARM_COMPUTE_ERROR("Not supported format");
+ return 0;
+ }
+}
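Editorial sketch (not part of this patch), showing the semi-planar NV12 case handled above:

    using arm_compute::Channel;
    using arm_compute::Format;

    int y_plane = arm_compute::plane_idx_from_channel(Format::NV12, Channel::Y); // 0
    int v_plane = arm_compute::plane_idx_from_channel(Format::NV12, Channel::V); // 1: U and V share the UV88 plane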
+
+/** Return the channel index of a given channel given an input format.
+ *
+ * @param[in] format Input format
+ * @param[in] channel Input channel
+ *
+ * @return The channel index of the specific channel of the specific format
+ */
+inline int channel_idx_from_format(Format format, Channel channel)
+{
+ switch (format)
+ {
+ case Format::RGB888:
+ {
+ switch (channel)
+ {
+ case Channel::R:
+ return 0;
+ case Channel::G:
+ return 1;
+ case Channel::B:
+ return 2;
+ default:
+ ARM_COMPUTE_ERROR("Not supported channel");
+ return 0;
+ }
+ }
+ case Format::RGBA8888:
+ {
+ switch (channel)
+ {
+ case Channel::R:
+ return 0;
+ case Channel::G:
+ return 1;
+ case Channel::B:
+ return 2;
+ case Channel::A:
+ return 3;
+ default:
+ ARM_COMPUTE_ERROR("Not supported channel");
+ return 0;
+ }
+ }
+ case Format::YUYV422:
+ {
+ switch (channel)
+ {
+ case Channel::Y:
+ return 0;
+ case Channel::U:
+ return 1;
+ case Channel::V:
+ return 3;
+ default:
+ ARM_COMPUTE_ERROR("Not supported channel");
+ return 0;
+ }
+ }
+ case Format::UYVY422:
+ {
+ switch (channel)
+ {
+ case Channel::Y:
+ return 1;
+ case Channel::U:
+ return 0;
+ case Channel::V:
+ return 2;
+ default:
+ ARM_COMPUTE_ERROR("Not supported channel");
+ return 0;
+ }
+ }
+ case Format::NV12:
+ {
+ switch (channel)
+ {
+ case Channel::Y:
+ return 0;
+ case Channel::U:
+ return 0;
+ case Channel::V:
+ return 1;
+ default:
+ ARM_COMPUTE_ERROR("Not supported channel");
+ return 0;
+ }
+ }
+ case Format::NV21:
+ {
+ switch (channel)
+ {
+ case Channel::Y:
+ return 0;
+ case Channel::U:
+ return 1;
+ case Channel::V:
+ return 0;
+ default:
+ ARM_COMPUTE_ERROR("Not supported channel");
+ return 0;
+ }
+ }
+ case Format::YUV444:
+ case Format::IYUV:
+ {
+ switch (channel)
+ {
+ case Channel::Y:
+ return 0;
+ case Channel::U:
+ return 0;
+ case Channel::V:
+ return 0;
+ default:
+ ARM_COMPUTE_ERROR("Not supported channel");
+ return 0;
+ }
+ }
+ default:
+ ARM_COMPUTE_ERROR("Not supported format");
+ return 0;
+ }
+}
+
+/** Return the number of planes for a given format
+ *
+ * @param[in] format Input format
+ *
+ * @return The number of planes for a given image format.
+ */
+inline size_t num_planes_from_format(Format format)
+{
+ switch (format)
+ {
+ case Format::U8:
+ case Format::S16:
+ case Format::U16:
+ case Format::S32:
+ case Format::U32:
+ case Format::BFLOAT16:
+ case Format::F16:
+ case Format::F32:
+ case Format::RGB888:
+ case Format::RGBA8888:
+ case Format::YUYV422:
+ case Format::UYVY422:
+ return 1;
+ case Format::NV12:
+ case Format::NV21:
+ return 2;
+ case Format::IYUV:
+ case Format::YUV444:
+ return 3;
+ default:
+ ARM_COMPUTE_ERROR("Not supported format");
+ return 0;
+ }
+}
+
+/** Return the number of channels for a given single-planar pixel format
+ *
+ * @param[in] format Input format
+ *
+ * @return The number of channels for a given image format.
+ */
+inline size_t num_channels_from_format(Format format)
+{
+ switch (format)
+ {
+ case Format::U8:
+ case Format::U16:
+ case Format::S16:
+ case Format::U32:
+ case Format::S32:
+ case Format::BFLOAT16:
+ case Format::F16:
+ case Format::F32:
+ return 1;
+ // Because the U and V channels are subsampled,
+ // these formats appear to have only 2 channels:
+ case Format::YUYV422:
+ case Format::UYVY422:
+ return 2;
+ case Format::UV88:
+ return 2;
+ case Format::RGB888:
+ return 3;
+ case Format::RGBA8888:
+ return 4;
+ // Doesn't make sense for planar formats:
+ case Format::NV12:
+ case Format::NV21:
+ case Format::IYUV:
+ case Format::YUV444:
+ default:
+ return 0;
+ }
+}
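Editorial sketch (not part of this patch), combining the two helpers in this header for a packed YUYV422 row:

    using arm_compute::Format;

    size_t channels  = arm_compute::num_channels_from_format(Format::YUYV422);      // 2, since U/V are subsampled
    size_t row_bytes = 640u * arm_compute::pixel_size_from_format(Format::YUYV422); // 1280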
+
+/** Convert a tensor format into a string.
+ *
+ * @param[in] format @ref Format to be translated to string.
+ *
+ * @return The string describing the format.
+ */
+const std::string &string_from_format(Format format);
+} // namespace arm_compute
+#endif /*ARM_COMPUTE_CORE_UTILS_FORMATUTILS_H */
diff --git a/arm_compute/core/NEON/NEFixedPoint.h b/arm_compute/core/utils/InterpolationPolicyUtils.h
index 3aff677385..8d4ae4321c 100644
--- a/arm_compute/core/NEON/NEFixedPoint.h
+++ b/arm_compute/core/utils/InterpolationPolicyUtils.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2016-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -21,21 +21,21 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
-#ifndef ARM_COMPUTE_NEFIXEDPOINT_H
-#define ARM_COMPUTE_NEFIXEDPOINT_H
+#ifndef ARM_COMPUTE_CORE_UTILS_INTERPOLATIONPOLICYUTILS_H
+#define ARM_COMPUTE_CORE_UTILS_INTERPOLATIONPOLICYUTILS_H
-#include <arm_neon.h>
+#include "arm_compute/core/Types.h"
+
+#include <string>
namespace arm_compute
{
-/** Compute lane-by-lane maximum between elements of a float vector with 4x2 elements
+/** Translates a given interpolation policy to a string.
*
- * @param[in] a Float input vector
- * @param[in] b Float input vector
+ * @param[in] policy @ref InterpolationPolicy to be translated to string.
*
- * @return The lane-by-lane maximum -> float32x4x2
+ * @return The string describing the interpolation policy.
*/
-float32x4x2_t vmax2q_f32(float32x4x2_t a, float32x4x2_t b);
+const std::string &string_from_interpolation_policy(InterpolationPolicy policy);
} // namespace arm_compute
-#include "arm_compute/core/NEON/NEFixedPoint.inl"
-#endif /* ARM_COMPUTE_NEFIXEDPOINT_H */ \ No newline at end of file
+#endif /*ARM_COMPUTE_CORE_UTILS_INTERPOLATIONPOLICYUTILS_H */
diff --git a/arm_compute/core/GLES_COMPUTE/GCHelpers.h b/arm_compute/core/utils/StringUtils.h
index b1a9ab32be..c13cbaa334 100644
--- a/arm_compute/core/GLES_COMPUTE/GCHelpers.h
+++ b/arm_compute/core/utils/StringUtils.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2016-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -21,38 +21,45 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
-#ifndef ARM_COMPUTE_GCHELPERS_H
-#define ARM_COMPUTE_GCHELPERS_H
+#ifndef ARM_COMPUTE_CORE_UTILS_STRINGUTILS_H
+#define ARM_COMPUTE_CORE_UTILS_STRINGUTILS_H
-#include "arm_compute/core/GLES_COMPUTE/IGCKernel.h"
-#include "arm_compute/core/GLES_COMPUTE/OpenGLES.h"
-#include "arm_compute/core/GPUTarget.h"
-#include "arm_compute/core/Helpers.h"
-
-#include <set>
#include <string>
+#include <vector>
namespace arm_compute
{
-// Forward declarations
-class GCCoreRuntimeContext;
+/** Lower a given string.
+ *
+ * @param[in] val Given string to lower.
+ *
+ * @return The lowered string
+ */
+std::string lower_string(const std::string &val);
-/** Max vector width of an GLES vector */
-static constexpr unsigned int max_gc_vector_width = 16;
+/** Raise a given string to upper case
+ *
+ * @param[in] val Given string to convert to upper case.
+ *
+ * @return The upper case string
+ */
+std::string upper_string(const std::string &val);
-/** Helper function to get the GPU target from GLES using GL_RENDERER enum
+/** Create a string with the float in full precision.
*
- * @return the GPU target
+ * @param val Floating point value
+ *
+ * @return String with the floating point value.
*/
-GPUTarget get_target_from_device();
-/** Creates an GLES kernel
+std::string float_to_string_with_full_precision(float val);
+
+/** Join a sequence of strings with separator @p sep
*
- * @param[in] ctx A context to be used to create the GLES kernel.
- * @param[in] kernel_name The kernel name.
- * @param[in] build_opts The build options to be used for the GLES kernel compilation.
+ * @param[in] strings Strings to join
+ * @param[in] sep Separator to join consecutive strings in the sequence
*
- * @return A GLES kernel
+ * @return The joined string
*/
-GCKernel create_opengl_kernel(GCCoreRuntimeContext *ctx, const std::string &kernel_name, const std::set<std::string> &build_opts);
+std::string join(const std::vector<std::string> strings, const std::string &sep);
} // namespace arm_compute
-#endif /* ARM_COMPUTE_GCHELPERS_H */
+#endif /*ARM_COMPUTE_CORE_UTILS_STRINGUTILS_H */
diff --git a/arm_compute/graph/backends/GLES/GCNodeValidator.h b/arm_compute/core/utils/helpers/AdjustVecSize.h
index 89421f7860..842e3b57d6 100644
--- a/arm_compute/graph/backends/GLES/GCNodeValidator.h
+++ b/arm_compute/core/utils/helpers/AdjustVecSize.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018-2019 ARM Limited.
+ * Copyright (c) 2016-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -21,32 +21,35 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
-#ifndef ARM_COMPUTE_GRAPH_GCNODEVALIDATOR_H
-#define ARM_COMPUTE_GRAPH_GCNODEVALIDATOR_H
+#ifndef ARM_COMPUTE_UTILS_ADJUSTVECSIZE_H
+#define ARM_COMPUTE_UTILS_ADJUSTVECSIZE_H
#include "arm_compute/core/Error.h"
namespace arm_compute
{
-namespace graph
+/** Returns the given vector size, rounded down to the closest valid vector size if it exceeds the input's first dimension
+ *
+ * @param[in] vec_size vector size to be adjusted
+ * @param[in] dim0 size of the first dimension
+ *
+ * @return the number of element processed along the X axis per thread
+ */
+inline unsigned int adjust_vec_size(unsigned int vec_size, size_t dim0)
{
-// Forward declarations
-class INode;
+ ARM_COMPUTE_ERROR_ON(vec_size > 16);
-namespace backends
-{
-class GCNodeValidator final
-{
-public:
- /** Validate a node
- *
- * @param[in] node Node to validate
- *
- * @return An error status
- */
- static Status validate(INode *node);
-};
-} // namespace backends
-} // namespace graph
+ if ((vec_size >= dim0) && (dim0 == 3))
+ {
+ return dim0;
+ }
+
+ while (vec_size > dim0)
+ {
+ vec_size >>= 1;
+ }
+
+ return vec_size;
+}
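Editorial sketch (not part of this patch) of the three behaviours above:

    unsigned int a = arm_compute::adjust_vec_size(16u, 10); // 8: halved until it fits dim0
    unsigned int b = arm_compute::adjust_vec_size(16u, 3);  // 3: dim0 == 3 is special-cased
    unsigned int c = arm_compute::adjust_vec_size(8u, 33);  // 8: already no larger than dim0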
} // namespace arm_compute
-#endif //ARM_COMPUTE_GRAPH_GCNODEVALIDATOR_H
+#endif /*ARM_COMPUTE_UTILS_ADJUSTVECSIZE_H */
diff --git a/arm_compute/core/utils/helpers/bit_ops.h b/arm_compute/core/utils/helpers/bit_ops.h
deleted file mode 100644
index 6dbca179e7..0000000000
--- a/arm_compute/core/utils/helpers/bit_ops.h
+++ /dev/null
@@ -1,52 +0,0 @@
-/*
- * Copyright (c) 2018-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_UTILS_HELPERS_BIT_OPS_H
-#define ARM_COMPUTE_UTILS_HELPERS_BIT_OPS_H
-
-#include "arm_compute/core/utils/misc/Requires.h"
-
-#include <type_traits>
-
-namespace arm_compute
-{
-namespace helpers
-{
-namespace bit_ops
-{
-/** Checks if the idx-th bit is set in an integral type
- *
- * @param[in] v Integral input
- * @param[in] idx Index of the bit to check
- *
- * @return True if the idx-th bit is set else false
- */
-template <typename T, REQUIRES_TA(std::is_integral<T>::value)>
-bool is_bit_set(T v, unsigned int idx)
-{
- return (v & 1 << idx) != 0;
-}
-} // namespace bit_ops
-} // namespace helpers
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_UTILS_HELPERS_BIT_OPS_H */
diff --git a/arm_compute/core/utils/helpers/fft.h b/arm_compute/core/utils/helpers/fft.h
deleted file mode 100644
index b22bece73f..0000000000
--- a/arm_compute/core/utils/helpers/fft.h
+++ /dev/null
@@ -1,55 +0,0 @@
-/*
- * Copyright (c) 2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_UTILS_HELPERS_FFT_H
-#define ARM_COMPUTE_UTILS_HELPERS_FFT_H
-
-#include <set>
-#include <vector>
-
-namespace arm_compute
-{
-namespace helpers
-{
-namespace fft
-{
-/** Decompose a given 1D input size using the provided supported factors.
- *
- * @param[in] N Input size to be decomposed.
- * @param[in] supported_factors Supported factors that can be used for decomposition.
- *
- * @return A vector with the stages of the decomposition. Will be empty if decomposition failed.
- */
-std::vector<unsigned int> decompose_stages(unsigned int N, const std::set<unsigned int> &supported_factors);
-/** Calculate digit reverse index vector given fft size and the decomposed stages
- *
- * @param N Input size to calculate digit reverse for
- * @param fft_stages A vector with the FFT decomposed stages
- *
- * @return A vector with the digit reverse indices. Will be empty if it failed.
- */
-std::vector<unsigned int> digit_reverse_indices(unsigned int N, const std::vector<unsigned int> &fft_stages);
-} // namespace fft
-} // namespace helpers
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_UTILS_HELPERS_FFT_H */
diff --git a/arm_compute/core/utils/helpers/float_ops.h b/arm_compute/core/utils/helpers/float_ops.h
deleted file mode 100644
index fceee2e3fe..0000000000
--- a/arm_compute/core/utils/helpers/float_ops.h
+++ /dev/null
@@ -1,116 +0,0 @@
-/*
- * Copyright (c) 2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_UTILS_HELPERS_FLOAT_OPS_H
-#define ARM_COMPUTE_UTILS_HELPERS_FLOAT_OPS_H
-
-namespace arm_compute
-{
-namespace helpers
-{
-namespace float_ops
-{
-union RawFloat
-{
- /** Constructor
- *
- * @param[in] val Floating-point value
- */
- explicit RawFloat(float val)
- : f32(val)
- {
- }
- /** Extract sign of floating point number
- *
- * @return Sign of floating point number
- */
- int32_t sign() const
- {
- return i32 >> 31;
- }
- /** Extract exponent of floating point number
- *
- * @return Exponent of floating point number
- */
- int32_t exponent() const
- {
- return (i32 >> 23) & 0xFF;
- }
- /** Extract mantissa of floating point number
- *
- * @return Mantissa of floating point number
- */
- int32_t mantissa() const
- {
- return i32 & 0x007FFFFF;
- }
-
- int32_t i32;
- float f32;
-};
-
-/** Checks if two floating point numbers are equal given an allowed number of ULPs
- *
- * @param[in] a First number to compare
- * @param[in] b Second number to compare
- * @param[in] max_allowed_ulps (Optional) Number of allowed ULPs
- *
- * @return True if number is close else false
- */
-inline bool is_equal_ulps(float a, float b, int max_allowed_ulps = 0)
-{
- RawFloat ra(a);
- RawFloat rb(b);
-
- // Check ULP distance
- const int ulps = std::abs(ra.i32 - rb.i32);
- return ulps <= max_allowed_ulps;
-}
-
-/** Checks if the input floating point number is 1.0f checking if the difference is within a range defined with epsilon
- *
- * @param[in] a Input floating point number
- * @param[in] epsilon (Optional) Epsilon used to define the error bounds
- *
- * @return True if number is close to 1.0f
- */
-inline bool is_one(float a, float epsilon = 0.00001f)
-{
- return std::abs(1.0f - a) <= epsilon;
-}
-
-/** Checks if the input floating point number is 0.0f checking if the difference is within a range defined with epsilon
- *
- * @param[in] a Input floating point number
- * @param[in] epsilon (Optional) Epsilon used to define the error bounds
- *
- * @return True if number is close to 0.0f
- */
-inline bool is_zero(float a, float epsilon = 0.00001f)
-{
- return std::abs(0.0f - a) <= epsilon;
-}
-} // namespace float_ops
-} // namespace helpers
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_UTILS_HELPERS_FLOAT_OPS_H */
diff --git a/arm_compute/core/utils/helpers/tensor_info.h b/arm_compute/core/utils/helpers/tensor_info.h
deleted file mode 100644
index da24e82f5a..0000000000
--- a/arm_compute/core/utils/helpers/tensor_info.h
+++ /dev/null
@@ -1,57 +0,0 @@
-/*
- * Copyright (c) 2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_UTILS_HELPERS_TENSOR_INFO_H
-#define ARM_COMPUTE_UTILS_HELPERS_TENSOR_INFO_H
-
-#include "arm_compute/core/ITensorInfo.h"
-
-namespace arm_compute
-{
-namespace helpers
-{
-namespace tensor_info
-{
-/** Checks if the quantization info of given tensors are different
- *
- * @param tensor_info_1 Tensor info of the first tensor
- * @param tensor_info_2 Tensor info of the second tensor
- * @param tensor_infos Tensor infos of the rest tensors
- *
- * @return True if tensors have mismatching quantization info else false.
- */
-template <typename... Ts>
-inline bool tensors_have_different_quantization_info(const ITensorInfo *tensor_info_1, const ITensorInfo *tensor_info_2, Ts... tensor_infos)
-{
- const QuantizationInfo first_quantization_info = tensor_info_1->quantization_info();
-
- const std::array < const ITensorInfo *, 1 + sizeof...(Ts) > tensor_infos_array{ { tensor_info_2, std::forward<Ts>(tensor_infos)... } };
- return std::any_of(tensor_infos_array.begin(), tensor_infos_array.end(), [&](const ITensorInfo * tensor_info)
- {
- return tensor_info->quantization_info() != first_quantization_info;
- });
-}
-} // namespace tensor_info
-} // namespace helpers
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_UTILS_HELPERS_TENSOR_INFO_H */
diff --git a/arm_compute/core/utils/helpers/tensor_transform.h b/arm_compute/core/utils/helpers/tensor_transform.h
index 7e912a6f0a..7a61fa192a 100644
--- a/arm_compute/core/utils/helpers/tensor_transform.h
+++ b/arm_compute/core/utils/helpers/tensor_transform.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018-2019 ARM Limited.
+ * Copyright (c) 2018-2019 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -52,7 +52,8 @@ int calculate_stride_on_index(int index, Coordinates strides);
*
* @return Absolute start position of a given index
*/
-int calculate_start_on_index(TensorShape input_shape, int index, Coordinates starts, Coordinates strides, int32_t begin_mask);
+int calculate_start_on_index(
+ TensorShape input_shape, int index, Coordinates starts, Coordinates strides, int32_t begin_mask);
/** Returns the absolute end position of a given index for a strided slice operation
*
@@ -68,8 +69,13 @@ int calculate_start_on_index(TensorShape input_shape, int index, Coordinates sta
*
* @return Absolute end position of a given index
*/
-int calculate_end_on_index(TensorShape input_shape, int index, int start_on_index, Coordinates ends, Coordinates strides,
- int32_t end_mask = 0, int32_t shrink_axis_mask = 0);
+int calculate_end_on_index(TensorShape input_shape,
+ int index,
+ int start_on_index,
+ Coordinates ends,
+ Coordinates strides,
+ int32_t end_mask = 0,
+ int32_t shrink_axis_mask = 0);
/** Calculate start, end and stride coordinates for a strided slice
*
@@ -87,8 +93,12 @@ int calculate_end_on_index(TensorShape input_shape, int index, int start_on_inde
* @return A tuple with <Start,End,Strides>
*/
std::tuple<Coordinates, Coordinates, Coordinates> calculate_strided_slice_coords(TensorShape input_shape,
- Coordinates starts, Coordinates ends, Coordinates strides,
- int32_t begin_mask = 0, int32_t end_mask = 0, int32_t shrink_axis_mask = 0);
+ Coordinates starts,
+ Coordinates ends,
+ Coordinates strides,
+ int32_t begin_mask = 0,
+ int32_t end_mask = 0,
+ int32_t shrink_axis_mask = 0);
/** Computes output shape of strided slice
*
@@ -109,9 +119,14 @@ std::tuple<Coordinates, Coordinates, Coordinates> calculate_strided_slice_coords
*
* @return The output tensor shape
*/
-TensorShape compute_strided_slice_output_shape(TensorShape input_shape, Coordinates starts, Coordinates ends, Coordinates strides,
- int32_t begin_mask = 0, int32_t end_mask = 0, int32_t shrink_axis_mask = 0,
- bool return_unshrinked = false);
+TensorShape compute_strided_slice_output_shape(TensorShape input_shape,
+ Coordinates starts,
+ Coordinates ends,
+ Coordinates strides,
+ int32_t begin_mask = 0,
+ int32_t end_mask = 0,
+ int32_t shrink_axis_mask = 0,
+ bool return_unshrinked = false);
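Editorial sketch (not part of this patch): with the defaulted masks, the reflowed declaration above reduces to a plain strided slice; exact mask semantics follow the implementation, which is not shown in this hunk.

    using namespace arm_compute;

    TensorShape in(10U); // 1-D shape of 10 elements
    TensorShape out = helpers::tensor_transform::compute_strided_slice_output_shape(
        in, Coordinates(0), Coordinates(4), Coordinates(2)); // keeps elements 0 and 2
    // out[0] == 2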
/** Constructs end mask in case we want to perform a slice operation using the strided slice interface
*
@@ -122,7 +137,7 @@ TensorShape compute_strided_slice_output_shape(TensorShape input_shape, Coordina
* @return End mask
*/
int32_t construct_slice_end_mask(Coordinates ends);
-} // namespace tensor_tranform
+} // namespace tensor_transform
} // namespace helpers
} // namespace arm_compute
#endif /* ARM_COMPUTE_UTILS_HELPERS_TENSOR_TRANSFORM_H */
diff --git a/arm_compute/core/utils/io/FileHandler.h b/arm_compute/core/utils/io/FileHandler.h
index ebc2ef06c1..615651d5b1 100644
--- a/arm_compute/core/utils/io/FileHandler.h
+++ b/arm_compute/core/utils/io/FileHandler.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2019 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
diff --git a/arm_compute/core/utils/logging/FilePrinter.h b/arm_compute/core/utils/logging/FilePrinter.h
index 73a5421ed4..a865aadddb 100644
--- a/arm_compute/core/utils/logging/FilePrinter.h
+++ b/arm_compute/core/utils/logging/FilePrinter.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2019 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -24,9 +24,8 @@
#ifndef ARM_COMPUTE_LOGGING_FILE_PRINTER_H
#define ARM_COMPUTE_LOGGING_FILE_PRINTER_H
-#include "arm_compute/core/utils/logging/IPrinter.h"
-
#include "arm_compute/core/utils/io/FileHandler.h"
+#include "arm_compute/core/utils/logging/IPrinter.h"
namespace arm_compute
{
diff --git a/arm_compute/core/utils/logging/Helpers.h b/arm_compute/core/utils/logging/Helpers.h
index 341f944ddc..c3c2f0f0b8 100644
--- a/arm_compute/core/utils/logging/Helpers.h
+++ b/arm_compute/core/utils/logging/Helpers.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -25,7 +25,7 @@
#define ARM_COMPUTE_LOGGING_HELPERS_H
#include "arm_compute/core/utils/logging/Types.h"
-#include "support/MemorySupport.h"
+
#include "support/ToolchainSupport.h"
#include <cstddef>
@@ -46,10 +46,10 @@ namespace logging
* @return The formatted string
*/
template <typename... Ts>
-inline std::string string_with_format(const std::string &fmt, Ts &&... args)
+inline std::string string_with_format(const std::string &fmt, Ts &&...args)
{
size_t size = support::cpp11::snprintf(nullptr, 0, fmt.c_str(), args...) + 1;
- auto char_str = support::cpp14::make_unique<char[]>(size);
+ auto char_str = std::make_unique<char[]>(size);
support::cpp11::snprintf(char_str.get(), size, fmt.c_str(), args...);
return std::string(char_str.get(), char_str.get() + size - 1);
}
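Editorial sketch (not part of this patch): printf-style formatting into a std::string, sized by the dry-run snprintf shown above.

    std::string msg = arm_compute::logging::string_with_format("iteration %d took %f ms", 7, 1.5);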
diff --git a/arm_compute/core/utils/logging/IPrinter.h b/arm_compute/core/utils/logging/IPrinter.h
index b6ede5853a..7fde4d9302 100644
--- a/arm_compute/core/utils/logging/IPrinter.h
+++ b/arm_compute/core/utils/logging/IPrinter.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2019 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -35,8 +35,7 @@ class Printer
{
public:
/** Default Constructor */
- Printer() noexcept
- : _mtx()
+ Printer() noexcept : _mtx()
{
}
/** Prevent instances of this class from being copied */
diff --git a/arm_compute/core/utils/logging/LogMsgDecorators.h b/arm_compute/core/utils/logging/LogMsgDecorators.h
index 08abcb4519..66a8180e21 100644
--- a/arm_compute/core/utils/logging/LogMsgDecorators.h
+++ b/arm_compute/core/utils/logging/LogMsgDecorators.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2016-2019 ARM Limited.
+ * Copyright (c) 2016-2019 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -63,8 +63,7 @@ public:
*
 * @param str String to append
*/
- StringDecorator(const std::string &str)
- : _str(str)
+ StringDecorator(const std::string &str) : _str(str)
{
_str = angle_wrap_value(str);
}
@@ -103,7 +102,7 @@ private:
auto time = std::chrono::system_clock::to_time_t(now);
// TODO: use put_time for gcc > 4.9
- char buf[100] = { 0 };
+ char buf[100] = {0};
std::strftime(buf, sizeof(buf), "%d-%m-%Y %I:%M:%S", std::localtime(&time));
return buf;
}
diff --git a/arm_compute/core/utils/logging/Logger.h b/arm_compute/core/utils/logging/Logger.h
index 2bd467ae2b..608db39138 100644
--- a/arm_compute/core/utils/logging/Logger.h
+++ b/arm_compute/core/utils/logging/Logger.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2019 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -88,7 +88,7 @@ public:
* @param[in] args Message arguments
*/
template <typename... Ts>
- void log(LogLevel log_level, const std::string &fmt, Ts &&... args);
+ void log(LogLevel log_level, const std::string &fmt, Ts &&...args);
/** Sets log level of the logger
*
* @warning Not thread-safe
@@ -159,11 +159,11 @@ private:
};
template <typename... Ts>
-inline void Logger::log(LogLevel log_level, const std::string &fmt, Ts &&... args)
+inline void Logger::log(LogLevel log_level, const std::string &fmt, Ts &&...args)
{
// Return if message shouldn't be logged
// i.e. if log level does not match the logger's
- if(!is_loggable(log_level))
+ if (!is_loggable(log_level))
{
return;
}
diff --git a/arm_compute/core/utils/logging/LoggerRegistry.h b/arm_compute/core/utils/logging/LoggerRegistry.h
index c1a182c1ae..4e52a10935 100644
--- a/arm_compute/core/utils/logging/LoggerRegistry.h
+++ b/arm_compute/core/utils/logging/LoggerRegistry.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2019 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -27,6 +27,7 @@
#include "arm_compute/core/utils/logging/Logger.h"
#include "arm_compute/core/utils/logging/Printers.h"
#include "arm_compute/core/utils/logging/Types.h"
+
#include "support/Mutex.h"
#include <memory>
@@ -54,8 +55,9 @@ public:
* @param[in] log_level Logger's log level. Defaults to INFO
* @param[in] printers Printers to attach to the system loggers. Defaults with a @ref StdPrinter.
*/
- void create_logger(const std::string &name, LogLevel log_level = LogLevel::INFO,
- const std::vector<std::shared_ptr<Printer>> &printers = { std::make_shared<StdPrinter>() });
+ void create_logger(const std::string &name,
+ LogLevel log_level = LogLevel::INFO,
+ const std::vector<std::shared_ptr<Printer>> &printers = {std::make_shared<StdPrinter>()});
/** Remove a logger
*
* @param name Logger's name
@@ -74,16 +76,17 @@ public:
* @param[in] printers (Optional) Printers to attach to the system loggers. Defaults with a @ref StdPrinter.
*/
void create_reserved_loggers(LogLevel log_level = LogLevel::INFO,
- const std::vector<std::shared_ptr<Printer>> &printers = { std::make_shared<StdPrinter>() });
+ const std::vector<std::shared_ptr<Printer>> &printers = {
+ std::make_shared<StdPrinter>()});
private:
/** Default constructor */
LoggerRegistry();
private:
- arm_compute::Mutex _mtx;
+ arm_compute::Mutex _mtx;
std::unordered_map<std::string, std::shared_ptr<Logger>> _loggers;
- static std::set<std::string> _reserved_loggers;
+ static std::set<std::string> _reserved_loggers;
};
} // namespace logging
} // namespace arm_compute
diff --git a/arm_compute/core/utils/logging/Macros.h b/arm_compute/core/utils/logging/Macros.h
index e4d9734792..4d5aa5fe2c 100644
--- a/arm_compute/core/utils/logging/Macros.h
+++ b/arm_compute/core/utils/logging/Macros.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2021 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -30,44 +30,71 @@
#ifdef ARM_COMPUTE_LOGGING_ENABLED
+#ifdef __GNUC__
+inline std::string signature_name(const std::string &pretty_func)
+{
+ const auto scope_op = pretty_func.find("::");
+ const auto begin = pretty_func.substr(0, scope_op).rfind(" ") + 1;
+ const auto end = pretty_func.rfind("(") - begin;
+
+ return pretty_func.substr(begin, end) + "()";
+}
+#define ARM_COMPUTE_SIGNATURE_NAME signature_name(__PRETTY_FUNCTION__)
+#else /* __GNUC__ */
+#define ARM_COMPUTE_SIGNATURE_NAME (__func__)
+#endif /* __GNUC__ */
+
#define ARM_COMPUTE_LOG_MSG(logger_name, log_level, msg) \
do \
{ \
auto __logger = arm_compute::logging::LoggerRegistry::get().logger(logger_name); \
- if(__logger != nullptr) \
+ if (__logger != nullptr) \
{ \
__logger->log(log_level, msg); \
} \
- } while(false)
+ } while (false)
+
+#define ARM_COMPUTE_LOG_MSG_WITH_FUNCNAME(logger_name, log_level, msg) \
+ do \
+ { \
+ auto __logger = arm_compute::logging::LoggerRegistry::get().logger(logger_name); \
+ if (__logger != nullptr) \
+ { \
+ std::ostringstream s; \
+ s << ARM_COMPUTE_SIGNATURE_NAME << " : " << msg; \
+ __logger->log(log_level, s.str()); \
+ } \
+ } while (false)
#define ARM_COMPUTE_LOG_MSG_WITH_FORMAT(logger_name, log_level, fmt, ...) \
do \
{ \
auto __logger = arm_compute::logging::LoggerRegistry::get().logger(logger_name); \
- if(__logger != nullptr) \
+ if (__logger != nullptr) \
{ \
size_t size = ::snprintf(nullptr, 0, fmt, __VA_ARGS__) + 1; \
- auto char_str = support::cpp14::make_unique<char[]>(size); \
- ::snprintf(char_str.get(), size, #fmt, __VA_ARGS__); \
+ auto char_str = std::make_unique<char[]>(size); \
+ ::snprintf(char_str.get(), size, fmt, __VA_ARGS__); \
__logger->log(log_level, std::string(char_str.get(), char_str.get() + size - 1)); \
} \
- } while(false)
+ } while (false)
#define ARM_COMPUTE_LOG_STREAM(logger_name, log_level, stream) \
do \
{ \
auto __logger = arm_compute::logging::LoggerRegistry::get().logger(logger_name); \
- if(__logger != nullptr) \
+ if (__logger != nullptr) \
{ \
std::ostringstream s; \
s << stream; \
__logger->log(log_level, s.str()); \
} \
- } while(false)
+ } while (false)
#else /* ARM_COMPUTE_LOGGING_ENABLED */
#define ARM_COMPUTE_LOG_MSG(logger_name, log_level, msg)
+#define ARM_COMPUTE_LOG_MSG_WITH_FUNCNAME(logger_name, log_level, msg)
#define ARM_COMPUTE_LOG_MSG_WITH_FORMAT(logger_name, log_level, fmt, ...)
#define ARM_COMPUTE_LOG_STREAM(logger_name, log_level, stream)
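Illustration (editorial, not part of this patch): on GCC-compatible compilers, signature_name() trims __PRETTY_FUNCTION__ down to the qualified name, which the new FUNCNAME variant prepends to the message. The logger name "CORE" below is illustrative.

    // signature_name("void arm_compute::NEScale::run()") yields "arm_compute::NEScale::run()",
    // so a call placed inside that method would log "arm_compute::NEScale::run() : configured":
    ARM_COMPUTE_LOG_MSG_WITH_FUNCNAME("CORE", arm_compute::logging::LogLevel::INFO, "configured");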
diff --git a/arm_compute/core/utils/logging/Printers.h b/arm_compute/core/utils/logging/Printers.h
index e09880cc53..80493e7052 100644
--- a/arm_compute/core/utils/logging/Printers.h
+++ b/arm_compute/core/utils/logging/Printers.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2019 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
diff --git a/arm_compute/core/utils/logging/StdPrinter.h b/arm_compute/core/utils/logging/StdPrinter.h
index ea41ce2599..eb0e78ee84 100644
--- a/arm_compute/core/utils/logging/StdPrinter.h
+++ b/arm_compute/core/utils/logging/StdPrinter.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2019 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
diff --git a/arm_compute/core/utils/logging/Types.h b/arm_compute/core/utils/logging/Types.h
index 838adf95b4..64c567b984 100644
--- a/arm_compute/core/utils/logging/Types.h
+++ b/arm_compute/core/utils/logging/Types.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2016-2019 ARM Limited.
+ * Copyright (c) 2016-2019 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -44,8 +44,7 @@ enum class LogLevel
struct LogMsg
{
/** Default constructor */
- LogMsg()
- : raw_(), log_level_(LogLevel::OFF)
+ LogMsg() : raw_(), log_level_(LogLevel::OFF)
{
}
/** Construct a log message
@@ -53,8 +52,7 @@ struct LogMsg
* @param[in] msg Message to log.
* @param[in] log_level Logging level. Default: OFF
*/
- LogMsg(std::string msg, LogLevel log_level = LogLevel::OFF)
- : raw_(msg), log_level_(log_level)
+ LogMsg(std::string msg, LogLevel log_level = LogLevel::OFF) : raw_(msg), log_level_(log_level)
{
}
diff --git a/arm_compute/core/utils/math/Math.h b/arm_compute/core/utils/math/Math.h
new file mode 100644
index 0000000000..e70337ba0f
--- /dev/null
+++ b/arm_compute/core/utils/math/Math.h
@@ -0,0 +1,71 @@
+/*
+ * Copyright (c) 2017-2018, 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_UTILS_MATH_H
+#define ARM_COMPUTE_UTILS_MATH_H
+
+#include "arm_compute/core/Error.h"
+
+namespace arm_compute
+{
+/** Calculate the rounded up quotient of val / m.
+ *
+ * @param[in] val Value to divide and round up.
+ * @param[in] m Value to divide by.
+ *
+ * @return the result.
+ */
+template <typename S, typename T>
+constexpr auto DIV_CEIL(S val, T m) -> decltype((val + m - 1) / m)
+{
+ return (val + m - 1) / m;
+}
+
+/** Computes the smallest number larger or equal to value that is a multiple of divisor.
+ *
+ * @param[in] value Lower bound value
+ * @param[in] divisor Value to compute multiple of.
+ *
+ * @return the result.
+ */
+template <typename S, typename T>
+inline auto ceil_to_multiple(S value, T divisor) -> decltype(((value + divisor - 1) / divisor) * divisor)
+{
+ ARM_COMPUTE_ERROR_ON(value < 0 || divisor <= 0);
+ return DIV_CEIL(value, divisor) * divisor;
+}
+
+/** Computes the largest number smaller or equal to value that is a multiple of divisor.
+ *
+ * @param[in] value Upper bound value
+ * @param[in] divisor Value to compute multiple of.
+ *
+ * @return the result.
+ */
+template <typename S, typename T>
+inline auto floor_to_multiple(S value, T divisor) -> decltype((value / divisor) * divisor)
+{
+ ARM_COMPUTE_ERROR_ON(value < 0 || divisor <= 0);
+ return (value / divisor) * divisor;
+}
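Editorial sketch (not part of this patch) of the three helpers:

    static_assert(arm_compute::DIV_CEIL(19, 4) == 5, "rounded-up quotient");
    // arm_compute::ceil_to_multiple(19, 4)  == 20
    // arm_compute::floor_to_multiple(19, 4) == 16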
+
+} // namespace arm_compute
+#endif /*ARM_COMPUTE_UTILS_MATH_H */
diff --git a/arm_compute/core/utils/math/SafeOps.h b/arm_compute/core/utils/math/SafeOps.h
index 41bbb12e70..ef8bcf7e14 100644
--- a/arm_compute/core/utils/math/SafeOps.h
+++ b/arm_compute/core/utils/math/SafeOps.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019 ARM Limited.
+ * Copyright (c) 2019-2021, 2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -25,7 +25,10 @@
#define ARM_COMPUTE_UTILS_MATH_SAFE_OPS
#include "arm_compute/core/Error.h"
-#include "arm_compute/core/utils/misc/Requires.h"
+
+#include "support/AclRequires.h"
+
+#include <limits>
namespace arm_compute
{
@@ -44,16 +47,16 @@ namespace math
*
* @return The addition result
*/
-template <typename T, REQUIRES_TA(std::is_integral<T>::value)>
+template <typename T, ARM_COMPUTE_REQUIRES_TA(std::is_integral<T>::value)>
T safe_integer_add(T val_a, T val_b)
{
T result = 0;
- if((val_b > 0) && (val_a > std::numeric_limits<T>::max() - val_b))
+ if ((val_b > 0) && (val_a > std::numeric_limits<T>::max() - val_b))
{
result = std::numeric_limits<T>::max();
}
- else if((val_b < 0) && (val_a < std::numeric_limits<T>::min() - val_b))
+ else if ((val_b < 0) && (val_a < std::numeric_limits<T>::min() - val_b))
{
result = std::numeric_limits<T>::min();
}
@@ -76,16 +79,16 @@ T safe_integer_add(T val_a, T val_b)
*
* @return The subtraction result
*/
-template <typename T, REQUIRES_TA(std::is_integral<T>::value)>
+template <typename T, ARM_COMPUTE_REQUIRES_TA(std::is_integral<T>::value)>
T safe_integer_sub(T val_a, T val_b)
{
T result = 0;
- if((val_b < 0) && (val_a > std::numeric_limits<T>::max() + val_b))
+ if ((val_b < 0) && (val_a > std::numeric_limits<T>::max() + val_b))
{
result = std::numeric_limits<T>::max();
}
- else if((val_b > 0) && (val_a < std::numeric_limits<T>::min() + val_b))
+ else if ((val_b > 0) && (val_a < std::numeric_limits<T>::min() + val_b))
{
result = std::numeric_limits<T>::min();
}
@@ -108,18 +111,18 @@ T safe_integer_sub(T val_a, T val_b)
*
* @return The multiplication result
*/
-template <typename T, REQUIRES_TA(std::is_integral<T>::value)>
+template <typename T, ARM_COMPUTE_REQUIRES_TA(std::is_integral<T>::value)>
T safe_integer_mul(T val_a, T val_b)
{
T result = 0;
- if(val_a > 0)
+ if (val_a > 0)
{
- if((val_b > 0) && (val_a > (std::numeric_limits<T>::max() / val_b)))
+ if ((val_b > 0) && (val_a > (std::numeric_limits<T>::max() / val_b)))
{
result = std::numeric_limits<T>::max();
}
- else if(val_b < (std::numeric_limits<T>::min() / val_a))
+ else if (val_b < (std::numeric_limits<T>::min() / val_a))
{
result = std::numeric_limits<T>::min();
}
@@ -130,11 +133,11 @@ T safe_integer_mul(T val_a, T val_b)
}
else
{
- if((val_b > 0) && (val_a < (std::numeric_limits<T>::min() / val_b)))
+ if ((val_b > 0) && (val_a < (std::numeric_limits<T>::min() / val_b)))
{
result = std::numeric_limits<T>::max();
}
- else if((val_a != 0) && (val_b < (std::numeric_limits<T>::max() / val_a)))
+ else if ((val_a != 0) && (val_b < (std::numeric_limits<T>::max() / val_a)))
{
result = std::numeric_limits<T>::min();
}
@@ -158,12 +161,12 @@ T safe_integer_mul(T val_a, T val_b)
*
* @return The quotient
*/
-template <typename T, REQUIRES_TA(std::is_integral<T>::value)>
+template <typename T, ARM_COMPUTE_REQUIRES_TA(std::is_integral<T>::value)>
T safe_integer_div(T val_a, T val_b)
{
T result = 0;
- if((val_b == 0) || ((val_a == std::numeric_limits<T>::min()) && (val_b == -1)))
+ if ((val_b == 0) || ((val_a == std::numeric_limits<T>::min()) && (val_b == -1)))
{
result = std::numeric_limits<T>::min();
}
@@ -174,7 +177,7 @@ T safe_integer_div(T val_a, T val_b)
return result;
}
-} // namespace cast
+} // namespace math
} // namespace utils
} // namespace arm_compute
#endif /* ARM_COMPUTE_UTILS_MATH_SAFE_OPS */
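Editorial sketch (not part of this patch): the safe_* helpers clamp to the numeric limits instead of invoking signed-overflow undefined behaviour.

    #include <limits>

    int32_t s = arm_compute::utils::math::safe_integer_add<int32_t>(std::numeric_limits<int32_t>::max(), 1);
    // s == std::numeric_limits<int32_t>::max(), i.e. saturated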
diff --git a/arm_compute/core/utils/misc/Cast.h b/arm_compute/core/utils/misc/Cast.h
deleted file mode 100644
index fc6246aace..0000000000
--- a/arm_compute/core/utils/misc/Cast.h
+++ /dev/null
@@ -1,119 +0,0 @@
-/*
- * Copyright (c) 2018-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_MISC_CAST_H
-#define ARM_COMPUTE_MISC_CAST_H
-
-#include "arm_compute/core/Error.h"
-
-namespace arm_compute
-{
-namespace utils
-{
-namespace cast
-{
-/** Polymorphic cast between two types
- *
- * @warning Will throw an exception if cast cannot take place
- *
- * @tparam Target Target to cast type
- * @tparam Source Source from cast type
- *
- * @param[in] v Value to cast
- *
- * @return The casted value
- */
-template <typename Target, typename Source>
-inline Target polymorphic_cast(Source *v)
-{
- if(dynamic_cast<Target>(v) == nullptr)
- {
- ARM_COMPUTE_THROW(std::bad_cast());
- }
- return static_cast<Target>(v);
-}
-
-/** Polymorphic down cast between two types
- *
- * @warning Will assert if cannot take place
- *
- * @tparam Target Target to cast type
- * @tparam Source Source from cast type
- *
- * @param[in] v Value to cast
- *
- * @return The casted value
- */
-template <typename Target, typename Source>
-inline Target polymorphic_downcast(Source *v)
-{
- ARM_COMPUTE_ERROR_ON(dynamic_cast<Target>(v) != static_cast<Target>(v));
- return static_cast<Target>(v);
-}
-
-/** Polymorphic cast between two unique pointer types
- *
- * @warning Will throw an exception if cast cannot take place
- *
- * @tparam Target Target to cast type
- * @tparam Source Source from cast type
- * @tparam Deleter Deleter function type
- *
- * @param[in] v Value to cast
- *
- * @return The casted value
- */
-template <typename Target, typename Source, typename Deleter>
-std::unique_ptr<Target, Deleter> polymorphic_cast_unique_ptr(std::unique_ptr<Source, Deleter> &&v)
-{
- if(dynamic_cast<Target *>(v.get()) == nullptr)
- {
- ARM_COMPUTE_THROW(std::bad_cast());
- }
- auto r = static_cast<Target *>(v.release());
- return std::unique_ptr<Target, Deleter>(r, std::move(v.get_deleter()));
-}
-
-/** Polymorphic down cast between two unique pointer types
- *
- * @warning Will assert if cannot take place
- *
- * @tparam Target Target to cast type
- * @tparam Source Source from cast type
- * @tparam Deleter Deleter function type
- *
- * @param[in] v Value to cast
- *
- * @return The casted value
- */
-template <typename Target, typename Source, typename Deleter>
-std::unique_ptr<Target, Deleter> polymorphic_downcast_unique_ptr(std::unique_ptr<Source, Deleter> &&v)
-{
- ARM_COMPUTE_ERROR_ON(dynamic_cast<Target *>(v.get()) != static_cast<Target *>(v.get()));
- auto r = static_cast<Target *>(v.release());
- return std::unique_ptr<Target, Deleter>(r, std::move(v.get_deleter()));
-}
-} // namespace cast
-} // namespace utils
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_MISC_CAST_H */
diff --git a/arm_compute/core/utils/misc/ICloneable.h b/arm_compute/core/utils/misc/ICloneable.h
deleted file mode 100644
index 064f408201..0000000000
--- a/arm_compute/core/utils/misc/ICloneable.h
+++ /dev/null
@@ -1,48 +0,0 @@
-/*
- * Copyright (c) 2017-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_MISC_ICLONEABLE_H
-#define ARM_COMPUTE_MISC_ICLONEABLE_H
-
-#include <memory>
-
-namespace arm_compute
-{
-namespace misc
-{
-/** Clonable Interface */
-template <class T>
-class ICloneable
-{
-public:
- /** Default virtual desctructor */
- virtual ~ICloneable() = default;
- /** Provide a clone of the current object of class T
- *
- * @return Clone object of class T
- */
- virtual std::unique_ptr<T> clone() const = 0;
-};
-} // namespace misc
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_MISC_ICLONEABLE_H */
diff --git a/arm_compute/core/utils/misc/InfoHelpers.h b/arm_compute/core/utils/misc/InfoHelpers.h
index c6ee7c9031..1d1b4ea8d7 100644
--- a/arm_compute/core/utils/misc/InfoHelpers.h
+++ b/arm_compute/core/utils/misc/InfoHelpers.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019-2020 ARM Limited.
+ * Copyright (c) 2019-2020 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -53,10 +53,12 @@ inline bool is_relu(ActivationLayerInfo activation_info)
*/
inline bool is_relu6(ActivationLayerInfo activation_info)
{
- const bool is_lu_bounded_relu = activation_info.activation() == ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU
- && activation_info.a() == 6.f && activation_info.b() == 0.f;
- const bool is_bounded_relu = activation_info.activation() == ActivationLayerInfo::ActivationFunction::BOUNDED_RELU
- && activation_info.a() == 6.f;
+ const bool is_lu_bounded_relu =
+ activation_info.activation() == ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU &&
+ activation_info.a() == 6.f && activation_info.b() == 0.f;
+ const bool is_bounded_relu =
+ activation_info.activation() == ActivationLayerInfo::ActivationFunction::BOUNDED_RELU &&
+ activation_info.a() == 6.f;
return activation_info.enabled() && (is_lu_bounded_relu || is_bounded_relu);
}
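Editorial sketch (not part of this patch), assuming this header's usual arm_compute::utils::info_helpers namespace: an LU_BOUNDED_RELU clamped to [0, 6] satisfies the reflowed predicate above.

    using arm_compute::ActivationLayerInfo;

    ActivationLayerInfo act(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, 6.f, 0.f);
    bool r6 = arm_compute::utils::info_helpers::is_relu6(act); // true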
@@ -68,50 +70,52 @@ inline bool is_relu6(ActivationLayerInfo activation_info)
*
*/
template <typename T>
-inline void build_lstm_params_tensor_info(const LSTMParams<T> &lstm_params,
- LSTMParams<ITensorInfo> *lstm_params_info)
+inline void build_lstm_params_tensor_info(const LSTMParams<T> &lstm_params, LSTMParams<ITensorInfo> *lstm_params_info)
{
- if(lstm_params.has_peephole_opt())
+ if (lstm_params.has_peephole_opt())
{
ARM_COMPUTE_ERROR_ON_NULLPTR(lstm_params.cell_to_forget_weights(), lstm_params.cell_to_output_weights());
- lstm_params_info->set_peephole_params(lstm_params.cell_to_forget_weights()->info(), lstm_params.cell_to_output_weights()->info());
+ lstm_params_info->set_peephole_params(lstm_params.cell_to_forget_weights()->info(),
+ lstm_params.cell_to_output_weights()->info());
}
- if(lstm_params.has_projection())
+ if (lstm_params.has_projection())
{
ARM_COMPUTE_ERROR_ON_NULLPTR(lstm_params.projection_weights());
- lstm_params_info->set_projection_params(lstm_params.projection_weights()->info(),
- lstm_params.projection_bias() != nullptr ? lstm_params.projection_bias()->info() : nullptr);
+ lstm_params_info->set_projection_params(
+ lstm_params.projection_weights()->info(),
+ lstm_params.projection_bias() != nullptr ? lstm_params.projection_bias()->info() : nullptr);
}
- if(!lstm_params.has_cifg_opt())
+ if (!lstm_params.has_cifg_opt())
{
- ARM_COMPUTE_ERROR_ON_NULLPTR(lstm_params.input_to_input_weights(), lstm_params.recurrent_to_input_weights(), lstm_params.input_gate_bias());
+ ARM_COMPUTE_ERROR_ON_NULLPTR(lstm_params.input_to_input_weights(), lstm_params.recurrent_to_input_weights(),
+ lstm_params.input_gate_bias());
- const ITensorInfo *cell_to_input_weights_info = (lstm_params.has_peephole_opt()) ? lstm_params.cell_to_input_weights()->info() : nullptr;
- lstm_params_info->set_cifg_params(lstm_params.input_to_input_weights()->info(), lstm_params.recurrent_to_input_weights()->info(),
- cell_to_input_weights_info, lstm_params.input_gate_bias()->info());
+ ITensorInfo *cell_to_input_weights_info =
+ (lstm_params.has_peephole_opt()) ? lstm_params.cell_to_input_weights()->info() : nullptr;
+ lstm_params_info->set_cifg_params(lstm_params.input_to_input_weights()->info(),
+ lstm_params.recurrent_to_input_weights()->info(), cell_to_input_weights_info,
+ lstm_params.input_gate_bias()->info());
}
- if(lstm_params.use_layer_norm())
+ if (lstm_params.use_layer_norm())
{
- ARM_COMPUTE_ERROR_ON_NULLPTR(lstm_params.forget_layer_norm_weights(),
- lstm_params.output_layer_norm_weights(),
+ ARM_COMPUTE_ERROR_ON_NULLPTR(lstm_params.forget_layer_norm_weights(), lstm_params.output_layer_norm_weights(),
lstm_params.cell_layer_norm_weights());
- if(!lstm_params.has_cifg_opt())
+ if (!lstm_params.has_cifg_opt())
{
ARM_COMPUTE_ERROR_ON_NULLPTR(lstm_params.input_layer_norm_weights());
}
- const ITensorInfo *forget_info = lstm_params.forget_layer_norm_weights()->info();
- const ITensorInfo *cell_info = lstm_params.cell_layer_norm_weights()->info();
- const ITensorInfo *output_info = lstm_params.output_layer_norm_weights()->info();
- const ITensorInfo *input_info = lstm_params.has_cifg_opt() ? nullptr : lstm_params.input_layer_norm_weights()->info();
+ ITensorInfo *forget_info = lstm_params.forget_layer_norm_weights()->info();
+ ITensorInfo *cell_info = lstm_params.cell_layer_norm_weights()->info();
+ ITensorInfo *output_info = lstm_params.output_layer_norm_weights()->info();
+ ITensorInfo *input_info = lstm_params.has_cifg_opt() ? nullptr : lstm_params.input_layer_norm_weights()->info();
lstm_params_info->set_layer_normalization_params(input_info, forget_info, cell_info, output_info);
}
- lstm_params_info->set_matmul_scale_params(lstm_params.input_intermediate_scale(),
- lstm_params.forget_intermediate_scale(),
- lstm_params.cell_intermediate_scale(),
- lstm_params.output_intermediate_scale());
+ lstm_params_info->set_matmul_scale_params(
+ lstm_params.input_intermediate_scale(), lstm_params.forget_intermediate_scale(),
+ lstm_params.cell_intermediate_scale(), lstm_params.output_intermediate_scale());
lstm_params_info->set_hidden_state_params(lstm_params.hidden_state_zero(), lstm_params.hidden_state_scale());
}
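A minimal usage sketch for the reflowed is_relu6 helper, assuming the info_helpers namespace declared in this header: an LU_BOUNDED_RELU activation clamped to [0, 6] is recognised as ReLU6.

    #include "arm_compute/core/utils/misc/InfoHelpers.h"

    bool example()
    {
        using namespace arm_compute;
        // LU_BOUNDED_RELU with a == 6 and b == 0 clamps to [0, 6], i.e. ReLU6
        const ActivationLayerInfo act(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, 6.f, 0.f);
        return utils::info_helpers::is_relu6(act); // true
    }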
diff --git a/arm_compute/core/utils/misc/Iterable.h b/arm_compute/core/utils/misc/Iterable.h
deleted file mode 100644
index 829c4b44a8..0000000000
--- a/arm_compute/core/utils/misc/Iterable.h
+++ /dev/null
@@ -1,108 +0,0 @@
-/*
- * Copyright (c) 2018-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_MISC_ITERABLE_H
-#define ARM_COMPUTE_MISC_ITERABLE_H
-
-#include <iterator>
-
-namespace arm_compute
-{
-namespace utils
-{
-namespace iterable
-{
-/** Reverse range iterable class
- *
- * @tparam T Type to create a reverse range on
- */
-template <typename T>
-class reverse_iterable
-{
-public:
- /** Default constructor
- *
- * @param[in] it Value to reverse iterate on
- */
- explicit reverse_iterable(T &it)
- : _it(it)
- {
- }
-
- /** Get beginning of iterator.
- *
- * @return beginning of iterator.
- */
- typename T::reverse_iterator begin()
- {
- return _it.rbegin();
- }
-
- /** Get end of iterator.
- *
- * @return end of iterator.
- */
- typename T::reverse_iterator end()
- {
- return _it.rend();
- }
-
- /** Get beginning of const iterator.
- *
- * @return beginning of const iterator.
- */
- typename T::const_reverse_iterator cbegin()
- {
- return _it.rbegin();
- }
-
- /** Get end of const iterator.
- *
- * @return end of const iterator.
- */
- typename T::const_reverse_iterator cend()
- {
- return _it.rend();
- }
-
-private:
- T &_it;
-};
-
-/** Creates a reverse iterable for a given type
- *
- * @tparam T Type to create a reverse iterable on
- *
- * @param[in] val Iterable input
- *
- * @return Reverse iterable container
- */
-template <typename T>
-reverse_iterable<T> reverse_iterate(T &val)
-{
- return reverse_iterable<T>(val);
-}
-} // namespace iterable
-} // namespace utils
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_MISC_ITERABLE_H */
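The removed adapter simply exposed rbegin()/rend() as begin()/end() so a container could be walked backwards in a range-for loop. A minimal sketch of its use, assuming the header were still available:

    #include <iostream>
    #include <vector>

    #include "arm_compute/core/utils/misc/Iterable.h" // removed by this patch

    void example()
    {
        std::vector<int> v{1, 2, 3};
        // Prints 3 2 1: the adapter maps begin()/end() onto rbegin()/rend()
        for (int x : arm_compute::utils::iterable::reverse_iterate(v))
        {
            std::cout << x << ' ';
        }
    }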
diff --git a/arm_compute/core/utils/misc/MMappedFile.h b/arm_compute/core/utils/misc/MMappedFile.h
index 7669c5cc96..3efdbc5bda 100644
--- a/arm_compute/core/utils/misc/MMappedFile.h
+++ b/arm_compute/core/utils/misc/MMappedFile.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019 ARM Limited.
+ * Copyright (c) 2019 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -24,7 +24,7 @@
#ifndef ARM_COMPUTE_MISC_MMAPPED_FILE_H
#define ARM_COMPUTE_MISC_MMAPPED_FILE_H
-#if !defined(BARE_METAL)
+#if !defined(_WIN64) && !defined(BARE_METAL)
#include <string>
#include <utility>
@@ -105,6 +105,6 @@ private:
} // namespace mmap_io
} // namespace utils
} // namespace arm_compute
-#endif // !defined(BARE_METAL)
+#endif // !defined(_WIN64) && !defined(BARE_METAL)
#endif /* ARM_COMPUTE_MISC_MMAPPED_FILE_H */
diff --git a/arm_compute/core/utils/misc/Macros.h b/arm_compute/core/utils/misc/Macros.h
index 6e8d7659ee..fa861fa442 100644
--- a/arm_compute/core/utils/misc/Macros.h
+++ b/arm_compute/core/utils/misc/Macros.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019 ARM Limited.
+ * Copyright (c) 2019 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -26,15 +26,16 @@
#if defined(__cplusplus) && (__cplusplus >= 201402L)
-#define ARM_COMPUTE_DEPRECATED [[deprecated]]
-#define ARM_COMPUTE_DEPRECATED_REL(rel) [[deprecated("Deprecated in : " #rel)]]
+#define ARM_COMPUTE_DEPRECATED [[deprecated]]
+#define ARM_COMPUTE_DEPRECATED_REL(rel) [[deprecated("Deprecated in : " #rel)]]
#define ARM_COMPUTE_DEPRECATED_REL_REPLACE(rel, replace) [[deprecated("Deprecated in : " #rel " - Use : " #replace)]]
#elif defined(__GNUC__) || defined(__clang__)
-#define ARM_COMPUTE_DEPRECATED __attribute__((deprecated))
+#define ARM_COMPUTE_DEPRECATED __attribute__((deprecated))
#define ARM_COMPUTE_DEPRECATED_REL(rel) __attribute__((deprecated("Deprecated in : " #rel)))
-#define ARM_COMPUTE_DEPRECATED_REL_REPLACE(rel, replace) __attribute__((deprecated("Deprecated in : " #rel " - Use : " #replace)))
+#define ARM_COMPUTE_DEPRECATED_REL_REPLACE(rel, replace) \
+ __attribute__((deprecated("Deprecated in : " #rel " - Use : " #replace)))
#else // defined(__cplusplus) && (__cplusplus >= 201402L)
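The reflowed macros only change formatting; their use is unchanged. A hedged sketch, in which old_api, new_api, and the 20.05 release tag are hypothetical names:

    #include "arm_compute/core/utils/misc/Macros.h"

    void new_api();

    // Callers of old_api() get a warning reading
    // "Deprecated in : 20.05 - Use : new_api" on supporting compilers.
    ARM_COMPUTE_DEPRECATED_REL_REPLACE(20.05, new_api)
    void old_api();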
diff --git a/arm_compute/core/utils/misc/Random.h b/arm_compute/core/utils/misc/Random.h
deleted file mode 100644
index 9f5a128546..0000000000
--- a/arm_compute/core/utils/misc/Random.h
+++ /dev/null
@@ -1,98 +0,0 @@
-/*
- * Copyright (c) 2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_MISC_RANDOM_H
-#define ARM_COMPUTE_MISC_RANDOM_H
-
-#include "arm_compute/core/Error.h"
-
-#include <random>
-#include <type_traits>
-
-namespace arm_compute
-{
-namespace utils
-{
-namespace random
-{
-/** Uniform distribution within a given number of sub-ranges
- *
- * @tparam T Distribution primitive type
- */
-template <typename T>
-class RangedUniformDistribution
-{
-public:
- using DT = typename std::conditional<std::is_integral<T>::value,
- std::uniform_int_distribution<T>,
- std::uniform_real_distribution<float>>::type;
- using result_type = T;
- using range_pair = std::pair<result_type, result_type>;
-
-public:
- /** Constructor
- *
- * @param[in] low lowest value in the range (inclusive)
- * @param[in] high highest value in the range (inclusive for uniform_int_distribution, exclusive for uniform_real_distribution)
- * @param[in] exclude_ranges Ranges to exclude from the generator
- */
- RangedUniformDistribution(result_type low, result_type high, const std::vector<range_pair> &exclude_ranges)
- : _distributions(), _selector()
- {
- result_type clow = low;
- for(const auto &erange : exclude_ranges)
- {
- result_type epsilon = std::is_integral<result_type>::value ? 1 : static_cast<result_type>(std::numeric_limits<float>::epsilon());
-
- ARM_COMPUTE_ERROR_ON(clow > erange.first || clow >= erange.second);
-
- _distributions.emplace_back(DT(clow, erange.first - epsilon));
- clow = erange.second + epsilon;
- }
- ARM_COMPUTE_ERROR_ON(clow > high);
- _distributions.emplace_back(DT(clow, high));
- _selector = std::uniform_int_distribution<uint32_t>(0, _distributions.size() - 1);
- }
- /** Generate random number
- *
- * @tparam URNG Random number generator object type
- *
- * @param[in] g A uniform random number generator object, used as the source of randomness.
- *
- * @return A new random number.
- */
- template <class URNG>
- result_type operator()(URNG &g)
- {
- unsigned int rand_select = _selector(g);
- return _distributions[rand_select](g);
- }
-
-private:
- std::vector<DT> _distributions;
- std::uniform_int_distribution<uint32_t> _selector;
-};
-} // namespace random
-} // namespace utils
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_MISC_RANDOM_H */
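The removed distribution stitched together one uniform distribution per allowed sub-range and picked among them with a uniform selector. A minimal sketch of its use, assuming the header were still available:

    #include <random>

    #include "arm_compute/core/utils/misc/Random.h" // removed by this patch

    int example()
    {
        using arm_compute::utils::random::RangedUniformDistribution;
        std::mt19937 gen(42);
        // Draws from [0, 2] or [6, 10]; the excluded range [3, 5] is never produced
        RangedUniformDistribution<int> dist(0, 10, {{3, 5}});
        return dist(gen);
    }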
diff --git a/arm_compute/core/utils/misc/Requires.h b/arm_compute/core/utils/misc/Requires.h
deleted file mode 100644
index 33c6fa3096..0000000000
--- a/arm_compute/core/utils/misc/Requires.h
+++ /dev/null
@@ -1,51 +0,0 @@
-/*
- * Copyright (c) 2018-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_UTILS_REQUIRES_H
-#define ARM_COMPUTE_UTILS_REQUIRES_H
-
-namespace arm_compute
-{
-namespace utils
-{
-namespace requires
-{
-// *INDENT-OFF*
-// clang-format off
-namespace detail
-{
-enum class enabler
-{
-};
-} // namespace detail
-
-/** Requirements as template */
-#define REQUIRES_T(...) template <bool Cond = (__VA_ARGS__), typename std::enable_if<Cond, int>::type = 0>
-/** Requirements as template argument */
-#define REQUIRES_TA(...) typename = typename std::enable_if<(__VA_ARGS__), arm_compute::utils::requires::detail::enabler>::type
-// clang-format on
-// *INDENT-ON*
-} // namespace requires
-} // namespace utils
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_UTILS_REQUIRES_H */
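A minimal sketch of the removed SFINAE helpers, assuming the header were still available; note that requires became a reserved keyword in C++20, so the namespace name itself would no longer compile there.

    #include <type_traits>

    #include "arm_compute/core/utils/misc/Requires.h" // removed by this patch

    // Instantiable only when T is an integral type; otherwise substitution fails.
    template <typename T, REQUIRES_TA(std::is_integral<T>::value)>
    T twice(T v)
    {
        return v + v; // twice(3) == 6; twice(1.5) would not compile
    }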
diff --git a/arm_compute/core/utils/misc/Rounding.h b/arm_compute/core/utils/misc/Rounding.h
deleted file mode 100644
index 650137a473..0000000000
--- a/arm_compute/core/utils/misc/Rounding.h
+++ /dev/null
@@ -1,205 +0,0 @@
-/*
- * Copyright (c) 2018-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_UTILS_ROUNDING_H
-#define ARM_COMPUTE_UTILS_ROUNDING_H
-
-#include "arm_compute/core/Error.h"
-#include "arm_compute/core/utils/misc/Requires.h"
-#include "arm_compute/core/utils/misc/Traits.h"
-#include "support/ToolchainSupport.h"
-
-#include <cmath>
-
-namespace arm_compute
-{
-namespace utils
-{
-namespace rounding
-{
-/** Rounding mode */
-enum class RoundingMode
-{
- TO_ZERO, /**< Round towards zero */
- AWAY_FROM_ZERO, /**< Round away from zero */
- HALF_TO_ZERO, /**< Round half towards zero */
- HALF_AWAY_FROM_ZERO, /**< Round half away from zero */
- HALF_UP, /**< Round half towards positive infinity */
- HALF_DOWN, /**< Round half towards negative infinity */
- HALF_EVEN /**< Round half towards nearest even */
-};
-
-/** Round floating-point value with round to zero
- *
- * @tparam T Parameter type. Should be of floating point type.
- *
- * @param[in] value floating-point value to be rounded.
- *
- * @return Floating-point value of rounded @p value.
- */
-template <typename T, REQUIRES_TA(traits::is_floating_point<T>::value)>
-inline T round_to_zero(T value)
-{
- T res = std::floor(std::fabs(value));
- return (value < 0.f) ? -res : res;
-}
-
-/** Round floating-point value with round away from zero
- *
- * @tparam T Parameter type. Should be of floating point type.
- *
- * @param[in] value floating-point value to be rounded.
- *
- * @return Floating-point value of rounded @p value.
- */
-template <typename T, REQUIRES_TA(traits::is_floating_point<T>::value)>
-inline T round_away_from_zero(T value)
-{
- T res = std::ceil(std::fabs(value));
- return (value < 0.f) ? -res : res;
-}
-
-/** Round floating-point value with half value rounding towards zero.
- *
- * @tparam T Parameter type. Should be of floating point type.
- *
- * @param[in] value floating-point value to be rounded.
- *
- * @return Floating-point value of rounded @p value.
- */
-template <typename T, REQUIRES_TA(traits::is_floating_point<T>::value)>
-inline T round_half_to_zero(T value)
-{
- T res = T(std::ceil(std::fabs(value) - 0.5f));
- return (value < 0.f) ? -res : res;
-}
-
-/** Round floating-point value with half value rounding away from zero.
- *
- * @tparam T Parameter type. Should be of floating point type.
- *
- * @param[in] value floating-point value to be rounded.
- *
- * @return Floating-point value of rounded @p value.
- */
-template <typename T, REQUIRES_TA(traits::is_floating_point<T>::value)>
-inline T round_half_away_from_zero(T value)
-{
- T res = T(std::floor(std::fabs(value) + 0.5f));
- return (value < 0.f) ? -res : res;
-}
-
-/** Round floating-point value with half value rounding to positive infinity.
- *
- * @tparam T Parameter type. Should be of floating point type.
- *
- * @param[in] value floating-point value to be rounded.
- *
- * @return Floating-point value of rounded @p value.
- */
-template <typename T, REQUIRES_TA(traits::is_floating_point<T>::value)>
-inline T round_half_up(T value)
-{
- return std::floor(value + 0.5f);
-}
-
-/** Round floating-point value with half value rounding to negative infinity.
- *
- * @tparam T Parameter type. Should be of floating point type.
- *
- * @param[in] value floating-point value to be rounded.
- *
- * @return Floating-point value of rounded @p value.
- */
-template <typename T, REQUIRES_TA(traits::is_floating_point<T>::value)>
-inline T round_half_down(T value)
-{
- return std::ceil(value - 0.5f);
-}
-
-/** Round floating-point value with half value rounding to nearest even.
- *
- * @tparam T Parameter type. Should be of floating point type.
- *
- * @param[in] value floating-point value to be rounded.
- * @param[in] epsilon precision.
- *
- * @return Floating-point value of rounded @p value.
- */
-template <typename T, REQUIRES_TA(traits::is_floating_point<T>::value)>
-inline T round_half_even(T value, T epsilon = std::numeric_limits<T>::epsilon())
-{
- T positive_value = std::abs(value);
- T ipart = 0;
- std::modf(positive_value, &ipart);
- // If 'value' is exactly halfway between two integers
- if(std::abs(positive_value - (ipart + 0.5f)) < epsilon)
- {
- // If 'ipart' is even then return 'ipart'
- if(std::fmod(ipart, 2.f) < epsilon)
- {
- return support::cpp11::copysign(ipart, value);
- }
- // Else return the nearest even integer
- return support::cpp11::copysign(std::ceil(ipart + 0.5f), value);
- }
- // Otherwise use the usual round to closest
- return support::cpp11::copysign(support::cpp11::round(positive_value), value);
-}
-
-/** Round floating-point value given a rounding mode
- *
- * @tparam T Parameter type. Should be of floating point type.
- *
- * @param[in] value floating-point value to be rounded.
- * @param[in] rounding_mode Rounding mode to use.
- *
- * @return Floating-point value of rounded @p value.
- */
-template <typename T, REQUIRES_TA(traits::is_floating_point<T>::value)>
-inline T round(T value, RoundingMode rounding_mode)
-{
- switch(rounding_mode)
- {
- case RoundingMode::TO_ZERO:
- return round_to_zero(value);
- case RoundingMode::AWAY_FROM_ZERO:
- return round_away_from_zero(value);
- case RoundingMode::HALF_TO_ZERO:
- return round_half_to_zero(value);
- case RoundingMode::HALF_AWAY_FROM_ZERO:
- return round_half_away_from_zero(value);
- case RoundingMode::HALF_UP:
- return round_half_up(value);
- case RoundingMode::HALF_DOWN:
- return round_half_down(value);
- case RoundingMode::HALF_EVEN:
- return round_half_even(value);
- default:
- ARM_COMPUTE_ERROR("Unsupported rounding mode!");
- }
-}
-} // namespace rounding
-} // namespace utils
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_UTILS_ROUNDING_H */
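Worked values for the removed half-even mode (banker's rounding), assuming the header were still available:

    #include "arm_compute/core/utils/misc/Rounding.h" // removed by this patch

    void example()
    {
        using arm_compute::utils::rounding::round_half_even;
        float a = round_half_even(2.5f);  // 2.0f: a tie resolves to the nearest even integer
        float b = round_half_even(3.5f);  // 4.0f
        float c = round_half_even(-2.5f); // -2.0f: the sign is restored with copysign
        (void)a; (void)b; (void)c;
    }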
diff --git a/arm_compute/core/utils/misc/SaturateCast.h b/arm_compute/core/utils/misc/SaturateCast.h
deleted file mode 100644
index 0241c64b14..0000000000
--- a/arm_compute/core/utils/misc/SaturateCast.h
+++ /dev/null
@@ -1,218 +0,0 @@
-/*
- * Copyright (c) 2018-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_UTILS_CAST_SATURATE_CAST_H
-#define ARM_COMPUTE_UTILS_CAST_SATURATE_CAST_H
-
-#include "arm_compute/core/utils/misc/Rounding.h"
-#include "arm_compute/core/utils/misc/Traits.h"
-#include "arm_compute/core/utils/misc/Utility.h"
-
-namespace arm_compute
-{
-namespace utils
-{
-namespace cast
-{
-// *INDENT-OFF*
-// clang-format off
-// same type
-template<typename T,
- typename U,
- typename std::enable_if<std::is_same<T, U>::value, int >::type = 0 >
-T saturate_cast(U v)
-{
- return v;
-}
-
-// signed -> signed widening/same_width
-template<typename T,
- typename U,
- typename std::enable_if<std::is_integral<T>::value &&
- std::is_integral<U>::value &&
- std::is_signed<U>() &&
- std::is_signed<T>() &&
- !std::is_same<T, U>::value &&
- sizeof(T) >= sizeof(U),
- int >::type = 0 >
-inline T saturate_cast(U v)
-{
- return static_cast<T>(v);
-}
-// signed -> signed narrowing
-template<typename T,
- typename U,
- typename std::enable_if<std::is_integral<T>::value &&
- std::is_integral<U>::value &&
- std::is_signed<U>() &&
- std::is_signed<T>() &&
- !std::is_same<T, U>::value &&
- sizeof(T) < sizeof(U),
- int >::type = 0 >
-inline T saturate_cast(U v)
-{
- return static_cast<T>(utility::clamp<U>(v, std::numeric_limits<T>::lowest(), std::numeric_limits<T>::max()));
-}
-
-// unsigned -> signed widening
-template<typename T,
- typename U,
- typename std::enable_if<std::is_integral<T>::value &&
- std::is_integral<U>::value &&
- std::is_unsigned<U>() &&
- std::is_signed<T>() &&
- !std::is_same<T, U>::value &&
- (sizeof(T) > sizeof(U)),
- int >::type = 0 >
-inline T saturate_cast(U v)
-{
- return static_cast<T>(v);
-}
-// unsigned -> signed narrowing
-template<typename T,
- typename U,
- typename std::enable_if<std::is_integral<T>::value &&
- std::is_integral<U>::value &&
- std::is_unsigned<U>() &&
- std::is_signed<T>() &&
- !std::is_same<T, U>::value &&
- sizeof(T) < sizeof(U),
- int >::type = 0 >
-inline T saturate_cast(U v)
-{
- return static_cast<T>(std::min<U>(v, std::numeric_limits<T>::max()));
-}
-// unsigned -> signed same_width
-template<typename T,
- typename U,
- typename std::enable_if<std::is_integral<T>::value &&
- std::is_integral<U>::value &&
- std::is_unsigned<U>() &&
- std::is_signed<T>() &&
- !std::is_same<T, U>::value &&
- sizeof(T) == sizeof(U),
- int >::type = 0 >
-inline T saturate_cast(U v)
-{
- return static_cast<T>(std::min<U>(v, std::numeric_limits<T>::max()));
-}
-
-// signed -> unsigned widening/same width
-template<typename T,
- typename U,
- typename std::enable_if<std::is_integral<T>::value &&
- std::is_integral<U>::value &&
- std::is_signed<U>() &&
- std::is_unsigned<T>() &&
- !std::is_same<T, U>::value &&
- sizeof(T) >= sizeof(U),
- int >::type = 0 >
-inline T saturate_cast(U v)
-{
- return static_cast<T>(std::max<U>(0, v));
-}
-
-// signed -> unsigned narrowing
-template<typename T,
- typename U,
- typename std::enable_if<std::is_integral<T>::value &&
- std::is_integral<U>::value &&
- std::is_signed<U>() &&
- std::is_unsigned<T>() &&
- !std::is_same<T, U>::value &&
- sizeof(T) < sizeof(U),
- int >::type = 0 >
-inline T saturate_cast(U v)
-{
- return static_cast<T>(utility::clamp<U>(v, 0, std::numeric_limits<T>::max()));
-}
-
-// unsigned -> unsigned widening/same width
-template<typename T,
- typename U,
- typename std::enable_if<std::is_integral<T>::value &&
- std::is_integral<U>::value &&
- std::is_unsigned<T>() &&
- std::is_unsigned<U>() &&
- !std::is_same<T, U>::value &&
- sizeof(T) >= sizeof(U),
- int >::type = 0 >
-inline T saturate_cast(U v)
-{
- return static_cast<T>(v);
-}
-
-// unsigned -> unsigned narrowing
-template<typename T,
- typename U,
- typename std::enable_if<std::is_integral<T>::value &&
- std::is_integral<U>::value &&
- std::is_unsigned<T>() &&
- std::is_unsigned<U>() &&
- !std::is_same<T, U>::value &&
- sizeof(T) < sizeof(U),
- int >::type = 0 >
-inline T saturate_cast(U v)
-{
- return static_cast<T>(utility::clamp<U>(v, std::numeric_limits<T>::lowest(), std::numeric_limits<T>::max()));
-}
-
-// float -> int
-template<typename T,
- typename U,
- typename std::enable_if<std::is_integral<T>::value &&
- traits::is_floating_point<U>::value,
- int >::type = 0 >
-inline T saturate_cast(U v)
-{
- int32_t vi = utils::rounding::round_half_away_from_zero(v);
- return saturate_cast<T>(vi);
-}
-
-// int -> float
-template<typename T,
- typename U,
- typename std::enable_if<traits::is_floating_point<T>::value &&
- std::is_integral<U>::value,
- int >::type = 0 >
-inline T saturate_cast(U v)
-{
- return static_cast<T>(v);
-}
-
-// float -> float
-template<typename T,
- typename U,
- typename std::enable_if<traits::is_floating_point<T>::value &&
- traits::is_floating_point<U>::value,
- int >::type = 0 >
-inline T saturate_cast(U v)
-{
- return static_cast<T>(v);
-}
-// clang-format on
-// *INDENT-ON*
-} // namespace cast
-} // namespace utils
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_UTILS_CAST_SATURATE_CAST_H */
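The removed overload set clamps instead of wrapping on narrowing conversions; floating-point inputs are first rounded half away from zero. Worked values, assuming the header were still available:

    #include <cstdint>

    #include "arm_compute/core/utils/misc/SaturateCast.h" // removed by this patch

    void example()
    {
        using arm_compute::utils::cast::saturate_cast;
        auto a = saturate_cast<std::uint8_t>(300);   // 255: clamped to the uint8_t maximum
        auto b = saturate_cast<std::uint8_t>(-5);    // 0: negative input clamped to zero
        auto c = saturate_cast<std::int8_t>(200.6f); // 127: rounded to 201, then clamped
        (void)a; (void)b; (void)c;
    }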
diff --git a/arm_compute/core/utils/misc/ShapeCalculator.h b/arm_compute/core/utils/misc/ShapeCalculator.h
index dfccec8b37..e97d81390e 100644
--- a/arm_compute/core/utils/misc/ShapeCalculator.h
+++ b/arm_compute/core/utils/misc/ShapeCalculator.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2017-2024 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -21,15 +21,16 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
-#ifndef ARM_COMPUTE_MISC_SHAPE_CALCULATOR_H
-#define ARM_COMPUTE_MISC_SHAPE_CALCULATOR_H
+#ifndef ACL_ARM_COMPUTE_CORE_UTILS_MISC_SHAPECALCULATOR_H
+#define ACL_ARM_COMPUTE_CORE_UTILS_MISC_SHAPECALCULATOR_H
#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/ITensorInfo.h"
#include "arm_compute/core/KernelDescriptors.h"
#include "arm_compute/core/Utils.h"
-
#include "arm_compute/core/utils/helpers/tensor_transform.h"
+#include "arm_compute/function_info/ConvolutionInfo.h"
+#include "arm_compute/runtime/FunctionDescriptors.h"
#include <cmath>
@@ -47,28 +48,35 @@ namespace shape_calculator
*
* @return the calculated shape
*/
-inline TensorShape calculate_reduce_mean_shape(ITensor *input, const Coordinates &reduction_axis, bool keep_dims)
+inline TensorShape calculate_reduce_mean_shape(ITensorInfo *input, const Coordinates &reduction_axis, bool keep_dims)
{
const int reduction_ops = reduction_axis.num_dimensions();
Coordinates axis_local = reduction_axis;
- const int input_dims = input->info()->num_dimensions();
+ const int input_dims = input->num_dimensions();
convert_negative_axis(axis_local, input_dims);
- TensorShape out_shape = input->info()->tensor_shape();
+ TensorShape out_shape = input->tensor_shape();
// Configure reshape layer if we want to drop the dimensions
- if(!keep_dims)
+ if (!keep_dims)
{
// We have to sort the reduction axis vectors in order for remove_dimension
// to work properly
+
+// Suppress warning produced by a compiler bug in GCC
+// https://gcc.gnu.org/bugzilla/show_bug.cgi?id=104165
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Warray-bounds"
std::sort(axis_local.begin(), axis_local.begin() + reduction_ops);
- for(int i = 0; i < reduction_ops; ++i)
+#pragma GCC diagnostic pop
+
+ for (int i = 0; i < reduction_ops; ++i)
{
- out_shape.remove_dimension(axis_local[i] - i);
+ out_shape.remove_dimension(axis_local[i] - i, false);
}
return out_shape;
}
else
{
- for(int i = 0; i < reduction_ops; ++i)
+ for (int i = 0; i < reduction_ops; ++i)
{
out_shape.set(axis_local[i], 1);
}
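A minimal sketch of the updated helper, which now takes an ITensorInfo directly and removes reduced dimensions without dimension correction; shapes here are hypothetical:

    #include "arm_compute/core/TensorInfo.h"
    #include "arm_compute/core/utils/misc/ShapeCalculator.h"

    void example()
    {
        using namespace arm_compute;
        // A [2, 3, 4] tensor reduced over axis 1 with keep_dims == false
        TensorInfo info(TensorShape(2U, 3U, 4U), 1, DataType::F32);
        const TensorShape out =
            misc::shape_calculator::calculate_reduce_mean_shape(&info, Coordinates(1), false);
        // out is [2, 4]: dimension 1 was removed rather than kept as size 1
    }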
@@ -84,7 +92,10 @@ inline TensorShape calculate_reduce_mean_shape(ITensor *input, const Coordinates
*
* @return the calculated shape
*/
-inline TensorShape compute_vector_to_tensor_output_shape(const TensorShape &input, size_t conv_w, size_t conv_h, const DataLayout &data_layout)
+inline TensorShape compute_vector_to_tensor_output_shape(const TensorShape &input,
+ size_t conv_w,
+ size_t conv_h,
+ const DataLayout &data_layout)
{
const size_t idx_w = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
const size_t idx_h = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
@@ -126,10 +137,12 @@ inline TensorShape compute_reorg_output_shape(const ITensorInfo &input, int32_t
const size_t idx_channel = get_data_layout_dimension_index(input.data_layout(), DataLayoutDimension::CHANNEL);
ARM_COMPUTE_ERROR_ON(stride <= 0);
- ARM_COMPUTE_ERROR_ON_MSG((input.tensor_shape()[idx_width] % stride != 0), "The width of the input tensor must be a multiple of stride");
- ARM_COMPUTE_ERROR_ON_MSG((input.tensor_shape()[idx_height] % stride != 0), "The height of the input tensor must be a multiple of stride");
+ ARM_COMPUTE_ERROR_ON_MSG((input.tensor_shape()[idx_width] % stride != 0),
+ "The width of the input tensor must be a multiple of stride");
+ ARM_COMPUTE_ERROR_ON_MSG((input.tensor_shape()[idx_height] % stride != 0),
+ "The height of the input tensor must be a multiple of stride");
- TensorShape output_shape{ input.tensor_shape() };
+ TensorShape output_shape{input.tensor_shape()};
output_shape.set(idx_width, output_shape[idx_width] / stride);
output_shape.set(idx_height, output_shape[idx_height] / stride);
@@ -146,7 +159,8 @@ inline TensorShape compute_reorg_output_shape(const ITensorInfo &input, int32_t
*
* @return the calculated shape of the reshaped weights
*/
-inline TensorShape compute_weights_reshaped_shape(const ITensorInfo &weights, bool has_bias = false, unsigned int num_groups = 1)
+inline TensorShape
+compute_weights_reshaped_shape(const ITensorInfo &weights, bool has_bias = false, unsigned int num_groups = 1)
{
// A number of groups greater than one is only supported for the NCHW data layout, and the number of weights must be a multiple of it.
ARM_COMPUTE_ERROR_ON(num_groups == 0);
@@ -154,14 +168,14 @@ inline TensorShape compute_weights_reshaped_shape(const ITensorInfo &weights, bo
ARM_COMPUTE_ERROR_ON((weights.dimension(3) % num_groups) != 0);
// Calculate output shape
- TensorShape weights_reshaped{ weights.tensor_shape() };
+ TensorShape weights_reshaped{weights.tensor_shape()};
weights_reshaped.set(3, weights_reshaped[3] / num_groups);
weights_reshaped.collapse(3);
const size_t tmp_dim = weights_reshaped[0];
weights_reshaped.set(0, weights_reshaped[1]);
weights_reshaped.set(1, tmp_dim + (has_bias ? 1 : 0));
- if(weights.num_dimensions() < 5)
+ if (weights.num_dimensions() < 5)
{
weights_reshaped.set(2, num_groups);
}
@@ -177,7 +191,9 @@ inline TensorShape compute_weights_reshaped_shape(const ITensorInfo &weights, bo
*
* @return the calculated shape
*/
-inline TensorShape compute_lhs_reshaped_shape(const ITensorInfo &a, const GEMMLHSMatrixInfo &lhs_info, bool reinterpret_input_as_3d = false)
+inline TensorShape compute_lhs_reshaped_shape(const ITensorInfo &a,
+ const GEMMLHSMatrixInfo &lhs_info,
+ bool reinterpret_input_as_3d = false)
{
ARM_COMPUTE_ERROR_ON(lhs_info.m0 == 0);
ARM_COMPUTE_ERROR_ON(lhs_info.k0 == 0);
@@ -198,11 +214,11 @@ inline TensorShape compute_lhs_reshaped_shape(const ITensorInfo &a, const GEMMLH
const unsigned int output_width = block_size * num_horiz_blocks * lhs_info.v0;
const unsigned int output_height = std::ceil(num_vert_blocks / static_cast<float>(lhs_info.v0));
- TensorShape lhs_shape{ a.tensor_shape() };
+ TensorShape lhs_shape{a.tensor_shape()};
lhs_shape.set(0, output_width);
lhs_shape.set(1, output_height);
- if((reinterpret_input_as_3d) && (lhs_shape.num_dimensions() > 2))
+ if ((reinterpret_input_as_3d) && (lhs_shape.num_dimensions() > 2))
{
// When the data format is NHWC and the shapes are Nx1x1
// the tensor shape num_dimensions is automatically set to 1 instead of 3.
@@ -242,7 +258,7 @@ inline TensorShape compute_rhs_reshaped_shape(const ITensorInfo &a, const GEMMRH
const unsigned int output_width = block_size * num_vert_blocks * rhs_info.h0;
const unsigned int output_height = std::ceil(num_horiz_blocks / static_cast<float>(rhs_info.h0));
- TensorShape rhs_shape{ a.tensor_shape() };
+ TensorShape rhs_shape{a.tensor_shape()};
rhs_shape.set(0, output_width);
rhs_shape.set(1, output_height);
@@ -257,14 +273,15 @@ inline TensorShape compute_rhs_reshaped_shape(const ITensorInfo &a, const GEMMRH
*
* @return the calculated shape
*/
-inline TensorShape compute_interleaved_shape(const ITensorInfo &a, int mult_interleave4x4_height = 1, bool reinterpret_input_as_3d = false)
+inline TensorShape
+compute_interleaved_shape(const ITensorInfo &a, int mult_interleave4x4_height = 1, bool reinterpret_input_as_3d = false)
{
// The interleaved output matrix will have the following shape: [ a_height * W, ceil(a_width / W) ] where W = 4 * mult_interleave4x4_height
ARM_COMPUTE_ERROR_ON(mult_interleave4x4_height < 1);
const int interleave_width = 4 * mult_interleave4x4_height;
- TensorShape shape_interleaved_a{ a.tensor_shape() };
+ TensorShape shape_interleaved_a{a.tensor_shape()};
shape_interleaved_a.set(0, a.dimension(0) * interleave_width);
- if(reinterpret_input_as_3d)
+ if (reinterpret_input_as_3d)
{
const int M = a.dimension(1) * a.dimension(2);
const int height = std::ceil(M / static_cast<float>(interleave_width));
@@ -274,7 +291,7 @@ inline TensorShape compute_interleaved_shape(const ITensorInfo &a, int mult_inte
// the tensor shape num_dimensions is automatically set to 1 instead of 3.
// To avoid failures by removing a dimension that doesn't exist
// check if the number of dimensions is greater than 2.
- if(shape_interleaved_a.num_dimensions() > 2)
+ if (shape_interleaved_a.num_dimensions() > 2)
{
shape_interleaved_a.remove_dimension(2);
}
@@ -287,30 +304,6 @@ inline TensorShape compute_interleaved_shape(const ITensorInfo &a, int mult_inte
return shape_interleaved_a;
}
-/** Calculate the reshaped shape of the weights to use in depthwise convolution
- *
- * @param[in] input Input tensor info
- * @param[in] info Depthwise convolution information to be used for reshaping.
- *
- * @return the calculated shape
- */
-inline TensorShape compute_reshaped_depthwise_weights_shape(const ITensorInfo &input, const DepthwiseConvolutionReshapeInfo &info)
-{
- const auto data_layout = input.data_layout();
- TensorShape weights_shape{};
-
- const int width_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
- const int height_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
- const int channel_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL);
- const size_t num_channels = input.dimension(channel_idx);
- const size_t num_rows = input.dimension(height_idx);
- const size_t num_cols = input.dimension(width_idx);
-
- weights_shape.set(0, num_rows * num_cols * info.c0);
- weights_shape.set(1, DIV_CEIL(num_channels, info.c0));
- return weights_shape;
-}
-
/** Calculate the transposed 1xW shape
*
* @param[in] b Input tensor info
@@ -320,7 +313,7 @@ inline TensorShape compute_reshaped_depthwise_weights_shape(const ITensorInfo &i
inline TensorShape compute_transpose1xW_shape(const ITensorInfo &b)
{
// The transpose1xW output matrix will have the following shape: [ b_height * 16, ceil(b_width / 16.0f) ]
- TensorShape shape_transposed1xW_b{ b.tensor_shape() };
+ TensorShape shape_transposed1xW_b{b.tensor_shape()};
shape_transposed1xW_b.set(0, b.dimension(1) * 16);
shape_transposed1xW_b.set(1, std::ceil(b.dimension(0) / 16.f));
@@ -340,7 +333,7 @@ inline TensorShape compute_transpose1xW_with_element_size_shape(const ITensorInf
// The transpose1xW output matrix will have the following shape:
// [ b_height * W, ceil(b_width / W) ] where W = (16 / element size of the tensor) * mult_transpose1xW_width
ARM_COMPUTE_ERROR_ON(mult_transpose1xW_width < 1);
- TensorShape shape_transposed1xW_b{ b.tensor_shape() };
+ TensorShape shape_transposed1xW_b{b.tensor_shape()};
const size_t transpose_width = (16 / b.element_size()) * mult_transpose1xW_width;
shape_transposed1xW_b.set(0, b.dimension(1) * transpose_width);
shape_transposed1xW_b.set(1, static_cast<size_t>(std::ceil(b.dimension(0) / static_cast<float>(transpose_width))));
@@ -356,8 +349,8 @@ inline TensorShape compute_transpose1xW_with_element_size_shape(const ITensorInf
*/
inline TensorShape compute_reductionA_shape(const ITensorInfo &b)
{
- TensorShape shape_vector_sum_col{ b.tensor_shape() };
- if(shape_vector_sum_col.num_dimensions() > 1)
+ TensorShape shape_vector_sum_col{b.tensor_shape()};
+ if (shape_vector_sum_col.num_dimensions() > 1)
{
shape_vector_sum_col.remove_dimension(1);
}
@@ -373,9 +366,9 @@ inline TensorShape compute_reductionA_shape(const ITensorInfo &b)
*/
inline TensorShape compute_reductionB_shape(const ITensorInfo &a)
{
- TensorShape shape_vector_sum_row{ a.tensor_shape() };
+ TensorShape shape_vector_sum_row{a.tensor_shape()};
shape_vector_sum_row.set(Window::DimX, a.dimension(1));
- if(shape_vector_sum_row.num_dimensions() > 1)
+ if (shape_vector_sum_row.num_dimensions() > 1)
{
shape_vector_sum_row.remove_dimension(1);
}
@@ -392,7 +385,10 @@ inline TensorShape compute_reductionB_shape(const ITensorInfo &a)
*
* @return the calculated shape
*/
-inline TensorShape compute_col2im_shape(const ITensorInfo &input, const Size2D &convolved_dims, bool batch_size_on_z, unsigned int num_groups = 1)
+inline TensorShape compute_col2im_shape(const ITensorInfo &input,
+ const Size2D &convolved_dims,
+ bool batch_size_on_z,
+ unsigned int num_groups = 1)
{
ARM_COMPUTE_ERROR_ON(num_groups == 0);
ARM_COMPUTE_ERROR_ON(input.tensor_shape()[1] != (convolved_dims.area()));
@@ -403,10 +399,10 @@ inline TensorShape compute_col2im_shape(const ITensorInfo &input, const Size2D &
const int height_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
const int channel_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL);
- TensorShape col2im_shape{ input.tensor_shape() };
+ TensorShape col2im_shape{input.tensor_shape()};
// If batches start on the 3rd dimension, shift dimensions right by 1 to retain the upper tensor shape,
// as the first three will be overridden by H,W,C data
- if(batch_size_on_z && num_groups == 1)
+ if (batch_size_on_z && num_groups == 1)
{
col2im_shape.shift_right(1);
}
@@ -425,29 +421,27 @@ inline TensorShape compute_col2im_shape(const ITensorInfo &input, const Size2D &
*/
inline TensorShape compute_transposed_shape(const ITensorInfo &input)
{
- TensorShape shape_transposed{ input.tensor_shape() };
+ TensorShape shape_transposed{input.tensor_shape()};
- shape_transposed.set(0, input.dimension(1));
- shape_transposed.set(1, input.dimension(0));
+ shape_transposed.set(0, input.dimension(1), false);
+ shape_transposed.set(1, input.dimension(0), false);
return shape_transposed;
}
/** Calculate the depthwise convolution output shape of a tensor
*
- * @param[in] input Input tensor info
- * @param[in] weights Weights tensor info
- * @param[in] conv_info Padding and stride information to use for the convolution.
- * @param[in] depth_multiplier Multiplier to apply to the input's depth in order to retrieve the output's depth.
- * @param[in] dilation Dilation, in elements, across x and y. Defaults to (1, 1).
+ * @param[in] input Input tensor info
+ * @param[in] weights Weights tensor info
+ * @param[in] info Convolution info
*
* @return the calculated shape
*/
-inline TensorShape compute_depthwise_convolution_shape(const ITensorInfo &input, const ITensorInfo &weights, PadStrideInfo conv_info, unsigned int depth_multiplier, const Size2D &dilation = Size2D(1U,
- 1U))
+inline TensorShape
+compute_depthwise_convolution_shape(const ITensorInfo &input, const ITensorInfo &weights, const ConvolutionInfo &info)
{
- const TensorShape input_shape{ input.tensor_shape() };
- const TensorShape weights_shape{ weights.tensor_shape() };
+ const TensorShape input_shape{input.tensor_shape()};
+ const TensorShape weights_shape{weights.tensor_shape()};
const DataLayout data_layout = input.data_layout();
const int width_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
@@ -455,23 +449,54 @@ inline TensorShape compute_depthwise_convolution_shape(const ITensorInfo &input,
const int channel_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL);
const DataLayout weights_data_layout = weights.data_layout();
- const int weights_width_idx = get_data_layout_dimension_index(weights_data_layout, DataLayoutDimension::WIDTH);
- const int weights_height_idx = get_data_layout_dimension_index(weights_data_layout, DataLayoutDimension::HEIGHT);
+ const int weights_width_idx = get_data_layout_dimension_index(weights_data_layout, DataLayoutDimension::WIDTH);
+ const int weights_height_idx = get_data_layout_dimension_index(weights_data_layout, DataLayoutDimension::HEIGHT);
unsigned int output_width = 0;
unsigned int output_height = 0;
- std::tie(output_width, output_height) = scaled_dimensions(input_shape[width_idx], input_shape[height_idx],
- weights_shape[weights_width_idx], weights_shape[weights_height_idx],
- conv_info, dilation);
+ std::tie(output_width, output_height) =
+ scaled_dimensions(input_shape[width_idx], input_shape[height_idx], weights_shape[weights_width_idx],
+ weights_shape[weights_height_idx], info.pad_stride_info, info.dilation);
- TensorShape output_shape{ input_shape };
+ TensorShape output_shape{input_shape};
output_shape.set(width_idx, output_width);
output_shape.set(height_idx, output_height);
- output_shape.set(channel_idx, input_shape[channel_idx] * depth_multiplier);
+ output_shape.set(channel_idx, input_shape[channel_idx] * info.depth_multiplier);
return output_shape;
}
+/** Calculate padding required for deconvolution
+ *
+ * @param[in] input Input tensor info
+ * @param[in] weights Weights tensor shape
+ * @param[in] sx Stride on x axis
+ * @param[in] sy Stride on y axis
+ * @param[in] out_dims Output shape dimensions
+ *
+ * @return the padding required
+ */
+inline std::pair<int32_t, int32_t> compute_deconvolution_padding(const ITensorInfo &input,
+ const ITensorInfo &weights,
+ int32_t sx,
+ int32_t sy,
+ std::pair<uint32_t, uint32_t> out_dims)
+{
+ const DataLayout data_layout = input.data_layout();
+ const size_t idx_w = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
+ const size_t idx_h = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
+
+ // Find the upsampled dimensions
+ int32_t out_x = (static_cast<int32_t>(input.dimension(idx_w)) - 1) * sx + 1;
+ int32_t out_y = (static_cast<int32_t>(input.dimension(idx_h)) - 1) * sy + 1;
+
+ // Find the padding needed for the convolution with stride 1 in order to match output shape
+ int32_t padx = out_dims.first - (out_x - static_cast<int32_t>(weights.dimension(idx_w)) + 1);
+ int32_t pady = out_dims.second - (out_y - static_cast<int32_t>(weights.dimension(idx_h)) + 1);
+
+ return std::make_pair(padx, pady);
+}
+
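A quick arithmetic check of the new helper, with hypothetical sizes: a 4x4 input upsampled with sx = 2 gives out_x = (4 - 1) * 2 + 1 = 7; for a 3x3 kernel and a requested output width of 8, padx = 8 - (7 - 3 + 1) = 3, so three columns of padding let the subsequent stride-1 convolution produce the requested width (7 + 3 - 3 + 1 = 8).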
/** Calculate the upsampled output shape used for deconvolution
*
* @param[in] input Input tensor info
@@ -484,20 +509,28 @@ inline TensorShape compute_depthwise_convolution_shape(const ITensorInfo &input,
*
* @return the calculated shape
*/
-inline TensorShape compute_deconvolution_upsampled_shape(const ITensorInfo &input, const ITensorInfo &weights, unsigned int sx, unsigned int sy,
- std::pair<unsigned int, unsigned int> &out_dims, uint32_t &padx, uint32_t &pady)
+inline TensorShape compute_deconvolution_upsampled_shape(const ITensorInfo &input,
+ const ITensorInfo &weights,
+ unsigned int sx,
+ unsigned int sy,
+ std::pair<unsigned int, unsigned int> &out_dims,
+ uint32_t &padx,
+ uint32_t &pady)
{
+ // Find the padding needed for the convolution with stride 1 in order to match output shape
+ const auto padxy =
+ compute_deconvolution_padding(input, weights, static_cast<int32_t>(sx), static_cast<int32_t>(sy), out_dims);
+ padx = static_cast<uint32_t>(padxy.first);
+ pady = static_cast<uint32_t>(padxy.second);
+
const DataLayout data_layout = input.data_layout();
const size_t idx_w = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
const size_t idx_h = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
// Find the upsampled dimensions
- unsigned int out_x = (input.dimension(idx_w) - 1) * sx + 1;
- unsigned int out_y = (input.dimension(idx_h) - 1) * sy + 1;
+ uint32_t out_x = (input.dimension(idx_w) - 1) * sx + 1;
+ uint32_t out_y = (input.dimension(idx_h) - 1) * sy + 1;
- // Find the padding needed for the convolution with stride 1 in order to match output shape
- padx = out_dims.first - (out_x - weights.dimension(idx_w) + 1);
- pady = out_dims.second - (out_y - weights.dimension(idx_h) + 1);
out_x += padx;
out_y += pady;
@@ -516,10 +549,12 @@ inline TensorShape compute_deconvolution_upsampled_shape(const ITensorInfo &inpu
*
* @return the calculated shape
*/
-inline TensorShape compute_deconvolution_output_shape(const std::pair<unsigned int, unsigned int> &out_dims, const ITensorInfo &input, const ITensorInfo &weights)
+inline TensorShape compute_deconvolution_output_shape(const std::pair<unsigned int, unsigned int> &out_dims,
+ const ITensorInfo &input,
+ const ITensorInfo &weights)
{
- const TensorShape input_shape{ input.tensor_shape() };
- const TensorShape weights_shape{ weights.tensor_shape() };
+ const TensorShape input_shape{input.tensor_shape()};
+ const TensorShape weights_shape{weights.tensor_shape()};
const DataLayout data_layout = input.data_layout();
const int width_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
@@ -527,7 +562,7 @@ inline TensorShape compute_deconvolution_output_shape(const std::pair<unsigned i
const int channel_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL);
const int batch_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::BATCHES);
- TensorShape out_shape{ input_shape };
+ TensorShape out_shape{input_shape};
out_shape.set(width_idx, out_dims.first);
out_shape.set(height_idx, out_dims.second);
out_shape.set(channel_idx, weights_shape[batch_idx]);
@@ -543,11 +578,18 @@ inline TensorShape compute_deconvolution_output_shape(const std::pair<unsigned i
* @param[in] dilation Dilation, in elements, across x and y
* @param[in] batch_size_on_z True if batch size is on z axis
* @param[in] num_groups (Optional) Number of groups when performing a grouped convolution
* @param[in] input_pad_right (Optional) When fast-math is selected, per-element padding for the im2col matrix may be necessary
*
* @return the calculated shape
*/
-inline TensorShape compute_im2col_conv_shape(const ITensorInfo *input, const Size2D &kernel_dims, const PadStrideInfo &conv_info, bool has_bias, const Size2D &dilation, bool batch_size_on_z,
- unsigned int num_groups = 1)
+inline TensorShape compute_im2col_conv_shape(const ITensorInfo *input,
+ const Size2D &kernel_dims,
+ const PadStrideInfo &conv_info,
+ bool has_bias,
+ const Size2D &dilation,
+ bool batch_size_on_z,
+ unsigned int num_groups = 1,
+ unsigned int input_pad_right = 0)
{
// The output shape will be the 3D shape [ out_channels * kernel_area, num_elems_per_out_channel, batches ] if batch_size_on_z == true
// or the 4D shape [ out_channels * kernel_area / num_groups, num_elems_per_out_channel, num_groups, batches ] if batch_size_on_z == false
@@ -556,17 +598,19 @@ inline TensorShape compute_im2col_conv_shape(const ITensorInfo *input, const Siz
ARM_COMPUTE_ERROR_ON(num_groups > 1 && input->data_layout() != DataLayout::NCHW);
ARM_COMPUTE_ERROR_ON(num_groups > 1 && batch_size_on_z);
- TensorShape output_shape{ input->tensor_shape() };
+ TensorShape output_shape{input->tensor_shape()};
const DataLayout data_layout = input->data_layout();
const int width_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
const int height_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
const int channel_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL);
- std::pair<unsigned int, unsigned int> out_dims = scaled_dimensions(output_shape[width_idx], output_shape[height_idx], kernel_dims.width, kernel_dims.height, conv_info, dilation);
- output_shape.set(0, (output_shape[channel_idx] / num_groups * kernel_dims.area() + (has_bias ? 1 : 0))); // NOLINT
+ std::pair<unsigned int, unsigned int> out_dims = scaled_dimensions(
+ output_shape[width_idx], output_shape[height_idx], kernel_dims.width, kernel_dims.height, conv_info, dilation);
+ output_shape.set(0, ((output_shape[channel_idx] + input_pad_right) / num_groups * kernel_dims.area() +
+ (has_bias ? 1 : 0))); // NOLINT
output_shape.set(1, (out_dims.first * out_dims.second));
- if(batch_size_on_z && output_shape.num_dimensions() >= 3)
+ if (batch_size_on_z && output_shape.num_dimensions() >= 3)
{
output_shape.remove_dimension(2);
}
@@ -588,7 +632,7 @@ inline TensorShape compute_flatten_shape(const ITensorInfo *input)
{
// The output shape will be the flattened version of the input (i.e. [ width * height * channels, num_batches, ... ] ). Used for FlattenLayer and FullyConnectedLayer.
- TensorShape output_shape{ input->tensor_shape() };
+ TensorShape output_shape{input->tensor_shape()};
output_shape.collapse(3);
@@ -610,7 +654,7 @@ inline TensorShape compute_softmax_shape(const ITensorInfo *input, size_t axis =
// - [x,y,z,w] and axis 3 will return [x*y*z, w]
TensorShape shape2D = input->tensor_shape();
- if(axis < input->num_dimensions())
+ if (axis < input->num_dimensions())
{
// Collapse from axis onward (this changes the shape)
shape2D.collapse_from(axis);
@@ -624,7 +668,7 @@ inline TensorShape compute_softmax_shape(const ITensorInfo *input, size_t axis =
shape2D.collapse(shape2D.num_dimensions());
}
- if(axis == 0)
+ if (axis == 0)
{
// If axis is zero the first dim should be one. Since
// collapse is an inclusive operation we need to shift
@@ -643,15 +687,17 @@ inline TensorShape compute_softmax_shape(const ITensorInfo *input, size_t axis =
*/
inline TensorShape compute_winograd_filter_transform_shape(const ITensorInfo &input, const WinogradInfo &winograd_info)
{
- TensorShape tensor_shape{ input.tensor_shape() };
+ TensorShape tensor_shape{input.tensor_shape()};
const Size2D kernel_size = winograd_info.kernel_size;
const Size2D output_tile_size = winograd_info.output_tile_size;
- const Size2D input_tile_size = Size2D(output_tile_size.width + kernel_size.width - 1, output_tile_size.height + kernel_size.height - 1);
+ const Size2D input_tile_size =
+ Size2D(output_tile_size.width + kernel_size.width - 1, output_tile_size.height + kernel_size.height - 1);
tensor_shape.remove_dimension(get_data_layout_dimension_index(input.data_layout(), DataLayoutDimension::WIDTH));
tensor_shape.set(Window::DimX, input.dimension(3));
- tensor_shape.set(Window::DimY, input.dimension(get_data_layout_dimension_index(input.data_layout(), DataLayoutDimension::CHANNEL)));
+ tensor_shape.set(Window::DimY, input.dimension(get_data_layout_dimension_index(input.data_layout(),
+ DataLayoutDimension::CHANNEL)));
tensor_shape.set(Window::DimZ, input_tile_size.area());
return tensor_shape;
@@ -669,23 +715,22 @@ inline TensorShape compute_winograd_input_transform_shape(const ITensorInfo &inp
const PadStrideInfo conv_info = winograd_info.convolution_info;
const Size2D kernel_size = winograd_info.kernel_size;
const Size2D output_tile_size = winograd_info.output_tile_size;
- const Size2D input_tile_size = Size2D(output_tile_size.width + kernel_size.width - 1, output_tile_size.height + kernel_size.height - 1);
+ const Size2D input_tile_size =
+ Size2D(output_tile_size.width + kernel_size.width - 1, output_tile_size.height + kernel_size.height - 1);
const size_t idx_w = get_data_layout_dimension_index(input.data_layout(), DataLayoutDimension::WIDTH);
const size_t idx_h = get_data_layout_dimension_index(input.data_layout(), DataLayoutDimension::HEIGHT);
const size_t idx_c = get_data_layout_dimension_index(input.data_layout(), DataLayoutDimension::CHANNEL);
// Compute the number of output tiles along the x and y direction of size "output_tile_size"
- const Size2D num_tiles = compute_winograd_convolution_tiles(Size2D(input.tensor_shape()[idx_w], input.tensor_shape()[idx_h]),
- kernel_size,
- output_tile_size,
- conv_info);
+ const Size2D num_tiles = compute_winograd_convolution_tiles(
+ Size2D(input.tensor_shape()[idx_w], input.tensor_shape()[idx_h]), kernel_size, output_tile_size, conv_info);
const unsigned int width = input.tensor_shape()[idx_c];
const unsigned int height = num_tiles.area();
const unsigned int depth = input_tile_size.area();
- TensorShape output_shape{ input.tensor_shape() };
+ TensorShape output_shape{input.tensor_shape()};
output_shape.set(0, width);
output_shape.set(1, height);
output_shape.set(2, depth);
@@ -708,12 +753,12 @@ inline TensorShape compute_winograd_output_transform_shape(const ITensorInfo &in
const DataLayout data_layout = winograd_info.output_data_layout;
// Compute output shape
- unsigned int output_width = 0;
- unsigned int output_height = 0;
+ unsigned int output_width = 0;
+ unsigned int output_height = 0;
std::tie(output_width, output_height) = scaled_dimensions(input_dimensions.width, input_dimensions.height,
kernel_size.width, kernel_size.height, conv_info);
- TensorShape tensor_shape{ input.tensor_shape() };
+ TensorShape tensor_shape{input.tensor_shape()};
// Output dimension
const unsigned int out_w = output_width;
@@ -729,20 +774,21 @@ inline TensorShape compute_winograd_output_transform_shape(const ITensorInfo &in
/** Calculate the deep convolution output shape of a tensor
*
- * @param[in] input Input tensor info
- * @param[in] weights Weights tensor info
- * @param[in] conv_info Contains padding and stride information
+ * @param[in] input_shape Input tensor shape
+ * @param[in] input_data_layout Input data layout
+ * @param[in] weights_shape Weights tensor shape
+ * @param[in] conv_info Contains padding and stride information
*
* @return the calculated shape
*/
-inline TensorShape compute_deep_convolution_shape(const ITensorInfo &input, const ITensorInfo &weights, PadStrideInfo conv_info)
+inline TensorShape compute_deep_convolution_shape(const TensorShape &input_shape,
+ DataLayout input_data_layout,
+ const TensorShape &weights_shape,
+ const PadStrideInfo &conv_info)
{
- const TensorShape input_shape{ input.tensor_shape() };
- const TensorShape weights_shape{ weights.tensor_shape() };
-
- const size_t idx_width = get_data_layout_dimension_index(input.data_layout(), DataLayoutDimension::WIDTH);
- const size_t idx_height = get_data_layout_dimension_index(input.data_layout(), DataLayoutDimension::HEIGHT);
- const size_t idx_channel = get_data_layout_dimension_index(input.data_layout(), DataLayoutDimension::CHANNEL);
+ const size_t idx_width = get_data_layout_dimension_index(input_data_layout, DataLayoutDimension::WIDTH);
+ const size_t idx_height = get_data_layout_dimension_index(input_data_layout, DataLayoutDimension::HEIGHT);
+ const size_t idx_channel = get_data_layout_dimension_index(input_data_layout, DataLayoutDimension::CHANNEL);
const unsigned int input_width = input_shape[idx_width];
const unsigned int input_height = input_shape[idx_height];
@@ -751,9 +797,10 @@ inline TensorShape compute_deep_convolution_shape(const ITensorInfo &input, cons
const unsigned int weights_out_channel = weights_shape[3];
unsigned int output_width = 0;
unsigned int output_height = 0;
- std::tie(output_width, output_height) = scaled_dimensions(input_width, input_height, weights_width, weights_height, conv_info);
+ std::tie(output_width, output_height) =
+ scaled_dimensions(input_width, input_height, weights_width, weights_height, conv_info);
- TensorShape output_shape{ input_shape };
+ TensorShape output_shape{input_shape};
output_shape.set(idx_width, output_width);
output_shape.set(idx_height, output_height);
output_shape.set(idx_channel, weights_out_channel);
@@ -761,6 +808,53 @@ inline TensorShape compute_deep_convolution_shape(const ITensorInfo &input, cons
return output_shape;
}
+/** Calculate the deep convolution output shape of a tensor
+ *
+ * @param[in] input Input tensor info
+ * @param[in] weights Weights tensor info
+ * @param[in] conv_info Contains padding and stride information
+ *
+ * @return the calculated shape
+ */
+inline TensorShape
+compute_deep_convolution_shape(const ITensorInfo &input, const ITensorInfo &weights, const PadStrideInfo &conv_info)
+{
+ return compute_deep_convolution_shape(input.tensor_shape(), input.data_layout(), weights.tensor_shape(), conv_info);
+}
+
+/** Calculate the indirect buffer output shape used by the indirect convolution function
+ *
+ * @param[in] input_shape Input tensor shape
+ * @param[in] input_data_layout Input data layout
+ * @param[in] weights_shape Weights tensor shape
+ * @param[in] conv_info Contains padding and stride information
+ * @param[in] desc Contains the direct/indirect convolution compute arguments, such as the tiling dimensions
+ *
+ * @return the calculated shape
+ */
+inline TensorShape compute_indirect_buffer_shape(const TensorShape &input_shape,
+ DataLayout input_data_layout,
+ const TensorShape &weights_shape,
+ const PadStrideInfo &conv_info,
+ const DirectConvComputeKernelInfo &desc)
+{
+ ARM_COMPUTE_ERROR_ON_MSG(input_data_layout != DataLayout::NHWC, "The data layout can only be NHWC");
+ ARM_COMPUTE_ERROR_ON_MSG(desc.m0 <= 0 || desc.m0 > 8, "M0 can only be greater than 0 and less than or equal to 8");
+
+ const unsigned int m0 = desc.m0;
+ const unsigned int kw = weights_shape[1];
+ const unsigned int kh = weights_shape[2];
+
+ TensorShape output_conv2d_shape =
+ compute_deep_convolution_shape(input_shape, input_data_layout, weights_shape, conv_info);
+
+ const unsigned int output_w = m0 * kw * kh;
+ const unsigned int output_h = DIV_CEIL(output_conv2d_shape[1] * output_conv2d_shape[2], m0);
+ const unsigned int output_b = output_conv2d_shape[3];
+
+ return TensorShape(output_w, output_h, output_b);
+}
+
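As a concrete illustration of the arithmetic above, here is a minimal standalone sketch with hypothetical tile, kernel and output sizes (none of these numbers come from the library or its tests):

// Standalone sketch of the indirect-buffer shape arithmetic, assuming a
// hypothetical conv2d output of (C=16, W=8, H=8, N=2) and M0 = 4.
#include <cassert>

int main()
{
    const unsigned int m0 = 4;         // rows per tile, must be in [1, 8]
    const unsigned int kw = 3, kh = 3; // kernel width/height (weights_shape[1], weights_shape[2])
    const unsigned int out_w = 8, out_h = 8, out_b = 2;

    const unsigned int output_w = m0 * kw * kh;                  // 36
    const unsigned int output_h = (out_w * out_h + m0 - 1) / m0; // DIV_CEIL(64, 4) = 16
    const unsigned int output_b = out_b;                         // 2

    assert(output_w == 36 && output_h == 16 && output_b == 2);
    return 0;
}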
/** Calculate the min/max shape output shape of a tensor
*
* @param[in] input Input tensor info
@@ -769,7 +863,7 @@ inline TensorShape compute_deep_convolution_shape(const ITensorInfo &input, cons
*/
inline TensorShape compute_min_max_shape(const ITensorInfo *input)
{
- TensorShape output_shape{ input->tensor_shape() };
+ TensorShape output_shape{input->tensor_shape()};
output_shape.set(Window::DimX, 2);
output_shape.remove_dimension(1);
output_shape.remove_dimension(1);
@@ -786,29 +880,63 @@ inline TensorShape compute_min_max_shape(const ITensorInfo *input)
*/
inline TensorShape compute_pool_shape(const ITensorInfo &input, PoolingLayerInfo pool_info)
{
- unsigned int pooled_w = 0;
- unsigned int pooled_h = 0;
+ int pooled_w = 0;
+ int pooled_h = 0;
+
+ TensorShape output_shape{input.tensor_shape()};
- TensorShape output_shape{ input.tensor_shape() };
+ const bool is_global_pooling = pool_info.is_global_pooling;
+ const int idx_width = get_data_layout_dimension_index(input.data_layout(), DataLayoutDimension::WIDTH);
+ const int idx_height = get_data_layout_dimension_index(input.data_layout(), DataLayoutDimension::HEIGHT);
+ const int input_width = input.tensor_shape()[idx_width];
+ const int input_height = input.tensor_shape()[idx_height];
+ const int pool_size_x = is_global_pooling ? output_shape[idx_width] : pool_info.pool_size.width;
+ const int pool_size_y = is_global_pooling ? output_shape[idx_height] : pool_info.pool_size.height;
- const bool is_global_pooling = pool_info.is_global_pooling;
- const unsigned int idx_width = get_data_layout_dimension_index(input.data_layout(), DataLayoutDimension::WIDTH);
- const unsigned int idx_height = get_data_layout_dimension_index(input.data_layout(), DataLayoutDimension::HEIGHT);
- const unsigned int pool_size_x = is_global_pooling ? output_shape[idx_width] : pool_info.pool_size.width;
- const unsigned int pool_size_y = is_global_pooling ? output_shape[idx_height] : pool_info.pool_size.height;
+ std::tie(pooled_w, pooled_h) =
+ scaled_dimensions_signed(input_width, input_height, pool_size_x, pool_size_y, pool_info.pad_stride_info);
- std::tie(pooled_w, pooled_h) = scaled_dimensions(output_shape[idx_width],
- output_shape[idx_height],
- pool_size_x,
- pool_size_y,
- pool_info.pad_stride_info);
+ ARM_COMPUTE_ERROR_ON_MSG((pooled_w < 1 || pooled_h < 1), "Calculated output dimension size is invalid");
- output_shape.set(idx_width, pooled_w);
- output_shape.set(idx_height, pooled_h);
+ output_shape.set(idx_width, static_cast<size_t>(pooled_w));
+ output_shape.set(idx_height, static_cast<size_t>(pooled_h));
return output_shape;
}
+/** Calculate the output unpool shape of a tensor
+ *
+ * @param[in] input Input tensor info
+ * @param[in] pool_info Pooling layer info
+ *
+ * @return the calculated shape
+ */
+inline TensorShape compute_unpool_shape(const ITensorInfo &input, PoolingLayerInfo pool_info)
+{
+ const unsigned int idx_width = get_data_layout_dimension_index(input.data_layout(), DataLayoutDimension::WIDTH);
+ const unsigned int idx_height = get_data_layout_dimension_index(input.data_layout(), DataLayoutDimension::HEIGHT);
+ const TensorShape input_shape = input.tensor_shape();
+ ARM_COMPUTE_ERROR_ON(input_shape[idx_height] <= 1 || input_shape[idx_width] <= 1);
+ const PadStrideInfo pad_stride_info = pool_info.pad_stride_info;
+ const unsigned int stride_x = pad_stride_info.stride().first;
+ const unsigned int stride_y = pad_stride_info.stride().second;
+
+ const int pad_left = pad_stride_info.pad_left();
+ const int pad_top = pad_stride_info.pad_top();
+ const int pad_right = pad_stride_info.pad_right();
+ const int pad_bottom = pad_stride_info.pad_bottom();
+
+ TensorShape output_shape = input_shape;
+ const unsigned int out_width =
+ (input_shape[idx_width] - 1) * stride_x - pad_left - pad_right + pool_info.pool_size.width;
+ const unsigned int out_height =
+ (input_shape[idx_height] - 1) * stride_y - pad_top - pad_bottom + pool_info.pool_size.height;
+
+ output_shape.set(idx_width, out_width);
+ output_shape.set(idx_height, out_height);
+ return output_shape;
+}
+
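A standalone sketch of the unpooling formula above, under the hypothetical (and common) case of a 2x2 pool with stride 2 and no padding, i.e. the inverse of max pooling:

#include <cassert>

int main()
{
    const unsigned int in_w = 56, in_h = 56; // pooled input dimensions
    const unsigned int stride_x = 2, stride_y = 2;
    const int pad_left = 0, pad_right = 0, pad_top = 0, pad_bottom = 0;
    const unsigned int pool_w = 2, pool_h = 2;

    // out = (in - 1) * stride - pads + pool_size, per the function above
    const unsigned int out_w = (in_w - 1) * stride_x - pad_left - pad_right + pool_w;
    const unsigned int out_h = (in_h - 1) * stride_y - pad_top - pad_bottom + pool_h;

    assert(out_w == 112 && out_h == 112); // restores the pre-pooling 112x112 size
    return 0;
}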
/** Calculate the output roi align shape of a tensor
*
* @param[in] input Input tensor info
@@ -817,9 +945,10 @@ inline TensorShape compute_pool_shape(const ITensorInfo &input, PoolingLayerInfo
*
* @return the calculated shape
*/
-inline TensorShape compute_roi_align_shape(const ITensorInfo &input, const ITensorInfo &rois, ROIPoolingLayerInfo pool_info)
+inline TensorShape
+compute_roi_align_shape(const ITensorInfo &input, const ITensorInfo &rois, ROIPoolingLayerInfo pool_info)
{
- TensorShape output_shape{ input.tensor_shape() };
+ TensorShape output_shape{input.tensor_shape()};
const unsigned int idx_width = get_data_layout_dimension_index(input.data_layout(), DataLayoutDimension::WIDTH);
const unsigned int idx_height = get_data_layout_dimension_index(input.data_layout(), DataLayoutDimension::HEIGHT);
@@ -840,7 +969,7 @@ inline TensorShape compute_roi_align_shape(const ITensorInfo &input, const ITens
*/
inline TensorShape compute_rnn_shape(const ITensorInfo *input, const unsigned int batch_size)
{
- TensorShape output_shape{ input->tensor_shape() };
+ TensorShape output_shape{input->tensor_shape()};
output_shape.set(1, batch_size);
return output_shape;
@@ -855,15 +984,21 @@ inline TensorShape compute_rnn_shape(const ITensorInfo *input, const unsigned in
*
* @return the calculated shape
*/
-inline TensorShape compute_mm_shape(const ITensorInfo &input0, const ITensorInfo &input1, bool is_interleaved_transposed, const GEMMReshapeInfo &reshape_info)
+inline TensorShape compute_mm_shape(const ITensorInfo &input0,
+ const ITensorInfo &input1,
+ bool is_interleaved_transposed,
+ const GEMMReshapeInfo &reshape_info)
{
ARM_COMPUTE_ERROR_ON_MSG(input0.num_dimensions() > 4, "The number of dimensions for the matrix A must be <= 4");
- ARM_COMPUTE_ERROR_ON_MSG(is_interleaved_transposed && reshape_info.reinterpret_input_as_3d(), "The first input tensor cannot be reinterpreted as 3D if is_interleaved_transposed is true");
+ ARM_COMPUTE_ERROR_ON_MSG(
+ is_interleaved_transposed && reshape_info.reinterpret_input_as_3d(),
+ "The first input tensor cannot be reinterpreted as 3D if is_interleaved_transposed is true");
const bool reinterpret_input_as_3d = reshape_info.reinterpret_input_as_3d();
const bool reinterpret_output_as_3d = reshape_info.depth_output_gemm3d() != 0;
const int depth_output_gemm3d = reinterpret_output_as_3d ? reshape_info.depth_output_gemm3d() : 1;
- const int m = reshape_info.reinterpret_input_as_3d() ? input0.dimension(1) * input0.dimension(2) : input0.dimension(1);
+ const int m =
+ reshape_info.reinterpret_input_as_3d() ? input0.dimension(1) * input0.dimension(2) : input0.dimension(1);
// If the output of GEMM has to be reinterpreted as 3D, the number of input0 rows (M) is obtained collapsing the second and third
// dimension of the output tensor
@@ -872,7 +1007,7 @@ inline TensorShape compute_mm_shape(const ITensorInfo &input0, const ITensorInfo
const int dim2 = reinterpret_input_as_3d ? input0.tensor_shape()[3] : input0.tensor_shape()[2];
const int dim3 = reinterpret_input_as_3d ? 1 : input0.tensor_shape()[3];
- TensorShape output_shape{ input0.tensor_shape() };
+ TensorShape output_shape{input0.tensor_shape()};
output_shape.set(0, dim0);
output_shape.set(1, dim1);
@@ -885,15 +1020,14 @@ inline TensorShape compute_mm_shape(const ITensorInfo &input0, const ITensorInfo
/** Calculate the matrix multiplication output shape of two tensors
*
- * @note Deprecated. Remove when GEMMReshapeInfo is not used anymore by any other kernels
- *
* @param[in] input0 First input tensor info
* @param[in] input1 Second input tensor info
* @param[in] gemm_info GEMM reshape info
*
* @return the calculated shape
*/
-inline TensorShape compute_mm_shape(const ITensorInfo &input0, const ITensorInfo &input1, const GEMMReshapeInfo &gemm_info)
+inline TensorShape
+compute_mm_shape(const ITensorInfo &input0, const ITensorInfo &input1, const GEMMReshapeInfo &gemm_info)
{
ARM_COMPUTE_UNUSED(input1);
ARM_COMPUTE_ERROR_ON_MSG(input0.num_dimensions() > 4, "The number of dimensions for the matrix A must be <= 4");
@@ -902,9 +1036,9 @@ inline TensorShape compute_mm_shape(const ITensorInfo &input0, const ITensorInfo
const bool reinterpret_output_as_3d = gemm_info.depth_output_gemm3d() != 0;
const int depth_output_gemm3d = reinterpret_output_as_3d ? gemm_info.depth_output_gemm3d() : 1;
- TensorShape output_shape{ input0.tensor_shape() };
+ TensorShape output_shape{input0.tensor_shape()};
- if(!reinterpret_input_as_3d && !reinterpret_output_as_3d)
+ if (!reinterpret_input_as_3d && !reinterpret_output_as_3d)
{
output_shape.set(0, gemm_info.n());
output_shape.set(1, gemm_info.m());
@@ -931,7 +1065,8 @@ inline TensorShape compute_mm_shape(const ITensorInfo &input0, const ITensorInfo
*
* @return the calculated shape
*/
-inline TensorShape compute_mm_shape(const ITensorInfo &input0, const ITensorInfo &input1, const GEMMKernelInfo &gemm_info)
+inline TensorShape
+compute_mm_shape(const ITensorInfo &input0, const ITensorInfo &input1, const GEMMKernelInfo &gemm_info)
{
ARM_COMPUTE_UNUSED(input1);
ARM_COMPUTE_ERROR_ON_MSG(input0.num_dimensions() > 4, "The number of dimensions for the matrix A must be <= 4");
@@ -940,9 +1075,9 @@ inline TensorShape compute_mm_shape(const ITensorInfo &input0, const ITensorInfo
const bool reinterpret_output_as_3d = gemm_info.depth_output_gemm3d != 0;
const unsigned int depth_output_gemm3d = reinterpret_output_as_3d ? gemm_info.depth_output_gemm3d : 1;
- TensorShape output_shape{ input0.tensor_shape() };
+ TensorShape output_shape{input0.tensor_shape()};
- if(!reinterpret_input_as_3d && !reinterpret_output_as_3d)
+ if (!reinterpret_input_as_3d && !reinterpret_output_as_3d)
{
output_shape.set(0, gemm_info.n);
output_shape.set(1, gemm_info.m);
@@ -963,20 +1098,50 @@ inline TensorShape compute_mm_shape(const ITensorInfo &input0, const ITensorInfo
/** Calculate the matrix multiplication output shape of two tensors
*
+ * @param[in] input0 First input tensor info
+ * @param[in] input1 Second input tensor info
+ * @param[in] matmul_info Batch MatMul kernel info indicating which operands are transposed
+ *
+ * @return the calculated shape
+ */
+inline TensorShape
+compute_matmul_shape(const TensorShape &input0, const TensorShape &input1, const MatMulKernelInfo &matmul_info)
+{
+ TensorShape output_shape{input0};
+
+ if (matmul_info.adj_lhs)
+ {
+ output_shape.set(1, input0[0]); // The vertical (M) dimension
+ }
+
+ if (matmul_info.adj_rhs)
+ {
+ output_shape.set(0, input1[1]); // The horizontal (N) dimension
+ }
+ else
+ {
+ output_shape.set(0, input1[0]); // The horizontal (N) dimension
+ }
+
+ return output_shape;
+}
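The adj_lhs/adj_rhs selection above can be checked with a standalone sketch; the shapes below are hypothetical, and dimensions are stored lowest-first as in TensorShape:

#include <cassert>

int main()
{
    // LHS is M x K = 10 x 32, stored lowest dimension first: input0 = (32, 10).
    // RHS is K x N = 32 x 20, stored lowest dimension first: input1 = (20, 32).
    const unsigned int input0[2] = {32, 10};
    const unsigned int input1[2] = {20, 32};
    const bool adj_lhs = false, adj_rhs = false;

    const unsigned int out_m = adj_lhs ? input0[0] : input0[1]; // vertical (M)
    const unsigned int out_n = adj_rhs ? input1[1] : input1[0]; // horizontal (N)

    assert(out_n == 20 && out_m == 10); // output stores (N, M) = (20, 10)
    return 0;
}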
+/** Calculate the output stage shape of a tensor
+ *
* @param[in] input Input tensor info
* @param[in] gemm_3d_depth (Optional) GEMM 3d depth
* @param[in] batch_size_on_z (Optional) True if batch size is on z axis
*
* @return the calculated shape
*/
-inline TensorShape compute_output_stage_shape(const ITensorInfo &input, unsigned int gemm_3d_depth = 1, bool batch_size_on_z = false)
+inline TensorShape
+compute_output_stage_shape(const ITensorInfo &input, unsigned int gemm_3d_depth = 1, bool batch_size_on_z = false)
{
ARM_COMPUTE_ERROR_ON(input.data_layout() != DataLayout::NHWC && gemm_3d_depth > 1);
TensorShape output_shape = input.tensor_shape();
- if(gemm_3d_depth > 1)
+ if (gemm_3d_depth > 1)
{
- if(batch_size_on_z)
+ if (batch_size_on_z)
{
output_shape.shift_right(1);
}
@@ -1001,11 +1166,16 @@ inline TensorShape compute_output_stage_shape(const ITensorInfo &input, unsigned
* @return the calculated shape
*/
inline TensorShape compute_strided_slice_shape(const ITensorInfo &input,
- const Coordinates &starts, const Coordinates &ends, const Coordinates &strides,
- int32_t begin_mask, int32_t end_mask, int32_t shrink_axis_mask)
+ const Coordinates &starts,
+ const Coordinates &ends,
+ const Coordinates &strides,
+ int32_t begin_mask,
+ int32_t end_mask,
+ int32_t shrink_axis_mask)
{
using namespace arm_compute::helpers::tensor_transform;
- return compute_strided_slice_output_shape(input.tensor_shape(), starts, ends, strides, begin_mask, end_mask, shrink_axis_mask);
+ return compute_strided_slice_output_shape(input.tensor_shape(), starts, ends, strides, begin_mask, end_mask,
+ shrink_axis_mask);
}
/** Calculate the slice output shape of a tensor
@@ -1016,60 +1186,72 @@ inline TensorShape compute_strided_slice_shape(const ITensorInfo &input,
*
* @return the calculated shape
*/
-inline TensorShape compute_slice_shape(const TensorShape &input_shape, const Coordinates &starts, const Coordinates &ends)
+inline TensorShape
+compute_slice_shape(const TensorShape &input_shape, const Coordinates &starts, const Coordinates &ends)
{
using namespace arm_compute::helpers::tensor_transform;
- return compute_strided_slice_output_shape(input_shape,
- starts, ends, BiStrides(),
- 0, construct_slice_end_mask(ends), 0);
+ return compute_strided_slice_output_shape(input_shape, starts, ends, BiStrides(), 0, construct_slice_end_mask(ends),
+ 0);
}
/** Calculate the batch to space output shape of a tensor
*
- * @param[in] input Input tensor info
- * @param[in] block_x Block shape x value
- * @param[in] block_y Block shape y value
+ * @param[in] data_layout Data layout
+ * @param[in] input Input tensor shape
+ * @param[in] block_x Block shape x value
+ * @param[in] block_y Block shape y value
+ * @param[in] crop_info Information about how the output shape is cropped after batch to space is performed
*
* @return the calculated shape
*/
-inline TensorShape compute_batch_to_space_shape(const ITensorInfo *input, const int block_x, const int block_y)
+inline TensorShape compute_batch_to_space_shape(
+ DataLayout data_layout, const TensorShape &input, int block_x, int block_y, const CropInfo &crop_info = CropInfo{})
{
- ARM_COMPUTE_ERROR_ON(block_x <= 0 || block_y <= 0);
+ ARM_COMPUTE_ERROR_ON(block_x < 1 || block_y < 1);
- const DataLayout data_layout = input->data_layout();
- const int idx_width = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
- const int idx_height = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
- const int idx_batch = get_data_layout_dimension_index(data_layout, DataLayoutDimension::BATCHES);
+ const int idx_width = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
+ const int idx_height = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
+ const int idx_batch = get_data_layout_dimension_index(data_layout, DataLayoutDimension::BATCHES);
+
+ TensorShape output_shape{input};
+
+ unsigned int new_width = input[idx_width] * static_cast<unsigned int>(block_x);
+ unsigned int new_height = input[idx_height] * static_cast<unsigned int>(block_y);
+ const unsigned int width_crop = crop_info.left + crop_info.right;
+ const unsigned int height_crop = crop_info.top + crop_info.bottom;
+ ARM_COMPUTE_ERROR_ON(new_width <= width_crop);
+ ARM_COMPUTE_ERROR_ON(new_height <= height_crop);
+ new_width -= width_crop;
+ new_height -= height_crop;
- TensorShape output_shape{ input->tensor_shape() };
- output_shape.set(idx_width, input->tensor_shape()[idx_width] * block_x);
- output_shape.set(idx_height, input->tensor_shape()[idx_height] * block_y);
- output_shape.set(idx_batch, input->tensor_shape()[idx_batch] / (block_x * block_y));
+ output_shape.set(idx_width, new_width);
+ output_shape.set(idx_height, new_height);
+ output_shape.set(idx_batch, input[idx_batch] / (block_x * block_y));
return output_shape;
}
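A standalone sketch of the batch-to-space arithmetic above, using hypothetical NHWC dimensions and a width-only crop:

#include <cassert>

int main()
{
    const unsigned int in_w = 4, in_h = 4, in_n = 8;
    const int block_x = 2, block_y = 2;
    const unsigned int crop_left = 1, crop_right = 1, crop_top = 0, crop_bottom = 0;

    unsigned int new_w = in_w * block_x; // 8
    unsigned int new_h = in_h * block_y; // 8
    assert(new_w > crop_left + crop_right && new_h > crop_top + crop_bottom);
    new_w -= crop_left + crop_right;                        // 6
    new_h -= crop_top + crop_bottom;                        // 8
    const unsigned int new_n = in_n / (block_x * block_y);  // 2

    assert(new_w == 6 && new_h == 8 && new_n == 2);
    return 0;
}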
/** Calculate the depth to space output shape of a tensor
*
- * @param[in] input Input tensor info
- * @param[in] block Block shape value
+ * @param[in] input_shape Input tensor shape
+ * @param[in] data_layout Operation data layout
+ * @param[in] block Block shape value
*
* @return the calculated shape
*/
-inline TensorShape compute_depth_to_space_shape(const ITensorInfo *input, int block)
+inline TensorShape compute_depth_to_space_shape(const TensorShape &input_shape, DataLayout data_layout, int block)
{
ARM_COMPUTE_ERROR_ON(block < 2);
- const DataLayout data_layout = input->data_layout();
- const int idx_width = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
- const int idx_height = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
- const int idx_channel = get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL);
+ const int idx_width = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
+ const int idx_height = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
+ const int idx_channel = get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL);
- TensorShape output_shape{ input->tensor_shape() };
- output_shape.set(idx_width, input->dimension(idx_width) * block);
- output_shape.set(idx_height, input->dimension(idx_height) * block);
- output_shape.set(idx_channel, input->dimension(idx_channel) / (block * block));
+ TensorShape output_shape{input_shape};
+ output_shape.set(idx_width, input_shape[idx_width] * block);
+ output_shape.set(idx_height, input_shape[idx_height] * block);
+ output_shape.set(idx_channel, input_shape[idx_channel] / (block * block));
return output_shape;
}
@@ -1087,10 +1269,10 @@ inline TensorShape compute_split_shape(const ITensorInfo *input, unsigned int ax
TensorShape empty_shape;
empty_shape.set(0, 0);
- TensorShape out_shape{ input->tensor_shape() };
+ TensorShape out_shape{input->tensor_shape()};
// Return empty shape if axis is invalid
- if(axis > input->tensor_shape().num_dimensions())
+ if (axis > input->tensor_shape().num_dimensions())
{
return empty_shape;
}
@@ -1098,7 +1280,7 @@ inline TensorShape compute_split_shape(const ITensorInfo *input, unsigned int ax
size_t axis_size = out_shape[axis];
// Return empty shape if num_split is not valid
- if(axis_size % num_splits)
+ if (axis_size % num_splits)
{
return empty_shape;
}
@@ -1117,18 +1299,22 @@ inline TensorShape compute_split_shape(const ITensorInfo *input, unsigned int ax
*
* @return the calculated shape
*/
-inline TensorShape compute_space_to_batch_shape(const ITensorInfo *input, const int block_x, const int block_y, const Size2D &padding_left, const Size2D &padding_right)
+inline TensorShape compute_space_to_batch_shape(
+ const ITensorInfo *input, int block_x, int block_y, const Size2D &padding_left, const Size2D &padding_right)
{
- TensorShape output_shape{ input->tensor_shape() };
+ TensorShape output_shape{input->tensor_shape()};
const DataLayout data_layout = input->data_layout();
const int idx_width = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
const int idx_height = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
const int idx_batch = get_data_layout_dimension_index(data_layout, DataLayoutDimension::BATCHES);
- output_shape.set(idx_width, input->tensor_shape()[idx_width] * block_x + padding_left.x() + padding_right.x());
- output_shape.set(idx_height, input->tensor_shape()[idx_height] * block_y + padding_left.y() + padding_right.y());
- output_shape.set(idx_batch, input->tensor_shape()[idx_batch] / (block_x * block_y));
+ ARM_COMPUTE_ERROR_ON((input->tensor_shape()[idx_width] + padding_left.x() + padding_right.x()) % block_x != 0);
+ ARM_COMPUTE_ERROR_ON((input->tensor_shape()[idx_height] + padding_left.y() + padding_right.y()) % block_y != 0);
+
+ output_shape.set(idx_width, (input->tensor_shape()[idx_width] + padding_left.x() + padding_right.x()) / block_x);
+ output_shape.set(idx_height, (input->tensor_shape()[idx_height] + padding_left.y() + padding_right.y()) / block_y);
+ output_shape.set(idx_batch, input->tensor_shape()[idx_batch] * block_x * block_y);
return output_shape;
}
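Space-to-batch is the inverse mapping: the padded spatial dimensions are divided by the block sizes and the batch is multiplied by their product. A standalone sketch with hypothetical values:

#include <cassert>

int main()
{
    const unsigned int in_w = 6, in_h = 8, in_n = 2;
    const int block_x = 2, block_y = 2;
    const unsigned int pad_l = 1, pad_r = 1, pad_t = 0, pad_b = 0;

    // Padded spatial dimensions must be divisible by the block sizes.
    assert((in_w + pad_l + pad_r) % block_x == 0);
    assert((in_h + pad_t + pad_b) % block_y == 0);

    const unsigned int out_w = (in_w + pad_l + pad_r) / block_x; // 4
    const unsigned int out_h = (in_h + pad_t + pad_b) / block_y; // 4
    const unsigned int out_n = in_n * block_x * block_y;         // 8

    assert(out_w == 4 && out_h == 4 && out_n == 8);
    return 0;
}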
@@ -1142,16 +1328,16 @@ inline TensorShape compute_space_to_batch_shape(const ITensorInfo *input, const
*/
inline TensorShape compute_space_to_depth_shape(const ITensorInfo *input, int32_t block_shape)
{
- TensorShape output_shape{ input->tensor_shape() };
+ TensorShape output_shape{input->tensor_shape()};
const DataLayout data_layout = input->data_layout();
const int idx_width = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
const int idx_height = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
const int idx_depth = get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL);
- output_shape.set(idx_width, input->tensor_shape()[idx_width] * block_shape);
- output_shape.set(idx_height, input->tensor_shape()[idx_height] * block_shape);
- output_shape.set(idx_depth, input->tensor_shape()[idx_depth] / (block_shape * block_shape));
+ output_shape.set(idx_width, input->tensor_shape()[idx_width] / block_shape);
+ output_shape.set(idx_height, input->tensor_shape()[idx_height] / block_shape);
+ output_shape.set(idx_depth, input->tensor_shape()[idx_depth] * (block_shape * block_shape));
return output_shape;
}
@@ -1187,7 +1373,7 @@ inline TensorShape compute_prior_box_shape(const ITensorInfo &input, const Prior
inline TensorShape compute_padded_shape(const TensorShape &input_shape, const PaddingList &padding)
{
TensorShape padded_shape = input_shape;
- for(size_t dim = 0; dim < padding.size(); ++dim)
+ for (size_t dim = 0; dim < padding.size(); ++dim)
{
const auto &padding_pair = padding[dim];
const uint32_t shape_on_index = (padded_shape.num_dimensions() <= dim) ? 1 : input_shape[dim];
@@ -1206,7 +1392,7 @@ inline TensorShape compute_padded_shape(const TensorShape &input_shape, const Pa
inline TensorShape compute_tiled_shape(const TensorShape &input_shape, const Multiples &multiples)
{
TensorShape tiled_shape = input_shape;
- for(size_t dim = 0; dim < multiples.size(); ++dim)
+ for (size_t dim = 0; dim < multiples.size(); ++dim)
{
tiled_shape.set(dim, input_shape[dim] * multiples[dim]);
}
@@ -1223,9 +1409,9 @@ inline TensorShape compute_tiled_shape(const TensorShape &input_shape, const Mul
*/
inline TensorShape compute_reduced_shape(const TensorShape &input, unsigned int axis, bool keep_dims = true)
{
- TensorShape output_shape{ input };
+ TensorShape output_shape{input};
- if(!keep_dims)
+ if (!keep_dims)
{
output_shape.remove_dimension(axis);
}
@@ -1318,14 +1504,14 @@ inline TensorShape calculate_concatenate_shape(const std::vector<T *> &input, si
#if defined(ARM_COMPUTE_ASSERTS_ENABLED)
// All dimensions must match except the axis one
- for(unsigned int i = 0; i < MAX_DIMS; ++i)
+ for (unsigned int i = 0; i < MAX_DIMS; ++i)
{
- if(i == axis)
+ if (i == axis)
{
continue;
}
- for(const auto &tensor : input)
+ for (const auto &tensor : input)
{
ARM_COMPUTE_ERROR_ON(tensor == nullptr);
const TensorShape shape = extract_shape(tensor);
@@ -1336,7 +1522,7 @@ inline TensorShape calculate_concatenate_shape(const std::vector<T *> &input, si
// Calculate output shape
size_t new_size = 0;
- for(const auto &tensor : input)
+ for (const auto &tensor : input)
{
const TensorShape shape = extract_shape(tensor);
new_size += shape[axis];
@@ -1359,14 +1545,14 @@ inline TensorShape compute_stack_shape(const ITensorInfo &a, unsigned int axis,
ARM_COMPUTE_ERROR_ON(axis > a.num_dimensions());
ARM_COMPUTE_ERROR_ON(a.num_dimensions() > 4);
- TensorShape shape_out{ a.tensor_shape() };
+ TensorShape shape_out{a.tensor_shape()};
shape_out.set(axis, num_tensors);
unsigned int i_shift = 0;
- for(unsigned int i = 0; i < a.num_dimensions(); ++i)
+ for (unsigned int i = 0; i < a.num_dimensions(); ++i)
{
- if(i == axis)
+ if (i == axis)
{
i_shift++;
}
@@ -1376,18 +1562,177 @@ inline TensorShape compute_stack_shape(const ITensorInfo &a, unsigned int axis,
return shape_out;
}
-inline TensorShape compute_gather_shape(const TensorShape &input_shape, const TensorShape &indices_shape, uint32_t actual_axis)
+/** Calculate the output shape of a 3D convolution
+ *
+ * @param[in] src         Input tensor shape
+ * @param[in] weights     Weights tensor shape
+ * @param[in] conv3d_info 3D convolution parameters object
+ *
+ * @return the calculated shape
+ */
+inline TensorShape
+compute_conv3d_shape(const TensorShape &src, const TensorShape &weights, const Conv3dInfo &conv3d_info)
+{
+ // Weight tensor shape indices (D H W Cin Cout)
+ constexpr unsigned int weights_depth_dim = 4u;
+ constexpr unsigned int weights_height_dim = 3u;
+ constexpr unsigned int weights_width_dim = 2u;
+ constexpr unsigned int weights_CHout_dim = 0u;
+
+ // Source/Destination Tensor shape indices (N D H W C)
+ constexpr unsigned int batch_dim = 4u;
+ constexpr unsigned int depth_dim = 3u;
+ constexpr unsigned int height_dim = 2u;
+ constexpr unsigned int width_dim = 1u;
+ constexpr unsigned int channel_dim = 0u;
+
+ TensorShape output_shape{src};
+ const size_t pad_left = conv3d_info.padding.left;
+ const size_t pad_right = conv3d_info.padding.right;
+ const size_t pad_top = conv3d_info.padding.top;
+ const size_t pad_bottom = conv3d_info.padding.bottom;
+ const size_t pad_front = conv3d_info.padding.front;
+ const size_t pad_back = conv3d_info.padding.back;
+ const size_t dilation_x = conv3d_info.dilation.width;
+ const size_t dilation_y = conv3d_info.dilation.height;
+ const size_t dilation_z = conv3d_info.dilation.depth;
+ const size_t stride_x = conv3d_info.stride.x();
+ const size_t stride_y = conv3d_info.stride.y();
+ const size_t stride_z = conv3d_info.stride.z();
+
+ int output_width_size = 0;
+ int output_height_size = 0;
+ int output_depth_size = 0;
+
+ switch (conv3d_info.round_type)
+ {
+ case DimensionRoundingType::FLOOR:
+ output_width_size =
+ static_cast<int>(std::floor((static_cast<float>(src[width_dim] + pad_left + pad_right -
+ (dilation_x * (weights[weights_width_dim] - 1) + 1)) /
+ stride_x) +
+ 1));
+ output_height_size =
+ static_cast<int>(std::floor((static_cast<float>(src[height_dim] + pad_top + pad_bottom -
+ (dilation_y * (weights[weights_height_dim] - 1) + 1)) /
+ stride_y) +
+ 1));
+ output_depth_size =
+ static_cast<int>(std::floor((static_cast<float>(src[depth_dim] + pad_front + pad_back -
+ (dilation_z * (weights[weights_depth_dim] - 1) + 1)) /
+ stride_z) +
+ 1));
+ break;
+ case DimensionRoundingType::CEIL:
+ output_width_size =
+ static_cast<int>(std::ceil((static_cast<float>(src[width_dim] + pad_left + pad_right -
+ (dilation_x * (weights[weights_width_dim] - 1) + 1)) /
+ stride_x) +
+ 1));
+ output_height_size =
+ static_cast<int>(std::ceil((static_cast<float>(src[height_dim] + pad_top + pad_bottom -
+ (dilation_y * (weights[weights_height_dim] - 1) + 1)) /
+ stride_y) +
+ 1));
+ output_depth_size =
+ static_cast<int>(std::ceil((static_cast<float>(src[depth_dim] + pad_front + pad_back -
+ (dilation_z * (weights[weights_depth_dim] - 1) + 1)) /
+ stride_z) +
+ 1));
+ break;
+ default:
+ ARM_COMPUTE_ERROR("Unsupported rounding type");
+ }
+
+ output_shape.set(batch_dim, src[batch_dim]);
+ output_shape.set(width_dim, output_width_size);
+ output_shape.set(height_dim, output_height_size);
+ output_shape.set(depth_dim, output_depth_size);
+ output_shape.set(channel_dim, weights[weights_CHout_dim]);
+ return output_shape;
+}
+
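For one spatial axis, the FLOOR branch above reduces to the familiar convolution output-size formula. A standalone sketch with hypothetical sizes:

#include <cassert>
#include <cmath>

int main()
{
    const int src = 16, kernel = 3, stride = 2, dilation = 1;
    const int pad_lo = 1, pad_hi = 1;

    // floor((src + pads - (dilation * (kernel - 1) + 1)) / stride + 1), as above
    const int out = static_cast<int>(std::floor(
        (static_cast<float>(src + pad_lo + pad_hi - (dilation * (kernel - 1) + 1)) / stride) + 1));

    assert(out == 8); // floor(15 / 2 + 1) = 8
    return 0;
}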
+/** Calculate the output pool3d shape of a tensor
+ *
+ * @param[in] src         Input tensor shape
+ * @param[in] pool3d_info Pooling layer info
+ *
+ * @return the calculated shape
+ */
+inline TensorShape compute_pool3d_shape(const TensorShape &src, Pooling3dLayerInfo pool3d_info)
+{
+ TensorShape output_shape{src};
+
+ const auto data_layout = DataLayout::NDHWC;
+ const int idx_width = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
+ const int idx_height = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
+ const int idx_depth = get_data_layout_dimension_index(data_layout, DataLayoutDimension::DEPTH);
+ const int pool_size_width = pool3d_info.is_global_pooling ? src[idx_width] : pool3d_info.pool_size.width;
+ const int pool_size_height = pool3d_info.is_global_pooling ? src[idx_height] : pool3d_info.pool_size.height;
+ const int pool_size_depth = pool3d_info.is_global_pooling ? src[idx_depth] : pool3d_info.pool_size.depth;
+ int output_width = 0;
+ int output_height = 0;
+ int output_depth = 0;
+
+ std::tie(output_width, output_height, output_depth) =
+ scaled_3d_dimensions_signed(src[idx_width], src[idx_height], src[idx_depth], pool_size_width, pool_size_height,
+ pool_size_depth, pool3d_info);
+
+ ARM_COMPUTE_ERROR_ON_MSG((output_width < 1 || output_height < 1 || output_depth < 1),
+ "Calculated output dimension size is invalid");
+
+ output_shape.set(idx_width, static_cast<size_t>(output_width));
+ output_shape.set(idx_height, static_cast<size_t>(output_height));
+ output_shape.set(idx_depth, static_cast<size_t>(output_depth));
+
+ return output_shape;
+}
+
+/** Calculate the gather output shape of a tensor
+ *
+ * @param[in] input_shape Input tensor shape
+ * @param[in] indices_shape Indices tensor shape. Only 2D and 3D indices are supported
+ * @param[in] actual_axis Axis to be used in the computation
+ *
+ * @note Let input_shape be (X,Y,Z), indices_shape be (W,O,P) and axis be 1.
+ *       The new shape is computed by replacing that axis of the input shape with
+ *       the indices shape, so the output shape will be (X,W,O,P,Z)
+ *
+ * @return the calculated shape
+ */
+inline TensorShape
+compute_gather_shape(const TensorShape &input_shape, const TensorShape &indices_shape, uint32_t actual_axis)
{
- ARM_COMPUTE_ERROR_ON(indices_shape.num_dimensions() > 1);
- ARM_COMPUTE_ERROR_ON(input_shape.num_dimensions() > 4);
- ARM_COMPUTE_ERROR_ON(actual_axis >= input_shape.num_dimensions());
+ const auto input_num_dims = input_shape.num_dimensions();
+ const auto indices_num_dims = indices_shape.num_dimensions();
+
+ ARM_COMPUTE_ERROR_ON(actual_axis >= input_num_dims);
+ ARM_COMPUTE_ERROR_ON(input_num_dims + indices_num_dims - 1 > Coordinates::num_max_dimensions);
+
+ TensorShape output_shape;
+ size_t dim_no = 0;
+
+ for (; dim_no < actual_axis; ++dim_no)
+ {
+ output_shape.set(dim_no, input_shape[dim_no]);
+ }
+
+ for (; dim_no < actual_axis + indices_num_dims; ++dim_no)
+ {
+ output_shape.set(dim_no, indices_shape[dim_no - actual_axis]);
+ }
+
+ for (; dim_no < input_num_dims + indices_num_dims - 1; ++dim_no)
+ {
+ output_shape.set(dim_no, input_shape[dim_no + 1 - indices_num_dims]);
+ }
- TensorShape output_shape = input_shape;
- output_shape[actual_axis] = indices_shape[0];
+ ARM_COMPUTE_ERROR_ON(input_shape.total_size() * indices_shape.total_size() !=
+ output_shape.total_size() * input_shape[actual_axis]);
return output_shape;
}
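A standalone sketch of the gather rule with hypothetical shapes: a (X=5, Y=7, Z=3) input, (W=4, O=2) indices and axis 1 yield a (5, 4, 2, 3) output:

#include <cassert>
#include <vector>

int main()
{
    const std::vector<unsigned int> input_shape   = {5, 7, 3}; // X, Y, Z
    const std::vector<unsigned int> indices_shape = {4, 2};    // W, O
    const unsigned int axis = 1;

    std::vector<unsigned int> output_shape;
    for (unsigned int d = 0; d < axis; ++d)                      // dims before the axis
        output_shape.push_back(input_shape[d]);
    for (unsigned int d = 0; d < indices_shape.size(); ++d)      // axis replaced by the indices shape
        output_shape.push_back(indices_shape[d]);
    for (unsigned int d = axis + 1; d < input_shape.size(); ++d) // dims after the axis
        output_shape.push_back(input_shape[d]);

    assert((output_shape == std::vector<unsigned int>{5, 4, 2, 3}));
    return 0;
}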
} // namespace shape_calculator
} // namespace misc
} // namespace arm_compute
-#endif /* ARM_COMPUTE_MISC_SHAPE_CALCULATOR_H */
+#endif // ACL_ARM_COMPUTE_CORE_UTILS_MISC_SHAPECALCULATOR_H
diff --git a/arm_compute/core/utils/misc/Traits.h b/arm_compute/core/utils/misc/Traits.h
index 1cbdbfe16f..944fcb95f9 100644
--- a/arm_compute/core/utils/misc/Traits.h
+++ b/arm_compute/core/utils/misc/Traits.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018-2019 ARM Limited.
+ * Copyright (c) 2018-2020 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -24,6 +24,8 @@
#ifndef ARM_COMPUTE_UTILS_TRAITS_TRAITS_H
#define ARM_COMPUTE_UTILS_TRAITS_TRAITS_H
+#include "arm_compute/core/Types.h"
+
#include <type_traits>
namespace arm_compute
diff --git a/arm_compute/core/utils/misc/Utility.h b/arm_compute/core/utils/misc/Utility.h
index b2bb63f5c8..22f10d74cc 100644
--- a/arm_compute/core/utils/misc/Utility.h
+++ b/arm_compute/core/utils/misc/Utility.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2021, 2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -24,8 +24,11 @@
#ifndef ARM_COMPUTE_MISC_UTILITY_H
#define ARM_COMPUTE_MISC_UTILITY_H
+#include "arm_compute/core/Error.h"
+
#include <algorithm>
#include <array>
+#include <cstdint>
+#include <cstdlib>
#include <limits>
#include <numeric>
#include <vector>
@@ -41,7 +44,7 @@ struct index_sequence
};
template <std::size_t N, std::size_t... S>
-struct index_sequence_generator : index_sequence_generator < N - 1, N - 1, S... >
+struct index_sequence_generator : index_sequence_generator<N - 1, N - 1, S...>
{
};
@@ -55,17 +58,17 @@ template <std::size_t N>
using index_sequence_t = typename index_sequence_generator<N>::type;
template <typename T, std::size_t N, T val, T... vals>
-struct generate_array : generate_array < T, N - 1, val, val, vals... >
+struct generate_array : generate_array<T, N - 1, val, val, vals...>
{
};
template <typename T, T val, T... vals>
struct generate_array<T, 0, val, vals...>
{
- static constexpr std::array<T, sizeof...(vals)> value{ vals... };
+ static constexpr std::array<T, sizeof...(vals)> value{vals...};
};
-template <typename T, T val, T... vals>
+template <typename T, T val, T... vals>
constexpr std::array<T, sizeof...(vals)> generate_array<T, 0, val, vals...>::value;
/** @endcond */
@@ -76,7 +79,7 @@ template <std::size_t... S,
typename T = std::array<typename std::iterator_traits<Iterator>::value_type, sizeof...(S)>>
T make_array(Iterator first, index_sequence<S...>)
{
- return T{ { first[S]... } };
+ return T{{first[S]...}};
}
} // namespace detail
@@ -84,7 +87,7 @@ template <std::size_t N, typename Iterator>
std::array<typename std::iterator_traits<Iterator>::value_type, N> make_array(Iterator first, Iterator last)
{
ARM_COMPUTE_UNUSED(last);
- return detail::make_array(first, index_sequence_t<N> {});
+ return detail::make_array(first, index_sequence_t<N>{});
}
/** Performs clamping among a lower and upper value.
@@ -116,7 +119,7 @@ inline void for_each(F &&)
* @param[in] args Remaining arguments
*/
template <typename F, typename T, typename... Ts>
-inline void for_each(F &&func, T &&arg, Ts &&... args)
+inline void for_each(F &&func, T &&arg, Ts &&...args)
{
func(std::forward<T>(arg));
for_each(std::forward<F>(func), std::forward<Ts>(args)...);
@@ -140,9 +143,11 @@ inline T &&foldl(F &&, T &&value)
* @param[in] values Remaining arguments
*/
template <typename F, typename T, typename U, typename... Us>
-inline auto foldl(F &&func, T &&initial, U &&value, Us &&... values) -> decltype(func(std::forward<T>(initial), std::forward<U>(value)))
+inline auto foldl(F &&func, T &&initial, U &&value, Us &&...values)
+ -> decltype(func(std::forward<T>(initial), std::forward<U>(value)))
{
- return foldl(std::forward<F>(func), func(std::forward<T>(initial), std::forward<U>(value)), std::forward<Us>(values)...);
+ return foldl(std::forward<F>(func), func(std::forward<T>(initial), std::forward<U>(value)),
+ std::forward<Us>(values)...);
}
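A minimal usage sketch for foldl, assuming this header is on the include path; it left-folds the arguments with the given functor:

#include "arm_compute/core/utils/misc/Utility.h"

#include <cassert>

int main()
{
    const auto add = [](int a, int b) { return a + b; };
    const int  sum = arm_compute::utility::foldl(add, 0, 1, 2, 3, 4);
    assert(sum == 10); // ((((0 + 1) + 2) + 3) + 4)
    return 0;
}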
/** Perform an index sort of a given vector.
@@ -157,11 +162,7 @@ std::vector<size_t> sort_indices(const std::vector<T> &v)
std::vector<size_t> idx(v.size());
std::iota(idx.begin(), idx.end(), 0);
- std::sort(idx.begin(), idx.end(),
- [&v](size_t i1, size_t i2)
- {
- return v[i1] < v[i2];
- });
+ std::sort(idx.begin(), idx.end(), [&v](size_t i1, size_t i2) { return v[i1] < v[i2]; });
return idx;
}
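And a usage sketch for sort_indices, again assuming the header is available; it returns the permutation that would sort the vector in ascending order:

#include "arm_compute/core/utils/misc/Utility.h"

#include <cassert>
#include <vector>

int main()
{
    const std::vector<float>  v   = {3.f, 1.f, 2.f};
    const std::vector<size_t> idx = arm_compute::utility::sort_indices(v);
    assert((idx == std::vector<size_t>{1, 2, 0})); // v[1] <= v[2] <= v[0]
    return 0;
}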
@@ -175,7 +176,7 @@ std::vector<size_t> sort_indices(const std::vector<T> &v)
*/
inline bool endswith(const std::string &str, const std::string &suffix)
{
- if(str.size() < suffix.size())
+ if (str.size() < suffix.size())
{
return false;
}
@@ -202,12 +203,28 @@ inline bool check_aligned(void *ptr, const size_t alignment)
*/
inline std::string tolower(std::string string)
{
- std::transform(string.begin(), string.end(), string.begin(), [](unsigned char c)
- {
- return std::tolower(c);
- });
+ std::transform(string.begin(), string.end(), string.begin(), [](unsigned char c) { return std::tolower(c); });
return string;
}
+
+/** Get environment variable as a string
+ *
+ * @note Return empty string on bare-metal
+ *
+ * @param[in] env_name Name of the Environment variable to retrieve
+ *
+ * @return Environment variable content, or empty string if the variable is undefined or on bare-metal
+ */
+inline std::string getenv(const std::string &env_name)
+{
+#ifdef BARE_METAL
+ ARM_COMPUTE_UNUSED(env_name);
+ return std::string{};
+#else // BARE_METAL
+ const auto env_chr = std::getenv(env_name.c_str());
+ return env_chr == nullptr ? std::string{} : std::string{env_chr};
+#endif // BARE_METAL
+}
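A minimal usage sketch for utility::getenv, assuming the header is on the include path; the environment variable name below is hypothetical:

#include "arm_compute/core/utils/misc/Utility.h"

#include <iostream>
#include <string>

int main()
{
    const std::string value = arm_compute::utility::getenv("MY_HYPOTHETICAL_VAR");
    if (value.empty())
    {
        std::cout << "variable unset (or bare-metal build)\n";
    }
    else
    {
        std::cout << "value: " << value << "\n";
    }
    return 0;
}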
} // namespace utility
} // namespace arm_compute
#endif /* ARM_COMPUTE_MISC_UTILITY_H */
diff --git a/arm_compute/core/utils/quantization/AsymmHelpers.h b/arm_compute/core/utils/quantization/AsymmHelpers.h
index 4ef49476b2..2324fe1838 100644
--- a/arm_compute/core/utils/quantization/AsymmHelpers.h
+++ b/arm_compute/core/utils/quantization/AsymmHelpers.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2017-2021, 2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -41,7 +41,10 @@ namespace quantization
*
* @return a status
*/
-Status calculate_quantized_multiplier(float multiplier, int32_t *quant_multiplier, int32_t *shift, bool ignore_epsilon = false);
+Status calculate_quantized_multiplier(float multiplier,
+ int32_t *quant_multiplier,
+ int32_t *shift,
+ bool ignore_epsilon = false);
/** Calculate quantized representation of multiplier with value less than one.
*
* @param[in] multiplier Real multiplier.
@@ -51,7 +54,10 @@ Status calculate_quantized_multiplier(float multiplier, int32_t *quant_multiplie
*
* @return a status
*/
-Status calculate_quantized_multiplier_less_than_one(float multiplier, int32_t *quant_multiplier, int32_t *right_shift, bool ignore_epsilon = false);
+Status calculate_quantized_multiplier_less_than_one(float multiplier,
+ int32_t *quant_multiplier,
+ int32_t *right_shift,
+ bool ignore_epsilon = false);
/** Calculate quantized representation of multiplier having value greater than one.
*
* @param[in] multiplier Real multiplier.
@@ -60,7 +66,8 @@ Status calculate_quantized_multiplier_less_than_one(float multiplier, int32_t *q
*
* @return a status
*/
-Status calculate_quantized_multiplier_greater_than_one(float multiplier, int32_t *quantized_multiplier, int32_t *left_shift);
+Status
+calculate_quantized_multiplier_greater_than_one(float multiplier, int32_t *quantized_multiplier, int32_t *left_shift);
/** Calculate quantized representation of per-channel multipliers
*
@@ -71,9 +78,9 @@ Status calculate_quantized_multiplier_greater_than_one(float multiplier, int32_t
*
* @return a status
*/
-Status calculate_quantized_multipliers(const QuantizationInfo &iq_info,
- const QuantizationInfo &wq_info,
- const QuantizationInfo &oq_info,
+Status calculate_quantized_multipliers(const QuantizationInfo &iq_info,
+ const QuantizationInfo &wq_info,
+ const QuantizationInfo &oq_info,
GEMMLowpOutputStageInfo &stage_info);
/** Get minimum and maximum values for the input quantized data type
@@ -81,6 +88,7 @@ Status calculate_quantized_multipliers(const QuantizationInfo &iq_info,
* @return min and max values for the quantized data type
*/
std::pair<int, int> get_min_max_values_from_quantized_data_type(DataType data_type);
+
/** Compute quantized per-channel multipliers and shifts. As many multipliers
* and shifts as output channels are computed. If weights are not quantized
* per-channel, multipliers and shifts will end up being the same for each
@@ -89,16 +97,12 @@ std::pair<int, int> get_min_max_values_from_quantized_data_type(DataType data_ty
* @param[in] input Input tensor info.
* @param[in] weights Weights tensor info.
* @param[in] output Output tensor info.
- * @param[in] idx_ofms Dimension index to get OFMs from the weights tensor.
* @param[out] output_multipliers_ptr Pointer to the buffer where to store per-channel multipliers.
* @param[out] output_shifts_ptr Pointer to the buffer where to store per-channel shifts.
- *
- * @return min and max values for the quantized data type
*/
void compute_quantized_multipliers_and_shifts(const ITensorInfo *input,
const ITensorInfo *weights,
const ITensorInfo *output,
- unsigned int idx_ofms,
int32_t *output_multipliers_ptr,
int32_t *output_shifts_ptr);
@@ -150,7 +154,10 @@ int32_t saturating_rounding_multiply_by_pow2(int32_t exponent, int32_t v);
* @param[out] output_shift Shift for inverse square root
*
*/
-void get_invsqrt_quantized_multiplier_exp(int32_t input, int32_t reverse_shift, int32_t &output_inv_sqrt, int32_t &output_shift);
+void get_invsqrt_quantized_multiplier_exp(int32_t input,
+ int32_t reverse_shift,
+ int32_t &output_inv_sqrt,
+ int32_t &output_shift);
} // namespace quantization
} // namespace arm_compute
diff --git a/arm_compute/dynamic_fusion/runtime/gpu/cl/ClWorkloadRuntime.h b/arm_compute/dynamic_fusion/runtime/gpu/cl/ClWorkloadRuntime.h
new file mode 100644
index 0000000000..6b92f12223
--- /dev/null
+++ b/arm_compute/dynamic_fusion/runtime/gpu/cl/ClWorkloadRuntime.h
@@ -0,0 +1,91 @@
+/*
+ * Copyright (c) 2022-2024 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ACL_ARM_COMPUTE_DYNAMIC_FUSION_RUNTIME_GPU_CL_CLWORKLOADRUNTIME_H
+#define ACL_ARM_COMPUTE_DYNAMIC_FUSION_RUNTIME_GPU_CL_CLWORKLOADRUNTIME_H
+
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/dynamic_fusion/sketch/MemoryDescriptor.h"
+
+#include <map>
+#include <memory>
+
+namespace arm_compute
+{
+/** Forward declaration */
+class CLTensor;
+namespace experimental
+{
+namespace dynamic_fusion
+{
+/** Forward declaration */
+class GpuWorkloadSketch;
+
+/** OpenCL runtime to run a workload
+ */
+class ClWorkloadRuntime
+{
+public:
+ /** Default constructor. */
+ ClWorkloadRuntime();
+
+ /** Destructor */
+ ~ClWorkloadRuntime();
+
+ /** Move constructor */
+ ClWorkloadRuntime(ClWorkloadRuntime &&);
+
+ /** Move assignment */
+ ClWorkloadRuntime &operator=(ClWorkloadRuntime &&);
+
+ /** Configure @ref ClWorkloadRuntime
+ * @note A runtime cannot be re-configured
+ *
+ * @param[in] sketch @ref GpuWorkloadSketch with which to configure
+ */
+ Status configure(const GpuWorkloadSketch &sketch);
+    /** Run the workload
+     * @note If the runtime is not configured, this method will not perform any action
+     *
+     * @param[in,out] tensors Tensors required by the workload
+     *
+     * @return a status reporting whether the run was successful
+ */
+ Status run(const std::vector<CLTensor *> &tensors);
+    /** Get the auxiliary tensors of the workload and their memory requirements
+ */
+ std::vector<std::tuple<CLTensor *, TensorInfo, AuxMemoryInfo>> get_auxiliary_tensors();
+
+private:
+ /** Enqueue prepare workload
+ * @note If the runtime is not configured, this method will not perform any action
+ */
+ void prepare();
+ struct Implementation;
+ std::unique_ptr<Implementation> _impl;
+};
+
+} // namespace dynamic_fusion
+} // namespace experimental
+} // namespace arm_compute
+#endif // ACL_ARM_COMPUTE_DYNAMIC_FUSION_RUNTIME_GPU_CL_CLWORKLOADRUNTIME_H
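A minimal usage sketch of the runtime, assuming a GpuWorkloadSketch has already been built elsewhere (its construction is outside this header) and that the CL backend is initialised; run_workload and user_tensors are hypothetical names, and error handling is kept to the bare minimum:

#include "arm_compute/core/Error.h"
#include "arm_compute/dynamic_fusion/runtime/gpu/cl/ClWorkloadRuntime.h"
#include "arm_compute/runtime/CL/CLTensor.h"

#include <tuple>
#include <vector>

using namespace arm_compute;
using namespace arm_compute::experimental::dynamic_fusion;

Status run_workload(const GpuWorkloadSketch &sketch, const std::vector<CLTensor *> &user_tensors)
{
    ClWorkloadRuntime runtime;
    ARM_COMPUTE_RETURN_ON_ERROR(runtime.configure(sketch)); // one-shot: a runtime cannot be re-configured

    // Back the auxiliary tensors with real memory before running.
    for (auto &data : runtime.get_auxiliary_tensors())
    {
        CLTensor     *tensor      = std::get<0>(data);
        TensorInfo    info        = std::get<1>(data);
        AuxMemoryInfo aux_mem_req = std::get<2>(data);
        tensor->allocator()->init(info, aux_mem_req.alignment);
        tensor->allocator()->allocate();
    }
    return runtime.run(user_tensors);
}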
diff --git a/arm_compute/dynamic_fusion/sketch/MemoryDescriptor.h b/arm_compute/dynamic_fusion/sketch/MemoryDescriptor.h
new file mode 100644
index 0000000000..e2a34e4424
--- /dev/null
+++ b/arm_compute/dynamic_fusion/sketch/MemoryDescriptor.h
@@ -0,0 +1,96 @@
+/*
+ * Copyright (c) 2022-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_DYNAMIC_FUSION_SKETCH_MEMORYDESCRIPTOR
+#define ARM_COMPUTE_DYNAMIC_FUSION_SKETCH_MEMORYDESCRIPTOR
+
+#include "arm_compute/core/ITensorInfo.h"
+
+namespace arm_compute
+{
+namespace experimental
+{
+namespace dynamic_fusion
+{
+/** Type of memory used by a workload tensor
+ *
+ * We can classify tensors in 2 dimensions: Topology (where they are in a workload) and Memory allocation:
+ * Topology:
+ * Argument tensors: "Outer" tensors exposed to the users as inputs and outputs (arguments)
+ * Intermediate tensors: "Inner" tensors hidden from the users as links between operators
+ * Memory allocation:
+ *     Alloc: Tensors that need real backing memory allocated for them
+ *     No-Alloc: Tensors that don't need real backing memory allocated for them
+ *
+ * We end up with 3 MemoryType values based on the product of these two classifications
+ * | Argument | Intermediate |
+ * ---------*----------------*-------------------*
+ * Alloc | User | Auxiliary |
+ * ---------*----------------*-------------------*
+ * No-Alloc | N/A            | Virtual           |
+ * ---------*----------------*-------------------*
+ */
+enum class MemoryType
+{
+    /** Both User and Auxiliary are Alloc types, since they require memory allocation */
+ User = 0, /**< Memory coming directly from users, e.g. for argument tensors */
+ Auxiliary =
+ 1, /**< Additional memory required by the workload tensor, e.g. for tensors holding temporary results between kernels */
+    /** Virtual is a No-Alloc type, since it doesn't require memory allocation */
+ Virtual =
+        2, /**< Temporary tile which is not allocated as a whole tensor in memory. It is mainly used at sketch time to link operators; there should be no Virtual tensors at runtime */
+};
+
+/** Memory information for tensors with @ref MemoryType::Auxiliary.
+ * This informs how much additional memory is required for auxiliary tensors
+ */
+struct AuxMemoryInfo
+{
+ AuxMemoryInfo() = default;
+
+ AuxMemoryInfo(size_t size, size_t alignment = 0) noexcept : size(size), alignment(alignment)
+ {
+ }
+
+ friend bool operator==(const AuxMemoryInfo &info0, const AuxMemoryInfo &info1)
+ {
+ return info0.size == info1.size && info0.alignment == info1.alignment;
+ }
+ size_t size{0}; /**< Total memory size in bytes */
+ size_t alignment{0}; /**< Memory alignment in bytes */
+};
+
+/** Descriptor of a workload tensor memory */
+struct MemoryDescriptor
+{
+ MemoryType memory_type{}; /**< Memory Type*/
+ AuxMemoryInfo aux_memory_info{}; /**< Auxiliary Tensor Memory Information */
+};
+
+/** A map from @ref ITensorInfo to their corresponding @ref MemoryDescriptor */
+using MemoryDescriptorMap = std::map<ITensorInfo::Id, MemoryDescriptor>;
+
+} // namespace dynamic_fusion
+} // namespace experimental
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_DYNAMIC_FUSION_SKETCH_MEMORYDESCRIPTOR */
diff --git a/arm_compute/runtime/MultiHOG.h b/arm_compute/dynamic_fusion/sketch/attributes/CastAttributes.h
index 72edf3a416..ba2f658c7c 100644
--- a/arm_compute/runtime/MultiHOG.h
+++ b/arm_compute/dynamic_fusion/sketch/attributes/CastAttributes.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2016-2019 ARM Limited.
+ * Copyright (c) 2022 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -21,38 +21,39 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
-#ifndef ARM_COMPUTE_MULTIHOG_H
-#define ARM_COMPUTE_MULTIHOG_H
+#ifndef ARM_COMPUTE_DYNAMIC_FUSION_SKETCH_ATTRIBUTES_CASTATTRIBUTES
+#define ARM_COMPUTE_DYNAMIC_FUSION_SKETCH_ATTRIBUTES_CASTATTRIBUTES
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/IMultiHOG.h"
#include "arm_compute/core/Types.h"
-#include "arm_compute/runtime/HOG.h"
-
-#include <memory>
namespace arm_compute
{
-/** CPU implementation of multi HOG data-object */
-class MultiHOG : public IMultiHOG
+namespace experimental
{
-public:
- /** Constructor
- *
- * @param[in] num_models Number of HOG data objects to contain
- *
- */
- MultiHOG(size_t num_models);
+namespace dynamic_fusion
+{
+/** Attributes are backend-agnostic parameters (in addition to the input/output tensors) of an operator.
+ */
- // Inherited methods overridden:
- size_t num_models() const override;
- IHOG *model(size_t index) override;
- const IHOG *model(size_t index) const override;
+/** Cast attributes */
+class CastAttributes
+{
+public:
+    /** Set the data type to cast to */
+    CastAttributes &data_type(const DataType &data_type);
+    /** Get the data type to cast to */
+    DataType data_type() const;
+ /** Set Policy */
+ CastAttributes &convert_policy(const ConvertPolicy &policy);
+ /** Get Policy */
+ ConvertPolicy convert_policy() const;
private:
- size_t _num_models;
- std::vector<HOG> _model;
+    DataType      _data_type{};                              /**< Data type to cast to */
+    ConvertPolicy _convert_policy{ConvertPolicy::SATURATE};  /**< Convert policy */
};
-}
+} // namespace dynamic_fusion
+} // namespace experimental
+} // namespace arm_compute
-#endif /* ARM_COMPUTE_MULTIHOG_H */
+#endif /* ARM_COMPUTE_DYNAMIC_FUSION_SKETCH_ATTRIBUTES_CASTATTRIBUTES */
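A minimal usage sketch of the fluent setters above, assuming the header is on the include path; make_f16_cast is a hypothetical helper, not part of the library:

#include "arm_compute/dynamic_fusion/sketch/attributes/CastAttributes.h"

using namespace arm_compute;
using namespace arm_compute::experimental::dynamic_fusion;

CastAttributes make_f16_cast()
{
    CastAttributes attr;
    attr.data_type(DataType::F16)                 // target type of the cast
        .convert_policy(ConvertPolicy::SATURATE); // saturate on overflow/underflow
    return attr;
}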
diff --git a/arm_compute/core/AccessWindowTranspose.h b/arm_compute/dynamic_fusion/sketch/attributes/ClampAttributes.h
index 16105bce7c..70e449a656 100644
--- a/arm_compute/core/AccessWindowTranspose.h
+++ b/arm_compute/dynamic_fusion/sketch/attributes/ClampAttributes.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2022 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -21,28 +21,43 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
-#ifndef ARM_COMPUTE_IACCESS_WINDOW_TRANSPOSE_H
-#define ARM_COMPUTE_IACCESS_WINDOW_TRANSPOSE_H
-#include "arm_compute/core/Coordinates.h"
-#include "arm_compute/core/IAccessWindow.h"
-#include "arm_compute/core/TensorShape.h"
+#ifndef ARM_COMPUTE_DYNAMIC_FUSION_SKETCH_ATTRIBUTES_CLAMPATTRIBUTES
+#define ARM_COMPUTE_DYNAMIC_FUSION_SKETCH_ATTRIBUTES_CLAMPATTRIBUTES
+
#include "arm_compute/core/Types.h"
namespace arm_compute
{
-class Window;
-class ITensorInfo;
+namespace experimental
+{
+namespace dynamic_fusion
+{
+/** Attributes are backend-agnostic parameters (in addition to the input/output tensors) of an operator.
+ */
-/** Implementation of a XY-transpose access pattern. */
-class AccessWindowTranspose : public AccessWindowRectangle
+/** Clamp attributes */
+class ClampAttributes
{
public:
- using AccessWindowRectangle::AccessWindowRectangle;
- bool update_window_if_needed(Window &window) const override;
- bool update_padding_if_needed(const Window &window) override;
- using AccessWindowRectangle::compute_valid_region;
- ValidRegion compute_valid_region(const Window &window, ValidRegion input_valid_region, bool border_undefined, BorderSize border_size) const override;
+ /** Set the minimum clip value */
+ ClampAttributes &min_val(const float &min_val);
+
+ /** Get the minimum clip value */
+ float min_val() const;
+
+ /** Set the maximum clip value */
+ ClampAttributes &max_val(const float &max_val);
+
+ /** Get the maximum clip value */
+ float max_val() const;
+
+private:
+ float _min_val{};
+ float _max_val{};
};
+
+} // namespace dynamic_fusion
+} // namespace experimental
} // namespace arm_compute
-#endif /*ARM_COMPUTE_IACCESS_WINDOW_TRANSPOSE_H*/
+#endif /* ARM_COMPUTE_DYNAMIC_FUSION_SKETCH_ATTRIBUTES_CLAMPATTRIBUTES */
diff --git a/arm_compute/runtime/NEON/functions/NEDilate.h b/arm_compute/dynamic_fusion/sketch/attributes/Conv2dAttributes.h
index 39a37af74c..a98ef0363e 100644
--- a/arm_compute/runtime/NEON/functions/NEDilate.h
+++ b/arm_compute/dynamic_fusion/sketch/attributes/Conv2dAttributes.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2016-2019 ARM Limited.
+ * Copyright (c) 2022-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -21,35 +21,46 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
-#ifndef ARM_COMPUTE_NEDILATE_H
-#define ARM_COMPUTE_NEDILATE_H
+#ifndef ARM_COMPUTE_DYNAMIC_FUSION_SKETCH_ATTRIBUTES_CONV2DATTRIBUTES
+#define ARM_COMPUTE_DYNAMIC_FUSION_SKETCH_ATTRIBUTES_CONV2DATTRIBUTES
+#include "arm_compute/core/Size2D.h"
#include "arm_compute/core/Types.h"
-#include "arm_compute/runtime/NEON/INESimpleFunction.h"
#include <cstdint>
namespace arm_compute
{
-class ITensor;
-
-/** Basic function to execute dilate. This function calls the following NEON kernels:
- *
- * -# @ref NEFillBorderKernel (executed if border_mode == CONSTANT or border_mode == REPLICATE)
- * -# @ref NEDilateKernel
- *
+namespace experimental
+{
+namespace dynamic_fusion
+{
+/** Attributes are backend-agnostic parameters (in addition to the input/output tensors) of an operator.
*/
-class NEDilate : public INESimpleFunction
+
+/** Conv2d attributes */
+class Conv2dAttributes
{
public:
- /** Initialise the kernel's inputs, output and border mode.
- *
- * @param[in, out] input First tensor input. Data type supported: U8.(Written to only for @p border_mode != UNDEFINED)
- * @param[out] output Output tensor. Data type supported: U8.
- * @param[in] border_mode Border mode to use for the convolution.
- * @param[in] constant_border_value (Optional) Constant value to use for borders if border_mode is set to CONSTANT.
- */
- void configure(ITensor *input, ITensor *output, BorderMode border_mode, uint8_t constant_border_value = 0);
+ /** Set padding */
+ Conv2dAttributes &pad(const Padding2D &pad);
+ /** Get padding */
+ Padding2D pad() const;
+ /** Set stride */
+ Conv2dAttributes &stride(const Size2D &stride);
+ /** Get stride */
+ Size2D stride() const;
+ /** Set dilation */
+ Conv2dAttributes &dilation(const Size2D &dilation);
+ /** Get dilation */
+ Size2D dilation() const;
+
+private:
+ Padding2D _pad{}; /**< Padding */
+ Size2D _stride{1U, 1U}; /**< Stride */
+ Size2D _dilation{1U, 1U}; /**< Dilation */
};
-}
-#endif /*ARM_COMPUTE_NEDILATE_H */
+} // namespace dynamic_fusion
+} // namespace experimental
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_DYNAMIC_FUSION_SKETCH_ATTRIBUTES_CONV2DATTRIBUTES */
diff --git a/arm_compute/dynamic_fusion/sketch/attributes/DepthwiseConv2dAttributes.h b/arm_compute/dynamic_fusion/sketch/attributes/DepthwiseConv2dAttributes.h
new file mode 100644
index 0000000000..c46b25cb5d
--- /dev/null
+++ b/arm_compute/dynamic_fusion/sketch/attributes/DepthwiseConv2dAttributes.h
@@ -0,0 +1,76 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_DYNAMIC_FUSION_SKETCH_ATTRIBUTES_DEPTHWISECONV2DATTRIBUTES
+#define ARM_COMPUTE_DYNAMIC_FUSION_SKETCH_ATTRIBUTES_DEPTHWISECONV2DATTRIBUTES
+
+#include "arm_compute/core/Size2D.h"
+#include "arm_compute/core/Types.h"
+
+#include <cstdint>
+
+namespace arm_compute
+{
+namespace experimental
+{
+namespace dynamic_fusion
+{
+/** Attributes are backend-agnostic parameters (in addition to the input/output tensors) of an operator.
+ */
+
+/** Depthwise Conv2d attributes */
+class DepthwiseConv2dAttributes
+{
+public:
+ /** Set padding */
+ DepthwiseConv2dAttributes &pad(const Padding2D &pad);
+ /** Get padding */
+ Padding2D pad() const;
+ /** Set stride */
+ DepthwiseConv2dAttributes &stride(const Size2D &stride);
+ /** Get stride */
+ Size2D stride() const;
+ /** Set dilation */
+ DepthwiseConv2dAttributes &dilation(const Size2D &dilation);
+ /** Get dilation */
+ Size2D dilation() const;
+ /** Set depth multiplier */
+ DepthwiseConv2dAttributes &depth_multiplier(const uint32_t &depth_multiplier);
+ /** Get depth multiplier */
+ uint32_t depth_multiplier() const;
+ /** Set Dimension rounding type */
+ DepthwiseConv2dAttributes &dimension_rounding_type(const DimensionRoundingType &dimension_rounding_type);
+ /** Get Dimension rounding type */
+ DimensionRoundingType dimension_rounding_type() const;
+
+private:
+ Padding2D _pad{}; /**< Padding */
+ Size2D _stride{1U, 1U}; /**< Stride */
+ Size2D _dilation{1U, 1U}; /**< Dilation */
+ uint32_t _depth_multiplier{1U}; /**< Depth multiplier */
+ DimensionRoundingType _dimension_rounding_type{DimensionRoundingType::FLOOR}; /**< Dimension rounding type */
+};
+} // namespace dynamic_fusion
+} // namespace experimental
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_DYNAMIC_FUSION_SKETCH_ATTRIBUTES_DEPTHWISECONV2DATTRIBUTES */
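
The depthwise attributes compose the same way; a sketch with illustrative values (not from the patch) showing the depthwise-specific knobs:

```cpp
#include "arm_compute/dynamic_fusion/sketch/attributes/DepthwiseConv2dAttributes.h"

using namespace arm_compute;
using namespace arm_compute::experimental::dynamic_fusion;

DepthwiseConv2dAttributes make_dwc_attributes()
{
    // 3x3 depthwise convolution with same-padding; each input channel
    // produces two output channels via the depth multiplier.
    return DepthwiseConv2dAttributes()
        .pad(Padding2D{1, 1, 1, 1}) // left, right, top, bottom
        .stride(Size2D{1U, 1U})
        .dilation(Size2D{1U, 1U})
        .depth_multiplier(2U)
        .dimension_rounding_type(DimensionRoundingType::FLOOR);
}
```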
diff --git a/arm_compute/runtime/CL/CLMultiHOG.h b/arm_compute/dynamic_fusion/sketch/attributes/MatMulAttributes.h
index 45297c75e2..fc512e9ff9 100644
--- a/arm_compute/runtime/CL/CLMultiHOG.h
+++ b/arm_compute/dynamic_fusion/sketch/attributes/MatMulAttributes.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -21,36 +21,42 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
-#ifndef ARM_COMPUTE_CLMULTIHOG_H
-#define ARM_COMPUTE_CLMULTIHOG_H
+#ifndef ACL_ARM_COMPUTE_DYNAMIC_FUSION_SKETCH_ATTRIBUTES_MATMULATTRIBUTES_H
+#define ACL_ARM_COMPUTE_DYNAMIC_FUSION_SKETCH_ATTRIBUTES_MATMULATTRIBUTES_H
-#include "arm_compute/core/CL/ICLMultiHOG.h"
#include "arm_compute/core/Types.h"
-#include "arm_compute/runtime/CL/CLHOG.h"
-
-#include <memory>
namespace arm_compute
{
-/** Basic implementation of the CL multi HOG data-objects */
-class CLMultiHOG : public ICLMultiHOG
+namespace experimental
+{
+namespace dynamic_fusion
+{
+/** Attributes are backend-agnostic parameters (in addition to the input/output tensors) of an operator.
+ */
+
+/** MatMul attributes */
+class MatMulAttributes
{
public:
- /** Constructor
- *
- * @param[in] num_models Number of HOG data objects to contain
- *
- */
- CLMultiHOG(size_t num_models);
-
- // Inherited methods overridden:
- size_t num_models() const override;
- ICLHOG *cl_model(size_t index) override;
- const ICLHOG *cl_model(size_t index) const override;
+ /** Get adjoint LHS flag value (transpose LHS before multiplication) */
+ bool adj_lhs() const;
+
+ /** Get adjoint RHS flag value (transpose RHS before multiplication) */
+ bool adj_rhs() const;
+
+ /** Set adjoint LHS flag value (transpose LHS before multiplication) */
+ MatMulAttributes adj_lhs(bool adj_lhs);
+
+ /** Set adjoint RHS flag value (transpose RHS before multiplication) */
+ MatMulAttributes adj_rhs(bool adj_rhs);
private:
- size_t _num_models;
- std::vector<CLHOG> _model;
+ bool _adj_lhs{false};
+ bool _adj_rhs{false};
};
-}
-#endif /*ARM_COMPUTE_CLMULTIHOG_H */
+} // namespace dynamic_fusion
+} // namespace experimental
+} // namespace arm_compute
+
+#endif // ACL_ARM_COMPUTE_DYNAMIC_FUSION_SKETCH_ATTRIBUTES_MATMULATTRIBUTES_H
diff --git a/arm_compute/dynamic_fusion/sketch/attributes/Pool2dAttributes.h b/arm_compute/dynamic_fusion/sketch/attributes/Pool2dAttributes.h
new file mode 100644
index 0000000000..19d8b96dcf
--- /dev/null
+++ b/arm_compute/dynamic_fusion/sketch/attributes/Pool2dAttributes.h
@@ -0,0 +1,82 @@
+/*
+ * Copyright (c) 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_DYNAMIC_FUSION_SKETCH_ATTRIBUTES_POOL2DATTRIBUTES
+#define ARM_COMPUTE_DYNAMIC_FUSION_SKETCH_ATTRIBUTES_POOL2DATTRIBUTES
+
+#include "arm_compute/core/Types.h"
+
+namespace arm_compute
+{
+namespace experimental
+{
+namespace dynamic_fusion
+{
+/** Attributes are backend-agnostic parameters (in addition to the input/output tensors) of an operator.
+ */
+/** Pool2d attributes */
+class Pool2dAttributes
+{
+public:
+ /** Get Pooling Type */
+ PoolingType pool_type() const;
+
+ /** Set Pooling Type */
+ Pool2dAttributes pool_type(PoolingType pool_type);
+
+ /** Get 2D Pool Size */
+ Size2D pool_size() const;
+
+ /** Set 2D Pool Size */
+ Pool2dAttributes pool_size(const Size2D &pool_size);
+
+ /** Get Padding */
+ Padding2D pad() const;
+
+ /** Set Padding */
+ Pool2dAttributes pad(const Padding2D &padding);
+
+ /** Get Stride */
+ Size2D stride() const;
+
+ /** Set Stride */
+ Pool2dAttributes stride(const Size2D &stride);
+
+ /** Get exclude padding */
+ bool exclude_padding() const;
+
+ /** Set exclude padding */
+ Pool2dAttributes exclude_padding(bool exclude_padding);
+
+private:
+ PoolingType _pool_type{};
+ Padding2D _pad{};
+ Size2D _pool_size{};
+ Size2D _stride{1U, 1U};
+ bool _exclude_padding{true};
+};
+
+} // namespace dynamic_fusion
+} // namespace experimental
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_DYNAMIC_FUSION_SKETCH_ATTRIBUTES_POOL2DATTRIBUTES */
diff --git a/arm_compute/runtime/NEON/functions/NEGEMMInterleave4x4.h b/arm_compute/dynamic_fusion/sketch/attributes/ReshapeAttributes.h
index 10d9c378ae..b99192e9f2 100644
--- a/arm_compute/runtime/NEON/functions/NEGEMMInterleave4x4.h
+++ b/arm_compute/dynamic_fusion/sketch/attributes/ReshapeAttributes.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -21,29 +21,33 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
-#ifndef ARM_COMPUTE_NEGEMMINTERLEAVE4X4_H
-#define ARM_COMPUTE_NEGEMMINTERLEAVE4X4_H
-
-#include "arm_compute/runtime/NEON/INESimpleFunctionNoBorder.h"
+#ifndef ARM_COMPUTE_DYNAMIC_FUSION_SKETCH_ATTRIBUTES_RESHAPEATTRIBUTES
+#define ARM_COMPUTE_DYNAMIC_FUSION_SKETCH_ATTRIBUTES_RESHAPEATTRIBUTES
+#include "arm_compute/core/TensorShape.h"
namespace arm_compute
{
-class ITensor;
-
-/** Basic function to execute NEGEMMInterleave4x4Kernel. This function calls the following NEON kernel:
- *
- * -# @ref NEGEMMInterleave4x4Kernel
- *
+namespace experimental
+{
+namespace dynamic_fusion
+{
+/** Attributes are backend-agnostic parameters (in addition to the input/output tensors) of an operator.
*/
-class NEGEMMInterleave4x4 : public INESimpleFunctionNoBorder
+
+/** Reshape attributes */
+class ReshapeAttributes
{
public:
- /** Initialise the kernel's inputs, output
- *
- * @param[in] input First input tensor. Data types supported: All
- * @param[out] output Output tensor. Data type supported: same as @p input
- */
- void configure(const ITensor *input, ITensor *output);
+ /** Set destination shape to reshape to */
+ ReshapeAttributes &shape(const TensorShape &shape);
+ /** Get destination shape to reshape to */
+ TensorShape shape() const;
+
+private:
+ TensorShape _shape{};
};
+} // namespace dynamic_fusion
+} // namespace experimental
} // namespace arm_compute
-#endif /*ARM_COMPUTE_NEGEMMINTERLEAVE4X4_H */
+
+#endif /* ARM_COMPUTE_DYNAMIC_FUSION_SKETCH_ATTRIBUTES_RESHAPEATTRIBUTES */
diff --git a/arm_compute/dynamic_fusion/sketch/attributes/ResizeAttributes.h b/arm_compute/dynamic_fusion/sketch/attributes/ResizeAttributes.h
new file mode 100644
index 0000000000..7410cc7e70
--- /dev/null
+++ b/arm_compute/dynamic_fusion/sketch/attributes/ResizeAttributes.h
@@ -0,0 +1,88 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef ARM_COMPUTE_DYNAMIC_FUSION_SKETCH_ATTRIBUTES_RESIZEATTRIBUTES
+#define ARM_COMPUTE_DYNAMIC_FUSION_SKETCH_ATTRIBUTES_RESIZEATTRIBUTES
+
+#include "arm_compute/core/PixelValue.h"
+#include "arm_compute/core/Types.h"
+
+#include <cstdint>
+
+namespace arm_compute
+{
+namespace experimental
+{
+namespace dynamic_fusion
+{
+/** Attributes are backend-agnostic parameters (in addition to the input/output tensors) of an operator.
+ */
+
+/** Resize attributes */
+class ResizeAttributes
+{
+public:
+ /** Set output width */
+ ResizeAttributes &output_width(int32_t output_width);
+
+ /** Get output width */
+ int32_t output_width() const;
+
+ /** Set output height */
+ ResizeAttributes &output_height(int32_t output_height);
+
+ /** Get output height */
+ int32_t output_height() const;
+
+ /** Set interpolation policy */
+ ResizeAttributes &interpolation_policy(InterpolationPolicy interpolation_policy);
+
+ /** Get interpolation policy */
+ InterpolationPolicy interpolation_policy() const;
+
+ /** Set sampling policy */
+ ResizeAttributes &sampling_policy(SamplingPolicy sampling_policy);
+
+ /** Get sampling policy */
+ SamplingPolicy sampling_policy() const;
+
+ /** Set align corners */
+ ResizeAttributes &align_corners(bool align_corners);
+
+ /** Get align corners */
+ bool align_corners() const;
+
+private:
+ int32_t _output_width{};
+ int32_t _output_height{};
+ InterpolationPolicy _interpolation_policy{InterpolationPolicy::BILINEAR};
+ SamplingPolicy _sampling_policy{SamplingPolicy::CENTER};
+ bool _align_corners{false};
+};
+
+} // namespace dynamic_fusion
+} // namespace experimental
+} // namespace arm_compute
+
+#endif /* ARM_COMPUTE_DYNAMIC_FUSION_SKETCH_ATTRIBUTES_RESIZEATTRIBUTES */
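
A corresponding sketch for the resize attributes, showing the enum-typed policies (the output size and policies are illustrative, not from the patch):

```cpp
#include "arm_compute/dynamic_fusion/sketch/attributes/ResizeAttributes.h"

using namespace arm_compute;
using namespace arm_compute::experimental::dynamic_fusion;

ResizeAttributes make_upscale_attributes(int32_t src_w, int32_t src_h)
{
    // Bilinear 2x upscale, sampling from the top-left corner of each source pixel.
    return ResizeAttributes()
        .output_width(src_w * 2)
        .output_height(src_h * 2)
        .interpolation_policy(InterpolationPolicy::BILINEAR)
        .sampling_policy(SamplingPolicy::TOP_LEFT)
        .align_corners(false);
}
```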
diff --git a/arm_compute/runtime/CL/gemm/CLGEMMKernelSelectionValhall.h b/arm_compute/dynamic_fusion/sketch/attributes/SoftmaxAttributes.h
index d9006e78f1..61654bfa27 100644
--- a/arm_compute/runtime/CL/gemm/CLGEMMKernelSelectionValhall.h
+++ b/arm_compute/dynamic_fusion/sketch/attributes/SoftmaxAttributes.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2020 ARM Limited.
+ * Copyright (c) 2022 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -21,33 +21,42 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
-#ifndef ARM_COMPUTE_CLGEMMKERNELSELECTIONVALHALL_H
-#define ARM_COMPUTE_CLGEMMKERNELSELECTIONVALHALL_H
-
-#include "arm_compute/runtime/CL/ICLGEMMKernelSelection.h"
+#ifndef ARM_COMPUTE_DYNAMIC_FUSION_SKETCH_ATTRIBUTES_SOFTMAXATTRIBUTES
+#define ARM_COMPUTE_DYNAMIC_FUSION_SKETCH_ATTRIBUTES_SOFTMAXATTRIBUTES
namespace arm_compute
{
-namespace cl_gemm
+namespace experimental
{
-/** Valhall based OpenCL GEMMKernel selection */
-class CLGEMMKernelSelectionValhall final : public ICLGEMMKernelSelection
+namespace dynamic_fusion
{
-public:
- /** Constructor
- *
- * @param[in] gpu GPU target
- */
- CLGEMMKernelSelectionValhall(GPUTarget gpu);
+/** Attributes are backend-agnostic parameters (in addition to the input/output tensors) of an operator.
+ */
- // Inherited overridden method
- CLGEMMKernelType select_kernel(const CLGEMMKernelSelectionParams &params) override;
+/** Softmax attributes */
+class SoftmaxAttributes
+{
+public:
+ /** Set beta */
+ SoftmaxAttributes &beta(float beta);
+ /** Get beta */
+ float beta() const;
+ /** Set is_log */
+ SoftmaxAttributes &is_log_softmax(bool is_log);
+ /** Get is_log */
+ bool is_log_softmax() const;
+ /** Set axis */
+ SoftmaxAttributes &axis(int axis);
+ /** Get axis */
+ int axis() const;
private:
- CLGEMMKernelType default_f32(unsigned int m, unsigned int n, unsigned int k, bool is_rhs_constant);
- CLGEMMKernelType default_f16(unsigned int m, unsigned int n, unsigned int k, bool is_rhs_constant);
- CLGEMMKernelType default_q8(unsigned int m, unsigned int n, unsigned int k, bool is_rhs_constant);
+ float _beta{1.f}; /**< Scaling factor for the exponent */
+ bool _is_log_softmax{}; /**< True if operation is log-softmax */
+ int _axis{}; /**< Axis/Dimension to perform the operation */
};
-} // namespace cl_gemm
+} // namespace dynamic_fusion
+} // namespace experimental
} // namespace arm_compute
-#endif /*ARM_COMPUTE_CLGEMMKERNELSELECTIONVALHALL_H */
+
+#endif /* ARM_COMPUTE_DYNAMIC_FUSION_SKETCH_ATTRIBUTES_SOFTMAXATTRIBUTES */
diff --git a/arm_compute/dynamic_fusion/sketch/gpu/GpuWorkloadContext.h b/arm_compute/dynamic_fusion/sketch/gpu/GpuWorkloadContext.h
new file mode 100644
index 0000000000..76e425513e
--- /dev/null
+++ b/arm_compute/dynamic_fusion/sketch/gpu/GpuWorkloadContext.h
@@ -0,0 +1,121 @@
+/*
+ * Copyright (c) 2022-2024 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ACL_ARM_COMPUTE_DYNAMIC_FUSION_SKETCH_GPU_GPUWORKLOADCONTEXT_H
+#define ACL_ARM_COMPUTE_DYNAMIC_FUSION_SKETCH_GPU_GPUWORKLOADCONTEXT_H
+
+#include "arm_compute/core/GPUTarget.h"
+#include "arm_compute/core/TensorInfo.h"
+
+#include <memory>
+
+namespace arm_compute
+{
+/** Forward declaration */
+class CLCompileContext;
+namespace experimental
+{
+namespace dynamic_fusion
+{
+/** Gpu Information such as the Gpu target (for example, G76) */
+using GpuTarget = ::arm_compute::GPUTarget;
+
+/** Gpu Language */
+enum class GpuLanguage
+{
+ OpenCL,
+ Unknown
+};
+/** Provides the context necessary for the creation and configuration of a workload,
+ * e.g. the Gpu target and capabilities, or the cl::Device for querying OpenCL extensions. Both can affect how a kernel is generated.
+ *
+ * This context is shared between different operators within a sketch, and has to stay valid for the entire workload creation session.
+ * This context may also be shared between different sketches.
+ *
+ * This class only contains information for workload creation, but not for runtime (e.g. cl::Queue for enqueueing the kernels)
+ */
+class GpuWorkloadContext
+{
+public:
+ class Impl;
+
+ /** Constructor */
+ GpuWorkloadContext(CLCompileContext *cl_compile_context);
+ /** Destructor */
+ ~GpuWorkloadContext();
+ /** Prohibit instances of this class to be copy constructed */
+ GpuWorkloadContext(const GpuWorkloadContext &config) = delete;
+ /** Prohibit instances of this class to be copied */
+ GpuWorkloadContext &operator=(const GpuWorkloadContext &config) = delete;
+ /** Allow instances of this class to be move constructed */
+ GpuWorkloadContext(GpuWorkloadContext &&config);
+ /** Allow instances of this class to be moved */
+ GpuWorkloadContext &operator=(GpuWorkloadContext &&config);
+ /** Get @ref GpuLanguage of the context */
+ GpuLanguage gpu_language() const;
+ /** Get @ref GpuTarget of the context */
+ GpuTarget gpu_target() const;
+ /** Get @ref CLCompileContext
+ * If the gpu language is not OpenCL, then return nullptr
+ */
+ const CLCompileContext *cl_compile_context() const;
+
+ /** Create a @ref TensorInfo associated with the workload context.
+ *
+ * @return ITensorInfo* Pointer to the newly created tensor info
+ */
+ template <typename... TArgs>
+ ITensorInfo *create_tensor_info(TArgs &&...args)
+ {
+ auto tensor_info = std::make_unique<TensorInfo>(std::forward<TArgs>(args)...);
+ auto *tensor_info_ptr = tensor_info.get();
+
+ register_user_tensor(std::move(tensor_info));
+
+ return tensor_info_ptr;
+ }
+
+ /** Get the internal implementation */
+ Impl &implementation();
+
+ /** Get the internal implementation */
+ const Impl &implementation() const;
+
+private:
+ /** Set a new ID to the tensor info and register its memory descriptor to the context.
+ *
+ * The ownership of the tensor info object will be transferred to this context object.
+ *
+ * @param[in] tensor_info @ref TensorInfo to be registered.
+ */
+ void register_user_tensor(std::unique_ptr<TensorInfo> &&tensor_info);
+
+ /** Internal implementation */
+ std::unique_ptr<Impl> _impl;
+};
+
+} // namespace dynamic_fusion
+} // namespace experimental
+} // namespace arm_compute
+
+#endif // ACL_ARM_COMPUTE_DYNAMIC_FUSION_SKETCH_GPU_GPUWORKLOADCONTEXT_H
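
A sketch of setting up a context for the OpenCL backend. The CLScheduler/CLKernelLibrary calls are assumptions based on the existing runtime API rather than something this header mandates:

```cpp
#include "arm_compute/core/CL/CLKernelLibrary.h"
#include "arm_compute/dynamic_fusion/sketch/gpu/GpuWorkloadContext.h"
#include "arm_compute/runtime/CL/CLScheduler.h"

using namespace arm_compute;
using namespace arm_compute::experimental::dynamic_fusion;

void setup_context()
{
    CLScheduler::get().default_init(); // standard OpenCL backend initialisation
    auto &cl_compile_ctx = CLKernelLibrary::get().get_compile_context();
    GpuWorkloadContext context{&cl_compile_ctx};

    // create_tensor_info() forwards its arguments to a TensorInfo constructor;
    // the context keeps ownership and hands back a non-owning pointer.
    ITensorInfo *input_info = context.create_tensor_info(TensorShape(32U, 32U, 3U), 1, DataType::F32);
    (void)input_info;
}
```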
diff --git a/arm_compute/dynamic_fusion/sketch/gpu/GpuWorkloadSketch.h b/arm_compute/dynamic_fusion/sketch/gpu/GpuWorkloadSketch.h
new file mode 100644
index 0000000000..1c738bd060
--- /dev/null
+++ b/arm_compute/dynamic_fusion/sketch/gpu/GpuWorkloadSketch.h
@@ -0,0 +1,85 @@
+/*
+ * Copyright (c) 2022-2024 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ACL_ARM_COMPUTE_DYNAMIC_FUSION_SKETCH_GPU_GPUWORKLOADSKETCH_H
+#define ACL_ARM_COMPUTE_DYNAMIC_FUSION_SKETCH_GPU_GPUWORKLOADSKETCH_H
+
+#include "arm_compute/dynamic_fusion/sketch/gpu/GpuWorkloadContext.h"
+
+#include <memory>
+
+namespace arm_compute
+{
+namespace experimental
+{
+namespace dynamic_fusion
+{
+/** A descriptor of a workload of operators
+ *
+ * A "workload" is a basic unit of computation to schedule and perform. It contains one or more operators that can be "fused" together.
+ * Note that a workload may still contain multiple kernels.
+ */
+class GpuWorkloadSketch
+{
+public:
+ /** Global context used for the creation of a workload */
+ using Context = GpuWorkloadContext;
+ /** Internal opaque implementation */
+ class Implementation;
+
+public:
+ /** Constructor
+ *
+ * @param[in] context Gpu context for the creation of a workload
+ */
+ explicit GpuWorkloadSketch(GpuWorkloadContext *context);
+
+ /** Destructor */
+ ~GpuWorkloadSketch();
+
+ /** Move constructor */
+ GpuWorkloadSketch(GpuWorkloadSketch &&);
+
+ /** Move assignment */
+ GpuWorkloadSketch &operator=(GpuWorkloadSketch &&);
+
+ /** Get the implementation */
+ Implementation &implementation();
+
+ /** Get the implementation */
+ const Implementation &implementation() const;
+
+ /** Get the gpu workload context of this sketch */
+ const GpuWorkloadContext *gpu_context() const;
+
+ /** Get the gpu workload context of this sketch */
+ GpuWorkloadContext *gpu_context();
+
+private:
+ std::unique_ptr<Implementation> _impl; /**< Internal opaque implementation*/
+};
+
+} // namespace dynamic_fusion
+} // namespace experimental
+} // namespace arm_compute
+#endif // ACL_ARM_COMPUTE_DYNAMIC_FUSION_SKETCH_GPU_GPUWORKLOADSKETCH_H
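
Operators (declared below) are fused into a sketch through their static create_op() interfaces. A minimal construction sketch, assuming a context set up as in the previous example:

```cpp
#include "arm_compute/dynamic_fusion/sketch/gpu/GpuWorkloadSketch.h"

using namespace arm_compute::experimental::dynamic_fusion;

void make_sketch(GpuWorkloadContext &context)
{
    // One sketch per workload; the context may be shared between sketches
    // and must outlive the whole workload creation session.
    GpuWorkloadSketch sketch{&context};
    (void)sketch;
}
```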
diff --git a/arm_compute/dynamic_fusion/sketch/gpu/operators/GpuAdd.h b/arm_compute/dynamic_fusion/sketch/gpu/operators/GpuAdd.h
new file mode 100644
index 0000000000..0030528884
--- /dev/null
+++ b/arm_compute/dynamic_fusion/sketch/gpu/operators/GpuAdd.h
@@ -0,0 +1,86 @@
+/*
+ * Copyright (c) 2022-2024 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ACL_ARM_COMPUTE_DYNAMIC_FUSION_SKETCH_GPU_OPERATORS_GPUADD_H
+#define ACL_ARM_COMPUTE_DYNAMIC_FUSION_SKETCH_GPU_OPERATORS_GPUADD_H
+
+#include "arm_compute/core/Error.h"
+
+namespace arm_compute
+{
+/** Forward declaration */
+class ITensorInfo;
+
+namespace experimental
+{
+namespace dynamic_fusion
+{
+/** Forward declaration */
+class GpuWorkloadContext;
+class GpuWorkloadSketch;
+
+/** Operator interface. */
+class GpuAdd final
+{
+public:
+ /** Create an operator and fuse it into the workload sketch.
+ * @note If @ref validate_op() fails, the creation also fails and may throw an error.
+ * @note If @ref validate_op() fails, @p sketch remains unchanged and valid.
+ *
+ * Valid data type configurations:
+ * |lhs |rhs |dst |
+ * |:--------------|:--------------|:-------------|
+ * |F16 |F16 |F16 |
+ * |F32 |F32 |F32 |
+ *
+ * Valid data layouts:
+ * - Any
+ *
+ * @param[in,out] sketch Workload sketch into which the operator will be fused
+ * @param[in] lhs Left hand side tensor info. Data types supported: F16/F32.
+ * @param[in] rhs Right hand side tensor info. Data types supported: Same as @p lhs.
+ *
+ * @return Pointer for the destination tensor info
+ */
+ static ITensorInfo *create_op(GpuWorkloadSketch &sketch, ITensorInfo *lhs, ITensorInfo *rhs);
+ /** Check if the operator configuration is supported, irrespective of fusion
+ *
+ * @param[in] context Workload context within which the operator is running
+ * @param[in] lhs Left hand side tensor info.
+ * @param[in] rhs Right hand side tensor info.
+ *
+ * @return Status
+ */
+ static Status is_supported_op(const GpuWorkloadContext &context, const ITensorInfo *lhs, const ITensorInfo *rhs);
+ /** Validate the operator and check if its configuration is supported and if it can be fused into the workload sketch.
+ *
+ * Parameters are similar to @ref GpuAdd::create_op()
+ *
+ * @return Status
+ */
+ static Status validate_op(const GpuWorkloadSketch &sketch, const ITensorInfo *lhs, const ITensorInfo *rhs);
+};
+} // namespace dynamic_fusion
+} // namespace experimental
+} // namespace arm_compute
+#endif // ACL_ARM_COMPUTE_DYNAMIC_FUSION_SKETCH_GPU_OPERATORS_GPUADD_H
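
Putting the pieces together, an elementwise add would be fused roughly as follows. GpuOutput (declared later in this patch) terminates the fused chain; the shapes, and the use of Status's boolean conversion, are editor's assumptions:

```cpp
#include "arm_compute/dynamic_fusion/sketch/gpu/GpuWorkloadSketch.h"
#include "arm_compute/dynamic_fusion/sketch/gpu/operators/GpuAdd.h"
#include "arm_compute/dynamic_fusion/sketch/gpu/operators/GpuOutput.h"

using namespace arm_compute;
using namespace arm_compute::experimental::dynamic_fusion;

void fuse_add(GpuWorkloadContext &context)
{
    GpuWorkloadSketch sketch{&context};

    ITensorInfo *lhs = context.create_tensor_info(TensorShape(32U, 32U), 1, DataType::F32);
    ITensorInfo *rhs = context.create_tensor_info(TensorShape(32U, 32U), 1, DataType::F32);
    ITensorInfo *dst = context.create_tensor_info(); // left uninitialised: GpuOutput auto-initialises it

    // Validate first: on failure create_op() may throw, while the sketch stays valid.
    if(bool(GpuAdd::validate_op(sketch, lhs, rhs)))
    {
        ITensorInfo *ans = GpuAdd::create_op(sketch, lhs, rhs);
        GpuOutput::create_op(sketch, ans, dst);
    }
}
```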
diff --git a/arm_compute/dynamic_fusion/sketch/gpu/operators/GpuCast.h b/arm_compute/dynamic_fusion/sketch/gpu/operators/GpuCast.h
new file mode 100644
index 0000000000..9735dcf14b
--- /dev/null
+++ b/arm_compute/dynamic_fusion/sketch/gpu/operators/GpuCast.h
@@ -0,0 +1,88 @@
+/*
+ * Copyright (c) 2022-2024 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ACL_ARM_COMPUTE_DYNAMIC_FUSION_SKETCH_GPU_OPERATORS_GPUCAST_H
+#define ACL_ARM_COMPUTE_DYNAMIC_FUSION_SKETCH_GPU_OPERATORS_GPUCAST_H
+
+#include "arm_compute/dynamic_fusion/sketch/attributes/CastAttributes.h"
+
+namespace arm_compute
+{
+namespace experimental
+{
+namespace dynamic_fusion
+{
+/** Forward declaration */
+class GpuWorkloadContext;
+class GpuWorkloadSketch;
+
+/** Operator interface. */
+class GpuCast final
+{
+public:
+ /** Attributes are a set of backend-agnostic parameters that define what an operator does */
+ using Attributes = CastAttributes;
+ /** Create an operator and fuse it into the workload sketch.
+ * @note If @ref validate_op() fails, the creation also fails and may throw an error.
+ * @note If @ref validate_op() fails, @p sketch remains unchanged and valid.
+ *
+ * Valid data type configurations:
+ * |src |dst |
+ * |:--------------|:--------------------------------------|
+ * |F16 | F32 |
+ * |F32 | F16 |
+ *
+ * The input data type must be different from the output data type.
+ *
+ * Valid data layouts:
+ * - Any
+ *
+ * @param[in,out] sketch Workload sketch into which the operator will be fused
+ * @param[in] src Source tensor info. Data types supported: F16/F32.
+ * @param[in] attributes Operator attributes
+ *
+ * @return Pointer for the destination tensor info
+ */
+ static ITensorInfo *create_op(GpuWorkloadSketch &sketch, ITensorInfo *src, const Attributes &attributes);
+ /** Check if the operator configuration is supported, irrespective of fusion
+ *
+ * @param[in] context Workload context within which the operator is running
+ * @param[in] src Source tensor info. Data types supported: All.
+ * @param[in] attributes Operator attributes
+ *
+ * @return Status
+ */
+ static Status
+ is_supported_op(const GpuWorkloadContext &context, const ITensorInfo *src, const Attributes &attributes);
+ /** Validate the operator and check if its configuration is supported and if it can be fused into the workload sketch.
+ *
+ * Parameters are similar to @ref GpuCast::create_op()
+ *
+ * @return Status
+ */
+ static Status validate_op(const GpuWorkloadSketch &sketch, const ITensorInfo *src, const Attributes &attributes);
+};
+} // namespace dynamic_fusion
+} // namespace experimental
+} // namespace arm_compute
+#endif // ACL_ARM_COMPUTE_DYNAMIC_FUSION_SKETCH_GPU_OPERATORS_GPUCAST_H
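
The is_supported_op()/validate_op()/create_op() triad is common to every operator in this patch: is_supported_op() checks a configuration against the context alone, while validate_op() additionally checks that it can be fused into a particular sketch. A sketch of the cast flow (the data_type() setter on CastAttributes is assumed from the setter/getter pattern shown earlier):

```cpp
#include "arm_compute/dynamic_fusion/sketch/gpu/GpuWorkloadSketch.h"
#include "arm_compute/dynamic_fusion/sketch/gpu/operators/GpuCast.h"

using namespace arm_compute;
using namespace arm_compute::experimental::dynamic_fusion;

void fuse_cast(GpuWorkloadContext &context, GpuWorkloadSketch &sketch, ITensorInfo *src)
{
    // Cast an F32 source down to F16, saturating on overflow.
    const GpuCast::Attributes attributes =
        CastAttributes().data_type(DataType::F16).convert_policy(ConvertPolicy::SATURATE);

    // Context-level check first, then sketch-level validation before fusing.
    if(bool(GpuCast::is_supported_op(context, src, attributes)) &&
       bool(GpuCast::validate_op(sketch, src, attributes)))
    {
        ITensorInfo *dst = GpuCast::create_op(sketch, src, attributes);
        (void)dst;
    }
}
```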
diff --git a/arm_compute/dynamic_fusion/sketch/gpu/operators/GpuClamp.h b/arm_compute/dynamic_fusion/sketch/gpu/operators/GpuClamp.h
new file mode 100644
index 0000000000..5dd77bdc8e
--- /dev/null
+++ b/arm_compute/dynamic_fusion/sketch/gpu/operators/GpuClamp.h
@@ -0,0 +1,89 @@
+/*
+ * Copyright (c) 2022-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_DYNAMIC_FUSION_SKETCH_GPU_OPERATORS_GPUCLAMP
+#define ARM_COMPUTE_DYNAMIC_FUSION_SKETCH_GPU_OPERATORS_GPUCLAMP
+
+#include "arm_compute/dynamic_fusion/sketch/attributes/ClampAttributes.h"
+
+namespace arm_compute
+{
+namespace experimental
+{
+namespace dynamic_fusion
+{
+/** Forward declaration */
+class GpuWorkloadContext;
+class GpuWorkloadSketch;
+
+/** Operator interface. */
+class GpuClamp final
+{
+public:
+ /** Attributes are a set of backend-agnostic parameters that define what an operator does */
+ using Attributes = ClampAttributes;
+
+ /** Create an operator and fuse it into the workload sketch.
+ * @note If @ref validate_op() fails, the creation also fails and may throw an error.
+ * @note If @ref validate_op() fails, @p sketch remains unchanged and valid.
+ *
+ * Valid data type configurations:
+ * |src |dst |
+ * |:--------------|:--------------|
+ * |F16 |F16 |
+ * |F32 |F32 |
+ *
+ * Valid data layouts:
+ * - All
+ *
+ * @param[in, out] sketch Workload sketch into which the operator will be fused
+ * @param[in] src Source tensor info. Data types supported: F16/F32.
+ * @param[in] attributes Operator attributes
+ *
+ * @return Pointer for the destination tensor info
+ */
+ static ITensorInfo *create_op(GpuWorkloadSketch &sketch, ITensorInfo *src, const Attributes &attributes);
+
+ /** Check if the operator configuration is supported, irrespective of fusion
+ *
+ * @param[in] context Workload context within which the operator is running
+ * @param[in] src Source tensor info. Data types supported: F16/F32.
+ * @param[in] attributes Operator attributes
+ *
+ * @return Status
+ */
+ static Status
+ is_supported_op(const GpuWorkloadContext &context, const ITensorInfo *src, const Attributes &attributes);
+
+ /** Validate the operator and check if it can be fused into the workload sketch.
+ *
+ * Parameters are similar to @ref GpuClamp::create_op()
+ *
+ * @return Status
+ */
+ static Status validate_op(const GpuWorkloadSketch &sketch, const ITensorInfo *src, const Attributes &attributes);
+};
+} // namespace dynamic_fusion
+} // namespace experimental
+} // namespace arm_compute
+#endif // ARM_COMPUTE_DYNAMIC_FUSION_SKETCH_GPU_OPERATORS_GPUCLAMP
diff --git a/arm_compute/dynamic_fusion/sketch/gpu/operators/GpuConv2d.h b/arm_compute/dynamic_fusion/sketch/gpu/operators/GpuConv2d.h
new file mode 100644
index 0000000000..da7e860757
--- /dev/null
+++ b/arm_compute/dynamic_fusion/sketch/gpu/operators/GpuConv2d.h
@@ -0,0 +1,99 @@
+/*
+ * Copyright (c) 2022-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_DYNAMIC_FUSION_SKETCH_GPU_OPERATORS_GPUCONV2D
+#define ARM_COMPUTE_DYNAMIC_FUSION_SKETCH_GPU_OPERATORS_GPUCONV2D
+
+#include "arm_compute/core/Error.h"
+#include "arm_compute/dynamic_fusion/sketch/attributes/Conv2dAttributes.h"
+
+namespace arm_compute
+{
+namespace experimental
+{
+namespace dynamic_fusion
+{
+/** Forward declaration */
+class GpuWorkloadContext;
+class GpuWorkloadSketch;
+
+/** Operator interface. */
+class GpuConv2d final
+{
+public:
+ /** Attributes are a set of backend-agnostic parameters that define what an operator does */
+ using Attributes = Conv2dAttributes;
+ /** Create an operator and fuse it into the workload sketch.
+ * @note If @ref validate_op() fails, the creation also fails and may throw an error.
+ * @note If @ref validate_op() fails, @p sketch remains unchanged and valid.
+ *
+ * Valid data type configurations:
+ * |src |wei |bia |dst |
+ * |:--------------|:--------------|:--------------|:--------------|
+ * |F16 |F16 |F16 |F16 |
+ * |F32 |F32 |F32 |F32 |
+ *
+ * Valid data layouts:
+ * - NHWC
+ *
+ * @param[in,out] sketch Workload sketch into which the operator will be fused
+ * @param[in] src Source tensor
+ * @param[in] wei Weight tensor
+ * @param[in] bia (Optional) Bias tensor
+ * @param[in] attributes Operator attributes
+ *
+ * @return Pointer for the destination tensor info
+ */
+ static ITensorInfo *create_op(
+ GpuWorkloadSketch &sketch, ITensorInfo *src, ITensorInfo *wei, ITensorInfo *bia, const Attributes &attributes);
+ /** Check if the operator configuration is supported, irrespective of fusion
+ *
+ * @param[in] context Workload context within which the operator is running
+ * @param[in] src Source tensor
+ * @param[in] wei Weight tensor
+ * @param[in] bia (Optional) Bias tensor
+ * @param[in] attributes Operator attributes
+ *
+ * @return Status
+ */
+ static Status is_supported_op(const GpuWorkloadContext &context,
+ const ITensorInfo *src,
+ const ITensorInfo *wei,
+ const ITensorInfo *bia,
+ const Attributes &attributes);
+ /** Check if the operator configuration is supported and if it can be fused into the workload sketch.
+ *
+ * Parameters are similar to @ref GpuConv2d::create_op()
+ *
+ * @return Status
+ */
+ static Status validate_op(const GpuWorkloadSketch &sketch,
+ const ITensorInfo *src,
+ const ITensorInfo *wei,
+ const ITensorInfo *bia,
+ const Attributes &attributes);
+};
+} // namespace dynamic_fusion
+} // namespace experimental
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_DYNAMIC_FUSION_SKETCH_GPU_OPERATORS_GPUCONV2D */
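
A sketch of fusing a convolution, combining Conv2dAttributes with the four-tensor signature above. Shapes follow ACL's NHWC dimension order (C, W, H, N) and are purely illustrative:

```cpp
#include "arm_compute/dynamic_fusion/sketch/gpu/GpuWorkloadSketch.h"
#include "arm_compute/dynamic_fusion/sketch/gpu/operators/GpuConv2d.h"

using namespace arm_compute;
using namespace arm_compute::experimental::dynamic_fusion;

void fuse_conv2d(GpuWorkloadContext &context, GpuWorkloadSketch &sketch)
{
    // 3x3 convolution over a 32x32x16 image, producing 8 output channels.
    ITensorInfo *src = context.create_tensor_info(TensorShape(16U, 32U, 32U, 1U), 1, DataType::F32, DataLayout::NHWC);
    ITensorInfo *wei = context.create_tensor_info(TensorShape(16U, 3U, 3U, 8U), 1, DataType::F32, DataLayout::NHWC);
    ITensorInfo *bia = context.create_tensor_info(TensorShape(8U), 1, DataType::F32, DataLayout::NHWC);

    const GpuConv2d::Attributes attributes = Conv2dAttributes().pad(Padding2D{1, 1, 1, 1}).stride(Size2D{1U, 1U});

    if(bool(GpuConv2d::validate_op(sketch, src, wei, bia, attributes)))
    {
        ITensorInfo *dst = GpuConv2d::create_op(sketch, src, wei, bia, attributes);
        (void)dst;
    }
}
```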
diff --git a/arm_compute/dynamic_fusion/sketch/gpu/operators/GpuDepthwiseConv2d.h b/arm_compute/dynamic_fusion/sketch/gpu/operators/GpuDepthwiseConv2d.h
new file mode 100644
index 0000000000..958569efd7
--- /dev/null
+++ b/arm_compute/dynamic_fusion/sketch/gpu/operators/GpuDepthwiseConv2d.h
@@ -0,0 +1,100 @@
+/*
+ * Copyright (c) 2022-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_DYNAMIC_FUSION_SKETCH_GPU_OPERATORS_GPUDEPTHWISECONV2D
+#define ARM_COMPUTE_DYNAMIC_FUSION_SKETCH_GPU_OPERATORS_GPUDEPTHWISECONV2D
+
+#include "arm_compute/dynamic_fusion/sketch/attributes/DepthwiseConv2dAttributes.h"
+
+namespace arm_compute
+{
+namespace experimental
+{
+namespace dynamic_fusion
+{
+/** Forward declaration */
+class GpuWorkloadContext;
+class GpuWorkloadSketch;
+
+/** Operator interface. */
+class GpuDepthwiseConv2d final
+{
+public:
+ /** Attributes are a set of backend-agnostic parameters that define what an operator does */
+ using Attributes = DepthwiseConv2dAttributes;
+ /** Create an operator and fuse it into the workload sketch.
+ * @note If @ref validate_op() fails, the creation also fails and may throw an error.
+ * @note If @ref validate_op() fails, @p sketch remains unchanged and valid.
+ *
+ * Valid data type configurations:
+ * |src |wei |bia |dst |
+ * |:--------------|:--------------|:--------------|:--------------|
+ * |F16 |F16 |F16 |F16 |
+ * |F32 |F32 |F32 |F32 |
+ *
+ * Valid data layouts:
+ * - NHWC
+ *
+ * @param[in,out] sketch Workload sketch into which the operator will be fused
+ * @param[in] src Source tensor
+ * @param[in] wei Weight tensor
+ * @param[in] bia (Optional) Bias tensor
+ * @param[in] attributes Operator attributes
+ *
+ * @return Pointer for the destination tensor info
+ */
+ static ITensorInfo *create_op(
+ GpuWorkloadSketch &sketch, ITensorInfo *src, ITensorInfo *wei, ITensorInfo *bia, const Attributes &attributes);
+
+ /** Check if the operator configuration is supported, irrespective of fusion
+ *
+ * @param[in] context Workload context within which the operator is running
+ * @param[in] src Source tensor
+ * @param[in] wei Weight tensor
+ * @param[in] bia (Optional) Bias tensor
+ * @param[in] attributes Operator attributes
+ *
+ * @return Status
+ */
+ static Status is_supported_op(const GpuWorkloadContext &context,
+ const ITensorInfo *src,
+ const ITensorInfo *wei,
+ const ITensorInfo *bia,
+ const Attributes &attributes);
+
+ /** Check if the operator configuration is supported and if it can be fused into the workload sketch.
+ *
+ * Parameters are similar to @ref GpuDepthwiseConv2d::create_op()
+ *
+ * @return Status
+ */
+ static Status validate_op(const GpuWorkloadSketch &sketch,
+ const ITensorInfo *src,
+ const ITensorInfo *wei,
+ const ITensorInfo *bia,
+ const Attributes &attributes);
+};
+} // namespace dynamic_fusion
+} // namespace experimental
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_DYNAMIC_FUSION_SKETCH_GPU_OPERATORS_GPUDEPTHWISECONV2D */
diff --git a/arm_compute/dynamic_fusion/sketch/gpu/operators/GpuMatMul.h b/arm_compute/dynamic_fusion/sketch/gpu/operators/GpuMatMul.h
new file mode 100644
index 0000000000..6ba5b1db93
--- /dev/null
+++ b/arm_compute/dynamic_fusion/sketch/gpu/operators/GpuMatMul.h
@@ -0,0 +1,132 @@
+/*
+ * Copyright (c) 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ACL_ARM_COMPUTE_DYNAMIC_FUSION_SKETCH_GPU_OPERATORS_GPUMATMUL_H
+#define ACL_ARM_COMPUTE_DYNAMIC_FUSION_SKETCH_GPU_OPERATORS_GPUMATMUL_H
+
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/KernelDescriptors.h"
+#include "arm_compute/dynamic_fusion/sketch/attributes/MatMulAttributes.h"
+
+namespace arm_compute
+{
+namespace experimental
+{
+namespace dynamic_fusion
+{
+/** Forward declaration */
+class GpuWorkloadContext;
+class GpuWorkloadSketch;
+
+/** Operator backend specific settings
+*/
+class GpuMatMulSettings
+{
+public:
+ /** Set N0: number of columns processed by each work-item */
+ GpuMatMulSettings &n0(int n0);
+ /** Get N0: number of columns processed by each work-item */
+ int n0() const;
+
+ /** Set M0: number of rows processed by each work-item */
+ GpuMatMulSettings &m0(int m0);
+ /** Get M0: number of rows processed by each work-item */
+ int m0() const;
+
+ /** Set K0: number of inner accumulations */
+ GpuMatMulSettings &k0(int k0);
+ /** Get K0: number of inner accumulations */
+ int k0() const;
+
+private:
+ int _n0{0}; /**< Number of columns processed by each work-item */
+ int _m0{0}; /**< Number of rows processed by each work-item */
+ int _k0{0}; /**< Number of inner accumulations */
+};
+
+/** Operator interface. */
+class GpuMatMul final
+{
+public:
+ /** Attributes are a set of backend-agnostic parameters that define what an operator does */
+ using Attributes = MatMulAttributes;
+ /** Settings are a set of backend-specific parameters that influence the implementation of an operator */
+ using Settings = GpuMatMulSettings;
+
+ /** Create an operator and fuse it into the workload sketch.
+ * @note If @ref validate_op() fails, the creation also fails and may throw an error.
+ * @note If @ref validate_op() fails, @p sketch remains unchanged and valid.
+ *
+ * Valid data type configurations:
+ * |LHS |RHS |dst |
+ * |:--------------|:--------------|:--------------|
+ * |F16 |F16 |F16 |
+ * |F32 |F32 |F32 |
+ *
+ *
+ * @param[in,out] sketch Workload sketch into which the operator will be fused
+ * @param[in] lhs Input tensor info for the LHS matrix. Data type supported: F32/F16. Dimensions above 2 are collapsed onto dimension 2 and represent the batch.
+ * @param[in] rhs Input tensor info for the RHS matrix. Data type supported: same as @p lhs. Dimensions above 2 are collapsed onto dimension 2 and represent the batch.
+ * @param[in] attributes Operator attributes
+ * @param[in] settings Operator settings
+ *
+ * @return Pointer for the destination tensor info
+ */
+ static ITensorInfo *create_op(GpuWorkloadSketch &sketch,
+ ITensorInfo *lhs,
+ ITensorInfo *rhs,
+ const Attributes &attributes,
+ const Settings &settings);
+
+ /** Check if the operator configuration is supported, irrespective of fusion
+ *
+ * @param[in] context Workload context within which the operator is running
+ * @param[in] lhs Input tensor info for the LHS matrix.
+ * @param[in] rhs Input tensor info for the RHS matrix.
+ * @param[in] attributes Operator attributes
+ * @param[in] settings Operator settings
+ *
+ * @return Status
+ */
+ static Status is_supported_op(const GpuWorkloadContext &context,
+ const ITensorInfo *lhs,
+ const ITensorInfo *rhs,
+ const Attributes &attributes,
+ const Settings &settings);
+
+ /** Check if the operator configuration is supported and if it can be fused into the workload sketch.
+ *
+ * Parameters are similar to @ref GpuMatMul::create_op()
+ *
+ * @return Status
+ */
+ static Status validate_op(const GpuWorkloadSketch &sketch,
+ const ITensorInfo *lhs,
+ const ITensorInfo *rhs,
+ const Attributes &attributes,
+ const Settings &settings);
+};
+} // namespace dynamic_fusion
+} // namespace experimental
+} // namespace arm_compute
+#endif // ACL_ARM_COMPUTE_DYNAMIC_FUSION_SKETCH_GPU_OPERATORS_GPUMATMUL_H
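
GpuMatMul is the first operator in the patch to pair backend-agnostic Attributes with backend-specific Settings: the attributes say what is computed, the settings say how. A sketch of the two composing (the tile sizes are illustrative and would normally be tuned per target):

```cpp
#include "arm_compute/dynamic_fusion/sketch/gpu/GpuWorkloadSketch.h"
#include "arm_compute/dynamic_fusion/sketch/gpu/operators/GpuMatMul.h"

using namespace arm_compute;
using namespace arm_compute::experimental::dynamic_fusion;

void fuse_matmul(GpuWorkloadSketch &sketch, ITensorInfo *lhs, ITensorInfo *rhs)
{
    // What to compute: a plain matrix multiply, no operand transposed.
    const GpuMatMul::Attributes attributes = MatMulAttributes().adj_lhs(false).adj_rhs(false);
    // How to compute it: each work-item produces a 4x4 output tile with 4 inner accumulations per step.
    const GpuMatMul::Settings settings = GpuMatMulSettings().m0(4).n0(4).k0(4);

    if(bool(GpuMatMul::validate_op(sketch, lhs, rhs, attributes, settings)))
    {
        ITensorInfo *dst = GpuMatMul::create_op(sketch, lhs, rhs, attributes, settings);
        (void)dst;
    }
}
```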
diff --git a/arm_compute/dynamic_fusion/sketch/gpu/operators/GpuMul.h b/arm_compute/dynamic_fusion/sketch/gpu/operators/GpuMul.h
new file mode 100644
index 0000000000..d13e4a3cad
--- /dev/null
+++ b/arm_compute/dynamic_fusion/sketch/gpu/operators/GpuMul.h
@@ -0,0 +1,89 @@
+/*
+ * Copyright (c) 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_DYNAMIC_FUSION_SKETCH_GPU_OPERATORS_GPUMUL
+#define ARM_COMPUTE_DYNAMIC_FUSION_SKETCH_GPU_OPERATORS_GPUMUL
+
+#include "arm_compute/core/Error.h"
+
+namespace arm_compute
+{
+/** Forward declaration */
+class ITensorInfo;
+
+namespace experimental
+{
+namespace dynamic_fusion
+{
+/** Forward declaration */
+class GpuWorkloadContext;
+class GpuWorkloadSketch;
+
+/** Operator interface. */
+class GpuMul final
+{
+public:
+ /** Create an operator and fuse it into the workload sketch.
+ * @note If @ref validate_op() fails, the creation also fails and may throw an error.
+ * @note If @ref validate_op() fails, @p sketch remains unchanged and valid.
+ *
+ * Valid data type configurations:
+ * |lhs |rhs |dst |
+ * |:--------------|:--------------|:-------------|
+ * |F16 |F16 |F16 |
+ * |F32 |F32 |F32 |
+ *
+ * Valid data layouts:
+ * - Any
+ *
+ * @param[in,out] sketch Workload sketch into which the operator will be fused
+ * @param[in] lhs Left hand side tensor info. Data types supported: F16/F32.
+ * @param[in] rhs Right hand side tensor info. Data types supported: Same as @p lhs.
+ *
+ * @return Pointer for the destination tensor info
+ */
+ static ITensorInfo *create_op(GpuWorkloadSketch &sketch, ITensorInfo *lhs, ITensorInfo *rhs);
+
+ /** Check if the operator configuration is supported, irrespective of fusion
+ *
+ * @param[in] context Workload context within which the operator is running
+ * @param[in] lhs Left hand side tensor info. Data types supported: F16/F32.
+ * @param[in] rhs Right hand side tensor info. Data types supported: Same as @p lhs.
+ *
+ * @return Status
+ */
+ static Status is_supported_op(const GpuWorkloadContext &context, const ITensorInfo *lhs, const ITensorInfo *rhs);
+
+ /** Validate the operator and check if the configuration is supported and if it can be fused into the workload sketch.
+ *
+ * Parameters are similar to @ref GpuMul::create_op()
+ *
+ * @return Status
+ */
+ static Status validate_op(const GpuWorkloadSketch &sketch, const ITensorInfo *lhs, const ITensorInfo *rhs);
+};
+
+} // namespace dynamic_fusion
+} // namespace experimental
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_DYNAMIC_FUSION_SKETCH_GPU_OPERATORS_GPUMUL */
diff --git a/arm_compute/dynamic_fusion/sketch/gpu/operators/GpuOutput.h b/arm_compute/dynamic_fusion/sketch/gpu/operators/GpuOutput.h
new file mode 100644
index 0000000000..deb5559b9d
--- /dev/null
+++ b/arm_compute/dynamic_fusion/sketch/gpu/operators/GpuOutput.h
@@ -0,0 +1,84 @@
+/*
+ * Copyright (c) 2022-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef ARM_COMPUTE_DYNAMIC_FUSION_SKETCH_GPU_OPERATORS_GPUOUTPUT
+#define ARM_COMPUTE_DYNAMIC_FUSION_SKETCH_GPU_OPERATORS_GPUOUTPUT
+
+#include "arm_compute/core/ITensorInfo.h"
+
+namespace arm_compute
+{
+namespace experimental
+{
+namespace dynamic_fusion
+{
+/** Forward declaration */
+class GpuWorkloadContext;
+class GpuWorkloadSketch;
+
+/** Operator interface. */
+class GpuOutput final
+{
+public:
+ /** Create an operator and fuse it into the workload sketch.
+ * @note If @ref validate_op() fails, the creation also fails and may throw an error.
+ * @note If @ref validate_op() fails, @p sketch remains unchanged and valid.
+ *
+ * Valid data type configurations:
+ * - Any
+ *
+ * Valid data layouts:
+ * - Any
+ *
+ * @param[in, out] sketch Workload sketch into which the operator will be fused.
+ * @param[in, out] src Source tensor info.
+ * @param[in, out] dst Destination tensor info.
+ * If an uninitialized ITensorInfo is passed in, it will be auto-initialized.
+ */
+ static void create_op(GpuWorkloadSketch &sketch, ITensorInfo *src, ITensorInfo *dst);
+
+ /** Check if the operator configuration is supported, irrespective of fusion.
+ *
+ * @param[in] context Workload context within which the operator is running.
+ * @param[in] src Source tensor info.
+ * @param[in] dst Destination tensor info.
+ *
+ * @return Status
+ */
+ static Status is_supported_op(const GpuWorkloadContext &context, const ITensorInfo *src, const ITensorInfo *dst);
+
+ /** Validate the operator and check if its configuration is supported and if it can be fused into the workload sketch.
+ *
+ * Parameters are similar to @ref GpuOutput::create_op().
+ *
+ * @return Status
+ */
+ static Status validate_op(const GpuWorkloadSketch &sketch, const ITensorInfo *src, const ITensorInfo *dst);
+};
+
+} // namespace dynamic_fusion
+} // namespace experimental
+} // namespace arm_compute
+
+#endif /* ARM_COMPUTE_DYNAMIC_FUSION_SKETCH_GPU_OPERATORS_GPUOUTPUT */
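
As a rough usage illustration, GpuOutput is typically the last operator fused into a sketch, sinking the chain's result into a user-visible tensor; a hedged sketch using only the declarations above (sketch construction and tensor infos assumed):

    void sink_result(GpuWorkloadSketch &sketch, ITensorInfo *chain_dst, ITensorInfo *user_dst)
    {
        // validate_op() fails without touching the sketch, so probing is safe
        if (bool(GpuOutput::validate_op(sketch, chain_dst, user_dst)))
        {
            // An uninitialised user_dst would be auto-initialised here
            GpuOutput::create_op(sketch, chain_dst, user_dst);
        }
    }
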
diff --git a/arm_compute/dynamic_fusion/sketch/gpu/operators/GpuPool2d.h b/arm_compute/dynamic_fusion/sketch/gpu/operators/GpuPool2d.h
new file mode 100644
index 0000000000..a1b0c334d1
--- /dev/null
+++ b/arm_compute/dynamic_fusion/sketch/gpu/operators/GpuPool2d.h
@@ -0,0 +1,106 @@
+/*
+ * Copyright (c) 2023-2024 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ACL_ARM_COMPUTE_DYNAMIC_FUSION_SKETCH_GPU_OPERATORS_GPUPOOL2D_H
+#define ACL_ARM_COMPUTE_DYNAMIC_FUSION_SKETCH_GPU_OPERATORS_GPUPOOL2D_H
+
+#include "arm_compute/core/Error.h"
+#include "arm_compute/dynamic_fusion/sketch/attributes/Pool2dAttributes.h"
+
+namespace arm_compute
+{
+namespace experimental
+{
+namespace dynamic_fusion
+{
+/** Forward declaration */
+class GpuWorkloadSketch;
+class GpuWorkloadContext;
+
+/** Operator backend-specific settings */
+class GpuPool2dSettings
+{
+public:
+ /** Get whether -infinity is used as the limit value */
+ bool use_inf_as_limit() const;
+
+ /** Set whether to use -infinity as the limit value */
+ GpuPool2dSettings use_inf_as_limit(bool use_inf_as_limit);
+
+private:
+ bool _use_inf_as_limit{true};
+};
+
+/** Operator interface. */
+class GpuPool2d final
+{
+public:
+ /** Attributes are a set of backend-agnostic parameters that define what an operator does */
+ using Attributes = Pool2dAttributes;
+ /** Settings are a set of backend-specific parameters that influence the implementation of an operator */
+ using Settings = GpuPool2dSettings;
+
+ /** Create an operator and fuse it into the workload sketch.
+ * @note If @ref validate_op() fails, the creation also fails and may throw an error.
+ * @note If @ref validate_op() fails, @p sketch remains unchanged and valid.
+ *
+ * Valid data type configurations:
+ * |src |dst |
+ * |:--------------|:--------------|
+ * |F16 |F16 |
+ * |F32 |F32 |
+ *
+ * Valid data layouts:
+ * - NHWC
+ *
+ * @param[in,out] sketch Workload sketch into which the operator will be fused
+ * @param[in] src Source tensor info. Data types supported: F16/F32.
+ * @param[in] attributes Operator attributes
+ * @param[in] settings Operator settings
+ *
+ * @return Pointer for the destination tensor info
+ */
+ static ITensorInfo *
+ create_op(GpuWorkloadSketch &sketch, ITensorInfo *src, const Attributes &attributes, const Settings &settings);
+ /** Check if the operator configuration is supported, irrespective of fusion
+ *
+ * @param[in] context Workload context within which the operator is running
+ * @param[in] src Source tensor info. Data types supported: F16/F32.
+ * @param[in] attributes Operator attributes
+ * @param[in] settings Operator settings
+ *
+ * @return Status
+ */
+ static Status is_supported_op(const GpuWorkloadContext &context,
+ const ITensorInfo *src,
+ const Attributes &attributes,
+ const Settings &settings);
+ /** Validate the operator and check if its configuration is supported and if it can be fused into the workload sketch.
+ *
+ * Parameters are similar to @ref GpuPool2d::create_op()
+ *
+ * @return Status
+ */
+ static Status validate_op(const GpuWorkloadSketch &sketch,
+ const ITensorInfo *src,
+ const Attributes &attributes,
+ const Settings &settings);
+};
+} // namespace dynamic_fusion
+} // namespace experimental
+} // namespace arm_compute
+#endif // ACL_ARM_COMPUTE_DYNAMIC_FUSION_SKETCH_GPU_OPERATORS_GPUPOOL2D_H
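
To make the Attributes/Settings split concrete, a hedged sketch; `context`, `sketch` and `src_info` are assumed to exist, and the Pool2dAttributes fluent setters are assumptions based on its own header:

    // Backend-specific settings: the setter returns an updated settings object
    const GpuPool2d::Settings settings = GpuPool2dSettings().use_inf_as_limit(true);
    // Backend-agnostic attributes: pool type/size/stride would be populated via
    // Pool2dAttributes' setters (see Pool2dAttributes.h)
    GpuPool2d::Attributes attributes{};
    if (bool(GpuPool2d::is_supported_op(context, src_info, attributes, settings)))
    {
        ITensorInfo *dst_info = GpuPool2d::create_op(sketch, src_info, attributes, settings);
        // dst_info feeds any subsequent operator fused into the same sketch
    }
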
diff --git a/arm_compute/dynamic_fusion/sketch/gpu/operators/GpuReshape.h b/arm_compute/dynamic_fusion/sketch/gpu/operators/GpuReshape.h
new file mode 100644
index 0000000000..dc194fcadb
--- /dev/null
+++ b/arm_compute/dynamic_fusion/sketch/gpu/operators/GpuReshape.h
@@ -0,0 +1,88 @@
+/*
+ * Copyright (c) 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_DYNAMIC_FUSION_SKETCH_GPU_OPERATORS_GPURESHAPE
+#define ARM_COMPUTE_DYNAMIC_FUSION_SKETCH_GPU_OPERATORS_GPURESHAPE
+
+#include "arm_compute/dynamic_fusion/sketch/attributes/ReshapeAttributes.h"
+
+namespace arm_compute
+{
+/** Forward declaration */
+class ITensorInfo;
+
+namespace experimental
+{
+namespace dynamic_fusion
+{
+/** Forward declaration */
+class GpuWorkloadContext;
+class GpuWorkloadSketch;
+
+/** Operator interface. */
+class GpuReshape final
+{
+public:
+ /** Attributes are a set of backend-agnostic parameters that define what an operator does */
+ using Attributes = ReshapeAttributes;
+
+ /** Create an operator and fuse it into the workload sketch.
+ * @note If @ref validate_op() fails, the creation also fails and may throw an error.
+ * @note If @ref validate_op() fails, @p sketch remains unchanged and valid.
+ *
+ * Valid data type configurations:
+ * - Any
+ *
+ * Valid data layouts:
+ * - Any
+ *
+ * @param[in,out] sketch Workload sketch into which the operator will be fused
+ * @param[in] src Input tensor info. Data type supported: All
+ * @param[in] attributes Operator attributes
+ *
+ * @return Pointer for the destination tensor info
+ */
+ static ITensorInfo *create_op(GpuWorkloadSketch &sketch, ITensorInfo *src, const Attributes &attributes);
+ /** Check if the operator configuration is supported, irrespective of fusion
+ *
+ * @param[in] context Workload context within which the operator is running
+ * @param[in] src Input tensor info.
+ * @param[in] attributes Operator attributes
+ *
+ * @return Status
+ */
+ static Status
+ is_supported_op(const GpuWorkloadContext &context, const ITensorInfo *src, const Attributes &attributes);
+ /** Validate the operator and check if its configuration is supported and if it can be fused into the workload sketch.
+ *
+ * Parameters are similar to @ref GpuReshape::create_op()
+ *
+ * @return Status
+ */
+ static Status validate_op(const GpuWorkloadSketch &sketch, const ITensorInfo *src, const Attributes &attributes);
+};
+
+} // namespace dynamic_fusion
+} // namespace experimental
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_DYNAMIC_FUSION_SKETCH_GPU_OPERATORS_GPURESHAPE */
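
A brief hedged illustration of the attribute-driven unary pattern above; the shape() setter on ReshapeAttributes is an assumption based on its header, and `sketch`/`src_info` are assumed to exist:

    GpuReshape::Attributes attributes{};
    // attributes.shape(TensorShape(32U, 16U)); // assumed ReshapeAttributes setter
    if (bool(GpuReshape::validate_op(sketch, src_info, attributes)))
    {
        ITensorInfo *dst_info = GpuReshape::create_op(sketch, src_info, attributes);
        // dst_info carries the reshaped dimensions for downstream operators
    }
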
diff --git a/arm_compute/dynamic_fusion/sketch/gpu/operators/GpuResize.h b/arm_compute/dynamic_fusion/sketch/gpu/operators/GpuResize.h
new file mode 100644
index 0000000000..e02f9dafea
--- /dev/null
+++ b/arm_compute/dynamic_fusion/sketch/gpu/operators/GpuResize.h
@@ -0,0 +1,89 @@
+/*
+ * Copyright (c) 2022-2024 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef ACL_ARM_COMPUTE_DYNAMIC_FUSION_SKETCH_GPU_OPERATORS_GPURESIZE_H
+#define ACL_ARM_COMPUTE_DYNAMIC_FUSION_SKETCH_GPU_OPERATORS_GPURESIZE_H
+
+#include "arm_compute/dynamic_fusion/sketch/attributes/ResizeAttributes.h"
+
+namespace arm_compute
+{
+namespace experimental
+{
+namespace dynamic_fusion
+{
+/** Forward declaration */
+class GpuWorkloadContext;
+class GpuWorkloadSketch;
+
+/** Operator interface. */
+class GpuResize final
+{
+public:
+ /** Attributes are a set of backend-agnostic parameters that define what an operator does */
+ using Attributes = ResizeAttributes;
+
+ /** Create an operator and fuse it into the workload sketch.
+ * @note If @ref validate_op() fails, the creation also fails and may throw an error.
+ * @note If @ref validate_op() fails, @p sketch remains unchanged and valid.
+ *
+ * Valid data type configurations:
+ * |src |dst |
+ * |:--------------|:--------------|
+ * |F16 |F16 |
+ * |F32 |F32 |
+ *
+ * Valid data layouts:
+ * - NHWC
+ *
+ * @param[in,out] sketch Workload sketch into which the operator will be fused
+ * @param[in] src Source tensor info. Data types supported: F16/F32.
+ * @param[in] attributes Operator attributes
+ *
+ * @return Pointer for the destination tensor info
+ */
+ static ITensorInfo *create_op(GpuWorkloadSketch &sketch, ITensorInfo *src, const Attributes &attributes);
+ /** Check if the operator configuration is supported, irrespective of fusion
+ *
+ * @param[in] context Workload context within which the operator is running
+ * @param[in] src Source tensor info. Data types supported: F16/F32.
+ * @param[in] attributes Operator attributes
+ *
+ * @return Status
+ */
+ static Status
+ is_supported_op(const GpuWorkloadContext &context, const ITensorInfo *src, const Attributes &attributes);
+ /** Validate the operator and check if its configuration is supported and if it can be fused into the workload sketch.
+ *
+ * Parameters are similar to @ref GpuResize::create_op()
+ *
+ * @return Status
+ */
+ static Status validate_op(const GpuWorkloadSketch &sketch, const ITensorInfo *src, const Attributes &attributes);
+};
+
+} // namespace dynamic_fusion
+} // namespace experimental
+} // namespace arm_compute
+#endif // ACL_ARM_COMPUTE_DYNAMIC_FUSION_SKETCH_GPU_OPERATORS_GPURESIZE_H
diff --git a/arm_compute/dynamic_fusion/sketch/gpu/operators/GpuSigmoid.h b/arm_compute/dynamic_fusion/sketch/gpu/operators/GpuSigmoid.h
new file mode 100644
index 0000000000..798b84b906
--- /dev/null
+++ b/arm_compute/dynamic_fusion/sketch/gpu/operators/GpuSigmoid.h
@@ -0,0 +1,86 @@
+/*
+ * Copyright (c) 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef ARM_COMPUTE_DYNAMIC_FUSION_SKETCH_GPU_OPERATORS_GPUSIGMOID
+#define ARM_COMPUTE_DYNAMIC_FUSION_SKETCH_GPU_OPERATORS_GPUSIGMOID
+
+#include "arm_compute/core/Types.h"
+
+namespace arm_compute
+{
+namespace experimental
+{
+namespace dynamic_fusion
+{
+/** Forward declaration */
+class GpuWorkloadContext;
+class GpuWorkloadSketch;
+
+/** Operator interface. */
+class GpuSigmoid final
+{
+public:
+ /** Create an operator and fuse it into the workload sketch.
+ * @note If @ref validate_op() fails, the creation also fails and may throw an error.
+ * @note If @ref validate_op() fails, @p sketch remains unchanged and valid.
+ *
+ * Valid data type configurations:
+ * |src |dst |
+ * |:--------------|:--------------|
+ * |F16 |F16 |
+ * |F32 |F32 |
+ *
+ * Valid data layouts:
+ * - All
+ *
+ * @param[in, out] sketch Workload sketch into which the operator will be fused
+ * @param[in] src Source tensor info. Data types supported: F16/F32.
+ *
+ * @return Pointer for the destination tensor info
+ */
+ static ITensorInfo *create_op(GpuWorkloadSketch &sketch, ITensorInfo *src);
+
+ /** Check if the operator configuration is supported, irrespective of fusion
+ *
+ * @param[in] context Workload context within which the operator is running
+ * @param[in] src Source tensor info. Data types supported: F16/F32.
+ *
+ * @return Status
+ */
+ static Status is_supported_op(const GpuWorkloadContext &context, const ITensorInfo *src);
+
+ /** Validate the operator and check if it can be fused into the workload sketch.
+ *
+ * Parameters are similar to @ref GpuSigmoid::create_op()
+ *
+ * @return Status
+ */
+ static Status validate_op(const GpuWorkloadSketch &sketch, const ITensorInfo *src);
+};
+
+} // namespace dynamic_fusion
+} // namespace experimental
+} // namespace arm_compute
+
+#endif //ARM_COMPUTE_DYNAMIC_FUSION_SKETCH_GPU_OPERATORS_GPUSIGMOID
diff --git a/arm_compute/dynamic_fusion/sketch/gpu/operators/GpuSoftmax.h b/arm_compute/dynamic_fusion/sketch/gpu/operators/GpuSoftmax.h
new file mode 100644
index 0000000000..66c2d77310
--- /dev/null
+++ b/arm_compute/dynamic_fusion/sketch/gpu/operators/GpuSoftmax.h
@@ -0,0 +1,92 @@
+/*
+ * Copyright (c) 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_DYNAMIC_FUSION_SKETCH_GPU_OPERATORS_GPUSOFTMAX
+#define ARM_COMPUTE_DYNAMIC_FUSION_SKETCH_GPU_OPERATORS_GPUSOFTMAX
+
+#include "arm_compute/core/ITensorInfo.h"
+#include "arm_compute/dynamic_fusion/sketch/attributes/SoftmaxAttributes.h"
+
+namespace arm_compute
+{
+namespace experimental
+{
+namespace dynamic_fusion
+{
+/** Forward declaration */
+class GpuWorkloadContext;
+class GpuWorkloadSketch;
+
+/** Operator interface. */
+class GpuSoftmax final
+{
+public:
+ /** Attributes are a set of backend-agnostic parameters that define what an operator does */
+ using Attributes = SoftmaxAttributes;
+
+ /** Create an operator and fuse it into the workload sketch.
+ * @note If @ref validate_op() fails, the creation also fails and may throw an error.
+ * @note If @ref validate_op() fails, @p sketch remains unchanged and valid.
+ *
+ * Valid data type configurations:
+ * |src |dst |
+ * |:--------------|:--------------|
+ * |F16 |F16 |
+ * |F32 |F32 |
+ *
+ * Valid data layouts:
+ * - All
+ *
+ * @param[in,out] sketch Workload sketch into which the operator will be fused
+ * @param[in] src Source tensor info.
+ * @param[in] dst Destination tensor info.
+ * @param[in] attributes Operator attributes
+ */
+ static void create_op(GpuWorkloadSketch &sketch, ITensorInfo *src, ITensorInfo *dst, const Attributes &attributes);
+ /** Check if the operator configuration is supported, irrespective of fusion
+ *
+ * @param[in] context Workload context within which the operator is running
+ * @param[in] src Source tensor info.
+ * @param[in] dst Destination tensor info.
+ * @param[in] attributes Operator attributes
+ *
+ * @return Status
+ */
+ static Status is_supported_op(const GpuWorkloadContext &context,
+ const ITensorInfo *src,
+ const ITensorInfo *dst,
+ const Attributes &attributes);
+ /** Validate the operator and check if its configuration is supported and if it can be fused into the workload sketch.
+ *
+ * Parameters are similar to @ref GpuSoftmax::create_op()
+ *
+ * @return Status
+ */
+ static Status validate_op(const GpuWorkloadSketch &sketch,
+ const ITensorInfo *src,
+ const ITensorInfo *dst,
+ const Attributes &attributes);
+};
+} // namespace dynamic_fusion
+} // namespace experimental
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_DYNAMIC_FUSION_SKETCH_GPU_OPERATORS_GPUSOFTMAX */
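
Unlike the operators above that return a destination info, GpuSoftmax writes into a caller-provided one; a hedged sketch (the beta()/axis() setters on SoftmaxAttributes are assumptions based on its header; `sketch`/`src_info` assumed):

    GpuSoftmax::Attributes attributes{};
    // attributes.beta(1.0f).axis(0); // assumed SoftmaxAttributes setters
    TensorInfo dst_info{}; // destination info; auto-initialisation during fusion is assumed
    if (bool(GpuSoftmax::validate_op(sketch, src_info, &dst_info, attributes)))
    {
        GpuSoftmax::create_op(sketch, src_info, &dst_info, attributes);
    }
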
diff --git a/arm_compute/dynamic_fusion/sketch/gpu/operators/GpuSub.h b/arm_compute/dynamic_fusion/sketch/gpu/operators/GpuSub.h
new file mode 100644
index 0000000000..8c44ec5750
--- /dev/null
+++ b/arm_compute/dynamic_fusion/sketch/gpu/operators/GpuSub.h
@@ -0,0 +1,88 @@
+/*
+ * Copyright (c) 2023-2024 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ACL_ARM_COMPUTE_DYNAMIC_FUSION_SKETCH_GPU_OPERATORS_GPUSUB_H
+#define ACL_ARM_COMPUTE_DYNAMIC_FUSION_SKETCH_GPU_OPERATORS_GPUSUB_H
+
+#include "arm_compute/core/Error.h"
+
+namespace arm_compute
+{
+/** Forward declaration */
+class ITensorInfo;
+
+namespace experimental
+{
+namespace dynamic_fusion
+{
+/** Forward declaration */
+class GpuWorkloadContext;
+class GpuWorkloadSketch;
+
+/** Operator interface. */
+class GpuSub final
+{
+public:
+ /** Create an operator and fuse it into the workload sketch.
+ * @note If @ref validate_op() fails, the creation also fails and may throw an error.
+ * @note If @ref validate_op() fails, @p sketch remains unchanged and valid.
+ *
+ * Valid data type configurations:
+ * |lhs |rhs |dst |
+ * |:--------------|:--------------|:-------------|
+ * |F16 |F16 |F16 |
+ * |F32 |F32 |F32 |
+ *
+ * Valid data layouts:
+ * - Any
+ *
+ * @param[in,out] sketch Workload sketch into which the operator will be fused
+ * @param[in] lhs Left hand side tensor info. Data types supported: F16/F32.
+ * @param[in] rhs Right hand side tensor info. Data types supported: Same as @p lhs.
+ *
+ * @return Pointer for the destination tensor info
+ */
+ static ITensorInfo *create_op(GpuWorkloadSketch &sketch, ITensorInfo *lhs, ITensorInfo *rhs);
+
+ /** Check if the operator configuration is supported, irrespective of fusion
+ *
+ * @param[in] context Workload context within which the operator is running
+ * @param[in] lhs Left hand side tensor info. Data types supported: F16/F32.
+ * @param[in] rhs Right hand side tensor info. Data types supported: Same as @p lhs.
+ *
+ * @return Status
+ */
+ static Status is_supported_op(const GpuWorkloadContext &context, const ITensorInfo *lhs, const ITensorInfo *rhs);
+
+ /** Validate the operator and check if its configuration is supported and if it can be fused into the workload sketch.
+ *
+ * Parameters are similar to @ref GpuSub::create_op()
+ *
+ * @return Status
+ */
+ static Status validate_op(const GpuWorkloadSketch &sketch, const ITensorInfo *lhs, const ITensorInfo *rhs);
+};
+} // namespace dynamic_fusion
+} // namespace experimental
+} // namespace arm_compute
+#endif // ACL_ARM_COMPUTE_DYNAMIC_FUSION_SKETCH_GPU_OPERATORS_GPUSUB_H
diff --git a/arm_compute/dynamic_fusion/sketch/gpu/operators/GpuTanh.h b/arm_compute/dynamic_fusion/sketch/gpu/operators/GpuTanh.h
new file mode 100644
index 0000000000..9c0ce6de02
--- /dev/null
+++ b/arm_compute/dynamic_fusion/sketch/gpu/operators/GpuTanh.h
@@ -0,0 +1,86 @@
+/*
+ * Copyright (c) 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef ARM_COMPUTE_DYNAMIC_FUSION_SKETCH_GPU_OPERATORS_GPUTANH
+#define ARM_COMPUTE_DYNAMIC_FUSION_SKETCH_GPU_OPERATORS_GPUTANH
+
+#include "arm_compute/core/Types.h"
+
+namespace arm_compute
+{
+namespace experimental
+{
+namespace dynamic_fusion
+{
+/** Forward declaration */
+class GpuWorkloadContext;
+class GpuWorkloadSketch;
+
+/** Operator interface. */
+class GpuTanh final
+{
+public:
+ /** Create an operator and fuse it into the workload sketch.
+ * @note If @ref validate_op() fails, the creation also fails and may throw an error.
+ * @note If @ref validate_op() fails, @p sketch remains unchanged and valid.
+ *
+ * Valid data type configurations:
+ * |src |dst |
+ * |:--------------|:--------------|
+ * |F16 |F16 |
+ * |F32 |F32 |
+ *
+ * Valid data layouts:
+ * - All
+ *
+ * @param[in, out] sketch Workload sketch into which the operator will be fused
+ * @param[in] src Source tensor info. Data types supported: F16/F32.
+ *
+ * @return Pointer for the destination tensor info
+ */
+ static ITensorInfo *create_op(GpuWorkloadSketch &sketch, ITensorInfo *src);
+
+ /** Check if the operator configuration is supported, irrespective of fusion
+ *
+ * @param[in] context Workload context within which the operator is running
+ * @param[in] src Source tensor info. Data types supported: F16/F32.
+ *
+ * @return Status
+ */
+ static Status is_supported_op(const GpuWorkloadContext &context, const ITensorInfo *src);
+
+ /** Validate the operator and check if it can be fused into the workload sketch.
+ *
+ * Parameters are similar to @ref GpuTanh::create_op()
+ *
+ * @return Status
+ */
+ static Status validate_op(const GpuWorkloadSketch &sketch, const ITensorInfo *src);
+};
+
+} // namespace dynamic_fusion
+} // namespace experimental
+} // namespace arm_compute
+
+#endif //ARM_COMPUTE_DYNAMIC_FUSION_SKETCH_GPU_OPERATORS_GPUTANH
diff --git a/arm_compute/function_info/ActivationLayerInfo.h b/arm_compute/function_info/ActivationLayerInfo.h
new file mode 100644
index 0000000000..9390d0c54f
--- /dev/null
+++ b/arm_compute/function_info/ActivationLayerInfo.h
@@ -0,0 +1,136 @@
+/*
+ * Copyright (c) 2016-2024 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ACL_ARM_COMPUTE_FUNCTION_INFO_ACTIVATIONLAYERINFO_H
+#define ACL_ARM_COMPUTE_FUNCTION_INFO_ACTIVATIONLAYERINFO_H
+
+#include "arm_compute/core/CoreTypes.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/QuantizationInfo.h"
+
+#include <array>
+#include <memory>
+
+#ifdef __aarch64__
+#include <arm_neon.h>
+#endif // __aarch64__
+
+namespace arm_compute
+{
+/** Available activation functions */
+enum class ActivationFunction
+{
+ LOGISTIC, /**< Logistic ( \f$ f(x) = \frac{1}{1 + e^{-x}} \f$ ) */
+ TANH, /**< Hyperbolic tangent ( \f$ f(x) = a \cdot tanh(b \cdot x) \f$ ) */
+ RELU, /**< Rectifier ( \f$ f(x) = max(0,x) \f$ ) */
+ BOUNDED_RELU, /**< Upper Bounded Rectifier ( \f$ f(x) = min(a, max(0,x)) \f$ ) */
+ LU_BOUNDED_RELU, /**< Lower and Upper Bounded Rectifier ( \f$ f(x) = min(a, max(b,x)) \f$ ) */
+ LEAKY_RELU, /**< Leaky Rectifier ( \f$ f(x) = \begin{cases} \alpha x & \quad \text{if } x \text{ < 0}\\ x & \quad \text{if } x \geq \text{ 0 } \end{cases} \f$ ) */
+ SOFT_RELU, /**< Soft Rectifier ( \f$ f(x)= log(1+e^x) \f$ ) */
+ ELU, /**< Exponential Linear Unit ( \f$ f(x) = \begin{cases} \alpha (exp(x) - 1) & \quad \text{if } x \text{ < 0}\\ x & \quad \text{if } x \geq \text{ 0 } \end{cases} \f$ ) */
+ ABS, /**< Absolute ( \f$ f(x)= |x| \f$ ) */
+ SQUARE, /**< Square ( \f$ f(x)= x^2 \f$ )*/
+ SQRT, /**< Square root ( \f$ f(x) = \sqrt{x} \f$ )*/
+ LINEAR, /**< Linear ( \f$ f(x)= ax + b \f$ ) */
+ IDENTITY, /**< Identity ( \f$ f(x)= x \f$ ) */
+ HARD_SWISH, /**< Hard-swish ( \f$ f(x) = (x \text{ReLU6}(x+3))/6 = x \min(\max(0,x+3),6)/6 \f$ ) */
+ SWISH, /**< Swish ( \f$ f(x) = \frac{x}{1 + e^{-ax}} = x \text{logistic}(ax) \f$ ) */
+ GELU /**< GELU ( \f$ f(x) = \frac{x}{2} \left( 1 + erf\left( \frac{x}{\sqrt{2}} \right) \right) \f$ ) */
+};
+/** Activation Layer Information class */
+class ActivationLayerInfo
+{
+public:
+ typedef arm_compute::ActivationFunction ActivationFunction;
+
+ /** Lookup table */
+#ifdef __aarch64__
+ using LookupTable256 = std::array<qasymm8_t, 256>;
+ using LookupTable65536 = std::array<float16_t, 65536>;
+#endif // __aarch64__
+
+ ActivationLayerInfo() = default;
+ /** Constructor
+ *
+ * @param[in] f The activation function to use.
+ * @param[in] a (Optional) The alpha parameter used by some activation functions
+ * (@ref ActivationFunction::BOUNDED_RELU, @ref ActivationFunction::LU_BOUNDED_RELU, @ref ActivationFunction::LINEAR, @ref ActivationFunction::TANH).
+ * @param[in] b (Optional) The beta parameter used by some activation functions (@ref ActivationFunction::LINEAR, @ref ActivationFunction::LU_BOUNDED_RELU, @ref ActivationFunction::TANH).
+ */
+ ActivationLayerInfo(ActivationFunction f, float a = 0.0f, float b = 0.0f) : _act(f), _a(a), _b(b), _enabled(true)
+ {
+ }
+ /** Get the type of activation function */
+ ActivationFunction activation() const
+ {
+ return _act;
+ }
+ /** Get the alpha value */
+ float a() const
+ {
+ return _a;
+ }
+ /** Get the beta value */
+ float b() const
+ {
+ return _b;
+ }
+ /** Check if initialised */
+ bool enabled() const
+ {
+ return _enabled;
+ }
+
+#ifdef __aarch64__
+ const LookupTable256 &lut() const
+ {
+ return _lut;
+ }
+ void setLookupTable256(LookupTable256 &lut)
+ {
+ _lut = std::move(lut);
+ }
+
+ const LookupTable65536 &lut_fp16() const
+ {
+ ARM_COMPUTE_ERROR_ON(_lut_fp16 == nullptr);
+ return *_lut_fp16;
+ }
+ void setLookupTable65536(std::shared_ptr<LookupTable65536> lut)
+ {
+ _lut_fp16 = lut;
+ }
+#endif // __aarch64__
+private:
+ ActivationFunction _act = {ActivationLayerInfo::ActivationFunction::IDENTITY};
+ float _a = {};
+ float _b = {};
+ bool _enabled = {false};
+
+#ifdef __aarch64__
+ LookupTable256 _lut = {};
+ std::shared_ptr<LookupTable65536> _lut_fp16{nullptr};
+#endif // __aarch64__
+};
+} // namespace arm_compute
+#endif // ACL_ARM_COMPUTE_FUNCTION_INFO_ACTIVATIONLAYERINFO_H
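
A small usage note for the class above: a default-constructed info stays disabled, which is how callers signal "no fused activation"; a minimal sketch using only members shown here:

    // ReLU6 expressed as a bounded rectifier with upper limit a = 6
    const ActivationLayerInfo relu6(ActivationFunction::BOUNDED_RELU, 6.0f);
    const ActivationLayerInfo no_act{}; // disabled by default
    // relu6.enabled() == true, relu6.a() == 6.0f; no_act.enabled() == false
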
diff --git a/arm_compute/function_info/ConvolutionInfo.h b/arm_compute/function_info/ConvolutionInfo.h
new file mode 100644
index 0000000000..4830cae137
--- /dev/null
+++ b/arm_compute/function_info/ConvolutionInfo.h
@@ -0,0 +1,50 @@
+/*
+ * Copyright (c) 2016-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ACL_ARM_COMPUTE_FUNCTION_INFO_CONVOLUTIONINFO
+#define ACL_ARM_COMPUTE_FUNCTION_INFO_CONVOLUTIONINFO
+
+#include "arm_compute/core/CoreTypes.h"
+#include "arm_compute/core/Size2D.h"
+#include "arm_compute/function_info/ActivationLayerInfo.h"
+
+namespace arm_compute
+{
+struct ConvolutionInfo
+{
+ ConvolutionInfo() = default;
+ ConvolutionInfo(const PadStrideInfo &pad_stride_info,
+ unsigned int depth_multiplier,
+ const ActivationLayerInfo &act_info,
+ const Size2D &dilation)
+ : pad_stride_info(pad_stride_info), depth_multiplier(depth_multiplier), act_info(act_info), dilation(dilation)
+ {
+ }
+ PadStrideInfo pad_stride_info{}; /**< Convolution info (Pads, strides,...) */
+ unsigned int depth_multiplier{
+ 1}; /**< Multiplier to apply to input's depth to retrieve the output depth. Defaults to 1 */
+ ActivationLayerInfo act_info{}; /**< Fused activation to apply after convolution. */
+ Size2D dilation{Size2D(1, 1)}; /**< Dilation, in elements, across x and y. Defaults to (1, 1). */
+};
+} // namespace arm_compute
+#endif /* ACL_ARM_COMPUTE_FUNCTION_INFO_CONVOLUTIONINFO */
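
For instance, a hedged construction of the aggregate above for a stride-1, pad-1 convolution with a fused ReLU; PadStrideInfo's (stride_x, stride_y, pad_x, pad_y) constructor is taken from the core types:

    const ConvolutionInfo conv_info(PadStrideInfo(1, 1, 1, 1), // stride 1, pad 1
                                    1,                         // depth_multiplier
                                    ActivationLayerInfo(ActivationFunction::RELU),
                                    Size2D(1, 1));             // no dilation
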
diff --git a/arm_compute/function_info/FullyConnectedLayerInfo.h b/arm_compute/function_info/FullyConnectedLayerInfo.h
new file mode 100644
index 0000000000..e65daeb2d4
--- /dev/null
+++ b/arm_compute/function_info/FullyConnectedLayerInfo.h
@@ -0,0 +1,71 @@
+/*
+ * Copyright (c) 2016-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ACL_ARM_COMPUTE_FUNCTION_INFO_FULLYCONNECTEDLAYERINFO
+#define ACL_ARM_COMPUTE_FUNCTION_INFO_FULLYCONNECTEDLAYERINFO
+
+#include "arm_compute/core/CoreTypes.h"
+#include "arm_compute/function_info/ActivationLayerInfo.h"
+
+namespace arm_compute
+{
+/** Fully connected layer info */
+struct FullyConnectedLayerInfo
+{
+ /* Fused-activation parameters */
+ ActivationLayerInfo activation_info{}; /**< Fused activation to apply after the matrix multiplication. */
+ /* Information about weights */
+ DataLayout weights_trained_layout{DataLayout::NCHW}; /**< Layout that the weights have been trained with. */
+ bool transpose_weights{true}; /**< Transpose weights if true. */
+ bool are_weights_reshaped{false}; /**< @deprecated Reshape the weights tensor if false. */
+ bool retain_internal_weights{false}; /**< Retain internal reshaped weights. */
+ bool enable_fast_math{false}; /**< Enable fast math computation. */
+ /* Other parameters */
+ bool fp_mixed_precision{false}; /**< Use wider accumulators (32 bit instead of 16 for FP16) to improve accuracy. */
+
+ /** Sets the weights trained data layout
+ *
+ * @param[in] layout Data layout that the weights were trained with
+ *
+ * @return Updated object
+ */
+ FullyConnectedLayerInfo &set_weights_trained_layout(DataLayout layout)
+ {
+ weights_trained_layout = layout;
+ return *this;
+ }
+ /** Sets the transpose weights flag
+ *
+ * @param[in] should_transpose_weights Boolean flag indicating if weights should be transposed
+ *
+ * @return Updated object
+ */
+ FullyConnectedLayerInfo &set_transpose_weights(bool should_transpose_weights)
+ {
+ transpose_weights = should_transpose_weights;
+ return *this;
+ }
+};
+
+} // namespace arm_compute
+#endif /* ACL_ARM_COMPUTE_FUNCTION_INFO_FULLYCONNECTEDLAYERINFO */
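
Both setters above return *this, so configuration chains naturally; a minimal sketch:

    FullyConnectedLayerInfo fc_info{};
    // Weights trained in NHWC and already laid out for the kernel
    fc_info.set_weights_trained_layout(DataLayout::NHWC).set_transpose_weights(false);
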
diff --git a/arm_compute/function_info/GEMMInfo.h b/arm_compute/function_info/GEMMInfo.h
new file mode 100644
index 0000000000..74fe30454e
--- /dev/null
+++ b/arm_compute/function_info/GEMMInfo.h
@@ -0,0 +1,358 @@
+/*
+ * Copyright (c) 2016-2024 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ACL_ARM_COMPUTE_FUNCTION_INFO_GEMMINFO_H
+#define ACL_ARM_COMPUTE_FUNCTION_INFO_GEMMINFO_H
+
+#include "arm_compute/core/CoreTypes.h"
+#include "arm_compute/function_info/ActivationLayerInfo.h"
+
+#include <vector>
+
+namespace arm_compute
+{
+class ITensorInfo;
+/** GEMMLowp output stage type */
+enum class GEMMLowpOutputStageType
+{
+ NONE, /**< No quantization */
+ QUANTIZE_DOWN, /**< Quantize using an integer multiplication */
+ QUANTIZE_DOWN_FIXEDPOINT, /**< Quantize using a fixed point multiplication */
+ QUANTIZE_DOWN_FLOAT /**< Quantize using a floating point multiplication */
+};
+
+/** GEMMLowp output stage info */
+struct GEMMLowpOutputStageInfo
+{
+ GEMMLowpOutputStageType type{GEMMLowpOutputStageType::NONE}; /**< GEMMLowp output stage type */
+ int32_t gemmlowp_offset{0}; /**< GEMMLowp output stage offset used for quantizing to QASYMM8 */
+ int32_t gemmlowp_multiplier{0}; /**< GEMMLowp output stage multiplier used for quantizing to QASYMM8 */
+ int32_t gemmlowp_shift{0}; /**< GEMMLowp output stage shift used for quantizing to uint8 */
+ int32_t gemmlowp_min_bound{
+ std::numeric_limits<int32_t>::
+ lowest()}; /**< GEMMLowp min value used to saturate down the output result before converting back to QASYMM8 */
+ int32_t gemmlowp_max_bound{
+ std::numeric_limits<int32_t>::
+ max()}; /**< GEMMLowp max value used to saturate down the output result before converting back to QASYMM8 */
+ std::vector<int32_t> gemmlowp_multipliers{}; /**< GEMMLowp output stage multiplier used for quantizing to QASYMM8 */
+ std::vector<int32_t> gemmlowp_shifts{}; /**< GEMMLowp output stage shifts used for quantizing to QASYMM8 */
+ float gemmlowp_real_multiplier{0}; /**< GEMMLowp output stage real multiplier used for quantizing to QASYMM8 */
+ bool is_quantized_per_channel{false}; /**< GEMMLowp quantized per-channel flag */
+ DataType output_data_type{
+ DataType::UNKNOWN}; /**< Output tensor data type to use if the output is not initialized */
+};
+/** GEMM information class. This class stores the necessary information to compute GEMM functions
+ *
+ * This object also contains the information about how matrix A and matrix B have been reshaped
+ *
+ */
+class GEMMInfo
+{
+public:
+ /** Default constructor */
+ GEMMInfo() noexcept
+ : _is_a_reshaped(false),
+ _is_b_reshaped(false),
+ _reshape_b_only_on_first_run(true),
+ _depth_output_gemm3d(0),
+ _reinterpret_input_as_3d(false),
+ _retain_internal_weights(false),
+ _gemmlowp_output_stage(),
+ _fast_math(false),
+ _fp_mixed_precision(false),
+ _broadcast_bias(false),
+ _pretranspose_A(false),
+ _pretranspose_B(false),
+ _activation_info(),
+ _fixed_format(false),
+ _weight_format(arm_compute::WeightFormat::UNSPECIFIED),
+ _accumulate(false)
+ {
+ }
+ /** Constructor
+ *
+ * @param[in] is_a_reshaped True if the matrix A has been reshaped
+ * @param[in] is_b_reshaped True if the matrix B has been reshaped
+ * @param[in] reshape_b_only_on_first_run Reshape matrix B only for the first run
+ * @param[in] depth_output_gemm3d (Optional) Depth (third dimension) of the output tensor to be used with the GEMM3D kernel
+ * If 0 the output will not be reinterpreted as 3D. Default 0
+ * @param[in] reinterpret_input_as_3d (Optional) Reinterpret the input as 3D tensor. (i.e. this flag should be set to true when GEMM is used
+ * to perform 1x1 convolutions with the NHWC data layout)
+ * @param[in] retain_internal_weights (Optional) Retain the weights tensor from previous run
+ * @param[in] gemmlowp_output_stage (Optional) GEMMLowp Output stage info
+ * @param[in] fp_mixed_precision (Optional) Use wider accumulators (32 bit instead of 16 for FP16) to improve accuracy.
+ * @param[in] fast_math (Optional) Use a data type of shorter width to improve performance
+ * @param[in] broadcast_bias (Optional) Broadcast the shape of the bias tensor from a vector to a matrix.
+ * @param[in] activation_info (Optional) Activation to apply after the matrix multiplication
+ * @param[in] fixed_format (Optional) Specify the selection of fixed format kernels for variable weights support in GEMM. These kernels expect the weights tensor to be in a memory format that is fixed by the kernel itself. For more information, see arm_compute::WeightFormat.
+ * @param[in] weight_format (Optional) arm_compute::WeightFormat enumeration requested by the user. Default is arm_compute::WeightFormat::UNSPECIFIED.
+ * @param[in] pretranspose_B (Optional) Pretranspose matrix B (transposition of its lowest 2 dimensions), in addition to and before, any further transformations of B
+ * @param[in] accumulate (Optional) Whether to accumulate in destination or not
+ */
+ GEMMInfo(bool is_a_reshaped,
+ bool is_b_reshaped,
+ bool reshape_b_only_on_first_run,
+ int depth_output_gemm3d = 0,
+ bool reinterpret_input_as_3d = false,
+ bool retain_internal_weights = false,
+ GEMMLowpOutputStageInfo gemmlowp_output_stage = GEMMLowpOutputStageInfo(),
+ bool fp_mixed_precision = false,
+ bool fast_math = false,
+ bool broadcast_bias = false,
+ const ActivationLayerInfo &activation_info = ActivationLayerInfo(),
+ bool fixed_format = false,
+ arm_compute::WeightFormat weight_format = arm_compute::WeightFormat::UNSPECIFIED,
+ bool pretranspose_B = false,
+ bool accumulate = false) noexcept
+ : _is_a_reshaped(is_a_reshaped),
+ _is_b_reshaped(is_b_reshaped),
+ _reshape_b_only_on_first_run(reshape_b_only_on_first_run),
+ _depth_output_gemm3d(depth_output_gemm3d),
+ _reinterpret_input_as_3d(reinterpret_input_as_3d),
+ _retain_internal_weights(retain_internal_weights),
+ _gemmlowp_output_stage(gemmlowp_output_stage),
+ _fast_math(fast_math),
+ _fp_mixed_precision(fp_mixed_precision),
+ _broadcast_bias(broadcast_bias),
+ _pretranspose_A(false),
+ _pretranspose_B(pretranspose_B),
+ _activation_info(activation_info),
+ _fixed_format(fixed_format),
+ _weight_format(weight_format),
+ _accumulate(accumulate)
+ {
+ }
+ /** Flag which specifies if the matrix A has been reshaped
+ *
+ * @return True if the matrix A has been reshaped
+ */
+ bool is_a_reshaped() const
+ {
+ return _is_a_reshaped;
+ };
+ /** Flag which specifies if the matrix B has been reshaped
+ *
+ * @return True if the matrix B has been reshaped
+ */
+ bool is_b_reshaped() const
+ {
+ return _is_b_reshaped;
+ };
+ /** Flag which specifies if the reshape of matrix B should be executed only for the first run
+ *
+ * @note This flag could be set to TRUE when GEMM is used to accelerate a convolution layer
+ *
+ * @return True if the reshape of matrix B happens only for the first run
+ */
+ bool reshape_b_only_on_first_run() const
+ {
+ return _reshape_b_only_on_first_run;
+ };
+ /** Depth of the output when GEMM output is reinterpreted as 3D tensor
+ *
+ * @return the depth of the output tensor
+ */
+ int depth_output_gemm3d() const
+ {
+ return _depth_output_gemm3d;
+ };
+ /** Flag which specifies if the input tensor has to be reinterpreted as 3D
+ *
+ * @return True if the input tensor has to be reinterpreted as 3D tensor
+ */
+ bool reinterpret_input_as_3d() const
+ {
+ return _reinterpret_input_as_3d;
+ };
+ /** Flag which specifies if the weights tensor has to be retained from previous run
+ *
+ * @return True if the weights tensor has to be retained
+ */
+ bool retain_internal_weights() const
+ {
+ return _retain_internal_weights;
+ };
+ /** GEMMLowp output stage
+ *
+ * @return the GEMMLowp output stage info
+ */
+ GEMMLowpOutputStageInfo gemmlowp_output_stage() const
+ {
+ return _gemmlowp_output_stage;
+ };
+ /** Sets GEMMLowp output stage
+ *
+ * @param[in] output_stage Output stage to set
+ */
+ void set_gemmlowp_output_stage(GEMMLowpOutputStageInfo &output_stage)
+ {
+ _gemmlowp_output_stage = output_stage;
+ };
+ /** Flag which specifies if a wider accumulator should be used.
+ *
+ * @return True if a wider accumulator has to be used
+ */
+ bool fp_mixed_precision() const
+ {
+ return _fp_mixed_precision;
+ };
+ /** Flag which specifies if a shorter accumulator is to be used.
+ *
+ * @return True if a shorter accumulator has to be used
+ */
+ bool fast_math() const
+ {
+ return _fast_math;
+ };
+ /** Set fast math flag
+ *
+ * @param[in] fast_math Flag to set
+ */
+ void set_fast_math(bool fast_math)
+ {
+ _fast_math = fast_math;
+ }
+ /** Flag which specifies whether to broadcast the shape of the bias tensor.
+ *
+ * @return True if the shape of the bias tensor is to be broadcasted.
+ */
+ bool broadcast_bias() const
+ {
+ return _broadcast_bias;
+ };
+ /** Flag which specifies whether A should be pre-transposed if supported.
+ *
+ * @return True if A should be pre-transposed else false.
+ */
+ bool pretranspose_A() const
+ {
+ return _pretranspose_A;
+ };
+ /** Set pre-transpose A flag
+ *
+ * @param[in] flag Flag to set
+ */
+ void set_pretranspose_A(bool flag)
+ {
+ _pretranspose_A = flag;
+ }
+ /** Flag which specifies whether b should be pre-transposed if supported.
+ * More concretely, the "pre-transpose" is the transposition of the b tensor's lowest 2 dimensions.
+ * If specified true, this pre-transpose will occur in addition to, and before, any further transformations of the b matrix
+ *
+ * @return True if b should be pre-transposed else false.
+ */
+ bool pretranspose_B() const
+ {
+ return _pretranspose_B;
+ };
+ /** Set pre-transpose b flag
+ *
+ * @param[in] flag Flag to set
+ */
+ void set_pretranspose_B(bool flag)
+ {
+ _pretranspose_B = flag;
+ }
+ /** Activation layer to apply after the matrix multiplication
+ *
+ * @return ActivationLayerInfo object
+ */
+ ActivationLayerInfo activation_info() const
+ {
+ return _activation_info;
+ }
+ /** Set activation layer info
+ *
+ * @param[in] activation_info ActivationLayerInfo object to set
+ */
+ void set_activation_info(const ActivationLayerInfo &activation_info)
+ {
+ _activation_info = activation_info;
+ }
+ /** Flag which specifies if the GEMM operation is running fixed-format kernels.
+ *
+ * @return True if the GEMM operation is running fixed-format kernel else false.
+ */
+ bool fixed_format() const
+ {
+ return _fixed_format;
+ }
+ /** Flag which specifies if GEMM should accumulate the result in destination or not.
+ *
+ * @return True if GEMM is accumulating the result.
+ */
+ bool accumulate() const
+ {
+ return _accumulate;
+ }
+ /** Set fixed-format flag
+ *
+ * @param[in] fixed_format sets whether or not to use fixed-format kernels
+ */
+ void set_fixed_format(bool fixed_format)
+ {
+ _fixed_format = fixed_format;
+ }
+ /** Set accumulate flag
+ *
+ * @param[in] accumulate sets whether or not to use accumulation
+ */
+ void set_accumulate(bool accumulate)
+ {
+ _accumulate = accumulate;
+ }
+
+ arm_compute::WeightFormat weight_format() const
+ {
+ return _weight_format;
+ }
+ /** Set weight format to be used
+ *
+ * @param[in] weight_format arm_compute::WeightFormat enumeration
+ */
+ void set_weight_format(arm_compute::WeightFormat weight_format)
+ {
+ _weight_format = weight_format;
+ }
+
+private:
+ bool _is_a_reshaped;
+ bool _is_b_reshaped;
+ bool _reshape_b_only_on_first_run;
+ int _depth_output_gemm3d;
+ bool _reinterpret_input_as_3d;
+ bool _retain_internal_weights;
+ GEMMLowpOutputStageInfo _gemmlowp_output_stage;
+ bool _fast_math;
+ bool _fp_mixed_precision;
+ bool _broadcast_bias;
+ bool _pretranspose_A;
+ bool _pretranspose_B;
+ ActivationLayerInfo _activation_info;
+ bool _fixed_format;
+ arm_compute::WeightFormat _weight_format;
+ bool _accumulate;
+};
+} //namespace arm_compute
+#endif // ACL_ARM_COMPUTE_FUNCTION_INFO_GEMMINFO_H
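
Given the long positional constructor above, a hedged configuration example; argument roles follow the parameter list documented on the constructor:

    // GEMM driving a 1x1 NHWC convolution: input reinterpreted as 3D,
    // B reshaped only on the first run, and a ReLU fused afterwards.
    GEMMInfo gemm_info(false, // is_a_reshaped
                       false, // is_b_reshaped
                       true,  // reshape_b_only_on_first_run
                       0,     // depth_output_gemm3d
                       true); // reinterpret_input_as_3d
    gemm_info.set_activation_info(ActivationLayerInfo(ActivationFunction::RELU));
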
diff --git a/arm_compute/core/utils/misc/CRTP.h b/arm_compute/function_info/MatMulInfo.h
index 037c69ab1d..fc73efb44a 100644
--- a/arm_compute/core/utils/misc/CRTP.h
+++ b/arm_compute/function_info/MatMulInfo.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018-2019 ARM Limited.
+ * Copyright (c) 2016-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -21,35 +21,42 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
-#ifndef ARM_COMPUTE_MISC_CRTP_H
-#define ARM_COMPUTE_MISC_CRTP_H
+#ifndef ACL_ARM_COMPUTE_FUNCTION_INFO_MATMULINFO
+#define ACL_ARM_COMPUTE_FUNCTION_INFO_MATMULINFO
namespace arm_compute
{
-namespace misc
-{
-/** Curiously recurring template pattern Interface */
-template <typename T, template <typename> class Type>
-struct CRTP
+/** Class for holding information related to matrix multiplication function
+ */
+class MatMulInfo
{
public:
- /** Exact type */
- using ExactType = T;
-
-protected:
- const T &impl() const
+ /** Get Adjoint LHS flag value */
+ bool adj_lhs() const
+ {
+ return _adj_lhs;
+ }
+ /** Get Adjoint RHS flag value */
+ bool adj_rhs() const
+ {
+ return _adj_rhs;
+ }
+ /** Set Adjoint LHS flag */
+ MatMulInfo &adj_lhs(bool adj_lhs)
{
- return static_cast<const T &>(*this);
+ _adj_lhs = adj_lhs;
+ return *this;
}
- T &impl()
+ /** Set Adjoint RHS flag */
+ MatMulInfo &adj_rhs(bool adj_rhs)
{
- return static_cast<T &>(*this);
+ _adj_rhs = adj_rhs;
+ return *this;
}
private:
- CRTP() = default;
- friend Type<T>;
+ bool _adj_lhs{false};
+ bool _adj_rhs{false};
};
-} // namespace misc
} // namespace arm_compute
-#endif /* ARM_COMPUTE_MISC_CRTP_H */
+#endif /* ACL_ARM_COMPUTE_FUNCTION_INFO_MATMULINFO */
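
The adj_* setters return *this, so a matrix multiplication with a transposed (adjoint) LHS is configured by chaining; a minimal sketch:

    const MatMulInfo matmul_info = MatMulInfo().adj_lhs(true).adj_rhs(false);
    // matmul_info.adj_lhs() == true: the LHS is transposed before multiplication
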
diff --git a/arm_compute/runtime/CL/tuners/Tuners.h b/arm_compute/function_info/ScatterInfo.h
index 05d5b68160..176a863ac5 100644
--- a/arm_compute/runtime/CL/tuners/Tuners.h
+++ b/arm_compute/function_info/ScatterInfo.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018-2019 ARM Limited.
+ * Copyright (c) 2024 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -21,36 +21,34 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
-#ifndef ARM_COMPUTE_TUNERS_H
-#define ARM_COMPUTE_TUNERS_H
-#include "arm_compute/runtime/CL/tuners/BifrostTuner.h"
-#include "arm_compute/runtime/CL/tuners/MidgardTuner.h"
+#ifndef ACL_ARM_COMPUTE_FUNCTION_INFO_SCATTERINFO_H
+#define ACL_ARM_COMPUTE_FUNCTION_INFO_SCATTERINFO_H
-#include <memory>
+#include "arm_compute/core/Error.h"
namespace arm_compute
{
-namespace tuners
+/** Scatter Function */
+enum class ScatterFunction
{
-/** Tuner factory class */
-class TunerFactory final
+ Update = 0,
+ Add = 1,
+ Sub = 2,
+ Max = 3,
+ Min = 4
+};
+/** Scatter operator information */
+struct ScatterInfo
{
-public:
- static std::unique_ptr<ICLTuner> create_tuner(GPUTarget target)
+ ScatterInfo(ScatterFunction f, bool zero) : func(f), zero_initialization(zero)
{
- GPUTarget arch = get_arch_from_target(target);
- switch(arch)
- {
- case GPUTarget::BIFROST:
- return support::cpp14::make_unique<BifrostTuner>();
- case GPUTarget::MIDGARD:
- return support::cpp14::make_unique<MidgardTuner>();
- default:
- return nullptr;
- }
+ ARM_COMPUTE_ERROR_ON_MSG(f != ScatterFunction::Add && zero,
+ "Zero initialisation is only supported with Add Scatter Function.");
}
+ ScatterFunction func; /**< Type of scatter function to use with the scatter operator */
+ bool zero_initialization{false}; /**< Fill output tensors with 0. Only available with the Add scatter function. */
};
-} // namespace tuners
} // namespace arm_compute
-#endif /*ARM_COMPUTE_TUNERS_H */
+
+#endif // ACL_ARM_COMPUTE_FUNCTION_INFO_SCATTERINFO_H
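
Since the constructor asserts the zero_initialization/function pairing at run time, the valid combinations look as follows; a minimal sketch:

    // Accumulate updates into a zero-filled output: only Add permits this
    const ScatterInfo scatter_add(ScatterFunction::Add, true);
    // Plain overwrite semantics, no zero fill
    const ScatterInfo scatter_update(ScatterFunction::Update, false);
    // ScatterInfo(ScatterFunction::Max, true) would trip ARM_COMPUTE_ERROR_ON_MSG
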
diff --git a/arm_compute/graph.h b/arm_compute/graph.h
index 37c5220ce1..6bb6f148a7 100644
--- a/arm_compute/graph.h
+++ b/arm_compute/graph.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018-2019 ARM Limited.
+ * Copyright (c) 2018-2019 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
diff --git a/arm_compute/graph/backends/GLES/GCFunctionFactory.h b/arm_compute/graph/DataLayerVisitor.h
index 289a3cb0f8..11d9f1ddc9 100644
--- a/arm_compute/graph/backends/GLES/GCFunctionFactory.h
+++ b/arm_compute/graph/DataLayerVisitor.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018-2019 ARM Limited.
+ * Copyright (c) 2021, 2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -21,37 +21,41 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
-#ifndef ARM_COMPUTE_GRAPH_GCFUNCTIONFACTORY_H
-#define ARM_COMPUTE_GRAPH_GCFUNCTIONFACTORY_H
+#ifndef ACL_ARM_COMPUTE_GRAPH_DATALAYERVISITOR_H
+#define ACL_ARM_COMPUTE_GRAPH_DATALAYERVISITOR_H
-#include "arm_compute/runtime/IFunction.h"
-
-#include <memory>
+#include "arm_compute/graph/IGraphPrinter.h"
+#include "arm_compute/graph/INodeVisitor.h"
+#include "arm_compute/graph/Types.h"
namespace arm_compute
{
namespace graph
{
-// Forward declarations
-class INode;
-class GraphContext;
-
-namespace backends
-{
-/** Factory for generating GLES compute backend functions **/
-class GCFunctionFactory final
+/** Data layer visitor. */
+class DataLayerVisitor final : public DefaultNodeVisitor
{
public:
- /** Create a backend execution function depending on the node type
- *
- * @param[in] node Node to create the backend function for
- * @param[in] ctx Context to use
- *
- * @return Backend function
- */
- static std::unique_ptr<arm_compute::IFunction> create(INode *node, GraphContext &ctx);
+ using LayerData = std::map<std::string, std::string>;
+ /** Default Constructor **/
+ DataLayerVisitor() = default;
+
+ const LayerData &layer_data() const;
+
+ // Reveal Parent method
+ using DefaultNodeVisitor::visit;
+ // Inherited methods overridden
+ void visit(ConvolutionLayerNode &n) override;
+ void visit(DepthwiseConvolutionLayerNode &n) override;
+ void visit(FusedConvolutionBatchNormalizationNode &n) override;
+ void visit(FusedDepthwiseConvolutionBatchNormalizationNode &n) override;
+ void visit(OutputNode &n) override;
+
+ void default_visit(INode &n) override;
+
+private:
+ LayerData _layer_data{};
};
-} // namespace backends
} // namespace graph
} // namespace arm_compute
-#endif //ARM_COMPUTE_GRAPH_GCFUNCTIONFACTORY_H
+#endif // ACL_ARM_COMPUTE_GRAPH_DATALAYERVISITOR_H
diff --git a/arm_compute/graph/Edge.h b/arm_compute/graph/Edge.h
index e40914efd0..7f5075d885 100644
--- a/arm_compute/graph/Edge.h
+++ b/arm_compute/graph/Edge.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018-2019 ARM Limited.
+ * Copyright (c) 2018-2019 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -48,8 +48,18 @@ public:
* @param[in] consumer_idx Consumer node input index
* @param[in] tensor Tensor associated with the edge
*/
- Edge(EdgeID id, INode *producer, unsigned int producer_idx, INode *consumer, unsigned int consumer_idx, Tensor *tensor)
- : _id(id), _producer(producer), _consumer(consumer), _producer_idx(producer_idx), _consumer_idx(consumer_idx), _tensor(tensor)
+ Edge(EdgeID id,
+ INode *producer,
+ unsigned int producer_idx,
+ INode *consumer,
+ unsigned int consumer_idx,
+ Tensor *tensor)
+ : _id(id),
+ _producer(producer),
+ _consumer(consumer),
+ _producer_idx(producer_idx),
+ _consumer_idx(consumer_idx),
+ _tensor(tensor)
{
}
diff --git a/arm_compute/graph/Graph.h b/arm_compute/graph/Graph.h
index dce92c67f1..e6e173f5fa 100644
--- a/arm_compute/graph/Graph.h
+++ b/arm_compute/graph/Graph.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018-2020 ARM Limited.
+ * Copyright (c) 2018-2020,2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -30,7 +30,6 @@
#include "arm_compute/graph/Types.h"
#include "support/Mutex.h"
-#include "support/ToolchainSupport.h"
#include <map>
#include <memory>
@@ -80,7 +79,7 @@ public:
* @return ID of the node
*/
template <typename NT, typename... Ts>
- NodeID add_node(Ts &&... args);
+ NodeID add_node(Ts &&...args);
/** Remove the node with the given ID
*
* @param[in] nid ID of the node to remove
@@ -222,23 +221,23 @@ private:
TensorID create_tensor(const TensorDescriptor &desc = TensorDescriptor());
private:
- GraphID _id = GraphID(0); /**< Graph id */
- std::string _name = {}; /**< Graph name */
- std::vector<std::unique_ptr<INode>> _nodes = {}; /**< Graph nodes */
- std::vector<std::unique_ptr<Edge>> _edges = {}; /**< Graph edges */
- std::vector<std::unique_ptr<Tensor>> _tensors = {}; /**< Graph tensors */
+ GraphID _id = GraphID(0); /**< Graph id */
+ std::string _name = {}; /**< Graph name */
+ std::vector<std::unique_ptr<INode>> _nodes = {}; /**< Graph nodes */
+ std::vector<std::unique_ptr<Edge>> _edges = {}; /**< Graph edges */
+ std::vector<std::unique_ptr<Tensor>> _tensors = {}; /**< Graph tensors */
std::map<NodeType, std::vector<NodeID>> _tagged_nodes = {}; /**< Graph nodes map with the node type as key */
- arm_compute::Mutex _mtx = {}; /**< Mutex used for graph construction */
+ arm_compute::Mutex _mtx = {}; /**< Mutex used for graph construction */
};
template <typename NT, typename... Ts>
-inline NodeID Graph::add_node(Ts &&... args)
+inline NodeID Graph::add_node(Ts &&...args)
{
arm_compute::lock_guard<arm_compute::Mutex> lock(_mtx);
// Create node
NodeID nid = _nodes.size();
- auto node = support::cpp14::make_unique<NT>(std::forward<Ts>(args)...);
+ auto node = std::make_unique<NT>(std::forward<Ts>(args)...);
node->set_graph(this);
node->set_id(nid);
@@ -246,7 +245,7 @@ inline NodeID Graph::add_node(Ts &&... args)
_tagged_nodes[node->type()].push_back(nid);
// Associate a new tensor with each output
- for(auto &output : node->_outputs)
+ for (auto &output : node->_outputs)
{
output = create_tensor();
}
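
The hunk above drops support::cpp14::make_unique for std::make_unique while keeping the perfect forwarding, so add_node<NT>() passes its arguments straight to NT's constructor. A sketch, assuming ConstNode's single TensorDescriptor constructor:

#include "arm_compute/graph/Graph.h"
#include "arm_compute/graph/nodes/Nodes.h"

// Create a constant node in place; the descriptor is forwarded to ConstNode.
arm_compute::graph::NodeID add_const(arm_compute::graph::Graph                  &g,
                                     const arm_compute::graph::TensorDescriptor &desc)
{
    return g.add_node<arm_compute::graph::ConstNode>(desc);
}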
diff --git a/arm_compute/graph/GraphBuilder.h b/arm_compute/graph/GraphBuilder.h
index 612703cae5..118d06bdda 100644
--- a/arm_compute/graph/GraphBuilder.h
+++ b/arm_compute/graph/GraphBuilder.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018-2020 ARM Limited.
+ * Copyright (c) 2018-2021 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -51,7 +51,8 @@ public:
*
* @return Node ID of the created node, EmptyNodeID in case of error
*/
- static NodeID add_const_node(Graph &g, NodeParams params, const TensorDescriptor &desc, ITensorAccessorUPtr accessor = nullptr);
+ static NodeID
+ add_const_node(Graph &g, NodeParams params, const TensorDescriptor &desc, ITensorAccessorUPtr accessor = nullptr);
/** Adds an input layer node to the graph
*
* @param[in] g Graph to add the node to
@@ -61,7 +62,8 @@ public:
*
* @return Node ID of the created node, EmptyNodeID in case of error
*/
- static NodeID add_input_node(Graph &g, NodeParams params, const TensorDescriptor &desc, ITensorAccessorUPtr accessor = nullptr);
+ static NodeID
+ add_input_node(Graph &g, NodeParams params, const TensorDescriptor &desc, ITensorAccessorUPtr accessor = nullptr);
/** Adds an output layer node to the graph
*
* @param[in] g Graph to add the node to
@@ -71,7 +73,8 @@ public:
*
* @return Node ID of the created node, EmptyNodeID in case of error
*/
- static NodeID add_output_node(Graph &g, NodeParams params, NodeIdxPair input, ITensorAccessorUPtr accessor = nullptr);
+ static NodeID
+ add_output_node(Graph &g, NodeParams params, NodeIdxPair input, ITensorAccessorUPtr accessor = nullptr);
/** Adds an activation layer node to the graph
*
* @param[in] g Graph to add the node to
@@ -82,8 +85,30 @@ public:
*
* @return Node ID of the created node, EmptyNodeID in case of error
*/
- static NodeID add_activation_node(Graph &g, NodeParams params, NodeIdxPair input, ActivationLayerInfo act_info,
+ static NodeID add_activation_node(Graph &g,
+ NodeParams params,
+ NodeIdxPair input,
+ ActivationLayerInfo act_info,
const QuantizationInfo &out_quant_info = QuantizationInfo());
+    /** Adds an arg min max layer node to the graph
+     *
+     * @param[in] g Graph to add the node to
+     * @param[in] params Common node parameters
+     * @param[in] input Input to the arg min max layer node as a NodeID-Index pair
+ * @param[in] op Reduction Operation: min or max
+ * @param[in] axis Axis to perform reduction operation across
+ * @param[in] out_data_type (Optional) Output data type
+ * @param[in] out_quant_info (Optional) Output quantization info
+ *
+ * @return Node ID of the created node, EmptyNodeID in case of error
+ */
+ static NodeID add_arg_min_max_node(Graph &g,
+ NodeParams params,
+ NodeIdxPair input,
+ ReductionOperation op,
+ unsigned int axis,
+ DataType out_data_type = DataType::UNKNOWN,
+ const QuantizationInfo &out_quant_info = QuantizationInfo());
/** Adds a batch normalization layer node to the graph
*
* @param[in] g Graph to add the node to
@@ -97,9 +122,14 @@ public:
*
* @return Node ID of the created node, EmptyNodeID in case of error
*/
- static NodeID add_batch_normalization_node(Graph &g, NodeParams params, NodeIdxPair input, float epsilon,
- ITensorAccessorUPtr mean_accessor = nullptr, ITensorAccessorUPtr var_accessor = nullptr,
- ITensorAccessorUPtr beta_accessor = nullptr, ITensorAccessorUPtr gamma_accessor = nullptr);
+ static NodeID add_batch_normalization_node(Graph &g,
+ NodeParams params,
+ NodeIdxPair input,
+ float epsilon,
+ ITensorAccessorUPtr mean_accessor = nullptr,
+ ITensorAccessorUPtr var_accessor = nullptr,
+ ITensorAccessorUPtr beta_accessor = nullptr,
+ ITensorAccessorUPtr gamma_accessor = nullptr);
/** Adds a bounding box transform layer node to the graph
*
* @param[in] g Graph to add the node to
@@ -110,7 +140,8 @@ public:
*
* @return Node ID of the created node, EmptyNodeID in case of error
*/
- static NodeID add_bounding_box_transform_node(Graph &g, NodeParams params, NodeIdxPair input, NodeIdxPair deltas, BoundingBoxTransformInfo info);
+ static NodeID add_bounding_box_transform_node(
+ Graph &g, NodeParams params, NodeIdxPair input, NodeIdxPair deltas, BoundingBoxTransformInfo info);
/** Adds a channel shuffle layer node to the graph
*
* @param[in] g Graph to add the node to
@@ -123,8 +154,6 @@ public:
static NodeID add_channel_shuffle_node(Graph &g, NodeParams params, NodeIdxPair input, unsigned int num_groups);
/** Adds a convolution layer node to the graph
*
- * TODO (COMPMID-1113): Add a graph descriptor for convolution layer node
- *
* @param[in] g Graph to add the node to
* @param[in] params Common node parameters
* @param[in] input Input to the convolution layer node as a NodeID-Index pair
@@ -141,10 +170,17 @@ public:
*
* @return Node ID of the created node, EmptyNodeID in case of error
*/
- static NodeID add_convolution_node(Graph &g, NodeParams params, NodeIdxPair input,
- Size2D kernel_spatial_extend, unsigned int depth, PadStrideInfo conv_info, unsigned int num_groups = 1,
- ConvolutionMethod method = ConvolutionMethod::Default, FastMathHint fast_math_hint = FastMathHint::Disabled,
- ITensorAccessorUPtr weights_accessor = nullptr, ITensorAccessorUPtr bias_accessor = nullptr,
+ static NodeID add_convolution_node(Graph &g,
+ NodeParams params,
+ NodeIdxPair input,
+ Size2D kernel_spatial_extend,
+ unsigned int depth,
+ PadStrideInfo conv_info,
+ unsigned int num_groups = 1,
+ ConvolutionMethod method = ConvolutionMethod::Default,
+ FastMathHint fast_math_hint = FastMathHint::Disabled,
+ ITensorAccessorUPtr weights_accessor = nullptr,
+ ITensorAccessorUPtr bias_accessor = nullptr,
const QuantizationInfo &weights_quant_info = QuantizationInfo(),
const QuantizationInfo &out_quant_info = QuantizationInfo());
/** Adds a deconvolution layer node to the graph
@@ -160,9 +196,14 @@ public:
*
* @return Node ID of the created node, EmptyNodeID in case of error
*/
- static NodeID add_deconvolution_node(Graph &g, NodeParams params, NodeIdxPair input,
- Size2D kernel_spatial_extend, unsigned int depth, PadStrideInfo deconv_info,
- ITensorAccessorUPtr weights_accessor = nullptr, ITensorAccessorUPtr bias_accessor = nullptr);
+ static NodeID add_deconvolution_node(Graph &g,
+ NodeParams params,
+ NodeIdxPair input,
+ Size2D kernel_spatial_extend,
+ unsigned int depth,
+ PadStrideInfo deconv_info,
+ ITensorAccessorUPtr weights_accessor = nullptr,
+ ITensorAccessorUPtr bias_accessor = nullptr);
/** Adds a depth concatenate node to the graph
*
* @param[in] g Graph to add the node to
@@ -172,7 +213,20 @@ public:
*
* @return Node ID of the created node, EmptyNodeID in case of error
*/
- static NodeID add_concatenate_node(Graph &g, NodeParams params, const std::vector<NodeIdxPair> &inputs, const descriptors::ConcatLayerDescriptor &concat_descriptor);
+ static NodeID add_concatenate_node(Graph &g,
+ NodeParams params,
+ const std::vector<NodeIdxPair> &inputs,
+ const descriptors::ConcatLayerDescriptor &concat_descriptor);
+    /** Adds a depth to space layer node to the graph
+ *
+ * @param[in] g Graph to add the node to
+ * @param[in] params Common node parameters
+ * @param[in] input Input to the depth to space layer node as a NodeID-Index pair
+ * @param[in] block_shape Block shape to reshape tensor with
+ *
+ * @return Node ID of the created node, EmptyNodeID in case of error
+ */
+ static NodeID add_depth_to_space_node(Graph &g, NodeParams params, NodeIdxPair input, int32_t block_shape);
/** Adds a depth-wise convolution layer node to the graph
*
* @param[in] g Graph to add the node to
@@ -189,11 +243,18 @@ public:
*
* @return Node ID of the created node, EmptyNodeID in case of error
*/
- static NodeID add_depthwise_convolution_node(Graph &g, NodeParams params, NodeIdxPair input,
- Size2D kernel_spatial_extend, PadStrideInfo conv_info, int depth_multiplier = 1,
- DepthwiseConvolutionMethod method = DepthwiseConvolutionMethod::Default,
- ITensorAccessorUPtr weights_accessor = nullptr, ITensorAccessorUPtr bias_accessor = nullptr, const QuantizationInfo &quant_info = QuantizationInfo(),
- const QuantizationInfo &out_quant_info = QuantizationInfo());
+ static NodeID
+ add_depthwise_convolution_node(Graph &g,
+ NodeParams params,
+ NodeIdxPair input,
+ Size2D kernel_spatial_extend,
+ PadStrideInfo conv_info,
+ int depth_multiplier = 1,
+ DepthwiseConvolutionMethod method = DepthwiseConvolutionMethod::Default,
+ ITensorAccessorUPtr weights_accessor = nullptr,
+ ITensorAccessorUPtr bias_accessor = nullptr,
+ const QuantizationInfo &quant_info = QuantizationInfo(),
+ const QuantizationInfo &out_quant_info = QuantizationInfo());
/** Adds an element-wise layer node to the graph
*
* @param[in] g Graph to add the node to
@@ -204,7 +265,8 @@ public:
*
* @return Node ID of the created node, EmptyNodeID in case of error
*/
- static NodeID add_elementwise_node(Graph &g, NodeParams params, NodeIdxPair input0, NodeIdxPair input1, EltwiseOperation operation);
+ static NodeID add_elementwise_node(
+ Graph &g, NodeParams params, NodeIdxPair input0, NodeIdxPair input1, EltwiseOperation operation);
/** Adds a dequantization node to the graph
*
* @param[in] g Graph to add the node to
@@ -225,7 +287,12 @@ public:
*
* @return Node ID of the created node, EmptyNodeID in case of error
*/
- static NodeID add_detection_output_node(Graph &g, NodeParams params, NodeIdxPair input_loc, NodeIdxPair input_conf, NodeIdxPair input_priorbox, const DetectionOutputLayerInfo &detect_info);
+ static NodeID add_detection_output_node(Graph &g,
+ NodeParams params,
+ NodeIdxPair input_loc,
+ NodeIdxPair input_conf,
+ NodeIdxPair input_priorbox,
+ const DetectionOutputLayerInfo &detect_info);
/** Adds a detection post process layer node to the graph
*
* @param[in] g Graph to add the node to
@@ -238,8 +305,12 @@ public:
*
* @return Node ID of the created node, EmptyNodeID in case of error
*/
- static NodeID add_detection_post_process_node(Graph &g, NodeParams params, NodeIdxPair input_box_encoding, NodeIdxPair input_class_prediction,
- const DetectionPostProcessLayerInfo &detect_info, ITensorAccessorUPtr anchors_accessor = nullptr,
+ static NodeID add_detection_post_process_node(Graph &g,
+ NodeParams params,
+ NodeIdxPair input_box_encoding,
+ NodeIdxPair input_class_prediction,
+ const DetectionPostProcessLayerInfo &detect_info,
+ ITensorAccessorUPtr anchors_accessor = nullptr,
const QuantizationInfo &anchor_quant_info = QuantizationInfo());
/** Adds a Dummy node to the graph
*
@@ -272,13 +343,19 @@ public:
* @param[in] bias_nid (Optional) Node ID of the bias node data. Defaults to EmptyNodeID
* @param[in] fc_info (Optional) Fully connected layer metadata
* @param[in] out_quant_info (Optional) Output quantization info
+ * @param[in] fast_math_hint (Optional) Fast math hint
*
* @return Node ID of the created node, EmptyNodeID in case of error
*/
- static NodeID add_fully_connected_layer(Graph &g, NodeParams params, NodeIdxPair input, unsigned int num_outputs,
- NodeID weights_nid, NodeID bias_nid = EmptyNodeID,
+ static NodeID add_fully_connected_layer(Graph &g,
+ NodeParams params,
+ NodeIdxPair input,
+ unsigned int num_outputs,
+ NodeID weights_nid,
+ NodeID bias_nid = EmptyNodeID,
const FullyConnectedLayerInfo fc_info = FullyConnectedLayerInfo(),
- const QuantizationInfo &out_quant_info = QuantizationInfo());
+ const QuantizationInfo &out_quant_info = QuantizationInfo(),
+ FastMathHint fast_math_hint = FastMathHint::Disabled);
/** Adds a fully connected layer node to the graph
*
* @param[in] g Graph to add the layer to
@@ -290,14 +367,20 @@ public:
* @param[in] fc_info (Optional) Fully connected layer metadata
* @param[in] weights_quant_info (Optional) Weights quantization info
* @param[in] out_quant_info (Optional) Output quantization info
+ * @param[in] fast_math_hint (Optional) Fast math hint
*
* @return Node ID of the created node, EmptyNodeID in case of error
*/
- static NodeID add_fully_connected_layer(Graph &g, NodeParams params, NodeIdxPair input, unsigned int num_outputs,
- ITensorAccessorUPtr weights_accessor = nullptr, ITensorAccessorUPtr bias_accessor = nullptr,
- const FullyConnectedLayerInfo fc_info = FullyConnectedLayerInfo(),
+ static NodeID add_fully_connected_layer(Graph &g,
+ NodeParams params,
+ NodeIdxPair input,
+ unsigned int num_outputs,
+ ITensorAccessorUPtr weights_accessor = nullptr,
+ ITensorAccessorUPtr bias_accessor = nullptr,
+ const FullyConnectedLayerInfo fc_info = FullyConnectedLayerInfo(),
const QuantizationInfo &weights_quant_info = QuantizationInfo(),
- const QuantizationInfo &out_quant_info = QuantizationInfo());
+ const QuantizationInfo &out_quant_info = QuantizationInfo(),
+ FastMathHint fast_math_hint = FastMathHint::Disabled);
/** Adds a generate proposals layer node to the graph
*
* @param[in] g Graph to add the layer to
@@ -309,8 +392,23 @@ public:
*
* @return Node ID of the created node, EmptyNodeID in case of error
*/
- static NodeID add_generate_proposals_node(Graph &g, NodeParams params, NodeIdxPair scores, NodeIdxPair deltas,
- NodeIdxPair anchors, GenerateProposalsInfo info);
+ static NodeID add_generate_proposals_node(Graph &g,
+ NodeParams params,
+ NodeIdxPair scores,
+ NodeIdxPair deltas,
+ NodeIdxPair anchors,
+ GenerateProposalsInfo info);
+ /** Adds a L2 Normalize layer node to the graph
+ *
+ * @param[in] g Graph to add the node to
+ * @param[in] params Common node parameters
+ * @param[in] input Input to the normalization layer node as a NodeID-Index pair
+ * @param[in] axis Axis to perform normalization on
+ * @param[in] epsilon Lower bound value for the normalization
+ *
+ * @return Node ID of the created node, EmptyNodeID in case of error
+ */
+ static NodeID add_l2_normalize_node(Graph &g, NodeParams params, NodeIdxPair input, int axis, float epsilon);
/** Adds a normalization layer node to the graph
*
* @param[in] g Graph to add the node to
@@ -320,7 +418,8 @@ public:
*
* @return Node ID of the created node, EmptyNodeID in case of error
*/
- static NodeID add_normalization_node(Graph &g, NodeParams params, NodeIdxPair input, NormalizationLayerInfo norm_info);
+ static NodeID
+ add_normalization_node(Graph &g, NodeParams params, NodeIdxPair input, NormalizationLayerInfo norm_info);
/** Adds a normalize planar YUV layer node to the graph
*
* @param[in] g Graph to add the node to
@@ -331,8 +430,11 @@ public:
*
* @return Node ID of the created node, EmptyNodeID in case of error
*/
- static NodeID add_normalize_planar_yuv_node(Graph &g, NodeParams params, NodeIdxPair input,
- ITensorAccessorUPtr mean_accessor = nullptr, ITensorAccessorUPtr std_accessor = nullptr);
+ static NodeID add_normalize_planar_yuv_node(Graph &g,
+ NodeParams params,
+ NodeIdxPair input,
+ ITensorAccessorUPtr mean_accessor = nullptr,
+ ITensorAccessorUPtr std_accessor = nullptr);
/** Adds a pad layer node to the graph
*
* @param[in] g Graph to add the node to
@@ -344,7 +446,11 @@ public:
*
* @return Node ID of the created node, EmptyNodeID in case of error
*/
- static NodeID add_pad_node(Graph &g, NodeParams params, NodeIdxPair input, const PaddingList &paddings, PixelValue pad_value = PixelValue());
+ static NodeID add_pad_node(Graph &g,
+ NodeParams params,
+ NodeIdxPair input,
+ const PaddingList &paddings,
+ PixelValue pad_value = PixelValue());
/** Adds a permute layer node to the graph
*
* @param[in] g Graph to add the node to
@@ -356,7 +462,11 @@ public:
*
* @return Node ID of the created node, EmptyNodeID in case of error
*/
- static NodeID add_permute_node(Graph &g, NodeParams params, NodeIdxPair input, PermutationVector perm, DataLayout layout = DataLayout::UNKNOWN);
+ static NodeID add_permute_node(Graph &g,
+ NodeParams params,
+ NodeIdxPair input,
+ PermutationVector perm,
+ DataLayout layout = DataLayout::UNKNOWN);
/** Adds a pooling layer node to the graph
*
* @param[in] g Graph to add the node to
@@ -388,8 +498,12 @@ public:
*
* @return Node ID of the created node, EmptyNodeID in case of error
*/
- static NodeID add_print_node(Graph &g, NodeParams params, NodeIdxPair input, std::ostream &stream, const IOFormatInfo &format_info = IOFormatInfo(),
- const std::function<ITensor *(ITensor *)> transform = nullptr);
+ static NodeID add_print_node(Graph &g,
+ NodeParams params,
+ NodeIdxPair input,
+ std::ostream &stream,
+ const IOFormatInfo &format_info = IOFormatInfo(),
+ const std::function<ITensor *(ITensor *)> transform = nullptr);
/** Adds a priorbox layer node to the graph
*
* @param[in] g Graph to add the node to
@@ -400,7 +514,8 @@ public:
*
* @return Node ID of the created node, EmptyNodeID in case of error
*/
- static NodeID add_priorbox_node(Graph &g, NodeParams params, NodeIdxPair input0, NodeIdxPair input1, const PriorBoxLayerInfo &prior_info);
+ static NodeID add_priorbox_node(
+ Graph &g, NodeParams params, NodeIdxPair input0, NodeIdxPair input1, const PriorBoxLayerInfo &prior_info);
/** Adds a quantization layer node to the graph
*
* @param[in] g Graph to add the node to
@@ -410,7 +525,21 @@ public:
*
* @return Node ID of the created node, EmptyNodeID in case of error
*/
- static NodeID add_quantization_node(Graph &g, NodeParams params, NodeIdxPair input, const QuantizationInfo &out_quant_info);
+ static NodeID
+ add_quantization_node(Graph &g, NodeParams params, NodeIdxPair input, const QuantizationInfo &out_quant_info);
+    /** Adds a reduction operation layer node to the graph
+     *
+     * @param[in] g Graph to add the node to
+     * @param[in] params Common node parameters
+     * @param[in] input Input to the reduction operation layer node as a NodeID-Index pair
+ * @param[in] op Reduction operation
+ * @param[in] axis Reduction axis
+ * @param[in] keep_dims (Optional) Whether to keep the reduced dimension after the operation. Defaults to true.
+ *
+ * @return Node ID of the created node, EmptyNodeID in case of error
+ */
+ static NodeID add_reduction_operation_node(
+ Graph &g, NodeParams params, NodeIdxPair input, ReductionOperation op, int axis, bool keep_dims = true);
/** Adds a reorg layer node to the graph
*
* @param[in] g Graph to add the node to
@@ -442,7 +571,12 @@ public:
*
* @return Node ID of the created node, EmptyNodeID in case of error
*/
- static NodeID add_resize_node(Graph &g, NodeParams params, NodeIdxPair input, InterpolationPolicy policy, float width_scale, float height_scale);
+ static NodeID add_resize_node(Graph &g,
+ NodeParams params,
+ NodeIdxPair input,
+ InterpolationPolicy policy,
+ float width_scale,
+ float height_scale);
/** Adds a ROI align layer node to the graph
*
* @param[in] g Graph to add the node to
@@ -453,7 +587,8 @@ public:
*
* @return Node ID of the created node, EmptyNodeID in case of error
*/
- static NodeID add_roi_align_node(Graph &g, NodeParams params, NodeIdxPair input, NodeIdxPair rois, ROIPoolingLayerInfo pool_info);
+ static NodeID
+ add_roi_align_node(Graph &g, NodeParams params, NodeIdxPair input, NodeIdxPair rois, ROIPoolingLayerInfo pool_info);
/** Adds a scale layer node to the graph
* This layer computes a product of the input with a scale (read from mul_accessor) and applies an offset (read from add_accessor).
* output = input * mul_w + add_w
@@ -466,8 +601,11 @@ public:
*
* @return Node ID of the created node, EmptyNodeID in case of error
*/
- static NodeID add_scale_layer(Graph &g, const NodeParams &params, NodeIdxPair input,
- ITensorAccessorUPtr mul_accessor = nullptr, ITensorAccessorUPtr add_accessor = nullptr);
+ static NodeID add_scale_layer(Graph &g,
+ const NodeParams &params,
+ NodeIdxPair input,
+ ITensorAccessorUPtr mul_accessor = nullptr,
+ ITensorAccessorUPtr add_accessor = nullptr);
/** Adds a softmax node to the graph
*
* @param[in] g Graph to add the node to
@@ -488,7 +626,8 @@ public:
*
* @return Node ID of the created node, EmptyNodeID in case of error
*/
- static NodeID add_slice_node(Graph &g, NodeParams params, NodeIdxPair input, Coordinates &starts, Coordinates &ends);
+ static NodeID
+ add_slice_node(Graph &g, NodeParams params, NodeIdxPair input, Coordinates &starts, Coordinates &ends);
/** Adds a split node to the graph
*
* @param[in] g Graph to add the node to
@@ -499,7 +638,8 @@ public:
*
* @return Node ID of the created node, EmptyNodeID in case of error
*/
- static NodeID add_split_node(Graph &g, NodeParams params, NodeIdxPair input, unsigned int num_splits, unsigned int axis = 0);
+ static NodeID
+ add_split_node(Graph &g, NodeParams params, NodeIdxPair input, unsigned int num_splits, unsigned int axis = 0);
/** Adds a stack layer node to the graph
*
* @param[in] g Graph to add the node to
@@ -510,28 +650,35 @@ public:
* @return Node ID of the created node, EmptyNodeID in case of error
*/
static NodeID add_stack_node(Graph &g, NodeParams params, const std::vector<NodeIdxPair> &inputs, int axis);
- /** Adds an upsample layer to the graph
+ /** Adds a strided slice node to the graph
*
- * @param[in] g Graph to add the node to
- * @param[in] params Common node parameters
- * @param[in] input Input to the yolo layer node as a NodeID-Index pair
- * @param[in] info Upsample layer stride info
- * @param[in] upsampling_policy Upsampling policy used
+ * @param[in] g Graph to add the node to
+ * @param[in] params Common node parameters
+ * @param[in] input Input to the strided slice layer node as a NodeID-Index pair
+     * @param[in] starts The starts of the dimensions of the input tensor to be sliced. The length must equal rank(input).
+     * @param[in] ends The ends of the dimensions of the input tensor to be sliced. The length must equal rank(input).
+     * @param[in] strides The strides of the dimensions of the input tensor to be sliced. The length must equal rank(input).
+ * @param[in] info Contains masks for the starts, ends and strides
*
* @return Node ID of the created node, EmptyNodeID in case of error
*/
- static NodeID add_upsample_node(Graph &g, NodeParams params, NodeIdxPair input, Size2D info, InterpolationPolicy upsampling_policy);
+ static NodeID add_strided_slice_node(Graph &g,
+ NodeParams params,
+ NodeIdxPair input,
+ Coordinates &starts,
+ Coordinates &ends,
+ BiStrides &strides,
+ StridedSliceLayerInfo info);
/** Adds a yolo layer to the graph
*
- * @param[in] g Graph to add the node to
- * @param[in] params Common node parameters
- * @param[in] input Input to the yolo layer node as a NodeID-Index pair
- * @param[in] act_info Activation layer parameters
- * @param[in] num_classes Number of classes to activate
+ * @param[in] g Graph to add the node to
+ * @param[in] params Common node parameters
+ * @param[in] input Input to the yolo layer node as a NodeID-Index pair
+ * @param[in] act_info Activation layer parameters
*
* @return Node ID of the created node, EmptyNodeID in case of error
*/
- static NodeID add_yolo_node(Graph &g, NodeParams params, NodeIdxPair input, ActivationLayerInfo act_info, int32_t num_classes);
+ static NodeID add_yolo_node(Graph &g, NodeParams params, NodeIdxPair input, ActivationLayerInfo act_info);
};
} // namespace graph
} // namespace arm_compute
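
A sketch chaining the builder calls above into a tiny input -> activation -> output graph, using only signatures shown in this diff. The NodeParams{name, target} shape is assumed from graph/Types.h (see the Types.h hunk later in this patch):

#include "arm_compute/graph/Graph.h"
#include "arm_compute/graph/GraphBuilder.h"

using namespace arm_compute;
using namespace arm_compute::graph;

void build_tiny_graph(Graph &g)
{
    NodeParams       params{"act_graph", Target::UNSPECIFIED};
    TensorDescriptor desc(TensorShape(224U, 224U, 3U), DataType::F32);

    NodeID in  = GraphBuilder::add_input_node(g, params, desc);
    NodeID act = GraphBuilder::add_activation_node(
        g, params, {in, 0}, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU));
    GraphBuilder::add_output_node(g, params, {act, 0});
}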
diff --git a/arm_compute/graph/GraphContext.h b/arm_compute/graph/GraphContext.h
index 973264afba..68fbaf5478 100644
--- a/arm_compute/graph/GraphContext.h
+++ b/arm_compute/graph/GraphContext.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018-2019 ARM Limited.
+ * Copyright (c) 2018-2019 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -25,7 +25,6 @@
#define ARM_COMPUTE_GRAPH_GRAPH_CONTEXT_H
#include "arm_compute/graph/Types.h"
-
#include "arm_compute/runtime/IMemoryManager.h"
#include "arm_compute/runtime/IWeightsManager.h"
@@ -39,18 +38,18 @@ namespace graph
/** Contains structs required for memory management */
struct MemoryManagerContext
{
- Target target = { Target::UNSPECIFIED }; /**< Target */
- std::shared_ptr<arm_compute::IMemoryManager> intra_mm = { nullptr }; /**< Intra-function memory manager */
- std::shared_ptr<arm_compute::IMemoryManager> cross_mm = { nullptr }; /**< Cross-function memory manager */
- std::shared_ptr<arm_compute::IMemoryGroup> cross_group = { nullptr }; /**< Cross-function memory group */
- IAllocator *allocator = { nullptr }; /**< Backend allocator to use */
+ Target target = {Target::UNSPECIFIED}; /**< Target */
+ std::shared_ptr<arm_compute::IMemoryManager> intra_mm = {nullptr}; /**< Intra-function memory manager */
+ std::shared_ptr<arm_compute::IMemoryManager> cross_mm = {nullptr}; /**< Cross-function memory manager */
+ std::shared_ptr<arm_compute::IMemoryGroup> cross_group = {nullptr}; /**< Cross-function memory group */
+ IAllocator *allocator = {nullptr}; /**< Backend allocator to use */
};
/** Contains structs required for weights management */
struct WeightsManagerContext
{
- Target target = { Target::UNSPECIFIED }; /**< Target */
- std::shared_ptr<arm_compute::IWeightsManager> wm = { nullptr }; /**< Weights manager */
+ Target target = {Target::UNSPECIFIED}; /**< Target */
+ std::shared_ptr<arm_compute::IWeightsManager> wm = {nullptr}; /**< Weights manager */
};
/** Graph context **/
@@ -125,7 +124,7 @@ public:
void finalize();
private:
- GraphConfig _config; /**< Graph configuration */
+ GraphConfig _config; /**< Graph configuration */
std::map<Target, MemoryManagerContext> _memory_managers; /**< Memory managers for each target */
std::map<Target, WeightsManagerContext> _weights_managers; /**< Weights managers for each target */
};
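
A sketch of how a backend might populate the MemoryManagerContext struct above before handing it to the GraphContext (the registration call is backend-specific and not shown here):

#include <memory>

#include "arm_compute/graph/GraphContext.h"

arm_compute::graph::MemoryManagerContext
make_mm_ctx(std::shared_ptr<arm_compute::IMemoryManager> intra,
            std::shared_ptr<arm_compute::IMemoryManager> cross)
{
    arm_compute::graph::MemoryManagerContext ctx{};
    ctx.target   = arm_compute::graph::Target::CL; // assumed: filled in by the CL backend
    ctx.intra_mm = std::move(intra);
    ctx.cross_mm = std::move(cross);
    return ctx;
}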
diff --git a/arm_compute/graph/GraphManager.h b/arm_compute/graph/GraphManager.h
index 23fa73254b..ae48e818ec 100644
--- a/arm_compute/graph/GraphManager.h
+++ b/arm_compute/graph/GraphManager.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018-2019 ARM Limited.
+ * Copyright (c) 2018-2019 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
diff --git a/arm_compute/graph/IDeviceBackend.h b/arm_compute/graph/IDeviceBackend.h
index d40d752952..8ae92e3177 100644
--- a/arm_compute/graph/IDeviceBackend.h
+++ b/arm_compute/graph/IDeviceBackend.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018-2019 ARM Limited.
+ * Copyright (c) 2018-2019,2021 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -88,7 +88,8 @@ public:
*
* @return Backend sub-tensor handle
*/
- virtual std::unique_ptr<ITensorHandle> create_subtensor(ITensorHandle *parent, TensorShape shape, Coordinates coords, bool extend_parent) = 0;
+ virtual std::unique_ptr<ITensorHandle>
+ create_subtensor(ITensorHandle *parent, TensorShape shape, Coordinates coords, bool extend_parent) = 0;
/** Configure a backend Node
*
* @note This creates an appropriate configured backend function for the given node
@@ -118,6 +119,8 @@ public:
* @return Weights manager
*/
virtual std::shared_ptr<arm_compute::IWeightsManager> create_weights_manager() = 0;
+    /** Synchronize kernel execution on the backend. On GPU, this is a blocking call that waits for all kernels to complete. */
+ virtual void sync() = 0;
};
} // namespace backends
} // namespace graph
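
A hypothetical fragment of an OpenCL IDeviceBackend implementing the new pure-virtual sync(); on GPU it would typically block on the command queue via the existing CLScheduler:

#include "arm_compute/runtime/CL/CLScheduler.h"

struct ClBackendSyncFragment
{
    void sync()
    {
        arm_compute::CLScheduler::get().sync(); // blocking wait on the CL queue
    }
};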
diff --git a/arm_compute/graph/IGraphMutator.h b/arm_compute/graph/IGraphMutator.h
index 94a28f63b1..1c68b17c01 100644
--- a/arm_compute/graph/IGraphMutator.h
+++ b/arm_compute/graph/IGraphMutator.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018-2019 ARM Limited.
+ * Copyright (c) 2018-2019 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
diff --git a/arm_compute/graph/IGraphPrinter.h b/arm_compute/graph/IGraphPrinter.h
index 73f3fa8ba3..9ee11d514f 100644
--- a/arm_compute/graph/IGraphPrinter.h
+++ b/arm_compute/graph/IGraphPrinter.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018-2019 ARM Limited.
+ * Copyright (c) 2018-2019 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
diff --git a/arm_compute/graph/INode.h b/arm_compute/graph/INode.h
index 5536bb97ab..5646ea81d2 100644
--- a/arm_compute/graph/INode.h
+++ b/arm_compute/graph/INode.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018-2019 ARM Limited.
+ * Copyright (c) 2018-2019,2021,2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -21,14 +21,15 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
-#ifndef ARM_COMPUTE_GRAPH_INODE_H
-#define ARM_COMPUTE_GRAPH_INODE_H
+#ifndef ACL_ARM_COMPUTE_GRAPH_INODE_H
+#define ACL_ARM_COMPUTE_GRAPH_INODE_H
#include "arm_compute/core/Error.h"
#include "arm_compute/graph/LayerDescriptors.h"
#include "arm_compute/graph/TensorDescriptor.h"
#include "arm_compute/graph/Types.h"
+#include <list>
#include <set>
namespace arm_compute
@@ -255,4 +256,4 @@ protected:
};
} // namespace graph
} // namespace arm_compute
-#endif /* ARM_COMPUTE_GRAPH_INODE_H */
+#endif // ACL_ARM_COMPUTE_GRAPH_INODE_H
diff --git a/arm_compute/graph/INodeVisitor.h b/arm_compute/graph/INodeVisitor.h
index deaa8f1a51..efe191adfc 100644
--- a/arm_compute/graph/INodeVisitor.h
+++ b/arm_compute/graph/INodeVisitor.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018-2020 ARM Limited.
+ * Copyright (c) 2018-2021, 2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -21,8 +21,8 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
-#ifndef ARM_COMPUTE_GRAPH_INODEVISITOR_H
-#define ARM_COMPUTE_GRAPH_INODEVISITOR_H
+#ifndef ACL_ARM_COMPUTE_GRAPH_INODEVISITOR_H
+#define ACL_ARM_COMPUTE_GRAPH_INODEVISITOR_H
#include "arm_compute/graph/nodes/NodesFwd.h"
@@ -191,125 +191,41 @@ public:
#ifndef DOXYGEN_SKIP_THIS
// Inherited methods overridden
- virtual void visit(INode &) override
- {
- default_visit();
- }
- virtual void visit(ActivationLayerNode &) override
- {
- default_visit();
- }
- virtual void visit(BatchNormalizationLayerNode &) override
- {
- default_visit();
- }
- virtual void visit(ConcatenateLayerNode &) override
- {
- default_visit();
- }
- virtual void visit(ConstNode &) override
- {
- default_visit();
- }
- virtual void visit(ConvolutionLayerNode &) override
- {
- default_visit();
- }
- virtual void visit(DequantizationLayerNode &) override
- {
- default_visit();
- }
- virtual void visit(DetectionOutputLayerNode &) override
- {
- default_visit();
- }
- virtual void visit(DetectionPostProcessLayerNode &) override
- {
- default_visit();
- }
- virtual void visit(DepthwiseConvolutionLayerNode &) override
- {
- default_visit();
- }
- virtual void visit(EltwiseLayerNode &) override
- {
- default_visit();
- }
- virtual void visit(FlattenLayerNode &) override
- {
- default_visit();
- }
- virtual void visit(FullyConnectedLayerNode &) override
- {
- default_visit();
- }
- virtual void visit(FusedConvolutionBatchNormalizationNode &) override
- {
- default_visit();
- }
- virtual void visit(FusedDepthwiseConvolutionBatchNormalizationNode &) override
- {
- default_visit();
- }
- virtual void visit(InputNode &) override
- {
- default_visit();
- }
- virtual void visit(NormalizationLayerNode &) override
- {
- default_visit();
- }
- virtual void visit(OutputNode &) override
- {
- default_visit();
- }
- virtual void visit(PermuteLayerNode &) override
- {
- default_visit();
- }
- virtual void visit(PoolingLayerNode &) override
- {
- default_visit();
- }
- virtual void visit(PReluLayerNode &) override
- {
- default_visit();
- }
- virtual void visit(PrintLayerNode &) override
- {
- default_visit();
- }
- virtual void visit(PriorBoxLayerNode &) override
- {
- default_visit();
- }
- virtual void visit(QuantizationLayerNode &) override
- {
- default_visit();
- }
- virtual void visit(ReshapeLayerNode &) override
- {
- default_visit();
- }
- virtual void visit(SoftmaxLayerNode &) override
- {
- default_visit();
- }
- virtual void visit(SplitLayerNode &) override
- {
- default_visit();
- }
- virtual void visit(StackLayerNode &) override
- {
- default_visit();
- }
+ virtual void visit(INode &n) override;
+ virtual void visit(ActivationLayerNode &n) override;
+ virtual void visit(BatchNormalizationLayerNode &n) override;
+ virtual void visit(ConcatenateLayerNode &n) override;
+ virtual void visit(ConstNode &n) override;
+ virtual void visit(ConvolutionLayerNode &n) override;
+ virtual void visit(DequantizationLayerNode &n) override;
+ virtual void visit(DetectionOutputLayerNode &n) override;
+ virtual void visit(DetectionPostProcessLayerNode &n) override;
+ virtual void visit(DepthwiseConvolutionLayerNode &n) override;
+ virtual void visit(EltwiseLayerNode &n) override;
+ virtual void visit(FlattenLayerNode &n) override;
+ virtual void visit(FullyConnectedLayerNode &n) override;
+ virtual void visit(FusedConvolutionBatchNormalizationNode &n) override;
+ virtual void visit(FusedDepthwiseConvolutionBatchNormalizationNode &n) override;
+ virtual void visit(InputNode &n) override;
+ virtual void visit(NormalizationLayerNode &n) override;
+ virtual void visit(OutputNode &n) override;
+ virtual void visit(PermuteLayerNode &n) override;
+ virtual void visit(PoolingLayerNode &n) override;
+ virtual void visit(PReluLayerNode &n) override;
+ virtual void visit(PrintLayerNode &n) override;
+ virtual void visit(PriorBoxLayerNode &n) override;
+ virtual void visit(QuantizationLayerNode &n) override;
+ virtual void visit(ReshapeLayerNode &n) override;
+ virtual void visit(SoftmaxLayerNode &n) override;
+ virtual void visit(SplitLayerNode &n) override;
+ virtual void visit(StackLayerNode &n) override;
#endif /* DOXYGEN_SKIP_THIS */
/** Function to be overridden by the client to implement default behavior for
* visitors without a dedicated overload
*/
- virtual void default_visit() = 0;
+ virtual void default_visit(INode &n) = 0;
};
} // namespace graph
} // namespace arm_compute
-#endif /* ARM_COMPUTE_GRAPH_INODEVISITOR_H */
+#endif // ACL_ARM_COMPUTE_GRAPH_INODEVISITOR_H
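
With the signature change above, default_visit() now receives the node itself, so fallback handlers can inspect it. A sketch (CountingVisitor is hypothetical):

#include "arm_compute/graph/INodeVisitor.h"

class CountingVisitor final : public arm_compute::graph::DefaultNodeVisitor
{
public:
    void default_visit(arm_compute::graph::INode &n) override
    {
        (void)n;      // every node without a dedicated visit() lands here
        ++_unhandled; // e.g. count nodes this visitor does not handle
    }

private:
    unsigned int _unhandled{0};
};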
diff --git a/arm_compute/graph/ITensorAccessor.h b/arm_compute/graph/ITensorAccessor.h
index f7add3742a..a8818be2e5 100644
--- a/arm_compute/graph/ITensorAccessor.h
+++ b/arm_compute/graph/ITensorAccessor.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018-2019 ARM Limited.
+ * Copyright (c) 2018-2019,2021 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -45,6 +45,14 @@ public:
* @return True if access is successful else false
*/
virtual bool access_tensor(ITensor &tensor) = 0;
+ /** Returns true if the tensor data is being accessed
+ *
+ * @return True if the tensor data is being accessed by the accessor. False otherwise
+ */
+ virtual bool access_tensor_data()
+ {
+ return true;
+ }
};
using ITensorAccessorUPtr = std::unique_ptr<ITensorAccessor>;
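
The new access_tensor_data() hook defaults to true; an accessor that never maps the buffer can override it to let callers skip the mapping. A sketch (ShapeOnlyAccessor is hypothetical):

#include "arm_compute/core/ITensor.h"
#include "arm_compute/graph/ITensorAccessor.h"

class ShapeOnlyAccessor final : public arm_compute::graph::ITensorAccessor
{
public:
    bool access_tensor(arm_compute::ITensor &tensor) override
    {
        return tensor.info() != nullptr; // metadata-only check
    }
    bool access_tensor_data() override
    {
        return false; // no tensor-data access required
    }
};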
diff --git a/arm_compute/graph/ITensorHandle.h b/arm_compute/graph/ITensorHandle.h
index 1d78e78df5..0908d64818 100644
--- a/arm_compute/graph/ITensorHandle.h
+++ b/arm_compute/graph/ITensorHandle.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018-2019 ARM Limited.
+ * Copyright (c) 2018-2019 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
diff --git a/arm_compute/graph/LayerDescriptors.h b/arm_compute/graph/LayerDescriptors.h
index d8e6a6a87b..d632ed9e78 100644
--- a/arm_compute/graph/LayerDescriptors.h
+++ b/arm_compute/graph/LayerDescriptors.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019-2020 ARM Limited.
+ * Copyright (c) 2019-2020 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -37,8 +37,7 @@ namespace descriptors
struct ConcatLayerDescriptor
{
/** Default constructor */
- ConcatLayerDescriptor()
- : axis(DataLayoutDimension::CHANNEL), output_qinfo()
+ ConcatLayerDescriptor() : axis(DataLayoutDimension::CHANNEL), output_qinfo()
{
}
@@ -46,8 +45,7 @@ struct ConcatLayerDescriptor
*
* @param[in] axis Axis.
*/
- ConcatLayerDescriptor(DataLayoutDimension axis)
- : axis(axis), output_qinfo()
+ ConcatLayerDescriptor(DataLayoutDimension axis) : axis(axis), output_qinfo()
{
}
@@ -76,9 +74,16 @@ struct EltwiseLayerDescriptor
* @param[in] r_policy (Optional) Rounding policy used for the operation. Defaults to @ref RoundingPolicy::TO_ZERO
* @param[in] fused_activation (Optional) Fused activation information. Defaults to empty (identity) @ref ActivationLayerInfo
*/
- EltwiseLayerDescriptor(EltwiseOperation op, QuantizationInfo out_quant_info = QuantizationInfo(), ConvertPolicy c_policy = ConvertPolicy::SATURATE, RoundingPolicy r_policy = RoundingPolicy::TO_ZERO,
+ EltwiseLayerDescriptor(EltwiseOperation op,
+ QuantizationInfo out_quant_info = QuantizationInfo(),
+ ConvertPolicy c_policy = ConvertPolicy::SATURATE,
+ RoundingPolicy r_policy = RoundingPolicy::TO_ZERO,
ActivationLayerInfo fused_activation = ActivationLayerInfo())
- : op(op), out_quant_info(out_quant_info), c_policy(c_policy), r_policy(r_policy), fused_activation(fused_activation)
+ : op(op),
+ out_quant_info(out_quant_info),
+ c_policy(c_policy),
+ r_policy(r_policy),
+ fused_activation(fused_activation)
{
}
@@ -89,6 +94,37 @@ struct EltwiseLayerDescriptor
ActivationLayerInfo fused_activation; /**< Fused activation info */
};
+/** Unary Elementwise layer descriptor */
+struct UnaryEltwiseLayerDescriptor
+{
+ /** Constructor
+ *
+ * @param[in] op Unary element-wise operation to perform
+ * @param[in] out_quant_info (Optional) Output quantization information. Defaults to empty @ref QuantizationInfo
+ * @param[in] c_policy (Optional) Convert policy used for the operation. Defaults to @ref ConvertPolicy::SATURATE
+ * @param[in] r_policy (Optional) Rounding policy used for the operation. Defaults to @ref RoundingPolicy::TO_ZERO
+ * @param[in] fused_activation (Optional) Fused activation information. Defaults to empty (identity) @ref ActivationLayerInfo
+ */
+ UnaryEltwiseLayerDescriptor(UnaryEltwiseOperation op,
+ QuantizationInfo out_quant_info = QuantizationInfo(),
+ ConvertPolicy c_policy = ConvertPolicy::SATURATE,
+ RoundingPolicy r_policy = RoundingPolicy::TO_ZERO,
+ ActivationLayerInfo fused_activation = ActivationLayerInfo())
+ : op(op),
+ out_quant_info(out_quant_info),
+ c_policy(c_policy),
+ r_policy(r_policy),
+ fused_activation(fused_activation)
+ {
+ }
+
+ UnaryEltwiseOperation op; /**< Unary element-wise operation to perform */
+ QuantizationInfo out_quant_info; /**< Output quantization information */
+ ConvertPolicy c_policy; /**< Convert policy */
+ RoundingPolicy r_policy; /**< Rounding policy */
+ ActivationLayerInfo fused_activation; /**< Fused activation info */
+};
+
/** Deconvolution layer descriptor */
struct DeconvolutionLayerDescriptor
{
@@ -105,7 +141,7 @@ struct DeconvolutionLayerDescriptor
PadStrideInfo info; /**< Padding and stride information */
QuantizationInfo out_quant_info; /**< Output quantization information */
};
-} // namespace descriptor
+} // namespace descriptors
} // namespace graph
} // namespace arm_compute
-#endif /* ARM_COMPUTE_LAYER_DESCRIPTORS_H */ \ No newline at end of file
+#endif /* ARM_COMPUTE_LAYER_DESCRIPTORS_H */
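
A sketch constructing the new descriptor: every parameter after the operation has a default, so an Exp descriptor needs a single argument:

#include "arm_compute/graph/LayerDescriptors.h"

arm_compute::graph::descriptors::UnaryEltwiseLayerDescriptor make_exp_descriptor()
{
    using arm_compute::graph::UnaryEltwiseOperation;
    // Defaults: empty QuantizationInfo, SATURATE, TO_ZERO, identity activation.
    return arm_compute::graph::descriptors::UnaryEltwiseLayerDescriptor(UnaryEltwiseOperation::Exp);
}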
diff --git a/arm_compute/graph/Logger.h b/arm_compute/graph/Logger.h
index 8aa87f0927..e83d5f4ddc 100644
--- a/arm_compute/graph/Logger.h
+++ b/arm_compute/graph/Logger.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018-2019 ARM Limited.
+ * Copyright (c) 2018-2019 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -31,14 +31,14 @@
*
* @note It will create all default loggers if they don't exist
*/
-#define ARM_COMPUTE_CREATE_DEFAULT_GRAPH_LOGGER() \
- do \
- { \
- if(arm_compute::logging::LoggerRegistry::get().logger("GRAPH") == nullptr) \
- { \
- arm_compute::logging::LoggerRegistry::get().create_reserved_loggers(); \
- } \
- } while(false)
+#define ARM_COMPUTE_CREATE_DEFAULT_GRAPH_LOGGER() \
+ do \
+ { \
+ if (arm_compute::logging::LoggerRegistry::get().logger("GRAPH") == nullptr) \
+ { \
+ arm_compute::logging::LoggerRegistry::get().create_reserved_loggers(); \
+ } \
+ } while (false)
#else /* ARM_COMPUTE_LOGGING_ENABLED */
#define ARM_COMPUTE_CREATE_DEFAULT_GRAPH_LOGGER()
#endif /* ARM_COMPUTE_LOGGING_ENABLED */
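
A usage sketch: the macro expands to nothing unless ARM_COMPUTE_LOGGING_ENABLED is defined, so call sites need no #ifdef guards of their own:

#include "arm_compute/graph/Logger.h"

void ensure_graph_logger()
{
    ARM_COMPUTE_CREATE_DEFAULT_GRAPH_LOGGER();
    // ... code that may log through the reserved "GRAPH" logger ...
}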
diff --git a/arm_compute/graph/PassManager.h b/arm_compute/graph/PassManager.h
index c8920ba296..efdf6ab98f 100644
--- a/arm_compute/graph/PassManager.h
+++ b/arm_compute/graph/PassManager.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018-2019 ARM Limited.
+ * Copyright (c) 2018-2019 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
diff --git a/arm_compute/graph/Tensor.h b/arm_compute/graph/Tensor.h
index 42d5d10735..0ffae28ecc 100644
--- a/arm_compute/graph/Tensor.h
+++ b/arm_compute/graph/Tensor.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018-2019 ARM Limited.
+ * Copyright (c) 2018-2019 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -24,11 +24,10 @@
#ifndef ARM_COMPUTE_GRAPH_TENSOR_H
#define ARM_COMPUTE_GRAPH_TENSOR_H
-#include "arm_compute/graph/Types.h"
-
#include "arm_compute/graph/ITensorAccessor.h"
#include "arm_compute/graph/ITensorHandle.h"
#include "arm_compute/graph/TensorDescriptor.h"
+#include "arm_compute/graph/Types.h"
#include <memory>
#include <set>
diff --git a/arm_compute/graph/TensorDescriptor.h b/arm_compute/graph/TensorDescriptor.h
index 0cd892b935..46a6ab2c27 100644
--- a/arm_compute/graph/TensorDescriptor.h
+++ b/arm_compute/graph/TensorDescriptor.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018-2020 ARM Limited.
+ * Copyright (c) 2018-2020 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -26,8 +26,7 @@
#include "arm_compute/graph/Types.h"
-#include "arm_compute/core/utils/misc/ICloneable.h"
-#include "support/MemorySupport.h"
+#include "support/ICloneable.h"
#include <memory>
@@ -53,7 +52,11 @@ struct TensorDescriptor final : public misc::ICloneable<TensorDescriptor>
QuantizationInfo tensor_quant_info = QuantizationInfo(),
DataLayout tensor_data_layout = DataLayout::NCHW,
Target tensor_target = Target::UNSPECIFIED)
- : shape(tensor_shape), data_type(tensor_data_type), layout(tensor_data_layout), quant_info(tensor_quant_info), target(tensor_target)
+ : shape(tensor_shape),
+ data_type(tensor_data_type),
+ layout(tensor_data_layout),
+ quant_info(tensor_quant_info),
+ target(tensor_target)
{
}
/** Sets tensor descriptor shape
@@ -104,14 +107,14 @@ struct TensorDescriptor final : public misc::ICloneable<TensorDescriptor>
// Inherited methods overridden:
std::unique_ptr<TensorDescriptor> clone() const override
{
- return support::cpp14::make_unique<TensorDescriptor>(*this);
+ return std::make_unique<TensorDescriptor>(*this);
}
- TensorShape shape{}; /**< Tensor shape */
- DataType data_type{ DataType::UNKNOWN }; /**< Data type */
- DataLayout layout{ DataLayout::NCHW }; /**< Data layout */
- QuantizationInfo quant_info{}; /**< Quantization info */
- Target target{ Target::UNSPECIFIED }; /**< Target */
+ TensorShape shape{}; /**< Tensor shape */
+ DataType data_type{DataType::UNKNOWN}; /**< Data type */
+ DataLayout layout{DataLayout::NCHW}; /**< Data layout */
+ QuantizationInfo quant_info{}; /**< Quantization info */
+ Target target{Target::UNSPECIFIED}; /**< Target */
};
} // namespace graph
} // namespace arm_compute
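
A value-type sketch of the descriptor above, built with the defaulted constructor arguments; clone() now deep-copies via std::make_unique:

#include "arm_compute/graph/TensorDescriptor.h"

void descriptor_example()
{
    arm_compute::graph::TensorDescriptor desc(arm_compute::TensorShape(8U, 8U),
                                              arm_compute::DataType::F32);
    auto copy   = desc.clone();                     // std::unique_ptr<TensorDescriptor>
    copy->shape = arm_compute::TensorShape(16U, 16U); // members are public, per the struct above
}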
diff --git a/arm_compute/graph/TypeLoader.h b/arm_compute/graph/TypeLoader.h
index e1b920c24d..286bfebeb5 100644
--- a/arm_compute/graph/TypeLoader.h
+++ b/arm_compute/graph/TypeLoader.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018-2019 ARM Limited.
+ * Copyright (c) 2018-2020 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -30,29 +30,6 @@
namespace arm_compute
{
-/** Converts a string to a strong types enumeration @ref DataType
- *
- * @param[in] name String to convert
- *
- * @return Converted DataType enumeration
- */
-arm_compute::DataType data_type_from_name(const std::string &name);
-
-/** Input Stream operator for @ref DataType
- *
- * @param[in] stream Stream to parse
- * @param[out] data_type Output data type
- *
- * @return Updated stream
- */
-inline ::std::istream &operator>>(::std::istream &stream, arm_compute::DataType &data_type)
-{
- std::string value;
- stream >> value;
- data_type = data_type_from_name(value);
- return stream;
-}
-
/** Converts a string to a strong types enumeration @ref DataLayout
*
* @param[in] name String to convert
diff --git a/arm_compute/graph/TypePrinter.h b/arm_compute/graph/TypePrinter.h
index d56407b52b..5e83820ab3 100644
--- a/arm_compute/graph/TypePrinter.h
+++ b/arm_compute/graph/TypePrinter.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018-2020 ARM Limited.
+ * Copyright (c) 2018-2021, 2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -21,8 +21,8 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
-#ifndef ARM_COMPUTE_GRAPH_TYPE_PRINTER_H
-#define ARM_COMPUTE_GRAPH_TYPE_PRINTER_H
+#ifndef ACL_ARM_COMPUTE_GRAPH_TYPEPRINTER_H
+#define ACL_ARM_COMPUTE_GRAPH_TYPEPRINTER_H
#include "arm_compute/core/Error.h"
#include "arm_compute/core/Types.h"
@@ -37,19 +37,19 @@ namespace graph
/** Formatted output of the Target. */
inline ::std::ostream &operator<<(::std::ostream &os, const Target &target)
{
- switch(target)
+ switch (target)
{
case Target::UNSPECIFIED:
os << "UNSPECIFIED";
break;
case Target::NEON:
- os << "NEON";
+ os << "Neon";
break;
case Target::CL:
os << "CL";
break;
- case Target::GC:
- os << "GC";
+ case Target::CLVK:
+ os << "CLVK";
break;
default:
ARM_COMPUTE_ERROR("NOT_SUPPORTED!");
@@ -60,11 +60,14 @@ inline ::std::ostream &operator<<(::std::ostream &os, const Target &target)
inline ::std::ostream &operator<<(::std::ostream &os, const NodeType &node_type)
{
- switch(node_type)
+ switch (node_type)
{
case NodeType::ActivationLayer:
os << "ActivationLayer";
break;
+ case NodeType::ArgMinMaxLayer:
+ os << "ArgMinMaxLayer";
+ break;
case NodeType::BatchNormalizationLayer:
os << "BatchNormalizationLayer";
break;
@@ -83,6 +86,9 @@ inline ::std::ostream &operator<<(::std::ostream &os, const NodeType &node_type)
case NodeType::DeconvolutionLayer:
os << "DeconvolutionLayer";
break;
+ case NodeType::DepthToSpaceLayer:
+ os << "DepthToSpaceLayer";
+ break;
case NodeType::DequantizationLayer:
os << "DequantizationLayer";
break;
@@ -98,6 +104,9 @@ inline ::std::ostream &operator<<(::std::ostream &os, const NodeType &node_type)
case NodeType::EltwiseLayer:
os << "EltwiseLayer";
break;
+ case NodeType::UnaryEltwiseLayer:
+ os << "UnaryEltwiseLayer";
+ break;
case NodeType::FlattenLayer:
os << "FlattenLayer";
break;
@@ -113,6 +122,9 @@ inline ::std::ostream &operator<<(::std::ostream &os, const NodeType &node_type)
case NodeType::GenerateProposalsLayer:
os << "GenerateProposalsLayer";
break;
+ case NodeType::L2NormalizeLayer:
+ os << "L2NormalizeLayer";
+ break;
case NodeType::NormalizationLayer:
os << "NormalizationLayer";
break;
@@ -140,6 +152,9 @@ inline ::std::ostream &operator<<(::std::ostream &os, const NodeType &node_type)
case NodeType::QuantizationLayer:
os << "QuantizationLayer";
break;
+ case NodeType::ReductionOperationLayer:
+ os << "ReductionOperationLayer";
+ break;
case NodeType::ReorgLayer:
os << "ReorgLayer";
break;
@@ -164,12 +179,12 @@ inline ::std::ostream &operator<<(::std::ostream &os, const NodeType &node_type)
case NodeType::StackLayer:
os << "StackLayer";
break;
+ case NodeType::StridedSliceLayer:
+ os << "StridedSliceLayer";
+ break;
case NodeType::UpsampleLayer:
os << "UpsampleLayer";
break;
- case NodeType::YOLOLayer:
- os << "YOLOLayer";
- break;
case NodeType::Input:
os << "Input";
break;
@@ -192,7 +207,7 @@ inline ::std::ostream &operator<<(::std::ostream &os, const NodeType &node_type)
/** Formatted output of the EltwiseOperation type. */
inline ::std::ostream &operator<<(::std::ostream &os, const EltwiseOperation &eltwise_op)
{
- switch(eltwise_op)
+ switch (eltwise_op)
{
case EltwiseOperation::Add:
os << "Add";
@@ -203,6 +218,9 @@ inline ::std::ostream &operator<<(::std::ostream &os, const EltwiseOperation &el
case EltwiseOperation::Sub:
os << "Sub";
break;
+ case EltwiseOperation::Div:
+ os << "Div";
+ break;
default:
ARM_COMPUTE_ERROR("NOT_SUPPORTED!");
}
@@ -213,7 +231,7 @@ inline ::std::ostream &operator<<(::std::ostream &os, const EltwiseOperation &el
/** Formatted output of the ConvolutionMethod type. */
inline ::std::ostream &operator<<(::std::ostream &os, const ConvolutionMethod &method)
{
- switch(method)
+ switch (method)
{
case ConvolutionMethod::Default:
os << "Default";
@@ -237,7 +255,7 @@ inline ::std::ostream &operator<<(::std::ostream &os, const ConvolutionMethod &m
/** Formatted output of the FastMathHint type. */
inline ::std::ostream &operator<<(::std::ostream &os, const FastMathHint &hint)
{
- switch(hint)
+ switch (hint)
{
case FastMathHint::Enabled:
os << "Enabled";
@@ -255,7 +273,7 @@ inline ::std::ostream &operator<<(::std::ostream &os, const FastMathHint &hint)
/** Formatted output of the DepthwiseConvolutionMethod type. */
inline ::std::ostream &operator<<(::std::ostream &os, const DepthwiseConvolutionMethod &method)
{
- switch(method)
+ switch (method)
{
case DepthwiseConvolutionMethod::Default:
os << "DEFAULT";
@@ -271,4 +289,4 @@ inline ::std::ostream &operator<<(::std::ostream &os, const DepthwiseConvolution
}
} // namespace graph
} // namespace arm_compute
-#endif /* ARM_COMPUTE_GRAPH_TYPE_PRINTER_H */
+#endif // ACL_ARM_COMPUTE_GRAPH_TYPEPRINTER_H
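
A usage sketch: the stream operators above make the graph enums directly printable, which is how the library composes its log messages:

#include <sstream>
#include <string>

#include "arm_compute/graph/TypePrinter.h"

std::string target_name(arm_compute::graph::Target t)
{
    std::ostringstream ss;
    ss << t; // "UNSPECIFIED", "Neon", "CL" or "CLVK"
    return ss.str();
}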
diff --git a/arm_compute/graph/Types.h b/arm_compute/graph/Types.h
index 296f757c9b..5541e3cbcc 100644
--- a/arm_compute/graph/Types.h
+++ b/arm_compute/graph/Types.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018-2020 ARM Limited.
+ * Copyright (c) 2018-2021, 2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -21,13 +21,18 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
-#ifndef ARM_COMPUTE_GRAPH_TYPES_H
-#define ARM_COMPUTE_GRAPH_TYPES_H
+#ifndef ACL_ARM_COMPUTE_GRAPH_TYPES_H
+#define ACL_ARM_COMPUTE_GRAPH_TYPES_H
#include "arm_compute/core/Error.h"
#include "arm_compute/core/PixelValue.h"
#include "arm_compute/core/Types.h"
+#include "arm_compute/function_info/ActivationLayerInfo.h"
+#include "arm_compute/function_info/ConvolutionInfo.h"
+#include "arm_compute/function_info/FullyConnectedLayerInfo.h"
+#include "arm_compute/function_info/GEMMInfo.h"
#include "arm_compute/runtime/CL/CLTunerTypes.h"
+#include "arm_compute/runtime/CL/CLTypes.h"
#include <limits>
#include <string>
@@ -36,30 +41,31 @@ namespace arm_compute
{
namespace graph
{
+using arm_compute::CLBackendType;
using arm_compute::CLTunerMode;
using arm_compute::Status;
using arm_compute::Coordinates;
-using arm_compute::DataType;
using arm_compute::DataLayout;
using arm_compute::DataLayoutDimension;
-using arm_compute::TensorShape;
-using arm_compute::Size2D;
+using arm_compute::DataType;
using arm_compute::PermutationVector;
using arm_compute::PixelValue;
+using arm_compute::Size2D;
+using arm_compute::TensorShape;
using arm_compute::ActivationLayerInfo;
using arm_compute::DetectionOutputLayerInfo;
using arm_compute::DetectionPostProcessLayerInfo;
-using arm_compute::NormType;
-using arm_compute::NormalizationLayerInfo;
+using arm_compute::DimensionRoundingType;
using arm_compute::FullyConnectedLayerInfo;
+using arm_compute::InterpolationPolicy;
+using arm_compute::NormalizationLayerInfo;
+using arm_compute::NormType;
using arm_compute::PadStrideInfo;
using arm_compute::PoolingLayerInfo;
using arm_compute::PoolingType;
using arm_compute::PriorBoxLayerInfo;
-using arm_compute::DimensionRoundingType;
-using arm_compute::InterpolationPolicy;
using GraphID = unsigned int;
using TensorID = unsigned int;
@@ -76,26 +82,31 @@ constexpr EdgeID EmptyEdgeID = std::numeric_limits<EdgeID>::max();
// Forward declarations
struct TensorDescriptor;
+
/** Graph configuration structure */
struct GraphConfig
{
- bool use_function_memory_manager{ true }; /**< Use a memory manager to manage per-funcion auxilary memory */
- bool use_function_weights_manager{ true }; /**< Use a weights manager to manage transformed weights */
- bool use_transition_memory_manager{ true }; /**< Use a memory manager to manager transition buffer memory */
- bool use_tuner{ false }; /**< Use a tuner in tunable backends */
- bool convert_to_uint8{ false }; /**< Convert graph to a synthetic uint8 graph */
- CLTunerMode tuner_mode{ CLTunerMode::EXHAUSTIVE }; /**< Tuner mode to be used by the CL tuner */
- int num_threads{ -1 }; /**< Number of threads to use (thread capable backends), if 0 the backend will auto-initialize, if -1 the backend will stay as it is. */
- std::string tuner_file{ "acl_tuner.csv" }; /**< File to load/store tuning values from */
+    bool use_function_memory_manager{true};   /**< Use a memory manager to manage per-function auxiliary memory */
+ bool use_function_weights_manager{true}; /**< Use a weights manager to manage transformed weights */
+    bool use_transition_memory_manager{true}; /**< Use a memory manager to manage transition buffer memory */
+ bool use_tuner{false}; /**< Use a tuner in tunable backends */
+ bool use_synthetic_type{false}; /**< Convert graph to a synthetic graph for a data type */
+ DataType synthetic_type{DataType::QASYMM8}; /**< The data type of the synthetic graph */
+ CLTunerMode tuner_mode{CLTunerMode::EXHAUSTIVE}; /**< Tuner mode to be used by the CL tuner */
+ int num_threads{
+ -1}; /**< Number of threads to use (thread capable backends), if 0 the backend will auto-initialize, if -1 the backend will stay as it is. */
+ std::string tuner_file{"acl_tuner.csv"}; /**< File to load/store tuning values from */
+ std::string mlgo_file{"heuristics.mlgo"}; /**< Filename to load MLGO heuristics from */
+ CLBackendType backend_type{CLBackendType::Native}; /**< CL backend type to use */
};
/**< Device target types */
enum class Target
{
UNSPECIFIED, /**< Unspecified Target */
- NEON, /**< NEON capable target device */
+ NEON, /**< Arm® Neon™ capable target device */
CL, /**< OpenCL capable target device */
- GC, /**< GLES compute capable target device */
+ CLVK, /**< CLVK capable target device */
};
/** Supported Element-wise operations */
@@ -103,7 +114,16 @@ enum class EltwiseOperation
{
Add, /**< Arithmetic addition */
Sub, /**< Arithmetic subtraction */
- Mul /**< Arithmetic multiplication */
+ Mul, /**< Arithmetic multiplication */
+ Max, /**< Arithmetic maximum */
+ Div, /**< Arithmetic division */
+ Min, /**< Arithmetic minimum */
+};
+
+/** Supported Unary Element-wise operations */
+enum class UnaryEltwiseOperation
+{
+ Exp /**< Exp */
};
/** Supported Convolution layer methods */
@@ -134,12 +154,14 @@ enum class FastMathHint
enum class NodeType
{
ActivationLayer,
+ ArgMinMaxLayer,
BatchNormalizationLayer,
BoundingBoxTransformLayer,
ChannelShuffleLayer,
ConcatenateLayer,
ConvolutionLayer,
DeconvolutionLayer,
+ DepthToSpaceLayer,
DepthwiseConvolutionLayer,
DequantizationLayer,
DetectionOutputLayer,
@@ -150,6 +172,7 @@ enum class NodeType
FusedConvolutionBatchNormalizationLayer,
FusedDepthwiseConvolutionBatchNormalizationLayer,
GenerateProposalsLayer,
+ L2NormalizeLayer,
NormalizationLayer,
NormalizePlanarYUVLayer,
PadLayer,
@@ -159,6 +182,7 @@ enum class NodeType
PrintLayer,
PriorBoxLayer,
QuantizationLayer,
+ ReductionOperationLayer,
ReorgLayer,
ReshapeLayer,
ResizeLayer,
@@ -167,8 +191,9 @@ enum class NodeType
SliceLayer,
SplitLayer,
StackLayer,
+ StridedSliceLayer,
UpsampleLayer,
- YOLOLayer,
+ UnaryEltwiseLayer,
Input,
Output,
@@ -202,4 +227,4 @@ struct NodeParams
};
} // namespace graph
} // namespace arm_compute
-#endif /* ARM_COMPUTE_GRAPH_TYPES_H */
+#endif // ACL_ARM_COMPUTE_GRAPH_TYPES_H
diff --git a/arm_compute/graph/Utils.h b/arm_compute/graph/Utils.h
index cb421fc55a..9813ff05c7 100644
--- a/arm_compute/graph/Utils.h
+++ b/arm_compute/graph/Utils.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018-2020 ARM Limited.
+ * Copyright (c) 2018-2021 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -36,7 +36,7 @@ class GraphContext;
inline bool is_utility_node(INode *node)
{
- std::set<NodeType> utility_node_types = { NodeType::PrintLayer };
+ std::set<NodeType> utility_node_types = {NodeType::PrintLayer};
return utility_node_types.find(node->type()) != utility_node_types.end();
}
@@ -76,7 +76,7 @@ bool is_target_supported(Target target);
/** Returns default target for execution
*
* @note If an OpenCL backend exists then OpenCL is returned,
- * else if the NEON backend exists returns NEON as target.
+ * else if the CPU backend exists returns @ref Target::NEON as target.
* If no backends are registered an error is raised.
*
* @return Default target
@@ -107,6 +107,8 @@ void setup_requested_backend_context(GraphContext &ctx, Target target);
* @param[in,out] ctx Graph Context
*/
void release_default_graph_context(GraphContext &ctx);
+/** Synchronize kernel execution on the backends. On GPU, this results in a blocking call that waits for all kernels to complete. */
+void sync_backends();
/** Get size of a tensor's given dimension depending on its layout
*
* @param[in] descriptor Descriptor
@@ -130,6 +132,13 @@ size_t get_dimension_idx(DataLayout data_layout, const DataLayoutDimension data_
* @return A list with the driving node of a given node
*/
std::vector<NodeIdxPair> get_driving_nodes(const INode &node);
+/** Get the list of driver nodes of a given node
+ *
+ * @param[in] node Node to find the driver nodes of
+ *
+ * @return A list with the driver nodes of a given node
+ */
+std::vector<NodeIdxPair> get_driver_nodes(const INode &node);
/** Configures tensor
*
* @param[in, out] tensor Tensor to configure
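A hedged sketch of the two new Utils.h entry points (graph construction elided; `node` is any INode in the graph):

    // Block until every kernel queued on the backends has finished (on GPU: a blocking wait).
    arm_compute::graph::sync_backends();

    // Walk a node's producers; NodeIdxPair carries the node id and output index.
    for (const auto &p : arm_compute::graph::get_driver_nodes(node))
    {
        std::cout << "driver node: " << p.node_id << " output index: " << p.index << "\n";
    }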
diff --git a/arm_compute/graph/Workload.h b/arm_compute/graph/Workload.h
index a36c28a181..8ff0a548ae 100644
--- a/arm_compute/graph/Workload.h
+++ b/arm_compute/graph/Workload.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018-2020 ARM Limited.
+ * Copyright (c) 2018-2020 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -69,8 +69,7 @@ public:
*/
struct ExecutionTask
{
- ExecutionTask(std::unique_ptr<arm_compute::IFunction> &&f, INode *n)
- : task(std::move(f)), node(n)
+ ExecutionTask(std::unique_ptr<arm_compute::IFunction> &&f, INode *n) : task(std::move(f)), node(n)
{
}
/** Prevent instances of this class from being copied (As this class contains pointers) */
@@ -97,11 +96,11 @@ struct ExecutionTask
/** Execution workload */
struct ExecutionWorkload
{
- std::vector<Tensor *> inputs = {}; /**< Input handles */
- std::vector<Tensor *> outputs = {}; /**< Output handles */
- std::vector<ExecutionTask> tasks = {}; /**< Execution workload */
- Graph *graph = { nullptr }; /**< Graph bound to the workload */
- GraphContext *ctx = { nullptr }; /**< Graph execution context */
+ std::vector<Tensor *> inputs = {}; /**< Input handles */
+ std::vector<Tensor *> outputs = {}; /**< Output handles */
+ std::vector<ExecutionTask> tasks = {}; /**< Execution workload */
+ Graph *graph = {nullptr}; /**< Graph bound to the workload */
+ GraphContext *ctx = {nullptr}; /**< Graph execution context */
};
} // namespace graph
} // namespace arm_compute
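For reference, a minimal sketch of consuming an ExecutionWorkload, assuming ExecutionTask exposes the call operator that runs its wrapped IFunction:

    // Run every task that graph finalization produced for this workload.
    for (auto &task : workload.tasks)
    {
        task(); // executes the underlying arm_compute::IFunction
    }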
diff --git a/arm_compute/graph/algorithms/Algorithms.h b/arm_compute/graph/algorithms/Algorithms.h
index 89441c854f..2088d38fd7 100644
--- a/arm_compute/graph/algorithms/Algorithms.h
+++ b/arm_compute/graph/algorithms/Algorithms.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018-2019 ARM Limited.
+ * Copyright (c) 2018-2019 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
diff --git a/arm_compute/graph/algorithms/TopologicalSort.h b/arm_compute/graph/algorithms/TopologicalSort.h
index 486c0b5aca..25476e28f3 100644
--- a/arm_compute/graph/algorithms/TopologicalSort.h
+++ b/arm_compute/graph/algorithms/TopologicalSort.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018-2019 ARM Limited.
+ * Copyright (c) 2018-2019 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
diff --git a/arm_compute/graph/backends/BackendRegistrar.h b/arm_compute/graph/backends/BackendRegistrar.h
index 5d1582dc9d..2879361fef 100644
--- a/arm_compute/graph/backends/BackendRegistrar.h
+++ b/arm_compute/graph/backends/BackendRegistrar.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018-2019 ARM Limited.
+ * Copyright (c) 2018-2019 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -24,8 +24,8 @@
#ifndef ARM_COMPUTE_GRAPH_BACKEND_REGISTRAR_H
#define ARM_COMPUTE_GRAPH_BACKEND_REGISTRAR_H
-#include "arm_compute/graph/Types.h"
#include "arm_compute/graph/backends/BackendRegistry.h"
+#include "arm_compute/graph/Types.h"
#include <utility>
@@ -58,4 +58,4 @@ inline BackendRegistrar<T>::BackendRegistrar(Target target)
} // namespace backends
} // namespace graph
} // namespace arm_compute
-#endif /* ARM_COMPUTE_GRAPH_BACKEND_REGISTRAR_H */
\ No newline at end of file
+#endif /* ARM_COMPUTE_GRAPH_BACKEND_REGISTRAR_H */
diff --git a/arm_compute/graph/backends/BackendRegistry.h b/arm_compute/graph/backends/BackendRegistry.h
index c9e84bdcca..7c11d35faf 100644
--- a/arm_compute/graph/backends/BackendRegistry.h
+++ b/arm_compute/graph/backends/BackendRegistry.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018-2020 ARM Limited.
+ * Copyright (c) 2018-2020 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -26,7 +26,6 @@
#include "arm_compute/graph/IDeviceBackend.h"
#include "arm_compute/graph/Types.h"
-#include "support/MemorySupport.h"
#include <map>
#include <memory>
@@ -93,7 +92,7 @@ private:
template <typename T>
inline void BackendRegistry::add_backend(Target target)
{
- _registered_backends[target] = support::cpp14::make_unique<T>();
+ _registered_backends[target] = std::make_unique<T>();
}
} // namespace backends
} // namespace graph
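The hunk above only swaps the C++11 back-port helper for std::make_unique; registration itself is unchanged. A sketch, with MyBackend standing in for a concrete IDeviceBackend implementation and assuming the registry's static get() accessor:

    // Register a hypothetical backend for the OpenCL target.
    arm_compute::graph::backends::BackendRegistry::get().add_backend<MyBackend>(
        arm_compute::graph::Target::CL);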
diff --git a/arm_compute/graph/backends/CL/CLDeviceBackend.h b/arm_compute/graph/backends/CL/CLDeviceBackend.h
index 492dca0682..09e19d7688 100644
--- a/arm_compute/graph/backends/CL/CLDeviceBackend.h
+++ b/arm_compute/graph/backends/CL/CLDeviceBackend.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018-2019 ARM Limited.
+ * Copyright (c) 2018-2021 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -25,9 +25,10 @@
#define ARM_COMPUTE_GRAPH_CLDEVICEBACKEND_H
#include "arm_compute/graph/IDeviceBackend.h"
-
#include "arm_compute/runtime/CL/CLBufferAllocator.h"
+#include "arm_compute/runtime/CL/CLGEMMHeuristicsHandle.h"
#include "arm_compute/runtime/CL/CLTuner.h"
+#include "arm_compute/runtime/CL/CLTypes.h"
namespace arm_compute
{
@@ -57,23 +58,27 @@ public:
void set_kernel_tuning_mode(CLTunerMode tuning_mode);
// Inherited overridden methods
- void initialize_backend() override;
- void setup_backend_context(GraphContext &ctx) override;
- void release_backend_context(GraphContext &ctx) override;
+ void initialize_backend() override;
+ void setup_backend_context(GraphContext &ctx) override;
+ void release_backend_context(GraphContext &ctx) override;
bool is_backend_supported() override;
IAllocator *backend_allocator() override;
std::unique_ptr<ITensorHandle> create_tensor(const Tensor &tensor) override;
- std::unique_ptr<ITensorHandle> create_subtensor(ITensorHandle *parent, TensorShape shape, Coordinates coords, bool extend_parent) override;
- std::unique_ptr<arm_compute::IFunction> configure_node(INode &node, GraphContext &ctx) override;
- Status validate_node(INode &node) override;
- std::shared_ptr<arm_compute::IMemoryManager> create_memory_manager(MemoryManagerAffinity affinity) override;
+ std::unique_ptr<ITensorHandle>
+ create_subtensor(ITensorHandle *parent, TensorShape shape, Coordinates coords, bool extend_parent) override;
+ std::unique_ptr<arm_compute::IFunction> configure_node(INode &node, GraphContext &ctx) override;
+ Status validate_node(INode &node) override;
+ std::shared_ptr<arm_compute::IMemoryManager> create_memory_manager(MemoryManagerAffinity affinity) override;
std::shared_ptr<arm_compute::IWeightsManager> create_weights_manager() override;
+ void sync() override;
private:
int _context_count; /**< Counts how many contexts are currently using the backend */
CLTuner _tuner; /**< CL kernel tuner */
- std::unique_ptr<CLBufferAllocator> _allocator; /**< CL buffer affinity allocator */
- std::string _tuner_file; /**< Filename to load/store the tuner's values from */
+ CLGEMMHeuristicsHandle _gemm_heuristics; /**< GEMM heuristics */
+ std::unique_ptr<CLBufferAllocator> _allocator; /**< CL buffer affinity allocator */
+ std::string _tuner_file; /**< Filename to load/store the tuner's values from */
+ CLBackendType _backend_type; /**< OpenCL backend type to use */
};
} // namespace backends
} // namespace graph
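A sketch of driving the tuning API shown in this class; find_backend, set_kernel_tuning, and the downcast are assumptions about how a CLDeviceBackend instance is normally reached:

    using namespace arm_compute;
    auto *backend = graph::backends::BackendRegistry::get().find_backend(graph::Target::CL);
    if (auto *cl_backend = dynamic_cast<graph::backends::CLDeviceBackend *>(backend))
    {
        cl_backend->set_kernel_tuning(true);                     // enable the CL tuner (assumed setter)
        cl_backend->set_kernel_tuning_mode(CLTunerMode::NORMAL); // mode setter shown in the hunk above
    }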
diff --git a/arm_compute/graph/backends/CL/CLFunctionFactory.h b/arm_compute/graph/backends/CL/CLFunctionFactory.h
index 264612c8b7..e832f4547f 100644
--- a/arm_compute/graph/backends/CL/CLFunctionFactory.h
+++ b/arm_compute/graph/backends/CL/CLFunctionFactory.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018-2019 ARM Limited.
+ * Copyright (c) 2018-2019 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
diff --git a/arm_compute/graph/backends/CL/CLNodeValidator.h b/arm_compute/graph/backends/CL/CLNodeValidator.h
index 2f43cd4696..6e102a3b21 100644
--- a/arm_compute/graph/backends/CL/CLNodeValidator.h
+++ b/arm_compute/graph/backends/CL/CLNodeValidator.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018-2019 ARM Limited.
+ * Copyright (c) 2018-2019 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
diff --git a/arm_compute/graph/backends/CL/CLSubTensorHandle.h b/arm_compute/graph/backends/CL/CLSubTensorHandle.h
index 3379febd9c..85eebec639 100644
--- a/arm_compute/graph/backends/CL/CLSubTensorHandle.h
+++ b/arm_compute/graph/backends/CL/CLSubTensorHandle.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018-2019 ARM Limited.
+ * Copyright (c) 2018-2019 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -25,7 +25,6 @@
#define ARM_COMPUTE_GRAPH_CLSUBTENSORHANDLE_H
#include "arm_compute/graph/ITensorHandle.h"
-
#include "arm_compute/runtime/CL/CLSubTensor.h"
namespace arm_compute
@@ -45,7 +44,10 @@ public:
* @param[in] coords Starting coordinates
* @param[in] extend_parent Extends parent shape if true
*/
- CLSubTensorHandle(ITensorHandle *parent_handle, const TensorShape &shape, const Coordinates &coords, bool extend_parent = false);
+ CLSubTensorHandle(ITensorHandle *parent_handle,
+ const TensorShape &shape,
+ const Coordinates &coords,
+ bool extend_parent = false);
/** Destructor: free the tensor's memory */
~CLSubTensorHandle() = default;
/** Allow instances of this class to be move constructed */
@@ -58,10 +60,10 @@ public:
CLSubTensorHandle &operator=(const CLSubTensorHandle &) = delete;
// Inherited overridden methods
- void allocate() override;
- void free() override;
- void manage(IMemoryGroup *mg) override;
- void map(bool blocking) override;
+ void allocate() override;
+ void free() override;
+ void manage(IMemoryGroup *mg) override;
+ void map(bool blocking) override;
void unmap() override;
void release_if_unused() override;
arm_compute::ITensor &tensor() override;
diff --git a/arm_compute/graph/backends/CL/CLTensorHandle.h b/arm_compute/graph/backends/CL/CLTensorHandle.h
index 1452ef822c..57e9794ec3 100644
--- a/arm_compute/graph/backends/CL/CLTensorHandle.h
+++ b/arm_compute/graph/backends/CL/CLTensorHandle.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018-2019 ARM Limited.
+ * Copyright (c) 2018-2019 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -25,7 +25,6 @@
#define ARM_COMPUTE_GRAPH_CLTENSORHANDLE_H
#include "arm_compute/graph/ITensorHandle.h"
-
#include "arm_compute/runtime/CL/CLTensor.h"
namespace arm_compute
@@ -51,10 +50,10 @@ public:
CLTensorHandle &operator=(CLTensorHandle &&) = default;
// Inherited overridden methods
- void allocate() override;
- void free() override;
- void manage(IMemoryGroup *mg) override;
- void map(bool blocking) override;
+ void allocate() override;
+ void free() override;
+ void manage(IMemoryGroup *mg) override;
+ void map(bool blocking) override;
void unmap() override;
void release_if_unused() override;
arm_compute::ITensor &tensor() override;
diff --git a/arm_compute/graph/backends/FunctionHelpers.h b/arm_compute/graph/backends/FunctionHelpers.h
index 382b18a888..fd8b6b5a69 100644
--- a/arm_compute/graph/backends/FunctionHelpers.h
+++ b/arm_compute/graph/backends/FunctionHelpers.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018-2020 ARM Limited.
+ * Copyright (c) 2018-2021, 2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -21,23 +21,23 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
-#ifndef ARM_COMPUTE_GRAPH_BACKENDS_DETAIL_FUNCTION_HELPERS_H
-#define ARM_COMPUTE_GRAPH_BACKENDS_DETAIL_FUNCTION_HELPERS_H
+#ifndef ACL_ARM_COMPUTE_GRAPH_BACKENDS_FUNCTIONHELPERS_H
+#define ACL_ARM_COMPUTE_GRAPH_BACKENDS_FUNCTIONHELPERS_H
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/ITensorInfo.h"
+#include "arm_compute/graph/backends/FusedConvolutionBatchNormalizationFunction.h"
+#include "arm_compute/graph/backends/FusedDepthwiseConvolutionBatchNormalizationFunction.h"
+#include "arm_compute/graph/backends/Utils.h"
#include "arm_compute/graph/Logger.h"
+#include "arm_compute/graph/nodes/Nodes.h"
#include "arm_compute/graph/Tensor.h"
#include "arm_compute/graph/TypePrinter.h"
#include "arm_compute/graph/Types.h"
#include "arm_compute/graph/Utils.h"
-#include "arm_compute/graph/backends/FusedConvolutionBatchNormalizationFunction.h"
-#include "arm_compute/graph/backends/FusedDepthwiseConvolutionBatchNormalizationFunction.h"
-#include "arm_compute/graph/backends/Utils.h"
-#include "arm_compute/graph/nodes/Nodes.h"
-#include "arm_compute/core/Error.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/ITensorInfo.h"
-#include "arm_compute/core/utils/misc/Cast.h"
+#include "support/Cast.h"
namespace arm_compute
{
@@ -47,13 +47,6 @@ namespace backends
{
namespace detail
{
-// Address rule DR-9R5 (1579. Return by converting move constructor)
-#if defined(__clang__) || (defined(__GNUC__) && (__GNUC__ >= 5))
-#define RETURN_UNIQUE_PTR(x) (x)
-#else /* defined(__clang__) || (defined(__GNUC__) && (__GNUC__ >= 5)) */
-#define RETURN_UNIQUE_PTR(x) (std::move(x))
-#endif /* defined(__clang__) || (defined(__GNUC__) && (__GNUC__ >= 5)) */
-
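The RETURN_UNIQUE_PTR macro is obsolete: per CWG issue 1579, a returned local std::unique_ptr is treated as an rvalue even when a derived-to-base conversion is involved, so conforming compilers (GCC >= 5, Clang) need no explicit std::move. A self-contained illustration:

    #include <memory>

    struct Base    { virtual ~Base() = default; };
    struct Derived : Base {};

    std::unique_ptr<Base> make_derived()
    {
        auto p = std::make_unique<Derived>();
        return p; // implicit move + converting constructor; no std::move required
    }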
/** Returns backing tensor of a given tensor
*
* @tparam TargetInfo Target information
@@ -66,13 +59,16 @@ template <typename TargetInfo>
typename TargetInfo::TensorType *get_backing_tensor(arm_compute::graph::Tensor *tensor)
{
typename TargetInfo::TensorType *backing_tensor = nullptr;
- if(tensor != nullptr)
+ if (tensor != nullptr)
{
ARM_COMPUTE_ERROR_ON(tensor->desc().target != TargetInfo::TargetType);
// Get backing tensor handle
ITensorHandle *tensor_handle = tensor->handle();
// Get backing tensor
- backing_tensor = (tensor_handle != nullptr) ? arm_compute::utils::cast::polymorphic_cast<typename TargetInfo::TensorType *>(&tensor_handle->tensor()) : nullptr;
+ backing_tensor = (tensor_handle != nullptr)
+ ? arm_compute::utils::cast::polymorphic_cast<typename TargetInfo::TensorType *>(
+ &tensor_handle->tensor())
+ : nullptr;
}
return backing_tensor;
@@ -81,11 +77,8 @@ typename TargetInfo::TensorType *get_backing_tensor(arm_compute::graph::Tensor *
template <typename TargetInfo>
void validate_node(const INode &node, size_t num_expected_inputs, size_t num_expected_outputs)
{
- ARM_COMPUTE_LOG_GRAPH_VERBOSE("Creating " << node.type()
- << " Target: " << TargetInfo::TargetType
- << " ID: " << node.id()
- << node.name()
- << std::endl);
+ ARM_COMPUTE_LOG_GRAPH_VERBOSE("Creating " << node.type() << " Target: " << TargetInfo::TargetType
+ << " ID: " << node.id() << node.name() << std::endl);
ARM_COMPUTE_ERROR_ON(TargetInfo::TargetType != node.assigned_target());
ARM_COMPUTE_ERROR_ON(node.num_inputs() != num_expected_inputs);
@@ -113,22 +106,48 @@ std::unique_ptr<IFunction> create_activation_layer(ActivationLayerNode &node)
const ActivationLayerInfo act_info = node.activation_info();
// Create function
- auto func = support::cpp14::make_unique<ActivationLayerFunction>();
+ auto func = std::make_unique<ActivationLayerFunction>();
func->configure(input, output, act_info);
- ARM_COMPUTE_LOG_GRAPH_INFO("Instantiated "
- << node.name()
- << " Type: " << node.type()
- << " Target: " << TargetInfo::TargetType
- << " Data Type: " << input->info()->data_type()
- << " Shape: " << input->info()->tensor_shape()
- << " Activation function: " << act_info.activation()
- << " a: " << act_info.a()
- << " b: " << act_info.b()
- << " InPlace : " << is_in_place_operation(input, output)
- << std::endl);
-
- return RETURN_UNIQUE_PTR(func);
+ ARM_COMPUTE_LOG_GRAPH_INFO(
+ "Instantiated " << node.name() << " Type: " << node.type() << " Target: " << TargetInfo::TargetType
+ << " Data Type: " << input->info()->data_type() << " Shape: " << input->info()->tensor_shape()
+ << " Activation function: " << act_info.activation() << " a: " << act_info.a() << " b: "
+ << act_info.b() << " InPlace : " << is_in_place_operation(input, output) << std::endl);
+
+ return func;
+}
+
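For context, a hedged sketch of how a backend function factory dispatches into these detail::create_* helpers; CLTargetInfo is a backend-internal policy type and the exact names here are assumptions:

    std::unique_ptr<arm_compute::IFunction> create_for(arm_compute::graph::INode *node)
    {
        using namespace arm_compute::graph;
        switch (node->type())
        {
            case NodeType::ActivationLayer:
                return backends::detail::create_activation_layer<CLActivationLayer, CLTargetInfo>(
                    *arm_compute::utils::cast::polymorphic_downcast<ActivationLayerNode *>(node));
            default:
                return nullptr;
        }
    }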
+/** Create a backend argminmax layer function
+ *
+ * @tparam ArgMinMaxLayerFunction Backend argminmax function
+ * @tparam TargetInfo Target-specific information
+ *
+ * @param[in] node Node to create the backend function for
+ *
+ * @return Backend argminmax layer function
+ */
+template <typename ArgMinMaxLayerFunction, typename TargetInfo>
+std::unique_ptr<IFunction> create_arg_min_max_layer(ArgMinMaxLayerNode &node)
+{
+ validate_node<TargetInfo>(node, 1 /* expected inputs */, 1 /* expected outputs */);
+
+ // Extract IO and info
+ typename TargetInfo::TensorType *input = get_backing_tensor<TargetInfo>(node.input(0));
+ typename TargetInfo::TensorType *output = get_backing_tensor<TargetInfo>(node.output(0));
+ const ReductionOperation op = node.reduction_operation();
+ unsigned int axis = node.axis();
+
+ // Create function
+ auto func = std::make_unique<ArgMinMaxLayerFunction>();
+ func->configure(input, axis, output, op);
+
+ ARM_COMPUTE_LOG_GRAPH_INFO("Instantiated " << node.name() << " Type: " << node.type() << " Target: "
+ << TargetInfo::TargetType << " Data Type: " << input->info()->data_type()
+ << " Shape: " << input->info()->tensor_shape()
+ << " Reduction Operation: " << op << " axis: " << axis << std::endl);
+
+ return func;
}
/** Create a backend batch normalization layer function
@@ -157,22 +176,17 @@ std::unique_ptr<IFunction> create_batch_normalization_layer(BatchNormalizationLa
const ActivationLayerInfo fused_act = node.fused_activation();
// Create and configure function
- auto func = support::cpp14::make_unique<BatchNormalizationLayerFunction>();
+ auto func = std::make_unique<BatchNormalizationLayerFunction>();
func->configure(input, output, mean, var, beta, gamma, epsilon, fused_act);
// Log info
- ARM_COMPUTE_LOG_GRAPH_INFO("Instantiated "
- << node.name()
- << " Type: " << node.type()
- << " Target: " << TargetInfo::TargetType
- << " Data Type: " << input->info()->data_type()
- << " Shape: " << input->info()->tensor_shape()
- << " Epsilon: " << epsilon << " "
- << (fused_act.enabled() ? to_string(fused_act.activation()) : "")
- << " InPlace: " << is_in_place_operation(input, output)
- << std::endl);
+ ARM_COMPUTE_LOG_GRAPH_INFO("Instantiated " << node.name() << " Type: " << node.type() << " Target: "
+ << TargetInfo::TargetType << " Data Type: " << input->info()->data_type()
+ << " Shape: " << input->info()->tensor_shape() << " Epsilon: " << epsilon
+ << " " << (fused_act.enabled() ? to_string(fused_act.activation()) : "")
+ << " InPlace: " << is_in_place_operation(input, output) << std::endl);
- return RETURN_UNIQUE_PTR(func);
+ return func;
}
/** Create a backend batch normalization layer function
@@ -186,7 +200,8 @@ std::unique_ptr<IFunction> create_batch_normalization_layer(BatchNormalizationLa
* @return Backend batch normalization layer function
*/
template <typename FusedLayerTypes, typename TargetInfo>
-std::unique_ptr<IFunction> create_fused_convolution_batch_normalization_layer(FusedConvolutionBatchNormalizationNode &node, GraphContext &ctx)
+std::unique_ptr<IFunction>
+create_fused_convolution_batch_normalization_layer(FusedConvolutionBatchNormalizationNode &node, GraphContext &ctx)
{
validate_node<TargetInfo>(node, 7 /* expected inputs */, 1 /* expected outputs */);
@@ -216,20 +231,17 @@ std::unique_ptr<IFunction> create_fused_convolution_batch_normalization_layer(Fu
// Create and configure function
std::tie(func, func_name) = create_named_memory_managed_function<FType>(
- std::string("FusedConvolutionBatchNormalizationLayer"), mm, input, weights, biases, output, mean, var, beta, gamma, epsilon, conv_info, num_groups, fast_math, fused_act);
+ std::string("FusedConvolutionBatchNormalizationLayer"), mm, input, weights, biases, output, mean, var, beta,
+ gamma, epsilon, conv_info, num_groups, fast_math, fused_act);
// Log info
ARM_COMPUTE_LOG_GRAPH_INFO("Instantiated "
- << node.name()
- << " Type: " << node.type()
- << " Target: " << TargetInfo::TargetType
- << " Data Type: " << input->info()->data_type()
- << " Input shape: " << input->info()->tensor_shape()
- << " Weights shape: " << weights->info()->tensor_shape()
+ << node.name() << " Type: " << node.type() << " Target: " << TargetInfo::TargetType
+ << " Data Type: " << input->info()->data_type() << " Input shape: "
+ << input->info()->tensor_shape() << " Weights shape: " << weights->info()->tensor_shape()
<< " Output shape: " << output->info()->tensor_shape()
- << (fused_act.enabled() ? " " + to_string(fused_act.activation()) : "")
- << std::endl);
- return RETURN_UNIQUE_PTR(func);
+ << (fused_act.enabled() ? " " + to_string(fused_act.activation()) : "") << std::endl);
+ return func;
}
/** Create a backend fused depthwise convolution batch normalization layer function
@@ -243,7 +255,9 @@ std::unique_ptr<IFunction> create_fused_convolution_batch_normalization_layer(Fu
* @return Backend fused depthwise convolution batch normalization layer function
*/
template <typename FusedLayerTypes, typename TargetInfo>
-std::unique_ptr<IFunction> create_fused_depthwise_convolution_batch_normalization_layer(FusedDepthwiseConvolutionBatchNormalizationNode &node, GraphContext &ctx)
+std::unique_ptr<IFunction>
+create_fused_depthwise_convolution_batch_normalization_layer(FusedDepthwiseConvolutionBatchNormalizationNode &node,
+ GraphContext &ctx)
{
validate_node<TargetInfo>(node, 7 /* expected inputs */, 1 /* expected outputs */);
@@ -272,20 +286,17 @@ std::unique_ptr<IFunction> create_fused_depthwise_convolution_batch_normalizatio
// Create and configure function
std::tie(func, func_name) = create_named_memory_managed_function<FType>(
- std::string("FusedDepthwiseConvolutionBatchNormalizationLayer"), mm, input, weights, biases, output, mean, var, beta, gamma, epsilon, conv_info, depth_multiplier, fused_act);
+ std::string("FusedDepthwiseConvolutionBatchNormalizationLayer"), mm, input, weights, biases, output, mean, var,
+ beta, gamma, epsilon, conv_info, depth_multiplier, fused_act);
// Log info
ARM_COMPUTE_LOG_GRAPH_INFO("Instantiated "
- << node.name()
- << " Type: " << node.type()
- << " Target: " << TargetInfo::TargetType
- << " Data Type: " << input->info()->data_type()
- << " Input shape: " << input->info()->tensor_shape()
- << " Weights shape: " << weights->info()->tensor_shape()
+ << node.name() << " Type: " << node.type() << " Target: " << TargetInfo::TargetType
+ << " Data Type: " << input->info()->data_type() << " Input shape: "
+ << input->info()->tensor_shape() << " Weights shape: " << weights->info()->tensor_shape()
<< " Output shape: " << output->info()->tensor_shape()
- << (fused_act.enabled() ? " " + to_string(fused_act.activation()) : "")
- << std::endl);
- return RETURN_UNIQUE_PTR(func);
+ << (fused_act.enabled() ? " " + to_string(fused_act.activation()) : "") << std::endl);
+ return func;
}
/** Create a backend bounding box transform layer function
@@ -309,21 +320,17 @@ std::unique_ptr<IFunction> create_bounding_box_transform_layer(BoundingBoxTransf
const BoundingBoxTransformInfo bbox_info = node.info();
// Create and configure function
- auto func = support::cpp14::make_unique<BoundingBoxTransformLayerFunction>();
+ auto func = std::make_unique<BoundingBoxTransformLayerFunction>();
func->configure(input, output, deltas, bbox_info);
// Log info
- ARM_COMPUTE_LOG_GRAPH_INFO("Instantiated "
- << node.name()
- << " Type: " << node.type()
- << " Target: " << TargetInfo::TargetType
- << " Data Type: " << input->info()->data_type()
- << " Shape: " << input->info()->tensor_shape()
- << " BoundingBox Info img W: " << bbox_info.img_width() << " "
- << " BoundingBox Info img H: " << bbox_info.img_height() << " "
- << std::endl);
+ ARM_COMPUTE_LOG_GRAPH_INFO(
+ "Instantiated " << node.name() << " Type: " << node.type() << " Target: " << TargetInfo::TargetType
+ << " Data Type: " << input->info()->data_type() << " Shape: " << input->info()->tensor_shape()
+ << " BoundingBox Info img W: " << bbox_info.img_width() << " "
+ << " BoundingBox Info img H: " << bbox_info.img_height() << " " << std::endl);
- return RETURN_UNIQUE_PTR(func);
+ return std::move(func);
}
/** Create a backend channel shuffle layer function
@@ -346,19 +353,15 @@ std::unique_ptr<IFunction> create_channel_shuffle_layer(ChannelShuffleLayerNode
const unsigned int num_groups = node.num_groups();
// Create function
- auto func = support::cpp14::make_unique<ChannelShuffleLayerFunction>();
+ auto func = std::make_unique<ChannelShuffleLayerFunction>();
func->configure(input, output, num_groups);
- ARM_COMPUTE_LOG_GRAPH_INFO("Instantiated "
- << node.name()
- << " Type: " << node.type()
- << " Target: " << TargetInfo::TargetType
- << " Data Type: " << input->info()->data_type()
- << " Shape: " << input->info()->tensor_shape()
- << " Num groups: " << num_groups
- << std::endl);
+ ARM_COMPUTE_LOG_GRAPH_INFO("Instantiated " << node.name() << " Type: " << node.type() << " Target: "
+ << TargetInfo::TargetType << " Data Type: " << input->info()->data_type()
+ << " Shape: " << input->info()->tensor_shape()
+ << " Num groups: " << num_groups << std::endl);
- return RETURN_UNIQUE_PTR(func);
+ return func;
}
/** Create a backend layer concatenate function
@@ -373,54 +376,49 @@ std::unique_ptr<IFunction> create_channel_shuffle_layer(ChannelShuffleLayerNode
template <typename ConcatenateLayerFunction, typename TargetInfo>
std::unique_ptr<arm_compute::IFunction> create_concatenate_layer(ConcatenateLayerNode &node)
{
- ARM_COMPUTE_LOG_GRAPH_VERBOSE("Creating Concatenate node with ID : " << node.id() << " and Name: " << node.name() << std::endl);
+ ARM_COMPUTE_LOG_GRAPH_VERBOSE("Creating Concatenate node with ID : " << node.id() << " and Name: " << node.name()
+ << std::endl);
ARM_COMPUTE_ERROR_ON(node.num_outputs() != 1);
// Return nullptr if depth concatenate is switched off
- if(!node.is_enabled())
+ if (!node.is_enabled())
{
return nullptr;
}
// Extract IO and info
- std::vector<typename TargetInfo::TensorType *> inputs;
- for(unsigned int i = 0; i < node.num_inputs(); ++i)
+ std::vector<typename TargetInfo::SrcTensorType *> inputs;
+ for (unsigned int i = 0; i < node.num_inputs(); ++i)
{
inputs.push_back(get_backing_tensor<TargetInfo>(node.input(i)));
}
- typename TargetInfo::TensorType *output = get_backing_tensor<TargetInfo>(node.output(0));
- const DataLayout data_layout = node.output(0) != nullptr ? node.output(0)->desc().layout : DataLayout::UNKNOWN;
- const size_t concat_axis = get_dimension_idx(data_layout, node.concatenation_axis());
+ typename TargetInfo::TensorType *output = get_backing_tensor<TargetInfo>(node.output(0));
+ const DataLayout data_layout = node.output(0) != nullptr ? node.output(0)->desc().layout : DataLayout::UNKNOWN;
+ const size_t concat_axis = get_dimension_idx(data_layout, node.concatenation_axis());
// Create and configure function
- auto func = support::cpp14::make_unique<ConcatenateLayerFunction>();
+ auto func = std::make_unique<ConcatenateLayerFunction>();
func->configure(inputs, output, concat_axis);
// Log info
const bool is_quantized = is_data_type_quantized_asymmetric(output->info()->data_type());
std::ostringstream qss;
- if(is_quantized)
+ if (is_quantized)
{
qss << " Output QuantInfo: " << output->info()->quantization_info();
}
- ARM_COMPUTE_LOG_GRAPH_INFO("Instantiated "
- << node.name()
- << " Type: " << node.type()
- << " Target: " << TargetInfo::TargetType
- << " Data Type: " << output->info()->data_type()
- << " Shape: " << output->info()->tensor_shape()
- << " Num Inputs: " << inputs.size()
- << " Axis: " << concat_axis
- << qss.str()
- << std::endl);
-
- return RETURN_UNIQUE_PTR(func);
+ ARM_COMPUTE_LOG_GRAPH_INFO(
+ "Instantiated " << node.name() << " Type: " << node.type() << " Target: " << TargetInfo::TargetType
+ << " Data Type: " << output->info()->data_type() << " Shape: " << output->info()->tensor_shape()
+ << " Num Inputs: " << inputs.size() << " Axis: " << concat_axis << qss.str() << std::endl);
+
+ return func;
}
/** Create a backend convolution layer function
*
* @tparam ConvolutionLayerFunctions Backend convolution functions
- * @tparam TargetInfo Target-specific information
+ * @tparam TargetInfo Target-specific information
*
* @param[in] node Node to create the backend function for
* @param[in] ctx Graph context
@@ -440,7 +438,7 @@ std::unique_ptr<IFunction> create_convolution_layer(ConvolutionLayerNode &node,
const bool is_quantized = is_data_type_quantized_asymmetric(input->info()->data_type());
- if(is_quantized)
+ if (is_quantized)
{
biases->info()->set_data_type(DataType::S32);
}
@@ -456,56 +454,51 @@ std::unique_ptr<IFunction> create_convolution_layer(ConvolutionLayerNode &node,
std::unique_ptr<IFunction> func;
std::string func_name;
- if(conv_algorithm == ConvolutionMethod::Winograd)
+ if (conv_algorithm == ConvolutionMethod::Winograd)
{
ARM_COMPUTE_ERROR_ON_MSG(num_groups != 1, "WinogradConvolutionLayer does not support grouping!");
- std::tie(func, func_name) = create_named_memory_managed_function<typename ConvolutionLayerFunctions::WinogradConvolutionLayer>(
- std::string("WinogradConvolutionLayer"), mm,
- input, weights, biases, output, conv_info, fused_act, fast_math);
+ std::tie(func, func_name) =
+ create_named_memory_managed_function<typename ConvolutionLayerFunctions::WinogradConvolutionLayer>(
+ std::string("WinogradConvolutionLayer"), mm, input, weights, biases, output, conv_info, fused_act,
+ fast_math);
}
- else if(conv_algorithm == ConvolutionMethod::Direct)
+ else if (conv_algorithm == ConvolutionMethod::Direct)
{
ARM_COMPUTE_ERROR_ON_MSG(num_groups != 1, "DirectConvolutionLayer does not support grouping!");
std::tie(func, func_name) = create_named_function<typename ConvolutionLayerFunctions::DirectConvolutionLayer>(
- std::string("DirectConvolutionLayer"),
- input, weights, biases, output, conv_info, fused_act);
+ std::string("DirectConvolutionLayer"), input, weights, biases, output, conv_info, fused_act);
}
- else if(conv_algorithm == ConvolutionMethod::GEMM)
+ else if (conv_algorithm == ConvolutionMethod::GEMM)
{
- std::tie(func, func_name) = create_named_memory_managed_function<typename ConvolutionLayerFunctions::GEMMConvolutionLayer>(
- std::string("GEMMConvolutionLayer"), mm,
- input, weights, biases, output, conv_info,
- WeightsInfo(), Size2D(1U, 1U), fused_act, num_groups);
+ std::tie(func, func_name) =
+ create_named_memory_managed_function<typename ConvolutionLayerFunctions::GEMMConvolutionLayer>(
+ std::string("GEMMConvolutionLayer"), mm, input, weights, biases, output, conv_info, WeightsInfo(),
+ Size2D(1U, 1U), fused_act, num_groups);
}
else
{
- std::tie(func, func_name) = create_named_memory_managed_function<typename ConvolutionLayerFunctions::GenericConvolutionLayer>(
- std::string("GenericConvolutionLayer"), mm,
- input, weights, biases, output, conv_info,
- WeightsInfo(), Size2D(1U, 1U), fused_act, fast_math, num_groups);
+ std::tie(func, func_name) =
+ create_named_memory_managed_function<typename ConvolutionLayerFunctions::GenericConvolutionLayer>(
+ std::string("GenericConvolutionLayer"), mm, input, weights, biases, output, conv_info, WeightsInfo(),
+ Size2D(1U, 1U), fused_act, fast_math, num_groups);
}
// Log info
std::ostringstream qss;
- if(is_quantized)
+ if (is_quantized)
{
qss << " Input QuantInfo: " << input->info()->quantization_info()
<< " Weights QuantInfo: " << weights->info()->quantization_info()
<< " Output QuantInfo: " << output->info()->quantization_info();
}
ARM_COMPUTE_LOG_GRAPH_INFO("Instantiated "
- << node.name()
- << " Type: " << func_name
- << " Target: " << TargetInfo::TargetType
- << " Data Type: " << input->info()->data_type()
- << " Groups: " << num_groups
+ << node.name() << " Type: " << func_name << " Target: " << TargetInfo::TargetType
+ << " Data Type: " << input->info()->data_type() << " Groups: " << num_groups
<< " Input shape: " << input->info()->tensor_shape()
<< " Weights shape: " << weights->info()->tensor_shape()
- << " Output shape: " << output->info()->tensor_shape()
- << qss.str()
- << (fused_act.enabled() ? " " + to_string(fused_act.activation()) : "")
- << std::endl);
- return RETURN_UNIQUE_PTR(func);
+ << " Output shape: " << output->info()->tensor_shape() << qss.str()
+ << (fused_act.enabled() ? " " + to_string(fused_act.activation()) : "") << std::endl);
+ return func;
}
/** Create a backend deconvolution layer function
@@ -536,19 +529,14 @@ std::unique_ptr<IFunction> create_deconvolution_layer(DeconvolutionLayerNode &no
std::unique_ptr<IFunction> func;
std::tie(func, std::ignore) = create_named_memory_managed_function<DeconvolutionLayerFunction>(
- std::string(), mm,
- input, weights, biases, output, deconv_info);
+ std::string(), mm, input, weights, biases, output, deconv_info);
// Log info
- ARM_COMPUTE_LOG_GRAPH_INFO("Instantiated "
- << node.name()
- << " Type: " << node.type()
- << " Target: " << TargetInfo::TargetType
- << " Data Type: " << input->info()->data_type()
- << " Input shape: " << input->info()->tensor_shape()
- << " Weights shape: " << weights->info()->tensor_shape()
- << " Output shape: " << output->info()->tensor_shape()
- << std::endl);
+ ARM_COMPUTE_LOG_GRAPH_INFO("Instantiated " << node.name() << " Type: " << node.type() << " Target: "
+ << TargetInfo::TargetType << " Data Type: " << input->info()->data_type()
+ << " Input shape: " << input->info()->tensor_shape()
+ << " Weights shape: " << weights->info()->tensor_shape()
+ << " Output shape: " << output->info()->tensor_shape() << std::endl);
return func;
}
@@ -574,7 +562,7 @@ std::unique_ptr<IFunction> create_depthwise_convolution_layer(DepthwiseConvoluti
const bool is_quantized = is_data_type_quantized_asymmetric(input->info()->data_type());
- if(is_quantized)
+ if (is_quantized)
{
biases->info()->set_data_type(DataType::S32);
}
@@ -587,31 +575,61 @@ std::unique_ptr<IFunction> create_depthwise_convolution_layer(DepthwiseConvoluti
std::unique_ptr<IFunction> func;
std::string func_name;
- std::tie(func, func_name) = create_named_function<DepthwiseConvolutionLayer>(
- std::string("DepthwiseConvolutionLayer"),
- input, weights, biases, output, conv_info, depth_multiplier, fused_act);
+ std::tie(func, func_name) =
+ create_named_function<DepthwiseConvolutionLayer>(std::string("DepthwiseConvolutionLayer"), input, weights,
+ biases, output, conv_info, depth_multiplier, fused_act);
// Log info
std::ostringstream qss;
- if(is_quantized)
+ if (is_quantized)
{
qss << " Input QuantInfo: " << input->info()->quantization_info()
<< " Weights QuantInfo: " << weights->info()->quantization_info()
<< " Output QuantInfo: " << output->info()->quantization_info();
}
ARM_COMPUTE_LOG_GRAPH_INFO("Instantiated "
- << node.name()
- << " Type: " << func_name
- << " Target: " << TargetInfo::TargetType
- << " Data Type: " << input->info()->data_type()
- << " Input shape: " << input->info()->tensor_shape()
- << " Weights shape: " << weights->info()->tensor_shape()
+ << node.name() << " Type: " << func_name << " Target: " << TargetInfo::TargetType
+ << " Data Type: " << input->info()->data_type() << " Input shape: "
+ << input->info()->tensor_shape() << " Weights shape: " << weights->info()->tensor_shape()
<< " Output shape: " << output->info()->tensor_shape()
- << " Depth multiplier: " << depth_multiplier
- << qss.str()
- << (fused_act.enabled() ? " " + to_string(fused_act.activation()) : "")
- << std::endl);
- return RETURN_UNIQUE_PTR(func);
+ << " Depth multiplier: " << depth_multiplier << qss.str()
+ << (fused_act.enabled() ? " " + to_string(fused_act.activation()) : "") << std::endl);
+ return func;
+}
+
+/** Create a backend depth to space layer function
+ *
+ * @tparam DepthToSpaceLayerFunction Backend depth to space function
+ * @tparam TargetInfo Target-specific information
+ *
+ * @param[in] node Node to create the backend function for
+ *
+ * @return Backend depth to space layer function
+ */
+template <typename DepthToSpaceLayerFunction, typename TargetInfo>
+std::unique_ptr<IFunction> create_depth_to_space_layer(DepthToSpaceLayerNode &node)
+{
+ validate_node<TargetInfo>(node, 1 /* expected inputs */, 1 /* expected outputs */);
+
+ // Extract IO and info
+ typename TargetInfo::TensorType *input = get_backing_tensor<TargetInfo>(node.input(0));
+ typename TargetInfo::TensorType *output = get_backing_tensor<TargetInfo>(node.output(0));
+
+ ARM_COMPUTE_ERROR_ON(input == nullptr);
+ ARM_COMPUTE_ERROR_ON(output == nullptr);
+
+ // Create and configure function
+ auto func = std::make_unique<DepthToSpaceLayerFunction>();
+ func->configure(input, output, node.block_shape());
+
+ // Log info
+ ARM_COMPUTE_LOG_GRAPH_INFO("Instantiated " << node.name() << " Type: " << node.type() << " Target: "
+ << TargetInfo::TargetType << " Data Type: " << input->info()->data_type()
+ << " Input shape: " << input->info()->tensor_shape()
+ << " Block Size: " << node.block_shape()
+ << " Output shape: " << output->info()->tensor_shape() << std::endl);
+
+ return func;
}
/** Create a backend dequantize layer function
@@ -636,21 +654,17 @@ std::unique_ptr<IFunction> create_dequantization_layer(DequantizationLayerNode &
ARM_COMPUTE_ERROR_ON(output == nullptr);
// Create and configure function
- auto func = support::cpp14::make_unique<DequantizationLayerFunction>();
+ auto func = std::make_unique<DequantizationLayerFunction>();
func->configure(input, output);
// Log info
- ARM_COMPUTE_LOG_GRAPH_INFO("Instantiated "
- << node.name()
- << " Type: " << node.type()
- << " Target: " << TargetInfo::TargetType
- << " Data Type: " << input->info()->data_type()
- << " Input shape: " << input->info()->tensor_shape()
- << " Input quantization info: " << output->info()->quantization_info()
- << " Output shape: " << output->info()->tensor_shape()
- << std::endl);
+ ARM_COMPUTE_LOG_GRAPH_INFO("Instantiated " << node.name() << " Type: " << node.type() << " Target: "
+ << TargetInfo::TargetType << " Data Type: " << input->info()->data_type()
+ << " Input shape: " << input->info()->tensor_shape()
+ << " Input quantization info: " << output->info()->quantization_info()
+ << " Output shape: " << output->info()->tensor_shape() << std::endl);
- return RETURN_UNIQUE_PTR(func);
+ return func;
}
/** Create a backend detection output layer function
*
@@ -679,23 +693,19 @@ std::unique_ptr<IFunction> create_detection_output_layer(DetectionOutputLayerNod
ARM_COMPUTE_ERROR_ON(output == nullptr);
// Create and configure function
- auto func = support::cpp14::make_unique<DetectionOutputLayerFunction>();
+ auto func = std::make_unique<DetectionOutputLayerFunction>();
func->configure(input0, input1, input2, output, detect_info);
// Log info
ARM_COMPUTE_LOG_GRAPH_INFO("Instantiated "
- << node.name()
- << " Type: " << node.type()
- << " Target: " << TargetInfo::TargetType
- << " Data Type: " << input0->info()->data_type()
- << " Input0 shape: " << input0->info()->tensor_shape()
- << " Input1 shape: " << input1->info()->tensor_shape()
+ << node.name() << " Type: " << node.type() << " Target: " << TargetInfo::TargetType
+ << " Data Type: " << input0->info()->data_type() << " Input0 shape: "
+ << input0->info()->tensor_shape() << " Input1 shape: " << input1->info()->tensor_shape()
<< " Input2 shape: " << input2->info()->tensor_shape()
<< " Output shape: " << output->info()->tensor_shape()
- << " DetectionOutputLayer info: " << detect_info
- << std::endl);
+ << " DetectionOutputLayer info: " << detect_info << std::endl);
- return RETURN_UNIQUE_PTR(func);
+ return func;
}
/** Create a backend detection post process layer function
@@ -731,26 +741,22 @@ std::unique_ptr<IFunction> create_detection_post_process_layer(DetectionPostProc
ARM_COMPUTE_ERROR_ON(output3 == nullptr);
// Create and configure function
- auto func = support::cpp14::make_unique<DetectionPostProcessLayerFunction>();
+ auto func = std::make_unique<DetectionPostProcessLayerFunction>();
func->configure(input0, input1, input2, output0, output1, output2, output3, detect_info);
// Log info
ARM_COMPUTE_LOG_GRAPH_INFO("Instantiated "
- << node.name()
- << " Type: " << node.type()
- << " Target: " << TargetInfo::TargetType
- << " Data Type: " << input0->info()->data_type()
- << " Input0 shape: " << input0->info()->tensor_shape()
- << " Input1 shape: " << input1->info()->tensor_shape()
+ << node.name() << " Type: " << node.type() << " Target: " << TargetInfo::TargetType
+ << " Data Type: " << input0->info()->data_type() << " Input0 shape: "
+ << input0->info()->tensor_shape() << " Input1 shape: " << input1->info()->tensor_shape()
<< " Input2 shape: " << input2->info()->tensor_shape()
<< " Output0 shape: " << output0->info()->tensor_shape()
<< " Output1 shape: " << output1->info()->tensor_shape()
<< " Output2 shape: " << output2->info()->tensor_shape()
<< " Output3 shape: " << output3->info()->tensor_shape()
- << " DetectionPostProcessLayer info: " << detect_info
- << std::endl);
+ << " DetectionPostProcessLayer info: " << detect_info << std::endl);
- return RETURN_UNIQUE_PTR(func);
+ return func;
}
/** Create a backend element-wise operation layer function
@@ -780,23 +786,31 @@ std::unique_ptr<IFunction> create_eltwise_layer(EltwiseLayerNode &node)
std::unique_ptr<IFunction> func = nullptr;
std::string func_name;
- if(eltwise_op == EltwiseOperation::Add)
+ if (eltwise_op == EltwiseOperation::Add)
{
std::tie(func, func_name) = create_named_function<typename EltwiseFunctions::Addition>(
- std::string("ArithmeticAddition"),
- input1, input2, output, convert_policy, act_info);
+ std::string("ArithmeticAddition"), input1, input2, output, convert_policy, act_info);
}
- else if(eltwise_op == EltwiseOperation::Sub)
+ else if (eltwise_op == EltwiseOperation::Sub)
{
std::tie(func, func_name) = create_named_function<typename EltwiseFunctions::Subtraction>(
- std::string("ArithmeticSubtraction"),
- input1, input2, output, convert_policy, act_info);
+ std::string("ArithmeticSubtraction"), input1, input2, output, convert_policy, act_info);
}
- else if(eltwise_op == EltwiseOperation::Mul)
+ else if (eltwise_op == EltwiseOperation::Mul)
{
std::tie(func, func_name) = create_named_function<typename EltwiseFunctions::Multiplication>(
- std::string("PixelWiseMultiplication"),
- input1, input2, output, 1.f, convert_policy, node.rounding_policy(), act_info);
+ std::string("PixelWiseMultiplication"), input1, input2, output, 1.f, convert_policy, node.rounding_policy(),
+ act_info);
+ }
+ else if (eltwise_op == EltwiseOperation::Max)
+ {
+ std::tie(func, func_name) = create_named_function<typename EltwiseFunctions::Maximum>(
+ std::string("ElementwiseMaximum"), input1, input2, output, act_info);
+ }
+ else if (eltwise_op == EltwiseOperation::Div)
+ {
+ std::tie(func, func_name) = create_named_function<typename EltwiseFunctions::Division>(
+ std::string("ArithmeticDivision"), input1, input2, output, act_info);
}
else
{
@@ -804,16 +818,55 @@ std::unique_ptr<IFunction> create_eltwise_layer(EltwiseLayerNode &node)
}
// Log info
- ARM_COMPUTE_LOG_GRAPH_INFO("Instantiated "
- << node.name()
- << " Type: " << node.type()
- << " Target: " << TargetInfo::TargetType
- << " Operation: " << func_name
- << " Data Type: " << input1->info()->data_type()
- << " Shape: " << input1->info()->tensor_shape()
- << std::endl);
-
- return RETURN_UNIQUE_PTR(func);
+ ARM_COMPUTE_LOG_GRAPH_INFO("Instantiated " << node.name() << " Type: " << node.type()
+ << " Target: " << TargetInfo::TargetType << " Operation: " << func_name
+ << " Data Type: " << input1->info()->data_type()
+ << " Shape: " << input1->info()->tensor_shape() << std::endl);
+
+ return func;
+}
+
+/** Create a backend unary element-wise operation layer function
+ *
+ * @tparam UnaryEltwiseFunctions Backend unary element-wise function
+ * @tparam TargetInfo Target-specific information
+ *
+ * @param[in] node Node to create the backend function for
+ *
+ * @return Backend unary element-wise operation layer function
+ */
+template <typename UnaryEltwiseFunctions, typename TargetInfo>
+std::unique_ptr<IFunction> create_unary_eltwise_layer(UnaryEltwiseLayerNode &node)
+{
+ validate_node<TargetInfo>(node, 1 /* expected inputs */, 1 /* expected outputs */);
+
+ // Extract IO and info
+ typename TargetInfo::TensorType *input = get_backing_tensor<TargetInfo>(node.input(0));
+ typename TargetInfo::TensorType *output = get_backing_tensor<TargetInfo>(node.output(0));
+ const UnaryEltwiseOperation eltwise_op = node.eltwise_descriptor().op;
+
+ ARM_COMPUTE_ERROR_ON(input == nullptr);
+ ARM_COMPUTE_ERROR_ON(output == nullptr);
+
+ std::unique_ptr<IFunction> func = nullptr;
+ std::string func_name;
+ if (eltwise_op == UnaryEltwiseOperation::Exp)
+ {
+ std::tie(func, func_name) =
+ create_named_function<typename UnaryEltwiseFunctions::Exp>(std::string("Exp"), input, output);
+ }
+ else
+ {
+ ARM_COMPUTE_ERROR("Unsupported unary element-wise operation!");
+ }
+
+ // Log info
+ ARM_COMPUTE_LOG_GRAPH_INFO("Instantiated " << node.name() << " Type: " << node.type()
+ << " Target: " << TargetInfo::TargetType << " Operation: " << func_name
+ << " Data Type: " << input->info()->data_type()
+ << " Shape: " << input->info()->tensor_shape() << std::endl);
+
+ return func;
}
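The UnaryEltwiseFunctions template argument is a small policy type that maps operation names to concrete functions; a hypothetical CL instantiation (type name and shape assumed) could look like:

    struct CLUnaryEltwiseFunctions
    {
        using Exp = arm_compute::CLExpLayer; // must provide configure(input, output)
    };

    // Dispatch sketch:
    // detail::create_unary_eltwise_layer<CLUnaryEltwiseFunctions, CLTargetInfo>(node);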
/** Create a backend flatten layer function
@@ -838,20 +891,16 @@ std::unique_ptr<IFunction> create_flatten_layer(FlattenLayerNode &node)
ARM_COMPUTE_ERROR_ON(output == nullptr);
// Create and configure function
- auto func = support::cpp14::make_unique<FlattenLayerFunction>();
+ auto func = std::make_unique<FlattenLayerFunction>();
func->configure(input, output);
// Log info
- ARM_COMPUTE_LOG_GRAPH_INFO("Instantiated "
- << node.name()
- << " Type: " << node.type()
- << " Target: " << TargetInfo::TargetType
- << " Data Type: " << input->info()->data_type()
- << " Input shape: " << input->info()->tensor_shape()
- << " Output shape: " << output->info()->tensor_shape()
- << std::endl);
+ ARM_COMPUTE_LOG_GRAPH_INFO("Instantiated " << node.name() << " Type: " << node.type() << " Target: "
+ << TargetInfo::TargetType << " Data Type: " << input->info()->data_type()
+ << " Input shape: " << input->info()->tensor_shape()
+ << " Output shape: " << output->info()->tensor_shape() << std::endl);
- return RETURN_UNIQUE_PTR(func);
+ return func;
}
/** Create a backend fully connected layer function
@@ -874,7 +923,8 @@ std::unique_ptr<IFunction> create_fully_connected_layer(FullyConnectedLayerNode
typename TargetInfo::TensorType *weights = get_backing_tensor<TargetInfo>(node.input(1));
typename TargetInfo::TensorType *biases = get_backing_tensor<TargetInfo>(node.input(2));
typename TargetInfo::TensorType *output = get_backing_tensor<TargetInfo>(node.output(0));
- const FullyConnectedLayerInfo fc_info = node.info();
+ FullyConnectedLayerInfo fc_info = node.info();
+ fc_info.enable_fast_math = (node.fast_math_hint() == FastMathHint::Enabled);
ARM_COMPUTE_ERROR_ON(input == nullptr);
ARM_COMPUTE_ERROR_ON(weights == nullptr);
@@ -883,31 +933,26 @@ std::unique_ptr<IFunction> create_fully_connected_layer(FullyConnectedLayerNode
// Create and configure function
auto wm = get_weights_manager(ctx, TargetInfo::TargetType);
auto mm = get_memory_manager(ctx, TargetInfo::TargetType);
- auto func = support::cpp14::make_unique<FullyConnectedLayerFunction>(mm, wm.get());
+ auto func = std::make_unique<FullyConnectedLayerFunction>(mm, wm.get());
func->configure(input, weights, biases, output, fc_info);
const bool is_quantized = is_data_type_quantized_asymmetric(input->info()->data_type());
// Log info
std::ostringstream qss;
- if(is_quantized)
+ if (is_quantized)
{
qss << " Input QuantInfo: " << input->info()->quantization_info()
<< " Weights QuantInfo: " << weights->info()->quantization_info()
<< " Output QuantInfo: " << output->info()->quantization_info();
}
- ARM_COMPUTE_LOG_GRAPH_INFO("Instantiated "
- << node.name()
- << " Type: " << node.type()
- << " Target: " << TargetInfo::TargetType
- << " Data Type: " << input->info()->data_type()
- << qss.str()
- << " Input shape: " << input->info()->tensor_shape()
- << " Weights shape: " << weights->info()->tensor_shape()
- << " Output shape: " << output->info()->tensor_shape()
- << std::endl);
+ ARM_COMPUTE_LOG_GRAPH_INFO("Instantiated " << node.name() << " Type: " << node.type() << " Target: "
+ << TargetInfo::TargetType << " Data Type: " << input->info()->data_type()
+ << qss.str() << " Input shape: " << input->info()->tensor_shape()
+ << " Weights shape: " << weights->info()->tensor_shape()
+ << " Output shape: " << output->info()->tensor_shape() << std::endl);
- return RETURN_UNIQUE_PTR(func);
+ return func;
}
/** Create a backend generate proposals layer function
@@ -941,22 +986,59 @@ std::unique_ptr<IFunction> create_generate_proposals_layer(GenerateProposalsLaye
ARM_COMPUTE_ERROR_ON(scores_out == nullptr);
// Create and configure function
- auto func = support::cpp14::make_unique<GenerateProposalsLayerFunction>(get_memory_manager(ctx, TargetInfo::TargetType));
+ auto func = std::make_unique<GenerateProposalsLayerFunction>(get_memory_manager(ctx, TargetInfo::TargetType));
func->configure(scores, deltas, anchors, proposals, scores_out, num_valid_proposals, info);
// Log info
- ARM_COMPUTE_LOG_GRAPH_INFO("Instantiated " << node.type()
- << " Target " << TargetInfo::TargetType
- << " Data Type: " << scores->info()->data_type()
- << " Scores shape: " << scores->info()->tensor_shape()
+ ARM_COMPUTE_LOG_GRAPH_INFO("Instantiated "
+ << node.type() << " Target " << TargetInfo::TargetType << " Data Type: "
+ << scores->info()->data_type() << " Scores shape: " << scores->info()->tensor_shape()
<< " Deltas shape: " << deltas->info()->tensor_shape()
<< " Anchors shape: " << anchors->info()->tensor_shape()
<< " Proposals shape: " << proposals->info()->tensor_shape()
<< " Num valid proposals shape: " << num_valid_proposals->info()->tensor_shape()
- << " Scores Out shape: " << scores_out->info()->tensor_shape()
- << std::endl);
+ << " Scores Out shape: " << scores_out->info()->tensor_shape() << std::endl);
- return RETURN_UNIQUE_PTR(func);
+ return std::move(func);
+}
+
+/** Create a backend L2 normalization layer function
+ *
+ * @tparam L2NormalizeLayerFunction Backend L2 normalization function
+ * @tparam TargetInfo Target-specific information
+ *
+ * @param[in] node Node to create the backend function for
+ * @param[in] ctx Graph context
+ *
+ * @return Backend L2 normalization layer function
+ */
+template <typename L2NormalizeLayerFunction, typename TargetInfo>
+std::unique_ptr<IFunction> create_l2_normalize_layer(L2NormalizeLayerNode &node, GraphContext &ctx)
+{
+ validate_node<TargetInfo>(node, 1 /* expected inputs */, 1 /* expected outputs */);
+
+ // Extract IO and info
+ typename TargetInfo::TensorType *input = get_backing_tensor<TargetInfo>(node.input(0));
+ typename TargetInfo::TensorType *output = get_backing_tensor<TargetInfo>(node.output(0));
+ int axis = node.axis();
+ float epsilon = node.epsilon();
+
+ ARM_COMPUTE_ERROR_ON(input == nullptr);
+ ARM_COMPUTE_ERROR_ON(output == nullptr);
+
+ // Create and configure function
+ auto mm = get_memory_manager(ctx, TargetInfo::TargetType);
+ auto func = std::make_unique<L2NormalizeLayerFunction>(mm);
+ func->configure(input, output, axis, epsilon);
+
+ // Log info
+ ARM_COMPUTE_LOG_GRAPH_INFO("Instantiated " << node.name() << " Type: " << node.type() << " Target: "
+ << TargetInfo::TargetType << " Data Type: " << input->info()->data_type()
+ << " Input shape: " << input->info()->tensor_shape()
+ << " Output shape: " << output->info()->tensor_shape()
+ << " Axis: " << axis << " Epsilon: " << epsilon << std::endl);
+
+ return func;
}
/** Create a backend normalization layer function
@@ -984,21 +1066,17 @@ std::unique_ptr<IFunction> create_normalization_layer(NormalizationLayerNode &no
ARM_COMPUTE_ERROR_ON(output == nullptr);
// Create and configure function
- auto func = support::cpp14::make_unique<NormalizationLayerFunction>();
+ auto func = std::make_unique<NormalizationLayerFunction>();
func->configure(input, output, norm_info);
// Log info
- ARM_COMPUTE_LOG_GRAPH_INFO("Instantiated "
- << node.name()
- << " Type: " << node.type()
- << " Target: " << TargetInfo::TargetType
- << " Data Type: " << input->info()->data_type()
- << " Input shape: " << input->info()->tensor_shape()
- << " Output shape: " << output->info()->tensor_shape()
- << " Normalization info: " << norm_info.type()
- << std::endl);
+ ARM_COMPUTE_LOG_GRAPH_INFO("Instantiated " << node.name() << " Type: " << node.type() << " Target: "
+ << TargetInfo::TargetType << " Data Type: " << input->info()->data_type()
+ << " Input shape: " << input->info()->tensor_shape()
+ << " Output shape: " << output->info()->tensor_shape()
+ << " Normalization info: " << norm_info.type() << std::endl);
- return RETURN_UNIQUE_PTR(func);
+ return std::move(func);
}
/** Create a backend normalize planar YUV layer function
@@ -1026,19 +1104,15 @@ std::unique_ptr<IFunction> create_normalize_planar_yuv_layer(NormalizePlanarYUVL
ARM_COMPUTE_ERROR_ON(output == nullptr);
// Create and configure function
- auto func = support::cpp14::make_unique<NormalizePlanarYUVLayerFunction>();
+ auto func = std::make_unique<NormalizePlanarYUVLayerFunction>();
func->configure(input, output, mean, std);
// Log info
- ARM_COMPUTE_LOG_GRAPH_INFO("Instantiated "
- << node.name()
- << " Type: " << node.type()
- << " Target: " << TargetInfo::TargetType
- << " Data Type: " << input->info()->data_type()
- << " Shape: " << input->info()->tensor_shape()
- << std::endl);
+ ARM_COMPUTE_LOG_GRAPH_INFO("Instantiated " << node.name() << " Type: " << node.type() << " Target: "
+ << TargetInfo::TargetType << " Data Type: " << input->info()->data_type()
+ << " Shape: " << input->info()->tensor_shape() << std::endl);
- return RETURN_UNIQUE_PTR(func);
+ return std::move(func);
}
/** Create a backend pad layer function
@@ -1064,20 +1138,16 @@ std::unique_ptr<IFunction> create_pad_layer(PadLayerNode &node)
ARM_COMPUTE_ERROR_ON(output == nullptr);
// Create and configure function
- auto func = support::cpp14::make_unique<PadLayerFunction>();
+ auto func = std::make_unique<PadLayerFunction>();
func->configure(input, output, padding, pad_value);
// Log info
- ARM_COMPUTE_LOG_GRAPH_INFO("Instantiated "
- << node.name()
- << " Type: " << node.type()
- << " Target: " << TargetInfo::TargetType
- << " Data Type: " << input->info()->data_type()
- << " Input shape: " << input->info()->tensor_shape()
- << " Output shape: " << output->info()->tensor_shape()
- << std::endl);
+ ARM_COMPUTE_LOG_GRAPH_INFO("Instantiated " << node.name() << " Type: " << node.type() << " Target: "
+ << TargetInfo::TargetType << " Data Type: " << input->info()->data_type()
+ << " Input shape: " << input->info()->tensor_shape()
+ << " Output shape: " << output->info()->tensor_shape() << std::endl);
- return RETURN_UNIQUE_PTR(func);
+ return func;
}
/** Create a backend permute layer function
@@ -1102,21 +1172,17 @@ std::unique_ptr<IFunction> create_permute_layer(PermuteLayerNode &node)
ARM_COMPUTE_ERROR_ON(output == nullptr);
// Create and configure function
- auto func = support::cpp14::make_unique<PermuteLayerFunction>();
+ auto func = std::make_unique<PermuteLayerFunction>();
func->configure(input, output, perm);
// Log info
- ARM_COMPUTE_LOG_GRAPH_INFO("Instantiated "
- << node.name()
- << " Type: " << node.type()
- << " Target: " << TargetInfo::TargetType
- << " Data Type: " << input->info()->data_type()
- << " Input shape: " << input->info()->tensor_shape()
- << " Output shape: " << output->info()->tensor_shape()
- << " Permutation vector: " << perm
- << std::endl);
+ ARM_COMPUTE_LOG_GRAPH_INFO("Instantiated " << node.name() << " Type: " << node.type() << " Target: "
+ << TargetInfo::TargetType << " Data Type: " << input->info()->data_type()
+ << " Input shape: " << input->info()->tensor_shape()
+ << " Output shape: " << output->info()->tensor_shape()
+ << " Permutation vector: " << perm << std::endl);
- return RETURN_UNIQUE_PTR(func);
+ return func;
}
/** Create a backend pooling layer function
@@ -1141,21 +1207,17 @@ std::unique_ptr<IFunction> create_pooling_layer(PoolingLayerNode &node)
ARM_COMPUTE_ERROR_ON(output == nullptr);
// Create and configure function
- auto func = support::cpp14::make_unique<PoolingLayerFunction>();
+ auto func = std::make_unique<PoolingLayerFunction>();
func->configure(input, output, pool_info);
// Log info
- ARM_COMPUTE_LOG_GRAPH_INFO("Instantiated "
- << node.name()
- << " Type: " << node.type()
- << " Target: " << TargetInfo::TargetType
- << " Data Type: " << input->info()->data_type()
- << " Input shape: " << input->info()->tensor_shape()
- << " Output shape: " << output->info()->tensor_shape()
- << " Pooling info: " << pool_info.pool_type
- << std::endl);
+ ARM_COMPUTE_LOG_GRAPH_INFO("Instantiated " << node.name() << " Type: " << node.type() << " Target: "
+ << TargetInfo::TargetType << " Data Type: " << input->info()->data_type()
+ << " Input shape: " << input->info()->tensor_shape()
+ << " Output shape: " << output->info()->tensor_shape()
+ << " Pooling info: " << pool_info.pool_type << std::endl);
- return RETURN_UNIQUE_PTR(func);
+ return func;
}
/** Create a backend PRelu layer function
@@ -1180,20 +1242,16 @@ std::unique_ptr<IFunction> create_prelu_layer(PReluLayerNode &node)
ARM_COMPUTE_ERROR_ON(output == nullptr);
// Create and configure function
- auto func = support::cpp14::make_unique<PReluFunction>();
+ auto func = std::make_unique<PReluFunction>();
func->configure(input, alpha, output);
// Log info
- ARM_COMPUTE_LOG_GRAPH_INFO("Instantiated "
- << node.name()
- << " Type: " << node.type()
- << " Target: " << TargetInfo::TargetType
- << " Data Type: " << input->info()->data_type()
- << " Input shape: " << input->info()->tensor_shape()
- << " Output shape: " << output->info()->tensor_shape()
- << std::endl);
+ ARM_COMPUTE_LOG_GRAPH_INFO("Instantiated " << node.name() << " Type: " << node.type() << " Target: "
+ << TargetInfo::TargetType << " Data Type: " << input->info()->data_type()
+ << " Input shape: " << input->info()->tensor_shape()
+ << " Output shape: " << output->info()->tensor_shape() << std::endl);
- return RETURN_UNIQUE_PTR(func);
+ return func;
}
/** Create a backend print layer function
@@ -1214,13 +1272,9 @@ std::unique_ptr<IFunction> create_print_layer(PrintLayerNode &node)
ARM_COMPUTE_UNUSED(input);
// Log info
- ARM_COMPUTE_LOG_GRAPH_INFO("Instantiated "
- << node.name()
- << " Type: " << node.type()
- << " Target: " << TargetInfo::TargetType
- << " Data Type: " << input->info()->data_type()
- << " Input shape: " << input->info()->tensor_shape()
- << std::endl);
+ ARM_COMPUTE_LOG_GRAPH_INFO("Instantiated " << node.name() << " Type: " << node.type() << " Target: "
+ << TargetInfo::TargetType << " Data Type: " << input->info()->data_type()
+ << " Input shape: " << input->info()->tensor_shape() << std::endl);
return nullptr;
}
@@ -1249,22 +1303,18 @@ std::unique_ptr<IFunction> create_priorbox_layer(PriorBoxLayerNode &node)
ARM_COMPUTE_ERROR_ON(output == nullptr);
// Create and configure function
- auto func = support::cpp14::make_unique<PriorBoxLayerFunction>();
+ auto func = std::make_unique<PriorBoxLayerFunction>();
func->configure(input0, input1, output, prior_info);
// Log info
ARM_COMPUTE_LOG_GRAPH_INFO("Instantiated "
- << node.name()
- << " Type: " << node.type()
- << " Target: " << TargetInfo::TargetType
- << " Data Type: " << input0->info()->data_type()
- << " Input0 shape: " << input0->info()->tensor_shape()
- << " Input1 shape: " << input1->info()->tensor_shape()
+ << node.name() << " Type: " << node.type() << " Target: " << TargetInfo::TargetType
+ << " Data Type: " << input0->info()->data_type() << " Input0 shape: "
+ << input0->info()->tensor_shape() << " Input1 shape: " << input1->info()->tensor_shape()
<< " Output shape: " << output->info()->tensor_shape()
- << " PriorBoxLayer info: " << prior_info
- << std::endl);
+ << " PriorBoxLayer info: " << prior_info << std::endl);
- return RETURN_UNIQUE_PTR(func);
+ return func;
}
/** Create a backend quantization layer function
@@ -1288,20 +1338,55 @@ std::unique_ptr<IFunction> create_quantization_layer(QuantizationLayerNode &node
ARM_COMPUTE_ERROR_ON(output == nullptr);
// Create and configure function
- auto func = support::cpp14::make_unique<QuantizationLayerFunction>();
+ auto func = std::make_unique<QuantizationLayerFunction>();
func->configure(input, output);
// Log info
+ ARM_COMPUTE_LOG_GRAPH_INFO("Instantiated " << node.name() << " Type: " << node.type() << " Target: "
+ << TargetInfo::TargetType << " Data Type: " << input->info()->data_type()
+ << " Input shape: " << input->info()->tensor_shape()
+ << " Output shape: " << output->info()->tensor_shape() << std::endl);
+
+ return func;
+}
+
+/** Create a backend reduction operation layer function
+ *
+ * @tparam ReductionOperationFunction Backend reduction operation function
+ * @tparam TargetInfo Target-specific information
+ *
+ * @param[in] node Node to create the backend function for
+ * @param[in] ctx Graph context
+ *
+ * @return Backend reduction operation layer function
+ */
+template <typename ReductionOperationFunction, typename TargetInfo>
+std::unique_ptr<IFunction> create_reduction_operation_layer(ReductionLayerNode &node, GraphContext &ctx)
+{
+ validate_node<TargetInfo>(node, 1 /* expected inputs */, 1 /* expected outputs */);
+
+ // Extract IO and info
+ typename TargetInfo::TensorType *input = get_backing_tensor<TargetInfo>(node.input(0));
+ typename TargetInfo::TensorType *output = get_backing_tensor<TargetInfo>(node.output(0));
+ ReductionOperation op = node.op();
+ int axis = node.axis();
+ bool keep_dims = node.keep_dims();
+ ARM_COMPUTE_ERROR_ON(input == nullptr);
+ ARM_COMPUTE_ERROR_ON(output == nullptr);
+
+ // Create and configure function
+ auto func = std::make_unique<ReductionOperationFunction>(get_memory_manager(ctx, TargetInfo::TargetType));
+ func->configure(input, output, axis, op, keep_dims);
+
+ // Log info
ARM_COMPUTE_LOG_GRAPH_INFO("Instantiated "
- << node.name()
- << " Type: " << node.type()
- << " Target: " << TargetInfo::TargetType
+ << node.name() << " Type: " << node.type() << " Target: " << TargetInfo::TargetType
<< " Data Type: " << input->info()->data_type()
<< " Input shape: " << input->info()->tensor_shape()
- << " Output shape: " << output->info()->tensor_shape()
- << std::endl);
+ << " Output shape: " << output->info()->tensor_shape() << " Operation: " << op
+ << " Axis: " << axis << " Keep dimensions:" << keep_dims << std::endl);
- return RETURN_UNIQUE_PTR(func);
+ return func;
}
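As a usage sketch, a backend function factory typically instantiates this template with its own function and target types; the NEON type names below are illustrative assumptions rather than part of this patch:

// Hypothetical dispatch inside a backend function factory:
// case NodeType::ReductionOperationLayer:
//     return detail::create_reduction_operation_layer<NEReductionOperation, NETargetInfo>(
//         *polymorphic_downcast<ReductionLayerNode *>(node), ctx);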
/** Create a backend reorg layer function
@@ -1325,20 +1410,16 @@ std::unique_ptr<IFunction> create_reorg_layer(ReorgLayerNode &node)
ARM_COMPUTE_ERROR_ON(output == nullptr);
// Create and configure function
- auto func = support::cpp14::make_unique<ReorgLayerFunction>();
+ auto func = std::make_unique<ReorgLayerFunction>();
func->configure(input, output, node.stride());
// Log info
- ARM_COMPUTE_LOG_GRAPH_INFO("Instantiated "
- << node.name()
- << " Type: " << node.type()
- << " Target: " << TargetInfo::TargetType
- << " Data Type: " << input->info()->data_type()
- << " Input shape: " << input->info()->tensor_shape()
- << " Output shape: " << output->info()->tensor_shape()
- << std::endl);
+ ARM_COMPUTE_LOG_GRAPH_INFO("Instantiated " << node.name() << " Type: " << node.type() << " Target: "
+ << TargetInfo::TargetType << " Data Type: " << input->info()->data_type()
+ << " Input shape: " << input->info()->tensor_shape()
+ << " Output shape: " << output->info()->tensor_shape() << std::endl);
- return RETURN_UNIQUE_PTR(func);
+ return func;
}
/** Create a backend reshape layer function
@@ -1362,20 +1443,16 @@ std::unique_ptr<IFunction> create_reshape_layer(ReshapeLayerNode &node)
ARM_COMPUTE_ERROR_ON(output == nullptr);
// Create and configure function
- auto func = support::cpp14::make_unique<ReshapeLayerFunction>();
+ auto func = std::make_unique<ReshapeLayerFunction>();
func->configure(input, output);
// Log info
- ARM_COMPUTE_LOG_GRAPH_INFO("Instantiated "
- << node.name()
- << " Type: " << node.type()
- << " Target: " << TargetInfo::TargetType
- << " Data Type: " << input->info()->data_type()
- << " Input shape: " << input->info()->tensor_shape()
- << " Output shape: " << output->info()->tensor_shape()
- << std::endl);
+ ARM_COMPUTE_LOG_GRAPH_INFO("Instantiated " << node.name() << " Type: " << node.type() << " Target: "
+ << TargetInfo::TargetType << " Data Type: " << input->info()->data_type()
+ << " Input shape: " << input->info()->tensor_shape()
+ << " Output shape: " << output->info()->tensor_shape() << std::endl);
- return RETURN_UNIQUE_PTR(func);
+ return func;
}
/** Create a backend resize layer function
@@ -1400,21 +1477,18 @@ std::unique_ptr<IFunction> create_resize_layer(ResizeLayerNode &node)
const InterpolationPolicy policy = node.policy();
// Create and configure function
- auto func = support::cpp14::make_unique<ResizeLayerFunction>();
- func->configure(input, output, policy, BorderMode::CONSTANT);
+ auto func = std::make_unique<ResizeLayerFunction>();
+ func->configure(input, output,
+ ScaleKernelInfo{policy, BorderMode::CONSTANT, PixelValue(), SamplingPolicy::CENTER, false, false});
// Log info
- ARM_COMPUTE_LOG_GRAPH_INFO("Instantiated "
- << node.name()
- << " Type: " << node.type()
- << " Target: " << TargetInfo::TargetType
- << " Data Type: " << input->info()->data_type()
- << " Input shape: " << input->info()->tensor_shape()
- << " Output shape: " << output->info()->tensor_shape()
- << " Interpolation: " << policy
- << std::endl);
+ ARM_COMPUTE_LOG_GRAPH_INFO("Instantiated " << node.name() << " Type: " << node.type() << " Target: "
+ << TargetInfo::TargetType << " Data Type: " << input->info()->data_type()
+ << " Input shape: " << input->info()->tensor_shape()
+ << " Output shape: " << output->info()->tensor_shape()
+ << " Interpolation: " << policy << std::endl);
- return RETURN_UNIQUE_PTR(func);
+ return func;
}
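For readability, the braced ScaleKernelInfo passed to configure above maps field-by-field as annotated below; the names of the two trailing booleans are an assumption from this reading of the API (likely use_padding and align_corners), not something stated in the patch:

// ScaleKernelInfo{policy,                 // interpolation policy from the node
//                 BorderMode::CONSTANT,   // border handling mode
//                 PixelValue(),           // constant border value (default)
//                 SamplingPolicy::CENTER, // sampling policy
//                 false,                  // use_padding (assumed name)
//                 false};                 // align_corners (assumed name)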
/** Create a backend ROI align layer function
@@ -1442,24 +1516,20 @@ std::unique_ptr<IFunction> create_roi_align_layer(ROIAlignLayerNode &node)
const ROIPoolingLayerInfo pool_info = node.pooling_info();
// Create and configure function
- auto func = support::cpp14::make_unique<ROIAlignLayerFunction>();
+ auto func = std::make_unique<ROIAlignLayerFunction>();
func->configure(input, rois, output, pool_info);
// Log info
- ARM_COMPUTE_LOG_GRAPH_INFO("Instantiated "
- << node.name()
- << " Type: " << node.type()
- << " Target: " << TargetInfo::TargetType
- << " Data Type: " << input->info()->data_type()
- << " Input shape: " << input->info()->tensor_shape()
- << " Output shape: " << output->info()->tensor_shape()
- << " ROIs shape: " << rois->info()->tensor_shape()
- << " ROIPooling width: " << pool_info.pooled_width()
- << " ROIPooling height: " << pool_info.pooled_height()
- << std::endl);
-
- return RETURN_UNIQUE_PTR(func);
+ ARM_COMPUTE_LOG_GRAPH_INFO("Instantiated " << node.name() << " Type: " << node.type() << " Target: "
+ << TargetInfo::TargetType << " Data Type: " << input->info()->data_type()
+ << " Input shape: " << input->info()->tensor_shape()
+ << " Output shape: " << output->info()->tensor_shape()
+ << " ROIs shape: " << rois->info()->tensor_shape()
+ << " ROIPooling width: " << pool_info.pooled_width()
+ << " ROIPooling height: " << pool_info.pooled_height() << std::endl);
+
+ return func;
}
/** Create a backend slice layer function
@@ -1483,20 +1553,16 @@ std::unique_ptr<IFunction> create_slice_layer(SliceLayerNode &node)
ARM_COMPUTE_ERROR_ON(output == nullptr);
// Create and configure function
- auto func = support::cpp14::make_unique<SliceLayerFunction>();
+ auto func = std::make_unique<SliceLayerFunction>();
func->configure(input, output, node.starts(), node.ends());
// Log info
- ARM_COMPUTE_LOG_GRAPH_INFO("Instantiated "
- << node.name()
- << " Type: " << node.type()
- << " Target: " << TargetInfo::TargetType
- << " Data Type: " << input->info()->data_type()
- << " Input shape: " << input->info()->tensor_shape()
- << " Output shape: " << output->info()->tensor_shape()
- << std::endl);
+ ARM_COMPUTE_LOG_GRAPH_INFO("Instantiated " << node.name() << " Type: " << node.type() << " Target: "
+ << TargetInfo::TargetType << " Data Type: " << input->info()->data_type()
+ << " Input shape: " << input->info()->tensor_shape()
+ << " Output shape: " << output->info()->tensor_shape() << std::endl);
- return RETURN_UNIQUE_PTR(func);
+ return func;
}
/** Create a backend softmax layer function
@@ -1522,20 +1588,16 @@ std::unique_ptr<IFunction> create_softmax_layer(SoftmaxLayerNode &node, GraphCon
ARM_COMPUTE_ERROR_ON(output == nullptr);
// Create and configure function
- auto func = support::cpp14::make_unique<SoftmaxLayerFunction>(get_memory_manager(ctx, TargetInfo::TargetType));
+ auto func = std::make_unique<SoftmaxLayerFunction>(get_memory_manager(ctx, TargetInfo::TargetType));
func->configure(input, output, beta);
// Log info
- ARM_COMPUTE_LOG_GRAPH_INFO("Instantiated "
- << node.name()
- << " Type: " << node.type()
- << " Target: " << TargetInfo::TargetType
- << " Data Type: " << input->info()->data_type()
- << " Input shape: " << input->info()->tensor_shape()
- << " Output shape: " << output->info()->tensor_shape()
- << std::endl);
+ ARM_COMPUTE_LOG_GRAPH_INFO("Instantiated " << node.name() << " Type: " << node.type() << " Target: "
+ << TargetInfo::TargetType << " Data Type: " << input->info()->data_type()
+ << " Input shape: " << input->info()->tensor_shape()
+ << " Output shape: " << output->info()->tensor_shape() << std::endl);
- return RETURN_UNIQUE_PTR(func);
+ return func;
}
/** Create a backend layer stack function
@@ -1550,12 +1612,13 @@ std::unique_ptr<IFunction> create_softmax_layer(SoftmaxLayerNode &node, GraphCon
template <typename StackLayerFunction, typename TargetInfo>
std::unique_ptr<arm_compute::IFunction> create_stack_layer(StackLayerNode &node)
{
- ARM_COMPUTE_LOG_GRAPH_VERBOSE("Creating Stack node with ID : " << node.id() << " and Name: " << node.name() << std::endl);
+ ARM_COMPUTE_LOG_GRAPH_VERBOSE("Creating Stack node with ID : " << node.id() << " and Name: " << node.name()
+ << std::endl);
ARM_COMPUTE_ERROR_ON(node.num_outputs() != 1);
// Extract IO and info
std::vector<typename TargetInfo::TensorType *> inputs;
- for(unsigned int i = 0; i < node.num_inputs(); ++i)
+ for (unsigned int i = 0; i < node.num_inputs(); ++i)
{
inputs.push_back(get_backing_tensor<TargetInfo>(node.input(i)));
}
@@ -1563,113 +1626,60 @@ std::unique_ptr<arm_compute::IFunction> create_stack_layer(StackLayerNode &node)
const int axis = node.axis();
// Create and configure function
- auto func = support::cpp14::make_unique<StackLayerFunction>();
+ auto func = std::make_unique<StackLayerFunction>();
func->configure(inputs, axis, output);
// Log info
- ARM_COMPUTE_LOG_GRAPH_INFO("Instantiated "
- << node.name()
- << " Type: " << node.type()
- << " Target: " << TargetInfo::TargetType
- << " Data Type: " << output->info()->data_type()
- << " Inputs shape: " << inputs[0]->info()->tensor_shape()
- << " Output shape: " << output->info()->tensor_shape()
- << " Num Inputs: " << inputs.size()
- << " Axis: " << axis
- << std::endl);
+ ARM_COMPUTE_LOG_GRAPH_INFO("Instantiated " << node.name() << " Type: " << node.type()
+ << " Target: " << TargetInfo::TargetType
+ << " Data Type: " << output->info()->data_type()
+ << " Inputs shape: " << inputs[0]->info()->tensor_shape()
+ << " Output shape: " << output->info()->tensor_shape()
+ << " Num Inputs: " << inputs.size() << " Axis: " << axis << std::endl);
- return RETURN_UNIQUE_PTR(func);
+ return func;
}
-/** Create a backend Upsample layer function
- *
- * @tparam UpsampleLayerFunction Backend Upsample function
- * @tparam TargetInfo Target-specific information
- *
- * @param[in] node Node to create the backend function for
- * @param[in] ctx Graph context
- *
- * @return Backend Upsample layer function
- */
-template <typename UpsampleLayerFunction, typename TargetInfo>
-std::unique_ptr<IFunction> create_upsample_layer(UpsampleLayerNode &node, GraphContext &ctx)
-{
- ARM_COMPUTE_UNUSED(ctx);
- validate_node<TargetInfo>(node, 1 /* expected inputs */, 1 /* expected outputs */);
-
- // Extract IO and info
- typename TargetInfo::TensorType *input = get_backing_tensor<TargetInfo>(node.input(0));
- typename TargetInfo::TensorType *output = get_backing_tensor<TargetInfo>(node.output(0));
- const Size2D info = node.info();
- const InterpolationPolicy upsampling_policy = node.upsampling_policy();
- ARM_COMPUTE_ERROR_ON(upsampling_policy != InterpolationPolicy::NEAREST_NEIGHBOR);
- ARM_COMPUTE_ERROR_ON(info.x() != 2 || info.y() != 2);
- ARM_COMPUTE_ERROR_ON(input == nullptr);
- ARM_COMPUTE_ERROR_ON(output == nullptr);
- // Create and configure function
- auto func = support::cpp14::make_unique<UpsampleLayerFunction>();
- func->configure(input, output, info, upsampling_policy);
-
- // Log info
- ARM_COMPUTE_LOG_GRAPH_INFO("Instantiated "
- << node.name()
- << " Type: " << node.type()
- << " Target: " << TargetInfo::TargetType
- << " Data Type: " << input->info()->data_type()
- << " Input shape: " << input->info()->tensor_shape()
- << " Output shape: " << output->info()->tensor_shape()
- << " Strides: " << info
- << " Upsampling policy: " << upsampling_policy
- << std::endl);
-
- return RETURN_UNIQUE_PTR(func);
-}
-/** Create a backend YOLO layer function
+/** Create a backend strided slice layer function
*
- * @tparam YoloLayerFunction Backend YOLO function
- * @tparam TargetInfo Target-specific information
+ * @tparam StridedSliceLayerFunction Backend strided slice function
+ * @tparam TargetInfo Target-specific information
*
* @param[in] node Node to create the backend function for
- * @param[in] ctx Graph context
*
- * @return Backend YOLO layer function
+ * @return Backend strided slice layer function
*/
-template <typename YOLOlayerFunction, typename TargetInfo>
-std::unique_ptr<IFunction> create_yolo_layer(YOLOLayerNode &node, GraphContext &ctx)
+template <typename StridedSliceLayerFunction, typename TargetInfo>
+std::unique_ptr<IFunction> create_strided_slice_layer(StridedSliceLayerNode &node)
{
- ARM_COMPUTE_UNUSED(ctx);
validate_node<TargetInfo>(node, 1 /* expected inputs */, 1 /* expected outputs */);
// Extract IO and info
- typename TargetInfo::TensorType *input = get_backing_tensor<TargetInfo>(node.input(0));
- typename TargetInfo::TensorType *output = get_backing_tensor<TargetInfo>(node.output(0));
- const ActivationLayerInfo act_info = node.activation_info();
- const int32_t num_classes = node.num_classes();
- ARM_COMPUTE_ERROR_ON(num_classes <= 0);
+ typename TargetInfo::TensorType *input = get_backing_tensor<TargetInfo>(node.input(0));
+ typename TargetInfo::TensorType *output = get_backing_tensor<TargetInfo>(node.output(0));
+ Coordinates starts = node.starts();
+ Coordinates ends = node.ends();
+ BiStrides strides = node.strides();
+ StridedSliceLayerInfo info = node.strided_slice_info();
+
ARM_COMPUTE_ERROR_ON(input == nullptr);
ARM_COMPUTE_ERROR_ON(output == nullptr);
// Create and configure function
- auto func = support::cpp14::make_unique<YOLOlayerFunction>();
- func->configure(input, output, act_info, num_classes);
+ auto func = std::make_unique<StridedSliceLayerFunction>();
+ func->configure(input, output, starts, ends, strides, info.begin_mask(), info.end_mask(), info.shrink_axis_mask());
// Log info
- ARM_COMPUTE_LOG_GRAPH_INFO("Instantiated "
- << node.name()
- << " Type: " << node.type()
- << " Target: " << TargetInfo::TargetType
- << " Data Type: " << input->info()->data_type()
- << " Input shape: " << input->info()->tensor_shape()
- << " Output shape: " << output->info()->tensor_shape()
- << " Activation function: " << act_info.activation()
- << " Num classes: " << num_classes
- << std::endl);
+ ARM_COMPUTE_LOG_GRAPH_INFO("Instantiated " << node.name() << " Type: " << node.type() << " Target: "
+ << TargetInfo::TargetType << " Data Type: " << input->info()->data_type()
+ << " Input shape: " << input->info()->tensor_shape()
+ << " Output shape: " << output->info()->tensor_shape() << std::endl);
- return RETURN_UNIQUE_PTR(func);
+ return func;
}
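The begin/end/shrink-axis masks forwarded to configure above follow the usual TensorFlow-style bit convention, where bit i of begin_mask set means starts[i] is ignored and the fullest possible range is used for that dimension; a scalar sketch of that decoding, under that assumption:

#include <cstdint>

// Effective start index for dimension `dim` of extent `extent`, honouring
// the begin_mask convention described above; negative indices wrap.
int effective_start(int32_t begin_mask, int dim, int requested, int extent)
{
    if ((begin_mask & (1 << dim)) != 0)
        return 0; // bit set: take the full range from the lowest index
    int start = requested < 0 ? requested + extent : requested;
    if (start < 0)
        start = 0;
    return start > extent ? extent : start;
}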
} // namespace detail
} // namespace backends
} // namespace graph
} // namespace arm_compute
-#endif /* ARM_COMPUTE_GRAPH_BACKENDS_DETAIL_FUNCTION_HELPERS_H */
+#endif // ACL_ARM_COMPUTE_GRAPH_BACKENDS_FUNCTIONHELPERS_H
diff --git a/arm_compute/graph/backends/FusedConvolutionBatchNormalizationFunction.h b/arm_compute/graph/backends/FusedConvolutionBatchNormalizationFunction.h
index 895906da13..27e21cbc7e 100644
--- a/arm_compute/graph/backends/FusedConvolutionBatchNormalizationFunction.h
+++ b/arm_compute/graph/backends/FusedConvolutionBatchNormalizationFunction.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019 ARM Limited.
+ * Copyright (c) 2019, 2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -22,11 +22,12 @@
* SOFTWARE.
*/
-#ifndef ARM_COMPUTE_GRAPH_BACKENDS_FUSED_CONVOLUTION_BATCH_NORMAZLIZATION_FUNCTION_H
-#define ARM_COMPUTE_GRAPH_BACKENDS_FUSED_CONVOLUTION_BATCH_NORMAZLIZATION_FUNCTION_H
+#ifndef ACL_ARM_COMPUTE_GRAPH_BACKENDS_FUSEDCONVOLUTIONBATCHNORMALIZATIONFUNCTION_H
+#define ACL_ARM_COMPUTE_GRAPH_BACKENDS_FUSEDCONVOLUTIONBATCHNORMALIZATIONFUNCTION_H
#include "arm_compute/core/Types.h"
#include "arm_compute/runtime/IFunction.h"
+#include "arm_compute/runtime/IMemoryManager.h"
namespace arm_compute
{
@@ -69,15 +70,19 @@ public:
* @param[in] fused_act Activation layer information in case of a fused activation.
*
*/
- void configure(TensorType *input,
- TensorType *weights,
- TensorType *bias,
- TensorType *output,
- const TensorType *mean,
- const TensorType *var,
- const TensorType *beta,
- const TensorType *gamma,
- float epsilon, const PadStrideInfo &conv_info, unsigned int num_groups, bool fast_math, ActivationLayerInfo const &fused_act)
+ void configure(TensorType *input,
+ TensorType *weights,
+ TensorType *bias,
+ TensorType *output,
+ const TensorType *mean,
+ const TensorType *var,
+ const TensorType *beta,
+ const TensorType *gamma,
+ float epsilon,
+ const PadStrideInfo &conv_info,
+ unsigned int num_groups,
+ bool fast_math,
+ ActivationLayerInfo const &fused_act)
{
// We don't run any validate, as we assume that the layers have been already validated
const bool has_bias = (bias != nullptr);
@@ -85,7 +90,7 @@ public:
// We check if the layer has a bias. If yes, use it in-place. If not, we need to create one
// as batch normalization might end up with a bias != 0
- if(has_bias)
+ if (has_bias)
{
_fused_batch_norm_layer.configure(weights, mean, var, nullptr, nullptr, bias, beta, gamma, epsilon);
bias_to_use = bias;
@@ -96,9 +101,10 @@ public:
bias_to_use = &_fused_bias;
}
- _conv_layer.configure(input, weights, bias_to_use, output, conv_info, WeightsInfo(), Size2D(1U, 1U), fused_act, fast_math, num_groups);
+ _conv_layer.configure(input, weights, bias_to_use, output, conv_info, WeightsInfo(), Size2D(1U, 1U), fused_act,
+ fast_math, num_groups);
- if(!has_bias)
+ if (!has_bias)
{
_fused_bias.allocator()->allocate();
}
@@ -113,7 +119,7 @@ public:
void prepare()
{
- if(!_is_prepared)
+ if (!_is_prepared)
{
_fused_batch_norm_layer.run();
_is_prepared = true;
@@ -130,4 +136,4 @@ private:
} // namespace graph
} // namespace arm_compute
-#endif /* ARM_COMPUTE_GRAPH_BACKENDS_FUSED_CONVOLUTION_BATCH_NORMAZLIZATION_FUNCTION_H */
+#endif // ACL_ARM_COMPUTE_GRAPH_BACKENDS_FUSEDCONVOLUTIONBATCHNORMALIZATIONFUNCTION_H
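For reference, the fusion that _fused_batch_norm_layer.configure sets up in this class is the standard folding of batch normalization into the preceding convolution; a scalar per-channel sketch of that arithmetic (an illustration, not the library's implementation):

#include <cmath>

// w_fused = w * gamma / sqrt(var + epsilon)
// b_fused = (b - mean) * gamma / sqrt(var + epsilon) + beta
struct FoldedParams
{
    float weight_scale; // multiply each weight of the channel by this
    float bias;         // replacement bias for the channel
};

FoldedParams fold_batch_norm(float bias, float mean, float var,
                             float beta, float gamma, float epsilon)
{
    const float inv_std = 1.0f / std::sqrt(var + epsilon);
    return {gamma * inv_std, (bias - mean) * gamma * inv_std + beta};
}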
diff --git a/arm_compute/graph/backends/FusedDepthwiseConvolutionBatchNormalizationFunction.h b/arm_compute/graph/backends/FusedDepthwiseConvolutionBatchNormalizationFunction.h
index 37f3eab2e3..07a2cdd8b8 100644
--- a/arm_compute/graph/backends/FusedDepthwiseConvolutionBatchNormalizationFunction.h
+++ b/arm_compute/graph/backends/FusedDepthwiseConvolutionBatchNormalizationFunction.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019 ARM Limited.
+ * Copyright (c) 2019 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -67,15 +67,18 @@ public:
* @param[in] fused_act Activation layer information in case of a fused activation.
*
*/
- void configure(TensorType *input,
- TensorType *weights,
- TensorType *bias,
- TensorType *output,
- const TensorType *mean,
- const TensorType *var,
- const TensorType *beta,
- const TensorType *gamma,
- float epsilon, const PadStrideInfo &conv_info, unsigned int depth_multiplier, ActivationLayerInfo const &fused_act)
+ void configure(TensorType *input,
+ TensorType *weights,
+ TensorType *bias,
+ TensorType *output,
+ const TensorType *mean,
+ const TensorType *var,
+ const TensorType *beta,
+ const TensorType *gamma,
+ float epsilon,
+ const PadStrideInfo &conv_info,
+ unsigned int depth_multiplier,
+ ActivationLayerInfo const &fused_act)
{
// We don't run any validate, as we assume that the layers have been already validated
const bool has_bias = (bias != nullptr);
@@ -83,20 +86,23 @@ public:
// We check if the layer has a bias. If yes, use it in-place. If not, we need to create one
// as batch normalization might end up with a bias != 0
- if(has_bias)
+ if (has_bias)
{
- _fused_batch_norm_layer.configure(weights, mean, var, nullptr, nullptr, bias, beta, gamma, epsilon, FuseBatchNormalizationType::DEPTHWISECONVOLUTION);
+ _fused_batch_norm_layer.configure(weights, mean, var, nullptr, nullptr, bias, beta, gamma, epsilon,
+ FuseBatchNormalizationType::DEPTHWISECONVOLUTION);
bias_to_use = bias;
}
else
{
- _fused_batch_norm_layer.configure(weights, mean, var, nullptr, &_fused_bias, nullptr, beta, gamma, epsilon, FuseBatchNormalizationType::DEPTHWISECONVOLUTION);
+ _fused_batch_norm_layer.configure(weights, mean, var, nullptr, &_fused_bias, nullptr, beta, gamma, epsilon,
+ FuseBatchNormalizationType::DEPTHWISECONVOLUTION);
bias_to_use = &_fused_bias;
}
- _depth_conv_layer.configure(input, weights, bias_to_use, output, conv_info, depth_multiplier, fused_act.enabled() ? fused_act : ActivationLayerInfo());
+ _depth_conv_layer.configure(input, weights, bias_to_use, output, conv_info, depth_multiplier,
+ fused_act.enabled() ? fused_act : ActivationLayerInfo());
- if(!has_bias)
+ if (!has_bias)
{
_fused_bias.allocator()->allocate();
}
@@ -111,7 +117,7 @@ public:
void prepare()
{
- if(!_is_prepared)
+ if (!_is_prepared)
{
_fused_batch_norm_layer.run();
_is_prepared = true;
diff --git a/arm_compute/graph/backends/GLES/GCDeviceBackend.h b/arm_compute/graph/backends/GLES/GCDeviceBackend.h
deleted file mode 100644
index a6c0bfe7ed..0000000000
--- a/arm_compute/graph/backends/GLES/GCDeviceBackend.h
+++ /dev/null
@@ -1,64 +0,0 @@
-/*
- * Copyright (c) 2018-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_GRAPH_GCDEVICEBACKEND_H
-#define ARM_COMPUTE_GRAPH_GCDEVICEBACKEND_H
-
-#include "arm_compute/graph/IDeviceBackend.h"
-
-#include "arm_compute/runtime/GLES_COMPUTE/GCBufferAllocator.h"
-
-namespace arm_compute
-{
-namespace graph
-{
-namespace backends
-{
-/** GLES Compute device backend */
-class GCDeviceBackend final : public IDeviceBackend
-{
-public:
- /** Default Constructor */
- GCDeviceBackend();
-
- // Inherited overridden methods
- void initialize_backend() override;
- void setup_backend_context(GraphContext &ctx) override;
- void release_backend_context(GraphContext &ctx) override;
- bool is_backend_supported() override;
- IAllocator *backend_allocator() override;
- std::unique_ptr<ITensorHandle> create_tensor(const Tensor &tensor) override;
- std::unique_ptr<ITensorHandle> create_subtensor(ITensorHandle *parent, TensorShape shape, Coordinates coords, bool extend_parent) override;
- std::unique_ptr<arm_compute::IFunction> configure_node(INode &node, GraphContext &ctx) override;
- Status validate_node(INode &node) override;
- std::shared_ptr<arm_compute::IMemoryManager> create_memory_manager(MemoryManagerAffinity affinity) override;
- std::shared_ptr<arm_compute::IWeightsManager> create_weights_manager() override;
-
-private:
- bool _initialized; /**< Flag that specifies if the backend has been default initialized */
- GCBufferAllocator _allocator; /**< GLES buffer affinity allocator */
-};
-} // namespace backends
-} // namespace graph
-} // namespace arm_compute
-#endif //ARM_COMPUTE_GRAPH_GCDEVICEBACKEND_H
diff --git a/arm_compute/graph/backends/GLES/GCTensorHandle.h b/arm_compute/graph/backends/GLES/GCTensorHandle.h
deleted file mode 100644
index 119731df58..0000000000
--- a/arm_compute/graph/backends/GLES/GCTensorHandle.h
+++ /dev/null
@@ -1,72 +0,0 @@
-/*
- * Copyright (c) 2018-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_GRAPH_GCTENSORHANDLE_H
-#define ARM_COMPUTE_GRAPH_GCTENSORHANDLE_H
-
-#include "arm_compute/graph/ITensorHandle.h"
-
-#include "arm_compute/runtime/GLES_COMPUTE/GCTensor.h"
-
-namespace arm_compute
-{
-namespace graph
-{
-namespace backends
-{
-/** GLES compute tensor handle interface object **/
-class GCTensorHandle final : public ITensorHandle
-{
-public:
- /** Default Constructor
- *
- * @param[in] info Tensor metadata
- */
- GCTensorHandle(const ITensorInfo &info);
- /** Destructor: free the tensor's memory */
- ~GCTensorHandle() = default;
- /** Allow instances of this class to be move constructed */
- GCTensorHandle(GCTensorHandle &&) = default;
- /** Allow instances of this class to be moved */
- GCTensorHandle &operator=(GCTensorHandle &&) = default;
-
- // Inherited overridden methods
- void allocate() override;
- void free() override;
- void manage(IMemoryGroup *mg) override;
- void map(bool blocking) override;
- void unmap() override;
- void release_if_unused() override;
- arm_compute::ITensor &tensor() override;
- const arm_compute::ITensor &tensor() const override;
- ITensorHandle *parent_handle() override;
- bool is_subtensor() const override;
- Target target() const override;
-
-private:
- arm_compute::GCTensor _tensor; /**< Backend Tensor */
-};
-} // namespace backends
-} // namespace graph
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_GRAPH_GCTENSORHANDLE_H */
diff --git a/arm_compute/graph/backends/NEON/NEDeviceBackend.h b/arm_compute/graph/backends/NEON/NEDeviceBackend.h
index 87acc5551e..cd817a20d8 100644
--- a/arm_compute/graph/backends/NEON/NEDeviceBackend.h
+++ b/arm_compute/graph/backends/NEON/NEDeviceBackend.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018-2019 ARM Limited.
+ * Copyright (c) 2018-2021 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -25,7 +25,6 @@
#define ARM_COMPUTE_GRAPH_NEDEVICEBACKEND_H
#include "arm_compute/graph/IDeviceBackend.h"
-
#include "arm_compute/runtime/Allocator.h"
namespace arm_compute
@@ -34,27 +33,29 @@ namespace graph
{
namespace backends
{
-/** NEON device backend */
+/** CPU device backend */
class NEDeviceBackend final : public IDeviceBackend
{
public:
NEDeviceBackend();
// Inherited overridden methods
- void initialize_backend() override;
- void setup_backend_context(GraphContext &ctx) override;
- void release_backend_context(GraphContext &ctx) override;
+ void initialize_backend() override;
+ void setup_backend_context(GraphContext &ctx) override;
+ void release_backend_context(GraphContext &ctx) override;
bool is_backend_supported() override;
IAllocator *backend_allocator() override;
std::unique_ptr<ITensorHandle> create_tensor(const Tensor &tensor) override;
- std::unique_ptr<ITensorHandle> create_subtensor(ITensorHandle *parent, TensorShape shape, Coordinates coords, bool extend_parent) override;
- std::unique_ptr<arm_compute::IFunction> configure_node(INode &node, GraphContext &ctx) override;
- Status validate_node(INode &node) override;
- std::shared_ptr<arm_compute::IMemoryManager> create_memory_manager(MemoryManagerAffinity affinity) override;
+ std::unique_ptr<ITensorHandle>
+ create_subtensor(ITensorHandle *parent, TensorShape shape, Coordinates coords, bool extend_parent) override;
+ std::unique_ptr<arm_compute::IFunction> configure_node(INode &node, GraphContext &ctx) override;
+ Status validate_node(INode &node) override;
+ std::shared_ptr<arm_compute::IMemoryManager> create_memory_manager(MemoryManagerAffinity affinity) override;
std::shared_ptr<arm_compute::IWeightsManager> create_weights_manager() override;
+ void sync() override;
private:
- Allocator _allocator; /**< NEON backend allocator */
+ Allocator _allocator; /**< Backend allocator */
};
} // namespace backends
} // namespace graph
diff --git a/arm_compute/graph/backends/NEON/NEFunctionFactory.h b/arm_compute/graph/backends/NEON/NEFunctionFactory.h
index b05b9f0c8b..6365b71f32 100644
--- a/arm_compute/graph/backends/NEON/NEFunctionFactory.h
+++ b/arm_compute/graph/backends/NEON/NEFunctionFactory.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018-2019 ARM Limited.
+ * Copyright (c) 2018-2021 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -38,7 +38,7 @@ class GraphContext;
namespace backends
{
-/** Factory for generating NEON backend functions **/
+/** Factory for generating CPU backend functions **/
class NEFunctionFactory final
{
public:
diff --git a/arm_compute/graph/backends/NEON/NENodeValidator.h b/arm_compute/graph/backends/NEON/NENodeValidator.h
index 578f405575..5d23b8bb06 100644
--- a/arm_compute/graph/backends/NEON/NENodeValidator.h
+++ b/arm_compute/graph/backends/NEON/NENodeValidator.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018-2019 ARM Limited.
+ * Copyright (c) 2018-2019 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
diff --git a/arm_compute/graph/backends/NEON/NESubTensorHandle.h b/arm_compute/graph/backends/NEON/NESubTensorHandle.h
index 1dbc0530e5..3619f4ed1b 100644
--- a/arm_compute/graph/backends/NEON/NESubTensorHandle.h
+++ b/arm_compute/graph/backends/NEON/NESubTensorHandle.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018-2019 ARM Limited.
+ * Copyright (c) 2018-2021 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -25,7 +25,6 @@
#define ARM_COMPUTE_GRAPH_NESUBTENSORHANDLE_H
#include "arm_compute/graph/ITensorHandle.h"
-
#include "arm_compute/runtime/SubTensor.h"
namespace arm_compute
@@ -34,7 +33,7 @@ namespace graph
{
namespace backends
{
-/** NEON Sub-Tensor handle interface object **/
+/** CPU Sub-Tensor handle interface object **/
class NESubTensorHandle final : public ITensorHandle
{
public:
@@ -45,7 +44,10 @@ public:
* @param[in] coords Starting coordinates
* @param[in] extend_parent Extends parent shape if true
*/
- NESubTensorHandle(ITensorHandle *parent_handle, const TensorShape &shape, const Coordinates &coords, bool extend_parent = false);
+ NESubTensorHandle(ITensorHandle *parent_handle,
+ const TensorShape &shape,
+ const Coordinates &coords,
+ bool extend_parent = false);
/** Destructor: free the tensor's memory */
~NESubTensorHandle() = default;
/** Allow instances of this class to be move constructed */
@@ -58,10 +60,10 @@ public:
NESubTensorHandle &operator=(const NESubTensorHandle &) = delete;
// Inherited overridden methods
- void allocate() override;
- void free() override;
- void manage(IMemoryGroup *mg) override;
- void map(bool blocking) override;
+ void allocate() override;
+ void free() override;
+ void manage(IMemoryGroup *mg) override;
+ void map(bool blocking) override;
void unmap() override;
void release_if_unused() override;
arm_compute::ITensor &tensor() override;
diff --git a/arm_compute/graph/backends/NEON/NETensorHandle.h b/arm_compute/graph/backends/NEON/NETensorHandle.h
index 0f1b748cba..1df90822ba 100644
--- a/arm_compute/graph/backends/NEON/NETensorHandle.h
+++ b/arm_compute/graph/backends/NEON/NETensorHandle.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018-2019 ARM Limited.
+ * Copyright (c) 2018-2021 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -25,7 +25,6 @@
#define ARM_COMPUTE_GRAPH_NETENSORHANDLE_H
#include "arm_compute/graph/ITensorHandle.h"
-
#include "arm_compute/runtime/Tensor.h"
namespace arm_compute
@@ -34,7 +33,7 @@ namespace graph
{
namespace backends
{
-/** NEON Tensor handle interface object **/
+/** CPU Tensor handle interface object **/
class NETensorHandle final : public ITensorHandle
{
public:
@@ -51,10 +50,10 @@ public:
NETensorHandle &operator=(NETensorHandle &&) = default;
// Inherited overridden methods
- void allocate() override;
- void free() override;
- void manage(IMemoryGroup *mg) override;
- void map(bool blocking) override;
+ void allocate() override;
+ void free() override;
+ void manage(IMemoryGroup *mg) override;
+ void map(bool blocking) override;
void unmap() override;
void release_if_unused() override;
arm_compute::ITensor &tensor() override;
diff --git a/arm_compute/graph/backends/Utils.h b/arm_compute/graph/backends/Utils.h
index 4893340b4c..5f4e66c207 100644
--- a/arm_compute/graph/backends/Utils.h
+++ b/arm_compute/graph/backends/Utils.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018-2019 ARM Limited.
+ * Copyright (c) 2018-2020 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -42,9 +42,10 @@ namespace backends
* @return A configured backend function
*/
template <typename FunctionType, typename FunctionNameType, typename... ParameterType>
-std::pair<std::unique_ptr<arm_compute::IFunction>, FunctionNameType> create_named_function(FunctionNameType name, ParameterType... args)
+std::tuple<std::unique_ptr<arm_compute::IFunction>, FunctionNameType> create_named_function(FunctionNameType name,
+ ParameterType... args)
{
- auto f = arm_compute::support::cpp14::make_unique<FunctionType>();
+ auto f = std::make_unique<FunctionType>();
f->configure(std::forward<ParameterType>(args)...);
return std::make_pair(std::move(f), name);
}
@@ -58,11 +59,10 @@ std::pair<std::unique_ptr<arm_compute::IFunction>, FunctionNameType> create_name
* @return A configured backend function
*/
template <typename FunctionType, typename FunctionNameType, typename MemoryManagerType, typename... ParameterType>
-std::pair<std::unique_ptr<arm_compute::IFunction>, FunctionNameType> create_named_memory_managed_function(FunctionNameType name,
- MemoryManagerType mm,
- ParameterType... args)
+std::tuple<std::unique_ptr<arm_compute::IFunction>, FunctionNameType>
+create_named_memory_managed_function(FunctionNameType name, MemoryManagerType mm, ParameterType... args)
{
- auto f = arm_compute::support::cpp14::make_unique<FunctionType>(mm);
+ auto f = std::make_unique<FunctionType>(mm);
f->configure(std::forward<ParameterType>(args)...);
return std::make_pair(std::move(f), name);
}
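The two helpers above couple construction with configuration and tag the result with a name; a self-contained sketch of the same shape, with a toy function type standing in for the real backend functions:

#include <memory>
#include <string>
#include <tuple>
#include <utility>

struct ToyFunction
{
    void configure(int in, int out) { (void)in; (void)out; }
};

template <typename FunctionType, typename NameType, typename... Args>
std::tuple<std::unique_ptr<FunctionType>, NameType> create_named(NameType name, Args &&...args)
{
    auto f = std::make_unique<FunctionType>();
    f->configure(std::forward<Args>(args)...);
    return std::make_pair(std::move(f), std::move(name)); // tuple is constructible from pair
}

// Usage: auto named = create_named<ToyFunction>(std::string("toy"), 1, 2);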
diff --git a/arm_compute/graph/backends/ValidateHelpers.h b/arm_compute/graph/backends/ValidateHelpers.h
index 673caf9eac..0e102942a7 100644
--- a/arm_compute/graph/backends/ValidateHelpers.h
+++ b/arm_compute/graph/backends/ValidateHelpers.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018-2020 ARM Limited.
+ * Copyright (c) 2018-2021, 2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -21,17 +21,16 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
-#ifndef ARM_COMPUTE_GRAPH_BACKENDS_DETAIL_VALIDATE_HELPERS_H
-#define ARM_COMPUTE_GRAPH_BACKENDS_DETAIL_VALIDATE_HELPERS_H
-
-#include "arm_compute/graph/Logger.h"
-#include "arm_compute/graph/Tensor.h"
-#include "arm_compute/graph/Types.h"
-#include "arm_compute/graph/nodes/Nodes.h"
+#ifndef ACL_ARM_COMPUTE_GRAPH_BACKENDS_VALIDATEHELPERS_H
+#define ACL_ARM_COMPUTE_GRAPH_BACKENDS_VALIDATEHELPERS_H
#include "arm_compute/core/Error.h"
#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/ITensorInfo.h"
+#include "arm_compute/graph/Logger.h"
+#include "arm_compute/graph/nodes/Nodes.h"
+#include "arm_compute/graph/Tensor.h"
+#include "arm_compute/graph/Types.h"
namespace arm_compute
{
@@ -52,6 +51,30 @@ inline arm_compute::ITensorInfo *get_backing_tensor_info(arm_compute::graph::Ten
return ((tensor == nullptr) || (tensor->handle() == nullptr)) ? nullptr : tensor->handle()->tensor().info();
}
+/** Validates an ArgMinMax layer node
+ *
+ * @tparam ArgMinMaxLayer ArgMinMax layer function type
+ *
+ * @param[in] node Node to validate
+ *
+ * @return Status
+ */
+template <typename ArgMinMaxLayer>
+Status validate_arg_min_max_layer(ArgMinMaxLayerNode &node)
+{
+ ARM_COMPUTE_LOG_GRAPH_VERBOSE(
+ "Validating ArgMinMaxLayer node with ID : " << node.id() << " and Name: " << node.name() << std::endl);
+ ARM_COMPUTE_RETURN_ERROR_ON(node.num_inputs() != 1);
+ ARM_COMPUTE_RETURN_ERROR_ON(node.num_outputs() != 1);
+
+ // Extract IO and info
+ arm_compute::ITensorInfo *input = detail::get_backing_tensor_info(node.input(0));
+ arm_compute::ITensorInfo *output = get_backing_tensor_info(node.output(0));
+
+ // Validate function
+ return ArgMinMaxLayer::validate(input, node.axis(), output, node.reduction_operation());
+}
+
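A hedged sketch of how such a validator is typically reached from a backend's node validator; the NEON names are assumptions for illustration:

// case NodeType::ArgMinMaxLayer:
//     return detail::validate_arg_min_max_layer<NEArgMinMaxLayer>(
//         *polymorphic_downcast<ArgMinMaxLayerNode *>(node));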
/** Validates a Bounding Box Transform layer node
*
* @tparam BoundingBoxTransformLayer Bounding Box Transform layer function type
@@ -63,7 +86,8 @@ inline arm_compute::ITensorInfo *get_backing_tensor_info(arm_compute::graph::Ten
template <typename BoundingBoxTransformLayer>
Status validate_bounding_box_transform_layer(BoundingBoxTransformLayerNode &node)
{
- ARM_COMPUTE_LOG_GRAPH_VERBOSE("Validating BoundingBoxTransformLayer node with ID : " << node.id() << " and Name: " << node.name() << std::endl);
+ ARM_COMPUTE_LOG_GRAPH_VERBOSE("Validating BoundingBoxTransformLayer node with ID : " << node.id() << " and Name: "
+ << node.name() << std::endl);
ARM_COMPUTE_RETURN_ERROR_ON(node.num_inputs() != 2);
ARM_COMPUTE_RETURN_ERROR_ON(node.num_outputs() != 1);
@@ -87,7 +111,8 @@ Status validate_bounding_box_transform_layer(BoundingBoxTransformLayerNode &node
template <typename ChannelShuffleLayer>
Status validate_channel_shuffle_layer(ChannelShuffleLayerNode &node)
{
- ARM_COMPUTE_LOG_GRAPH_VERBOSE("Validating ChannelShuffle node with ID : " << node.id() << " and Name: " << node.name() << std::endl);
+ ARM_COMPUTE_LOG_GRAPH_VERBOSE(
+ "Validating ChannelShuffle node with ID : " << node.id() << " and Name: " << node.name() << std::endl);
ARM_COMPUTE_RETURN_ERROR_ON(node.num_inputs() != 1);
ARM_COMPUTE_RETURN_ERROR_ON(node.num_outputs() != 1);
@@ -110,10 +135,14 @@ Status validate_channel_shuffle_layer(ChannelShuffleLayerNode &node)
*
* @return Status
*/
-template <typename ConvolutionLayer, typename DirectConvolutionLayer, typename GEMMConvolutionLayer, typename WinogradConvolutionLayer>
+template <typename ConvolutionLayer,
+ typename DirectConvolutionLayer,
+ typename GEMMConvolutionLayer,
+ typename WinogradConvolutionLayer>
Status validate_convolution_layer(ConvolutionLayerNode &node)
{
- ARM_COMPUTE_LOG_GRAPH_VERBOSE("Validating ConvolutionLayer node with ID : " << node.id() << " and Name: " << node.name() << std::endl);
+ ARM_COMPUTE_LOG_GRAPH_VERBOSE(
+ "Validating ConvolutionLayer node with ID : " << node.id() << " and Name: " << node.name() << std::endl);
ARM_COMPUTE_RETURN_ERROR_ON(node.num_inputs() != 3);
ARM_COMPUTE_RETURN_ERROR_ON(node.num_outputs() != 1);
@@ -123,7 +152,7 @@ Status validate_convolution_layer(ConvolutionLayerNode &node)
arm_compute::ITensorInfo *biases = get_backing_tensor_info(node.input(2));
arm_compute::ITensorInfo *output = get_backing_tensor_info(node.output(0));
- if(is_data_type_quantized_asymmetric(input->data_type()))
+ if (is_data_type_quantized_asymmetric(input->data_type()))
{
biases->set_data_type(DataType::S32);
}
@@ -135,23 +164,24 @@ Status validate_convolution_layer(ConvolutionLayerNode &node)
// Validate function
Status status{};
- switch(conv_algorithm)
+ switch (conv_algorithm)
{
case ConvolutionMethod::Direct:
ARM_COMPUTE_RETURN_ERROR_ON_MSG(num_groups != 1, "DirectConvolutionLayer does not support grouping!");
status = DirectConvolutionLayer::validate(input, weights, biases, output, conv_info);
break;
case ConvolutionMethod::GEMM:
- status = GEMMConvolutionLayer::validate(input, weights, biases, output, conv_info,
- WeightsInfo(), Size2D(1, 1), ActivationLayerInfo(), num_groups);
+ status = GEMMConvolutionLayer::validate(input, weights, biases, output, conv_info, WeightsInfo(),
+ Size2D(1, 1), ActivationLayerInfo(), num_groups);
break;
case ConvolutionMethod::Winograd:
ARM_COMPUTE_RETURN_ERROR_ON_MSG(num_groups != 1, "WinogradConvolutionLayer does not support grouping!");
- status = WinogradConvolutionLayer::validate(input, weights, biases, output, conv_info, ActivationLayerInfo(), fast_math);
+ status = WinogradConvolutionLayer::validate(input, weights, biases, output, conv_info,
+ ActivationLayerInfo(), fast_math);
break;
case ConvolutionMethod::Default:
- status = ConvolutionLayer::validate(input, weights, biases, output, conv_info,
- WeightsInfo(), Size2D(1, 1), ActivationLayerInfo(), fast_math, num_groups);
+ status = ConvolutionLayer::validate(input, weights, biases, output, conv_info, WeightsInfo(), Size2D(1, 1),
+ ActivationLayerInfo(), fast_math, num_groups);
break;
default:
ARM_COMPUTE_RETURN_ERROR_MSG("Unsupported convolution method");
@@ -171,7 +201,8 @@ Status validate_convolution_layer(ConvolutionLayerNode &node)
template <typename DepthwiseConvolutionLayer>
Status validate_depthwise_convolution_layer(DepthwiseConvolutionLayerNode &node)
{
- ARM_COMPUTE_LOG_GRAPH_VERBOSE("Validating DepthwiseConvolutionLayer node with ID : " << node.id() << " and Name: " << node.name() << std::endl);
+ ARM_COMPUTE_LOG_GRAPH_VERBOSE("Validating DepthwiseConvolutionLayer node with ID : " << node.id() << " and Name: "
+ << node.name() << std::endl);
ARM_COMPUTE_RETURN_ERROR_ON(node.num_inputs() != 3);
ARM_COMPUTE_RETURN_ERROR_ON(node.num_outputs() != 1);
@@ -187,7 +218,7 @@ Status validate_depthwise_convolution_layer(DepthwiseConvolutionLayerNode &node)
// Validate function
Status status{};
- switch(dwc_algorithm)
+ switch (dwc_algorithm)
{
case DepthwiseConvolutionMethod::Default:
case DepthwiseConvolutionMethod::Optimized3x3:
@@ -199,6 +230,28 @@ Status validate_depthwise_convolution_layer(DepthwiseConvolutionLayerNode &node)
return status;
}
+/** Validates a depth to space layer node
+ *
+ * @tparam DepthToSpaceLayer DepthToSpace layer type
+ *
+ * @param[in] node Node to validate
+ *
+ * @return Status
+ */
+template <typename DepthToSpaceLayer>
+Status validate_depth_to_space_layer(DepthToSpaceLayerNode &node)
+{
+ ARM_COMPUTE_LOG_GRAPH_VERBOSE(
+ "Validating DetectionOutputLayer node with ID : " << node.id() << " and Name: " << node.name() << std::endl);
+ ARM_COMPUTE_RETURN_ERROR_ON(node.num_inputs() != 1);
+ ARM_COMPUTE_RETURN_ERROR_ON(node.num_outputs() != 1);
+
+ // Extract IO and info
+ arm_compute::ITensorInfo *input = get_backing_tensor_info(node.input(0));
+ arm_compute::ITensorInfo *output = get_backing_tensor_info(node.output(0));
+
+ return DepthToSpaceLayer::validate(input, output, node.block_shape());
+}
/** Validates a dequantize layer node
*
* @tparam DequantizationLayer Dequantize layer type
@@ -210,7 +263,8 @@ Status validate_depthwise_convolution_layer(DepthwiseConvolutionLayerNode &node)
template <typename DequantizationLayer>
Status validate_dequantization_layer(DequantizationLayerNode &node)
{
- ARM_COMPUTE_LOG_GRAPH_VERBOSE("Validating DetectionOutputLayer node with ID : " << node.id() << " and Name: " << node.name() << std::endl);
+ ARM_COMPUTE_LOG_GRAPH_VERBOSE(
+ "Validating DetectionOutputLayer node with ID : " << node.id() << " and Name: " << node.name() << std::endl);
ARM_COMPUTE_RETURN_ERROR_ON(node.num_inputs() != 1);
ARM_COMPUTE_RETURN_ERROR_ON(node.num_outputs() != 1);
@@ -231,7 +285,8 @@ Status validate_dequantization_layer(DequantizationLayerNode &node)
template <typename DetectionOutputLayer>
Status validate_detection_output_layer(DetectionOutputLayerNode &node)
{
- ARM_COMPUTE_LOG_GRAPH_VERBOSE("Validating DetectionOutputLayer node with ID : " << node.id() << " and Name: " << node.name() << std::endl);
+ ARM_COMPUTE_LOG_GRAPH_VERBOSE(
+ "Validating DetectionOutputLayer node with ID : " << node.id() << " and Name: " << node.name() << std::endl);
ARM_COMPUTE_RETURN_ERROR_ON(node.num_inputs() != 3);
ARM_COMPUTE_RETURN_ERROR_ON(node.num_outputs() != 1);
@@ -255,7 +310,8 @@ Status validate_detection_output_layer(DetectionOutputLayerNode &node)
template <typename DetectionPostProcessLayer>
Status validate_detection_post_process_layer(DetectionPostProcessLayerNode &node)
{
- ARM_COMPUTE_LOG_GRAPH_VERBOSE("Validating DetectionPostProcessLayer node with ID : " << node.id() << " and Name: " << node.name() << std::endl);
+ ARM_COMPUTE_LOG_GRAPH_VERBOSE("Validating DetectionPostProcessLayer node with ID : " << node.id() << " and Name: "
+ << node.name() << std::endl);
ARM_COMPUTE_RETURN_ERROR_ON(node.num_inputs() != 3);
ARM_COMPUTE_RETURN_ERROR_ON(node.num_outputs() != 4);
@@ -283,7 +339,8 @@ Status validate_detection_post_process_layer(DetectionPostProcessLayerNode &node
template <typename GenerateProposalsLayer>
Status validate_generate_proposals_layer(GenerateProposalsLayerNode &node)
{
- ARM_COMPUTE_LOG_GRAPH_VERBOSE("Validating GenerateProposalsLayer node with ID : " << node.id() << " and Name: " << node.name() << std::endl);
+ ARM_COMPUTE_LOG_GRAPH_VERBOSE(
+ "Validating GenerateProposalsLayer node with ID : " << node.id() << " and Name: " << node.name() << std::endl);
ARM_COMPUTE_RETURN_ERROR_ON(node.num_inputs() != 3);
ARM_COMPUTE_RETURN_ERROR_ON(node.num_outputs() != 3);
@@ -299,6 +356,32 @@ Status validate_generate_proposals_layer(GenerateProposalsLayerNode &node)
return GenerateProposalsLayer::validate(scores, deltas, anchors, proposals, scores_out, num_valid_proposals, info);
}
+/** Validates an L2Normalization layer node
+ *
+ * @tparam L2NormalizeLayer L2Normalize layer function type
+ *
+ * @param[in] node Node to validate
+ *
+ * @return Status
+ */
+template <typename L2NormalizeLayer>
+Status validate_l2_normalize_layer(L2NormalizeLayerNode &node)
+{
+ ARM_COMPUTE_LOG_GRAPH_VERBOSE(
+ "Validating L2NormalizeLayerNode node with ID : " << node.id() << " and Name: " << node.name() << std::endl);
+ ARM_COMPUTE_RETURN_ERROR_ON(node.num_inputs() != 1);
+ ARM_COMPUTE_RETURN_ERROR_ON(node.num_outputs() != 1);
+
+ // Extract IO and info
+ arm_compute::ITensorInfo *input = detail::get_backing_tensor_info(node.input(0));
+ arm_compute::ITensorInfo *output = get_backing_tensor_info(node.output(0));
+ int axis = node.axis();
+ float epsilon = node.epsilon();
+
+ // Validate function
+ return L2NormalizeLayer::validate(input, output, axis, epsilon);
+}
+
/** Validates a NormalizePlanarYUV layer node
*
* @tparam NormalizePlanarYUVLayer layer type
@@ -310,7 +393,8 @@ Status validate_generate_proposals_layer(GenerateProposalsLayerNode &node)
template <typename NormalizePlanarYUVLayer>
Status validate_normalize_planar_yuv_layer(NormalizePlanarYUVLayerNode &node)
{
- ARM_COMPUTE_LOG_GRAPH_VERBOSE("Validating NormalizePlanarYUVLayer node with ID : " << node.id() << " and Name: " << node.name() << std::endl);
+ ARM_COMPUTE_LOG_GRAPH_VERBOSE(
+ "Validating NormalizePlanarYUVLayer node with ID : " << node.id() << " and Name: " << node.name() << std::endl);
ARM_COMPUTE_RETURN_ERROR_ON(node.num_inputs() != 3);
ARM_COMPUTE_RETURN_ERROR_ON(node.num_outputs() != 1);
@@ -335,7 +419,8 @@ Status validate_normalize_planar_yuv_layer(NormalizePlanarYUVLayerNode &node)
template <typename PadLayer>
Status validate_pad_layer(PadLayerNode &node)
{
- ARM_COMPUTE_LOG_GRAPH_VERBOSE("Validating PadLayer node with ID : " << node.id() << " and Name: " << node.name() << std::endl);
+ ARM_COMPUTE_LOG_GRAPH_VERBOSE("Validating PadLayer node with ID : " << node.id() << " and Name: " << node.name()
+ << std::endl);
ARM_COMPUTE_RETURN_ERROR_ON(node.num_inputs() != 1);
ARM_COMPUTE_RETURN_ERROR_ON(node.num_outputs() != 1);
@@ -358,14 +443,15 @@ Status validate_pad_layer(PadLayerNode &node)
template <typename PermuteLayer>
Status validate_permute_layer(PermuteLayerNode &node)
{
- ARM_COMPUTE_LOG_GRAPH_VERBOSE("Validating PermuteLayer node with ID : " << node.id() << " and Name: " << node.name() << std::endl);
+ ARM_COMPUTE_LOG_GRAPH_VERBOSE("Validating PermuteLayer node with ID : " << node.id() << " and Name: " << node.name()
+ << std::endl);
ARM_COMPUTE_RETURN_ERROR_ON(node.num_inputs() != 1);
ARM_COMPUTE_RETURN_ERROR_ON(node.num_outputs() != 1);
// Extract IO and info
arm_compute::ITensorInfo *input = get_backing_tensor_info(node.input(0));
arm_compute::ITensorInfo *output = get_backing_tensor_info(node.output(0));
- const PermutationVector &perm = node.permutation_vector();
+ const PermutationVector &perm = node.permutation_vector();
return PermuteLayer::validate(input, output, perm);
}
@@ -381,7 +467,8 @@ Status validate_permute_layer(PermuteLayerNode &node)
template <typename PReluLayer>
Status validate_prelu_layer(PReluLayerNode &node)
{
- ARM_COMPUTE_LOG_GRAPH_VERBOSE("Validating PRelu node with ID : " << node.id() << " and Name: " << node.name() << std::endl);
+ ARM_COMPUTE_LOG_GRAPH_VERBOSE("Validating PRelu node with ID : " << node.id() << " and Name: " << node.name()
+ << std::endl);
ARM_COMPUTE_RETURN_ERROR_ON(node.num_inputs() != 2);
ARM_COMPUTE_RETURN_ERROR_ON(node.num_outputs() != 1);
@@ -404,7 +491,8 @@ Status validate_prelu_layer(PReluLayerNode &node)
template <typename PriorBoxLayer>
Status validate_priorbox_layer(PriorBoxLayerNode &node)
{
- ARM_COMPUTE_LOG_GRAPH_VERBOSE("Validating PriorBoxLayer node with ID : " << node.id() << " and Name: " << node.name() << std::endl);
+ ARM_COMPUTE_LOG_GRAPH_VERBOSE(
+ "Validating PriorBoxLayer node with ID : " << node.id() << " and Name: " << node.name() << std::endl);
ARM_COMPUTE_RETURN_ERROR_ON(node.num_inputs() != 2);
ARM_COMPUTE_RETURN_ERROR_ON(node.num_outputs() != 1);
@@ -428,7 +516,8 @@ Status validate_priorbox_layer(PriorBoxLayerNode &node)
template <typename QuantizationLayer>
Status validate_quantization_layer(QuantizationLayerNode &node)
{
- ARM_COMPUTE_LOG_GRAPH_VERBOSE("Validating QuantizationLayer node with ID : " << node.id() << " and Name: " << node.name() << std::endl);
+ ARM_COMPUTE_LOG_GRAPH_VERBOSE(
+ "Validating QuantizationLayer node with ID : " << node.id() << " and Name: " << node.name() << std::endl);
ARM_COMPUTE_RETURN_ERROR_ON(node.num_inputs() != 1);
ARM_COMPUTE_RETURN_ERROR_ON(node.num_outputs() != 1);
@@ -440,6 +529,31 @@ Status validate_quantization_layer(QuantizationLayerNode &node)
return QuantizationLayer::validate(input, output);
}
+/** Validates a Reduction operation layer node
+ *
+ * @tparam ReductionLayer Reduction layer type
+ *
+ * @param[in] node Node to validate
+ *
+ * @return Status
+ */
+template <typename ReductionLayer>
+Status validate_reduction_operation_layer(ReductionLayerNode &node)
+{
+ ARM_COMPUTE_LOG_GRAPH_VERBOSE(
+ "Validating ReductionLayer node with ID : " << node.id() << " and Name: " << node.name() << std::endl);
+
+ ARM_COMPUTE_RETURN_ERROR_ON(node.num_inputs() != 1);
+ ARM_COMPUTE_RETURN_ERROR_ON(node.num_outputs() != 1);
+
+ // Extract input and output
+ arm_compute::ITensorInfo *input = detail::get_backing_tensor_info(node.input(0));
+ arm_compute::ITensorInfo *output = get_backing_tensor_info(node.output(0));
+
+ // Validate function
+ return ReductionLayer::validate(input, output, node.axis(), node.op(), node.keep_dims());
+}
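
For context, a backend's validate() is expected to dispatch to this helper with its concrete reduction function as the template argument. A minimal sketch, assuming the CPU backend's NEReductionOperation and the polymorphic_downcast dispatch used elsewhere in the graph backends:

    // Sketch: dispatching a ReductionLayerNode to the new helper from a
    // backend validator's switch over node types (assumed convention).
    case NodeType::ReductionOperationLayer:
        return detail::validate_reduction_operation_layer<NEReductionOperation>(
            *arm_compute::utils::cast::polymorphic_downcast<ReductionLayerNode *>(node));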
+
/** Validates a Reorg layer node
*
* @tparam ReorgLayer Reorg layer type
@@ -451,7 +565,8 @@ Status validate_quantization_layer(QuantizationLayerNode &node)
template <typename ReorgLayer>
Status validate_reorg_layer(ReorgLayerNode &node)
{
- ARM_COMPUTE_LOG_GRAPH_VERBOSE("Validating ReorgLayer node with ID : " << node.id() << " and Name: " << node.name() << std::endl);
+ ARM_COMPUTE_LOG_GRAPH_VERBOSE("Validating ReorgLayer node with ID : " << node.id() << " and Name: " << node.name()
+ << std::endl);
ARM_COMPUTE_RETURN_ERROR_ON(node.num_inputs() != 1);
ARM_COMPUTE_RETURN_ERROR_ON(node.num_outputs() != 1);
@@ -474,7 +589,8 @@ Status validate_reorg_layer(ReorgLayerNode &node)
template <typename ReshapeLayer>
Status validate_reshape_layer(ReshapeLayerNode &node)
{
- ARM_COMPUTE_LOG_GRAPH_VERBOSE("Validating ReshapeLayer node with ID : " << node.id() << " and Name: " << node.name() << std::endl);
+ ARM_COMPUTE_LOG_GRAPH_VERBOSE("Validating ReshapeLayer node with ID : " << node.id() << " and Name: " << node.name()
+ << std::endl);
ARM_COMPUTE_RETURN_ERROR_ON(node.num_inputs() != 1);
ARM_COMPUTE_RETURN_ERROR_ON(node.num_outputs() != 1);
@@ -497,14 +613,15 @@ Status validate_reshape_layer(ReshapeLayerNode &node)
template <typename ROIAlignLayer>
Status validate_roi_align_layer(ROIAlignLayerNode &node)
{
- ARM_COMPUTE_LOG_GRAPH_VERBOSE("Validating ROIAlignLayer node with ID : " << node.id() << " and Name: " << node.name() << std::endl);
+ ARM_COMPUTE_LOG_GRAPH_VERBOSE(
+ "Validating ROIAlignLayer node with ID : " << node.id() << " and Name: " << node.name() << std::endl);
ARM_COMPUTE_RETURN_ERROR_ON(node.num_inputs() != 2);
ARM_COMPUTE_RETURN_ERROR_ON(node.num_outputs() != 1);
// Extract input and output
- arm_compute::ITensorInfo *input = detail::get_backing_tensor_info(node.input(0));
- arm_compute::ITensorInfo *rois = detail::get_backing_tensor_info(node.input(1));
- arm_compute::ITensorInfo *output = detail::get_backing_tensor_info(node.output(0));
+ arm_compute::ITensorInfo *input = detail::get_backing_tensor_info(node.input(0));
+ arm_compute::ITensorInfo *rois = detail::get_backing_tensor_info(node.input(1));
+ arm_compute::ITensorInfo *output = detail::get_backing_tensor_info(node.output(0));
const ROIPoolingLayerInfo &pool_info = node.pooling_info();
// Validate function
@@ -522,7 +639,8 @@ Status validate_roi_align_layer(ROIAlignLayerNode &node)
template <typename SliceLayer>
Status validate_slice_layer(SliceLayerNode &node)
{
- ARM_COMPUTE_LOG_GRAPH_VERBOSE("Validating Slice node with ID : " << node.id() << " and Name: " << node.name() << std::endl);
+ ARM_COMPUTE_LOG_GRAPH_VERBOSE("Validating Slice node with ID : " << node.id() << " and Name: " << node.name()
+ << std::endl);
ARM_COMPUTE_RETURN_ERROR_ON(node.num_inputs() != 1);
ARM_COMPUTE_RETURN_ERROR_ON(node.num_outputs() != 1);
@@ -535,53 +653,120 @@ Status validate_slice_layer(SliceLayerNode &node)
return SliceLayer::validate(input, output, starts, ends);
}
-/** Validates a Upsample layer node
+/** Validates a Strided Slice layer node
*
- * @tparam UpsampleLayer Upsample layer type
+ * @tparam StridedSliceLayer Strided Slice layer function type
*
* @param[in] node Node to validate
*
* @return Status
*/
-template <typename UpsampleLayer>
-Status validate_upsample_layer(UpsampleLayerNode &node)
+template <typename StridedSliceLayer>
+Status validate_strided_slice_layer(StridedSliceLayerNode &node)
{
- ARM_COMPUTE_LOG_GRAPH_VERBOSE("Validating UpsampleLayer node with ID : " << node.id() << " and Name: " << node.name() << std::endl);
+ ARM_COMPUTE_LOG_GRAPH_VERBOSE("Validating StridedSlice node with ID : " << node.id() << " and Name: " << node.name()
+ << std::endl);
ARM_COMPUTE_RETURN_ERROR_ON(node.num_inputs() != 1);
ARM_COMPUTE_RETURN_ERROR_ON(node.num_outputs() != 1);
+ // Extract IO and info
+ arm_compute::ITensorInfo *input = get_backing_tensor_info(node.input(0));
+ arm_compute::ITensorInfo *output = get_backing_tensor_info(node.output(0));
+ const Coordinates starts = node.starts();
+ const Coordinates ends = node.ends();
+ const BiStrides strides = node.strides();
+ const StridedSliceLayerInfo info = node.strided_slice_info();
+
+ return StridedSliceLayer::validate(input, output, starts, ends, strides, info.begin_mask(), info.end_mask(),
+ info.shrink_axis_mask());
+}
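
The three masks consumed above are per-dimension bit fields: bit i of begin_mask (end_mask) means "ignore starts[i] (ends[i]) and use the full extent of dimension i", while bit i of shrink_axis_mask removes dimension i from the output. A sketch of building the descriptor, with illustrative values:

    // Sketch: slice dimension 0 from its beginning regardless of starts[0],
    // and drop dimension 2 from the output shape (values illustrative).
    const StridedSliceLayerInfo info(0b001 /* begin_mask */, 0 /* end_mask */, 0b100 /* shrink_axis_mask */);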
+
+/** Validates an element-wise layer node
+ *
+ * @tparam EltwiseLayerFunctions Element-wise layer function bundle type
+ *
+ * @param[in] node Node to validate
+ *
+ * @return Status
+ */
+template <typename EltwiseLayerFunctions>
+Status validate_eltwise_Layer(EltwiseLayerNode &node)
+{
+ ARM_COMPUTE_LOG_GRAPH_VERBOSE("Validating EltwiseLayer node with ID : " << node.id() << " and Name: " << node.name()
+ << std::endl);
+ ARM_COMPUTE_RETURN_ERROR_ON(node.num_inputs() != 2);
+ ARM_COMPUTE_RETURN_ERROR_ON(node.num_outputs() != 1);
+
// Extract input and output
- arm_compute::ITensorInfo *input = detail::get_backing_tensor_info(node.input(0));
- arm_compute::ITensorInfo *output = get_backing_tensor_info(node.output(0));
+ const arm_compute::ITensorInfo *input1 = detail::get_backing_tensor_info(node.input(0));
+ const arm_compute::ITensorInfo *input2 = detail::get_backing_tensor_info(node.input(1));
+ const arm_compute::ITensorInfo *output = get_backing_tensor_info(node.output(0));
+ const EltwiseOperation eltwise_op = node.eltwise_operation();
+ const ConvertPolicy convert_policy = node.convert_policy();
+ const RoundingPolicy round_policy = node.rounding_policy();
+ const ActivationLayerInfo act_info = node.fused_activation();
+ const QuantizationInfo quant_info = node.output_quant_info();
// Validate function
- return UpsampleLayer::validate(input, output, node.info(), node.upsampling_policy());
+ if (eltwise_op == EltwiseOperation::Add)
+ {
+ return EltwiseLayerFunctions::ArithmeticAddition::validate(input1, input2, output, convert_policy, act_info);
+ }
+ else if (eltwise_op == EltwiseOperation::Sub)
+ {
+ return EltwiseLayerFunctions::ArithmeticSubtraction::validate(input1, input2, output, convert_policy, act_info);
+ }
+ else if (eltwise_op == EltwiseOperation::Mul)
+ {
+ return EltwiseLayerFunctions::PixelWiseMultiplication::validate(input1, input2, output, 1.0f, convert_policy,
+ round_policy, act_info);
+ }
+ else if (eltwise_op == EltwiseOperation::Max)
+ {
+ return EltwiseLayerFunctions::ElementwiseMax::validate(input1, input2, output, act_info);
+ }
+ else if (eltwise_op == EltwiseOperation::Div)
+ {
+ return EltwiseLayerFunctions::ArithmeticDivision::validate(input1, input2, output, act_info);
+ }
+ else
+ {
+ ARM_COMPUTE_ERROR("Unsupported element-wise operation!");
+ }
+ return Status{};
}
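
The EltwiseLayerFunctions template argument is expected to be a type that groups the per-operation functions under the member names used above. A sketch of such a bundle for the CPU backend; the grouping itself is illustrative, not part of this patch:

    // Sketch: a functions bundle satisfying validate_eltwise_Layer's contract.
    struct NEEltwiseLayerFunctions
    {
        using ArithmeticAddition      = NEArithmeticAddition;
        using ArithmeticSubtraction   = NEArithmeticSubtraction;
        using PixelWiseMultiplication = NEPixelWiseMultiplication;
        using ElementwiseMax          = NEElementwiseMax;
        using ArithmeticDivision      = NEElementwiseDivision;
    };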
-/** Validates a YOLO layer node
- *
- * @tparam YOLOLayer YOLO layer type
+/** Validates a unary element-wise layer node
+ *
+ * @tparam UnaryEltwiseLayerFunctions Unary element-wise layer function bundle type
*
* @param[in] node Node to validate
*
* @return Status
*/
-template <typename YOLOLayer>
-Status validate_yolo_layer(YOLOLayerNode &node)
+template <typename UnaryEltwiseLayerFunctions>
+Status validate_unary_eltwise_layer(UnaryEltwiseLayerNode &node)
{
- ARM_COMPUTE_LOG_GRAPH_VERBOSE("Validating YOLOLayer node with ID : " << node.id() << " and Name: " << node.name() << std::endl);
+ ARM_COMPUTE_LOG_GRAPH_VERBOSE("Validating EltwiseLayer node with ID : " << node.id() << " and Name: " << node.name()
+ << std::endl);
ARM_COMPUTE_RETURN_ERROR_ON(node.num_inputs() != 1);
ARM_COMPUTE_RETURN_ERROR_ON(node.num_outputs() != 1);
// Extract input and output
- arm_compute::ITensorInfo *input = detail::get_backing_tensor_info(node.input(0));
- arm_compute::ITensorInfo *output = get_backing_tensor_info(node.output(0));
+ arm_compute::ITensorInfo *input = detail::get_backing_tensor_info(node.input(0));
+ arm_compute::ITensorInfo *output = get_backing_tensor_info(node.output(0));
+ const UnaryEltwiseOperation eltwise_op = node.eltwise_descriptor().op;
// Validate function
- return YOLOLayer::validate(input, output, node.activation_info(), node.num_classes());
+ if (eltwise_op == UnaryEltwiseOperation::Exp)
+ {
+ return UnaryEltwiseLayerFunctions::ExpLayer::validate(input, output);
+ }
+ else
+ {
+ ARM_COMPUTE_ERROR("Unsupported unary element-wise operation!");
+ }
+
+ return Status{};
}
} // namespace detail
} // namespace backends
} // namespace graph
} // namespace arm_compute
-#endif /* ARM_COMPUTE_GRAPH_BACKENDS_DETAIL_VALIDATE_HELPERS_H */
+#endif // ACL_ARM_COMPUTE_GRAPH_BACKENDS_VALIDATEHELPERS_H
diff --git a/arm_compute/graph/detail/CrossLayerMemoryManagerHelpers.h b/arm_compute/graph/detail/CrossLayerMemoryManagerHelpers.h
index 30c084edf5..1f43ac4197 100644
--- a/arm_compute/graph/detail/CrossLayerMemoryManagerHelpers.h
+++ b/arm_compute/graph/detail/CrossLayerMemoryManagerHelpers.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018-2019 ARM Limited.
+ * Copyright (c) 2018-2019 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
diff --git a/arm_compute/graph/detail/ExecutionHelpers.h b/arm_compute/graph/detail/ExecutionHelpers.h
index aa1af27052..b1662bb82f 100644
--- a/arm_compute/graph/detail/ExecutionHelpers.h
+++ b/arm_compute/graph/detail/ExecutionHelpers.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018-2019 ARM Limited.
+ * Copyright (c) 2018-2019 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
diff --git a/arm_compute/graph/frontend/ILayer.h b/arm_compute/graph/frontend/ILayer.h
index 30652a7406..7eb405b739 100644
--- a/arm_compute/graph/frontend/ILayer.h
+++ b/arm_compute/graph/frontend/ILayer.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018-2019 ARM Limited.
+ * Copyright (c) 2018-2019 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
diff --git a/arm_compute/graph/frontend/IStream.h b/arm_compute/graph/frontend/IStream.h
index e155bbc725..1831ac0be3 100644
--- a/arm_compute/graph/frontend/IStream.h
+++ b/arm_compute/graph/frontend/IStream.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018-2019 ARM Limited.
+ * Copyright (c) 2018-2019 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -84,8 +84,8 @@ public:
}
protected:
- StreamHints _hints = {}; /**< Execution and algorithmic hints */
- NodeID _tail_node = { EmptyNodeID }; /**< NodeID pointing to the last(tail) node of the graph */
+ StreamHints _hints = {}; /**< Execution and algorithmic hints */
+ NodeID _tail_node = {EmptyNodeID}; /**< NodeID pointing to the last(tail) node of the graph */
};
} // namespace frontend
} // namespace graph
diff --git a/arm_compute/graph/frontend/IStreamOperators.h b/arm_compute/graph/frontend/IStreamOperators.h
index 8e2ca41c4e..deaf66daef 100644
--- a/arm_compute/graph/frontend/IStreamOperators.h
+++ b/arm_compute/graph/frontend/IStreamOperators.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018-2019 ARM Limited.
+ * Copyright (c) 2018-2019 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
diff --git a/arm_compute/graph/frontend/Layers.h b/arm_compute/graph/frontend/Layers.h
index 61a6fd47ee..bd321e6f1a 100644
--- a/arm_compute/graph/frontend/Layers.h
+++ b/arm_compute/graph/frontend/Layers.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018-2020 ARM Limited.
+ * Copyright (c) 2018-2021 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -24,13 +24,12 @@
#ifndef ARM_COMPUTE_GRAPH_LAYERS_H
#define ARM_COMPUTE_GRAPH_LAYERS_H
-#include "arm_compute/graph/GraphBuilder.h"
-#include "arm_compute/graph/Types.h"
+#include "arm_compute/core/utils/misc/Utility.h"
#include "arm_compute/graph/frontend/ILayer.h"
#include "arm_compute/graph/frontend/IStream.h"
#include "arm_compute/graph/frontend/SubStream.h"
-
-#include "arm_compute/core/utils/misc/Utility.h"
+#include "arm_compute/graph/GraphBuilder.h"
+#include "arm_compute/graph/Types.h"
#include <memory>
#include <string>
@@ -50,14 +49,13 @@ public:
* @param[in] desc Description of input tensor.
* @param[in] accessor Accessor to get input tensor data from.
*/
- InputLayer(TensorDescriptor desc, ITensorAccessorUPtr accessor)
- : _desc(desc), _accessor(std::move(accessor))
+ InputLayer(TensorDescriptor desc, ITensorAccessorUPtr accessor) : _desc(desc), _accessor(std::move(accessor))
{
}
NodeID create_layer(IStream &s) override
{
- NodeParams common_params = { name(), s.hints().target_hint };
+ NodeParams common_params = {name(), s.hints().target_hint};
return GraphBuilder::add_input_node(s.graph(), common_params, _desc, std::move(_accessor));
}
@@ -75,14 +73,13 @@ public:
* @param[in] desc Description of input tensor.
* @param[in] accessor Accessor to get input tensor data from.
*/
- ConstantLayer(TensorDescriptor desc, ITensorAccessorUPtr accessor)
- : _desc(desc), _accessor(std::move(accessor))
+ ConstantLayer(TensorDescriptor desc, ITensorAccessorUPtr accessor) : _desc(desc), _accessor(std::move(accessor))
{
}
NodeID create_layer(IStream &s) override
{
- NodeParams common_params = { name(), s.hints().target_hint };
+ NodeParams common_params = {name(), s.hints().target_hint};
return GraphBuilder::add_const_node(s.graph(), common_params, _desc, std::move(_accessor));
}
@@ -107,8 +104,8 @@ public:
NodeID create_layer(IStream &s) override
{
- NodeParams common_params = { name(), s.hints().target_hint };
- NodeIdxPair input = { s.tail_node(), _connection_idx };
+ NodeParams common_params = {name(), s.hints().target_hint};
+ NodeIdxPair input = {s.tail_node(), _connection_idx};
return GraphBuilder::add_output_node(s.graph(), common_params, input, std::move(_accessor));
}
@@ -126,18 +123,17 @@ public:
* @param[in] act_info Activation information
* @param[in] out_quant_info (Optional) Output quantization info
*/
- ActivationLayer(ActivationLayerInfo act_info,
- const QuantizationInfo out_quant_info = QuantizationInfo())
- : _act_info(act_info),
- _out_quant_info(std::move(out_quant_info))
+ ActivationLayer(ActivationLayerInfo act_info, const QuantizationInfo out_quant_info = QuantizationInfo())
+ : _act_info(act_info), _out_quant_info(std::move(out_quant_info))
{
}
NodeID create_layer(IStream &s) override
{
- NodeParams common_params = { name(), s.hints().target_hint };
- NodeIdxPair input = { s.tail_node(), 0 };
- return GraphBuilder::add_activation_node(s.graph(), common_params, input, _act_info, std::move(_out_quant_info));
+ NodeParams common_params = {name(), s.hints().target_hint};
+ NodeIdxPair input = {s.tail_node(), 0};
+ return GraphBuilder::add_activation_node(s.graph(), common_params, input, _act_info,
+ std::move(_out_quant_info));
}
private:
@@ -145,6 +141,46 @@ private:
const QuantizationInfo _out_quant_info;
};
+/** ArgMinMax Layer */
+class ArgMinMaxLayer final : public ILayer
+{
+public:
+ /** Construct an ArgMinMax layer.
+ *
+ * @param[in] op Reduction operation to perform: arg-min or arg-max
+ * @param[in] axis Axis to perform reduction along
+ * @param[in] out_data_type (Optional) Output tensor data type
+ * @param[in] out_quant_info (Optional) Output quantization info
+ */
+ ArgMinMaxLayer(ReductionOperation op,
+ unsigned int axis,
+ DataType out_data_type = DataType::UNKNOWN,
+ const QuantizationInfo out_quant_info = QuantizationInfo())
+ : _op(op), _axis(axis), _out_data_type(out_data_type), _out_quant_info(std::move(out_quant_info))
+ {
+ }
+
+ /** Create layer and add to the given stream.
+ *
+ * @param[in] s Stream to add layer to.
+ *
+ * @return ID of the created node.
+ */
+ NodeID create_layer(IStream &s) override
+ {
+ NodeParams common_params = {name(), s.hints().target_hint};
+ NodeIdxPair input = {s.tail_node(), 0};
+ return GraphBuilder::add_arg_min_max_node(s.graph(), common_params, input, _op, _axis, _out_data_type,
+ std::move(_out_quant_info));
+ }
+
+private:
+ ReductionOperation _op;
+ unsigned int _axis;
+ DataType _out_data_type;
+ QuantizationInfo _out_quant_info;
+};
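
A usage sketch for the new layer in a frontend stream; the shape, axis and make_accessor() helper below are hypothetical placeholders:

    // Sketch: arg-max over the channel axis (index 2) of an NCHW stream;
    // make_accessor() stands in for any ITensorAccessor factory.
    Stream graph(0, "argminmax_example");
    graph << InputLayer(TensorDescriptor(TensorShape(224U, 224U, 3U, 1U), DataType::F32), make_accessor())
          << ArgMinMaxLayer(ReductionOperation::ARG_IDX_MAX, 2 /* channel axis */)
          << OutputLayer(make_accessor());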
+
/** Batchnormalization Layer */
class BatchNormalizationLayer final : public ILayer
{
@@ -162,7 +198,11 @@ public:
ITensorAccessorUPtr gamma = nullptr,
ITensorAccessorUPtr beta = nullptr,
float epsilon = 0.001f)
- : _mean(std::move(mean)), _var(std::move(var)), _gamma(std::move(gamma)), _beta(std::move(beta)), _epsilon(epsilon)
+ : _mean(std::move(mean)),
+ _var(std::move(var)),
+ _gamma(std::move(gamma)),
+ _beta(std::move(beta)),
+ _epsilon(epsilon)
{
}
@@ -171,10 +211,10 @@ public:
ARM_COMPUTE_ERROR_ON(_mean == nullptr);
ARM_COMPUTE_ERROR_ON(_var == nullptr);
- NodeParams common_params = { name(), s.hints().target_hint };
- NodeIdxPair input = { s.tail_node(), 0 };
- return GraphBuilder::add_batch_normalization_node(s.graph(), common_params, input, _epsilon,
- std::move(_mean), std::move(_var), std::move(_beta), std::move(_gamma));
+ NodeParams common_params = {name(), s.hints().target_hint};
+ NodeIdxPair input = {s.tail_node(), 0};
+ return GraphBuilder::add_batch_normalization_node(s.graph(), common_params, input, _epsilon, std::move(_mean),
+ std::move(_var), std::move(_beta), std::move(_gamma));
}
private:
@@ -195,7 +235,9 @@ public:
* @param[in] sub_stream_deltas Graph sub-stream for the deltas
* @param[in] info Contains BoundingBox operation information described in @ref BoundingBoxTransformInfo.
*/
- BoundingBoxTransformLayer(SubStream &&sub_stream_input, SubStream &&sub_stream_deltas, BoundingBoxTransformInfo info)
+ BoundingBoxTransformLayer(SubStream &&sub_stream_input,
+ SubStream &&sub_stream_deltas,
+ BoundingBoxTransformInfo info)
: _ss_input(sub_stream_input), _ss_deltas(sub_stream_deltas), _bbox_info(info)
{
}
@@ -208,9 +250,9 @@ public:
*/
NodeID create_layer(IStream &s) override
{
- NodeParams common_params = { name(), s.hints().target_hint };
- NodeIdxPair input = { _ss_input.tail_node(), 0 };
- NodeIdxPair deltas = { _ss_deltas.tail_node(), 0 };
+ NodeParams common_params = {name(), s.hints().target_hint};
+ NodeIdxPair input = {_ss_input.tail_node(), 0};
+ NodeIdxPair deltas = {_ss_deltas.tail_node(), 0};
return GraphBuilder::add_bounding_box_transform_node(s.graph(), common_params, input, deltas, _bbox_info);
}
@@ -228,15 +270,14 @@ public:
*
* @param[in] num_groups Number of groups
*/
- ChannelShuffleLayer(unsigned int num_groups)
- : _num_groups(num_groups)
+ ChannelShuffleLayer(unsigned int num_groups) : _num_groups(num_groups)
{
}
NodeID create_layer(IStream &s) override
{
- NodeParams common_params = { name(), s.hints().target_hint };
- NodeIdxPair input = { s.tail_node(), 0 };
+ NodeParams common_params = {name(), s.hints().target_hint};
+ NodeIdxPair input = {s.tail_node(), 0};
return GraphBuilder::add_channel_shuffle_node(s.graph(), common_params, input, _num_groups);
}
@@ -255,17 +296,15 @@ public:
* @param[in] rest_sub_streams Rest sub-graph branches
*/
template <typename... Ts>
- ConcatLayer(SubStream &&sub_stream1, SubStream &&sub_stream2, Ts &&... rest_sub_streams)
+ ConcatLayer(SubStream &&sub_stream1, SubStream &&sub_stream2, Ts &&...rest_sub_streams)
: _sub_streams(), _concat_descriptor(DataLayoutDimension::CHANNEL)
{
- _sub_streams.push_back(arm_compute::support::cpp14::make_unique<SubStream>(std::move(sub_stream1)));
- _sub_streams.push_back(arm_compute::support::cpp14::make_unique<SubStream>(std::move(sub_stream2)));
+ _sub_streams.push_back(std::make_unique<SubStream>(std::move(sub_stream1)));
+ _sub_streams.push_back(std::make_unique<SubStream>(std::move(sub_stream2)));
- utility::for_each([&](SubStream && sub_stream)
- {
- _sub_streams.push_back(arm_compute::support::cpp14::make_unique<SubStream>(std::move(sub_stream)));
- },
- std::move(rest_sub_streams)...);
+ utility::for_each([&](SubStream &&sub_stream)
+ { _sub_streams.push_back(std::make_unique<SubStream>(std::move(sub_stream))); },
+ std::move(rest_sub_streams)...);
}
/** Construct a concatenation layer
*
@@ -275,33 +314,33 @@ public:
* @param[in] rest_sub_streams Rest sub-graph branches
*/
template <typename... Ts>
- ConcatLayer(descriptors::ConcatLayerDescriptor concat_descriptor, SubStream &&sub_stream1, SubStream &&sub_stream2, Ts &&... rest_sub_streams)
+ ConcatLayer(descriptors::ConcatLayerDescriptor concat_descriptor,
+ SubStream &&sub_stream1,
+ SubStream &&sub_stream2,
+ Ts &&...rest_sub_streams)
: _sub_streams(), _concat_descriptor(concat_descriptor)
{
- _sub_streams.push_back(arm_compute::support::cpp14::make_unique<SubStream>(std::move(sub_stream1)));
- _sub_streams.push_back(arm_compute::support::cpp14::make_unique<SubStream>(std::move(sub_stream2)));
+ _sub_streams.push_back(std::make_unique<SubStream>(std::move(sub_stream1)));
+ _sub_streams.push_back(std::make_unique<SubStream>(std::move(sub_stream2)));
- utility::for_each([&](SubStream && sub_stream)
- {
- _sub_streams.push_back(arm_compute::support::cpp14::make_unique<SubStream>(std::move(sub_stream)));
- },
- std::move(rest_sub_streams)...);
+ utility::for_each([&](SubStream &&sub_stream)
+ { _sub_streams.push_back(std::make_unique<SubStream>(std::move(sub_stream))); },
+ std::move(rest_sub_streams)...);
}
/** Construct a concat layer
*
* @param[in] sub_stream Sub-stream
*/
template <typename... Ts>
- ConcatLayer(SubStream &&sub_stream)
- : _sub_streams(), _concat_descriptor(DataLayoutDimension::CHANNEL)
+ ConcatLayer(SubStream &&sub_stream) : _sub_streams(), _concat_descriptor(DataLayoutDimension::CHANNEL)
{
- _sub_streams.push_back(arm_compute::support::cpp14::make_unique<SubStream>(std::move(sub_stream)));
+ _sub_streams.push_back(std::make_unique<SubStream>(std::move(sub_stream)));
}
NodeID create_layer(IStream &s) override
{
NodeID nid = EmptyNodeID;
- NodeParams common_params = { name(), s.hints().target_hint };
- if(_sub_streams.size() == 1 && _sub_streams.at(0) != nullptr)
+ NodeParams common_params = {name(), s.hints().target_hint};
+ if (_sub_streams.size() == 1 && _sub_streams.at(0) != nullptr)
{
nid = _sub_streams[0]->tail_node();
}
@@ -309,14 +348,14 @@ public:
{
// Collect tail nodes and concatenate
std::vector<NodeIdxPair> nodes;
- for(auto &ss : _sub_streams)
+ for (auto &ss : _sub_streams)
{
- if(ss && (ss->tail_node() != EmptyNodeID))
+ if (ss && (ss->tail_node() != EmptyNodeID))
{
const auto tail_node = s.graph().node(ss->tail_node());
- if(tail_node != nullptr && tail_node->type() != NodeType::Output)
+ if (tail_node != nullptr && tail_node->type() != NodeType::Output)
{
- nodes.push_back({ ss->tail_node(), 0 });
+ nodes.push_back({ss->tail_node(), 0});
}
}
}
@@ -369,12 +408,12 @@ public:
NodeID create_layer(IStream &s) override
{
- NodeIdxPair input = { s.tail_node(), 0 };
- NodeParams common_params = { name(), s.hints().target_hint };
- return GraphBuilder::add_convolution_node(s.graph(), common_params, input,
- Size2D(_conv_width, _conv_height), _ofm, _conv_info, _num_groups,
- s.hints().convolution_method_hint, s.hints().fast_math_hint,
- std::move(_weights), std::move(_bias), std::move(_weights_quant_info), std::move(_out_quant_info));
+ NodeIdxPair input = {s.tail_node(), 0};
+ NodeParams common_params = {name(), s.hints().target_hint};
+ return GraphBuilder::add_convolution_node(s.graph(), common_params, input, Size2D(_conv_width, _conv_height),
+ _ofm, _conv_info, _num_groups, s.hints().convolution_method_hint,
+ s.hints().fast_math_hint, std::move(_weights), std::move(_bias),
+ std::move(_weights_quant_info), std::move(_out_quant_info));
}
private:
@@ -419,11 +458,10 @@ public:
NodeID create_layer(IStream &s) override
{
- NodeIdxPair input = { s.tail_node(), 0 };
- NodeParams common_params = { name(), s.hints().target_hint };
- return GraphBuilder::add_deconvolution_node(s.graph(), common_params, input,
- Size2D(_conv_width, _conv_height), _ofm, _deconv_info,
- std::move(_weights), std::move(_bias));
+ NodeIdxPair input = {s.tail_node(), 0};
+ NodeParams common_params = {name(), s.hints().target_hint};
+ return GraphBuilder::add_deconvolution_node(s.graph(), common_params, input, Size2D(_conv_width, _conv_height),
+ _ofm, _deconv_info, std::move(_weights), std::move(_bias));
}
private:
@@ -471,12 +509,12 @@ public:
NodeID create_layer(IStream &s) override
{
- NodeIdxPair input = { s.tail_node(), 0 };
- NodeParams common_params = { name(), s.hints().target_hint };
- return GraphBuilder::add_depthwise_convolution_node(s.graph(), common_params,
- input, Size2D(_conv_width, _conv_height), _conv_info, _depth_multiplier,
- s.hints().depthwise_convolution_method_hint,
- std::move(_weights), std::move(_bias), std::move(_weights_quant_info), std::move(_out_quant_info));
+ NodeIdxPair input = {s.tail_node(), 0};
+ NodeParams common_params = {name(), s.hints().target_hint};
+ return GraphBuilder::add_depthwise_convolution_node(
+ s.graph(), common_params, input, Size2D(_conv_width, _conv_height), _conv_info, _depth_multiplier,
+ s.hints().depthwise_convolution_method_hint, std::move(_weights), std::move(_bias),
+ std::move(_weights_quant_info), std::move(_out_quant_info));
}
private:
@@ -489,6 +527,30 @@ private:
const QuantizationInfo _weights_quant_info;
const QuantizationInfo _out_quant_info;
};
+
+/** DepthToSpace Layer */
+class DepthToSpaceLayer final : public ILayer
+{
+public:
+ /** Construct a DepthToSpace layer.
+ *
+ * @param[in] block_shape Block size used to rearrange depth values into spatial blocks
+ */
+ DepthToSpaceLayer(int32_t block_shape) : _block_shape(block_shape)
+ {
+ }
+
+ NodeID create_layer(IStream &s) override
+ {
+ NodeParams common_params = {name(), s.hints().target_hint};
+ NodeIdxPair input = {s.tail_node(), 0};
+ return GraphBuilder::add_depth_to_space_node(s.graph(), common_params, input, _block_shape);
+ }
+
+private:
+ int32_t _block_shape;
+};
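
For intuition, a block shape of b moves b*b depth values into a b x b spatial block, so a WxHxC = 4x4x16 feature map becomes 8x8x4. A sketch of appending the layer to an existing stream (graph is assumed to be a Stream):

    // Sketch: rearrange depth into 2x2 spatial blocks (4x4x16 -> 8x8x4).
    graph << DepthToSpaceLayer(2);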
+
/** Dequantization Layer */
class DequantizationLayer final : public ILayer
{
@@ -502,8 +564,8 @@ public:
NodeID create_layer(IStream &s) override
{
- NodeParams common_params = { name(), s.hints().target_hint };
- NodeIdxPair input = { s.tail_node(), 0 };
+ NodeParams common_params = {name(), s.hints().target_hint};
+ NodeIdxPair input = {s.tail_node(), 0};
return GraphBuilder::add_dequantization_node(s.graph(), common_params, input);
}
};
@@ -518,18 +580,21 @@ public:
* @param[in] sub_stream_prior PriorBox graph sub-stream.
* @param[in] detect_info DetectionOutput parameters.
*/
- DetectionOutputLayer(SubStream &&sub_stream_conf, SubStream &&sub_stream_prior, const DetectionOutputLayerInfo &detect_info)
+ DetectionOutputLayer(SubStream &&sub_stream_conf,
+ SubStream &&sub_stream_prior,
+ const DetectionOutputLayerInfo &detect_info)
: _ss_conf(std::move(sub_stream_conf)), _ss_prior(std::move(sub_stream_prior)), _detect_info(detect_info)
{
}
NodeID create_layer(IStream &s) override
{
- NodeParams common_params = { name(), s.hints().target_hint };
- NodeIdxPair input_loc = { s.tail_node(), 0 };
- NodeIdxPair input_conf = { _ss_conf.tail_node(), 0 };
- NodeIdxPair input_priorbox = { _ss_prior.tail_node(), 0 };
- return GraphBuilder::add_detection_output_node(s.graph(), common_params, input_loc, input_conf, input_priorbox, _detect_info);
+ NodeParams common_params = {name(), s.hints().target_hint};
+ NodeIdxPair input_loc = {s.tail_node(), 0};
+ NodeIdxPair input_conf = {_ss_conf.tail_node(), 0};
+ NodeIdxPair input_priorbox = {_ss_prior.tail_node(), 0};
+ return GraphBuilder::add_detection_output_node(s.graph(), common_params, input_loc, input_conf, input_priorbox,
+ _detect_info);
}
private:
@@ -548,9 +613,14 @@ public:
* @param[in] anchors Accessor to get anchors tensor data from.
* @param[in] out_quant_info (Optional) Output quantization info
*/
- DetectionPostProcessLayer(SubStream &&sub_stream_class_prediction, DetectionPostProcessLayerInfo detect_info, ITensorAccessorUPtr anchors,
- const QuantizationInfo out_quant_info = QuantizationInfo())
- : _sub_stream_class_prediction(std::move(sub_stream_class_prediction)), _detect_info(detect_info), _anchors(std::move(anchors)), _out_quant_info(std::move(out_quant_info))
+ DetectionPostProcessLayer(SubStream &&sub_stream_class_prediction,
+ DetectionPostProcessLayerInfo detect_info,
+ ITensorAccessorUPtr anchors,
+ const QuantizationInfo out_quant_info = QuantizationInfo())
+ : _sub_stream_class_prediction(std::move(sub_stream_class_prediction)),
+ _detect_info(detect_info),
+ _anchors(std::move(anchors)),
+ _out_quant_info(std::move(out_quant_info))
{
}
@@ -558,10 +628,12 @@ public:
{
ARM_COMPUTE_ERROR_ON(_anchors == nullptr);
- NodeParams common_params = { name(), s.hints().target_hint };
- NodeIdxPair input_box_encoding = { s.tail_node(), 0 };
- NodeIdxPair input_class_prediction = { _sub_stream_class_prediction.tail_node(), 0 };
- return GraphBuilder::add_detection_post_process_node(s.graph(), common_params, input_box_encoding, input_class_prediction, _detect_info, std::move(_anchors), std::move(_out_quant_info));
+ NodeParams common_params = {name(), s.hints().target_hint};
+ NodeIdxPair input_box_encoding = {s.tail_node(), 0};
+ NodeIdxPair input_class_prediction = {_sub_stream_class_prediction.tail_node(), 0};
+ return GraphBuilder::add_detection_post_process_node(s.graph(), common_params, input_box_encoding,
+ input_class_prediction, _detect_info, std::move(_anchors),
+ std::move(_out_quant_info));
}
private:
@@ -578,15 +650,14 @@ public:
*
* @param[in] shape Output shape
*/
- DummyLayer(TensorShape shape)
- : _shape(shape)
+ DummyLayer(TensorShape shape) : _shape(shape)
{
}
NodeID create_layer(IStream &s) override
{
- NodeParams common_params = { name(), s.hints().target_hint };
- NodeIdxPair input = { s.tail_node(), 0 };
+ NodeParams common_params = {name(), s.hints().target_hint};
+ NodeIdxPair input = {s.tail_node(), 0};
return GraphBuilder::add_dummy_node(s.graph(), common_params, input, _shape);
}
@@ -610,9 +681,9 @@ public:
NodeID create_layer(IStream &s) override
{
- NodeParams common_params = { name(), s.hints().target_hint };
- NodeIdxPair input0 = { _ss0.tail_node(), 0 };
- NodeIdxPair input1 = { _ss1.tail_node(), 0 };
+ NodeParams common_params = {name(), s.hints().target_hint};
+ NodeIdxPair input0 = {_ss0.tail_node(), 0};
+ NodeIdxPair input1 = {_ss1.tail_node(), 0};
return GraphBuilder::add_elementwise_node(s.graph(), common_params, input0, input1, _op);
}
@@ -633,8 +704,8 @@ public:
NodeID create_layer(IStream &s) override
{
- NodeParams common_params = { name(), s.hints().target_hint };
- NodeIdxPair input = { s.tail_node(), 0 };
+ NodeParams common_params = {name(), s.hints().target_hint};
+ NodeIdxPair input = {s.tail_node(), 0};
return GraphBuilder::add_flatten_node(s.graph(), common_params, input);
}
};
@@ -687,8 +758,8 @@ public:
: _num_outputs(num_outputs),
_weights(nullptr),
_bias(nullptr),
- _weights_ss(arm_compute::support::cpp14::make_unique<SubStream>(std::move(sub_stream_weights))),
- _bias_ss(arm_compute::support::cpp14::make_unique<SubStream>(std::move(sub_stream_bias))),
+ _weights_ss(std::make_unique<SubStream>(std::move(sub_stream_weights))),
+ _bias_ss(std::make_unique<SubStream>(std::move(sub_stream_bias))),
_fc_info(fc_info),
_weights_quant_info(std::move(weights_quant_info)),
_out_quant_info(std::move(out_quant_info))
@@ -703,13 +774,13 @@ public:
*/
NodeID create_layer(IStream &s) override
{
- NodeParams common_params = { name(), s.hints().target_hint };
- NodeIdxPair input = { s.tail_node(), 0 };
- if(_weights != nullptr)
+ NodeParams common_params = {name(), s.hints().target_hint};
+ NodeIdxPair input = {s.tail_node(), 0};
+ if (_weights != nullptr)
{
- return GraphBuilder::add_fully_connected_layer(s.graph(), common_params, input, _num_outputs,
- std::move(_weights), std::move(_bias), _fc_info,
- std::move(_weights_quant_info), std::move(_out_quant_info));
+ return GraphBuilder::add_fully_connected_layer(
+ s.graph(), common_params, input, _num_outputs, std::move(_weights), std::move(_bias), _fc_info,
+ std::move(_weights_quant_info), std::move(_out_quant_info), s.hints().fast_math_hint);
}
else
{
@@ -718,7 +789,7 @@ public:
NodeID bias_nid = (_bias_ss == nullptr) ? EmptyNodeID : _bias_ss->tail_node();
return GraphBuilder::add_fully_connected_layer(s.graph(), common_params, input, _num_outputs,
_weights_ss->tail_node(), bias_nid, _fc_info,
- std::move(_out_quant_info));
+ std::move(_out_quant_info), s.hints().fast_math_hint);
}
}
@@ -744,8 +815,14 @@ public:
* @param[in] ss_anchors Graph sub-stream for the anchors.
* @param[in] info Generate Proposals operation information.
*/
- GenerateProposalsLayer(SubStream &&ss_scores, SubStream &&ss_deltas, SubStream &&ss_anchors, GenerateProposalsInfo info)
- : _ss_scores(std::move(ss_scores)), _ss_deltas(std::move(ss_deltas)), _ss_anchors(std::move(ss_anchors)), _info(info)
+ GenerateProposalsLayer(SubStream &&ss_scores,
+ SubStream &&ss_deltas,
+ SubStream &&ss_anchors,
+ GenerateProposalsInfo info)
+ : _ss_scores(std::move(ss_scores)),
+ _ss_deltas(std::move(ss_deltas)),
+ _ss_anchors(std::move(ss_anchors)),
+ _info(info)
{
}
@@ -757,10 +834,10 @@ public:
*/
NodeID create_layer(IStream &s) override
{
- NodeParams common_params = { name(), s.hints().target_hint };
- NodeIdxPair scores = { _ss_scores.tail_node(), 0 };
- NodeIdxPair deltas = { _ss_deltas.tail_node(), 0 };
- NodeIdxPair anchors = { _ss_anchors.tail_node(), 0 };
+ NodeParams common_params = {name(), s.hints().target_hint};
+ NodeIdxPair scores = {_ss_scores.tail_node(), 0};
+ NodeIdxPair deltas = {_ss_deltas.tail_node(), 0};
+ NodeIdxPair anchors = {_ss_anchors.tail_node(), 0};
return GraphBuilder::add_generate_proposals_node(s.graph(), common_params, scores, deltas, anchors, _info);
}
@@ -771,6 +848,31 @@ private:
GenerateProposalsInfo _info;
};
+/** L2 Normalize Layer */
+class L2NormalizeLayer final : public ILayer
+{
+public:
+ /** Construct an L2 Normalize layer.
+ *
+ * @param[in] axis Axis to perform normalization on
+ * @param[in] epsilon Lower bound value for the normalization
+ */
+ L2NormalizeLayer(int axis, float epsilon) : _axis(axis), _epsilon(epsilon)
+ {
+ }
+
+ NodeID create_layer(IStream &s) override
+ {
+ NodeParams common_params = {name(), s.hints().target_hint};
+ NodeIdxPair input = {s.tail_node(), 0};
+ return GraphBuilder::add_l2_normalize_node(s.graph(), common_params, input, _axis, _epsilon);
+ }
+
+private:
+ int _axis;
+ float _epsilon;
+};
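
A sketch of the new layer in an existing stream (graph as above); axis 0 normalizes along the innermost dimension, and epsilon bounds the norm away from zero (values illustrative):

    // Sketch: L2-normalize along axis 0 with a small stabilising epsilon.
    graph << L2NormalizeLayer(0 /* axis */, 1e-12f /* epsilon */);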
+
/** Normalization Layer */
class NormalizationLayer final : public ILayer
{
@@ -779,15 +881,14 @@ public:
*
* @param[in] norm_info Normalization information.
*/
- NormalizationLayer(NormalizationLayerInfo norm_info)
- : _norm_info(norm_info)
+ NormalizationLayer(NormalizationLayerInfo norm_info) : _norm_info(norm_info)
{
}
NodeID create_layer(IStream &s) override
{
- NodeParams common_params = { name(), s.hints().target_hint };
- NodeIdxPair input = { s.tail_node(), 0 };
+ NodeParams common_params = {name(), s.hints().target_hint};
+ NodeIdxPair input = {s.tail_node(), 0};
return GraphBuilder::add_normalization_node(s.graph(), common_params, input, _norm_info);
}
@@ -804,8 +905,7 @@ public:
* @param[in] mean Accessor to get mean tensor data from.
* @param[in] std Accessor to get std tensor data from.
*/
- NormalizePlanarYUVLayer(ITensorAccessorUPtr mean,
- ITensorAccessorUPtr std)
+ NormalizePlanarYUVLayer(ITensorAccessorUPtr mean, ITensorAccessorUPtr std)
: _mean(std::move(mean)), _std(std::move(std))
{
}
@@ -815,10 +915,10 @@ public:
ARM_COMPUTE_ERROR_ON(_mean == nullptr);
ARM_COMPUTE_ERROR_ON(_std == nullptr);
- NodeParams common_params = { name(), s.hints().target_hint };
- NodeIdxPair input = { s.tail_node(), 0 };
- return GraphBuilder::add_normalize_planar_yuv_node(s.graph(), common_params, input,
- std::move(_mean), std::move(_std));
+ NodeParams common_params = {name(), s.hints().target_hint};
+ NodeIdxPair input = {s.tail_node(), 0};
+ return GraphBuilder::add_normalize_planar_yuv_node(s.graph(), common_params, input, std::move(_mean),
+ std::move(_std));
}
private:
@@ -836,15 +936,14 @@ public:
* specifies the front and the end padding in the i-th dimension.
* @param[in] pad_value Padding value to use. Defaults to 0.
*/
- PadLayer(PaddingList padding, PixelValue pad_value = PixelValue())
- : _padding(padding), _pad_value(pad_value)
+ PadLayer(PaddingList padding, PixelValue pad_value = PixelValue()) : _padding(padding), _pad_value(pad_value)
{
}
NodeID create_layer(IStream &s) override
{
- NodeParams common_params = { name(), s.hints().target_hint };
- NodeIdxPair input = { s.tail_node(), 0 };
+ NodeParams common_params = {name(), s.hints().target_hint};
+ NodeIdxPair input = {s.tail_node(), 0};
return GraphBuilder::add_pad_node(s.graph(), common_params, input, _padding, _pad_value);
}
@@ -863,15 +962,14 @@ public:
* @param[in] layout (Optional) Data layout to assign to permuted tensor.
* If UNKNOWN then the input's layout will be used.
*/
- PermuteLayer(PermutationVector perm, DataLayout layout = DataLayout::UNKNOWN)
- : _perm(perm), _layout(layout)
+ PermuteLayer(PermutationVector perm, DataLayout layout = DataLayout::UNKNOWN) : _perm(perm), _layout(layout)
{
}
NodeID create_layer(IStream &s) override
{
- NodeParams common_params = { name(), s.hints().target_hint };
- NodeIdxPair input = { s.tail_node(), 0 };
+ NodeParams common_params = {name(), s.hints().target_hint};
+ NodeIdxPair input = {s.tail_node(), 0};
return GraphBuilder::add_permute_node(s.graph(), common_params, input, _perm, _layout);
}
@@ -888,15 +986,14 @@ public:
*
* @param[in] pool_info Pooling information.
*/
- PoolingLayer(PoolingLayerInfo pool_info)
- : _pool_info(pool_info)
+ PoolingLayer(PoolingLayerInfo pool_info) : _pool_info(pool_info)
{
}
NodeID create_layer(IStream &s) override
{
- NodeParams common_params = { name(), s.hints().target_hint };
- NodeIdxPair input = { s.tail_node(), 0 };
+ NodeParams common_params = {name(), s.hints().target_hint};
+ NodeIdxPair input = {s.tail_node(), 0};
return GraphBuilder::add_pooling_node(s.graph(), common_params, input, _pool_info);
}
@@ -920,9 +1017,9 @@ public:
NodeID create_layer(IStream &s) override
{
- NodeParams common_params = { name(), s.hints().target_hint };
- NodeIdxPair input = { _ss0.tail_node(), 0 };
- NodeIdxPair alpha = { _ss1.tail_node(), 0 };
+ NodeParams common_params = {name(), s.hints().target_hint};
+ NodeIdxPair input = {_ss0.tail_node(), 0};
+ NodeIdxPair alpha = {_ss1.tail_node(), 0};
return GraphBuilder::add_prelu_node(s.graph(), common_params, input, alpha);
}
@@ -971,15 +1068,17 @@ public:
* @param[in] format_info (Optional) Format info.
* @param[in] transform (Optional) Input transform function.
*/
- PrintLayer(std::ostream &stream, const IOFormatInfo &format_info = IOFormatInfo(), const std::function<ITensor *(ITensor *)> transform = nullptr)
+ PrintLayer(std::ostream &stream,
+ const IOFormatInfo &format_info = IOFormatInfo(),
+ const std::function<ITensor *(ITensor *)> transform = nullptr)
: _stream(stream), _format_info(format_info), _transform(transform)
{
}
NodeID create_layer(IStream &s) override
{
- NodeParams common_params = { name(), s.hints().target_hint };
- NodeIdxPair input = { s.tail_node(), 0 };
+ NodeParams common_params = {name(), s.hints().target_hint};
+ NodeIdxPair input = {s.tail_node(), 0};
return GraphBuilder::add_print_node(s.graph(), common_params, input, _stream, _format_info, _transform);
}
@@ -1005,9 +1104,9 @@ public:
NodeID create_layer(IStream &s) override
{
- NodeParams common_params = { name(), s.hints().target_hint };
- NodeIdxPair input0 = { s.tail_node(), 0 };
- NodeIdxPair input1 = { _ss.tail_node(), 0 };
+ NodeParams common_params = {name(), s.hints().target_hint};
+ NodeIdxPair input0 = {s.tail_node(), 0};
+ NodeIdxPair input1 = {_ss.tail_node(), 0};
return GraphBuilder::add_priorbox_node(s.graph(), common_params, input0, input1, _prior_info);
}
@@ -1024,15 +1123,14 @@ public:
*
* @param[in] out_quant_info Output tensor quantization info
*/
- QuantizationLayer(QuantizationInfo out_quant_info)
- : _out_quant_info(out_quant_info)
+ QuantizationLayer(QuantizationInfo out_quant_info) : _out_quant_info(out_quant_info)
{
}
NodeID create_layer(IStream &s) override
{
- NodeParams common_params = { name(), s.hints().target_hint };
- NodeIdxPair input = { s.tail_node(), 0 };
+ NodeParams common_params = {name(), s.hints().target_hint};
+ NodeIdxPair input = {s.tail_node(), 0};
return GraphBuilder::add_quantization_node(s.graph(), common_params, input, _out_quant_info);
}
@@ -1040,6 +1138,34 @@ private:
QuantizationInfo _out_quant_info;
};
+/** Reduction Layer */
+class ReductionLayer final : public ILayer
+{
+public:
+ /** Construct a reduction layer.
+ *
+ * @param[in] op Reduction operation
+ * @param[in] axis Reduction axis
+ * @param[in] keep_dims Whether to keep the reduced dimension after the operation
+ */
+ ReductionLayer(ReductionOperation op, unsigned int axis, bool keep_dims)
+ : _op(op), _axis(axis), _keep_dims(keep_dims)
+ {
+ }
+
+ NodeID create_layer(IStream &s) override
+ {
+ NodeParams common_params = {name(), s.hints().target_hint};
+ NodeIdxPair input = {s.tail_node(), 0};
+ return GraphBuilder::add_reduction_operation_node(s.graph(), common_params, input, _op, _axis, _keep_dims);
+ }
+
+private:
+ ReductionOperation _op;
+ unsigned int _axis;
+ bool _keep_dims;
+};
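
A sketch of the new layer in an existing stream (graph as above); with keep_dims set to false the reduced axis is removed from the output shape (operation and axis are illustrative):

    // Sketch: sum over axis 2, dropping the reduced dimension.
    graph << ReductionLayer(ReductionOperation::SUM, 2 /* axis */, false /* keep_dims */);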
+
/** Reorg Layer */
class ReorgLayer final : public ILayer
{
@@ -1049,15 +1175,14 @@ public:
* @param[in] stride Stride value to use for reorganizing the values in the output tensor.
* It defines the spatial distance between 2 consecutive pixels in the x and y direction
*/
- ReorgLayer(int stride)
- : _stride(stride)
+ ReorgLayer(int stride) : _stride(stride)
{
}
NodeID create_layer(IStream &s) override
{
- NodeParams common_params = { name(), s.hints().target_hint };
- NodeIdxPair input = { s.tail_node(), 0 };
+ NodeParams common_params = {name(), s.hints().target_hint};
+ NodeIdxPair input = {s.tail_node(), 0};
return GraphBuilder::add_reorg_node(s.graph(), common_params, input, _stride);
}
@@ -1073,15 +1198,14 @@ public:
*
* @param[in] shape Target shape.
*/
- ReshapeLayer(TensorShape shape)
- : _shape(shape)
+ ReshapeLayer(TensorShape shape) : _shape(shape)
{
}
NodeID create_layer(IStream &s) override
{
- NodeParams common_params = { name(), s.hints().target_hint };
- NodeIdxPair input = { s.tail_node(), 0 };
+ NodeParams common_params = {name(), s.hints().target_hint};
+ NodeIdxPair input = {s.tail_node(), 0};
return GraphBuilder::add_reshape_node(s.graph(), common_params, input, _shape);
}
@@ -1100,8 +1224,8 @@ public:
NodeID create_layer(IStream &s) override
{
- NodeParams common_params = { name(), s.hints().target_hint };
- NodeIdxPair input = { s.tail_node(), 0 };
+ NodeParams common_params = {name(), s.hints().target_hint};
+ NodeIdxPair input = {s.tail_node(), 0};
return GraphBuilder::add_resize_node(s.graph(), common_params, input, _policy, _width_scale, _height_scale);
}
@@ -1133,9 +1257,9 @@ public:
NodeID create_layer(IStream &s) override
{
- NodeParams common_params = { name(), s.hints().target_hint };
- NodeIdxPair input = { _ss_input.tail_node(), 0 };
- NodeIdxPair rois = { _ss_rois.tail_node(), 0 };
+ NodeParams common_params = {name(), s.hints().target_hint};
+ NodeIdxPair input = {_ss_input.tail_node(), 0};
+ NodeIdxPair rois = {_ss_rois.tail_node(), 0};
return GraphBuilder::add_roi_align_node(s.graph(), common_params, input, rois, _pool_info);
}
@@ -1154,16 +1278,15 @@ public:
* @param[in] mul_w Accessor to get mul weight from.
* @param[in] add_w Accessor to get add weight from.
*/
- ScaleLayer(ITensorAccessorUPtr mul_w,
- ITensorAccessorUPtr add_w)
+ ScaleLayer(ITensorAccessorUPtr mul_w, ITensorAccessorUPtr add_w)
: _mul_w(std::move(mul_w)), _add_w(std::move(add_w))
{
}
NodeID create_layer(IStream &s) override
{
- NodeParams common_params = { name(), s.hints().target_hint };
- NodeIdxPair input = { s.tail_node(), 0 };
+ NodeParams common_params = {name(), s.hints().target_hint};
+ NodeIdxPair input = {s.tail_node(), 0};
return GraphBuilder::add_scale_layer(s.graph(), common_params, input, std::move(_mul_w), std::move(_add_w));
}
@@ -1181,15 +1304,14 @@ public:
* @param[in] starts The starts of the dimensions of the input tensor to be sliced. The length must match the rank of the input.
* @param[in] ends The ends of the dimensions of the input tensor to be sliced. The length must match the rank of the input.
*/
- SliceLayer(Coordinates &starts, Coordinates &ends)
- : _starts(starts), _ends(ends)
+ SliceLayer(Coordinates &starts, Coordinates &ends) : _starts(starts), _ends(ends)
{
}
NodeID create_layer(IStream &s) override
{
- NodeParams common_params = { name(), s.hints().target_hint };
- NodeIdxPair input = { s.tail_node(), 0 };
+ NodeParams common_params = {name(), s.hints().target_hint};
+ NodeIdxPair input = {s.tail_node(), 0};
return GraphBuilder::add_slice_node(s.graph(), common_params, input, _starts, _ends);
}
@@ -1206,15 +1328,14 @@ public:
*
* @param[in] beta (Optional) Beta value. Default 1.0.
*/
- SoftmaxLayer(float beta = 1.0f)
- : _beta(beta)
+ SoftmaxLayer(float beta = 1.0f) : _beta(beta)
{
}
NodeID create_layer(IStream &s) override
{
- NodeParams common_params = { name(), s.hints().target_hint };
- NodeIdxPair input = { s.tail_node(), 0 };
+ NodeParams common_params = {name(), s.hints().target_hint};
+ NodeIdxPair input = {s.tail_node(), 0};
return GraphBuilder::add_softmax_node(s.graph(), common_params, input, _beta);
}
@@ -1233,17 +1354,14 @@ public:
* @param[in] rest_sub_streams Rest sub-graph branches
*/
template <typename... Ts>
- StackLayer(SubStream &&sub_stream1, SubStream &&sub_stream2, Ts &&... rest_sub_streams)
- : _sub_streams(), _axis(0)
+ StackLayer(SubStream &&sub_stream1, SubStream &&sub_stream2, Ts &&...rest_sub_streams) : _sub_streams(), _axis(0)
{
- _sub_streams.push_back(arm_compute::support::cpp14::make_unique<SubStream>(std::move(sub_stream1)));
- _sub_streams.push_back(arm_compute::support::cpp14::make_unique<SubStream>(std::move(sub_stream2)));
+ _sub_streams.push_back(std::make_unique<SubStream>(std::move(sub_stream1)));
+ _sub_streams.push_back(std::make_unique<SubStream>(std::move(sub_stream2)));
- utility::for_each([&](SubStream && sub_stream)
- {
- _sub_streams.push_back(arm_compute::support::cpp14::make_unique<SubStream>(std::move(sub_stream)));
- },
- std::move(rest_sub_streams)...);
+ utility::for_each([&](SubStream &&sub_stream)
+ { _sub_streams.push_back(std::make_unique<SubStream>(std::move(sub_stream))); },
+ std::move(rest_sub_streams)...);
}
/** Construct a stack layer
*
@@ -1253,33 +1371,30 @@ public:
* @param[in] rest_sub_streams Rest sub-graph branches
*/
template <typename... Ts>
- StackLayer(int axis, SubStream &&sub_stream1, SubStream &&sub_stream2, Ts &&... rest_sub_streams)
+ StackLayer(int axis, SubStream &&sub_stream1, SubStream &&sub_stream2, Ts &&...rest_sub_streams)
: _sub_streams(), _axis(axis)
{
- _sub_streams.push_back(arm_compute::support::cpp14::make_unique<SubStream>(std::move(sub_stream1)));
- _sub_streams.push_back(arm_compute::support::cpp14::make_unique<SubStream>(std::move(sub_stream2)));
+ _sub_streams.push_back(std::make_unique<SubStream>(std::move(sub_stream1)));
+ _sub_streams.push_back(std::make_unique<SubStream>(std::move(sub_stream2)));
- utility::for_each([&](SubStream && sub_stream)
- {
- _sub_streams.push_back(arm_compute::support::cpp14::make_unique<SubStream>(std::move(sub_stream)));
- },
- std::move(rest_sub_streams)...);
+ utility::for_each([&](SubStream &&sub_stream)
+ { _sub_streams.push_back(std::make_unique<SubStream>(std::move(sub_stream))); },
+ std::move(rest_sub_streams)...);
}
/** Construct a stack layer
*
* @param[in] sub_stream Sub-stream
*/
template <typename... Ts>
- StackLayer(SubStream &&sub_stream)
- : _sub_streams(), _axis(0)
+ StackLayer(SubStream &&sub_stream) : _sub_streams(), _axis(0)
{
- _sub_streams.push_back(arm_compute::support::cpp14::make_unique<SubStream>(std::move(sub_stream)));
+ _sub_streams.push_back(std::make_unique<SubStream>(std::move(sub_stream)));
}
NodeID create_layer(IStream &s) override
{
NodeID nid = EmptyNodeID;
- NodeParams common_params = { name(), s.hints().target_hint };
- if(_sub_streams.size() == 1 && _sub_streams.at(0) != nullptr)
+ NodeParams common_params = {name(), s.hints().target_hint};
+ if (_sub_streams.size() == 1 && _sub_streams.at(0) != nullptr)
{
nid = _sub_streams[0]->tail_node();
}
@@ -1287,14 +1402,14 @@ public:
{
// Collect tail nodes and stack
std::vector<NodeIdxPair> nodes;
- for(auto &ss : _sub_streams)
+ for (auto &ss : _sub_streams)
{
- if(ss && (ss->tail_node() != EmptyNodeID))
+ if (ss && (ss->tail_node() != EmptyNodeID))
{
const auto tail_node = s.graph().node(ss->tail_node());
- if(tail_node != nullptr && tail_node->type() != NodeType::Output)
+ if (tail_node != nullptr && tail_node->type() != NodeType::Output)
{
- nodes.push_back({ ss->tail_node(), 0 });
+ nodes.push_back({ss->tail_node(), 0});
}
}
}
@@ -1308,30 +1423,37 @@ private:
int _axis;
};
-/** Upsample Layer */
-class UpsampleLayer final : public ILayer
+/** StridedSlice Layer */
+class StridedSliceLayer final : public ILayer
{
public:
- /** Construct a Upsample layer.
+ /** Construct a strided slice layer.
*
- * @param[in] info Stride info
- * @param[in] upsampling_policy Upsampling policy
+ * @param[in] starts The starts of the dimensions of the input tensor to be sliced. The length must match the rank of the input.
+ * @param[in] ends The ends of the dimensions of the input tensor to be sliced. The length must match the rank of the input.
+ * @param[in] strides The strides of the dimensions of the input tensor to be sliced. The length must match the rank of the input.
+ * @param[in] strided_slice_info Contains masks for the starts, ends and strides
*/
- UpsampleLayer(Size2D info, InterpolationPolicy upsampling_policy)
- : _info(info), _upsampling_policy(upsampling_policy)
+ StridedSliceLayer(Coordinates &starts,
+ Coordinates &ends,
+ BiStrides &strides,
+ StridedSliceLayerInfo strided_slice_info)
+ : _starts(starts), _ends(ends), _strides(strides), _info(strided_slice_info)
{
}
NodeID create_layer(IStream &s) override
{
- NodeParams common_params = { name(), s.hints().target_hint };
- NodeIdxPair input = { s.tail_node(), 0 };
- return GraphBuilder::add_upsample_node(s.graph(), common_params, input, _info, _upsampling_policy);
+ NodeParams common_params = {name(), s.hints().target_hint};
+ NodeIdxPair input = {s.tail_node(), 0};
+ return GraphBuilder::add_strided_slice_node(s.graph(), common_params, input, _starts, _ends, _strides, _info);
}
private:
- Size2D _info;
- InterpolationPolicy _upsampling_policy;
+ Coordinates _starts;
+ Coordinates _ends;
+ BiStrides _strides;
+ StridedSliceLayerInfo _info;
};
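
A sketch of the replacement layer; starts, ends and strides are per-dimension, and the masks come from StridedSliceLayerInfo as described earlier (all values illustrative, graph assumed to be a Stream):

    // Sketch: take every second element of dimension 0, full range elsewhere.
    Coordinates starts(0, 0, 0);
    Coordinates ends(8, 4, 4);
    BiStrides strides(2, 1, 1);
    graph << StridedSliceLayer(starts, ends, strides, StridedSliceLayerInfo());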
/** YOLO Layer */
@@ -1340,24 +1462,21 @@ class YOLOLayer final : public ILayer
public:
/** Construct a YOLO layer.
*
- * @param[in] act_info Activation info
- * @param[in] num_classes Number of classes to activate
+ * @param[in] act_info Activation info
*/
- YOLOLayer(ActivationLayerInfo act_info, int32_t num_classes)
- : _act_info(act_info), _num_classes(num_classes)
+ YOLOLayer(ActivationLayerInfo act_info) : _act_info(act_info)
{
}
NodeID create_layer(IStream &s) override
{
- NodeParams common_params = { name(), s.hints().target_hint };
- NodeIdxPair input = { s.tail_node(), 0 };
- return GraphBuilder::add_yolo_node(s.graph(), common_params, input, _act_info, _num_classes);
+ NodeParams common_params = {name(), s.hints().target_hint};
+ NodeIdxPair input = {s.tail_node(), 0};
+ return GraphBuilder::add_yolo_node(s.graph(), common_params, input, _act_info);
}
private:
ActivationLayerInfo _act_info;
- int32_t _num_classes;
};
} // namespace frontend
} // namespace graph
diff --git a/arm_compute/graph/frontend/Stream.h b/arm_compute/graph/frontend/Stream.h
index b52274eeae..7e760b6373 100644
--- a/arm_compute/graph/frontend/Stream.h
+++ b/arm_compute/graph/frontend/Stream.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018-2020 ARM Limited.
+ * Copyright (c) 2018-2020 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -27,7 +27,6 @@
#include "arm_compute/graph/frontend/IStream.h"
#include "arm_compute/graph/frontend/IStreamOperators.h"
#include "arm_compute/graph/frontend/Types.h"
-
#include "arm_compute/graph/Graph.h"
#include "arm_compute/graph/GraphContext.h"
#include "arm_compute/graph/GraphManager.h"
@@ -65,7 +64,7 @@ public:
void run();
// Inherited overridden methods
- void add_layer(ILayer &layer) override;
+ void add_layer(ILayer &layer) override;
Graph &graph() override;
const Graph &graph() const override;
diff --git a/arm_compute/graph/frontend/SubStream.h b/arm_compute/graph/frontend/SubStream.h
index 3df737997e..c54317c52b 100644
--- a/arm_compute/graph/frontend/SubStream.h
+++ b/arm_compute/graph/frontend/SubStream.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018-2019 ARM Limited.
+ * Copyright (c) 2018-2019 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -54,7 +54,7 @@ public:
SubStream(IStream &s);
// Inherited overridden methods
- void add_layer(ILayer &layer) override;
+ void add_layer(ILayer &layer) override;
Graph &graph() override;
const Graph &graph() const override;
diff --git a/arm_compute/graph/frontend/Types.h b/arm_compute/graph/frontend/Types.h
index 741412d8f2..42b28b3cd2 100644
--- a/arm_compute/graph/frontend/Types.h
+++ b/arm_compute/graph/frontend/Types.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018-2019 ARM Limited.
+ * Copyright (c) 2018-2019 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -33,39 +33,40 @@ namespace graph
namespace frontend
{
// Import types for graph
-using graph::DataType;
using graph::DataLayout;
using graph::DataLayoutDimension;
-using graph::TensorShape;
+using graph::DataType;
using graph::PermutationVector;
+using graph::TensorShape;
using graph::ActivationLayerInfo;
+using graph::ConvolutionMethod;
+using graph::DepthwiseConvolutionMethod;
+using graph::DimensionRoundingType;
using graph::EltwiseOperation;
+using graph::FastMathHint;
using graph::FullyConnectedLayerInfo;
+using graph::GraphConfig;
+using graph::InterpolationPolicy;
using graph::NormalizationLayerInfo;
using graph::NormType;
using graph::PadStrideInfo;
using graph::PoolingLayerInfo;
using graph::PoolingType;
+using graph::Size2D;
using graph::Target;
-using graph::ConvolutionMethod;
-using graph::FastMathHint;
-using graph::DepthwiseConvolutionMethod;
using graph::TensorDescriptor;
-using graph::DimensionRoundingType;
-using graph::GraphConfig;
-using graph::InterpolationPolicy;
-using graph::Size2D;
/** Hints that can be passed to the stream to expose parameterization */
struct StreamHints
{
- Target target_hint = { Target::UNSPECIFIED }; /**< Target execution hint */
- ConvolutionMethod convolution_method_hint = { ConvolutionMethod::Default }; /**< Convolution method hint */
- DepthwiseConvolutionMethod depthwise_convolution_method_hint = { DepthwiseConvolutionMethod::Default }; /**< Depthwise Convolution method hint */
- FastMathHint fast_math_hint = { FastMathHint::Disabled }; /**< Fast math hint */
+ Target target_hint = {Target::UNSPECIFIED}; /**< Target execution hint */
+ ConvolutionMethod convolution_method_hint = {ConvolutionMethod::Default}; /**< Convolution method hint */
+ DepthwiseConvolutionMethod depthwise_convolution_method_hint = {
+ DepthwiseConvolutionMethod::Default}; /**< Depthwise Convolution method hint */
+ FastMathHint fast_math_hint = {FastMathHint::Disabled}; /**< Fast math hint */
};
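
Since the hints are plain members, callers can override the defaults before building a stream. A minimal sketch:

    // Sketch: request the CPU target with fast math enabled.
    StreamHints hints{};
    hints.target_hint = Target::NEON;
    hints.fast_math_hint = FastMathHint::Enabled;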
} // namespace frontend
} // namespace graph
} // namespace arm_compute
-#endif /* ARM_COMPUTE_GRAPH_STREAM_TYPES_H */ \ No newline at end of file
+#endif /* ARM_COMPUTE_GRAPH_STREAM_TYPES_H */
diff --git a/arm_compute/graph/mutators/DepthConcatSubTensorMutator.h b/arm_compute/graph/mutators/DepthConcatSubTensorMutator.h
index 14a427ba7c..61d8854a61 100644
--- a/arm_compute/graph/mutators/DepthConcatSubTensorMutator.h
+++ b/arm_compute/graph/mutators/DepthConcatSubTensorMutator.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018-2019 ARM Limited.
+ * Copyright (c) 2018-2019 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -40,7 +40,7 @@ public:
// Inherited methods overridden
virtual void mutate(Graph &g) override;
MutationType type() const override;
- const char *name() override;
+ const char *name() override;
};
} // namespace graph
} // namespace arm_compute
diff --git a/arm_compute/graph/mutators/GraphMutators.h b/arm_compute/graph/mutators/GraphMutators.h
index 6ae06990a4..155b3320c2 100644
--- a/arm_compute/graph/mutators/GraphMutators.h
+++ b/arm_compute/graph/mutators/GraphMutators.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018-2019 ARM Limited.
+ * Copyright (c) 2018-2019 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
diff --git a/arm_compute/graph/mutators/GroupedConvolutionMutator.h b/arm_compute/graph/mutators/GroupedConvolutionMutator.h
index 01c9d0ee2c..3ed8d786fc 100644
--- a/arm_compute/graph/mutators/GroupedConvolutionMutator.h
+++ b/arm_compute/graph/mutators/GroupedConvolutionMutator.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018-2019 ARM Limited.
+ * Copyright (c) 2018-2019 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -40,7 +40,7 @@ public:
// Inherited methods overridden
virtual void mutate(Graph &g) override;
MutationType type() const override;
- const char *name() override;
+ const char *name() override;
};
} // namespace graph
} // namespace arm_compute
diff --git a/arm_compute/graph/mutators/InPlaceOperationMutator.h b/arm_compute/graph/mutators/InPlaceOperationMutator.h
index 7932b6245c..86f62f1994 100644
--- a/arm_compute/graph/mutators/InPlaceOperationMutator.h
+++ b/arm_compute/graph/mutators/InPlaceOperationMutator.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018-2019 ARM Limited.
+ * Copyright (c) 2018-2019 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -37,7 +37,7 @@ public:
// Inherited methods overridden
virtual void mutate(Graph &g) override;
MutationType type() const override;
- const char *name() override;
+ const char *name() override;
};
} // namespace graph
} // namespace arm_compute
diff --git a/arm_compute/graph/mutators/NodeExecutionMethodMutator.h b/arm_compute/graph/mutators/NodeExecutionMethodMutator.h
index 3de940654b..505d4ab300 100644
--- a/arm_compute/graph/mutators/NodeExecutionMethodMutator.h
+++ b/arm_compute/graph/mutators/NodeExecutionMethodMutator.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018-2019 ARM Limited.
+ * Copyright (c) 2018-2019 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -42,7 +42,7 @@ public:
// Inherited methods overridden
virtual void mutate(Graph &g) override;
MutationType type() const override;
- const char *name() override;
+ const char *name() override;
};
} // namespace graph
} // namespace arm_compute
diff --git a/arm_compute/graph/mutators/NodeFusionMutator.h b/arm_compute/graph/mutators/NodeFusionMutator.h
index b99ee79f94..9d2d44f436 100644
--- a/arm_compute/graph/mutators/NodeFusionMutator.h
+++ b/arm_compute/graph/mutators/NodeFusionMutator.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018-2019 ARM Limited.
+ * Copyright (c) 2018-2019 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -38,7 +38,7 @@ public:
// Inherited methods overridden
virtual void mutate(Graph &g) override;
MutationType type() const override;
- const char *name() override;
+ const char *name() override;
};
} // namespace graph
} // namespace arm_compute
diff --git a/arm_compute/graph/mutators/SplitLayerSubTensorMutator.h b/arm_compute/graph/mutators/SplitLayerSubTensorMutator.h
index c9747fd889..ab9746a29b 100644
--- a/arm_compute/graph/mutators/SplitLayerSubTensorMutator.h
+++ b/arm_compute/graph/mutators/SplitLayerSubTensorMutator.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018-2019 ARM Limited.
+ * Copyright (c) 2018-2019 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -40,7 +40,7 @@ public:
// Inherited methods overridden
virtual void mutate(Graph &g) override;
MutationType type() const override;
- const char *name() override;
+ const char *name() override;
};
} // namespace graph
} // namespace arm_compute
diff --git a/arm_compute/graph/mutators/SyntheticDataTypeMutator.h b/arm_compute/graph/mutators/SyntheticDataTypeMutator.h
index 74f4b56acd..ce8af0a1d7 100644
--- a/arm_compute/graph/mutators/SyntheticDataTypeMutator.h
+++ b/arm_compute/graph/mutators/SyntheticDataTypeMutator.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019 ARM Limited.
+ * Copyright (c) 2019-2021 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -35,10 +35,15 @@ namespace graph
class SyntheticDataTypeMutator final : public IGraphMutator
{
public:
+ // Constructor
+ SyntheticDataTypeMutator(DataType mutate_type = DataType::QASYMM8);
// Inherited methods overridden
virtual void mutate(Graph &g) override;
MutationType type() const override;
- const char *name() override;
+ const char *name() override;
+
+private:
+ DataType _mutate_type;
};
} // namespace graph
} // namespace arm_compute
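The new constructor parameter selects which quantized type the mutator injects. A minimal sketch of applying it directly to a graph, assuming QASYMM8_SIGNED is an accepted target type:

    #include "arm_compute/graph/Graph.h"
    #include "arm_compute/graph/mutators/SyntheticDataTypeMutator.h"

    using namespace arm_compute;

    void mutate_to_signed(graph::Graph &g)
    {
        // Rewrite tensor data types in-place to the requested synthetic type.
        graph::SyntheticDataTypeMutator mutator(DataType::QASYMM8_SIGNED);
        mutator.mutate(g);
    }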
diff --git a/arm_compute/graph/nodes/ActivationLayerNode.h b/arm_compute/graph/nodes/ActivationLayerNode.h
index 975bc8efb1..fe5f273db5 100644
--- a/arm_compute/graph/nodes/ActivationLayerNode.h
+++ b/arm_compute/graph/nodes/ActivationLayerNode.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018-2019 ARM Limited.
+ * Copyright (c) 2018-2019 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -39,8 +39,7 @@ public:
* @param[in] info Activation Layer information
* @param[in] out_quant_info (Optional) Output quantization info
*/
- ActivationLayerNode(ActivationLayerInfo info,
- QuantizationInfo out_quant_info = QuantizationInfo());
+ ActivationLayerNode(ActivationLayerInfo info, QuantizationInfo out_quant_info = QuantizationInfo());
/** Activation metadata accessor
*
* @return The activation info of the layer
@@ -51,7 +50,7 @@ public:
NodeType type() const override;
bool forward_descriptors() override;
TensorDescriptor configure_output(size_t idx) const override;
- void accept(INodeVisitor &v) override;
+ void accept(INodeVisitor &v) override;
public:
static constexpr NodeType node_type = NodeType::ActivationLayer;
diff --git a/arm_compute/graph/nodes/ArgMinMaxLayerNode.h b/arm_compute/graph/nodes/ArgMinMaxLayerNode.h
new file mode 100644
index 0000000000..65fbc36db6
--- /dev/null
+++ b/arm_compute/graph/nodes/ArgMinMaxLayerNode.h
@@ -0,0 +1,81 @@
+/*
+ * Copyright (c) 2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_GRAPH_ARGMINMAX_LAYER_NODE_H
+#define ARM_COMPUTE_GRAPH_ARGMINMAX_LAYER_NODE_H
+
+#include "arm_compute/graph/INode.h"
+
+namespace arm_compute
+{
+namespace graph
+{
+/** Arg Min/Max Layer node */
+class ArgMinMaxLayerNode final : public INode
+{
+public:
+ /** Constructor
+ *
+ * @param[in] op Operation to perform: min or max
+ * @param[in] axis Axis along which to reduce. Supported reduction axes: 0, 1, 2, 3
+ * @param[in] out_data_type (Optional) Output data type
+ * @param[in] out_quant_info (Optional) Output quantization info
+ */
+ ArgMinMaxLayerNode(ReductionOperation op,
+ unsigned int axis,
+ DataType out_data_type = DataType::UNKNOWN,
+ QuantizationInfo out_quant_info = QuantizationInfo());
+ /** Operator accessor
+ *
+ * @return The operator the layer performs: min or max
+ */
+ ReductionOperation reduction_operation() const;
+ /** Axis accessor
+ *
+ * @return The axis along which the reduction is operating
+ */
+ unsigned int axis() const;
+ /** Output data type accessor
+ *
+ * @return The output data type
+ */
+ DataType out_data_type() const;
+
+ // Inherited overridden methods:
+ NodeType type() const override;
+ bool forward_descriptors() override;
+ TensorDescriptor configure_output(size_t idx) const override;
+ void accept(INodeVisitor &v) override;
+
+public:
+ static constexpr NodeType node_type = NodeType::ArgMinMaxLayer;
+
+private:
+ ReductionOperation _op;
+ unsigned int _axis;
+ DataType _out_data_type;
+ QuantizationInfo _out_quant_info;
+};
+} // namespace graph
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_GRAPH_ARGMINMAX_LAYER_NODE_H */
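A minimal sketch of the new node's constructor and accessors, taken from the header above; in practice nodes are usually created through the graph builder rather than directly:

    #include "arm_compute/graph/nodes/ArgMinMaxLayerNode.h"

    using namespace arm_compute;

    void arg_max_node_sketch()
    {
        // Argmax along axis 1; output data type and quantization keep their defaults.
        graph::ArgMinMaxLayerNode node(ReductionOperation::ARG_IDX_MAX, 1);

        ReductionOperation op = node.reduction_operation(); // ARG_IDX_MAX
        unsigned int axis     = node.axis();                // 1
        (void)op;
        (void)axis;
    }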
diff --git a/arm_compute/graph/nodes/BatchNormalizationLayerNode.h b/arm_compute/graph/nodes/BatchNormalizationLayerNode.h
index b50b9550f6..8583ed87eb 100644
--- a/arm_compute/graph/nodes/BatchNormalizationLayerNode.h
+++ b/arm_compute/graph/nodes/BatchNormalizationLayerNode.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018-2019 ARM Limited.
+ * Copyright (c) 2018-2019 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -60,7 +60,7 @@ public:
NodeType type() const override;
bool forward_descriptors() override;
TensorDescriptor configure_output(size_t idx) const override;
- void accept(INodeVisitor &v) override;
+ void accept(INodeVisitor &v) override;
public:
static constexpr NodeType node_type = NodeType::BatchNormalizationLayer;
diff --git a/arm_compute/graph/nodes/BoundingBoxTransformLayerNode.h b/arm_compute/graph/nodes/BoundingBoxTransformLayerNode.h
index 062054ef47..96c2544065 100644
--- a/arm_compute/graph/nodes/BoundingBoxTransformLayerNode.h
+++ b/arm_compute/graph/nodes/BoundingBoxTransformLayerNode.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018-2019 ARM Limited.
+ * Copyright (c) 2018-2019 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -50,7 +50,7 @@ public:
NodeType type() const override;
bool forward_descriptors() override;
TensorDescriptor configure_output(size_t idx) const override;
- void accept(INodeVisitor &v) override;
+ void accept(INodeVisitor &v) override;
private:
BoundingBoxTransformInfo _bbox_info;
diff --git a/arm_compute/graph/nodes/ChannelShuffleLayerNode.h b/arm_compute/graph/nodes/ChannelShuffleLayerNode.h
index 830efb906a..d296a2dcc3 100644
--- a/arm_compute/graph/nodes/ChannelShuffleLayerNode.h
+++ b/arm_compute/graph/nodes/ChannelShuffleLayerNode.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018-2019 ARM Limited.
+ * Copyright (c) 2018-2019 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -49,7 +49,7 @@ public:
NodeType type() const override;
bool forward_descriptors() override;
TensorDescriptor configure_output(size_t idx) const override;
- void accept(INodeVisitor &v) override;
+ void accept(INodeVisitor &v) override;
private:
unsigned int _num_groups;
diff --git a/arm_compute/graph/nodes/ConcatenateLayerNode.h b/arm_compute/graph/nodes/ConcatenateLayerNode.h
index 77ca6f6770..13398b1a61 100644
--- a/arm_compute/graph/nodes/ConcatenateLayerNode.h
+++ b/arm_compute/graph/nodes/ConcatenateLayerNode.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018-2019 ARM Limited.
+ * Copyright (c) 2018-2019 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -47,7 +47,8 @@ public:
*
* @return Expected output descriptor
*/
- static TensorDescriptor compute_output_descriptor(const std::vector<TensorDescriptor> &input_descriptors, DataLayoutDimension axis);
+ static TensorDescriptor compute_output_descriptor(const std::vector<TensorDescriptor> &input_descriptors,
+ DataLayoutDimension axis);
/** Enables or disables the depth concatenate node
*
* @warning This is used when concatenate is performed using sub-tensors, where this node is used as a placeholder.
@@ -78,7 +79,7 @@ public:
NodeType type() const override;
bool forward_descriptors() override;
TensorDescriptor configure_output(size_t idx) const override;
- void accept(INodeVisitor &v) override;
+ void accept(INodeVisitor &v) override;
private:
unsigned int _total_nodes;
diff --git a/arm_compute/graph/nodes/ConstNode.h b/arm_compute/graph/nodes/ConstNode.h
index 24dfaaae0b..400b9b4d9f 100644
--- a/arm_compute/graph/nodes/ConstNode.h
+++ b/arm_compute/graph/nodes/ConstNode.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018-2019 ARM Limited.
+ * Copyright (c) 2018-2019 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -44,7 +44,7 @@ public:
NodeType type() const override;
bool forward_descriptors() override;
TensorDescriptor configure_output(size_t idx) const override;
- void accept(INodeVisitor &v) override;
+ void accept(INodeVisitor &v) override;
private:
TensorDescriptor _desc;
diff --git a/arm_compute/graph/nodes/ConvolutionLayerNode.h b/arm_compute/graph/nodes/ConvolutionLayerNode.h
index eea43d7470..8a77b89f27 100644
--- a/arm_compute/graph/nodes/ConvolutionLayerNode.h
+++ b/arm_compute/graph/nodes/ConvolutionLayerNode.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018-2019 ARM Limited.
+ * Copyright (c) 2018-2019, 2021 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -90,6 +90,11 @@ public:
* @param[in] fused_activation Fused activation to set
*/
void set_fused_activation(ActivationLayerInfo fused_activation);
+ /** Sets convolution info
+ *
+ * @param[in] info Convolution info to set
+ */
+ void set_convolution_info(PadStrideInfo info);
/** Computes convolution output descriptor
*
* @param[in] input_descriptor Input descriptor
@@ -106,7 +111,7 @@ public:
NodeType type() const override;
bool forward_descriptors() override;
TensorDescriptor configure_output(size_t idx) const override;
- void accept(INodeVisitor &v) override;
+ void accept(INodeVisitor &v) override;
public:
static constexpr NodeType node_type = NodeType::ConvolutionLayer;
diff --git a/arm_compute/graph/nodes/DeconvolutionLayerNode.h b/arm_compute/graph/nodes/DeconvolutionLayerNode.h
index a5efdfb3bc..553d05985c 100644
--- a/arm_compute/graph/nodes/DeconvolutionLayerNode.h
+++ b/arm_compute/graph/nodes/DeconvolutionLayerNode.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018-2020 ARM Limited.
+ * Copyright (c) 2018-2020 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -61,7 +61,7 @@ public:
NodeType type() const override;
bool forward_descriptors() override;
TensorDescriptor configure_output(size_t idx) const override;
- void accept(INodeVisitor &v) override;
+ void accept(INodeVisitor &v) override;
private:
descriptors::DeconvolutionLayerDescriptor descriptor;
diff --git a/arm_compute/graph/nodes/UpsampleLayerNode.h b/arm_compute/graph/nodes/DepthToSpaceLayerNode.h
index cdaf206173..5fbcc670ff 100644
--- a/arm_compute/graph/nodes/UpsampleLayerNode.h
+++ b/arm_compute/graph/nodes/DepthToSpaceLayerNode.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018-2019 ARM Limited.
+ * Copyright (c) 2020 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -21,8 +21,8 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
-#ifndef ARM_COMPUTE_GRAPH_UPSAMPLE_LAYER_NODE_H
-#define ARM_COMPUTE_GRAPH_UPSAMPLE_LAYER_NODE_H
+#ifndef ARM_COMPUTE_GRAPH_DEPTH_TO_SPACE_LAYER_NODE_H
+#define ARM_COMPUTE_GRAPH_DEPTH_TO_SPACE_LAYER_NODE_H
#include "arm_compute/graph/INode.h"
@@ -30,45 +30,37 @@ namespace arm_compute
{
namespace graph
{
-/** Upsample Layer node */
-class UpsampleLayerNode final : public INode
+/** DepthToSpace Layer node */
+class DepthToSpaceLayerNode final : public INode
{
public:
- /** Constructor
+ /** Constructor
+ *
+ * @param[in] block_shape Block shape value; must be >= 2
+ */
+ DepthToSpaceLayerNode(int block_shape);
+ /** Block shape accessor
*
- * @param[in] info Stride info
- * @param[in] upsampling_policy Upsampling policy
+ * @return Block shape
*/
- UpsampleLayerNode(Size2D info, InterpolationPolicy upsampling_policy);
- /** Stride info metadata accessor
+ int block_shape() const;
+ /** Computes depth to space output descriptor
*
- * @return The stride info of the layer
- */
- Size2D info() const;
- /** Upsampling policy metadata accessor
- *
- * @return The upsampling policy of the layer
- */
- InterpolationPolicy upsampling_policy() const;
- /** Computes upsample output descriptor
+ * @warning block_shape must be greater than or equal to 2
*
* @param[in] input_descriptor Input descriptor
- * @param[in] info Stride information
+ * @param[in] block_shape Block shape value used by the node
*
* @return Output descriptor
*/
- static TensorDescriptor compute_output_descriptor(const TensorDescriptor &input_descriptor, Size2D info);
+ static TensorDescriptor compute_output_descriptor(const TensorDescriptor &input_descriptor, int block_shape);
// Inherited overridden methods:
NodeType type() const override;
bool forward_descriptors() override;
TensorDescriptor configure_output(size_t idx) const override;
- void accept(INodeVisitor &v) override;
+ void accept(INodeVisitor &v) override;
private:
- Size2D _info;
- InterpolationPolicy _upsampling_policy;
+ int _block_shape;
};
} // namespace graph
} // namespace arm_compute
-#endif /* ARM_COMPUTE_GRAPH_UPSAMPLE_LAYER_NODE_H */
+#endif /* ARM_COMPUTE_GRAPH_DEPTH_TO_SPACE_LAYER_NODE_H */
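A short sketch of the renamed node, based on the header above; the shape comment reflects the usual depth-to-space semantics (spatial dimensions grow by block_shape, channels shrink by block_shape squared):

    #include "arm_compute/graph/nodes/DepthToSpaceLayerNode.h"

    using namespace arm_compute::graph;

    void depth_to_space_sketch()
    {
        // block_shape must be >= 2, per the @warning in the header.
        DepthToSpaceLayerNode node(2);

        // With block_shape = 2 each spatial dimension doubles and the
        // channel count shrinks by a factor of 4.
        int block = node.block_shape(); // 2
        (void)block;
    }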
diff --git a/arm_compute/graph/nodes/DepthwiseConvolutionLayerNode.h b/arm_compute/graph/nodes/DepthwiseConvolutionLayerNode.h
index d8d36d9af2..441d68d2b8 100644
--- a/arm_compute/graph/nodes/DepthwiseConvolutionLayerNode.h
+++ b/arm_compute/graph/nodes/DepthwiseConvolutionLayerNode.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018-2019 ARM Limited.
+ * Copyright (c) 2018-2019, 2021 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -78,6 +78,11 @@ public:
* @param[in] fused_activation Fused activation to set
*/
void set_fused_activation(ActivationLayerInfo fused_activation);
+ /** Sets convolution info
+ *
+ * @param[in] info Convolution info to set
+ */
+ void set_convolution_info(PadStrideInfo info);
/** Computes depthwise convolution output descriptor
*
* @param[in] input_descriptor Input descriptor
@@ -96,7 +101,7 @@ public:
NodeType type() const override;
bool forward_descriptors() override;
TensorDescriptor configure_output(size_t idx) const override;
- void accept(INodeVisitor &v) override;
+ void accept(INodeVisitor &v) override;
public:
static constexpr NodeType node_type = NodeType::DepthwiseConvolutionLayer;
diff --git a/arm_compute/graph/nodes/DequantizationLayerNode.h b/arm_compute/graph/nodes/DequantizationLayerNode.h
index 8b3d4add65..1cce71373f 100644
--- a/arm_compute/graph/nodes/DequantizationLayerNode.h
+++ b/arm_compute/graph/nodes/DequantizationLayerNode.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019 ARM Limited.
+ * Copyright (c) 2019 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -46,8 +46,8 @@ public:
NodeType type() const override;
bool forward_descriptors() override;
TensorDescriptor configure_output(size_t idx) const override;
- void accept(INodeVisitor &v) override;
+ void accept(INodeVisitor &v) override;
};
} // namespace graph
} // namespace arm_compute
-#endif /* ARM_COMPUTE_GRAPH_DEQUANTIZATION_NODE_H */ \ No newline at end of file
+#endif /* ARM_COMPUTE_GRAPH_DEQUANTIZATION_NODE_H */
diff --git a/arm_compute/graph/nodes/DetectionOutputLayerNode.h b/arm_compute/graph/nodes/DetectionOutputLayerNode.h
index 7732abfc79..c3e067e430 100644
--- a/arm_compute/graph/nodes/DetectionOutputLayerNode.h
+++ b/arm_compute/graph/nodes/DetectionOutputLayerNode.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018-2019 ARM Limited.
+ * Copyright (c) 2018-2019 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -51,13 +51,14 @@ public:
*
* @return Output descriptor
*/
- static TensorDescriptor compute_output_descriptor(const TensorDescriptor &input_descriptor, const DetectionOutputLayerInfo &info);
+ static TensorDescriptor compute_output_descriptor(const TensorDescriptor &input_descriptor,
+ const DetectionOutputLayerInfo &info);
// Inherited overridden methods:
NodeType type() const override;
bool forward_descriptors() override;
TensorDescriptor configure_output(size_t idx) const override;
- void accept(INodeVisitor &v) override;
+ void accept(INodeVisitor &v) override;
private:
DetectionOutputLayerInfo _info;
diff --git a/arm_compute/graph/nodes/DetectionPostProcessLayerNode.h b/arm_compute/graph/nodes/DetectionPostProcessLayerNode.h
index 97d881d411..a53aaf2b9c 100644
--- a/arm_compute/graph/nodes/DetectionPostProcessLayerNode.h
+++ b/arm_compute/graph/nodes/DetectionPostProcessLayerNode.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019 ARM Limited.
+ * Copyright (c) 2019 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -49,7 +49,7 @@ public:
NodeType type() const override;
bool forward_descriptors() override;
TensorDescriptor configure_output(size_t idx) const override;
- void accept(INodeVisitor &v) override;
+ void accept(INodeVisitor &v) override;
private:
DetectionPostProcessLayerInfo _info;
@@ -59,4 +59,4 @@ private:
};
} // namespace graph
} // namespace arm_compute
-#endif /* ARM_COMPUTE_GRAPH_DETECTION_POST_PROCESS_LAYER_NODE_H */ \ No newline at end of file
+#endif /* ARM_COMPUTE_GRAPH_DETECTION_POST_PROCESS_LAYER_NODE_H */
diff --git a/arm_compute/graph/nodes/DummyNode.h b/arm_compute/graph/nodes/DummyNode.h
index e257641e8a..2263525a72 100644
--- a/arm_compute/graph/nodes/DummyNode.h
+++ b/arm_compute/graph/nodes/DummyNode.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018-2019 ARM Limited.
+ * Copyright (c) 2018-2019 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -51,11 +51,11 @@ public:
NodeType type() const override;
bool forward_descriptors() override;
TensorDescriptor configure_output(size_t idx) const override;
- void accept(INodeVisitor &v) override;
+ void accept(INodeVisitor &v) override;
private:
TensorShape _shape;
};
} // namespace graph
} // namespace arm_compute
-#endif /* ARM_COMPUTE_GRAPH_DUMMY_NODE_H */ \ No newline at end of file
+#endif /* ARM_COMPUTE_GRAPH_DUMMY_NODE_H */
diff --git a/arm_compute/graph/nodes/EltwiseLayerNode.h b/arm_compute/graph/nodes/EltwiseLayerNode.h
index d619ad2588..258298259f 100644
--- a/arm_compute/graph/nodes/EltwiseLayerNode.h
+++ b/arm_compute/graph/nodes/EltwiseLayerNode.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018-2020 ARM Limited.
+ * Copyright (c) 2018-2020 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -63,6 +63,12 @@ public:
*/
ActivationLayerInfo fused_activation() const;
+ /** Returns output quantization info
+ *
+ * @return Output quantization info
+ */
+ QuantizationInfo output_quant_info() const;
+
/** Sets fused activation
*
* @param[in] fused_activation Fused activation to set
@@ -73,13 +79,47 @@ public:
NodeType type() const override;
bool forward_descriptors() override;
TensorDescriptor configure_output(size_t idx) const override;
- void accept(INodeVisitor &v) override;
+ void accept(INodeVisitor &v) override;
static constexpr NodeType node_type = NodeType::EltwiseLayer;
private:
descriptors::EltwiseLayerDescriptor descriptor;
};
+
+/** Unary Eltwise Layer node */
+class UnaryEltwiseLayerNode final : public INode
+{
+public:
+ /** Constructor
+ *
+ * @param[in] descriptor Descriptor containing the node information, as described in @ref descriptors::UnaryEltwiseLayerDescriptor
+ */
+ UnaryEltwiseLayerNode(const descriptors::UnaryEltwiseLayerDescriptor &descriptor);
+ /** Unary eltwise layer descriptor
+ *
+ * @return Unary eltwise layer descriptor containing the node information
+ */
+ descriptors::UnaryEltwiseLayerDescriptor eltwise_descriptor() const;
+
+ /** Sets fused activation
+ *
+ * @param[in] fused_activation Fused activation to set
+ */
+ void set_fused_activation(ActivationLayerInfo fused_activation);
+
+ // Inherited overridden methods:
+ NodeType type() const override;
+ bool forward_descriptors() override;
+ TensorDescriptor configure_output(size_t idx) const override;
+ void accept(INodeVisitor &v) override;
+
+ static constexpr NodeType node_type = NodeType::UnaryEltwiseLayer;
+
+private:
+ descriptors::UnaryEltwiseLayerDescriptor descriptor;
+};
+
} // namespace graph
} // namespace arm_compute
#endif /* ARM_COMPUTE_GRAPH_ELTWISE_LAYER_NODE_H */
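A hedged sketch of the new unary node; the descriptor's field layout is an assumption here (a single unary operation such as Exp), so treat the brace initialization as illustrative:

    #include "arm_compute/graph/LayerDescriptors.h"
    #include "arm_compute/graph/nodes/EltwiseLayerNode.h"

    using namespace arm_compute::graph;

    void unary_eltwise_sketch()
    {
        // Assumed descriptor layout: the unary operation to perform.
        descriptors::UnaryEltwiseLayerDescriptor desc{UnaryEltwiseOperation::Exp};

        UnaryEltwiseLayerNode node(desc);
        node.set_fused_activation(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU));
    }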
diff --git a/arm_compute/graph/nodes/FlattenLayerNode.h b/arm_compute/graph/nodes/FlattenLayerNode.h
index fd9a525ea0..af104707a1 100644
--- a/arm_compute/graph/nodes/FlattenLayerNode.h
+++ b/arm_compute/graph/nodes/FlattenLayerNode.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018-2019 ARM Limited.
+ * Copyright (c) 2018-2019 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -41,7 +41,7 @@ public:
NodeType type() const override;
bool forward_descriptors() override;
TensorDescriptor configure_output(size_t idx) const override;
- void accept(INodeVisitor &v) override;
+ void accept(INodeVisitor &v) override;
};
} // namespace graph
} // namespace arm_compute
diff --git a/arm_compute/graph/nodes/FullyConnectedLayerNode.h b/arm_compute/graph/nodes/FullyConnectedLayerNode.h
index 10c310dda2..3bcf386d64 100644
--- a/arm_compute/graph/nodes/FullyConnectedLayerNode.h
+++ b/arm_compute/graph/nodes/FullyConnectedLayerNode.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018-2020 ARM Limited.
+ * Copyright (c) 2018-2021 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -39,10 +39,22 @@ public:
* @param[in] num_outputs Number of neurons in the layer
* @param[in] out_quant_info (Optional) Output quantization info
* @param[in] fc_info (Optional) Additional information about the fully connected layer
+ * @param[in] fast_math_hint (Optional) Fast math hint
*/
FullyConnectedLayerNode(unsigned int num_outputs,
QuantizationInfo out_quant_info = QuantizationInfo(),
- FullyConnectedLayerInfo fc_info = FullyConnectedLayerInfo());
+ FullyConnectedLayerInfo fc_info = FullyConnectedLayerInfo(),
+ FastMathHint fast_math_hint = FastMathHint::Disabled);
+ /** Sets the fast math hint
+ *
+ * @param[in] hint Hint to use for the fully connected layer
+ */
+ void set_fast_math_hint(FastMathHint hint);
+ /** Fast math hint accessor
+ *
+ * @return Fast math hint to be used by the node
+ */
+ FastMathHint fast_math_hint() const;
/** Sets fused activation
*
* @param[in] fused_activation Fused activation to set
@@ -61,7 +73,7 @@ public:
*/
static TensorDescriptor compute_weights_descriptor(const TensorDescriptor &input_descriptor,
unsigned int num_outputs,
- FullyConnectedLayerInfo fc_info = FullyConnectedLayerInfo(),
+ FullyConnectedLayerInfo fc_info = FullyConnectedLayerInfo(),
const QuantizationInfo &weights_quant_info = QuantizationInfo());
/** Computes fully connected layer output descriptor
*
@@ -86,7 +98,7 @@ public:
NodeType type() const override;
bool forward_descriptors() override;
TensorDescriptor configure_output(size_t idx) const override;
- void accept(INodeVisitor &v) override;
+ void accept(INodeVisitor &v) override;
static constexpr NodeType node_type = NodeType::FullyConnectedLayer;
@@ -94,6 +106,7 @@ private:
unsigned int _num_outputs;
QuantizationInfo _out_quant_info;
FullyConnectedLayerInfo _info;
+ FastMathHint _fast_math_hint;
};
} // namespace graph
} // namespace arm_compute
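A minimal sketch of the extended constructor and the new fast math accessors, taken from the header above:

    #include "arm_compute/graph/nodes/FullyConnectedLayerNode.h"

    using namespace arm_compute::graph;

    void fc_fast_math_sketch()
    {
        // 128 output neurons; quantization and FC info keep their defaults,
        // fast math is requested through the new trailing parameter.
        FullyConnectedLayerNode fc(128, QuantizationInfo(), FullyConnectedLayerInfo(),
                                   FastMathHint::Enabled);

        fc.set_fast_math_hint(FastMathHint::Disabled); // may be toggled after construction
        FastMathHint hint = fc.fast_math_hint();       // Disabled
        (void)hint;
    }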
diff --git a/arm_compute/graph/nodes/FusedConvolutionBatchNormalizationNode.h b/arm_compute/graph/nodes/FusedConvolutionBatchNormalizationNode.h
index 62ec55ee3b..d891ea49eb 100644
--- a/arm_compute/graph/nodes/FusedConvolutionBatchNormalizationNode.h
+++ b/arm_compute/graph/nodes/FusedConvolutionBatchNormalizationNode.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019 ARM Limited.
+ * Copyright (c) 2019, 2021 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -43,7 +43,8 @@ public:
* @param[in] fast_math_hint (Optional) Fast math hint
* @param[in] fused_activation (Optional) Fused activation layer. Disabled if not specified
*/
- FusedConvolutionBatchNormalizationNode(float epsilon, PadStrideInfo info,
+ FusedConvolutionBatchNormalizationNode(float epsilon,
+ PadStrideInfo info,
unsigned int num_groups = 1,
ConvolutionMethod method = ConvolutionMethod::Default,
FastMathHint fast_math_hint = FastMathHint::Disabled,
@@ -100,7 +101,7 @@ public:
*/
ConvolutionMethod convolution_method() const;
- /** Sets the fast math fast hint
+ /** Sets the fast math hint
*
* @param[in] hint Hint to use for convolution
*/
@@ -122,7 +123,7 @@ public:
NodeType type() const override;
bool forward_descriptors() override;
TensorDescriptor configure_output(size_t idx) const override;
- void accept(INodeVisitor &v) override;
+ void accept(INodeVisitor &v) override;
public:
static constexpr NodeType node_type = NodeType::FusedConvolutionBatchNormalizationLayer;
diff --git a/arm_compute/graph/nodes/FusedDepthwiseConvolutionBatchNormalizationNode.h b/arm_compute/graph/nodes/FusedDepthwiseConvolutionBatchNormalizationNode.h
index 668f09e6b3..a61b155151 100644
--- a/arm_compute/graph/nodes/FusedDepthwiseConvolutionBatchNormalizationNode.h
+++ b/arm_compute/graph/nodes/FusedDepthwiseConvolutionBatchNormalizationNode.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019 ARM Limited.
+ * Copyright (c) 2019 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -46,7 +46,7 @@ public:
PadStrideInfo info,
unsigned int depth_multiplier,
DepthwiseConvolutionMethod method,
- ActivationLayerInfo fused_activation = ActivationLayerInfo());
+ ActivationLayerInfo fused_activation = ActivationLayerInfo());
/** Sets the depthwise convolution layer method to use
*
@@ -117,7 +117,7 @@ public:
NodeType type() const override;
bool forward_descriptors() override;
TensorDescriptor configure_output(size_t idx) const override;
- void accept(INodeVisitor &v) override;
+ void accept(INodeVisitor &v) override;
public:
static constexpr NodeType node_type = NodeType::FusedDepthwiseConvolutionBatchNormalizationLayer;
diff --git a/arm_compute/graph/nodes/GenerateProposalsLayerNode.h b/arm_compute/graph/nodes/GenerateProposalsLayerNode.h
index 57cadf44f8..b5e4b9781c 100644
--- a/arm_compute/graph/nodes/GenerateProposalsLayerNode.h
+++ b/arm_compute/graph/nodes/GenerateProposalsLayerNode.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019 ARM Limited.
+ * Copyright (c) 2019 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -50,7 +50,7 @@ public:
NodeType type() const override;
bool forward_descriptors() override;
TensorDescriptor configure_output(size_t idx) const override;
- void accept(INodeVisitor &v) override;
+ void accept(INodeVisitor &v) override;
private:
GenerateProposalsInfo _info;
diff --git a/arm_compute/graph/nodes/InputNode.h b/arm_compute/graph/nodes/InputNode.h
index 20fc27664b..0983d25a59 100644
--- a/arm_compute/graph/nodes/InputNode.h
+++ b/arm_compute/graph/nodes/InputNode.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018-2019 ARM Limited.
+ * Copyright (c) 2018-2019 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -44,7 +44,7 @@ public:
NodeType type() const override;
bool forward_descriptors() override;
TensorDescriptor configure_output(size_t idx) const override;
- void accept(INodeVisitor &v) override;
+ void accept(INodeVisitor &v) override;
private:
TensorDescriptor _desc;
diff --git a/arm_compute/graph/nodes/L2NormalizeLayerNode.h b/arm_compute/graph/nodes/L2NormalizeLayerNode.h
new file mode 100644
index 0000000000..ed11412b70
--- /dev/null
+++ b/arm_compute/graph/nodes/L2NormalizeLayerNode.h
@@ -0,0 +1,79 @@
+/*
+ * Copyright (c) 2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_GRAPH_L2_NORMALIZE_LAYER_NODE_H
+#define ARM_COMPUTE_GRAPH_L2_NORMALIZE_LAYER_NODE_H
+
+#include "arm_compute/graph/INode.h"
+
+namespace arm_compute
+{
+namespace graph
+{
+/** L2Normalize Layer node */
+class L2NormalizeLayerNode final : public INode
+{
+public:
+ /** Default constructor */
+ L2NormalizeLayerNode();
+
+ /** Constructor
+ *
+ * @param[in] axis Axis to perform normalization on
+ */
+ L2NormalizeLayerNode(int axis);
+
+ /** Constructor
+ *
+ * @param[in] axis Axis to perform normalization on
+ * @param[in] epsilon Lower bound value for the normalization
+ */
+ L2NormalizeLayerNode(int axis, float epsilon);
+
+ /** Axis accessor
+ *
+ * @return Axis to perform normalization on
+ */
+ int axis() const;
+
+ /** Epsilon accessor
+ *
+ * @return Lower bound value for the normalization
+ */
+ float epsilon() const;
+
+ // Inherited overridden methods:
+ NodeType type() const override;
+ bool forward_descriptors() override;
+ TensorDescriptor configure_output(size_t idx) const override;
+ void accept(INodeVisitor &v) override;
+
+private:
+ int _axis;
+ float _epsilon;
+};
+} // namespace graph
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_GRAPH_L2_NORMALIZE_LAYER_NODE_H */
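A minimal sketch of the constructors and accessors, taken from the header above:

    #include "arm_compute/graph/nodes/L2NormalizeLayerNode.h"

    using namespace arm_compute::graph;

    void l2_normalize_sketch()
    {
        // Normalize along axis 0 with an explicit epsilon lower bound.
        L2NormalizeLayerNode node(0, 1e-6f);

        int   axis = node.axis();    // 0
        float eps  = node.epsilon(); // 1e-6f
        (void)axis;
        (void)eps;
    }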
diff --git a/arm_compute/graph/nodes/Nodes.h b/arm_compute/graph/nodes/Nodes.h
index e8852d247a..d4ad32b6f0 100644
--- a/arm_compute/graph/nodes/Nodes.h
+++ b/arm_compute/graph/nodes/Nodes.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018-2020 ARM Limited.
+ * Copyright (c) 2018-2021, 2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -21,10 +21,11 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
-#ifndef ARM_COMPUTE_GRAPH_NODES_H
-#define ARM_COMPUTE_GRAPH_NODES_H
+#ifndef ACL_ARM_COMPUTE_GRAPH_NODES_NODES_H
+#define ACL_ARM_COMPUTE_GRAPH_NODES_NODES_H
#include "arm_compute/graph/nodes/ActivationLayerNode.h"
+#include "arm_compute/graph/nodes/ArgMinMaxLayerNode.h"
#include "arm_compute/graph/nodes/BatchNormalizationLayerNode.h"
#include "arm_compute/graph/nodes/BoundingBoxTransformLayerNode.h"
#include "arm_compute/graph/nodes/ChannelShuffleLayerNode.h"
@@ -32,6 +33,7 @@
#include "arm_compute/graph/nodes/ConstNode.h"
#include "arm_compute/graph/nodes/ConvolutionLayerNode.h"
#include "arm_compute/graph/nodes/DeconvolutionLayerNode.h"
+#include "arm_compute/graph/nodes/DepthToSpaceLayerNode.h"
#include "arm_compute/graph/nodes/DepthwiseConvolutionLayerNode.h"
#include "arm_compute/graph/nodes/DequantizationLayerNode.h"
#include "arm_compute/graph/nodes/DetectionOutputLayerNode.h"
@@ -44,25 +46,26 @@
#include "arm_compute/graph/nodes/FusedDepthwiseConvolutionBatchNormalizationNode.h"
#include "arm_compute/graph/nodes/GenerateProposalsLayerNode.h"
#include "arm_compute/graph/nodes/InputNode.h"
+#include "arm_compute/graph/nodes/L2NormalizeLayerNode.h"
#include "arm_compute/graph/nodes/NormalizationLayerNode.h"
#include "arm_compute/graph/nodes/NormalizePlanarYUVLayerNode.h"
#include "arm_compute/graph/nodes/OutputNode.h"
-#include "arm_compute/graph/nodes/PReluLayerNode.h"
#include "arm_compute/graph/nodes/PadLayerNode.h"
#include "arm_compute/graph/nodes/PermuteLayerNode.h"
#include "arm_compute/graph/nodes/PoolingLayerNode.h"
+#include "arm_compute/graph/nodes/PReluLayerNode.h"
#include "arm_compute/graph/nodes/PrintLayerNode.h"
#include "arm_compute/graph/nodes/PriorBoxLayerNode.h"
#include "arm_compute/graph/nodes/QuantizationLayerNode.h"
-#include "arm_compute/graph/nodes/ROIAlignLayerNode.h"
+#include "arm_compute/graph/nodes/ReductionLayerNode.h"
#include "arm_compute/graph/nodes/ReorgLayerNode.h"
#include "arm_compute/graph/nodes/ReshapeLayerNode.h"
#include "arm_compute/graph/nodes/ResizeLayerNode.h"
+#include "arm_compute/graph/nodes/ROIAlignLayerNode.h"
#include "arm_compute/graph/nodes/SliceLayerNode.h"
#include "arm_compute/graph/nodes/SoftmaxLayerNode.h"
#include "arm_compute/graph/nodes/SplitLayerNode.h"
#include "arm_compute/graph/nodes/StackLayerNode.h"
-#include "arm_compute/graph/nodes/UpsampleLayerNode.h"
-#include "arm_compute/graph/nodes/YOLOLayerNode.h"
+#include "arm_compute/graph/nodes/StridedSliceLayerNode.h"
-#endif /* ARM_COMPUTE_GRAPH_NODES_H */
+#endif // ACL_ARM_COMPUTE_GRAPH_NODES_NODES_H
diff --git a/arm_compute/graph/nodes/NodesFwd.h b/arm_compute/graph/nodes/NodesFwd.h
index 0ccf2433b8..580f339468 100644
--- a/arm_compute/graph/nodes/NodesFwd.h
+++ b/arm_compute/graph/nodes/NodesFwd.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018-2020 ARM Limited.
+ * Copyright (c) 2018-2021, 2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -21,8 +21,8 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
-#ifndef ARM_COMPUTE_GRAPH_NODES_FWD_H
-#define ARM_COMPUTE_GRAPH_NODES_FWD_H
+#ifndef ACL_ARM_COMPUTE_GRAPH_NODES_NODESFWD_H
+#define ACL_ARM_COMPUTE_GRAPH_NODES_NODESFWD_H
namespace arm_compute
{
@@ -31,6 +31,7 @@ namespace graph
// Forward declarations
class INode;
class ActivationLayerNode;
+class ArgMinMaxLayerNode;
class BatchNormalizationLayerNode;
class BoundingBoxTransformLayerNode;
class ChannelShuffleLayerNode;
@@ -38,6 +39,7 @@ class ConcatenateLayerNode;
class ConstNode;
class ConvolutionLayerNode;
class DeconvolutionLayerNode;
+class DepthToSpaceLayerNode;
class DepthwiseConvolutionLayerNode;
class DequantizationLayerNode;
class DetectionOutputLayerNode;
@@ -50,6 +52,7 @@ class FusedConvolutionBatchNormalizationNode;
class FusedDepthwiseConvolutionBatchNormalizationNode;
class GenerateProposalsLayerNode;
class InputNode;
+class L2NormalizeLayerNode;
class NormalizationLayerNode;
class NormalizePlanarYUVLayerNode;
class OutputNode;
@@ -60,6 +63,7 @@ class PReluLayerNode;
class PrintLayerNode;
class PriorBoxLayerNode;
class QuantizationLayerNode;
+class ReductionLayerNode;
class ReorgLayerNode;
class ReshapeLayerNode;
class ResizeLayerNode;
@@ -68,8 +72,7 @@ class SoftmaxLayerNode;
class SliceLayerNode;
class SplitLayerNode;
class StackLayerNode;
-class UpsampleLayerNode;
-class YOLOLayerNode;
+class StridedSliceLayerNode;
} // namespace graph
} // namespace arm_compute
-#endif /* ARM_COMPUTE_GRAPH_NODES_FWD_H */
+#endif // ACL_ARM_COMPUTE_GRAPH_NODES_NODESFWD_H
diff --git a/arm_compute/graph/nodes/NormalizationLayerNode.h b/arm_compute/graph/nodes/NormalizationLayerNode.h
index 3f5e781511..86f2fb9dba 100644
--- a/arm_compute/graph/nodes/NormalizationLayerNode.h
+++ b/arm_compute/graph/nodes/NormalizationLayerNode.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018-2019 ARM Limited.
+ * Copyright (c) 2018-2019 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -49,7 +49,7 @@ public:
NodeType type() const override;
bool forward_descriptors() override;
TensorDescriptor configure_output(size_t idx) const override;
- void accept(INodeVisitor &v) override;
+ void accept(INodeVisitor &v) override;
private:
NormalizationLayerInfo _info;
diff --git a/arm_compute/graph/nodes/NormalizePlanarYUVLayerNode.h b/arm_compute/graph/nodes/NormalizePlanarYUVLayerNode.h
index 843ab9683d..158acc4c23 100644
--- a/arm_compute/graph/nodes/NormalizePlanarYUVLayerNode.h
+++ b/arm_compute/graph/nodes/NormalizePlanarYUVLayerNode.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018-2019 ARM Limited.
+ * Copyright (c) 2018-2019 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -41,7 +41,7 @@ public:
NodeType type() const override;
bool forward_descriptors() override;
TensorDescriptor configure_output(size_t idx) const override;
- void accept(INodeVisitor &v) override;
+ void accept(INodeVisitor &v) override;
};
} // namespace graph
} // namespace arm_compute
diff --git a/arm_compute/graph/nodes/OutputNode.h b/arm_compute/graph/nodes/OutputNode.h
index 27902af717..75484ab328 100644
--- a/arm_compute/graph/nodes/OutputNode.h
+++ b/arm_compute/graph/nodes/OutputNode.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018-2019 ARM Limited.
+ * Copyright (c) 2018-2019 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -41,7 +41,7 @@ public:
NodeType type() const override;
bool forward_descriptors() override;
TensorDescriptor configure_output(size_t idx) const override;
- void accept(INodeVisitor &v) override;
+ void accept(INodeVisitor &v) override;
};
} // namespace graph
} // namespace arm_compute
diff --git a/arm_compute/graph/nodes/PReluLayerNode.h b/arm_compute/graph/nodes/PReluLayerNode.h
index f2055c2281..532fdccb3a 100644
--- a/arm_compute/graph/nodes/PReluLayerNode.h
+++ b/arm_compute/graph/nodes/PReluLayerNode.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2020 ARM Limited.
+ * Copyright (c) 2020 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -41,7 +41,7 @@ public:
NodeType type() const override;
bool forward_descriptors() override;
TensorDescriptor configure_output(size_t idx) const override;
- void accept(INodeVisitor &v) override;
+ void accept(INodeVisitor &v) override;
};
} // namespace graph
} // namespace arm_compute
diff --git a/arm_compute/graph/nodes/PadLayerNode.h b/arm_compute/graph/nodes/PadLayerNode.h
index 852427a593..dcb5ea595b 100644
--- a/arm_compute/graph/nodes/PadLayerNode.h
+++ b/arm_compute/graph/nodes/PadLayerNode.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018-2020 ARM Limited.
+ * Copyright (c) 2018-2021 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -56,7 +56,10 @@ public:
NodeType type() const override;
bool forward_descriptors() override;
TensorDescriptor configure_output(size_t idx) const override;
- void accept(INodeVisitor &v) override;
+ void accept(INodeVisitor &v) override;
+
+public:
+ static constexpr NodeType node_type = NodeType::PadLayer;
private:
PaddingList _padding;
diff --git a/arm_compute/graph/nodes/PermuteLayerNode.h b/arm_compute/graph/nodes/PermuteLayerNode.h
index 555f2b5d16..62654e777c 100644
--- a/arm_compute/graph/nodes/PermuteLayerNode.h
+++ b/arm_compute/graph/nodes/PermuteLayerNode.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018-2019 ARM Limited.
+ * Copyright (c) 2018-2019 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -51,7 +51,7 @@ public:
NodeType type() const override;
bool forward_descriptors() override;
TensorDescriptor configure_output(size_t idx) const override;
- void accept(INodeVisitor &v) override;
+ void accept(INodeVisitor &v) override;
private:
PermutationVector _perm;
diff --git a/arm_compute/graph/nodes/PoolingLayerNode.h b/arm_compute/graph/nodes/PoolingLayerNode.h
index 41342a8215..c81f3f98dc 100644
--- a/arm_compute/graph/nodes/PoolingLayerNode.h
+++ b/arm_compute/graph/nodes/PoolingLayerNode.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018-2019 ARM Limited.
+ * Copyright (c) 2018-2019 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -57,7 +57,7 @@ public:
NodeType type() const override;
bool forward_descriptors() override;
TensorDescriptor configure_output(size_t idx) const override;
- void accept(INodeVisitor &v) override;
+ void accept(INodeVisitor &v) override;
private:
PoolingLayerInfo _info;
diff --git a/arm_compute/graph/nodes/PrintLayerNode.h b/arm_compute/graph/nodes/PrintLayerNode.h
index 78b7bf212c..e7accc8015 100644
--- a/arm_compute/graph/nodes/PrintLayerNode.h
+++ b/arm_compute/graph/nodes/PrintLayerNode.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2020 ARM Limited.
+ * Copyright (c) 2020 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -43,7 +43,9 @@ public:
* @param[in] format_info (Optional) Format info.
* @param[in] transform (Optional) Input transform function.
*/
- PrintLayerNode(std::ostream &stream, const IOFormatInfo &format_info = IOFormatInfo(), const std::function<ITensor *(ITensor *)> transform = nullptr);
+ PrintLayerNode(std::ostream &stream,
+ const IOFormatInfo &format_info = IOFormatInfo(),
+ const std::function<ITensor *(ITensor *)> transform = nullptr);
/** Stream metadata accessor
*
@@ -67,7 +69,7 @@ public:
NodeType type() const override;
bool forward_descriptors() override;
TensorDescriptor configure_output(size_t idx) const override;
- void accept(INodeVisitor &v) override;
+ void accept(INodeVisitor &v) override;
private:
std::ostream &_stream;
diff --git a/arm_compute/graph/nodes/PriorBoxLayerNode.h b/arm_compute/graph/nodes/PriorBoxLayerNode.h
index f6cfa4779c..db36bfb1e0 100644
--- a/arm_compute/graph/nodes/PriorBoxLayerNode.h
+++ b/arm_compute/graph/nodes/PriorBoxLayerNode.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018-2019 ARM Limited.
+ * Copyright (c) 2018-2019 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -51,13 +51,14 @@ public:
*
* @return Output descriptor
*/
- static TensorDescriptor compute_output_descriptor(const TensorDescriptor &input_descriptor, const PriorBoxLayerInfo &info);
+ static TensorDescriptor compute_output_descriptor(const TensorDescriptor &input_descriptor,
+ const PriorBoxLayerInfo &info);
// Inherited overridden methods:
NodeType type() const override;
bool forward_descriptors() override;
TensorDescriptor configure_output(size_t idx) const override;
- void accept(INodeVisitor &v) override;
+ void accept(INodeVisitor &v) override;
private:
PriorBoxLayerInfo _info;
diff --git a/arm_compute/graph/nodes/QuantizationLayerNode.h b/arm_compute/graph/nodes/QuantizationLayerNode.h
index f283c26996..b8e4c7d27b 100644
--- a/arm_compute/graph/nodes/QuantizationLayerNode.h
+++ b/arm_compute/graph/nodes/QuantizationLayerNode.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019 ARM Limited.
+ * Copyright (c) 2019-2020 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -40,16 +40,24 @@ public:
*/
QuantizationLayerNode(QuantizationInfo out_quant_info);
+ /** Constructor
+ *
+ * @param[in] out_quant_info Output quantization info
+ * @param[in] out_data_type Output data type
+ */
+ QuantizationLayerNode(QuantizationInfo out_quant_info, DataType out_data_type);
+
// Inherited overridden methods:
NodeType type() const override;
bool forward_descriptors() override;
TensorDescriptor configure_output(size_t idx) const override;
- void accept(INodeVisitor &v) override;
+ void accept(INodeVisitor &v) override;
static constexpr NodeType node_type = NodeType::QuantizationLayer;
private:
QuantizationInfo _out_quant_info;
+ DataType _out_data_type;
};
} // namespace graph
} // namespace arm_compute
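A minimal sketch contrasting the original constructor with the new two-argument form, which fixes the output data type explicitly; QASYMM8_SIGNED is an illustrative choice:

    #include "arm_compute/graph/nodes/QuantizationLayerNode.h"

    using namespace arm_compute::graph;

    void quantization_node_sketch()
    {
        // Original form: only the output quantization info is specified.
        QuantizationLayerNode q(QuantizationInfo(0.05f, 10));

        // New form: additionally request a signed 8-bit quantized output.
        QuantizationLayerNode q_signed(QuantizationInfo(0.05f, 10), DataType::QASYMM8_SIGNED);
    }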
diff --git a/arm_compute/graph/nodes/ROIAlignLayerNode.h b/arm_compute/graph/nodes/ROIAlignLayerNode.h
index 445246844b..70309a551c 100644
--- a/arm_compute/graph/nodes/ROIAlignLayerNode.h
+++ b/arm_compute/graph/nodes/ROIAlignLayerNode.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018-2019 ARM Limited.
+ * Copyright (c) 2018-2019 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -56,7 +56,7 @@ public:
NodeType type() const override;
bool forward_descriptors() override;
TensorDescriptor configure_output(size_t idx) const override;
- void accept(INodeVisitor &v) override;
+ void accept(INodeVisitor &v) override;
private:
ROIPoolingLayerInfo _pool_info;
diff --git a/arm_compute/graph/nodes/YOLOLayerNode.h b/arm_compute/graph/nodes/ReductionLayerNode.h
index 22e0cf5495..ff99466c8f 100644
--- a/arm_compute/graph/nodes/YOLOLayerNode.h
+++ b/arm_compute/graph/nodes/ReductionLayerNode.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018-2019 ARM Limited.
+ * Copyright (c) 2020 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -21,8 +21,8 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
-#ifndef ARM_COMPUTE_GRAPH_YOLO_LAYER_NODE_H
-#define ARM_COMPUTE_GRAPH_YOLO_LAYER_NODE_H
+#ifndef ARM_COMPUTE_GRAPH_REDUCTION_LAYER_NODE_H
+#define ARM_COMPUTE_GRAPH_REDUCTION_LAYER_NODE_H
#include "arm_compute/graph/INode.h"
@@ -30,37 +30,39 @@ namespace arm_compute
{
namespace graph
{
-/** YOLO Layer node */
-class YOLOLayerNode final : public INode
+/** Reduction Operation node */
+class ReductionLayerNode final : public INode
{
public:
- /** Constructor
+ /** Default Constructor */
+ ReductionLayerNode(ReductionOperation op, unsigned int axis, bool keep_dims = true);
+ /** op accessor
*
- * @param[in] act_info Activation info
- * @param[in] num_classes Number of classes to activate
+ * @return op
*/
- YOLOLayerNode(ActivationLayerInfo act_info, int32_t num_classes);
- /** Activation metadata accessor
+ ReductionOperation op() const;
+ /** axis accessor
*
- * @return The activation info of the layer
+ * @return axis
*/
- ActivationLayerInfo activation_info() const;
- /** Number of classes metadata accessor
+ unsigned int axis() const;
+ /** keep_dims accessor
*
- * @return The number of classes to activate of the layer
+ * @return keep_dims
*/
- int32_t num_classes() const;
+ bool keep_dims() const;
// Inherited overridden methods:
NodeType type() const override;
bool forward_descriptors() override;
TensorDescriptor configure_output(size_t idx) const override;
- void accept(INodeVisitor &v) override;
+ void accept(INodeVisitor &v) override;
private:
- ActivationLayerInfo _act_info;
- int32_t _num_classes;
+ ReductionOperation _op;
+ unsigned int _axis;
+ bool _keep_dims;
};
} // namespace graph
} // namespace arm_compute
-#endif /* ARM_COMPUTE_GRAPH_YOLO_LAYER_NODE_H */
+#endif /* ARM_COMPUTE_GRAPH_REDUCTION_LAYER_NODE_H */
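A minimal sketch of the node that replaces the YOLO layer here, based on the header above:

    #include "arm_compute/graph/nodes/ReductionLayerNode.h"

    using namespace arm_compute;

    void reduction_node_sketch()
    {
        // Mean reduction along axis 0, dropping the reduced dimension.
        graph::ReductionLayerNode node(ReductionOperation::MEAN_SUM, 0, /* keep_dims */ false);

        bool keep = node.keep_dims(); // false
        (void)keep;
    }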
diff --git a/arm_compute/graph/nodes/ReorgLayerNode.h b/arm_compute/graph/nodes/ReorgLayerNode.h
index 86f62522c3..a3bbcdb00f 100644
--- a/arm_compute/graph/nodes/ReorgLayerNode.h
+++ b/arm_compute/graph/nodes/ReorgLayerNode.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018-2019 ARM Limited.
+ * Copyright (c) 2018-2019 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -57,7 +57,7 @@ public:
NodeType type() const override;
bool forward_descriptors() override;
TensorDescriptor configure_output(size_t idx) const override;
- void accept(INodeVisitor &v) override;
+ void accept(INodeVisitor &v) override;
private:
int _stride;
diff --git a/arm_compute/graph/nodes/ReshapeLayerNode.h b/arm_compute/graph/nodes/ReshapeLayerNode.h
index 57d399ac2f..992275c2b1 100644
--- a/arm_compute/graph/nodes/ReshapeLayerNode.h
+++ b/arm_compute/graph/nodes/ReshapeLayerNode.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018-2019 ARM Limited.
+ * Copyright (c) 2018-2019 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -44,7 +44,7 @@ public:
NodeType type() const override;
bool forward_descriptors() override;
TensorDescriptor configure_output(size_t idx) const override;
- void accept(INodeVisitor &v) override;
+ void accept(INodeVisitor &v) override;
private:
TensorShape _shape;
diff --git a/arm_compute/graph/nodes/ResizeLayerNode.h b/arm_compute/graph/nodes/ResizeLayerNode.h
index 93b449502c..480d6e517f 100644
--- a/arm_compute/graph/nodes/ResizeLayerNode.h
+++ b/arm_compute/graph/nodes/ResizeLayerNode.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018-2019 ARM Limited.
+ * Copyright (c) 2018-2019 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -51,7 +51,7 @@ public:
NodeType type() const override;
bool forward_descriptors() override;
TensorDescriptor configure_output(size_t idx) const override;
- void accept(INodeVisitor &v) override;
+ void accept(INodeVisitor &v) override;
private:
InterpolationPolicy _policy;
diff --git a/arm_compute/graph/nodes/SliceLayerNode.h b/arm_compute/graph/nodes/SliceLayerNode.h
index 7f3173e183..63f266b217 100644
--- a/arm_compute/graph/nodes/SliceLayerNode.h
+++ b/arm_compute/graph/nodes/SliceLayerNode.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018-2019 ARM Limited.
+ * Copyright (c) 2018-2020 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -41,7 +41,7 @@ public:
* @param[in] starts The starts of the dimensions of the input tensor to be sliced. The length must match the rank of the input.
* @param[in] ends The ends of the dimensions of the input tensor to be sliced. The length must match the rank of the input.
*/
- SliceLayerNode(Coordinates &starts, Coordinates &ends);
+ SliceLayerNode(const Coordinates &starts, const Coordinates &ends);
/** Computes slice layer output descriptor
*
* @param[in] input_descriptor Descriptor of the input tensor
@@ -51,7 +51,8 @@ public:
* @return Output descriptor
*/
static TensorDescriptor compute_output_descriptor(const TensorDescriptor &input_descriptor,
- const Coordinates &starts, const Coordinates &ends);
+ const Coordinates &starts,
+ const Coordinates &ends);
/** Start coordinates accessor
*
* @return Start coordinates of the dimensions
@@ -67,7 +68,7 @@ public:
NodeType type() const override;
bool forward_descriptors() override;
TensorDescriptor configure_output(size_t idx) const override;
- void accept(INodeVisitor &v) override;
+ void accept(INodeVisitor &v) override;
private:
Coordinates _starts;
diff --git a/arm_compute/graph/nodes/SoftmaxLayerNode.h b/arm_compute/graph/nodes/SoftmaxLayerNode.h
index cbcd06a477..2cb1ac2cf4 100644
--- a/arm_compute/graph/nodes/SoftmaxLayerNode.h
+++ b/arm_compute/graph/nodes/SoftmaxLayerNode.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018-2019 ARM Limited.
+ * Copyright (c) 2018-2019 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -49,7 +49,7 @@ public:
NodeType type() const override;
bool forward_descriptors() override;
TensorDescriptor configure_output(size_t idx) const override;
- void accept(INodeVisitor &v) override;
+ void accept(INodeVisitor &v) override;
public:
static constexpr NodeType node_type = NodeType::SoftmaxLayer;
diff --git a/arm_compute/graph/nodes/SplitLayerNode.h b/arm_compute/graph/nodes/SplitLayerNode.h
index 345260ab84..5e6df53c0f 100644
--- a/arm_compute/graph/nodes/SplitLayerNode.h
+++ b/arm_compute/graph/nodes/SplitLayerNode.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018-2019 ARM Limited.
+ * Copyright (c) 2018-2020 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -38,10 +38,13 @@ class SplitLayerNode final : public INode
public:
/** Default Constructor
*
- * @param[in] num_splits Number of splits
- * @param[in] axis (Optional) Axis to split on. Supported axis >= 2. Defaults to 0
+ * @param[in] num_splits Number of splits
+ * @param[in] axis (Optional) Axis to split on. Defaults to 0
+ * @param[in] size_splits (Optional) The sizes of each output tensor along the split dimension.
+ * Must sum to the size of the input tensor along the split axis.
+ * Can contain one -1 indicating that dimension is to be inferred.
*/
- SplitLayerNode(unsigned int num_splits, unsigned int axis = 0);
+ SplitLayerNode(unsigned int num_splits, int axis = 0, std::vector<int> size_splits = std::vector<int>());
/** Computes split layer output descriptor
*
* @param[in] input_descriptor Descriptor of the input tensor
@@ -51,8 +54,10 @@ public:
*
* @return A pair with the descriptor of the split and the starting coordinates
*/
- static std::pair<TensorDescriptor, Coordinates> compute_output_descriptor(const TensorDescriptor &input_descriptor,
- unsigned int num_splits, unsigned int axis, unsigned int idx);
+ std::pair<TensorDescriptor, Coordinates> compute_output_descriptor(const TensorDescriptor &input_descriptor,
+ unsigned int num_splits,
+ int axis,
+ unsigned int idx);
/** Number of splits accessor
*
* @return Number of splits
@@ -69,11 +74,12 @@ public:
NodeType type() const override;
bool forward_descriptors() override;
TensorDescriptor configure_output(size_t idx) const override;
- void accept(INodeVisitor &v) override;
+ void accept(INodeVisitor &v) override;
private:
- unsigned int _num_splits;
- unsigned int _axis;
+ unsigned int _num_splits;
+ int _axis;
+ std::vector<int> _size_splits;
};
} // namespace graph
} // namespace arm_compute
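A minimal sketch of the extended constructor, based on the header above; the -1 entry asks the node to infer that output's size along the split axis:

    #include "arm_compute/graph/nodes/SplitLayerNode.h"

    #include <vector>

    using namespace arm_compute::graph;

    void split_node_sketch()
    {
        // Two splits along axis 0: the first output takes 20 elements,
        // the -1 entry is inferred from the input size.
        SplitLayerNode node(2, 0, std::vector<int>{20, -1});
    }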
diff --git a/arm_compute/graph/nodes/StackLayerNode.h b/arm_compute/graph/nodes/StackLayerNode.h
index 52632f5577..9f0767c9f2 100644
--- a/arm_compute/graph/nodes/StackLayerNode.h
+++ b/arm_compute/graph/nodes/StackLayerNode.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019 ARM Limited.
+ * Copyright (c) 2019 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -58,7 +58,7 @@ public:
NodeType type() const override;
bool forward_descriptors() override;
TensorDescriptor configure_output(size_t idx) const override;
- void accept(INodeVisitor &v) override;
+ void accept(INodeVisitor &v) override;
private:
unsigned int _total_nodes;
diff --git a/arm_compute/graph/nodes/StridedSliceLayerNode.h b/arm_compute/graph/nodes/StridedSliceLayerNode.h
new file mode 100644
index 0000000000..f521feb780
--- /dev/null
+++ b/arm_compute/graph/nodes/StridedSliceLayerNode.h
@@ -0,0 +1,97 @@
+/*
+ * Copyright (c) 2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_GRAPH_STRIDED_SLICE_LAYER_NODE_H
+#define ARM_COMPUTE_GRAPH_STRIDED_SLICE_LAYER_NODE_H
+
+#include "arm_compute/graph/INode.h"
+
+#include <tuple>
+
+namespace arm_compute
+{
+namespace graph
+{
+/** Slice Layer node */
+class StridedSliceLayerNode final : public INode
+{
+public:
+ /** Default Constructor
+ *
+ * @param[in] starts The starts of the dimensions of the input tensor to be sliced. The length must be equal to rank(input).
+ * @param[in] ends The ends of the dimensions of the input tensor to be sliced. The length must be equal to rank(input).
+ * @param[in] strides The strides of the dimensions of the input tensor to be sliced. The length must be equal to rank(input).
+ * @param[in] strided_slice_info Contains masks for the starts, ends and strides
+ */
+ StridedSliceLayerNode(const Coordinates &starts,
+ const Coordinates &ends,
+ const BiStrides &strides,
+ StridedSliceLayerInfo strided_slice_info);
+ /** Computes slice layer output descriptor
+ *
+ * @param[in] input_descriptor Descriptor of the input tensor
+ * @param[in] starts The starts of the dimensions of the input tensor to be sliced. The length must be equal to rank(input).
+ * @param[in] ends The ends of the dimensions of the input tensor to be sliced. The length must be equal to rank(input).
+ * @param[in] strides The strides of the dimensions of the input tensor to be sliced. The length must be equal to rank(input).
+ * @param[in] info Contains masks for the starts, ends and strides
+ *
+ * @return Output descriptor
+ */
+ static TensorDescriptor compute_output_descriptor(const TensorDescriptor &input_descriptor,
+ const Coordinates &starts,
+ const Coordinates &ends,
+ const BiStrides &strides,
+ StridedSliceLayerInfo info);
+ /** Start coordinates accessor
+ *
+ * @return Start coordinates of the dimensions
+ */
+ Coordinates starts() const;
+ /** End coordinates accessor
+ *
+ * @return End coordinates of the dimensions
+ */
+ Coordinates ends() const;
+ /** Strides vector accessor
+ *
+ * @return Strides of the dimensions
+ */
+ BiStrides strides() const;
+
+ StridedSliceLayerInfo strided_slice_info() const;
+
+ // Inherited overridden methods:
+ NodeType type() const override;
+ bool forward_descriptors() override;
+ TensorDescriptor configure_output(size_t idx) const override;
+ void accept(INodeVisitor &v) override;
+
+private:
+ Coordinates _starts;
+ Coordinates _ends;
+ BiStrides _strides;
+ StridedSliceLayerInfo _info;
+};
+} // namespace graph
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_GRAPH_STRIDED_SLICE_LAYER_NODE_H */
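Editor's note: a short sketch of the static helper declared above, with placeholder start/end/stride values; the StridedSliceLayerInfo mask arguments are assumed to follow the usual begin/end/shrink-axis form from arm_compute Types.

    #include "arm_compute/graph/nodes/StridedSliceLayerNode.h"

    using namespace arm_compute;
    using namespace arm_compute::graph;

    TensorDescriptor strided_slice_sketch(const TensorDescriptor &input_descriptor)
    {
        Coordinates starts(0, 0);
        Coordinates ends(8, 8);
        BiStrides   strides(2, 2);
        StridedSliceLayerInfo info(0 /* begin_mask */, 0 /* end_mask */, 0 /* shrink_axis_mask */);

        // Static helper: no node instance is needed to derive the output shape.
        return StridedSliceLayerNode::compute_output_descriptor(input_descriptor, starts, ends, strides, info);
    }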
diff --git a/arm_compute/graph/printers/DotGraphPrinter.h b/arm_compute/graph/printers/DotGraphPrinter.h
index c763cb1a96..6638033044 100644
--- a/arm_compute/graph/printers/DotGraphPrinter.h
+++ b/arm_compute/graph/printers/DotGraphPrinter.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018-2019 ARM Limited.
+ * Copyright (c) 2018-2019,2021,2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -21,11 +21,10 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
-#ifndef ARM_COMPUTE_GRAPH_DOTGRAPHPRINTER_H
-#define ARM_COMPUTE_GRAPH_DOTGRAPHPRINTER_H
+#ifndef ACL_ARM_COMPUTE_GRAPH_PRINTERS_DOTGRAPHPRINTER_H
+#define ACL_ARM_COMPUTE_GRAPH_PRINTERS_DOTGRAPHPRINTER_H
#include "arm_compute/graph/IGraphPrinter.h"
-
#include "arm_compute/graph/INodeVisitor.h"
#include <string>
@@ -60,7 +59,7 @@ public:
void visit(FusedDepthwiseConvolutionBatchNormalizationNode &n) override;
void visit(NormalizationLayerNode &n) override;
void visit(PoolingLayerNode &n) override;
- void default_visit() override;
+ void default_visit(INode &n) override;
private:
std::string _info{};
@@ -104,4 +103,4 @@ private:
};
} // namespace graph
} // namespace arm_compute
-#endif /* ARM_COMPUTE_GRAPH_DOTGRAPHPRINTER_H */
+#endif // ACL_ARM_COMPUTE_GRAPH_PRINTERS_DOTGRAPHPRINTER_H
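Editor's note: default_visit now receives the visited node, so fallback handlers can inspect it. A hedged sketch of a custom visitor relying on the new signature (it assumes DefaultNodeVisitor from INodeVisitor.h as the base, as DotGraphVisitor uses):

    #include "arm_compute/graph/INode.h"
    #include "arm_compute/graph/INodeVisitor.h"

    #include <iostream>

    class NamePrintingVisitor final : public arm_compute::graph::DefaultNodeVisitor
    {
    public:
        // Fallback for node types without a dedicated visit() overload.
        void default_visit(arm_compute::graph::INode &n) override
        {
            std::cout << n.name() << std::endl;
        }
    };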
diff --git a/arm_compute/graph/printers/Printers.h b/arm_compute/graph/printers/Printers.h
index 631b63428e..81ecce7227 100644
--- a/arm_compute/graph/printers/Printers.h
+++ b/arm_compute/graph/printers/Printers.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018-2019 ARM Limited.
+ * Copyright (c) 2018-2019 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
diff --git a/arm_compute/runtime/Allocator.h b/arm_compute/runtime/Allocator.h
index db8818df82..e99ddb3dac 100644
--- a/arm_compute/runtime/Allocator.h
+++ b/arm_compute/runtime/Allocator.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2019 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -25,7 +25,6 @@
#define ARM_COMPUTE_ALLOCATOR_H
#include "arm_compute/runtime/IAllocator.h"
-
#include "arm_compute/runtime/IMemoryRegion.h"
#include <cstddef>
@@ -40,9 +39,9 @@ public:
Allocator() = default;
// Inherited methods overridden:
- void *allocate(size_t size, size_t alignment) override;
- void free(void *ptr) override;
+ void *allocate(size_t size, size_t alignment) override;
+ void free(void *ptr) override;
std::unique_ptr<IMemoryRegion> make_region(size_t size, size_t alignment) override;
};
-} // arm_compute
+} // namespace arm_compute
#endif /*ARM_COMPUTE_ALLOCATOR_H */
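Editor's note: the hunk above is realignment only; for completeness, a minimal sketch of the three overridden methods of the CPU Allocator:

    #include "arm_compute/runtime/Allocator.h"

    #include <memory>

    void allocator_sketch()
    {
        arm_compute::Allocator alloc;

        // Raw allocation with 64-byte alignment, released explicitly.
        void *ptr = alloc.allocate(1024, 64);
        alloc.free(ptr);

        // Alternatively, wrap the allocation in an owning memory region.
        std::unique_ptr<arm_compute::IMemoryRegion> region = alloc.make_region(1024, 64);
    }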
diff --git a/arm_compute/runtime/Array.h b/arm_compute/runtime/Array.h
index bbf23955d4..9283273317 100644
--- a/arm_compute/runtime/Array.h
+++ b/arm_compute/runtime/Array.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2016-2020 ARM Limited.
+ * Copyright (c) 2016-2021 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -26,7 +26,6 @@
#include "arm_compute/core/IArray.h"
#include "arm_compute/core/Types.h"
-#include "support/MemorySupport.h"
#include <memory>
@@ -38,16 +37,14 @@ class Array : public IArray<T>
{
public:
/** Default constructor: empty array */
- Array()
- : IArray<T>(0), _values(nullptr)
+ Array() : IArray<T>(0), _values(nullptr)
{
}
/** Constructor: initializes an array which can contain up to max_num_values values
 *
 * @param[in] max_num_values Maximum number of values the array will be able to store
*/
- Array(size_t max_num_values)
- : IArray<T>(max_num_values), _values(arm_compute::support::cpp14::make_unique<T[]>(max_num_values))
+ Array(size_t max_num_values) : IArray<T>(max_num_values), _values(std::make_unique<T[]>(max_num_values))
{
}
@@ -61,14 +58,6 @@ private:
std::unique_ptr<T[]> _values;
};
-/** Array of Key Points. */
-using KeyPointArray = Array<KeyPoint>;
-/** Array of 2D Coordinates. */
-using Coordinates2DArray = Array<Coordinates2D>;
-/** Array of Detection Windows. */
-using DetectionWindowArray = Array<DetectionWindow>;
-/** Array of 2D Sizes. */
-using Size2DArray = Array<Size2D>;
/** Array of uint8s. */
using UInt8Array = Array<uint8_t>;
/** Array of uint16s. */
@@ -81,5 +70,5 @@ using Int16Array = Array<int16_t>;
using Int32Array = Array<int32_t>;
/** Array of floats. */
using FloatArray = Array<float>;
-}
+} // namespace arm_compute
#endif /* ARM_COMPUTE_ARRAY_H */
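Editor's note: behaviour is unchanged by the switch to std::make_unique; a usage sketch with one of the remaining aliases (push_back and at come from IArray, which this patch does not touch):

    #include "arm_compute/runtime/Array.h"

    void array_sketch()
    {
        arm_compute::FloatArray values(8); // capacity for up to 8 floats
        values.push_back(1.0f);
        values.push_back(2.5f);
        const float first = values.at(0);
        (void)first;
    }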
diff --git a/arm_compute/runtime/BlobLifetimeManager.h b/arm_compute/runtime/BlobLifetimeManager.h
index 7ef93eb8f8..18ffe96ee5 100644
--- a/arm_compute/runtime/BlobLifetimeManager.h
+++ b/arm_compute/runtime/BlobLifetimeManager.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2019 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -25,7 +25,6 @@
#define ARM_COMPUTE_BLOBLIFETIMEMANAGER_H
#include "arm_compute/runtime/ISimpleLifetimeManager.h"
-
#include "arm_compute/runtime/Types.h"
#include <memory>
@@ -62,7 +61,7 @@ public:
// Inherited methods overridden:
std::unique_ptr<IMemoryPool> create_pool(IAllocator *allocator) override;
- MappingType mapping_type() const override;
+ MappingType mapping_type() const override;
private:
// Inherited methods overridden:
diff --git a/arm_compute/runtime/BlobMemoryPool.h b/arm_compute/runtime/BlobMemoryPool.h
index 755694d4ef..b25efc3821 100644
--- a/arm_compute/runtime/BlobMemoryPool.h
+++ b/arm_compute/runtime/BlobMemoryPool.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2019 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -25,7 +25,6 @@
#define ARM_COMPUTE_BLOBMEMORYPOOL_H
#include "arm_compute/runtime/IMemoryPool.h"
-
#include "arm_compute/runtime/IMemoryRegion.h"
#include "arm_compute/runtime/Types.h"
@@ -62,8 +61,8 @@ public:
BlobMemoryPool &operator=(BlobMemoryPool &&) = default;
// Inherited methods overridden:
- void acquire(MemoryMappings &handles) override;
- void release(MemoryMappings &handles) override;
+ void acquire(MemoryMappings &handles) override;
+ void release(MemoryMappings &handles) override;
MappingType mapping_type() const override;
std::unique_ptr<IMemoryPool> duplicate() override;
diff --git a/arm_compute/runtime/CL/CLArray.h b/arm_compute/runtime/CL/CLArray.h
index 62b61719d4..6e81a46a29 100644
--- a/arm_compute/runtime/CL/CLArray.h
+++ b/arm_compute/runtime/CL/CLArray.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2016-2019 ARM Limited.
+ * Copyright (c) 2016-2021 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -38,8 +38,7 @@ class CLArray : public ICLArray<T>
{
public:
/** Default constructor: empty array */
- CLArray()
- : ICLArray<T>(0), _buffer()
+ CLArray() : ICLArray<T>(0), _buffer()
{
}
/** Prevent instances of this class from being copied (As this class contains pointers) */
@@ -55,7 +54,8 @@ public:
 * @param[in] max_num_values Maximum number of values the array will be able to store
*/
CLArray(size_t max_num_values)
- : ICLArray<T>(max_num_values), _buffer(CLScheduler::get().context(), CL_MEM_ALLOC_HOST_PTR | CL_MEM_READ_WRITE, max_num_values * sizeof(T))
+ : ICLArray<T>(max_num_values),
+ _buffer(CLScheduler::get().context(), CL_MEM_ALLOC_HOST_PTR | CL_MEM_READ_WRITE, max_num_values * sizeof(T))
{
}
/** Enqueue a map operation of the allocated buffer.
@@ -91,7 +91,8 @@ protected:
uint8_t *do_map(cl::CommandQueue &q, bool blocking) override
{
ARM_COMPUTE_ERROR_ON(nullptr == _buffer.get());
- return static_cast<uint8_t *>(q.enqueueMapBuffer(_buffer, blocking ? CL_TRUE : CL_FALSE, CL_MAP_READ | CL_MAP_WRITE, 0, this->max_num_values() * sizeof(T)));
+ return static_cast<uint8_t *>(q.enqueueMapBuffer(
+ _buffer, blocking ? CL_TRUE : CL_FALSE, CL_MAP_READ | CL_MAP_WRITE, 0, this->max_num_values() * sizeof(T)));
}
void do_unmap(cl::CommandQueue &q, uint8_t *mapping) override
{
@@ -102,15 +103,6 @@ protected:
private:
cl::Buffer _buffer;
};
-
-/** OpenCL Array of Key Points. */
-using CLKeyPointArray = CLArray<KeyPoint>;
-/** OpenCL Array of 2D Coordinates. */
-using CLCoordinates2DArray = CLArray<Coordinates2D>;
-/** OpenCL Array of Detection Windows. */
-using CLDetectionWindowArray = CLArray<DetectionWindow>;
-/** OpenCL Array of 2D Sizes. */
-using CLSize2DArray = CLArray<Size2D>;
/** OpenCL Array of uint8s. */
using CLUInt8Array = CLArray<cl_uchar>;
/** OpenCL Array of uint16s. */
@@ -123,5 +115,5 @@ using CLInt16Array = CLArray<cl_short>;
using CLInt32Array = CLArray<cl_int>;
/** OpenCL Array of floats. */
using CLFloatArray = CLArray<cl_float>;
-}
+} // namespace arm_compute
#endif /* ARM_COMPUTE_CLARRAY_H */
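Editor's note: a sketch of the map/unmap flow that the do_map override above serves. It assumes CLScheduler::get().default_init() has been called so a context and queue exist; host-side access is only valid between map() and unmap().

    #include "arm_compute/runtime/CL/CLArray.h"
    #include "arm_compute/runtime/CL/CLScheduler.h"

    void cl_array_sketch()
    {
        arm_compute::CLScheduler::get().default_init();

        arm_compute::CLFloatArray values(16);
        values.map(true); // blocking map: the buffer is host-accessible afterwards
        values.push_back(3.14f);
        values.unmap(); // caller must flush/wait before the device touches the data again
    }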
diff --git a/arm_compute/runtime/CL/CLBufferAllocator.h b/arm_compute/runtime/CL/CLBufferAllocator.h
index f6e1b96caa..00ff017012 100644
--- a/arm_compute/runtime/CL/CLBufferAllocator.h
+++ b/arm_compute/runtime/CL/CLBufferAllocator.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2021 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -30,29 +30,14 @@
namespace arm_compute
{
-class CLCoreRuntimeContext;
/** Default OpenCL cl buffer allocator implementation */
class CLBufferAllocator final : public IAllocator
{
public:
- /** Default constructor
- *
- * @param[in] ctx A runtime context.
- */
- CLBufferAllocator(CLCoreRuntimeContext *ctx = nullptr);
-
- /** Default copy constructor */
- CLBufferAllocator(const CLBufferAllocator &) = default;
- /** Default copy assignment operator */
- CLBufferAllocator &operator=(const CLBufferAllocator &) = default;
-
// Inherited methods overridden:
- void *allocate(size_t size, size_t alignment) override;
- void free(void *ptr) override;
+ void *allocate(size_t size, size_t alignment) override;
+ void free(void *ptr) override;
std::unique_ptr<IMemoryRegion> make_region(size_t size, size_t alignment) override;
-
-private:
- CLCoreRuntimeContext *_ctx;
};
-} // arm_compute
+} // namespace arm_compute
#endif /*ARM_COMPUTE_CLBUFFERALLOCATOR_H */
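Editor's note: with the CLCoreRuntimeContext parameter removed, the allocator is default-constructed; presumably it now sources its OpenCL context globally, so the sketch below initialises the scheduler first (an assumption, not stated in the patch):

    #include "arm_compute/runtime/CL/CLBufferAllocator.h"
    #include "arm_compute/runtime/CL/CLScheduler.h"

    void cl_buffer_allocator_sketch()
    {
        arm_compute::CLScheduler::get().default_init();

        arm_compute::CLBufferAllocator alloc; // no runtime-context argument any more
        void *ptr = alloc.allocate(4096, 0 /* alignment: implementation default */);
        alloc.free(ptr);
    }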
diff --git a/arm_compute/runtime/CL/CLDistribution1D.h b/arm_compute/runtime/CL/CLDistribution1D.h
deleted file mode 100644
index 7d832e99f6..0000000000
--- a/arm_compute/runtime/CL/CLDistribution1D.h
+++ /dev/null
@@ -1,79 +0,0 @@
-/*
- * Copyright (c) 2016-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CLDISTRIBUTION1D_H
-#define ARM_COMPUTE_CLDISTRIBUTION1D_H
-
-#include "arm_compute/core/CL/ICLDistribution1D.h"
-#include "arm_compute/core/CL/OpenCL.h"
-
-#include <cstddef>
-#include <cstdint>
-
-namespace arm_compute
-{
-/** CLDistribution1D object class */
-class CLDistribution1D : public ICLDistribution1D
-{
-public:
- /** Constructor: Creates a 1D CLDistribution of a consecutive interval [offset, offset + range - 1]
- * defined by a start offset and valid range, divided equally into num_bins parts.
- *
- * @param[in] num_bins The number of bins the distribution is divided in.
- * @param[in] offset The start of the values to use.
- * @param[in] range The total number of the consecutive values of the distribution interval.
- */
- CLDistribution1D(size_t num_bins, int32_t offset, uint32_t range);
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- CLDistribution1D(const CLDistribution1D &) = delete;
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- CLDistribution1D &operator=(const CLDistribution1D &) = delete;
- /** Enqueue a map operation of the allocated buffer.
- *
- * @param[in] blocking If true, then the mapping will be ready to use by the time
- * this method returns, else it is the caller's responsibility
- * to flush the queue and wait for the mapping operation to have completed.
- */
- void map(bool blocking = true);
- using ICLDistribution1D::map;
- /** Enqueue an unmap operation of the allocated and mapped buffer.
- *
- * @note This method simply enqueues the unmap operation, it is the caller's responsibility to flush the queue and make sure the unmap is finished before
- * the memory is accessed by the device.
- */
- void unmap();
- using ICLDistribution1D::unmap;
-
- // Inherited methods overridden:
- cl::Buffer &cl_buffer() override;
-
-protected:
- // Inherited methods overridden:
- uint32_t *do_map(cl::CommandQueue &q, bool blocking) override;
- void do_unmap(cl::CommandQueue &q) override;
-
-private:
- cl::Buffer _mem;
-};
-}
-#endif /* ARM_COMPUTE_CLDISTRIBUTION1D_H */
diff --git a/arm_compute/runtime/CL/CLFunctions.h b/arm_compute/runtime/CL/CLFunctions.h
index 007a40c651..a09ca551d2 100644
--- a/arm_compute/runtime/CL/CLFunctions.h
+++ b/arm_compute/runtime/CL/CLFunctions.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2016-2020 ARM Limited.
+ * Copyright (c) 2016-2024 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -21,12 +21,10 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
-#ifndef ARM_COMPUTE_CLFUNCTIONS_H
-#define ARM_COMPUTE_CLFUNCTIONS_H
+#ifndef ACL_ARM_COMPUTE_RUNTIME_CL_CLFUNCTIONS_H
+#define ACL_ARM_COMPUTE_RUNTIME_CL_CLFUNCTIONS_H
/* Header regrouping all the CL functions */
-#include "arm_compute/runtime/CL/functions/CLAbsoluteDifference.h"
-#include "arm_compute/runtime/CL/functions/CLAccumulate.h"
#include "arm_compute/runtime/CL/functions/CLActivationLayer.h"
#include "arm_compute/runtime/CL/functions/CLArgMinMaxLayer.h"
#include "arm_compute/runtime/CL/functions/CLBatchNormalizationLayer.h"
@@ -36,20 +34,15 @@
#include "arm_compute/runtime/CL/functions/CLBitwiseOr.h"
#include "arm_compute/runtime/CL/functions/CLBitwiseXor.h"
#include "arm_compute/runtime/CL/functions/CLBoundingBoxTransform.h"
-#include "arm_compute/runtime/CL/functions/CLBox3x3.h"
-#include "arm_compute/runtime/CL/functions/CLCannyEdge.h"
#include "arm_compute/runtime/CL/functions/CLCast.h"
-#include "arm_compute/runtime/CL/functions/CLChannelCombine.h"
-#include "arm_compute/runtime/CL/functions/CLChannelExtract.h"
#include "arm_compute/runtime/CL/functions/CLChannelShuffleLayer.h"
-#include "arm_compute/runtime/CL/functions/CLColorConvert.h"
#include "arm_compute/runtime/CL/functions/CLComparison.h"
-#include "arm_compute/runtime/CL/functions/CLComputeAllAnchors.h"
#include "arm_compute/runtime/CL/functions/CLConcatenateLayer.h"
+#include "arm_compute/runtime/CL/functions/CLConv3D.h"
#include "arm_compute/runtime/CL/functions/CLConvertFullyConnectedWeights.h"
-#include "arm_compute/runtime/CL/functions/CLConvolution.h"
#include "arm_compute/runtime/CL/functions/CLConvolutionLayer.h"
#include "arm_compute/runtime/CL/functions/CLCopy.h"
+#include "arm_compute/runtime/CL/functions/CLCrop.h"
#include "arm_compute/runtime/CL/functions/CLCropResize.h"
#include "arm_compute/runtime/CL/functions/CLDeconvolutionLayer.h"
#include "arm_compute/runtime/CL/functions/CLDeconvolutionLayerUpsample.h"
@@ -57,100 +50,69 @@
#include "arm_compute/runtime/CL/functions/CLDepthToSpaceLayer.h"
#include "arm_compute/runtime/CL/functions/CLDepthwiseConvolutionLayer.h"
#include "arm_compute/runtime/CL/functions/CLDequantizationLayer.h"
-#include "arm_compute/runtime/CL/functions/CLDerivative.h"
-#include "arm_compute/runtime/CL/functions/CLDilate.h"
#include "arm_compute/runtime/CL/functions/CLDirectConvolutionLayer.h"
#include "arm_compute/runtime/CL/functions/CLDirectDeconvolutionLayer.h"
-#include "arm_compute/runtime/CL/functions/CLElementWiseUnaryLayer.h"
#include "arm_compute/runtime/CL/functions/CLElementwiseOperations.h"
-#include "arm_compute/runtime/CL/functions/CLEqualizeHistogram.h"
-#include "arm_compute/runtime/CL/functions/CLErode.h"
+#include "arm_compute/runtime/CL/functions/CLElementwiseUnaryLayer.h"
#include "arm_compute/runtime/CL/functions/CLFFT1D.h"
#include "arm_compute/runtime/CL/functions/CLFFT2D.h"
#include "arm_compute/runtime/CL/functions/CLFFTConvolutionLayer.h"
-#include "arm_compute/runtime/CL/functions/CLFastCorners.h"
#include "arm_compute/runtime/CL/functions/CLFill.h"
-#include "arm_compute/runtime/CL/functions/CLFillBorder.h"
#include "arm_compute/runtime/CL/functions/CLFlattenLayer.h"
#include "arm_compute/runtime/CL/functions/CLFloor.h"
#include "arm_compute/runtime/CL/functions/CLFullyConnectedLayer.h"
#include "arm_compute/runtime/CL/functions/CLFuseBatchNormalization.h"
+#include "arm_compute/runtime/CL/functions/CLGather.h"
#include "arm_compute/runtime/CL/functions/CLGEMM.h"
#include "arm_compute/runtime/CL/functions/CLGEMMConvolutionLayer.h"
#include "arm_compute/runtime/CL/functions/CLGEMMDeconvolutionLayer.h"
#include "arm_compute/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.h"
#include "arm_compute/runtime/CL/functions/CLGEMMLowpOutputStage.h"
-#include "arm_compute/runtime/CL/functions/CLGather.h"
-#include "arm_compute/runtime/CL/functions/CLGaussian3x3.h"
-#include "arm_compute/runtime/CL/functions/CLGaussian5x5.h"
-#include "arm_compute/runtime/CL/functions/CLGaussianPyramid.h"
#include "arm_compute/runtime/CL/functions/CLGenerateProposalsLayer.h"
-#include "arm_compute/runtime/CL/functions/CLHOGDescriptor.h"
-#include "arm_compute/runtime/CL/functions/CLHOGDetector.h"
-#include "arm_compute/runtime/CL/functions/CLHOGGradient.h"
-#include "arm_compute/runtime/CL/functions/CLHOGMultiDetection.h"
-#include "arm_compute/runtime/CL/functions/CLHarrisCorners.h"
-#include "arm_compute/runtime/CL/functions/CLHistogram.h"
+#include "arm_compute/runtime/CL/functions/CLIndirectConvolutionLayer.h"
#include "arm_compute/runtime/CL/functions/CLInstanceNormalizationLayer.h"
-#include "arm_compute/runtime/CL/functions/CLIntegralImage.h"
#include "arm_compute/runtime/CL/functions/CLL2NormalizeLayer.h"
+#include "arm_compute/runtime/CL/functions/CLLogicalAnd.h"
+#include "arm_compute/runtime/CL/functions/CLLogicalNot.h"
+#include "arm_compute/runtime/CL/functions/CLLogicalOr.h"
#include "arm_compute/runtime/CL/functions/CLLSTMLayer.h"
#include "arm_compute/runtime/CL/functions/CLLSTMLayerQuantized.h"
-#include "arm_compute/runtime/CL/functions/CLLaplacianPyramid.h"
-#include "arm_compute/runtime/CL/functions/CLLaplacianReconstruct.h"
-#include "arm_compute/runtime/CL/functions/CLLocallyConnectedLayer.h"
-#include "arm_compute/runtime/CL/functions/CLMagnitude.h"
-#include "arm_compute/runtime/CL/functions/CLMeanStdDev.h"
+#include "arm_compute/runtime/CL/functions/CLMatMul.h"
+#include "arm_compute/runtime/CL/functions/CLMaxUnpoolingLayer.h"
#include "arm_compute/runtime/CL/functions/CLMeanStdDevNormalizationLayer.h"
-#include "arm_compute/runtime/CL/functions/CLMedian3x3.h"
-#include "arm_compute/runtime/CL/functions/CLMinMaxLocation.h"
-#include "arm_compute/runtime/CL/functions/CLNonLinearFilter.h"
-#include "arm_compute/runtime/CL/functions/CLNonMaximaSuppression3x3.h"
#include "arm_compute/runtime/CL/functions/CLNormalizationLayer.h"
#include "arm_compute/runtime/CL/functions/CLNormalizePlanarYUVLayer.h"
-#include "arm_compute/runtime/CL/functions/CLOpticalFlow.h"
-#include "arm_compute/runtime/CL/functions/CLPReluLayer.h"
#include "arm_compute/runtime/CL/functions/CLPadLayer.h"
#include "arm_compute/runtime/CL/functions/CLPermute.h"
-#include "arm_compute/runtime/CL/functions/CLPhase.h"
#include "arm_compute/runtime/CL/functions/CLPixelWiseMultiplication.h"
+#include "arm_compute/runtime/CL/functions/CLPooling3dLayer.h"
#include "arm_compute/runtime/CL/functions/CLPoolingLayer.h"
+#include "arm_compute/runtime/CL/functions/CLPReluLayer.h"
#include "arm_compute/runtime/CL/functions/CLPriorBoxLayer.h"
#include "arm_compute/runtime/CL/functions/CLQLSTMLayer.h"
#include "arm_compute/runtime/CL/functions/CLQuantizationLayer.h"
-#include "arm_compute/runtime/CL/functions/CLRNNLayer.h"
-#include "arm_compute/runtime/CL/functions/CLROIAlignLayer.h"
-#include "arm_compute/runtime/CL/functions/CLROIPoolingLayer.h"
#include "arm_compute/runtime/CL/functions/CLRange.h"
#include "arm_compute/runtime/CL/functions/CLReduceMean.h"
#include "arm_compute/runtime/CL/functions/CLReductionOperation.h"
-#include "arm_compute/runtime/CL/functions/CLRemap.h"
#include "arm_compute/runtime/CL/functions/CLReorgLayer.h"
#include "arm_compute/runtime/CL/functions/CLReshapeLayer.h"
#include "arm_compute/runtime/CL/functions/CLReverse.h"
+#include "arm_compute/runtime/CL/functions/CLRNNLayer.h"
+#include "arm_compute/runtime/CL/functions/CLROIAlignLayer.h"
+#include "arm_compute/runtime/CL/functions/CLROIPoolingLayer.h"
#include "arm_compute/runtime/CL/functions/CLScale.h"
-#include "arm_compute/runtime/CL/functions/CLScharr3x3.h"
+#include "arm_compute/runtime/CL/functions/CLScatter.h"
#include "arm_compute/runtime/CL/functions/CLSelect.h"
#include "arm_compute/runtime/CL/functions/CLSlice.h"
-#include "arm_compute/runtime/CL/functions/CLSobel3x3.h"
-#include "arm_compute/runtime/CL/functions/CLSobel5x5.h"
-#include "arm_compute/runtime/CL/functions/CLSobel7x7.h"
#include "arm_compute/runtime/CL/functions/CLSoftmaxLayer.h"
#include "arm_compute/runtime/CL/functions/CLSpaceToBatchLayer.h"
#include "arm_compute/runtime/CL/functions/CLSpaceToDepthLayer.h"
#include "arm_compute/runtime/CL/functions/CLSplit.h"
#include "arm_compute/runtime/CL/functions/CLStackLayer.h"
#include "arm_compute/runtime/CL/functions/CLStridedSlice.h"
-#include "arm_compute/runtime/CL/functions/CLTableLookup.h"
-#include "arm_compute/runtime/CL/functions/CLThreshold.h"
#include "arm_compute/runtime/CL/functions/CLTile.h"
#include "arm_compute/runtime/CL/functions/CLTranspose.h"
#include "arm_compute/runtime/CL/functions/CLUnstack.h"
-#include "arm_compute/runtime/CL/functions/CLUpsampleLayer.h"
-#include "arm_compute/runtime/CL/functions/CLWarpAffine.h"
-#include "arm_compute/runtime/CL/functions/CLWarpPerspective.h"
#include "arm_compute/runtime/CL/functions/CLWinogradConvolutionLayer.h"
-#include "arm_compute/runtime/CL/functions/CLWinogradInputTransform.h"
-#include "arm_compute/runtime/CL/functions/CLYOLOLayer.h"
-#endif /* ARM_COMPUTE_CLFUNCTIONS_H */
+#endif // ACL_ARM_COMPUTE_RUNTIME_CL_CLFUNCTIONS_H
diff --git a/arm_compute/runtime/CL/CLGEMMHeuristicsHandle.h b/arm_compute/runtime/CL/CLGEMMHeuristicsHandle.h
new file mode 100644
index 0000000000..d81c55be10
--- /dev/null
+++ b/arm_compute/runtime/CL/CLGEMMHeuristicsHandle.h
@@ -0,0 +1,72 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_RUNTIME_CL_CL_GEMM_HEURISTICS_HANDLE_H
+#define ARM_COMPUTE_RUNTIME_CL_CL_GEMM_HEURISTICS_HANDLE_H
+
+#include <memory>
+#include <string>
+
+namespace arm_compute
+{
+namespace mlgo
+{
+/** Forward declaration for the underlying heuristics: MLGOHeuristics */
+class MLGOHeuristics;
+} // namespace mlgo
+
+/** Handle for loading and retrieving GEMM heuristics */
+class CLGEMMHeuristicsHandle
+{
+public:
+ /** Constructor */
+ CLGEMMHeuristicsHandle();
+ /** Destructor */
+ ~CLGEMMHeuristicsHandle();
+ /** Prevent Copy Construct */
+ CLGEMMHeuristicsHandle(const CLGEMMHeuristicsHandle &) = delete;
+ /** Prevent Copy Assignment */
+ CLGEMMHeuristicsHandle &operator=(const CLGEMMHeuristicsHandle &) = delete;
+ /** Default Move Constructor */
+ CLGEMMHeuristicsHandle(CLGEMMHeuristicsHandle &&) = default;
+ /** Default Move Assignment */
+ CLGEMMHeuristicsHandle &operator=(CLGEMMHeuristicsHandle &&) = default;
+ /** (Re)Load the heuristics by reading a dotmlgo file
+ *
+ * @param[in] filename Path to the dotmlgo file
+ *
+ * @return bool Signals if the reload succeeded or failed
+ */
+ bool reload_from_file(const std::string &filename);
+ /** Return a pointer to underlying heuristics for querying purposes
+ *
+ * @return MLGOHeuristics*
+ */
+ const mlgo::MLGOHeuristics *get() const;
+
+private:
+ std::unique_ptr<mlgo::MLGOHeuristics> _heuristics; /**< Pointer to underlying heuristics */
+};
+
+} // namespace arm_compute
+
+#endif // ARM_COMPUTE_RUNTIME_CL_CL_GEMM_HEURISTICS_HANDLE_H
\ No newline at end of file
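Editor's note: a minimal sketch of the new handle; "heuristics.mlgo" is a hypothetical path to a dotmlgo file, and per the doc comment reload_from_file only reports success or failure.

    #include "arm_compute/runtime/CL/CLGEMMHeuristicsHandle.h"

    bool load_heuristics_sketch()
    {
        arm_compute::CLGEMMHeuristicsHandle handle;
        return handle.reload_from_file("heuristics.mlgo"); // hypothetical file name
    }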
diff --git a/arm_compute/runtime/CL/CLHOG.h b/arm_compute/runtime/CL/CLHOG.h
deleted file mode 100644
index a96f52a766..0000000000
--- a/arm_compute/runtime/CL/CLHOG.h
+++ /dev/null
@@ -1,80 +0,0 @@
-/*
- * Copyright (c) 2017-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CLHOG_H
-#define ARM_COMPUTE_CLHOG_H
-
-#include "arm_compute/core/CL/ICLHOG.h"
-#include "arm_compute/core/CL/OpenCL.h"
-#include "arm_compute/core/HOGInfo.h"
-#include "arm_compute/core/Types.h"
-
-#include <cstdint>
-
-namespace arm_compute
-{
-/** OpenCL implementation of HOG data-object */
-class CLHOG : public ICLHOG
-{
-public:
- /** Default constructor */
- CLHOG();
- /** Allocate the HOG descriptor using the given HOG's metadata
- *
- * @param[in] input HOG's metadata used to allocate the HOG descriptor
- */
- void init(const HOGInfo &input);
-
- /** Enqueue a map operation of the allocated buffer.
- *
- * @param[in] blocking If true, then the mapping will be ready to use by the time
- * this method returns, else it is the caller's responsibility
- * to flush the queue and wait for the mapping operation to have completed.
- */
- void map(bool blocking = true);
- using ICLHOG::map;
-
- /** Enqueue an unmap operation of the allocated and mapped buffer.
- *
- * @note This method simply enqueues the unmap operation, it is the caller's responsibility to flush the queue and make sure the unmap is finished before
- * the memory is accessed by the device.
- */
- void unmap();
- using ICLHOG::unmap;
-
- // Inherited method overridden:
- void free() override;
- const HOGInfo *info() const override;
- const cl::Buffer &cl_buffer() const override;
-
-protected:
- // Inherited methods overridden:
- uint8_t *do_map(cl::CommandQueue &q, bool blocking) override;
- void do_unmap(cl::CommandQueue &q) override;
-
-private:
- HOGInfo _info;
- cl::Buffer _buffer;
-};
-}
-#endif /* ARM_COMPUTE_CLHOG_H */
diff --git a/arm_compute/runtime/CL/CLHelpers.h b/arm_compute/runtime/CL/CLHelpers.h
index 716d95e7a3..fef2619071 100644
--- a/arm_compute/runtime/CL/CLHelpers.h
+++ b/arm_compute/runtime/CL/CLHelpers.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019 ARM Limited.
+ * Copyright (c) 2019-2021 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -25,6 +25,7 @@
#define ARM_COMPUTE_CL_HELPERS_H
#include "arm_compute/core/CL/OpenCL.h"
+#include "arm_compute/runtime/CL/CLTypes.h"
#include "arm_compute/runtime/IScheduler.h"
namespace arm_compute
@@ -37,11 +38,13 @@ class ICLKernel;
*
* @note In debug builds, the function will automatically enable cl_arm_printf if the driver/device supports it.
*
+ * @param[in] cl_backend_type The OpenCL backend type to use.
+ *
* @return A std::tuple where the first element is the opencl context, the second element is the opencl device
* and the third one an error code. The error code will be CL_SUCCESS upon successful creation, otherwise
* a value telling why the function failed.
*/
-std::tuple<cl::Context, cl::Device, cl_int> create_opencl_context_and_device();
+std::tuple<cl::Context, cl::Device, cl_int> create_opencl_context_and_device(CLBackendType cl_backend_type);
/** Schedules a kernel using the context if not nullptr else uses the legacy scheduling flow.
*
* @param[in] ctx Context to use.
@@ -49,5 +52,13 @@ std::tuple<cl::Context, cl::Device, cl_int> create_opencl_context_and_device();
* @param[in] flush (Optional) Specifies if the command queue will be flushed after running the kernel.
*/
void schedule_kernel_on_ctx(CLRuntimeContext *ctx, ICLKernel *kernel, bool flush = true);
+
+/** This function selects the OpenCL platform based on the backend type.
+ *
+ * @param[in] cl_backend_type The OpenCL backend type to use.
+ *
+ * @return A cl::Platform object.
+ */
+cl::Platform select_preferable_platform(CLBackendType cl_backend_type);
} // namespace arm_compute
#endif /* ARM_COMPUTE_CL_HELPERS_H */
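Editor's note: a sketch wiring the extended helper into scheduler initialisation; CLBackendType::Native matches the defaults used elsewhere in this patch.

    #include "arm_compute/runtime/CL/CLHelpers.h"
    #include "arm_compute/runtime/CL/CLScheduler.h"
    #include "arm_compute/runtime/CL/CLTypes.h"

    #include <tuple>

    void init_from_helper_sketch()
    {
        cl::Context context;
        cl::Device  device;
        cl_int      err = CL_SUCCESS;
        std::tie(context, device, err) =
            arm_compute::create_opencl_context_and_device(arm_compute::CLBackendType::Native);
        if (err == CL_SUCCESS)
        {
            cl::CommandQueue queue(context, device);
            arm_compute::CLScheduler::get().init(context, queue, device);
        }
    }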
diff --git a/arm_compute/runtime/CL/CLLut.h b/arm_compute/runtime/CL/CLLut.h
deleted file mode 100644
index a38928449f..0000000000
--- a/arm_compute/runtime/CL/CLLut.h
+++ /dev/null
@@ -1,89 +0,0 @@
-/*
- * Copyright (c) 2016-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CLLUT_H
-#define ARM_COMPUTE_CLLUT_H
-
-#include "arm_compute/core/CL/ICLLut.h"
-#include "arm_compute/core/CL/OpenCL.h"
-#include "arm_compute/core/Types.h"
-#include "arm_compute/runtime/CL/CLLutAllocator.h"
-
-#include <cstddef>
-#include <cstdint>
-
-namespace arm_compute
-{
-class ILutAllocator;
-
-/** Basic implementation of the OpenCL lut interface */
-class CLLut : public ICLLut
-{
-public:
- /** Constructor */
- CLLut();
- /** Constructor: initializes a LUT which can contain num_values values of data_type type.
- *
- * @param[in] num_elements Number of elements of the LUT.
- * @param[in] data_type Data type of each element.
- */
- CLLut(size_t num_elements, DataType data_type);
- /** Return a pointer to the lut's allocator
- *
- * @return A pointer to the lut's allocator
- */
- ILutAllocator *allocator();
- /** Enqueue a map operation of the allocated buffer.
- *
- * @param[in] blocking If true, then the mapping will be ready to use by the time
- * this method returns, else it is the caller's responsibility
- * to flush the queue and wait for the mapping operation to have completed.
- */
- void map(bool blocking = true);
- using ICLLut::map;
- /** Enqueue an unmap operation of the allocated and mapped buffer.
- *
- * @note This method simply enqueues the unmap operation, it is the caller's responsibility to flush the queue and make sure the unmap is finished before
- * the memory is accessed by the device.
- */
- void unmap();
- using ICLLut::unmap;
-
- // Inherited methods overridden:
- size_t num_elements() const override;
- uint32_t index_offset() const override;
- size_t size_in_bytes() const override;
- DataType type() const override;
- const cl::Buffer &cl_buffer() const override;
- void clear() override;
-
-protected:
- // Inherited methods overridden:
- uint8_t *do_map(cl::CommandQueue &q, bool blocking) override;
- void do_unmap(cl::CommandQueue &q) override;
-
-private:
- CLLutAllocator _allocator; /**< Instance of the OpenCL lut allocator */
-};
-}
-#endif /*ARM_COMPUTE_CLLUT_H */
diff --git a/arm_compute/runtime/CL/CLLutAllocator.h b/arm_compute/runtime/CL/CLLutAllocator.h
deleted file mode 100644
index 50b905f297..0000000000
--- a/arm_compute/runtime/CL/CLLutAllocator.h
+++ /dev/null
@@ -1,94 +0,0 @@
-/*
- * Copyright (c) 2016-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CLLUTALLOCATOR_H
-#define ARM_COMPUTE_CLLUTALLOCATOR_H
-
-#include "arm_compute/runtime/ILutAllocator.h"
-
-#include "arm_compute/core/CL/OpenCL.h"
-
-#include <cstdint>
-
-namespace arm_compute
-{
-/** Basic implementation of a CL memory LUT allocator. */
-class CLLutAllocator : public ILutAllocator
-{
-public:
- /** Default constructor. */
- CLLutAllocator();
- /** Default destructor. */
- ~CLLutAllocator() = default;
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- CLLutAllocator(const CLLutAllocator &) = delete;
- /** Prevent instances of this class from being copy assigned (As this class contains pointers) */
- const CLLutAllocator &operator=(const CLLutAllocator &) = delete;
- /** Interface to be implemented by the child class to return the pointer to the mapped data.
- *
- * @return pointer to the mapped data.
- */
- uint8_t *data();
- /** Interface to be implemented by the child class to return the pointer to the CL data.
- *
- * @return pointer to the CL data.
- */
- const cl::Buffer &cl_data() const;
- /** Enqueue a map operation of the allocated buffer on the given queue.
- *
- * @param[in,out] q The CL command queue to use for the mapping operation.
- * @param[in] blocking If true, then the mapping will be ready to use by the time
- * this method returns, else it is the caller's responsibility
- * to flush the queue and wait for the mapping operation to have completed before using the returned mapping pointer.
- *
- * @return The mapping address.
- */
- uint8_t *map(cl::CommandQueue &q, bool blocking);
- /** Enqueue an unmap operation of the allocated buffer on the given queue.
- *
- * @note This method simply enqueue the unmap operation, it is the caller's responsibility to flush the queue and make sure the unmap is finished before
- * the memory is accessed by the device.
- *
- * @param[in,out] q The CL command queue to use for the mapping operation.
- * @param[in] mapping The cpu mapping to unmap.
- */
- void unmap(cl::CommandQueue &q, uint8_t *mapping);
-
-protected:
- /** Allocate num_elements() * sizeof(type()) of OpenCL memory. */
- void allocate() override;
- /** Call map() on the OpenCL buffer.
- *
- * @return A pointer to the beginning of the LUT's allocation.
- */
- uint8_t *lock() override;
- /** Call unmap() on the OpenCL buffer. */
- void unlock() override;
-
-private:
- cl::Buffer _buffer; /**< OpenCL buffer containing the LUT data. */
- uint8_t *_mapping; /**< Pointer to the CPU mapping of the OpenCL buffer. */
-};
-}
-
-#endif /* ARM_COMPUTE_CLLUTALLOCATOR_H */
diff --git a/arm_compute/runtime/CL/CLMemory.h b/arm_compute/runtime/CL/CLMemory.h
index 5842ddf512..5abe86bd53 100644
--- a/arm_compute/runtime/CL/CLMemory.h
+++ b/arm_compute/runtime/CL/CLMemory.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018-2019 ARM Limited.
+ * Copyright (c) 2018-2019 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -24,10 +24,9 @@
#ifndef ARM_COMPUTE_RUNTIME_CL_CLMEMORY_H
#define ARM_COMPUTE_RUNTIME_CL_CLMEMORY_H
-#include "arm_compute/runtime/IMemory.h"
-
#include "arm_compute/core/CL/OpenCL.h"
#include "arm_compute/runtime/CL/CLMemoryRegion.h"
+#include "arm_compute/runtime/IMemory.h"
#include <cstddef>
#include <memory>
@@ -75,8 +74,8 @@ public:
// Inherited methods overridden:
IMemoryRegion *region() final;
IMemoryRegion *region() const final;
- void set_region(IMemoryRegion *region) final;
- void set_owned_region(std::unique_ptr<IMemoryRegion> region) final;
+ void set_region(IMemoryRegion *region) final;
+ void set_owned_region(std::unique_ptr<IMemoryRegion> region) final;
private:
ICLMemoryRegion *_region;
diff --git a/arm_compute/runtime/CL/CLMemoryRegion.h b/arm_compute/runtime/CL/CLMemoryRegion.h
index 02ccc6b139..365973a9e6 100644
--- a/arm_compute/runtime/CL/CLMemoryRegion.h
+++ b/arm_compute/runtime/CL/CLMemoryRegion.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018-2020 ARM Limited.
+ * Copyright (c) 2018-2021, 2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -31,17 +31,15 @@
namespace arm_compute
{
-class CLCoreRuntimeContext;
/** OpenCL memory region interface */
class ICLMemoryRegion : public IMemoryRegion
{
public:
/** Constructor
*
- * @param[in] ctx Runtime context
* @param[in] size Region size
*/
- ICLMemoryRegion(CLCoreRuntimeContext *ctx, size_t size);
+ ICLMemoryRegion(size_t size);
/** Default Destructor */
virtual ~ICLMemoryRegion() = default;
/** Prevent instances of this class from being copied (As this class contains pointers) */
@@ -87,10 +85,9 @@ public:
std::unique_ptr<IMemoryRegion> extract_subregion(size_t offset, size_t size) override;
protected:
- cl::CommandQueue _queue;
- cl::Context _ctx;
- void *_mapping;
- cl::Buffer _mem;
+ cl::Context _ctx;
+ void *_mapping;
+ cl::Buffer _mem;
};
/** OpenCL buffer memory region implementation */
@@ -99,22 +96,21 @@ class CLBufferMemoryRegion final : public ICLMemoryRegion
public:
/** Constructor
*
- * @param[in] ctx Runtime context
* @param[in] flags Memory flags
* @param[in] size Region size
*/
- CLBufferMemoryRegion(CLCoreRuntimeContext *ctx, cl_mem_flags flags, size_t size);
+ CLBufferMemoryRegion(cl_mem_flags flags, size_t size);
/** Constructor
*
* @param[in] buffer Buffer to be used as a memory region
- * @param[in] ctx Runtime context
*/
- CLBufferMemoryRegion(const cl::Buffer &buffer, CLCoreRuntimeContext *ctx);
+ CLBufferMemoryRegion(const cl::Buffer &buffer);
+ virtual ~CLBufferMemoryRegion() override;
// Inherited methods overridden :
void *ptr() final;
void *map(cl::CommandQueue &q, bool blocking) final;
- void unmap(cl::CommandQueue &q) final;
+ void unmap(cl::CommandQueue &q) final;
};
/** OpenCL SVM memory region interface */
@@ -123,12 +119,11 @@ class ICLSVMMemoryRegion : public ICLMemoryRegion
protected:
/** Constructor
*
- * @param[in] ctx Runtime context
* @param[in] flags Memory flags
* @param[in] size Region size
* @param[in] alignment Alignment
*/
- ICLSVMMemoryRegion(CLCoreRuntimeContext *ctx, cl_mem_flags flags, size_t size, size_t alignment);
+ ICLSVMMemoryRegion(cl_mem_flags flags, size_t size, size_t alignment);
/** Destructor */
virtual ~ICLSVMMemoryRegion();
/** Prevent instances of this class from being copied (As this class contains pointers) */
@@ -153,16 +148,15 @@ class CLCoarseSVMMemoryRegion final : public ICLSVMMemoryRegion
public:
/** Constructor
*
- * @param[in] ctx Runtime context
* @param[in] flags Memory flags
* @param[in] size Region size
* @param[in] alignment Alignment
*/
- CLCoarseSVMMemoryRegion(CLCoreRuntimeContext *ctx, cl_mem_flags flags, size_t size, size_t alignment);
+ CLCoarseSVMMemoryRegion(cl_mem_flags flags, size_t size, size_t alignment);
// Inherited methods overridden :
void *map(cl::CommandQueue &q, bool blocking) final;
- void unmap(cl::CommandQueue &q) final;
+ void unmap(cl::CommandQueue &q) final;
};
/** OpenCL fine-grain SVM memory region implementation */
@@ -171,16 +165,15 @@ class CLFineSVMMemoryRegion final : public ICLSVMMemoryRegion
public:
/** Constructor
*
- * @param[in] ctx Runtime context
* @param[in] flags Memory flags
* @param[in] size Region size
* @param[in] alignment Alignment
*/
- CLFineSVMMemoryRegion(CLCoreRuntimeContext *ctx, cl_mem_flags flags, size_t size, size_t alignment);
+ CLFineSVMMemoryRegion(cl_mem_flags flags, size_t size, size_t alignment);
// Inherited methods overridden :
void *map(cl::CommandQueue &q, bool blocking) final;
- void unmap(cl::CommandQueue &q) final;
+ void unmap(cl::CommandQueue &q) final;
};
} // namespace arm_compute
#endif /* ARM_COMPUTE_RUNTIME_CL_CL_MEMORY_REGION_H */
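Editor's note: regions are now built from flags and size alone; the sketch below assumes an initialised CLScheduler supplies the underlying context and queue.

    #include "arm_compute/runtime/CL/CLMemoryRegion.h"
    #include "arm_compute/runtime/CL/CLScheduler.h"

    void region_sketch()
    {
        arm_compute::CLScheduler::get().default_init();

        // Buffer-backed region: no CLCoreRuntimeContext argument after this patch.
        arm_compute::CLBufferMemoryRegion region(CL_MEM_READ_WRITE, 2048);

        cl::CommandQueue &queue = arm_compute::CLScheduler::get().queue();
        void *mapped = region.map(queue, true /* blocking */);
        (void)mapped;
        region.unmap(queue);
    }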
diff --git a/arm_compute/runtime/CL/CLMultiImage.h b/arm_compute/runtime/CL/CLMultiImage.h
deleted file mode 100644
index e76ee07d29..0000000000
--- a/arm_compute/runtime/CL/CLMultiImage.h
+++ /dev/null
@@ -1,87 +0,0 @@
-/*
- * Copyright (c) 2016-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CLMULTIIMAGE_H
-#define ARM_COMPUTE_CLMULTIIMAGE_H
-
-#include "arm_compute/core/CL/ICLMultiImage.h"
-#include "arm_compute/core/MultiImageInfo.h"
-#include "arm_compute/core/Types.h"
-#include "arm_compute/runtime/CL/CLTensor.h"
-
-#include <array>
-
-namespace arm_compute
-{
-class ICLTensor;
-using ICLImage = ICLTensor;
-
-/** Basic implementation of the CL multi-planar image interface */
-class CLMultiImage : public ICLMultiImage
-{
-public:
- /** Constructor */
- CLMultiImage();
- /** Init the multi-planar image
- *
- * @param[in] width Width of the whole image
- * @param[in] height Heigth of the whole image
- * @param[in] format Format of the whole image
- */
- void init(unsigned int width, unsigned int height, Format format);
- /** Init the multi-planar image
- *
- * @note Uses conservative padding strategy which fits all kernels.
- *
- * @param[in] width Width of the whole image
- * @param[in] height Height of the whole image
- * @param[in] format Format of the whole image
- */
- void init_auto_padding(unsigned int width, unsigned int height, Format format);
- /** Allocated a previously initialised multi image
- *
- * @note The multi image must not already be allocated when calling this function.
- *
- **/
- void allocate();
-
- // Inherited methods overridden:
- const MultiImageInfo *info() const override;
- CLImage *cl_plane(unsigned int index) override;
- const CLImage *cl_plane(unsigned int index) const override;
-
-private:
- /** Init the multi-planar image
- *
- * @param[in] width Width of the whole image
- * @param[in] height Height of the whole image
- * @param[in] format Format of the whole image
- * @param[in] auto_padding Specifies whether the image uses auto padding
- */
- void internal_init(unsigned int width, unsigned int height, Format format, bool auto_padding);
-
- MultiImageInfo _info; /** Instance of the multi-planar image's meta data */
- std::array<CLImage, 3> _plane; /* Instance CLImage to hold the planar's information */
-};
-}
-#endif /*ARM_COMPUTE_CLMULTIIMAGE_H */
diff --git a/arm_compute/runtime/CL/CLPyramid.h b/arm_compute/runtime/CL/CLPyramid.h
deleted file mode 100644
index 696091ebe1..0000000000
--- a/arm_compute/runtime/CL/CLPyramid.h
+++ /dev/null
@@ -1,82 +0,0 @@
-/*
- * Copyright (c) 2016-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CLPYRAMID_H
-#define ARM_COMPUTE_CLPYRAMID_H
-
-#include "arm_compute/core/IPyramid.h"
-#include "arm_compute/core/PyramidInfo.h"
-#include "arm_compute/core/Types.h"
-#include "arm_compute/runtime/CL/CLTensor.h"
-
-#include <cstddef>
-#include <memory>
-
-namespace arm_compute
-{
-class CLTensor;
-
-/** Basic implementation of the OpenCL pyramid interface */
-class CLPyramid : public IPyramid
-{
-public:
- /** Default constructor */
- CLPyramid();
- /** Initialize pyramid data-object using the given Pyramid's metadata
- *
- * @param[in] info Pyramid's metadata
- */
- void init(const PyramidInfo &info);
-
- /** Initialize pyramid data-object using the given Pyramid's metadata
- *
- * @note Uses conservative padding strategy which fits all kernels.
- *
- * @param[in] info Pyramid's metadata
- */
- void init_auto_padding(const PyramidInfo &info);
-
- /** Allocate the planes in the pyramid
- *
- * @note The pyramid must not already be allocated when calling this function.
- *
- **/
- void allocate();
-
- // Inherited method overridden
- const PyramidInfo *info() const override;
- CLTensor *get_pyramid_level(size_t index) const override;
-
-private:
- /** Initialize pyramid data-object using the given Pyramid's metadata
- *
- * @param[in] info Pyramid's metadata
- * @param[in] auto_padding Specifies whether the image in the pyramid use auto padding
- */
- void internal_init(const PyramidInfo &info, bool auto_padding);
-
- PyramidInfo _info;
- mutable std::vector<CLTensor> _pyramid;
-};
-}
-#endif /*ARM_COMPUTE_CLPYRAMID_H */
diff --git a/arm_compute/runtime/CL/CLRuntimeContext.h b/arm_compute/runtime/CL/CLRuntimeContext.h
index 54c7d3cd23..2ed4b74796 100644
--- a/arm_compute/runtime/CL/CLRuntimeContext.h
+++ b/arm_compute/runtime/CL/CLRuntimeContext.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019-2020 ARM Limited.
+ * Copyright (c) 2019-2021 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -24,11 +24,11 @@
#ifndef ARM_COMPUTE_CLRUNTIME_CONTEXT_H
#define ARM_COMPUTE_CLRUNTIME_CONTEXT_H
-#include "arm_compute/core/CL/CLCoreRuntimeContext.h"
#include "arm_compute/core/CL/CLKernelLibrary.h"
#include "arm_compute/core/CL/OpenCL.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
#include "arm_compute/runtime/CL/CLTuner.h"
+#include "arm_compute/runtime/CL/CLTypes.h"
#include "arm_compute/runtime/IScheduler.h"
#include "arm_compute/runtime/RuntimeContext.h"
@@ -50,16 +50,15 @@ public:
void set_gpu_scheduler(CLScheduler *scheduler);
// Inherited overridden methods
- CLScheduler *gpu_scheduler();
- CLKernelLibrary &kernel_library();
- CLCoreRuntimeContext *core_runtime_context();
+ CLScheduler *gpu_scheduler();
+ CLKernelLibrary &kernel_library();
private:
- std::unique_ptr<CLScheduler> _gpu_owned_scheduler{ nullptr };
- CLScheduler *_gpu_scheduler{ nullptr };
- CLTuner _tuner{ false };
+ std::unique_ptr<CLScheduler> _gpu_owned_scheduler{nullptr};
+ CLScheduler *_gpu_scheduler{nullptr};
+ CLTuner _tuner{false};
CLSymbols _symbols{};
- CLCoreRuntimeContext _core_context{};
+ CLBackendType _backend_type{CLBackendType::Native};
};
} // namespace arm_compute
#endif /*ARM_COMPUTE_CLRUNTIME_CONTEXT_H */
diff --git a/arm_compute/runtime/CL/CLScheduler.h b/arm_compute/runtime/CL/CLScheduler.h
index 573248ebbe..b74fcb74ef 100644
--- a/arm_compute/runtime/CL/CLScheduler.h
+++ b/arm_compute/runtime/CL/CLScheduler.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2016-2019 ARM Limited.
+ * Copyright (c) 2016-2022 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -28,7 +28,11 @@
#include "arm_compute/core/CL/CLTypes.h"
#include "arm_compute/core/CL/OpenCL.h"
#include "arm_compute/core/Error.h"
+#include "arm_compute/core/experimental/Types.h"
#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/CL/CLGEMMHeuristicsHandle.h"
+#include "arm_compute/runtime/CL/CLHelpers.h"
+#include "arm_compute/runtime/CL/CLTypes.h"
#include "arm_compute/runtime/CL/ICLTuner.h"
namespace arm_compute
@@ -55,33 +59,65 @@ public:
/** Initialises the context and command queue used by the scheduler to default values
* and sets a default device and kernel path for the @ref CLKernelLibrary.
*
- * @param[in] cl_tuner (Optional) Pointer to ICLTuner (default=nullptr)
+ * @param[in] cl_tuner (Optional) Pointer to ICLTuner (default=nullptr)
+ * @param[in] gemm_h (Optional) Pointer to CLGEMMHeuristicsHandle (default = nullptr)
+ * @param[in] cl_backend_type (Optional) Type of backend to use (default = CLBackendType::Native)
*/
- void default_init(ICLTuner *cl_tuner = nullptr);
+ void default_init(ICLTuner *cl_tuner = nullptr,
+ CLGEMMHeuristicsHandle *gemm_h = nullptr,
+ CLBackendType cl_backend_type = CLBackendType::Native);
/** Initialises the scheduler with context and device provided by the user
*
* @param[in] device OpenCL device to be used
* @param[in] ctx OpenCL ctx to be used
* @param[in] cl_tuner (Optional) Pointer to ICLTuner (default=nullptr)
+ * @param[in] gemm_h (Optional) Pointer to CLGEMMHeuristicsHandle (default = nullptr)
+ */
+ void default_init_with_context(cl::Device &device,
+ cl::Context &ctx,
+ ICLTuner *cl_tuner = nullptr,
+ CLGEMMHeuristicsHandle *gemm_h = nullptr);
+
+ /** Re-initializes the context and command queue used by the scheduler to default values
+ * and sets a default device and kernel path for the @ref CLKernelLibrary.
+ *
+ * @param[in] cl_tuner (Optional) Pointer to ICLTuner (default=nullptr)
+ * @param[in] gemm_h (Optional) Pointer to CLGEMMHeuristicsHandle (default = nullptr)
+ * @param[in] cl_backend_type (Optional) Type of backend to use (default = CLBackendType::Native)
*/
- void default_init_with_context(cl::Device &device, cl::Context &ctx, ICLTuner *cl_tuner = nullptr);
+ void default_reinit(ICLTuner *cl_tuner = nullptr,
+ CLGEMMHeuristicsHandle *gemm_h = nullptr,
+ CLBackendType cl_backend_type = CLBackendType::Native);
/** Schedule the execution of the passed kernel if possible.
*
* @param[in] kernel Kernel to execute.
- * @param[in] flush (Optional) Specifies if the command queue will be flushed after running the kernel.
+ * @param[in] flush (Optional) Specifies if the command queue will be flushed after running the kernel. This will be ignored if job chaining is enabled.
*/
void enqueue(ICLKernel &kernel, bool flush = true);
-
+ /** Schedule the execution of the passed kernel if possible.
+ *
+ * @param[in] kernel Kernel to execute.
+ * @param[in] tensors Vector containing the tensors to operate on.
+ * @param[in] flush (Optional) Specifies if the command queue will be flushed after running the kernel. This will be ignored if job chaining is enabled.
+ */
+ void enqueue_op(ICLKernel &kernel, ITensorPack &tensors, bool flush = true);
/** Initialises the context and command queue to be used by the scheduler.
*
- * @param[in] context A CL context.
- * @param[in] queue A CL command queue.
- * @param[in] device A CL device.
- * @param[in] cl_tuner (Optional) Pointer to OpenCL tuner (default=nullptr)
- * Note: It is caller's responsibility to release the allocated memory for CLTuner
+ * @param[in] context A CL context.
+ * @param[in] queue A CL command queue.
+ * @param[in] device A CL device.
+ * @param[in] cl_tuner (Optional) Pointer to OpenCL tuner (default=nullptr)
+ * Note: It is caller's responsibility to release the allocated memory for CLTuner
+ * @param[in] gemm_h (Optional) Pointer to CLGEMMHeuristicsHandle (default = nullptr)
+ * @param[in] cl_backend_type (Optional) Type of backend to use (default = CLBackendType::Native)
*/
- void init(cl::Context context, cl::CommandQueue queue, const cl::Device &device, ICLTuner *cl_tuner = nullptr);
+ void init(cl::Context context,
+ cl::CommandQueue queue,
+ const cl::Device &device,
+ ICLTuner *cl_tuner = nullptr,
+ CLGEMMHeuristicsHandle *gemm_h = nullptr,
+ CLBackendType cl_backend_type = CLBackendType::Native);
/** Accessor for the associated CL context.
*
@@ -101,6 +137,12 @@ public:
*/
GPUTarget target() const;
+ /** Accessor for the associated CLGEMMHeuristicsHandle
+ *
+ * @return Pointer to CLGEMMHeuristicsHandle
+ */
+ CLGEMMHeuristicsHandle *gemm_heuristics() const;
+
/** Accessor to set the CL context to be used by the scheduler.
*
* @param[in] context A CL context.
@@ -140,18 +182,35 @@ public:
*/
void tune_kernel_static(ICLKernel &kernel);
+ /** Enable job chaining. The command queue will only be flushed when @p job_chaining_size kernels have been enqueued.
+ *
+ * @param[in] job_chaining_size Number of kernels to enqueue before flushing the queue
+ */
+ void enable_job_chaining(int job_chaining_size);
+
bool is_initialised() const;
private:
+ void enqueue_common(ICLKernel &kernel, ITensorPack &tensors, bool flush);
+ /** If job chaining is disabled, flush the command queue according to @p flush. Otherwise @p flush is ignored and the queue is only flushed once the number of enqueued kernels exceeds the configured job chain size.
+ *
+ * @param[in] flush Flush the command queue. Ignored when job chaining is enabled.
+ */
+ void flush_queue(bool flush);
+
/** Flag to ensure symbols initialisation is happening before Scheduler creation */
static std::once_flag _initialize_symbols;
- cl::Context _context;
- cl::CommandQueue _queue;
- GPUTarget _target;
- bool _is_initialised;
- ICLTuner *_cl_tuner;
- std::unique_ptr<ICLTuner> _cl_default_static_tuner;
+ cl::Context _context;
+ cl::CommandQueue _queue;
+ GPUTarget _target;
+ bool _is_initialised;
+ ICLTuner *_cl_tuner;
+ CLGEMMHeuristicsHandle *_gemm_heuristics;
+ CLBackendType _backend_type;
+ bool _job_chaining_enabled;
+ int _job_chaining_size;
+ int _job_chaining_count;
};
} // namespace arm_compute
#endif /* ARM_COMPUTE_CLSCHEDULER_H */
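A minimal usage sketch of the reworked initialisation and job-chaining entry points above, assuming the usual CLScheduler::get() singleton accessor; the chaining size of 4 and the omitted GEMM heuristics handle are illustrative choices:

#include "arm_compute/runtime/CL/CLScheduler.h"
#include "arm_compute/runtime/CL/CLTuner.h"

using namespace arm_compute;

void init_scheduler()
{
    static CLTuner tuner; // must outlive the scheduler's use of it
    // Default context/queue, no GEMM heuristics handle, native OpenCL backend.
    CLScheduler::get().default_init(&tuner, /* gemm_h */ nullptr, CLBackendType::Native);
    // Flush the command queue only after every 4th enqueued kernel;
    // the per-enqueue `flush` argument is then ignored.
    CLScheduler::get().enable_job_chaining(4);
}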
diff --git a/arm_compute/runtime/CL/CLSubTensor.h b/arm_compute/runtime/CL/CLSubTensor.h
index 98dd28f43d..c18df8086a 100644
--- a/arm_compute/runtime/CL/CLSubTensor.h
+++ b/arm_compute/runtime/CL/CLSubTensor.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2019 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -46,7 +46,10 @@ public:
* @param[in] coords Coordinates of the first subtensor element inside the parent tensor.
* @param[in] extend_parent (Optional) Extend parent with subtensor shape if subtensor indexes out of bounds
*/
- CLSubTensor(ICLTensor *parent, const TensorShape &tensor_shape, const Coordinates &coords, bool extend_parent = false);
+ CLSubTensor(ICLTensor *parent,
+ const TensorShape &tensor_shape,
+ const Coordinates &coords,
+ bool extend_parent = false);
/** Destructor: free the tensor's memory */
~CLSubTensor() = default;
/** Restrict instances of this class to be copy constructed */
@@ -93,11 +96,11 @@ public:
protected:
// Inherited methods overridden:
uint8_t *do_map(cl::CommandQueue &q, bool blocking) override;
- void do_unmap(cl::CommandQueue &q) override;
+ void do_unmap(cl::CommandQueue &q) override;
private:
ICLTensor *_parent;
mutable SubTensorInfo _info;
};
-}
+} // namespace arm_compute
#endif /*ARM_COMPUTE_CLSUBTENSOR_H */
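A sketch of the reformatted constructor in use, assuming standard TensorInfo/TensorShape setup; the 8x8 parent and 4x4 view sizes are arbitrary:

#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/runtime/CL/CLSubTensor.h"
#include "arm_compute/runtime/CL/CLTensor.h"

using namespace arm_compute;

void make_view()
{
    CLTensor parent;
    parent.allocator()->init(TensorInfo(TensorShape(8U, 8U), 1, DataType::F32));
    parent.allocator()->allocate();

    // 4x4 view anchored at the parent's origin; shares the parent's buffer,
    // no new memory is allocated.
    CLSubTensor view(&parent, TensorShape(4U, 4U), Coordinates(0, 0));
}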
diff --git a/arm_compute/runtime/CL/CLTensor.h b/arm_compute/runtime/CL/CLTensor.h
index 102cb3636a..0729935e9e 100644
--- a/arm_compute/runtime/CL/CLTensor.h
+++ b/arm_compute/runtime/CL/CLTensor.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2016-2020 ARM Limited.
+ * Copyright (c) 2016-2021 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -46,7 +46,7 @@ public:
* @param[in] ctx (Optional) Pointer to a @ref CLRuntimeContext.
* If nullptr is passed in, the legacy api using the singletons will be used. Otherwise the memory for the
* tensor will allocate on the context passed in.
- * The singletons legacy api has been deprecated and will be removed.
+ * The singletons legacy api has been deprecated and will be removed in future releases.
*/
CLTensor(IRuntimeContext *ctx = nullptr);
@@ -87,17 +87,17 @@ public:
TensorInfo *info() override;
const cl::Buffer &cl_buffer() const override;
CLQuantization quantization() const override;
- void associate_memory_group(IMemoryGroup *memory_group) override;
+ void associate_memory_group(IMemoryGroup *memory_group) override;
CLRuntimeContext *context();
protected:
// Inherited methods overridden:
uint8_t *do_map(cl::CommandQueue &q, bool blocking) override;
- void do_unmap(cl::CommandQueue &q) override;
+ void do_unmap(cl::CommandQueue &q) override;
private:
mutable CLTensorAllocator _allocator; /**< Instance of the OpenCL tensor allocator */
- CLRuntimeContext *_ctx{ nullptr };
+ CLRuntimeContext *_ctx{nullptr};
};
/** OpenCL Image */
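Since the singleton-based legacy path is deprecated, a sketch of the context-owned alternative the constructor documents, assuming CLRuntimeContext is default-constructible as used in the library's tests:

#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/runtime/CL/CLRuntimeContext.h"
#include "arm_compute/runtime/CL/CLTensor.h"

using namespace arm_compute;

void context_owned_tensor()
{
    CLRuntimeContext ctx; // owns its own CL context/queue; no singletons involved
    CLTensor tensor(&ctx);
    tensor.allocator()->init(TensorInfo(TensorShape(16U), 1, DataType::F32));
    tensor.allocator()->allocate();

    tensor.map(true); // blocking map for host access
    // ... fill tensor.buffer() ...
    tensor.unmap();
}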
diff --git a/arm_compute/runtime/CL/CLTensorAllocator.h b/arm_compute/runtime/CL/CLTensorAllocator.h
index 7c1bc03486..fde8e9c43a 100644
--- a/arm_compute/runtime/CL/CLTensorAllocator.h
+++ b/arm_compute/runtime/CL/CLTensorAllocator.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2016-2019 ARM Limited.
+ * Copyright (c) 2016-2021 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -24,14 +24,14 @@
#ifndef ARM_COMPUTE_CLTENSORALLOCATOR_H
#define ARM_COMPUTE_CLTENSORALLOCATOR_H
+#include "arm_compute/core/CL/CLTypes.h"
+#include "arm_compute/core/CL/OpenCL.h"
#include "arm_compute/runtime/CL/CLArray.h"
#include "arm_compute/runtime/CL/CLMemory.h"
+#include "arm_compute/runtime/IAllocator.h"
#include "arm_compute/runtime/ITensorAllocator.h"
#include "arm_compute/runtime/MemoryGroup.h"
-#include "arm_compute/core/CL/CLTypes.h"
-#include "arm_compute/core/CL/OpenCL.h"
-
#include <cstdint>
namespace arm_compute
@@ -108,7 +108,7 @@ public:
void free() override;
/** Import an existing memory as a tensor's backing memory
*
- * @warning memory should have been created under the same context that ACL uses.
+ * @warning memory should have been created under the same context that Compute Library uses.
* @warning memory is expected to be aligned with the device requirements.
* @warning tensor shouldn't be memory managed.
* @warning ownership of memory is not transferred.
@@ -127,6 +127,13 @@ public:
*/
void set_associated_memory_group(IMemoryGroup *associated_memory_group);
+ /** Sets the global allocator that will be used by all CLTensor objects
+ *
+ * @param[in] allocator Allocator to be used as the global allocator
+ */
+ static void set_global_allocator(IAllocator *allocator);
+
protected:
/** Call map() on the OpenCL buffer.
*
@@ -140,7 +147,7 @@ private:
static const cl::Buffer _empty_buffer;
private:
- CLRuntimeContext *_ctx;
+ CLRuntimeContext *_ctx;
IMemoryManageable *_owner; /**< Memory manageable object that owns the allocator */
IMemoryGroup *_associated_memory_group; /**< Registered memory manager */
CLMemory _memory; /**< OpenCL memory */
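A sketch of wiring up the new global-allocator hook; `my_cl_allocator` stands in for a hypothetical user-provided IAllocator implementation, only the wiring is shown:

#include "arm_compute/runtime/CL/CLTensorAllocator.h"
#include "arm_compute/runtime/IAllocator.h"

using namespace arm_compute;

// Hypothetical user-defined allocator instance implementing IAllocator.
extern IAllocator *my_cl_allocator;

void install_global_allocator()
{
    // Every subsequently allocated CLTensor routes its backing memory
    // through this allocator instead of the default one.
    CLTensorAllocator::set_global_allocator(my_cl_allocator);
}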
diff --git a/arm_compute/runtime/CL/CLTuner.h b/arm_compute/runtime/CL/CLTuner.h
index 3b7a42fa6a..cf293d3d27 100644
--- a/arm_compute/runtime/CL/CLTuner.h
+++ b/arm_compute/runtime/CL/CLTuner.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2022 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -25,7 +25,9 @@
#define ARM_COMPUTE_CLTUNER_H
#include "arm_compute/core/CL/OpenCL.h"
+#include "arm_compute/core/utils/misc/Macros.h"
#include "arm_compute/runtime/CL/CLTunerTypes.h"
+#include "arm_compute/runtime/CL/CLTuningParams.h"
#include "arm_compute/runtime/CL/ICLTuner.h"
#include <unordered_map>
@@ -41,9 +43,10 @@ public:
/** Constructor
*
 * @param[in] tune_new_kernels Find the optimal local workgroup size for kernels which are not present in the table?
+ * @param[in] tuning_info (Optional) OpenCL parameters to tune
*
*/
- CLTuner(bool tune_new_kernels = true);
+ CLTuner(bool tune_new_kernels = true, CLTuningInfo tuning_info = CLTuningInfo());
/** Destructor */
~CLTuner() = default;
@@ -53,42 +56,43 @@ public:
 * @param[in] tune_new_kernels Find the optimal local workgroup size for kernels which are not present in the table?
*/
void set_tune_new_kernels(bool tune_new_kernels);
- /** Tune kernels that are not in the LWS table
+
+ /** Tune kernels that are not in the tuning parameters table
*
* @return True if tuning of new kernels is enabled.
*/
bool tune_new_kernels() const;
- /** Set OpenCL tuner mode
+ /** Setter for tune parameters option
*
- * @param[in] mode Indicates how exhaustive the search for the optimal LWS should be while tuning. Default is Exhaustive mode
+ * @param[in] tuning_info OpenCL parameters to tune
*/
- void set_tuner_mode(CLTunerMode mode);
+ void set_tuning_parameters(CLTuningInfo tuning_info);
- /** Get the current OpenCL tuner mode
+ /** Set OpenCL tuner mode
*
- * @return tuner_mode Indicates how exhaustive the search for the optimal LWS should be while tuning
+ * @param[in] mode Indicates how exhaustive the search for the optimal tuning parameters should be while tuning. Default is Exhaustive mode
*/
- CLTunerMode get_tuner_mode() const;
+ void set_tuner_mode(CLTunerMode mode);
- /** Manually add a LWS for a kernel
+ /** Manually add tuning parameters for a kernel
*
- * @param[in] kernel_id Unique identifiant of the kernel
- * @param[in] optimal_lws Optimal local workgroup size to use for the given kernel
+ * @param[in] kernel_id Unique identifier of the kernel
+ * @param[in] optimal_tuning_params Optimal tuning parameters to use for the given kernel
*/
- void add_lws_to_table(const std::string &kernel_id, cl::NDRange optimal_lws);
+ void add_tuning_params(const std::string &kernel_id, CLTuningParams optimal_tuning_params);
- /** Import LWS table
+ /** Import tuning parameters table
*
- * @param[in] lws_table The unordered_map container to import
+ * @param[in] tuning_params_table The unordered_map container to import
*/
- void import_lws_table(const std::unordered_map<std::string, cl::NDRange> &lws_table);
+ void import_tuning_params(const std::unordered_map<std::string, CLTuningParams> &tuning_params_table);
- /** Give read access to the LWS table
+ /** Give read access to the tuning params table
*
- * @return The lws table as unordered_map container
+ * @return The tuning params table as unordered_map container
*/
- const std::unordered_map<std::string, cl::NDRange> &lws_table() const;
+ const std::unordered_map<std::string, CLTuningParams> &tuning_params_table() const;
/** Set the OpenCL kernel event
*
@@ -101,41 +105,57 @@ public:
/** clEnqueueNDRangeKernel symbol */
std::function<decltype(clEnqueueNDRangeKernel)> real_clEnqueueNDRangeKernel;
- /** Load the LWS table from file
+ /** Load the tuning parameters table from a file. It also sets up the tuning information read from the file
+ *
+ * @param[in] filename Load the tuning parameters table from this file. (Must exist)
*
- * @param[in] filename Load the LWS table from this file.(Must exist)
*/
void load_from_file(const std::string &filename);
- /** Save the content of the LWS table to file
+ /** Save the content of the tuning parameters table to file
+ *
+ * @param[in] filename Save the tuning parameters table to this file. (Content will be overwritten)
*
- * @param[in] filename Save the LWS table to this file. (Content will be overwritten)
+ * @return true if the file was created
*/
- void save_to_file(const std::string &filename) const;
+ bool save_to_file(const std::string &filename) const;
// Inherited methods overridden:
void tune_kernel_static(ICLKernel &kernel) override;
void tune_kernel_dynamic(ICLKernel &kernel) override;
-
+ void tune_kernel_dynamic(ICLKernel &kernel, ITensorPack &tensors) override;
/** Is the kernel_event set ?
*
* @return true if the kernel_event is set.
*/
bool kernel_event_is_set() const;
+ /** A wrapper holding the tensors and other objects needed to run the kernel
+ */
+ struct IKernelData;
+
private:
- /** Find optimal LWS using brute-force approach
+ /** Perform tune_kernel_dynamic
+ *
+ * @param[in] kernel OpenCL kernel to be tuned with tuning parameters
+ * @param[in,out] data IKernelData object wrapping tensors and other objects needed for running the kernel
+ *
+ */
+ void do_tune_kernel_dynamic(ICLKernel &kernel, IKernelData *data);
+ /** Find optimal tuning parameters using brute-force approach
*
- * @param[in] kernel OpenCL kernel to be tuned with LWS
+ * @param[in] kernel OpenCL kernel to be tuned with tuning parameters
+ * @param[in,out] data IKernelData object wrapping tensors and other objects needed for running the kernel
*
- * @return The optimal LWS to use
+ * @return The optimal tuning parameters to use
*/
- cl::NDRange find_optimal_lws(ICLKernel &kernel);
+ CLTuningParams find_optimal_tuning_params(ICLKernel &kernel, IKernelData *data);
- std::unordered_map<std::string, cl::NDRange> _lws_table;
- cl::Event _kernel_event;
- bool _tune_new_kernels;
- CLTunerMode _tuner_mode;
+ std::unordered_map<std::string, CLTuningParams> _tuning_params_table;
+ std::unordered_map<std::string, cl::NDRange> _lws_table;
+ cl::Event _kernel_event;
+ bool _tune_new_kernels;
+ CLTuningInfo _tuning_info;
};
} // namespace arm_compute
#endif /*ARM_COMPUTE_CLTUNER_H */
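An end-to-end sketch of the reworked tuner API above; the file name "tuning.csv" is a hypothetical path and the mode/wbsm choices are illustrative:

#include "arm_compute/runtime/CL/CLScheduler.h"
#include "arm_compute/runtime/CL/CLTuner.h"

using namespace arm_compute;

void tuned_session()
{
    CLTuningInfo tuning_info;
    tuning_info.tuner_mode = CLTunerMode::NORMAL;
    tuning_info.tune_wbsm  = false;

    CLTuner tuner(/* tune_new_kernels */ true, tuning_info);
    // tuner.load_from_file("tuning.csv"); // optional; the file must already exist

    CLScheduler::get().default_init(&tuner);

    // ... configure and run functions; unseen kernels are tuned on first execution ...

    if (!tuner.save_to_file("tuning.csv"))
    {
        // handle I/O failure (save_to_file now reports whether the file was created)
    }
}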
diff --git a/arm_compute/runtime/CL/CLTunerTypes.h b/arm_compute/runtime/CL/CLTunerTypes.h
index a3385b6d7c..d9b914676a 100644
--- a/arm_compute/runtime/CL/CLTunerTypes.h
+++ b/arm_compute/runtime/CL/CLTunerTypes.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019 ARM Limited.
+ * Copyright (c) 2019-2021 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -39,6 +39,15 @@ enum class CLTunerMode
RAPID /**< Searches a minimal subset of LWS configurations while tuning */
};
+/** OpenCL tuner tuning information */
+struct CLTuningInfo
+{
+ CLTunerMode tuner_mode = CLTunerMode::NORMAL; /**< Parameter to select the level (granularity) of the tuning */
+ bool tune_wbsm = false; /**< Flag to tune the batches of work groups distributed to compute units.
+ Internally, the library will check if this feature is available on
+ the target platform. This OpenCL tuner extension is still in an experimental phase */
+};
+
/** Converts a string to a strong types enumeration @ref CLTunerMode
*
* @param[in] name String to convert
@@ -47,11 +56,10 @@ enum class CLTunerMode
*/
inline CLTunerMode tuner_mode_from_name(const std::string &name)
{
- static const std::map<std::string, CLTunerMode> tuner_modes =
- {
- { "exhaustive", CLTunerMode::EXHAUSTIVE },
- { "normal", CLTunerMode::NORMAL },
- { "rapid", CLTunerMode::RAPID },
+ static const std::map<std::string, CLTunerMode> tuner_modes = {
+ {"exhaustive", CLTunerMode::EXHAUSTIVE},
+ {"normal", CLTunerMode::NORMAL},
+ {"rapid", CLTunerMode::RAPID},
};
#ifndef ARM_COMPUTE_EXCEPTIONS_DISABLED
@@ -62,7 +70,7 @@ inline CLTunerMode tuner_mode_from_name(const std::string &name)
#ifndef ARM_COMPUTE_EXCEPTIONS_DISABLED
}
- catch(const std::out_of_range &)
+ catch (const std::out_of_range &)
{
throw std::invalid_argument(name);
}
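A small sketch of combining the string parser with the new CLTuningInfo struct, e.g. when translating command-line options; the helper name is illustrative:

#include "arm_compute/runtime/CL/CLTunerTypes.h"

#include <string>

using namespace arm_compute;

CLTuningInfo parse_tuning_options(const std::string &mode_name, bool tune_wbsm)
{
    CLTuningInfo info;
    // Accepts "exhaustive", "normal" or "rapid"; throws std::invalid_argument
    // on unknown names when exceptions are enabled.
    info.tuner_mode = tuner_mode_from_name(mode_name);
    info.tune_wbsm  = tune_wbsm; // experimental; availability is checked per platform
    return info;
}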
diff --git a/arm_compute/runtime/CL/CLTuningParams.h b/arm_compute/runtime/CL/CLTuningParams.h
new file mode 100644
index 0000000000..a876fad112
--- /dev/null
+++ b/arm_compute/runtime/CL/CLTuningParams.h
@@ -0,0 +1,140 @@
+/*
+ * Copyright (c) 2020-2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CLTUNING_PARAMS_H
+#define ARM_COMPUTE_CLTUNING_PARAMS_H
+
+#include "arm_compute/core/CL/OpenCL.h"
+#include "arm_compute/runtime/CL/CLTunerTypes.h"
+
+#include "support/StringSupport.h"
+
+#include <algorithm>
+#include <sstream>
+#include <string>
+#include <vector>
+
+namespace arm_compute
+{
+/** OpenCL tuner parameters */
+class CLTuningParams
+{
+public:
+ CLTuningParams(const CLTuningParams &tuning_params) : _lws(tuning_params._lws), _wbsm(tuning_params._wbsm)
+ {
+ }
+
+ CLTuningParams(unsigned int lws_x = 0, unsigned int lws_y = 0, unsigned int lws_z = 0, int wbsm = 0)
+ : _lws(lws_x, lws_y, lws_z), _wbsm(wbsm)
+ {
+ }
+ CLTuningParams(cl::NDRange lws, cl_int wbsm = 0) : _lws(lws), _wbsm(wbsm)
+ {
+ }
+
+ CLTuningParams(cl_int wbsm) : CLTuningParams(cl::NullRange, wbsm)
+ {
+ }
+ CLTuningParams &operator=(const CLTuningParams &other)
+ {
+ _lws = other._lws;
+ _wbsm = other._wbsm;
+ return *this;
+ }
+
+ void set_lws(cl::NDRange lws)
+ {
+ _lws = lws;
+ }
+
+ cl::NDRange get_lws() const
+ {
+ return _lws;
+ }
+
+ void set_wbsm(cl_int wbsm)
+ {
+ _wbsm = wbsm;
+ }
+
+ cl_int get_wbsm() const
+ {
+ return _wbsm;
+ }
+
+ std::string to_string(CLTuningInfo tuning_info)
+ {
+ std::string tuning_params_string = "";
+ tuning_params_string += ";" + support::cpp11::to_string(_lws[0]) + ";" + support::cpp11::to_string(_lws[1]) +
+ ";" + support::cpp11::to_string(_lws[2]);
+ if (tuning_info.tune_wbsm)
+ {
+ tuning_params_string += ";" + support::cpp11::to_string(_wbsm);
+ }
+ return tuning_params_string;
+ }
+
+ bool from_string(CLTuningInfo tuning_info, std::string tuning_params_string)
+ {
+ std::replace(tuning_params_string.begin(), tuning_params_string.end(), ';', ' ');
+ std::vector<std::string> array;
+ std::stringstream ss(tuning_params_string);
+ std::string temp;
+ while (ss >> temp)
+ {
+ array.push_back(temp);
+ }
+ // Read 3 values for lws
+ if (array.size() < 3)
+ {
+ return false;
+ }
+ const unsigned int lws_0 = support::cpp11::stoi(array[0]);
+ const unsigned int lws_1 = support::cpp11::stoi(array[1]);
+ const unsigned int lws_2 = support::cpp11::stoi(array[2]);
+ if (lws_0 == 0 && lws_1 == 0 && lws_2 == 0)
+ {
+ // If lws values are 0, cl::NullRange has to be used
+ // otherwise the lws object will be badly created
+ _lws = cl::NullRange;
+ }
+ else
+ {
+ _lws = cl::NDRange(lws_0, lws_1, lws_2);
+ }
+ array.erase(array.begin(), array.begin() + 3);
+ if (tuning_info.tune_wbsm)
+ {
+ if (array.size() < 1)
+ {
+ return false;
+ }
+ _wbsm = support::cpp11::stoi(array[0]);
+ array.erase(array.begin());
+ }
+ return true;
+ }
+
+private:
+ cl::NDRange _lws;
+ cl_int _wbsm;
+};
+} // namespace arm_compute
+#endif /*ARM_COMPUTE_CLTUNING_PARAMS_H */
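A sketch of round-tripping tuning parameters through the serialisation helpers defined above; the LWS and wbsm values are arbitrary:

#include "arm_compute/runtime/CL/CLTuningParams.h"

using namespace arm_compute;

void round_trip()
{
    CLTuningInfo info;
    info.tune_wbsm = true;

    CLTuningParams params(cl::NDRange(4, 4, 2), /* wbsm */ 8);
    std::string serialized = params.to_string(info); // ";4;4;2;8", leading ';' included

    CLTuningParams parsed;
    // from_string() tokenises on ';', so the leading separator is harmless.
    bool ok = parsed.from_string(info, serialized);
    (void)ok; // ok == true; parsed now holds lws (4,4,2) and wbsm 8
}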
diff --git a/arm_compute/runtime/CL/CLTypes.h b/arm_compute/runtime/CL/CLTypes.h
index 48697af35f..931740c47f 100644
--- a/arm_compute/runtime/CL/CLTypes.h
+++ b/arm_compute/runtime/CL/CLTypes.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2020 ARM Limited.
+ * Copyright (c) 2020-2022 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -23,38 +23,39 @@
*/
#ifndef ARM_COMPUTE_RUNTIME_CLTYPES_H
#define ARM_COMPUTE_RUNTIME_CLTYPES_H
+#include "arm_compute/core/Types.h"
namespace arm_compute
{
/** OpenCL GEMM kernel types */
enum class CLGEMMKernelType
{
- /** Native GEMM kernel with fixed block size.
- * @note Temporary variant to keep compatibility with the old implementation.
- * @note This variant will be deprecated in favor of a new and configurable NATIVE variant
- */
- NATIVE_V1,
/** Native GEMM kernel with configurable block size.*/
NATIVE,
- /** Reshaped GEMM kernel where both lhs and rhs matrices are reshaped. Fixed block size fixed.
- * @note Temporary variant to keep compatibility with the old implementation.
- * @note This variant will be deprecated in favor of RESHAPED
- */
- RESHAPED_V1,
/** Reshaped GEMM kernel where both lhs and rhs matrices are reshaped. Configurable reshape and block size */
RESHAPED,
/** Reshaped GEMM kernel where only the rhs matrix is reshaped. Configurable reshape and block size */
- RESHAPED_ONLY_RHS
+ RESHAPED_ONLY_RHS,
+ /** Reshaped GEMM kernel where only the rhs matrix is reshaped. Using MMUL with configurable block size. */
+ RESHAPED_ONLY_RHS_MMUL
};
 /** OpenCL GEMM kernel selection parameters. This information is retrieved to select the GEMM kernel on OpenCL */
struct CLGEMMKernelSelectionParams
{
- unsigned int m{ 0 }; /**< Number of rows for the lhs matrix. Lhs matrix NOT transposed */
- unsigned int n{ 0 }; /**< Number of columns for the rhs matrix. Rhs matrix NOT transposed */
- unsigned int k{ 0 }; /**< Number of rows for the rhs matrix. Rhs matrix NOT transposed */
- bool is_rhs_constant{ false }; /**< True if the content of the rhs matrix is constant */
- DataType data_type{ DataType::UNKNOWN }; /**< Data type */
+ unsigned int m{0}; /**< Number of rows for the lhs matrix. Lhs matrix NOT transposed */
+ unsigned int n{0}; /**< Number of columns for the rhs matrix. Rhs matrix NOT transposed */
+ unsigned int k{0}; /**< Number of rows for the rhs matrix. Rhs matrix NOT transposed */
+ unsigned int b{0}; /**< Batch size */
+ bool is_rhs_constant{false}; /**< True if the content of the rhs matrix is constant */
+ DataType data_type{DataType::UNKNOWN}; /**< Data type */
+};
+
+/** List the possible OpenCL backends */
+enum class CLBackendType
+{
+ Native, /**< OpenCL native backend */
+ Clvk, /**< CLVK backend */
};
} // namespace arm_compute
#endif /* ARM_COMPUTE_RUNTIME_CLTYPES_H */
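A sketch of filling the extended selection parameters, including the new batch-size field; the sizes are illustrative and the resulting struct would be fed to an ICLGEMMKernelSelection heuristic:

#include "arm_compute/runtime/CL/CLTypes.h"

using namespace arm_compute;

CLGEMMKernelSelectionParams make_gemm_query()
{
    CLGEMMKernelSelectionParams params;
    params.m               = 64;   // lhs rows
    params.n               = 64;   // rhs columns
    params.k               = 32;   // rhs rows
    params.b               = 1;    // batch size (new field)
    params.is_rhs_constant = true; // e.g. constant weights
    params.data_type       = DataType::F32;
    return params; // select_kernel(params) maps this to a CLGEMMKernelType
}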
diff --git a/arm_compute/runtime/CL/ICLGEMMKernelSelection.h b/arm_compute/runtime/CL/ICLGEMMKernelSelection.h
index 69b941109d..5a71a61203 100644
--- a/arm_compute/runtime/CL/ICLGEMMKernelSelection.h
+++ b/arm_compute/runtime/CL/ICLGEMMKernelSelection.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2020 ARM Limited.
+ * Copyright (c) 2020 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -40,8 +40,7 @@ public:
*
* @param[in] arch GPU target
*/
- ICLGEMMKernelSelection(GPUTarget arch)
- : _target(arch)
+ ICLGEMMKernelSelection(GPUTarget arch) : _target(arch)
{
}
/** Default Move Constructor. */
@@ -59,7 +58,8 @@ public:
virtual CLGEMMKernelType select_kernel(const CLGEMMKernelSelectionParams &params) = 0;
protected:
- GPUTarget _target; /**< GPU target could be used to call a dedicated heuristic for each GPU IP for a given GPU architecture */
+ GPUTarget
+ _target; /**< GPU target could be used to call a dedicated heuristic for each GPU IP for a given GPU architecture */
};
} // namespace cl_gemm
} // namespace arm_compute
diff --git a/arm_compute/runtime/GLES_COMPUTE/IGCSimpleFunction.h b/arm_compute/runtime/CL/ICLOperator.h
index 245e6f1991..c0826e7733 100644
--- a/arm_compute/runtime/GLES_COMPUTE/IGCSimpleFunction.h
+++ b/arm_compute/runtime/CL/ICLOperator.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2020 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -21,43 +21,49 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
-#ifndef ARM_COMPUTE_IGCSIMPLEFUNCTION_H
-#define ARM_COMPUTE_IGCSIMPLEFUNCTION_H
+#ifndef ARM_COMPUTE_ICLOPERATOR_H
+#define ARM_COMPUTE_ICLOPERATOR_H
-#include "arm_compute/core/GLES_COMPUTE/IGCKernel.h"
-#include "arm_compute/core/GLES_COMPUTE/kernels/GCFillBorderKernel.h"
-#include "arm_compute/runtime/GLES_COMPUTE/GCRuntimeContext.h"
-#include "arm_compute/runtime/IFunction.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/IOperator.h"
+#include "arm_compute/runtime/IRuntimeContext.h"
+#include "arm_compute/runtime/Types.h"
#include <memory>
namespace arm_compute
{
-/** Basic interface for functions which have a single OpenGL ES kernel */
-class IGCSimpleFunction : public IFunction
+class ICLKernel;
+namespace experimental
+{
+/** Basic interface for functions which have a single async CL kernel */
+class ICLOperator : public IOperator
{
public:
- /** Default Constructor
+ /** Constructor
*
* @param[in] ctx Runtime context to be used by the function
*/
- IGCSimpleFunction(GCRuntimeContext *ctx = nullptr);
+ ICLOperator(IRuntimeContext *ctx = nullptr);
/** Prevent instances of this class from being copied (As this class contains pointers) */
- IGCSimpleFunction(const IGCSimpleFunction &) = delete;
+ ICLOperator(const ICLOperator &) = delete;
/** Default move constructor */
- IGCSimpleFunction(IGCSimpleFunction &&) = default;
+ ICLOperator(ICLOperator &&) = default;
/** Prevent instances of this class from being copied (As this class contains pointers) */
- IGCSimpleFunction &operator=(const IGCSimpleFunction &) = delete;
+ ICLOperator &operator=(const ICLOperator &) = delete;
/** Default move assignment operator */
- IGCSimpleFunction &operator=(IGCSimpleFunction &&) = default;
+ ICLOperator &operator=(ICLOperator &&) = default;
// Inherited methods overridden:
- void run() override final;
+ void run(ITensorPack &tensors) override;
+ void prepare(ITensorPack &constants) override;
+ MemoryRequirements workspace() const override;
protected:
- std::unique_ptr<IGCKernel> _kernel; /**< Kernel to run */
- GCFillBorderKernel _border_handler; /**< Kernel to handle borders */
- GCRuntimeContext *_ctx; /**< Context to use */
+ std::unique_ptr<ICLKernel> _kernel;
+ IRuntimeContext *_ctx;
+ MemoryRequirements _workspace;
};
+} // namespace experimental
} // namespace arm_compute
-#endif /*ARM_COMPUTE_IGCSIMPLEFUNCTION_H */
+#endif /*ARM_COMPUTE_ICLOPERATOR_H */
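A sketch of how an operator built on this new async interface is driven, assuming the ITensorPack ids ACL_SRC/ACL_DST from core/experimental/Types.h; the concrete operator is left abstract here, and passing the same pack to prepare() is an illustrative simplification:

#include "arm_compute/core/ITensorPack.h"
#include "arm_compute/core/experimental/Types.h"
#include "arm_compute/runtime/CL/ICLOperator.h"

using namespace arm_compute;

void run_operator(experimental::ICLOperator &op, ITensor &src, ITensor &dst)
{
    ITensorPack pack;
    pack.add_tensor(TensorType::ACL_SRC, &src);
    pack.add_tensor(TensorType::ACL_DST, &dst);

    op.prepare(pack); // one-off work on constant tensors, if any
    op.run(pack);     // stateless execution against the packed tensors
}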
diff --git a/arm_compute/runtime/CL/ICLSimpleFunction.h b/arm_compute/runtime/CL/ICLSimpleFunction.h
index 5fc956d861..310bf770c4 100644
--- a/arm_compute/runtime/CL/ICLSimpleFunction.h
+++ b/arm_compute/runtime/CL/ICLSimpleFunction.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2016-2019 ARM Limited.
+ * Copyright (c) 2016-2020 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -24,8 +24,6 @@
#ifndef ARM_COMPUTE_ICLSIMPLEFUNCTION_H
#define ARM_COMPUTE_ICLSIMPLEFUNCTION_H
-#include "arm_compute/core/CL/ICLKernel.h"
-#include "arm_compute/core/CL/kernels/CLFillBorderKernel.h"
#include "arm_compute/runtime/IFunction.h"
#include <memory>
@@ -34,6 +32,8 @@ namespace arm_compute
{
// Forward declarations
class CLRuntimeContext;
+class CLFillBorderKernel;
+class ICLKernel;
/** Basic interface for functions which have a single OpenCL kernel */
class ICLSimpleFunction : public IFunction
@@ -53,14 +53,16 @@ public:
ICLSimpleFunction &operator=(const ICLSimpleFunction &) = delete;
/** Default move assignment operator */
ICLSimpleFunction &operator=(ICLSimpleFunction &&) = default;
+ /** Default destructor */
+ ~ICLSimpleFunction();
// Inherited methods overridden:
void run() override final;
protected:
- std::unique_ptr<ICLKernel> _kernel; /**< Kernel to run */
- CLFillBorderKernel _border_handler; /**< Kernel to handle borders */
- CLRuntimeContext *_ctx; /**< Context to use */
+ std::unique_ptr<ICLKernel> _kernel; /**< Kernel to run */
+ std::unique_ptr<CLFillBorderKernel> _border_handler; /**< Kernel to handle borders */
+ CLRuntimeContext *_ctx; /**< Context to use */
};
} // namespace arm_compute
#endif /*ARM_COMPUTE_ICLSIMPLEFUNCTION_H */
diff --git a/arm_compute/runtime/CL/ICLTuner.h b/arm_compute/runtime/CL/ICLTuner.h
index ba4eeac8e0..fa7a1424b8 100644
--- a/arm_compute/runtime/CL/ICLTuner.h
+++ b/arm_compute/runtime/CL/ICLTuner.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2020, 2022 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -24,6 +24,8 @@
#ifndef ARM_COMPUTE_ICLTUNER_H
#define ARM_COMPUTE_ICLTUNER_H
+#include "arm_compute/core/experimental/Types.h"
+
namespace arm_compute
{
class ICLKernel;
@@ -49,6 +51,12 @@ public:
* @param[in] kernel Kernel to tune
*/
virtual void tune_kernel_dynamic(ICLKernel &kernel) = 0;
+ /** Tune OpenCL kernel dynamically
+ *
+ * @param[in] kernel Kernel to tune
+ * @param[in, out] tensors Tensors for the kernel to use
+ */
+ virtual void tune_kernel_dynamic(ICLKernel &kernel, ITensorPack &tensors) = 0;
};
-}
+} // namespace arm_compute
#endif /*ARM_COMPUTE_ICLTUNER_H */
diff --git a/arm_compute/runtime/CL/Utils.h b/arm_compute/runtime/CL/Utils.h
index e317569ebe..c6992139e4 100644
--- a/arm_compute/runtime/CL/Utils.h
+++ b/arm_compute/runtime/CL/Utils.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2020 ARM Limited.
+ * Copyright (c) 2020 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
diff --git a/arm_compute/runtime/CL/functions/CLAbsoluteDifference.h b/arm_compute/runtime/CL/functions/CLAbsoluteDifference.h
deleted file mode 100644
index 26aded6def..0000000000
--- a/arm_compute/runtime/CL/functions/CLAbsoluteDifference.h
+++ /dev/null
@@ -1,58 +0,0 @@
-/*
- * Copyright (c) 2016-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CLABSOLUTEDIFFERENCE_H
-#define ARM_COMPUTE_CLABSOLUTEDIFFERENCE_H
-
-#include "arm_compute/runtime/CL/ICLSimpleFunction.h"
-
-namespace arm_compute
-{
-class ICLTensor;
-
-/** Basic function to run @ref CLAbsoluteDifferenceKernel
- *
- * @note The tensor data types for the inputs must be U8 or S16.
- * @note The function calculates the absolute difference also when the 2 inputs have different tensor data types.
- */
-class CLAbsoluteDifference : public ICLSimpleFunction
-{
-public:
- /** Initialize the function
- *
- * @param[in] input1 First input tensor. Data types supported: U8, S16
- * @param[in] input2 Second input tensor. Data types supported: U8, S16
- * @param[out] output Output tensor. Data types supported: U8, S16
- */
- void configure(const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output);
- /** Initialize the function
- *
- * @param[in] compile_context The compile context to be used.
- * @param[in] input1 First input tensor. Data types supported: U8, S16
- * @param[in] input2 Second input tensor. Data types supported: U8, S16
- * @param[out] output Output tensor. Data types supported: U8, S16
- */
- void configure(const CLCompileContext &compile_context, const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output);
-};
-}
-#endif /* ARM_COMPUTE_CLABSOLUTEDIFFERENCE_H */
diff --git a/arm_compute/runtime/CL/functions/CLAccumulate.h b/arm_compute/runtime/CL/functions/CLAccumulate.h
deleted file mode 100644
index b47f0c0e4a..0000000000
--- a/arm_compute/runtime/CL/functions/CLAccumulate.h
+++ /dev/null
@@ -1,96 +0,0 @@
-/*
- * Copyright (c) 2016-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CLACCUMULATE_H
-#define ARM_COMPUTE_CLACCUMULATE_H
-
-#include "arm_compute/runtime/CL/ICLSimpleFunction.h"
-
-#include <cstdint>
-
-namespace arm_compute
-{
-class ICLTensor;
-
-/** Basic function to run @ref CLAccumulateKernel */
-class CLAccumulate : public ICLSimpleFunction
-{
-public:
- /** Set the input and accumulation tensors.
- *
- * @param[in] input Source tensor. Data types supported: U8.
- * @param[out] accum Destination tensor. Data types supported: S16.
- */
- void configure(const ICLTensor *input, ICLTensor *accum);
- /** Set the input and accumulation tensors.
- *
- * @param[in] compile_context The compile context to be used.
- * @param[in] input Source tensor. Data types supported: U8.
- * @param[out] accum Destination tensor. Data types supported: S16.
- */
- void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *accum);
-};
-
-/** Basic function to run @ref CLAccumulateWeightedKernel */
-class CLAccumulateWeighted : public ICLSimpleFunction
-{
-public:
- /** Set the input and accumulation tensors, and the scale value.
- *
- * @param[in] input Source tensor. Data types supported: U8.
- * @param[in] alpha The input scalar value with a value input the range of [0, 1.0]. Data types supported: F32.
- * @param[in,out] accum Accumulated tensor. Data types supported: U8.
- */
- void configure(const ICLTensor *input, float alpha, ICLTensor *accum);
- /** Set the input and accumulation tensors, and the scale value.
- *
- * @param[in] compile_context The compile context to be used.
- * @param[in] input Source tensor. Data types supported: U8.
- * @param[in] alpha The input scalar value with a value input the range of [0, 1.0]. Data types supported: F32.
- * @param[in,out] accum Accumulated tensor. Data types supported: U8.
- */
- void configure(const CLCompileContext &compile_context, const ICLTensor *input, float alpha, ICLTensor *accum);
-};
-
-/** Basic function to run @ref CLAccumulateSquaredKernel */
-class CLAccumulateSquared : public ICLSimpleFunction
-{
-public:
- /** Set the input and accumulation tensors and the shift value.
- *
- * @param[in] input Source tensor. Data types supported: U8.
- * @param[in] shift The input with a value input the range of [0, 15]. Data types supported: U32.
- * @param[in,out] accum Accumulated tensor. Data types supported: S16.
- */
- void configure(const ICLTensor *input, uint32_t shift, ICLTensor *accum);
- /** Set the input and accumulation tensors and the shift value.
- *
- * @param[in] compile_context The compile context to be used.
- * @param[in] input Source tensor. Data types supported: U8.
- * @param[in] shift The input with a value input the range of [0, 15]. Data types supported: U32.
- * @param[in,out] accum Accumulated tensor. Data types supported: S16.
- */
- void configure(const CLCompileContext &compile_context, const ICLTensor *input, uint32_t shift, ICLTensor *accum);
-};
-}
-#endif /*ARM_COMPUTE_CLACCUMULATE_H */
diff --git a/arm_compute/runtime/CL/functions/CLActivationLayer.h b/arm_compute/runtime/CL/functions/CLActivationLayer.h
index fbb34e5fb9..e158efa093 100644
--- a/arm_compute/runtime/CL/functions/CLActivationLayer.h
+++ b/arm_compute/runtime/CL/functions/CLActivationLayer.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2016-2020 ARM Limited.
+ * Copyright (c) 2016-2021, 2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -24,19 +24,22 @@
#ifndef ARM_COMPUTE_CLACTIVATIONLAYER_H
#define ARM_COMPUTE_CLACTIVATIONLAYER_H
-#include "arm_compute/runtime/CL/ICLSimpleFunction.h"
-
#include "arm_compute/core/Types.h"
+#include "arm_compute/function_info/ActivationLayerInfo.h"
+#include "arm_compute/runtime/CL/CLRuntimeContext.h"
+#include "arm_compute/runtime/IFunction.h"
namespace arm_compute
{
+class CLCompileContext;
class ICLTensor;
+class ITensorInfo;
-/** Basic function to run @ref CLActivationLayerKernel
+/** Basic function to run @ref opencl::kernels::ClActivationKernel
*
* @note The function simulates an activation layer with the specified activation function.
*/
-class CLActivationLayer : public ICLSimpleFunction
+class CLActivationLayer : public IFunction
{
public:
/** Constructor
@@ -44,16 +47,31 @@ public:
* @param[in] ctx Runtime context to be used by the function
*/
CLActivationLayer(CLRuntimeContext *ctx = nullptr);
+ /** Destructor */
+ ~CLActivationLayer();
/** Prevent instances of this class from being copied (As this class contains pointers) */
CLActivationLayer(const CLActivationLayer &) = delete;
/** Default move constructor */
- CLActivationLayer(CLActivationLayer &&) = default;
+ CLActivationLayer(CLActivationLayer &&);
/** Prevent instances of this class from being copied (As this class contains pointers) */
CLActivationLayer &operator=(const CLActivationLayer &) = delete;
/** Default move assignment operator */
- CLActivationLayer &operator=(CLActivationLayer &&) = default;
+ CLActivationLayer &operator=(CLActivationLayer &&);
/** Set the input and output tensor.
*
+ * Valid data layouts:
+ * - All
+ *
+ * Valid data type configurations:
+ * |src |dst |
+ * |:--------------|:--------------|
+ * |QASYMM8 |QASYMM8 |
+ * |QASYMM8_SIGNED |QASYMM8_SIGNED |
+ * |QSYMM16 |QSYMM16 |
+ * |F16 |F16 |
+ * |F32 |F32 |
+ *
* @note If the output tensor is a nullptr or is equal to the input, the activation function will be performed in-place
*
* @param[in, out] input Source tensor. In case of @p output tensor = nullptr, this tensor will store the result
@@ -72,7 +90,10 @@ public:
* @param[out] output Destination tensor. Data type supported: same as @p input
* @param[in] act_info Activation layer parameters.
*/
- void configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, ActivationLayerInfo act_info);
+ void configure(const CLCompileContext &compile_context,
+ ICLTensor *input,
+ ICLTensor *output,
+ ActivationLayerInfo act_info);
/** Static function to check if given info will lead to a valid configuration of @ref CLActivationLayer
*
* @param[in] input Source tensor info. In case of @p output tensor info = nullptr, this tensor will store the result
@@ -83,6 +104,13 @@ public:
* @return a status
*/
static Status validate(const ITensorInfo *input, const ITensorInfo *output, const ActivationLayerInfo &act_info);
+
+ // Inherited methods overridden:
+ void run() override;
+
+private:
+ struct Impl;
+ std::unique_ptr<Impl> _impl;
};
} // namespace arm_compute
#endif /* ARM_COMPUTE_CLACTIVATIONLAYER_H */
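The reworked function keeps the same configure/run flow; a minimal sketch, with tensor shapes and the RELU choice purely illustrative:

#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
#include "arm_compute/runtime/CL/CLTensor.h"
#include "arm_compute/runtime/CL/functions/CLActivationLayer.h"

using namespace arm_compute;

void relu_example()
{
    CLScheduler::get().default_init();

    CLTensor src, dst;
    const TensorInfo info(TensorShape(16U, 16U), 1, DataType::F32);
    src.allocator()->init(info);
    dst.allocator()->init(info);

    CLActivationLayer act;
    act.configure(&src, &dst, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU));

    src.allocator()->allocate();
    dst.allocator()->allocate();
    act.run(); // executes the underlying ClActivationKernel
}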
diff --git a/arm_compute/runtime/CL/functions/CLArgMinMaxLayer.h b/arm_compute/runtime/CL/functions/CLArgMinMaxLayer.h
index b0d29bcefe..d340d20a1f 100644
--- a/arm_compute/runtime/CL/functions/CLArgMinMaxLayer.h
+++ b/arm_compute/runtime/CL/functions/CLArgMinMaxLayer.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018-2020 ARM Limited.
+ * Copyright (c) 2018-2021, 2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -24,10 +24,9 @@
#ifndef ARM_COMPUTE_CLARGMINMAXLAYER_H
#define ARM_COMPUTE_CLARGMINMAXLAYER_H
-#include "arm_compute/core/CL/kernels/CLArgMinMaxLayerKernel.h"
-#include "arm_compute/core/CL/kernels/CLReshapeLayerKernel.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/runtime/CL/CLTensor.h"
+#include "arm_compute/runtime/CL/functions/CLReshapeLayer.h"
#include "arm_compute/runtime/IFunction.h"
#include "arm_compute/runtime/IMemoryManager.h"
#include "arm_compute/runtime/MemoryGroup.h"
@@ -36,6 +35,7 @@ namespace arm_compute
{
class ITensorInfo;
class ICLTensor;
+class CLArgMinMaxLayerKernel;
/** Function to calculate the index of the minimum or maximum values in a
* tensor based on an axis.
@@ -53,9 +53,31 @@ public:
* @param[in] memory_manager (Optional) Memory manager.
*/
CLArgMinMaxLayer(std::shared_ptr<IMemoryManager> memory_manager = nullptr);
+ /** Prevent instances of this class from being copied */
+ CLArgMinMaxLayer(const CLArgMinMaxLayer &) = delete;
+ /** Prevent instances of this class from being copied */
+ CLArgMinMaxLayer &operator=(const CLArgMinMaxLayer &) = delete;
+ /** Prevent instances of this class from being moved */
+ CLArgMinMaxLayer(CLArgMinMaxLayer &&) = delete;
+ /** Prevent instances of this class from being moved */
+ CLArgMinMaxLayer &operator=(CLArgMinMaxLayer &&) = delete;
+ /** Default destructor */
+ ~CLArgMinMaxLayer();
/** Set the input and output tensors.
*
- * @param[in] input Input source tensor. Data types supported: QASYMM8/F16/F32.
+ * Valid data layouts:
+ * - All
+ *
+ * Valid data type configurations:
+ * |src |dst |
+ * |:--------------|:---------|
+ * |QASYMM8 |U32, S32 |
+ * |QASYMM8_SIGNED |U32, S32 |
+ * |S32 |U32, S32 |
+ * |F16 |U32, S32 |
+ * |F32 |U32, S32 |
+ *
+ * @param[in] input Input source tensor. Data types supported: QASYMM8/QASYMM8_SIGNED/S32/F16/F32.
* @param[in] axis Axis to find max/min index.
* @param[out] output Output source tensor. Data types supported: U32/S32.
* @param[in] op Reduction operation to perform. Operations supported: ARG_IDX_MAX, ARG_IDX_MIN
@@ -64,15 +86,19 @@ public:
/** Set the input and output tensors.
*
* @param[in] compile_context The compile context to be used.
- * @param[in] input Input source tensor. Data types supported: QASYMM8/F16/F32.
+ * @param[in] input Input source tensor. Data types supported: QASYMM8/QASYMM8_SIGNED/S32/F16/F32.
* @param[in] axis Axis to find max/min index.
* @param[out] output Output source tensor. Data types supported: U32/S32.
* @param[in] op Reduction operation to perform. Operations supported: ARG_IDX_MAX, ARG_IDX_MIN
*/
- void configure(const CLCompileContext &compile_context, const ICLTensor *input, int axis, ICLTensor *output, const ReductionOperation &op);
+ void configure(const CLCompileContext &compile_context,
+ const ICLTensor *input,
+ int axis,
+ ICLTensor *output,
+ const ReductionOperation &op);
/** Static function to check if given info will lead to a valid configuration of @ref CLArgMinMaxLayer
*
- * @param[in] input Input source tensor info. Data types supported: QASYMM8/F16/F32.
+ * @param[in] input Input source tensor info. Data types supported: QASYMM8/QASYMM8_SIGNED/S32/F16/F32.
* @param[in] axis Axis to find max/min index.
* @param[in] output Output source tensor info. Data types supported: U32/S32.
* @param[in] op Reduction operation to perform. Operations supported: ARG_IDX_MAX, ARG_IDX_MIN
@@ -85,13 +111,11 @@ public:
void run() override;
private:
- MemoryGroup _memory_group;
- std::vector<CLTensor> _results_vector;
- CLTensor _not_reshaped_output;
- std::vector<CLArgMinMaxLayerKernel> _reduction_kernels_vector;
- CLReshapeLayerKernel _reshape_kernel;
- unsigned int _num_of_stages;
- unsigned int _reduction_axis;
+ MemoryGroup _memory_group;
+ CLTensor _not_reshaped_output;
+ std::unique_ptr<CLArgMinMaxLayerKernel> _arg_min_max_kernel;
+ CLReshapeLayer _reshape;
+ unsigned int _reduction_axis;
};
} // namespace arm_compute
#endif /* ARM_COMPUTE_CLARGMINMAXLAYER_H */
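A minimal sketch of the function in use, assuming the reduced axis is dropped from the output shape so reducing axis 0 of a (32, 8) tensor leaves 8 index values:

#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/runtime/CL/CLTensor.h"
#include "arm_compute/runtime/CL/functions/CLArgMinMaxLayer.h"

using namespace arm_compute;

void argmax_example()
{
    CLTensor src, indices;
    src.allocator()->init(TensorInfo(TensorShape(32U, 8U), 1, DataType::F32));
    indices.allocator()->init(TensorInfo(TensorShape(8U), 1, DataType::U32));

    CLArgMinMaxLayer argminmax;
    argminmax.configure(&src, /* axis */ 0, &indices, ReductionOperation::ARG_IDX_MAX);

    src.allocator()->allocate();
    indices.allocator()->allocate();
    argminmax.run();
}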
diff --git a/arm_compute/runtime/CL/functions/CLBatchNormalizationLayer.h b/arm_compute/runtime/CL/functions/CLBatchNormalizationLayer.h
index a211ea6b04..f57bc8fe8b 100644
--- a/arm_compute/runtime/CL/functions/CLBatchNormalizationLayer.h
+++ b/arm_compute/runtime/CL/functions/CLBatchNormalizationLayer.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2017-2021, 2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -24,14 +24,18 @@
#ifndef ARM_COMPUTE_CLBATCHNORMALIZATIONLAYER_H
#define ARM_COMPUTE_CLBATCHNORMALIZATIONLAYER_H
+#include "arm_compute/core/Types.h"
+#include "arm_compute/function_info/ActivationLayerInfo.h"
#include "arm_compute/runtime/IFunction.h"
-#include "arm_compute/core/CL/kernels/CLBatchNormalizationLayerKernel.h"
-#include "arm_compute/core/Types.h"
+#include <memory>
namespace arm_compute
{
+class CLCompileContext;
class ICLTensor;
+class ITensorInfo;
+class CLBatchNormalizationLayerKernel;
/** Basic function to run @ref CLNormalizationLayerKernel and simulate a batch normalization layer.
*
@@ -44,8 +48,28 @@ class CLBatchNormalizationLayer : public IFunction
public:
/** Default constructor */
CLBatchNormalizationLayer();
+ /** Prevent instances of this class from being copied */
+ CLBatchNormalizationLayer(const CLBatchNormalizationLayer &) = delete;
+ /** Prevent instances of this class from being copied */
+ CLBatchNormalizationLayer &operator=(const CLBatchNormalizationLayer &) = delete;
+ /** Prevent instances of this class from being moved */
+ CLBatchNormalizationLayer(CLBatchNormalizationLayer &&) = delete;
+ /** Prevent instances of this class from being moved */
+ CLBatchNormalizationLayer &operator=(CLBatchNormalizationLayer &&) = delete;
+ /** Default destructor */
+ ~CLBatchNormalizationLayer();
/** Set the input and output tensors.
*
+ * Valid data layouts:
+ * - NHWC
+ * - NCHW
+ *
+ * Valid data type configurations:
+ * |src |dst |
+ * |:--------------|:--------------|
+ * |F32 |F32 |
+ * |F16 |F16 |
+ *
* @note If the output tensor is a nullptr or is equal to the input, the batch normalization function will be performed in-place
*
* @param[in, out] input Source tensor. In case of @p output tensor = nullptr, this tensor will store the result.
@@ -59,7 +83,13 @@ public:
* @param[in] epsilon (Optional) Small value to avoid division with zero. Default value is 0.001f.
* @param[in] act_info (Optional) Activation layer information in case of a fused activation. Only RELU, BOUNDED_RELU and LU_BOUNDED_RELU supported.
*/
- void configure(ICLTensor *input, ICLTensor *output, const ICLTensor *mean, const ICLTensor *var, const ICLTensor *beta = nullptr, const ICLTensor *gamma = nullptr, float epsilon = 0.001f,
+ void configure(ICLTensor *input,
+ ICLTensor *output,
+ const ICLTensor *mean,
+ const ICLTensor *var,
+ const ICLTensor *beta = nullptr,
+ const ICLTensor *gamma = nullptr,
+ float epsilon = 0.001f,
ActivationLayerInfo act_info = ActivationLayerInfo());
/** Set the input and output tensors.
*
@@ -77,9 +107,15 @@ public:
* @param[in] epsilon (Optional) Small value to avoid division with zero. Default value is 0.001f.
* @param[in] act_info (Optional) Activation layer information in case of a fused activation. Only RELU, BOUNDED_RELU and LU_BOUNDED_RELU supported.
*/
- void configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, const ICLTensor *mean, const ICLTensor *var, const ICLTensor *beta = nullptr,
- const ICLTensor *gamma = nullptr,
- float epsilon = 0.001f, ActivationLayerInfo act_info = ActivationLayerInfo());
+ void configure(const CLCompileContext &compile_context,
+ ICLTensor *input,
+ ICLTensor *output,
+ const ICLTensor *mean,
+ const ICLTensor *var,
+ const ICLTensor *beta = nullptr,
+ const ICLTensor *gamma = nullptr,
+ float epsilon = 0.001f,
+ ActivationLayerInfo act_info = ActivationLayerInfo());
/** Static function to check if given info will lead to a valid configuration of @ref CLBatchNormalizationLayer
*
* @param[in] input Source tensor info. In case of @p output tensor info = nullptr, this tensor will store the result.
@@ -95,16 +131,20 @@ public:
*
* @return a status
*/
- static Status validate(const ITensorInfo *input, const ITensorInfo *output,
- const ITensorInfo *mean, const ITensorInfo *var,
- const ITensorInfo *beta = nullptr, const ITensorInfo *gamma = nullptr,
- float epsilon = 0.001f, ActivationLayerInfo act_info = ActivationLayerInfo());
+ static Status validate(const ITensorInfo *input,
+ const ITensorInfo *output,
+ const ITensorInfo *mean,
+ const ITensorInfo *var,
+ const ITensorInfo *beta = nullptr,
+ const ITensorInfo *gamma = nullptr,
+ float epsilon = 0.001f,
+ ActivationLayerInfo act_info = ActivationLayerInfo());
// Inherited methods overridden:
void run() override;
private:
- CLBatchNormalizationLayerKernel _norm_kernel; /**< BatchNormalization layer kernel to run */
+ std::unique_ptr<CLBatchNormalizationLayerKernel> _norm_kernel; /**< BatchNormalization layer kernel to run */
};
-}
+} // namespace arm_compute
#endif /* ARM_COMPUTE_CLBATCHNORMALIZATIONLAYER_H */
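A sketch of the reflowed configure() overload with a fused RELU; the tensors are assumed to be pre-initialised (4-D activations plus 1-D per-channel statistics), and epsilon matches the documented default:

#include "arm_compute/runtime/CL/functions/CLBatchNormalizationLayer.h"

using namespace arm_compute;

void fused_batchnorm(ICLTensor &src, ICLTensor &dst,
                     const ICLTensor &mean, const ICLTensor &var,
                     const ICLTensor &beta, const ICLTensor &gamma)
{
    CLBatchNormalizationLayer bn;
    // Passing dst = nullptr (or dst == &src) would run the normalization in-place.
    bn.configure(&src, &dst, &mean, &var, &beta, &gamma, /* epsilon */ 0.001f,
                 ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU));
    bn.run();
}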
diff --git a/arm_compute/runtime/CL/functions/CLBatchToSpaceLayer.h b/arm_compute/runtime/CL/functions/CLBatchToSpaceLayer.h
index 6edb4641fe..20b9fdafed 100644
--- a/arm_compute/runtime/CL/functions/CLBatchToSpaceLayer.h
+++ b/arm_compute/runtime/CL/functions/CLBatchToSpaceLayer.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018-2020 ARM Limited.
+ * Copyright (c) 2018-2021, 2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -24,13 +24,16 @@
#ifndef ARM_COMPUTE_CLBATCHTOSPACELAYER_H
#define ARM_COMPUTE_CLBATCHTOSPACELAYER_H
+#include "arm_compute/core/Types.h"
#include "arm_compute/runtime/IFunction.h"
-#include "arm_compute/core/CL/kernels/CLBatchToSpaceLayerKernel.h"
-#include "arm_compute/core/Types.h"
+#include <memory>
namespace arm_compute
{
+class CLCompileContext;
+class ITensorInfo;
+class CLBatchToSpaceLayerKernel;
class ICLTensor;
/** Basic function to run @ref CLBatchToSpaceLayerKernel. */
@@ -39,12 +42,34 @@ class CLBatchToSpaceLayer : public IFunction
public:
/** Default constructor */
CLBatchToSpaceLayer();
+ /** Prevent instances of this class from being copied */
+ CLBatchToSpaceLayer(const CLBatchToSpaceLayer &) = delete;
+ /** Prevent instances of this class from being copied */
+ CLBatchToSpaceLayer &operator=(const CLBatchToSpaceLayer &) = delete;
+ /** Prevent instances of this class from being moved */
+ CLBatchToSpaceLayer(CLBatchToSpaceLayer &&) = delete;
+ /** Prevent instances of this class from being moved */
+ CLBatchToSpaceLayer &operator=(CLBatchToSpaceLayer &&) = delete;
+ /** Default destructor */
+ ~CLBatchToSpaceLayer();
/** Set the input and output tensors.
*
+ * Valid data layouts:
+ * - NHWC
+ * - NCHW
+ *
+ * Valid data type configurations:
+ * |src0 |src1 |dst |
+ * |:---------|:---------|:----------|
+ * |All |S32 |All |
+ *
* @param[in] input Tensor input. Supported tensor rank: 4. Data types supported: All.
* @param[in] block_shape 1-D tensor with shape [M]. Data types supported: S32
* @param[out] output Tensor output. Data types supported: same as @p input
+ *
+ * @deprecated This method for dynamic block shape is not fully mature and will be removed in the 23.08 release
*/
+ ARM_COMPUTE_DEPRECATED_REL(23.05)
void configure(const ICLTensor *input, const ICLTensor *block_shape, ICLTensor *output);
/** Set the input and output tensors.
*
@@ -52,16 +77,27 @@ public:
* @param[in] input Tensor input. Supported tensor rank: 4. Data types supported: All.
* @param[in] block_shape 1-D tensor with shape [M]. Data types supported: S32
* @param[out] output Tensor output. Data types supported: same as @p input
+ *
+ * @deprecated This method for dynamic block shape is not fully mature and will be removed in the 23.08 release
*/
- void configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *block_shape, ICLTensor *output);
+ ARM_COMPUTE_DEPRECATED_REL(23.05)
+ void configure(const CLCompileContext &compile_context,
+ const ICLTensor *input,
+ const ICLTensor *block_shape,
+ ICLTensor *output);
/** Set the input and output tensors. (Static block shape).
*
* @param[in] input Tensor input. Supported tensor rank: 4. Data types supported: All.
* @param[in] block_shape_x Block shape x value.
* @param[in] block_shape_y Block shape y value.
* @param[out] output Tensor output. Data types supported: same as @p input
+ * @param[in] crop_info Information about how the output shape is cropped after batch to space is performed
*/
- void configure(const ICLTensor *input, int32_t block_shape_x, int32_t block_shape_y, ICLTensor *output);
+ void configure(const ICLTensor *input,
+ int32_t block_shape_x,
+ int32_t block_shape_y,
+ ICLTensor *output,
+ const CropInfo &crop_info = CropInfo{});
/** Set the input and output tensors. (Static block shape).
*
* @param[in] compile_context The compile context to be used.
@@ -69,8 +105,14 @@ public:
* @param[in] block_shape_x Block shape x value.
* @param[in] block_shape_y Block shape y value.
* @param[out] output Tensor output. Data types supported: same as @p input
+ * @param[in] crop_info Information about how the output shape is cropped after batch to space is performed
*/
- void configure(const CLCompileContext &compile_context, const ICLTensor *input, int32_t block_shape_x, int32_t block_shape_y, ICLTensor *output);
+ void configure(const CLCompileContext &compile_context,
+ const ICLTensor *input,
+ int32_t block_shape_x,
+ int32_t block_shape_y,
+ ICLTensor *output,
+ const CropInfo &crop_info = CropInfo{});
/** Static function to check if given info will lead to a valid configuration of @ref CLBatchToSpaceLayer
*
* @param[in] input Tensor input info. Supported tensor rank: 4. Data types supported: All.
@@ -78,7 +120,10 @@ public:
* @param[out] output Tensor output info. Data types supported: same as @p input
*
* @return a status
+ *
+ * @deprecated This method for dynamic block shape is not fully mature and will be removed in the 23.08 release
*/
+ ARM_COMPUTE_DEPRECATED_REL(23.05)
static Status validate(const ITensorInfo *input, const ITensorInfo *block_shape, const ITensorInfo *output);
/** Static function to check if given info will lead to a valid configuration of @ref CLBatchToSpaceLayer (Static block shape).
*
@@ -86,16 +131,21 @@ public:
* @param[in] block_shape_x Block shape x value.
* @param[in] block_shape_y Block shape y value.
* @param[out] output Tensor output info. Data types supported: same as @p input
+ * @param[in] crop_info Information about how the output shape is cropped after batch to space is performed
*
* @return a status
*/
- static Status validate(const ITensorInfo *input, int32_t block_shape_x, int32_t block_shape_y, const ITensorInfo *output);
+ static Status validate(const ITensorInfo *input,
+ int32_t block_shape_x,
+ int32_t block_shape_y,
+ const ITensorInfo *output,
+ const CropInfo &crop_info = CropInfo{});
// Inherited methods overridden:
void run() override;
private:
- CLBatchToSpaceLayerKernel _batch_to_space_kernel; /**< CLBatchToSpaceLayerKernel to run */
+ std::unique_ptr<CLBatchToSpaceLayerKernel> _batch_to_space_kernel; /**< CLBatchToSpaceLayerKernel to run */
};
-}
+} // namespace arm_compute
#endif /* ARM_COMPUTE_CLBATCHTOSPACELAYER_H */
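A sketch of the non-deprecated static-block-shape overload with the new crop parameter; the 2x2 block shape is illustrative and the tensors are assumed pre-initialised:

#include "arm_compute/runtime/CL/functions/CLBatchToSpaceLayer.h"

using namespace arm_compute;

void batch_to_space(ICLTensor &src, ICLTensor &dst)
{
    CLBatchToSpaceLayer b2s;
    // Static block shape variant (preferred over the deprecated dynamic one);
    // a default CropInfo{} applies no cropping to the output.
    b2s.configure(&src, /* block_shape_x */ 2, /* block_shape_y */ 2, &dst, CropInfo{});
    b2s.run();
}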
diff --git a/arm_compute/runtime/CL/functions/CLBitwiseAnd.h b/arm_compute/runtime/CL/functions/CLBitwiseAnd.h
index 1faded04fe..f82af3af9b 100644
--- a/arm_compute/runtime/CL/functions/CLBitwiseAnd.h
+++ b/arm_compute/runtime/CL/functions/CLBitwiseAnd.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2016-2020 ARM Limited.
+ * Copyright (c) 2016-2021 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -28,9 +28,10 @@
namespace arm_compute
{
+class CLCompileContext;
class ICLTensor;
-/** Basic function to run @ref CLBitwiseAndKernel.
+/** Basic function to perform bitwise AND by running @ref CLBitwiseKernel.
*
* @note The tensor data type for the inputs must be U8.
* @note The function performs a bitwise AND operation using the two input tensors.
@@ -40,6 +41,14 @@ class CLBitwiseAnd : public ICLSimpleFunction
public:
/** Initialize the function
*
+ * Valid data layouts:
+ * - All
+ *
+ * Valid data type configurations:
+ * |src |dst |
+ * |:--------------|:--------------|
+ * |U8 |U8 |
+ *
* @param[in] input1 Input tensor. Data types supported: U8.
* @param[in] input2 Input tensor. Data types supported: U8.
* @param[out] output Output tensor. Data types supported: U8.
@@ -52,7 +61,10 @@ public:
* @param[in] input2 Input tensor. Data types supported: U8.
* @param[out] output Output tensor. Data types supported: U8.
*/
- void configure(const CLCompileContext &compile_context, const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output);
+ void configure(const CLCompileContext &compile_context,
+ const ICLTensor *input1,
+ const ICLTensor *input2,
+ ICLTensor *output);
};
-}
+} // namespace arm_compute
#endif /* ARM_COMPUTE_CLBITWISEAND_H */
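A minimal sketch of the U8-only contract documented above; the shape is an illustrative assumption, and the same pattern applies to CLBitwiseOr, CLBitwiseXor and (with a single input) CLBitwiseNot.

    #include "arm_compute/core/TensorInfo.h"
    #include "arm_compute/runtime/CL/CLScheduler.h"
    #include "arm_compute/runtime/CL/CLTensor.h"
    #include "arm_compute/runtime/CL/functions/CLBitwiseAnd.h"

    using namespace arm_compute;

    void bitwise_and_sketch()
    {
        CLScheduler::get().default_init();

        CLTensor a, b, out;
        const TensorInfo info(TensorShape(32U, 32U), 1, DataType::U8); // U8 is the only supported type
        a.allocator()->init(info);
        b.allocator()->init(info);
        out.allocator()->init(info);

        CLBitwiseAnd bitwise_and;
        bitwise_and.configure(&a, &b, &out);

        a.allocator()->allocate();
        b.allocator()->allocate();
        out.allocator()->allocate();
        bitwise_and.run(); // out[i] = a[i] & b[i]
    }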
diff --git a/arm_compute/runtime/CL/functions/CLBitwiseNot.h b/arm_compute/runtime/CL/functions/CLBitwiseNot.h
index c9460555dd..31f8e86802 100644
--- a/arm_compute/runtime/CL/functions/CLBitwiseNot.h
+++ b/arm_compute/runtime/CL/functions/CLBitwiseNot.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2016-2020 ARM Limited.
+ * Copyright (c) 2016-2021 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -28,9 +28,10 @@
namespace arm_compute
{
+class CLCompileContext;
class ICLTensor;
-/** Basic function to run @ref CLBitwiseNotKernel.
+/** Basic function to perform bitwise NOT by running @ref CLBitwiseKernel.
*
 * @note The tensor data type for the input must be U8.
 * @note The function performs a bitwise NOT operation on the input tensor.
@@ -39,6 +40,13 @@ class CLBitwiseNot : public ICLSimpleFunction
{
public:
/** Initialize the function
+ *
+ * Valid data layouts:
+ * - All
+ *
+ * Valid data type configurations:
+ * |src |dst |
+ * |:--------------|:--------------|
+ * |U8 |U8 |
*
* @param[in] input Input tensor. Data types supported: U8.
* @param[out] output Output tensor. Data types supported: U8.
@@ -52,5 +60,5 @@ public:
*/
void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output);
};
-}
+} // namespace arm_compute
#endif /* ARM_COMPUTE_CLBITWISENOT_H */
diff --git a/arm_compute/runtime/CL/functions/CLBitwiseOr.h b/arm_compute/runtime/CL/functions/CLBitwiseOr.h
index 4fb93cc8a2..9a25a2099e 100644
--- a/arm_compute/runtime/CL/functions/CLBitwiseOr.h
+++ b/arm_compute/runtime/CL/functions/CLBitwiseOr.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2016-2020 ARM Limited.
+ * Copyright (c) 2016-2021 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -28,9 +28,10 @@
namespace arm_compute
{
+class CLCompileContext;
class ICLTensor;
-/** Basic function to run @ref CLBitwiseOrKernel.
+/** Basic function to perform bitwise OR by running @ref CLBitwiseKernel.
*
* @note The tensor data type for the inputs must be U8.
* @note The function performs a bitwise OR operation using the two input tensors.
@@ -40,6 +41,14 @@ class CLBitwiseOr : public ICLSimpleFunction
public:
/** Initialize the function
*
+ * Valid data layouts:
+ * - All
+ *
+ * Valid data type configurations:
+ * |src |dst |
+ * |:--------------|:--------------|
+ * |U8 |U8 |
+ *
* @param[in] input1 Input tensor. Data types supported: U8.
* @param[in] input2 Input tensor. Data types supported: U8.
* @param[out] output Output tensor. Data types supported: U8.
@@ -52,7 +61,10 @@ public:
* @param[in] input2 Input tensor. Data types supported: U8.
* @param[out] output Output tensor. Data types supported: U8.
*/
- void configure(const CLCompileContext &compile_context, const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output);
+ void configure(const CLCompileContext &compile_context,
+ const ICLTensor *input1,
+ const ICLTensor *input2,
+ ICLTensor *output);
};
-}
+} // namespace arm_compute
#endif /* ARM_COMPUTE_CLBITWISEOR_H */
diff --git a/arm_compute/runtime/CL/functions/CLBitwiseXor.h b/arm_compute/runtime/CL/functions/CLBitwiseXor.h
index 6caa013607..9e288ef7b6 100644
--- a/arm_compute/runtime/CL/functions/CLBitwiseXor.h
+++ b/arm_compute/runtime/CL/functions/CLBitwiseXor.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2016-2020 ARM Limited.
+ * Copyright (c) 2016-2021 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -28,9 +28,10 @@
namespace arm_compute
{
+class CLCompileContext;
class ICLTensor;
-/** Basic function to run @ref CLBitwiseXorKernel.
+/** Basic function to perform bitwise XOR by running @ref CLBitwiseKernel.
*
* @note The tensor data type for the inputs must be U8.
* @note The function performs a bitwise XOR operation using the two input tensors.
@@ -40,6 +41,14 @@ class CLBitwiseXor : public ICLSimpleFunction
public:
/** Initialize the function
*
+ * Valid data layouts:
+ * - All
+ *
+ * Valid data type configurations:
+ * |src |dst |
+ * |:--------------|:--------------|
+ * |U8 |U8 |
+ *
* @param[in] input1 Input tensor. Data types supported: U8.
* @param[in] input2 Input tensor. Data types supported: U8.
* @param[out] output Output tensor. Data types supported: U8.
@@ -52,7 +61,10 @@ public:
* @param[in] input2 Input tensor. Data types supported: U8.
* @param[out] output Output tensor. Data types supported: U8.
*/
- void configure(const CLCompileContext &compile_context, const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output);
+ void configure(const CLCompileContext &compile_context,
+ const ICLTensor *input1,
+ const ICLTensor *input2,
+ ICLTensor *output);
};
-}
+} // namespace arm_compute
#endif /* ARM_COMPUTE_CLBITWISEXOR_H */
diff --git a/arm_compute/runtime/CL/functions/CLBoundingBoxTransform.h b/arm_compute/runtime/CL/functions/CLBoundingBoxTransform.h
index b09359dfc2..dba5497f5d 100644
--- a/arm_compute/runtime/CL/functions/CLBoundingBoxTransform.h
+++ b/arm_compute/runtime/CL/functions/CLBoundingBoxTransform.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018-2020 ARM Limited.
+ * Copyright (c) 2018-2021 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -24,12 +24,16 @@
#ifndef ARM_COMPUTE_CLBOUNDINGBOXTRANSFORM_H
#define ARM_COMPUTE_CLBOUNDINGBOXTRANSFORM_H
-#include "arm_compute/core/CL/kernels/CLBoundingBoxTransformKernel.h"
+#include "arm_compute/core/Error.h"
#include "arm_compute/runtime/CL/ICLSimpleFunction.h"
namespace arm_compute
{
+class CLCompileContext;
+class CLBoundingBoxTransformKernel;
+class BoundingBoxTransformInfo;
class ICLTensor;
+class ITensorInfo;
/** Basic function to run @ref CLBoundingBoxTransformKernel.
*
@@ -41,6 +45,17 @@ class CLBoundingBoxTransform : public ICLSimpleFunction
public:
/** Set the input and output tensors.
*
+ * Valid data layouts:
+ * - NHWC
+ * - NCHW
+ *
+ * Valid data type configurations:
+ * |src0 |src1 |dst |
+ * |:--------------|:--------------|:--------------|
+ * |QASYMM16 |QASYMM8 |QASYMM16 |
+ * |F16 |F16 |F16 |
+ * |F32 |F32 |F32 |
+ *
* @param[in] boxes Source tensor. Bounding box proposals in pixel coordinates. Size(M, 4), format [x1, y1, x2, y2]. Data types supported: QASYMM16/F16/F32.
* @param[out] pred_boxes Destination tensor. Pixel coordinates of the transformed bounding boxes. Size (M, 4*K), format [x1, y1, x2, y2]. Data types supported: Same as @p input
* @param[in] deltas Bounding box translations and scales. Size (M, 4*K), format [dx, dy, dw, dh], K is the number of classes.
@@ -49,7 +64,10 @@ public:
*
* @note Only single image prediction is supported. Height and Width (and scale) of the image will be contained in the BoundingBoxTransformInfo struct.
*/
- void configure(const ICLTensor *boxes, ICLTensor *pred_boxes, const ICLTensor *deltas, const BoundingBoxTransformInfo &info);
+ void configure(const ICLTensor *boxes,
+ ICLTensor *pred_boxes,
+ const ICLTensor *deltas,
+ const BoundingBoxTransformInfo &info);
/** Set the input and output tensors.
*
* @param[in] compile_context The compile context to be used.
@@ -61,7 +79,11 @@ public:
*
* @note Only single image prediction is supported. Height and Width (and scale) of the image will be contained in the BoundingBoxTransformInfo struct.
*/
- void configure(const CLCompileContext &compile_context, const ICLTensor *boxes, ICLTensor *pred_boxes, const ICLTensor *deltas, const BoundingBoxTransformInfo &info);
+ void configure(const CLCompileContext &compile_context,
+ const ICLTensor *boxes,
+ ICLTensor *pred_boxes,
+ const ICLTensor *deltas,
+ const BoundingBoxTransformInfo &info);
/** Static function to check if given info will lead to a valid configuration of @ref CLBoundingBoxTransform
*
@@ -75,7 +97,10 @@ public:
*
* @return a Status
*/
- static Status validate(const ITensorInfo *boxes, const ITensorInfo *pred_boxes, const ITensorInfo *deltas, const BoundingBoxTransformInfo &info);
+ static Status validate(const ITensorInfo *boxes,
+ const ITensorInfo *pred_boxes,
+ const ITensorInfo *deltas,
+ const BoundingBoxTransformInfo &info);
};
} // namespace arm_compute
#endif /* ARM_COMPUTE_CLBOUNDINGBOXTRANSFORM_H */
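A minimal validate-then-configure sketch for the F32 row of the table above. The proposal count, class count and the BoundingBoxTransformInfo constructor arguments (image width, height, scale) are assumptions for illustration.

    #include "arm_compute/core/TensorInfo.h"
    #include "arm_compute/core/Types.h"
    #include "arm_compute/runtime/CL/CLTensor.h"
    #include "arm_compute/runtime/CL/functions/CLBoundingBoxTransform.h"

    using namespace arm_compute;

    void bounding_box_transform_sketch()
    {
        constexpr unsigned int M = 128U; // number of box proposals (illustrative)
        constexpr unsigned int K = 4U;   // number of classes (illustrative)

        CLTensor boxes, deltas, pred_boxes;
        boxes.allocator()->init(TensorInfo(TensorShape(4U, M), 1, DataType::F32));      // [x1, y1, x2, y2] per row
        deltas.allocator()->init(TensorInfo(TensorShape(4U * K, M), 1, DataType::F32)); // [dx, dy, dw, dh] per class
        pred_boxes.allocator()->init(TensorInfo(TensorShape(4U * K, M), 1, DataType::F32));

        const BoundingBoxTransformInfo bb_info(800.f, 600.f, 1.f); // img_width, img_height, scale (assumed order)

        ARM_COMPUTE_ERROR_THROW_ON(
            CLBoundingBoxTransform::validate(boxes.info(), pred_boxes.info(), deltas.info(), bb_info));

        CLBoundingBoxTransform transform;
        transform.configure(&boxes, &pred_boxes, &deltas, bb_info);
    }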
diff --git a/arm_compute/runtime/CL/functions/CLBox3x3.h b/arm_compute/runtime/CL/functions/CLBox3x3.h
deleted file mode 100644
index a4cf4d296b..0000000000
--- a/arm_compute/runtime/CL/functions/CLBox3x3.h
+++ /dev/null
@@ -1,64 +0,0 @@
-/*
- * Copyright (c) 2016-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CLBOX3X3_H
-#define ARM_COMPUTE_CLBOX3X3_H
-
-#include "arm_compute/core/Types.h"
-#include "arm_compute/runtime/CL/ICLSimpleFunction.h"
-
-#include <cstdint>
-
-namespace arm_compute
-{
-class ICLTensor;
-
-/** Basic function to execute box filter 3x3. This function calls the following OpenCL kernels:
- *
- * -# @ref CLFillBorderKernel (executed if border_mode == CONSTANT or border_mode == REPLICATE)
- * -# @ref CLBox3x3Kernel
- *
- */
-class CLBox3x3 : public ICLSimpleFunction
-{
-public:
- /** Initialise the function's source, destinations and border mode.
- *
- * @param[in,out] input Source tensor. Data types supported: U8. (Written to only for @p border_mode != UNDEFINED)
- * @param[out] output Destination tensor, Data types supported: U8.
- * @param[in] border_mode Border mode to use for the convolution.
- * @param[in] constant_border_value (Optional) Constant value to use for borders if border_mode is set to CONSTANT.
- */
- void configure(ICLTensor *input, ICLTensor *output, BorderMode border_mode, uint8_t constant_border_value = 0);
- /** Initialise the function's source, destinations and border mode.
- *
- * @param[in] compile_context The compile context to be used.
- * @param[in,out] input Source tensor. Data types supported: U8. (Written to only for @p border_mode != UNDEFINED)
- * @param[out] output Destination tensor, Data types supported: U8.
- * @param[in] border_mode Border mode to use for the convolution.
- * @param[in] constant_border_value (Optional) Constant value to use for borders if border_mode is set to CONSTANT.
- */
- void configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, BorderMode border_mode, uint8_t constant_border_value = 0);
-};
-}
-#endif /*ARM_COMPUTE_CLBOX3X3_H */
diff --git a/arm_compute/runtime/CL/functions/CLCannyEdge.h b/arm_compute/runtime/CL/functions/CLCannyEdge.h
deleted file mode 100644
index 2729d241a9..0000000000
--- a/arm_compute/runtime/CL/functions/CLCannyEdge.h
+++ /dev/null
@@ -1,107 +0,0 @@
-/*
- * Copyright (c) 2017-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CLCANNYEDGE_H
-#define ARM_COMPUTE_CLCANNYEDGE_H
-
-#include "arm_compute/runtime/IFunction.h"
-
-#include "arm_compute/core/CL/kernels/CLCannyEdgeKernel.h"
-#include "arm_compute/core/CL/kernels/CLFillBorderKernel.h"
-#include "arm_compute/runtime/CL/CLTensor.h"
-#include "arm_compute/runtime/IMemoryManager.h"
-#include "arm_compute/runtime/MemoryGroup.h"
-
-#include <memory>
-
-namespace arm_compute
-{
-class ICLTensor;
-
-/** Basic function to execute canny edge on OpenCL. This function calls the following OpenCL kernels and functions:
- *
- * -# @ref CLFillBorderKernel (if border_mode == REPLICATE or border_mode == CONSTANT)
- * -# @ref CLSobel3x3 (if gradient_size == 3) or @ref CLSobel5x5 (if gradient_size == 5) or @ref CLSobel7x7 (if gradient_size == 7)
- * -# @ref CLGradientKernel
- * -# @ref CLEdgeNonMaxSuppressionKernel
- * -# @ref CLEdgeTraceKernel
- *
- */
-class CLCannyEdge : public IFunction
-{
-public:
- /** Constructor */
- CLCannyEdge(std::shared_ptr<IMemoryManager> memory_manager = nullptr);
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- CLCannyEdge(const CLCannyEdge &) = delete;
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- CLCannyEdge &operator=(const CLCannyEdge &) = delete;
- /** Initialise the function's source, destination, thresholds, gradient size, normalization type and border mode.
- *
- * @param[in,out] input Source tensor. Data types supported: U8. (Written to only for border_mode != UNDEFINED)
- * @param[out] output Destination tensor. Data types supported: U8.
- * @param[in] upper_thr Upper threshold used for the hysteresis.
- * @param[in] lower_thr Lower threshold used for the hysteresis.
- * @param[in] gradient_size Gradient size (3, 5 or 7).
- * @param[in] norm_type Normalization type. if 1, L1-Norm otherwise L2-Norm.
- * @param[in] border_mode Border mode to use for the convolution.
- * @param[in] constant_border_value (Optional) Constant value to use for borders if border_mode is set to CONSTANT.
- */
- void configure(ICLTensor *input, ICLTensor *output, int32_t upper_thr, int32_t lower_thr, int32_t gradient_size, int32_t norm_type, BorderMode border_mode,
- uint8_t constant_border_value = 0);
- /** Initialise the function's source, destination, thresholds, gradient size, normalization type and border mode.
- *
- * @param[in] compile_context The compile context to be used.
- * @param[in,out] input Source tensor. Data types supported: U8. (Written to only for border_mode != UNDEFINED)
- * @param[out] output Destination tensor. Data types supported: U8.
- * @param[in] upper_thr Upper threshold used for the hysteresis.
- * @param[in] lower_thr Lower threshold used for the hysteresis.
- * @param[in] gradient_size Gradient size (3, 5 or 7).
- * @param[in] norm_type Normalization type. if 1, L1-Norm otherwise L2-Norm.
- * @param[in] border_mode Border mode to use for the convolution.
- * @param[in] constant_border_value (Optional) Constant value to use for borders if border_mode is set to CONSTANT.
- */
- void configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, int32_t upper_thr, int32_t lower_thr, int32_t gradient_size, int32_t norm_type, BorderMode border_mode,
- uint8_t constant_border_value = 0);
-
- // Inherited methods overridden:
- virtual void run() override;
-
-private:
- MemoryGroup _memory_group; /**< Function's memory group */
- std::unique_ptr<IFunction> _sobel; /**< Pointer to Sobel kernel. */
- CLGradientKernel _gradient; /**< Gradient kernel. */
- CLFillBorderKernel _border_mag_gradient; /**< Fill border on magnitude tensor kernel */
- CLEdgeNonMaxSuppressionKernel _non_max_suppr; /**< Non-Maxima suppression kernel. */
- CLEdgeTraceKernel _edge_trace; /**< Edge tracing kernel. */
- CLImage _gx; /**< Source tensor - Gx component. */
- CLImage _gy; /**< Source tensor - Gy component. */
- CLImage _mag; /**< Source tensor - Magnitude. */
- CLImage _phase; /**< Source tensor - Phase. */
- CLImage _nonmax; /**< Source tensor - Non-Maxima suppressed. */
- CLImage _visited, _recorded, _l1_list_counter, _l1_stack; /**< Temporary tensors */
- ICLTensor *_output; /**< Output tensor provided by the user. */
-};
-}
-
-#endif /* ARM_COMPUTE_CLCANNYEDGE_H */
diff --git a/arm_compute/runtime/CL/functions/CLCast.h b/arm_compute/runtime/CL/functions/CLCast.h
index 6a1835c73a..9433f08fac 100644
--- a/arm_compute/runtime/CL/functions/CLCast.h
+++ b/arm_compute/runtime/CL/functions/CLCast.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018-2020 ARM Limited.
+ * Copyright (c) 2018-2021, 2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -25,66 +25,77 @@
#define ARM_COMPUTE_CLCAST_H
#include "arm_compute/core/Types.h"
-#include "arm_compute/runtime/CL/ICLSimpleFunction.h"
+#include "arm_compute/runtime/IFunction.h"
-#include <cstdint>
+#include <memory>
namespace arm_compute
{
+class CLCompileContext;
class ICLTensor;
+class ITensorInfo;
-/** Basic function to run @ref CLDepthConvertLayerKernel. */
-class CLCast : public ICLSimpleFunction
+/** Basic function to run @ref opencl::kernels::ClCastKernel */
+class CLCast : public IFunction
{
public:
+ /** Constructor */
+ CLCast();
+ /** Destructor */
+ ~CLCast();
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ CLCast(const CLCast &) = delete;
+ /** Default move constructor */
+ CLCast(CLCast &&);
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ CLCast &operator=(const CLCast &) = delete;
+ /** Default move assignment operator */
+ CLCast &operator=(CLCast &&);
/** Initialize the function's source, destination
*
- * Input data type must be different than output data type.
+ * Valid data layouts:
+ * - All
*
- * Valid conversions Input -> Output :
+ * Valid data type configurations:
+ * |src |dst |
+ * |:--------------|:--------------------------------------|
+ * |U8 | S8, U16, S16, U32, S32, F16, F32 |
+ * |S8 | U8, U16, S16, U32, S32, F16, F32 |
+ * |U16 | U8, S8, S16, U32, S32, F16, F32 |
+ * |S16 | U8, S8, U16, U32, S32, F16, F32 |
+ * |U32 | U8, S8, U16, S16, S32, F16, F32 |
+ * |S32 | U8, S8, U16, S16, U32, F16, F32 |
+ * |U64 | U8, S8, U16, S16, U32, S32, F16, F32 |
+ * |S64 | U8, S8, U16, S16, U32, S32, F16, F32 |
+ * |F16 | U8, S8, U16, S16, S32, U32, F32 |
+ * |F32 | U8, S8, U16, S16, S32, U32, F16 |
*
- * - U8 -> S8, U16, S16, U32, S32, F16, F32
- * - U16 -> U8, S8, S16, U32, S32, F16, F32
- * - S16 -> U8, S8, U16, U32, S32, F16, F32
- * - U32 -> U8, S8, U16, S16, S32, F16, F32
- * - S32 -> U8, S8, U16, S16, U32, F16, F32
- * - F16 -> U8, S8, U16, S16, U32, F32
- * - F32 -> U8, S8, U16, S16, U32, F16
+ * Input data type must be different than output data type.
*
- * @param[in] input The input tensor to convert. Data types supported: U8/S8/U16/S16/U32/S32/F16/F32.
+ * @param[in] input The input tensor to convert. Data types supported: U8/S8/U16/S16/U32/S32/U64/S64/F16/F32.
* @param[out] output The output tensor. Data types supported: U8/S8/U16/S16/U32/S32/F16/F32.
* @param[in] policy Conversion policy.
*/
void configure(const ICLTensor *input, ICLTensor *output, ConvertPolicy policy);
- /** Initialize the function's source, destination
- *
- * Input data type must be different than output data type.
- *
- * Valid conversions Input -> Output :
- *
- * - U8 -> S8, U16, S16, U32, S32, F16, F32
- * - U16 -> U8, S8, S16, U32, S32, F16, F32
- * - S16 -> U8, S8, U16, U32, S32, F16, F32
- * - U32 -> U8, S8, U16, S16, S32, F16, F32
- * - S32 -> U8, S8, U16, S16, U32, F16, F32
- * - F16 -> U8, S8, U16, S16, U32, F32
- * - F32 -> U8, S8, U16, S16, U32, F16
- *
- * @param[in] compile_context The compile context to be used.
- * @param[in] input The input tensor to convert. Data types supported: U8/S8/U16/S16/U32/S32/F16/F32.
- * @param[out] output The output tensor. Data types supported: U8/S8/U16/S16/U32/S32/F16/F32.
- * @param[in] policy Conversion policy.
- */
- void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, ConvertPolicy policy);
+ // Initialize the function's source, destination
+ void
+ configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, ConvertPolicy policy);
/** Static function to check if given info will lead to a valid configuration of @ref CLCast
*
- * @param[in] input Source tensor info. Data types supported: U8/S8/U16/S16/U32/S32/F16/F32.
+ * @param[in] input Source tensor info. Data types supported: U8/S8/U16/S16/U32/S32/U64/S64/F16/F32.
* @param[in] output Destination tensor info. Data type supported: U8/S8/U16/S16/U32/S32/F16/F32.
* @param[in] policy Conversion policy.
*
* @return a status
*/
static Status validate(const ITensorInfo *input, const ITensorInfo *output, ConvertPolicy policy);
+
+ // Inherited methods overridden:
+ void run() override;
+
+private:
+ struct Impl;
+ std::unique_ptr<Impl> _impl;
};
} // namespace arm_compute
#endif /*ARM_COMPUTE_CLCAST_H*/
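A minimal sketch of the cast interface above, converting U8 to F32; the shape and policy are illustrative, and the input and output types must differ.

    #include "arm_compute/core/TensorInfo.h"
    #include "arm_compute/runtime/CL/CLScheduler.h"
    #include "arm_compute/runtime/CL/CLTensor.h"
    #include "arm_compute/runtime/CL/functions/CLCast.h"

    using namespace arm_compute;

    void cast_sketch()
    {
        CLScheduler::get().default_init();

        CLTensor src, dst;
        src.allocator()->init(TensorInfo(TensorShape(64U), 1, DataType::U8));
        dst.allocator()->init(TensorInfo(TensorShape(64U), 1, DataType::F32)); // must differ from the source type

        CLCast cast;
        cast.configure(&src, &dst, ConvertPolicy::SATURATE);

        src.allocator()->allocate();
        dst.allocator()->allocate();
        cast.run();
    }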
diff --git a/arm_compute/runtime/CL/functions/CLChannelCombine.h b/arm_compute/runtime/CL/functions/CLChannelCombine.h
deleted file mode 100644
index 474830d7af..0000000000
--- a/arm_compute/runtime/CL/functions/CLChannelCombine.h
+++ /dev/null
@@ -1,77 +0,0 @@
-/*
- * Copyright (c) 2016-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CLCHANNELCOMBINE_H
-#define ARM_COMPUTE_CLCHANNELCOMBINE_H
-
-#include "arm_compute/runtime/CL/ICLSimpleFunction.h"
-
-namespace arm_compute
-{
-class ICLMultiImage;
-class ICLTensor;
-using ICLImage = ICLTensor;
-
-/** Basic function to run @ref CLChannelCombineKernel to perform channel combination. */
-class CLChannelCombine : public ICLSimpleFunction
-{
-public:
- /** Initialize function's inputs and outputs.
- *
- * @param[in] plane0 The 2D plane that forms channel 0. Must be of U8 format.
- * @param[in] plane1 The 2D plane that forms channel 1. Must be of U8 format.
- * @param[in] plane2 The 2D plane that forms channel 2. Must be of U8 format.
- * @param[in] plane3 The 2D plane that forms channel 3. Must be of U8 format.
- * @param[out] output The single planar output tensor.
- */
- void configure(const ICLTensor *plane0, const ICLTensor *plane1, const ICLTensor *plane2, const ICLTensor *plane3, ICLTensor *output);
- /** Initialize function's inputs and outputs.
- *
- * @param[in] compile_context The compile context to be used.
- * @param[in] plane0 The 2D plane that forms channel 0. Must be of U8 format.
- * @param[in] plane1 The 2D plane that forms channel 1. Must be of U8 format.
- * @param[in] plane2 The 2D plane that forms channel 2. Must be of U8 format.
- * @param[in] plane3 The 2D plane that forms channel 3. Must be of U8 format.
- * @param[out] output The single planar output tensor.
- */
- void configure(const CLCompileContext &compile_context, const ICLTensor *plane0, const ICLTensor *plane1, const ICLTensor *plane2, const ICLTensor *plane3, ICLTensor *output);
- /** Initialize function's inputs and outputs.
- *
- * @param[in] plane0 The 2D plane that forms channel 0. Must be of U8 format.
- * @param[in] plane1 The 2D plane that forms channel 1. Must be of U8 format.
- * @param[in] plane2 The 2D plane that forms channel 2. Must be of U8 format.
- * @param[out] output The multi planar output image.
- */
- void configure(const ICLImage *plane0, const ICLImage *plane1, const ICLImage *plane2, ICLMultiImage *output);
- /** Initialize function's inputs and outputs.
- *
- * @param[in] compile_context The compile context to be used.
- * @param[in] plane0 The 2D plane that forms channel 0. Must be of U8 format.
- * @param[in] plane1 The 2D plane that forms channel 1. Must be of U8 format.
- * @param[in] plane2 The 2D plane that forms channel 2. Must be of U8 format.
- * @param[out] output The multi planar output image.
- */
- void configure(const CLCompileContext &compile_context, const ICLImage *plane0, const ICLImage *plane1, const ICLImage *plane2, ICLMultiImage *output);
-};
-}
-#endif /*ARM_COMPUTE_CLCHANNELCOMBINE_H*/
diff --git a/arm_compute/runtime/CL/functions/CLChannelExtract.h b/arm_compute/runtime/CL/functions/CLChannelExtract.h
deleted file mode 100644
index aa25516e18..0000000000
--- a/arm_compute/runtime/CL/functions/CLChannelExtract.h
+++ /dev/null
@@ -1,72 +0,0 @@
-/*
- * Copyright (c) 2016-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CLCHANNELEXTRACT_H
-#define ARM_COMPUTE_CLCHANNELEXTRACT_H
-
-#include "arm_compute/core/Types.h"
-#include "arm_compute/runtime/CL/ICLSimpleFunction.h"
-
-namespace arm_compute
-{
-class ICLMultiImage;
-class ICLTensor;
-using ICLImage = ICLTensor;
-
-/** Basic function to run @ref CLChannelExtractKernel to perform channel extraction. */
-class CLChannelExtract : public ICLSimpleFunction
-{
-public:
- /** Initialize the function's source, destination
- *
- * @param[in] input The input tensor to extract the channel from. Formats supported: RGB888/RGBA8888/YUYV422/UYVY422
- * @param[in] channel The channel to extract.
- * @param[out] output The extracted channel. Must be of U8 format.
- */
- void configure(const ICLTensor *input, Channel channel, ICLTensor *output);
- /** Initialize the function's source, destination
- *
- * @param[in] compile_context The compile context to be used.
- * @param[in] input The input tensor to extract the channel from. Formats supported: RGB888/RGBA8888/YUYV422/UYVY422
- * @param[in] channel The channel to extract.
- * @param[out] output The extracted channel. Must be of U8 format.
- */
- void configure(const CLCompileContext &compile_context, const ICLTensor *input, Channel channel, ICLTensor *output);
- /** Initialize the function's source, destination
- *
- * @param[in] input The multi-planar input image to extract channel from. Formats supported: NV12/NV21/IYUV/YUV444
- * @param[in] channel The channel to extract.
- * @param[out] output The extracted 2D channel. Must be of U8 format.
- */
- void configure(const ICLMultiImage *input, Channel channel, ICLImage *output);
- /** Initialize the function's source, destination
- *
- * @param[in] compile_context The compile context to be used.
- * @param[in] input The multi-planar input image to extract channel from. Formats supported: NV12/NV21/IYUV/YUV444
- * @param[in] channel The channel to extract.
- * @param[out] output The extracted 2D channel. Must be of U8 format.
- */
- void configure(const CLCompileContext &compile_context, const ICLMultiImage *input, Channel channel, ICLImage *output);
-};
-}
-#endif /*ARM_COMPUTE_CLCHANNELEXTRACT_H*/
diff --git a/arm_compute/runtime/CL/functions/CLChannelShuffleLayer.h b/arm_compute/runtime/CL/functions/CLChannelShuffleLayer.h
index 183a2f1ea6..8ca848a020 100644
--- a/arm_compute/runtime/CL/functions/CLChannelShuffleLayer.h
+++ b/arm_compute/runtime/CL/functions/CLChannelShuffleLayer.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018-2020 ARM Limited.
+ * Copyright (c) 2018-2021 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -24,11 +24,14 @@
#ifndef ARM_COMPUTE_CLCHANNELSHUFFLELAYER_H
#define ARM_COMPUTE_CLCHANNELSHUFFLELAYER_H
+#include "arm_compute/core/Error.h"
#include "arm_compute/runtime/CL/ICLSimpleFunction.h"
namespace arm_compute
{
+class CLCompileContext;
class ICLTensor;
+class ITensorInfo;
/** Basic function to run @ref CLChannelShuffleLayerKernel
*
@@ -41,6 +44,15 @@ class CLChannelShuffleLayer : public ICLSimpleFunction
public:
/** Initialize the function
*
+ * Valid data layouts:
+ * - NCHW
+ * - NHWC
+ *
+ * Valid data type configurations:
+ * |src |dst |
+ * |:--------------|:--------------|
+ * |All |All |
+ *
* @param[in] input Input tensor. Data types supported: All.
* @param[out] output Output tensor. Data type supported: Same as @p input
* @param[in] num_groups Number of groups. Must be greater than 1 and the number of channels of the tensors must be a multiple of the number of groups.
@@ -53,7 +65,10 @@ public:
* @param[out] output Output tensor. Data type supported: Same as @p input
* @param[in] num_groups Number of groups. Must be greater than 1 and the number of channels of the tensors must be a multiple of the number of groups.
*/
- void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, unsigned int num_groups);
+ void configure(const CLCompileContext &compile_context,
+ const ICLTensor *input,
+ ICLTensor *output,
+ unsigned int num_groups);
/** Static function to check if given info will lead to a valid configuration of @ref CLChannelShuffleLayerKernel
*
* @param[in] input Input tensor info. Data types supported: All.
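A minimal configure sketch for the shuffle above; the 8-channel shape and num_groups = 2 are illustrative, and the channel count must be a multiple of num_groups.

    #include "arm_compute/core/TensorInfo.h"
    #include "arm_compute/runtime/CL/CLTensor.h"
    #include "arm_compute/runtime/CL/functions/CLChannelShuffleLayer.h"

    using namespace arm_compute;

    void channel_shuffle_sketch()
    {
        CLTensor src, dst;
        // NCHW shape in [W, H, C] order: 8 channels, shuffled in 2 groups of 4.
        src.allocator()->init(TensorInfo(TensorShape(16U, 16U, 8U), 1, DataType::F32));
        dst.allocator()->init(TensorInfo(TensorShape(16U, 16U, 8U), 1, DataType::F32));

        CLChannelShuffleLayer shuffle;
        shuffle.configure(&src, &dst, 2 /* num_groups */);
        // Allocate the tensors and call run() as in the earlier sketches.
    }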
diff --git a/arm_compute/runtime/CL/functions/CLColorConvert.h b/arm_compute/runtime/CL/functions/CLColorConvert.h
deleted file mode 100644
index 8721e8afa1..0000000000
--- a/arm_compute/runtime/CL/functions/CLColorConvert.h
+++ /dev/null
@@ -1,100 +0,0 @@
-/*
- * Copyright (c) 2016-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CLCOLORCONVERT_H
-#define ARM_COMPUTE_CLCOLORCONVERT_H
-
-#include "arm_compute/runtime/CL/ICLSimpleFunction.h"
-
-namespace arm_compute
-{
-class ICLMultiImage;
-class ICLTensor;
-using ICLImage = ICLTensor;
-
-/** Basic function to run @ref CLColorConvertKernel
- *
- * @note The function performs color convert between images.
- */
-class CLColorConvert : public ICLSimpleFunction
-{
-public:
- /** Initialize the function's source, destination
- *
- * @param[in] input Source tensor. Formats supported: RGBA8888/UYVY422/YUYV422/RGB888
- * @param[out] output Destination tensor. Formats supported: RGB888 (if the formats of @p input are RGBA8888/UYVY422/YUYV422),
- * RGBA8888 (if the formats of @p input are UYVY422/YUYV422/RGB888/),
- * U8 (if the formats of @p input is RGB888)
- */
- void configure(const ICLTensor *input, ICLTensor *output);
- /** Initialize the function's source, destination
- *
- * @param[in] compile_context The compile context to be used.
- * @param[in] input Source tensor. Formats supported: RGBA8888/UYVY422/YUYV422/RGB888
- * @param[out] output Destination tensor. Formats supported: RGB888 (if the formats of @p input are RGBA8888/UYVY422/YUYV422),
- * RGBA8888 (if the formats of @p input are UYVY422/YUYV422/RGB888/),
- * U8 (if the formats of @p input is RGB888)
- */
- void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output);
- /** Initialize the function's source, destination
- *
- * @param[in] input Multi-planar source image. Formats supported: NV12/NV21/IYUV
- * @param[out] output Single-planar destination image. Formats supported: RGB888/RGBA8888
- */
- void configure(const ICLMultiImage *input, ICLImage *output);
- /** Initialize the function's source, destination
- *
- * @param[in] compile_context The compile context to be used.
- * @param[in] input Multi-planar source image. Formats supported: NV12/NV21/IYUV
- * @param[out] output Single-planar destination image. Formats supported: RGB888/RGBA8888
- */
- void configure(const CLCompileContext &compile_context, const ICLMultiImage *input, ICLImage *output);
- /** Initialize the function's source, destination
- *
- * @param[in] input Single-planar source image. Formats supported: RGB888/RGBA8888/UYVY422/YUYV422
- * @param[out] output Multi-planar destination image. Formats supported: NV12/IYUV/YUV444 (if the formats of @p input are RGB888/RGB8888)
- */
- void configure(const ICLImage *input, ICLMultiImage *output);
- /** Initialize the function's source, destination
- *
- * @param[in] compile_context The compile context to be used.
- * @param[in] input Single-planar source image. Formats supported: RGB888/RGBA8888/UYVY422/YUYV422
- * @param[out] output Multi-planar destination image. Formats supported: NV12/IYUV/YUV444 (if the formats of @p input are RGB888/RGB8888)
- */
- void configure(const CLCompileContext &compile_context, const ICLImage *input, ICLMultiImage *output);
- /** Initialize the function's source, destination
- *
- * @param[in] input Multi-planar source image. Formats supported: NV12/NV21/IYUV
- * @param[out] output Multi-planar destination image. Formats supported: YUV444/IYUV (if the formats of @p input are NV12/NV21)/NV12 (if the format of @p input is IYUV)
- */
- void configure(const ICLMultiImage *input, ICLMultiImage *output);
- /** Initialize the function's source, destination
- *
- * @param[in] compile_context The compile context to be used.
- * @param[in] input Multi-planar source image. Formats supported: NV12/NV21/IYUV
- * @param[out] output Multi-planar destination image. Formats supported: YUV444/IYUV (if the formats of @p input are NV12/NV21)/NV12 (if the format of @p input is IYUV)
- */
- void configure(const CLCompileContext &compile_context, const ICLMultiImage *input, ICLMultiImage *output);
-};
-}
-#endif /* ARM_COMPUTE_CLCOLORCONVERT_H */
diff --git a/arm_compute/runtime/CL/functions/CLComparison.h b/arm_compute/runtime/CL/functions/CLComparison.h
index 4e681e73a7..fca4b168b0 100644
--- a/arm_compute/runtime/CL/functions/CLComparison.h
+++ b/arm_compute/runtime/CL/functions/CLComparison.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018-2020 ARM Limited.
+ * Copyright (c) 2018-2021 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -30,7 +30,9 @@
namespace arm_compute
{
// Forward declarations
+class CLCompileContext;
class ICLTensor;
+class ITensorInfo;
/** Basic function to run @ref CLComparisonKernel */
class CLComparison : public ICLSimpleFunction
@@ -38,6 +40,14 @@ class CLComparison : public ICLSimpleFunction
public:
/** Initialise the kernel's inputs and outputs.
*
+ * Valid data layouts:
+ * - All
+ *
+ * Valid data type configurations:
+ * |src0 |src1 |dst |
+ * |:--------|:--------|:--------|
+ * |All |All |U8 |
+ *
* @param[in] input1 Source tensor. Data types supported: All.
* The input1 tensor is [in, out] because its TensorInfo might be modified inside the kernel in case of broadcasting of dimension 0.
* @param[in] input2 Source tensor. Data types supported: Same as @p input1.
@@ -56,7 +66,11 @@ public:
* @param[out] output Destination tensor. Data types supported: U8.
 * @param[in] operation Comparison operation to be used.
*/
- void configure(const CLCompileContext &compile_context, ICLTensor *input1, ICLTensor *input2, ICLTensor *output, ComparisonOperation operation);
+ void configure(const CLCompileContext &compile_context,
+ ICLTensor *input1,
+ ICLTensor *input2,
+ ICLTensor *output,
+ ComparisonOperation operation);
/** Static function to check if given info will lead to a valid configuration of @ref CLComparison
*
* @param[in] input1 Source tensor. Data types supported: All.
@@ -66,7 +80,10 @@ public:
*
* @return a status
*/
- static Status validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, ComparisonOperation operation);
+ static Status validate(const ITensorInfo *input1,
+ const ITensorInfo *input2,
+ const ITensorInfo *output,
+ ComparisonOperation operation);
};
/** Basic function to run @ref CLComparisonKernel */
@@ -79,7 +96,7 @@ public:
public:
/** Initialise the kernel's inputs and outputs.
*
- * @param[in] input1 Source tensor. Data types supported: U8/S8/QASYMM8/U16/S16/U32/S32/F16/F32.
+ * @param[in] input1 Source tensor. Data types supported: All.
* The input1 tensor is [in, out] because its TensorInfo might be modified inside the kernel in case of broadcasting of dimension 0.
* @param[in] input2 Source tensor. Data types supported: Same as @p input1.
* The input2 tensor is [in, out] because its TensorInfo might be modified inside the kernel in case of broadcasting of dimension 0.
@@ -92,7 +109,7 @@ public:
/** Initialise the kernel's inputs and outputs.
*
* @param[in] compile_context The compile context to be used.
- * @param[in] input1 Source tensor. Data types supported: U8/S8/QASYMM8/U16/S16/U32/S32/F16/F32.
+ * @param[in] input1 Source tensor. Data types supported: All.
* The input1 tensor is [in, out] because its TensorInfo might be modified inside the kernel in case of broadcasting of dimension 0.
* @param[in] input2 Source tensor. Data types supported: Same as @p input1.
* The input2 tensor is [in, out] because its TensorInfo might be modified inside the kernel in case of broadcasting of dimension 0.
@@ -101,7 +118,7 @@ public:
void configure(const CLCompileContext &compile_context, ICLTensor *input1, ICLTensor *input2, ICLTensor *output);
/** Static function to check if given info will lead to a valid configuration of @ref CLComparison
*
- * @param[in] input1 Source tensor. Data types supported: U8/S8/QASYMM8/U16/S16/U32/S32/F16/F32.
+ * @param[in] input1 Source tensor. Data types supported: All.
* @param[in] input2 Source tensor. Data types supported: Same as @p input1.
* @param[in] output Destination tensor. Data types supported: U8.
*
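A minimal sketch of the comparison contract above: any input type, U8 output. The F32 inputs and the Greater operation are illustrative.

    #include "arm_compute/core/TensorInfo.h"
    #include "arm_compute/core/Types.h"
    #include "arm_compute/runtime/CL/CLTensor.h"
    #include "arm_compute/runtime/CL/functions/CLComparison.h"

    using namespace arm_compute;

    void comparison_sketch()
    {
        CLTensor lhs, rhs, out;
        lhs.allocator()->init(TensorInfo(TensorShape(32U), 1, DataType::F32));
        rhs.allocator()->init(TensorInfo(TensorShape(32U), 1, DataType::F32));
        out.allocator()->init(TensorInfo(TensorShape(32U), 1, DataType::U8)); // comparison results are always U8

        CLComparison cmp;
        cmp.configure(&lhs, &rhs, &out, ComparisonOperation::Greater); // out[i] = (lhs[i] > rhs[i])
    }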
diff --git a/arm_compute/runtime/CL/functions/CLComputeAllAnchors.h b/arm_compute/runtime/CL/functions/CLComputeAllAnchors.h
deleted file mode 100644
index 15c5bfeb7d..0000000000
--- a/arm_compute/runtime/CL/functions/CLComputeAllAnchors.h
+++ /dev/null
@@ -1,71 +0,0 @@
-/*
- * Copyright (c) 2019-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CLCOMPUTEALLANCHORS_H
-#define ARM_COMPUTE_CLCOMPUTEALLANCHORS_H
-
-#include "arm_compute/core/CL/kernels/CLGenerateProposalsLayerKernel.h"
-#include "arm_compute/runtime/CL/ICLSimpleFunction.h"
-
-namespace arm_compute
-{
-class ICLTensor;
-
-/** Basic function to run @ref CLComputeAllAnchorsKernel.
- *
- * This function calls the following OpenCL kernels:
- * -# @ref CLComputeAllAnchorsKernel
- */
-class CLComputeAllAnchors : public ICLSimpleFunction
-{
-public:
- /** Set the input and output tensors.
- *
- * @param[in] anchors Source tensor. Original set of anchors of size (4, A) where A is the number of anchors. Data types supported: F16/F32
- * @param[out] all_anchors Destination tensor. Destination anchors of size (4, H*W*A) where H and W are the height and width of the feature map and A is the number of anchors. Data types supported: Same as @p input
- * @param[in] info Contains Compute Anchors operation information described in @ref ComputeAnchorsInfo
- *
- */
- void configure(const ICLTensor *anchors, ICLTensor *all_anchors, const ComputeAnchorsInfo &info);
- /** Set the input and output tensors.
- *
- * @param[in] compile_context The compile context to be used.
- * @param[in] anchors Source tensor. Original set of anchors of size (4, A) where A is the number of anchors. Data types supported: F16/F32
- * @param[out] all_anchors Destination tensor. Destination anchors of size (4, H*W*A) where H and W are the height and width of the feature map and A is the number of anchors. Data types supported: Same as @p input
- * @param[in] info Contains Compute Anchors operation information described in @ref ComputeAnchorsInfo
- *
- */
- void configure(const CLCompileContext &compile_context, const ICLTensor *anchors, ICLTensor *all_anchors, const ComputeAnchorsInfo &info);
-
- /** Static function to check if given info will lead to a valid configuration of @ref CLComputeAllAnchorsKernel
- *
- * @param[in] anchors Source tensor info. Original set of anchors of size (4, A) where A is the number of anchors. Data types supported: F16/F32
- * @param[in] all_anchors Destination tensor info. Destination anchors of size (4, H*W*A) where H and W are the height and width of the feature map and A is the number of anchors. Data types supported: Same as @p input
- * @param[in] info Contains Compute Anchors operation information described in @ref ComputeAnchorsInfo
- *
- * @return a Status
- */
- static Status validate(const ITensorInfo *anchors, const ITensorInfo *all_anchors, const ComputeAnchorsInfo &info);
-};
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_CLCOMPUTEALLANCOHORS_H */
diff --git a/arm_compute/runtime/CL/functions/CLConcatenateLayer.h b/arm_compute/runtime/CL/functions/CLConcatenateLayer.h
index b8e3361e9e..88c4bed595 100644
--- a/arm_compute/runtime/CL/functions/CLConcatenateLayer.h
+++ b/arm_compute/runtime/CL/functions/CLConcatenateLayer.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018-2020 ARM Limited.
+ * Copyright (c) 2018-2021 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -24,10 +24,8 @@
#ifndef ARM_COMPUTE_CLCONCATENATELAYER_H
#define ARM_COMPUTE_CLCONCATENATELAYER_H
-#include "arm_compute/runtime/IFunction.h"
-
-#include "arm_compute/core/CL/ICLKernel.h"
#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/IFunction.h"
#include <memory>
#include <vector>
@@ -35,72 +33,92 @@
namespace arm_compute
{
// Forward declarations
+class CLCompileContext;
class ICLTensor;
+class ICLKernel;
class ITensorInfo;
class Status;
/** Basic function to execute concatenate tensors along a given axis. This function calls the following kernels:
*
- * -# @ref CLWidthConcatenateLayerKernel (if underlying concatenation axis is 0).
- * -# @ref CLHeightConcatenateLayerKernel (if underlying concatenation axis is 1).
- * -# @ref CLDepthConcatenateLayerKernel (if underlying concatenation axis is 2).
- * -# @ref CLBatchConcatenateLayerKernel (if underlying concatenation axis is 3).
+ * -# @ref opencl::kernels::ClWidthConcatenateKernel (if underlying concatenation axis is 0).
+ * -# @ref opencl::kernels::ClHeightConcatenateKernel (if underlying concatenation axis is 1).
+ * -# @ref opencl::kernels::ClDepthConcatenateKernel (if underlying concatenation axis is 2).
+ * -# @ref opencl::kernels::ClBatchConcatenateKernel (if underlying concatenation axis is 3).
*/
class CLConcatenateLayer : public IFunction
{
public:
/** Default constructor */
CLConcatenateLayer();
+ /** Destructor */
+ ~CLConcatenateLayer();
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ CLConcatenateLayer(const CLConcatenateLayer &) = delete;
+ /** Default move constructor */
+ CLConcatenateLayer(CLConcatenateLayer &&);
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ CLConcatenateLayer &operator=(const CLConcatenateLayer &) = delete;
+ /** Default move assignment operator */
+ CLConcatenateLayer &operator=(CLConcatenateLayer &&);
/** Initialise the kernel's inputs vector and output.
*
+ * Valid data layouts:
+ * - All
+ *
+ * Valid data type configurations:
+ * |src |dst |
+ * |:--------------|:--------------|
+ * |QASYMM8 |QASYMM8 |
+ * |QASYMM8_SIGNED |QASYMM8_SIGNED |
+ * |F16 |F16 |
+ * |F32 |F32 |
+ *
 * @note Input and output tensor dimensions preconditions differ depending on the concatenation axis.
- * @note Preconditions can be found respectively at @ref CLWidthConcatenateLayerKernel, @ref CLHeightConcatenateLayerKernel and @ref CLDepthConcatenateLayerKernel.
+ * @note Preconditions can be found respectively at @ref opencl::kernels::ClWidthConcatenateKernel,
+ * @ref opencl::kernels::ClHeightConcatenateKernel and @ref opencl::kernels::ClDepthConcatenateKernel.
*
- * @param[in,out] inputs_vector The vectors containing all the tensors to concatenate. Data types supported: All.
+ * @param[in,out] inputs_vector The vector containing all the tensors to concatenate. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32.
* @param[out] output Output tensor. Data types supported: Same as @p input.
* @param[in] axis Concatenation axis. Supported underlying concatenation axis are 0, 1, 2 and 3.
*/
- void configure(std::vector<ICLTensor *> &inputs_vector, ICLTensor *output, size_t axis);
void configure(std::vector<const ICLTensor *> &inputs_vector, ICLTensor *output, size_t axis);
/** Initialise the kernel's inputs vector and output.
*
 * @note Input and output tensor dimensions preconditions differ depending on the concatenation axis.
- * @note Preconditions can be found respectively at @ref CLWidthConcatenateLayerKernel, @ref CLHeightConcatenateLayerKernel and @ref CLDepthConcatenateLayerKernel.
+ * @note Preconditions can be found respectively at @ref opencl::kernels::ClWidthConcatenateKernel,
+ * @ref opencl::kernels::ClHeightConcatenateKernel and @ref opencl::kernels::ClDepthConcatenateKernel.
*
* @param[in] compile_context The compile context to be used.
- * @param[in,out] inputs_vector The vectors containing all the tensors to concatenate. Data types supported: All.
+ * @param[in,out] inputs_vector The vector containing all the tensors to concatenate. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32.
* @param[out] output Output tensor. Data types supported: Same as @p input.
* @param[in] axis Concatenation axis. Supported underlying concatenation axis are 0, 1, 2 and 3.
*/
- void configure(const CLCompileContext &compile_context, std::vector<ICLTensor *> &inputs_vector, ICLTensor *output, size_t axis);
- void configure(const CLCompileContext &compile_context, std::vector<const ICLTensor *> &inputs_vector, ICLTensor *output, size_t axis);
+ void configure(const CLCompileContext &compile_context,
+ std::vector<const ICLTensor *> &inputs_vector,
+ ICLTensor *output,
+ size_t axis);
/** Static function to check if given info will lead to a valid configuration of @ref CLConcatenateLayer
*
 * @note Input and output tensor dimensions preconditions differ depending on the concatenation axis.
- * @note Preconditions can be found respectively at @ref CLWidthConcatenateLayerKernel, @ref CLHeightConcatenateLayerKernel and @ref CLDepthConcatenateLayerKernel.
+ * @note Preconditions can be found respectively at @ref opencl::kernels::ClWidthConcatenateKernel,
+ * @ref opencl::kernels::ClHeightConcatenateKernel and @ref opencl::kernels::ClDepthConcatenateKernel.
*
- * @param[in] inputs_vector The vectors containing all the tensors info to concatenate. Data types supported: All.
+ * @param[in] inputs_vector The vector containing the info of all the tensors to concatenate. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32.
* @param[in] output Output tensor info. Data types supported: Same as @p input.
* @param[in] axis Concatenation axis. Supported underlying concatenation axis are 0, 1, 2 and 3.
*
* @return a status
*/
- static Status validate(const std::vector<ITensorInfo *> &inputs_vector, const ITensorInfo *output, size_t axis);
- static Status validate(const std::vector<const ITensorInfo *> &inputs_vector, const ITensorInfo *output, size_t axis);
+ static Status
+ validate(const std::vector<const ITensorInfo *> &inputs_vector, const ITensorInfo *output, size_t axis);
// Inherited methods overridden:
void run() override;
private:
- template <typename TensorType>
- void configure_internal(const CLCompileContext &compile_context, std::vector<TensorType *> &&inputs_vector, ICLTensor *output, size_t axis);
-
- template <typename TensorInfoType>
- static Status validate_internal(const std::vector<TensorInfoType *> &inputs_vector, const ITensorInfo *output, size_t axis);
-
- std::vector<std::unique_ptr<ICLKernel>> _concat_kernels;
- unsigned int _num_inputs;
- unsigned int _axis;
+ struct Impl;
+ std::unique_ptr<Impl> _impl;
};
} // namespace arm_compute
#endif /* ARM_COMPUTE_CLCONCATENATELAYER_H */
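A minimal sketch of the const-tensor interface that survives the cleanup above, concatenating two F32 tensors along axis 0; the shapes are illustrative.

    #include "arm_compute/core/TensorInfo.h"
    #include "arm_compute/runtime/CL/CLTensor.h"
    #include "arm_compute/runtime/CL/functions/CLConcatenateLayer.h"

    #include <vector>

    using namespace arm_compute;

    void concatenate_sketch()
    {
        CLTensor a, b, out;
        a.allocator()->init(TensorInfo(TensorShape(8U, 4U), 1, DataType::F32));
        b.allocator()->init(TensorInfo(TensorShape(8U, 4U), 1, DataType::F32));
        out.allocator()->init(TensorInfo(TensorShape(16U, 4U), 1, DataType::F32)); // widths add up along axis 0

        std::vector<const ICLTensor *> inputs = {&a, &b};

        CLConcatenateLayer concat;
        concat.configure(inputs, &out, 0 /* axis */);
        // Allocate the tensors and call run() as in the earlier sketches.
    }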
diff --git a/arm_compute/runtime/CL/functions/CLConv3D.h b/arm_compute/runtime/CL/functions/CLConv3D.h
new file mode 100644
index 0000000000..aabaf01ab7
--- /dev/null
+++ b/arm_compute/runtime/CL/functions/CLConv3D.h
@@ -0,0 +1,116 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CLCONVOLUTION3DLAYER_H
+#define ARM_COMPUTE_CLCONVOLUTION3DLAYER_H
+
+#include "arm_compute/runtime/IFunction.h"
+
+#include <memory>
+
+namespace arm_compute
+{
+class CLCompileContext;
+class ICLTensor;
+class ITensorInfo;
+struct Conv3dInfo;
+class Status;
+
+/** Basic function to compute the convolution3d layer. This function calls the following OpenCL kernels/functions:
+ *
+ * -# @ref opencl::ClDirectConv3d
+ */
+class CLConv3D : public IFunction
+{
+public:
+ /** Constructor */
+ CLConv3D();
+ /** Destructor */
+ ~CLConv3D();
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ CLConv3D(const CLConv3D &) = delete;
+ /** Default move constructor */
+ CLConv3D(CLConv3D &&) = default;
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ CLConv3D &operator=(const CLConv3D &) = delete;
+ /** Default move assignment operator */
+ CLConv3D &operator=(CLConv3D &&) = default;
+ /** Set the src and dst tensors.
+ *
+ * Valid data layouts:
+ * - NDHWC
+ *
+ * Valid data type configurations:
+ * |src0 |src1 |src2 |dst |
+ * |:--------------|:--------------|:------|:--------------|
+ * |F16 |F16 |F16 |F16 |
+ * |F32 |F32 |F32 |F32 |
+ * |QASYMM8 |QASYMM8 |S32 |QASYMM8 |
+ * |QASYMM8_SIGNED |QASYMM8_SIGNED |S32 |QASYMM8_SIGNED |
+ *
+ * @param[in] compile_context The compile context to be used.
+ * @param[in] src Source tensor. The 4 lower dimensions represent a single src [IFM, width, height, depth],
+ * while every optional dimension from 5 and above represents a batch of srcs.
+ * @param[in] weights Weights tensor. Weights are a 5D tensor with dimensions [OFM, IFM, kernel_w, kernel_h, kernel_d].
+ * @param[in] biases Biases tensor. Shared biases supported. Biases are a 1D tensor with dimensions [OFM].
+ * @param[out] dst Destination tensor. The 4 lower dimensions represent a single dst [OFM, width, height, depth], while the rest represent a batch of dsts.
+ * @param[in] conv3d_info Contains strides, padding, rounding, activation, dilation and fast math information. Activation and fast math are currently unused.
+ *
+ */
+ void configure(const CLCompileContext &compile_context,
+ const ICLTensor *src,
+ const ICLTensor *weights,
+ const ICLTensor *biases,
+ ICLTensor *dst,
+ const Conv3dInfo &conv3d_info);
+ /** Set the src and dst tensors.
+ *
+ * Similar to CLConv3D::configure() but using the default compile context
+ *
+ */
+ void configure(const ICLTensor *src,
+ const ICLTensor *weights,
+ const ICLTensor *biases,
+ ICLTensor *dst,
+ const Conv3dInfo &conv3d_info);
+ /** Static function to check if given info will lead to a valid configuration of @ref CLConv3D
+ *
+ * Similar to CLConv3D::configure()
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *src,
+ const ITensorInfo *weights,
+ const ITensorInfo *biases,
+ const ITensorInfo *dst,
+ const Conv3dInfo &conv3d_info);
+
+ // Inherited methods overridden:
+ void run() override;
+
+private:
+ struct Impl;
+ std::unique_ptr<Impl> _impl;
+};
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_CLCONVOLUTION3DLAYER_H */
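
Usage sketch (not part of this patch): the new CLConv3D follows the library's usual configure()/validate()/run() flow. The header path, the assumption that Conv3dInfo (from arm_compute/runtime/FunctionDescriptors.h) default-constructs to unit strides and no padding, and all tensor shapes below are illustrative, not taken from this diff.

#include "arm_compute/core/Error.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
#include "arm_compute/runtime/CL/CLTensor.h"
#include "arm_compute/runtime/CL/functions/CLConv3D.h" // assumed location of the new header
#include "arm_compute/runtime/FunctionDescriptors.h"   // Conv3dInfo

using namespace arm_compute;

void conv3d_sketch()
{
    CLScheduler::get().default_init(); // create a default OpenCL context and queue

    // NDHWC shapes (channels first in ACL dimension order):
    // src [IFM, W, H, D, N], weights [OFM, IFM, kw, kh, kd], dst [OFM, Wo, Ho, Do, N].
    TensorInfo src_info(TensorShape(16U, 8U, 8U, 4U, 1U), 1, DataType::F32);
    TensorInfo wei_info(TensorShape(32U, 16U, 3U, 3U, 3U), 1, DataType::F32);
    TensorInfo bia_info(TensorShape(32U), 1, DataType::F32);
    TensorInfo dst_info(TensorShape(32U, 6U, 6U, 2U, 1U), 1, DataType::F32); // stride 1, no padding
    src_info.set_data_layout(DataLayout::NDHWC);
    wei_info.set_data_layout(DataLayout::NDHWC);
    dst_info.set_data_layout(DataLayout::NDHWC);

    const Conv3dInfo conv3d_info{}; // assumed defaults: unit strides, no padding, no activation

    // validate() mirrors configure() and can run before any allocation.
    const Status st = CLConv3D::validate(&src_info, &wei_info, &bia_info, &dst_info, conv3d_info);
    if(st.error_code() != ErrorCode::OK)
    {
        return;
    }

    CLTensor src, weights, biases, dst;
    src.allocator()->init(src_info);
    weights.allocator()->init(wei_info);
    biases.allocator()->init(bia_info);
    dst.allocator()->init(dst_info);

    CLConv3D conv;
    conv.configure(&src, &weights, &biases, &dst, conv3d_info);

    src.allocator()->allocate();
    weights.allocator()->allocate();
    biases.allocator()->allocate();
    dst.allocator()->allocate();
    // ...fill src/weights/biases via map()/unmap()...

    conv.run();
    CLScheduler::get().sync(); // block until the queue has finished
}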
diff --git a/arm_compute/runtime/CL/functions/CLConvertFullyConnectedWeights.h b/arm_compute/runtime/CL/functions/CLConvertFullyConnectedWeights.h
index 123f6380bb..409430d595 100644
--- a/arm_compute/runtime/CL/functions/CLConvertFullyConnectedWeights.h
+++ b/arm_compute/runtime/CL/functions/CLConvertFullyConnectedWeights.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018-2020 ARM Limited.
+ * Copyright (c) 2018-2021, 2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -24,29 +24,55 @@
#ifndef ARM_COMPUTE_CLCONVERTFULLYCONNECTEDWEIGHTS_H
#define ARM_COMPUTE_CLCONVERTFULLYCONNECTEDWEIGHTS_H
-#include "arm_compute/core/CL/kernels/CLConvertFullyConnectedWeightsKernel.h"
+#include "arm_compute/core/CL/CLKernelLibrary.h"
#include "arm_compute/runtime/CL/CLTensor.h"
-#include "arm_compute/runtime/CL/ICLSimpleFunction.h"
+#include "arm_compute/runtime/IFunction.h"
#include "arm_compute/runtime/ITransformWeights.h"
+#include <memory>
+
namespace arm_compute
{
+class CLCompileContext;
class ICLTensor;
+class ITensorInfo;
-/** Basic function to run @ref CLConvertFullyConnectedWeightsKernel. */
-class CLConvertFullyConnectedWeights : public ICLSimpleFunction
+/** Basic function to run an @ref opencl::kernels::ClConvertFullyConnectedWeightsKernel. */
+class CLConvertFullyConnectedWeights : public IFunction
{
public:
+ /** Constructor */
+ CLConvertFullyConnectedWeights();
+ /** Destructor */
+ ~CLConvertFullyConnectedWeights();
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ CLConvertFullyConnectedWeights(const CLConvertFullyConnectedWeights &) = delete;
+ /** Default move constructor */
+ CLConvertFullyConnectedWeights(CLConvertFullyConnectedWeights &&) = default;
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ CLConvertFullyConnectedWeights &operator=(const CLConvertFullyConnectedWeights &) = delete;
+ /** Default move assignment operator */
+ CLConvertFullyConnectedWeights &operator=(CLConvertFullyConnectedWeights &&) = default;
/** Initialize the function.
*
+ * Valid data layouts:
+ * - NHWC
+ * - NCHW
+ *
+ * Valid data type configurations:
+ * |src |dst |
+ * |:--------------|:--------------|
+ * |All |All |
+ *
* @param[in] input Source weights tensor to convert. Must be 2 dimensional. Data types supported: All.
* @param[out] output The converted weights tensor. Shape and Data Type: Same as @p input.
* @param[in] original_input_shape Shape of the original input tensor (the one entering fully connected layer).
* @param[in] data_layout The data layout the weights have been trained in.
- *
- * @return A status
*/
- void configure(const ICLTensor *input, ICLTensor *output, const TensorShape &original_input_shape, DataLayout data_layout);
+ void configure(const ICLTensor *input,
+ ICLTensor *output,
+ const TensorShape &original_input_shape,
+ DataLayout data_layout);
/** Initialize the function.
*
* @param[in] compile_context The compile context to be used.
@@ -54,10 +80,12 @@ public:
* @param[out] output The converted weights tensor. Shape and Data Type: Same as @p input.
* @param[in] original_input_shape Shape of the original input tensor (the one entering fully connected layer).
* @param[in] data_layout The data layout the weights have been trained in.
- *
- * @return A status
*/
- void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, const TensorShape &original_input_shape, DataLayout data_layout);
+ void configure(const CLCompileContext &compile_context,
+ const ICLTensor *input,
+ ICLTensor *output,
+ const TensorShape &original_input_shape,
+ DataLayout data_layout);
/** Static function to check if given info will lead to a valid configuration of @ref CLConvertFullyConnectedWeights
*
* @param[in] input Source weights tensor info to convert. Must be 2 dimensional. Data types supported: All.
@@ -65,12 +93,22 @@ public:
* @param[in] original_input_shape Shape of the original input tensor (the one entering fully connected layer).
* @param[in] data_layout The data layout the weights have been trained in.
*/
- static Status validate(const ITensorInfo *input, const ITensorInfo *output, const TensorShape &original_input_shape, DataLayout data_layout);
+ static Status validate(const ITensorInfo *input,
+ const ITensorInfo *output,
+ const TensorShape &original_input_shape,
+ DataLayout data_layout);
+
+ // Inherited methods overridden:
+ void run() override;
+
+private:
+ struct Impl;
+ std::unique_ptr<Impl> _impl;
};
namespace weights_transformations
{
-/** Basic function to run @ref CLConvertFullyConnectedWeightsKernel. */
+/** Basic function to manage @ref CLConvertFullyConnectedWeights. */
class CLConvertFullyConnectedWeightsManaged : public ITransformWeights
{
public:
@@ -116,7 +154,10 @@ public:
* @param[in] original_input_shape Shape of the original input tensor (the one entering fully connected layer).
* @param[in] data_layout The data layout the weights have been trained in.
*/
- void configure(const CLCompileContext &compile_context, const ICLTensor *input, const TensorShape &original_input_shape, DataLayout data_layout)
+ void configure(const CLCompileContext &compile_context,
+ const ICLTensor *input,
+ const TensorShape &original_input_shape,
+ DataLayout data_layout)
{
_func.configure(compile_context, input, &_output, original_input_shape, data_layout);
}
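
Usage sketch (not part of this patch): converting fully connected weights that were trained in NCHW but follow a convolutional layer executed in another layout. The 3x3x8 original input shape and the row/column convention of the 2D weights (one row per element of the flattened feature map) are assumptions for illustration.

#include "arm_compute/runtime/CL/CLScheduler.h"
#include "arm_compute/runtime/CL/CLTensor.h"
#include "arm_compute/runtime/CL/functions/CLConvertFullyConnectedWeights.h"

using namespace arm_compute;

void convert_fc_weights_sketch()
{
    CLScheduler::get().default_init();

    // 2D FC weights: 128 outputs wide, 3 * 3 * 8 = 72 rows (convention assumed).
    CLTensor weights, converted;
    weights.allocator()->init(TensorInfo(TensorShape(128U, 72U), 1, DataType::F32));
    converted.allocator()->init(TensorInfo(TensorShape(128U, 72U), 1, DataType::F32));

    // Shape of the tensor that originally entered the fully connected layer,
    // and the layout the weights were trained in.
    const TensorShape original_input_shape(3U, 3U, 8U);

    CLConvertFullyConnectedWeights convert;
    convert.configure(&weights, &converted, original_input_shape, DataLayout::NCHW);

    weights.allocator()->allocate();
    converted.allocator()->allocate();
    convert.run();
    CLScheduler::get().sync();
}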
diff --git a/arm_compute/runtime/CL/functions/CLConvolution.h b/arm_compute/runtime/CL/functions/CLConvolution.h
deleted file mode 100644
index 72ef8ce7b8..0000000000
--- a/arm_compute/runtime/CL/functions/CLConvolution.h
+++ /dev/null
@@ -1,168 +0,0 @@
-/*
- * Copyright (c) 2016-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CLCONVOLUTION_H
-#define ARM_COMPUTE_CLCONVOLUTION_H
-
-#include "arm_compute/core/CL/kernels/CLConvolutionKernel.h"
-#include "arm_compute/core/CL/kernels/CLFillBorderKernel.h"
-#include "arm_compute/core/Types.h"
-#include "arm_compute/runtime/CL/CLTensor.h"
-#include "arm_compute/runtime/CL/ICLSimpleFunction.h"
-#include "arm_compute/runtime/IFunction.h"
-#include "arm_compute/runtime/IMemoryManager.h"
-#include "arm_compute/runtime/MemoryGroup.h"
-
-#include <cstdint>
-#include <memory>
-
-namespace arm_compute
-{
-class ICLTensor;
-
-/** Basic function to execute convolution of size 3x3. This function calls the following OpenCL kernels:
- *
- * -# @ref CLFillBorderKernel (executed if border_mode == CONSTANT or border_mode == REPLICATE)
- * -# @ref CLConvolution3x3Kernel
- *
- */
-class CLConvolution3x3 : public ICLSimpleFunction
-{
-public:
- /** Initialize the function's source, destination, conv and border_mode.
- *
- * @param[in,out] input Source tensor. Data types supported: U8. (Written to only for @p border_mode != UNDEFINED)
- * @param[out] output Destination tensor, Data types supported: U8 or S16.
- * @param[in] conv matrix_size x matrix_size S16 coefficients structured as a row-major 2D array in a linear buffer.
- * @param[in] scale Scale of the convolution matrix. If 0 is passed, it will be set to the sum of the coefficients of the convolution or 1 if they add up to 0.
- * @param[in] border_mode Strategy to use for borders.
- * @param[in] constant_border_value (Optional) Constant value to use for borders if border_mode is set to CONSTANT.
- */
- void configure(ICLTensor *input, ICLTensor *output, const int16_t *conv, uint32_t scale, BorderMode border_mode, uint8_t constant_border_value = 0);
- /** Initialize the function's source, destination, conv and border_mode.
- *
- * @param[in] compile_context The compile context to be used.
- * @param[in,out] input Source tensor. Data types supported: U8. (Written to only for @p border_mode != UNDEFINED)
- * @param[out] output Destination tensor, Data types supported: U8 or S16.
- * @param[in] conv matrix_size x matrix_size S16 coefficients structured as a row-major 2D array in a linear buffer.
- * @param[in] scale Scale of the convolution matrix. If 0 is passed, it will be set to the sum of the coefficients of the convolution or 1 if they add up to 0.
- * @param[in] border_mode Strategy to use for borders.
- * @param[in] constant_border_value (Optional) Constant value to use for borders if border_mode is set to CONSTANT.
- */
- void configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, const int16_t *conv, uint32_t scale, BorderMode border_mode, uint8_t constant_border_value = 0);
-};
-
-/** Basic function to execute square convolution. Currently it supports 5x5, 7x7, 9x9. This function calls the following OpenCL kernels:
- *
- * -# @ref CLFillBorderKernel (executed if border_mode == CONSTANT or border_mode == REPLICATE)
- * -# @ref CLConvolutionKernel or<br/>
- * @ref CLSeparableConvolutionHorKernel and @ref CLSeparableConvolutionVertKernel (if convolution matrix is separable)
- *
- */
-template <unsigned int matrix_size>
-class CLConvolutionSquare : public IFunction
-{
-public:
- /** Default constructor */
- CLConvolutionSquare(std::shared_ptr<IMemoryManager> memory_manager = nullptr);
- /** Initialize the function's source, destination, conv and border_mode.
- *
- * @param[in,out] input Source tensor. Data types supported: U8. (Written to only for @p border_mode != UNDEFINED)
- * @param[out] output Destination tensor, Data types supported: U8 or S16.
- * @param[in] conv matrix_size x matrix_size S16 coefficients structured as a row-major 2D array in a linear buffer.
- * @param[in] scale Scale of the convolution matrix. If 0 is passed, it will be set to the sum of the coefficients of the convolution or 1 if they add up to 0.
- * @param[in] border_mode Strategy to use for borders.
- * @param[in] constant_border_value (Optional) Constant value to use for borders if border_mode is set to CONSTANT.
- */
- void configure(ICLTensor *input, ICLTensor *output, const int16_t *conv, uint32_t scale, BorderMode border_mode, uint8_t constant_border_value = 0);
- /** Initialize the function's source, destination, conv and border_mode.
- *
- * @param[in] compile_context The compile context to be used.
- * @param[in,out] input Source tensor. Data types supported: U8. (Written to only for @p border_mode != UNDEFINED)
- * @param[out] output Destination tensor, Data types supported: U8 or S16.
- * @param[in] conv matrix_size x matrix_size S16 coefficients structured as a row-major 2D array in a linear buffer.
- * @param[in] scale Scale of the convolution matrix. If 0 is passed, it will be set to the sum of the coefficients of the convolution or 1 if they add up to 0.
- * @param[in] border_mode Strategy to use for borders.
- * @param[in] constant_border_value (Optional) Constant value to use for borders if border_mode is set to CONSTANT.
- */
- void configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, const int16_t *conv, uint32_t scale, BorderMode border_mode, uint8_t constant_border_value = 0);
-
- // Inherited methods overridden:
- void run() override;
-
-private:
- MemoryGroup _memory_group; /**< Function's memory group */
- CLTensor _tmp; /**< temporary buffer for output of horizontal pass */
- bool _is_separable; /**< true if the convolution can be separated */
- CLSeparableConvolutionHorKernel<matrix_size> _kernel_hor; /**< kernel for horizontal pass of separated convolution */
- CLSeparableConvolutionVertKernel<matrix_size> _kernel_vert; /**< kernel for vertical pass of separated convolution */
- CLConvolutionKernel<matrix_size> _kernel; /**< kernel for non-separated convolution **/
- CLFillBorderKernel _border_handler; /**< kernel for border handling */
-};
-
-/** Basic function to run 5x5 convolution. */
-using CLConvolution5x5 = CLConvolutionSquare<5>;
-/** Basic function to run 7x7 convolution. */
-using CLConvolution7x7 = CLConvolutionSquare<7>;
-/** Basic function to run 9x9 convolution. */
-using CLConvolution9x9 = CLConvolutionSquare<9>;
-
-/** Basic function to execute non-square convolution. This function calls the following CL kernels:
- *
- * -# @ref CLFillBorderKernel (executed if border_mode == CONSTANT or border_mode == REPLICATE)
- * -# @ref CLConvolutionRectangleKernel or<br/>
- *
- * @note Convolution rectangle should have dimensions of 3, 5, 7, 9
- */
-class CLConvolutionRectangle : public ICLSimpleFunction
-{
-public:
- /** Initialize the function's source, destination, conv and border_mode.
- *
- * @param[in,out] input Source tensor. Data types supported: U8. (Written to only for @p border_mode != UNDEFINED)
- * @param[out] output Destination tensor, Data types supported: U8 or S16.
- * @param[in] conv Matrix_size x matrix_size S16 coefficients structured as a row-major 2D array in a linear buffer.
- * @param[in] rows Rows of convolution kernel.
- * @param[in] cols Columns of convolution kernel.
- * @param[in] scale Scale of the convolution matrix. If 0 is passed, it will be set to the sum of the coefficients of the convolution or 1 if they add up to 0.
- * @param[in] border_mode Strategy to use for borders.
- * @param[in] constant_border_value (Optional) Constant value to use for borders if border_mode is set to CONSTANT.
- */
- void configure(ICLTensor *input, ICLTensor *output, const int16_t *conv, uint32_t rows, uint32_t cols, uint32_t scale, BorderMode border_mode, uint8_t constant_border_value = 0);
- /** Initialize the function's source, destination, conv and border_mode.
- *
- * @param[in] compile_context The compile context to be used.
- * @param[in,out] input Source tensor. Data types supported: U8. (Written to only for @p border_mode != UNDEFINED)
- * @param[out] output Destination tensor, Data types supported: U8 or S16.
- * @param[in] conv Matrix_size x matrix_size S16 coefficients structured as a row-major 2D array in a linear buffer.
- * @param[in] rows Rows of convolution kernel.
- * @param[in] cols Columns of convolution kernel.
- * @param[in] scale Scale of the convolution matrix. If 0 is passed, it will be set to the sum of the coefficients of the convolution or 1 if they add up to 0.
- * @param[in] border_mode Strategy to use for borders.
- * @param[in] constant_border_value (Optional) Constant value to use for borders if border_mode is set to CONSTANT.
- */
- void configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, const int16_t *conv, uint32_t rows, uint32_t cols, uint32_t scale, BorderMode border_mode,
- uint8_t constant_border_value = 0);
-};
-}
-#endif /*ARM_COMPUTE_CLCONVOLUTION_H */
diff --git a/arm_compute/runtime/CL/functions/CLConvolutionLayer.h b/arm_compute/runtime/CL/functions/CLConvolutionLayer.h
index fff9173210..8487be71c3 100644
--- a/arm_compute/runtime/CL/functions/CLConvolutionLayer.h
+++ b/arm_compute/runtime/CL/functions/CLConvolutionLayer.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2017-2021, 2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -21,13 +21,13 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
-#ifndef ARM_COMPUTE_CLCONVOLUTIONLAYER_H
-#define ARM_COMPUTE_CLCONVOLUTIONLAYER_H
+#ifndef ACL_ARM_COMPUTE_RUNTIME_CL_FUNCTIONS_CLCONVOLUTIONLAYER_H
+#define ACL_ARM_COMPUTE_RUNTIME_CL_FUNCTIONS_CLCONVOLUTIONLAYER_H
-#include "arm_compute/runtime/CL/functions/CLDirectConvolutionLayer.h"
-#include "arm_compute/runtime/CL/functions/CLFFTConvolutionLayer.h"
-#include "arm_compute/runtime/CL/functions/CLGEMMConvolutionLayer.h"
-#include "arm_compute/runtime/CL/functions/CLWinogradConvolutionLayer.h"
+#include "arm_compute/core/CL/CLCompileContext.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/function_info/ActivationLayerInfo.h"
+#include "arm_compute/runtime/CL/CLTensor.h"
#include "arm_compute/runtime/IFunction.h"
#include "arm_compute/runtime/IMemoryManager.h"
@@ -35,11 +35,15 @@
namespace arm_compute
{
+class CLCompileContext;
+class ICLTensor;
+class ITensorInfo;
+
/** Basic function to compute the convolution layer. This function calls the following OpenCL kernels/functions:
*
- * -# @ref CLGEMMConvolutionLayer
- * -# @ref CLWinogradConvolutionLayer
- * -# @ref CLDirectConvolutionLayer
+ * -# @ref opencl::ClGemmConv2d
+ * -# @ref opencl::ClWinogradConv2d
+ * -# @ref opencl::ClDirectConv2d
* -# @ref CLFFTConvolutionLayer
*
* The function selects one of the algorithms mentioned above based on:
@@ -73,15 +77,39 @@ class CLConvolutionLayer : public IFunction
public:
/** Default constructor */
CLConvolutionLayer(std::shared_ptr<IMemoryManager> memory_manager = nullptr);
+ /** Default Destructor */
+ ~CLConvolutionLayer();
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ CLConvolutionLayer(const CLConvolutionLayer &) = delete;
+ /** Default move constructor */
+ CLConvolutionLayer(CLConvolutionLayer &&) = default;
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ CLConvolutionLayer &operator=(const CLConvolutionLayer &) = delete;
+ /** Default move assignment operator */
+ CLConvolutionLayer &operator=(CLConvolutionLayer &&) = default;
/** Set the input and output tensors.
*
+ * Valid data layouts:
+ * - NHWC
+ * - NCHW
+ *
+ * Valid data type configurations:
+ * |src0 |src1 |src2 |dst |
+ * |:--------------|:------------------|:------|:--------------|
+ * |F16 |F16 |F16 |F16 |
+ * |F32 |F32 |F32 |F32 |
+ * |QASYMM8 |QASYMM8 |S32 |QASYMM8 |
+ * |QASYMM8 |QSYMM8_PER_CHANNEL |S32 |QASYMM8 |
+ * |QASYMM8_SIGNED |QASYMM8_SIGNED |S32 |QASYMM8_SIGNED |
+ * |QASYMM8_SIGNED |QSYMM8_PER_CHANNEL |S32 |QASYMM8_SIGNED |
+ *
* @param[in] input Source tensor. 3 lower dimensions represent a single input [width, height, IFM],
* while every optional dimension from 4 and above represent a batch of inputs.
* Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32.
* @param[in] weights Weights tensor. Weights are 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM].
- * Data type supported: Same as @p input or QASYMM8/QSYMM8_PER_CHANNEL when @p input is QASYMM8.
+ * Data type supported: Same as @p input, also could be QSYMM8_PER_CHANNEL if input is QASYMM8/QASYMM8_SIGNED.
* @param[in] biases Biases tensor. Shared biases supported. Biases are 1D tensor with dimensions [OFM].
- * Data type supported: Should match @p input data type, except for input of QASYMM8 type where biases should be of S32 type.
+ * Data type supported: Same as @p input, except for input of QASYMM8/QASYMM8_SIGNED type where biases should be of S32 type.
* @param[out] output Destination tensor. 3 lower dimensions represent a single output [width, height, OFM], while the rest represent batch of outputs.
* Data types supported: Same as @p input.
* @param[in] conv_info Contains padding and stride information described in @ref PadStrideInfo.
@@ -92,8 +120,16 @@ public:
* available which may introduce a drop of accuracy as well. Default is false
* @param[in] num_groups (Optional) Number of groups when performing a grouped convolution. num_groups != 1 is only supported for NCHW data layout
*/
- void configure(ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const PadStrideInfo &conv_info, const WeightsInfo &weights_info = WeightsInfo(),
- const Size2D &dilation = Size2D(1U, 1U), const ActivationLayerInfo &act_info = ActivationLayerInfo(), bool enable_fast_math = false, unsigned int num_groups = 1);
+ void configure(ICLTensor *input,
+ const ICLTensor *weights,
+ const ICLTensor *biases,
+ ICLTensor *output,
+ const PadStrideInfo &conv_info,
+ const WeightsInfo &weights_info = WeightsInfo(),
+ const Size2D &dilation = Size2D(1U, 1U),
+ const ActivationLayerInfo &act_info = ActivationLayerInfo(),
+ bool enable_fast_math = false,
+ unsigned int num_groups = 1);
/** Set the input and output tensors.
*
* @param[in] compile_context The compile context to be used.
@@ -101,9 +137,9 @@ public:
* while every optional dimension from 4 and above represent a batch of inputs.
* Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32.
* @param[in] weights Weights tensor. Weights are 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM].
- * Data type supported: Same as @p input or QASYMM8/QSYMM8_PER_CHANNEL when @p input is QASYMM8.
+ * Data type supported: Same as @p input, also could be QSYMM8_PER_CHANNEL if input is QASYMM8/QASYMM8_SIGNED.
* @param[in] biases Biases tensor. Shared biases supported. Biases are 1D tensor with dimensions [OFM].
- * Data type supported: Should match @p input data type, except for input of QASYMM8 type where biases should be of S32 type.
+ * Data type supported: Same as @p input, except for input of QASYMM8/QASYMM8_SIGNED type where biases should be of S32 type.
* @param[out] output Destination tensor. 3 lower dimensions represent a single output [width, height, OFM], while the rest represent batch of outputs.
* Data types supported: Same as @p input.
* @param[in] conv_info Contains padding and stride information described in @ref PadStrideInfo.
@@ -114,17 +150,26 @@ public:
* available which may introduce a drop of accuracy as well. Default is false
* @param[in] num_groups (Optional) Number of groups when performing a grouped convolution. num_groups != 1 is only supported for NCHW data layout
*/
- void configure(const CLCompileContext &compile_context, ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const PadStrideInfo &conv_info,
- const WeightsInfo &weights_info = WeightsInfo(), const Size2D &dilation = Size2D(1U, 1U), const ActivationLayerInfo &act_info = ActivationLayerInfo(), bool enable_fast_math = false,
- unsigned int num_groups = 1);
+ void configure(const CLCompileContext &compile_context,
+ ICLTensor *input,
+ const ICLTensor *weights,
+ const ICLTensor *biases,
+ ICLTensor *output,
+ const PadStrideInfo &conv_info,
+ const WeightsInfo &weights_info = WeightsInfo(),
+ const Size2D &dilation = Size2D(1U, 1U),
+ const ActivationLayerInfo &act_info = ActivationLayerInfo(),
+ bool enable_fast_math = false,
+ unsigned int num_groups = 1);
/** Static function to check if given info will lead to a valid configuration of @ref CLConvolutionLayer
*
* @param[in] input Source tensor. 3 lower dimensions represent a single input [width, height, IFM],
* while every optional dimension from 4 and above represent a batch of inputs.
* Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32.
* @param[in] weights Weights tensor. Weights are 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM].
- * Data type supported: Same as @p input or QASYMM8/QSYMM8_PER_CHANNEL when @p input is QASYMM8.
- * @param[in] biases Biases tensor. Shared biases supported. Biases are 1D tensor with dimensions [OFM]. Data type supported:Same as @p input.
+ * Data type supported: Same as @p input, also could be QSYMM8_PER_CHANNEL if input is QASYMM8/QASYMM8_SIGNED.
+ * @param[in] biases Biases tensor. Shared biases supported. Biases are 1D tensor with dimensions [OFM].
+ * Data type supported: Same as @p input, except for input of QASYMM8/QASYMM8_SIGNED type where biases should be of S32 type.
* @param[in] output Destination tensor. 3 lower dimensions represent a single output [width, height, OFM], while the rest represent batch of outputs.
* Data types supported: Same as @p input.
* @param[in] conv_info Contains padding and stride information described in @ref PadStrideInfo.
@@ -137,16 +182,23 @@ public:
*
* @return a status
*/
- static Status validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info,
- const WeightsInfo &weights_info = WeightsInfo(), const Size2D &dilation = Size2D(1U, 1U), const ActivationLayerInfo &act_info = ActivationLayerInfo(), bool enable_fast_math = false,
- unsigned int num_groups = 1);
+ static Status validate(const ITensorInfo *input,
+ const ITensorInfo *weights,
+ const ITensorInfo *biases,
+ const ITensorInfo *output,
+ const PadStrideInfo &conv_info,
+ const WeightsInfo &weights_info = WeightsInfo(),
+ const Size2D &dilation = Size2D(1U, 1U),
+ const ActivationLayerInfo &act_info = ActivationLayerInfo(),
+ bool enable_fast_math = false,
+ unsigned int num_groups = 1);
/** Static function to check if given info will return the convolution called by @ref CLConvolutionLayer
*
* @param[in] input Source tensor. 3 lower dimensions represent a single input [width, height, IFM],
* while every optional dimension from 4 and above represent a batch of inputs.
* Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32.
* @param[in] weights Weights tensor. Weights are 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM].
- * Data type supported: Same as @p input or QASYMM8/QSYMM8_PER_CHANNEL when @p input is QASYMM8.
+ * Data type supported: Same as @p input, also could be QSYMM8_PER_CHANNEL if input is QASYMM8/QASYMM8_SIGNED.
* @param[in] output Destination tensor. 3 lower dimensions represent a single output [width, height, OFM], while the rest represent batch of outputs.
* Data types supported: Same as @p input.
* @param[in] conv_info Contains padding and stride information described in @ref PadStrideInfo.
@@ -157,17 +209,24 @@ public:
* @param[in] enable_fast_math (Optional) Enable fast math computation. In case this flag were set, the function could dispatch the fastest implementation
* available which may introduce a drop of accuracy as well. Default is false
*
- * @return a status
+ * @return the Convolution Method Hint
*/
- static ConvolutionMethod get_convolution_method(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *output, const PadStrideInfo &conv_info,
- const WeightsInfo &weights_info, const ActivationLayerInfo &act_info, const GPUTarget gpu_target, const Size2D &dilation = Size2D(1U, 1U), bool enable_fast_math = false);
+ static ConvolutionMethod get_convolution_method(const ITensorInfo *input,
+ const ITensorInfo *weights,
+ const ITensorInfo *output,
+ const PadStrideInfo &conv_info,
+ const WeightsInfo &weights_info,
+ const ActivationLayerInfo &act_info,
+ const GPUTarget gpu_target,
+ const Size2D &dilation = Size2D(1U, 1U),
+ bool enable_fast_math = false);
// Inherited methods overridden:
void run() override;
void prepare() override;
private:
- std::shared_ptr<IMemoryManager> _memory_manager;
- std::unique_ptr<IFunction> _function;
+ struct Impl;
+ std::unique_ptr<Impl> _impl;
};
-}
-#endif /* ARM_COMPUTE_CLCONVOLUTIONLAYER_H */
+} // namespace arm_compute
+#endif // ACL_ARM_COMPUTE_RUNTIME_CL_FUNCTIONS_CLCONVOLUTIONLAYER_H
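
Usage sketch (not part of this patch): the refactor moves the operator selection behind a PImpl (struct Impl), so the header no longer pulls in the concrete convolution functions; the public flow is unchanged. Shapes, stride and padding below are illustrative.

#include "arm_compute/runtime/CL/CLScheduler.h"
#include "arm_compute/runtime/CL/CLTensor.h"
#include "arm_compute/runtime/CL/functions/CLConvolutionLayer.h"

using namespace arm_compute;

void conv2d_sketch()
{
    CLScheduler::get().default_init();

    // NCHW shapes: src [W, H, IFM, N], weights [kernel_x, kernel_y, IFM, OFM].
    CLTensor src, weights, biases, dst;
    src.allocator()->init(TensorInfo(TensorShape(32U, 32U, 3U, 1U), 1, DataType::F32));
    weights.allocator()->init(TensorInfo(TensorShape(3U, 3U, 3U, 16U), 1, DataType::F32));
    biases.allocator()->init(TensorInfo(TensorShape(16U), 1, DataType::F32));
    dst.allocator()->init(TensorInfo(TensorShape(32U, 32U, 16U, 1U), 1, DataType::F32));

    const PadStrideInfo conv_info(1, 1, 1, 1); // stride 1, pad 1: 'same' output for a 3x3 kernel

    // Ask which algorithm (GEMM/Winograd/Direct/FFT) would be dispatched for this setup.
    const ConvolutionMethod method = CLConvolutionLayer::get_convolution_method(
        src.info(), weights.info(), dst.info(), conv_info, WeightsInfo(), ActivationLayerInfo(),
        CLScheduler::get().target());
    (void)method;

    CLConvolutionLayer conv;
    conv.configure(&src, &weights, &biases, &dst, conv_info);

    src.allocator()->allocate();
    weights.allocator()->allocate();
    biases.allocator()->allocate();
    dst.allocator()->allocate();

    conv.run(); // the first run() also triggers prepare()
    CLScheduler::get().sync();
}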
diff --git a/arm_compute/runtime/CL/functions/CLCopy.h b/arm_compute/runtime/CL/functions/CLCopy.h
index 31b73c33c3..fd40b7b9de 100644
--- a/arm_compute/runtime/CL/functions/CLCopy.h
+++ b/arm_compute/runtime/CL/functions/CLCopy.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018-2020 ARM Limited.
+ * Copyright (c) 2018-2021 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -25,40 +25,76 @@
#define ARM_COMPUTE_CLCOPY_H
#include "arm_compute/core/Types.h"
-#include "arm_compute/runtime/CL/ICLSimpleFunction.h"
+#include "arm_compute/core/Window.h"
+#include "arm_compute/runtime/IFunction.h"
-#include <cstdint>
+#include <memory>
namespace arm_compute
{
+class CLCompileContext;
class ICLTensor;
+class ITensorInfo;
-class CLCopy : public ICLSimpleFunction
+/** Basic function to run @ref opencl::kernels::ClCopyKernel */
+class CLCopy : public IFunction
{
public:
+ /** Constructor */
+ CLCopy();
+ /** Destructor */
+ ~CLCopy();
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ CLCopy(const CLCopy &) = delete;
+ /** Default move constructor */
+ CLCopy(CLCopy &&);
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ CLCopy &operator=(const CLCopy &) = delete;
+ /** Default move assignment operator */
+ CLCopy &operator=(CLCopy &&);
/** Initialise the function's source and destination.
*
- * @param[in] input Source tensor. Data types supported: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32.
- * @param[out] output Output tensor. Data types supported: Same as @p input.
+ * Valid data layouts:
+ * - All
*
+ * Valid data type configurations:
+ * |src |dst |
+ * |:--------------|:--------------|
+ * |All |All |
+ *
+ * @param[in] input Source tensor. Data types supported: All.
+ * @param[out] output Output tensor. Data types supported: Same as @p input.
+ * @param[in] dst_window (Optional) Window to be used in case only copying into part of a tensor. Default is nullptr.
*/
- void configure(ICLTensor *input, ICLTensor *output);
+ void configure(ICLTensor *input, ICLTensor *output, Window *dst_window = nullptr);
/** Initialise the function's source and destination.
*
* @param[in] compile_context The compile context to be used.
- * @param[in] input Source tensor. Data types supported: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32.
+ * @param[in] input Source tensor. Data types supported: All.
* @param[out] output Output tensor. Data types supported: Same as @p input.
+ * @param[in] dst_window (Optional) Window to be used in case only copying into part of a tensor. Default is nullptr.
*
*/
- void configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output);
+ void configure(const CLCompileContext &compile_context,
+ ICLTensor *input,
+ ICLTensor *output,
+ Window *dst_window = nullptr);
/** Static function to check if given info will lead to a valid configuration of @ref CLCopy
*
- * @param[in] input Source tensor. Data types supported: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32.
- * @param[in] output Output tensor. Data types supported: Same as @p input.
+ * @param[in] input Source tensor. Data types supported: All.
+ * @param[in] output Output tensor. Data types supported: Same as @p input.
+ * @param[in] dst_window (Optional) Window to be used in case only copying into part of a tensor. Default is nullptr.
*
* @return a status
*/
- static Status validate(const ITensorInfo *input, const ITensorInfo *output);
+ static Status validate(const ITensorInfo *input, const ITensorInfo *output, Window *dst_window = nullptr);
+
+ // Inherited methods overridden:
+ void run() override;
+
+private:
+ struct Impl;
+ std::unique_ptr<Impl> _impl;
};
} // namespace arm_compute
#endif /*ARM_COMPUTE_CLCOPY_H */
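
Usage sketch (not part of this patch): the ported CLCopy keeps the one-shot configure/run flow; a plain full-tensor copy with an assumed 2D F32 tensor, leaving the optional dst_window at its nullptr default.

#include "arm_compute/runtime/CL/CLScheduler.h"
#include "arm_compute/runtime/CL/CLTensor.h"
#include "arm_compute/runtime/CL/functions/CLCopy.h"

using namespace arm_compute;

void copy_sketch()
{
    CLScheduler::get().default_init();

    CLTensor src, dst;
    src.allocator()->init(TensorInfo(TensorShape(64U, 64U), 1, DataType::F32));
    dst.allocator()->init(TensorInfo(TensorShape(64U, 64U), 1, DataType::F32));

    CLCopy copy;
    copy.configure(&src, &dst); // dst_window omitted: copy the whole tensor

    src.allocator()->allocate();
    dst.allocator()->allocate();
    copy.run();
    CLScheduler::get().sync();
}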
diff --git a/arm_compute/core/CL/kernels/CLCropKernel.h b/arm_compute/runtime/CL/functions/CLCrop.h
index cbc2338940..2942e9362a 100644
--- a/arm_compute/core/CL/kernels/CLCropKernel.h
+++ b/arm_compute/runtime/CL/functions/CLCrop.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019-2020 ARM Limited.
+ * Copyright (c) 2021 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -21,34 +21,49 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
-#ifndef ARM_COMPUTE_CLCROPKERNEL_H
-#define ARM_COMPUTE_CLCROPKERNEL_H
+#ifndef ARM_COMPUTE_CLCROP_H
+#define ARM_COMPUTE_CLCROP_H
-#include "arm_compute/core/CL/ICLKernel.h"
#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Window.h"
+#include "arm_compute/runtime/IFunction.h"
+
+#include <memory>
namespace arm_compute
{
+class CLCompileContext;
class ICLTensor;
+class ITensorInfo;
-/** OpenCL kernel to perform a copy between two tensors */
-class CLCropKernel : public ICLKernel
+/** Basic function to run @ref opencl::kernels::ClCropKernel */
+class CLCrop : public IFunction
{
public:
- /** Default constructor */
- CLCropKernel();
- /** Prevent instances of this class from being copied (As this class contains pointers). */
- CLCropKernel(const CLCropKernel &) = delete;
- /** Prevent instances of this class from being copied (As this class contains pointers). */
- CLCropKernel &operator=(const CLCropKernel &) = delete;
- /** Allow instances of this class to be moved */
- CLCropKernel(CLCropKernel &&) = default;
- /** Allow instances of this class to be moved */
- CLCropKernel &operator=(CLCropKernel &&) = default;
- /** Configure kernel
+ /** Constructor */
+ CLCrop();
+ /** Destructor */
+ ~CLCrop();
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ CLCrop(const CLCrop &) = delete;
+ /** Default move constructor */
+ CLCrop(CLCrop &&);
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ CLCrop &operator=(const CLCrop &) = delete;
+ /** Default move assignment operator */
+ CLCrop &operator=(CLCrop &&);
+ /** Configure function
*
* @note Supported tensor rank: up to 4
*
+ * Valid data layouts:
+ * - NHWC
+ *
+ * Valid data type configurations:
+ * |src |dst |
+ * |:--------------|:--------------|
+ * |All |F32 |
+ *
* @param[in] input Source tensor. Data type supported: All. Data layouts supported: NHWC.
* @param[out] output Destination tensor. Data type supported: F32
* @param[in] start Coordinates of where to start cropping the image.
@@ -57,8 +72,14 @@ public:
* @param[in] extrapolation_value Value to be used for values outside of the image. Default is 0.
* @param[in] output_window Output window to be used in case cropped image is being copied into a tensor. Default is nullptr.
*/
- void configure(const ICLTensor *input, ICLTensor *output, Coordinates2D start, Coordinates2D end, uint32_t batch_index, float extrapolation_value = 0, Window *output_window = nullptr);
- /** Configure kernel
+ void configure(const ICLTensor *input,
+ ICLTensor *output,
+ Coordinates2D start,
+ Coordinates2D end,
+ uint32_t batch_index,
+ float extrapolation_value = 0,
+ Window *output_window = nullptr);
+ /** Configure function
*
* @note Supported tensor rank: up to 4
*
@@ -71,8 +92,14 @@ public:
* @param[in] extrapolation_value Value to be used for values outside of the image. Default is 0.
* @param[in] output_window Output window to be used in case cropped image is being copied into a tensor. Default is nullptr.
*/
- void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, Coordinates2D start, Coordinates2D end, uint32_t batch_index, float extrapolation_value = 0,
- Window *output_window = nullptr);
+ void configure(const CLCompileContext &compile_context,
+ const ICLTensor *input,
+ ICLTensor *output,
+ Coordinates2D start,
+ Coordinates2D end,
+ uint32_t batch_index,
+ float extrapolation_value = 0,
+ Window *output_window = nullptr);
/** Static function to check if given info will lead to a valid configuration of @ref CLCrop
*
@@ -86,18 +113,20 @@ public:
* @param[in] extrapolation_value Value to be used for values outside of the image. Default is 0.
* @param[in] output_window Output window to be used in case cropped image is being copied into a tensor. Default is nullptr.
*/
- static Status validate(const ITensorInfo *input, const ITensorInfo *output, Coordinates2D start, Coordinates2D end, uint32_t batch_index, float extrapolation_value = 0,
- Window *output_window = nullptr);
+ static Status validate(const ITensorInfo *input,
+ const ITensorInfo *output,
+ Coordinates2D start,
+ Coordinates2D end,
+ uint32_t batch_index,
+ float extrapolation_value = 0,
+ Window *output_window = nullptr);
// Inherited methods overridden:
- void run(const Window &window, cl::CommandQueue &queue) override;
+ void run() override;
private:
- const ICLTensor *_input;
- ICLTensor *_output;
- Coordinates2D _start;
- uint32_t _batch_index;
- float _extrapolation_value;
+ struct Impl;
+ std::unique_ptr<Impl> _impl;
};
} // namespace arm_compute
-#endif /*ARM_COMPUTE_CLCROPKERNEL_H */
+#endif /*ARM_COMPUTE_CLCROP_H */
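
Usage sketch (not part of this patch): the kernel promoted to a CLCrop function, cropping a small region of batch 0 into an F32 tensor. The NHWC shapes are illustrative, and the start/end coordinates are assumed inclusive (as documented for the crop boxes of CLCropResize).

#include "arm_compute/runtime/CL/CLScheduler.h"
#include "arm_compute/runtime/CL/CLTensor.h"
#include "arm_compute/runtime/CL/functions/CLCrop.h"

using namespace arm_compute;

void crop_sketch()
{
    CLScheduler::get().default_init();

    // NHWC: src [C, W, H, N]; dst is always F32 per the table above.
    TensorInfo src_info(TensorShape(3U, 8U, 8U, 1U), 1, DataType::U8);
    src_info.set_data_layout(DataLayout::NHWC);
    TensorInfo dst_info(TensorShape(3U, 2U, 2U), 1, DataType::F32);
    dst_info.set_data_layout(DataLayout::NHWC);

    CLTensor src, dst;
    src.allocator()->init(src_info);
    dst.allocator()->init(dst_info);

    CLCrop crop;
    // Crop the region {2,2}..{3,3} (assumed inclusive, i.e. 2x2) of batch index 0.
    crop.configure(&src, &dst, Coordinates2D{2, 2}, Coordinates2D{3, 3}, 0U);

    src.allocator()->allocate();
    dst.allocator()->allocate();
    crop.run();
    CLScheduler::get().sync();
}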
diff --git a/arm_compute/runtime/CL/functions/CLCropResize.h b/arm_compute/runtime/CL/functions/CLCropResize.h
index ea50fe51c7..6fb055e893 100644
--- a/arm_compute/runtime/CL/functions/CLCropResize.h
+++ b/arm_compute/runtime/CL/functions/CLCropResize.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019-2020 ARM Limited.
+ * Copyright (c) 2019-2021 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -25,10 +25,10 @@
#define ARM_COMPUTE_CL_CROP_RESIZE_H
#include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/CL/kernels/CLCopyKernel.h"
-#include "arm_compute/core/CL/kernels/CLCropKernel.h"
-#include "arm_compute/core/CL/kernels/CLMemsetKernel.h"
#include "arm_compute/runtime/CL/CLTensor.h"
+#include "arm_compute/runtime/CL/functions/CLCopy.h"
+#include "arm_compute/runtime/CL/functions/CLCrop.h"
+#include "arm_compute/runtime/CL/functions/CLFill.h"
#include "arm_compute/runtime/CL/functions/CLScale.h"
#include <cstdint>
@@ -37,7 +37,9 @@
namespace arm_compute
{
// Forward Declarations
+class CLCompileContext;
class ITensor;
+class ITensorInfo;
/** Function to perform cropping and resizing */
class CLCropResize : public IFunction
@@ -54,10 +56,18 @@ public:
/** Allow instances of this class to be moved */
CLCropResize &operator=(CLCropResize &&) = default;
/** Default destructor */
- virtual ~CLCropResize() = default;
+ ~CLCropResize();
/** Configure kernel
*
+ * Valid data layouts:
+ * - NHWC
+ *
+ * Valid data type configurations:
+ * |src0 |src1 |src2 |dst |
+ * |:--------|:--------|:------|:--------|
+ * |All |F32 |F32 |F32 |
+ *
* @note Supported tensor rank: up to 4
* @note Box indices may be outside of the bounds, in which case @p extrapolation_value is used.
* @note Start and end indices of boxes are inclusive.
@@ -71,8 +81,13 @@ public:
* @param[in] method The policy to be used when resizing image. Default is bilinear.
* @param[in] extrapolation_value Value to be used for values outside of the image for cropping and resizing. Default is 0.
*/
- void configure(const ICLTensor *input, ICLTensor *boxes, ICLTensor *box_ind, ICLTensor *output, Coordinates2D crop_size,
- InterpolationPolicy method = InterpolationPolicy::BILINEAR, float extrapolation_value = 0);
+ void configure(const ICLTensor *input,
+ ICLTensor *boxes,
+ ICLTensor *box_ind,
+ ICLTensor *output,
+ Coordinates2D crop_size,
+ InterpolationPolicy method = InterpolationPolicy::BILINEAR,
+ float extrapolation_value = 0);
/** Configure kernel
*
* @note Supported tensor rank: up to 4
@@ -89,8 +104,14 @@ public:
* @param[in] method The policy to be used when resizing image. Default is bilinear.
* @param[in] extrapolation_value Value to be used for values outside of the image for cropping and resizing. Default is 0.
*/
- void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *boxes, ICLTensor *box_ind, ICLTensor *output, Coordinates2D crop_size,
- InterpolationPolicy method = InterpolationPolicy::BILINEAR, float extrapolation_value = 0);
+ void configure(const CLCompileContext &compile_context,
+ const ICLTensor *input,
+ ICLTensor *boxes,
+ ICLTensor *box_ind,
+ ICLTensor *output,
+ Coordinates2D crop_size,
+ InterpolationPolicy method = InterpolationPolicy::BILINEAR,
+ float extrapolation_value = 0);
/** Static function to check if given info will lead to a valid configuration of @ref CLCropResize
*
@@ -110,8 +131,13 @@ public:
*
* @return A status
*/
- static Status validate(const ITensorInfo *input, ITensorInfo *boxes, ITensorInfo *box_ind, const ITensorInfo *output,
- Coordinates2D crop_size, InterpolationPolicy method, float extrapolation_value);
+ static Status validate(const ITensorInfo *input,
+ ITensorInfo *boxes,
+ ITensorInfo *box_ind,
+ const ITensorInfo *output,
+ Coordinates2D crop_size,
+ InterpolationPolicy method,
+ float extrapolation_value);
void run() override;
@@ -123,12 +149,12 @@ public:
InterpolationPolicy _method;
float _extrapolation_value;
- std::vector<std::unique_ptr<CLScale>> _scale;
- std::vector<std::unique_ptr<CLCopyKernel>> _copy;
- std::vector<std::unique_ptr<CLTensor>> _crop_results;
- std::vector<std::unique_ptr<CLTensor>> _scaled_results;
+ std::vector<std::unique_ptr<CLScale>> _scale;
+ std::vector<std::unique_ptr<CLCopy>> _copy;
+ std::vector<std::unique_ptr<CLTensor>> _crop_results;
+ std::vector<std::unique_ptr<CLTensor>> _scaled_results;
- std::vector<std::unique_ptr<ICLKernel>> _internal_kernels;
+ std::vector<std::unique_ptr<IFunction>> _internal_functions;
};
} // namespace arm_compute
#endif /* ARM_COMPUTE_CL_CROP_RESIZE_H */
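
Usage sketch (not part of this patch): cropping five boxes out of a two-image NHWC batch and resizing each to 7x7. The shapes follow the parameter docs above; the per-box encoding (four normalized coordinates per column of the boxes tensor) is an assumption for illustration.

#include "arm_compute/runtime/CL/CLScheduler.h"
#include "arm_compute/runtime/CL/CLTensor.h"
#include "arm_compute/runtime/CL/functions/CLCropResize.h"

using namespace arm_compute;

void crop_resize_sketch()
{
    CLScheduler::get().default_init();

    const unsigned int num_boxes = 5;

    // NHWC input [C, W, H, N]; boxes [4, num_boxes] (F32); box_ind [num_boxes] (F32).
    TensorInfo in_info(TensorShape(3U, 16U, 16U, 2U), 1, DataType::U8);
    in_info.set_data_layout(DataLayout::NHWC);
    TensorInfo out_info(TensorShape(3U, 7U, 7U, num_boxes), 1, DataType::F32);
    out_info.set_data_layout(DataLayout::NHWC);

    CLTensor input, boxes, box_ind, output;
    input.allocator()->init(in_info);
    boxes.allocator()->init(TensorInfo(TensorShape(4U, num_boxes), 1, DataType::F32));
    box_ind.allocator()->init(TensorInfo(TensorShape(num_boxes), 1, DataType::F32));
    output.allocator()->init(out_info);

    CLCropResize crop_resize;
    crop_resize.configure(&input, &boxes, &box_ind, &output, Coordinates2D{7, 7},
                          InterpolationPolicy::BILINEAR, 0.f /* extrapolation_value */);

    input.allocator()->allocate();
    boxes.allocator()->allocate();
    box_ind.allocator()->allocate();
    output.allocator()->allocate();
    // ...fill boxes/box_ind via map()/unmap() before running...
    crop_resize.run();
    CLScheduler::get().sync();
}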
diff --git a/arm_compute/runtime/CL/functions/CLDeconvolutionLayer.h b/arm_compute/runtime/CL/functions/CLDeconvolutionLayer.h
index c75b586132..92f87ee461 100644
--- a/arm_compute/runtime/CL/functions/CLDeconvolutionLayer.h
+++ b/arm_compute/runtime/CL/functions/CLDeconvolutionLayer.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2017-2022 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -44,46 +44,80 @@ public:
/** Default constructor */
CLDeconvolutionLayer(std::shared_ptr<IMemoryManager> memory_manager = nullptr);
+ ~CLDeconvolutionLayer();
+
/** Set the input, weights, biases and output tensors.
*
+ * Valid data layouts:
+ * - NHWC
+ * - NCHW
+ *
+ * Valid data type configurations:
+ * |src0 |src1 |src2 |dst |
+ * |:--------------|:------------------|:------|:--------------|
+ * |F16 |F16 |F16 |F16 |
+ * |F32 |F32 |F32 |F32 |
+ * |QASYMM8 |QASYMM8 |S32 |QASYMM8 |
+ * |QASYMM8 |QSYMM8_PER_CHANNEL |S32 |QASYMM8 |
+ * |QASYMM8_SIGNED |QASYMM8_SIGNED |S32 |QASYMM8_SIGNED |
+ * |QASYMM8_SIGNED |QSYMM8_PER_CHANNEL |S32 |QASYMM8_SIGNED |
+ *
* @param[in,out] input Input tensor. 3 lower dimensions represent a single input, and an optional 4th dimension for batch of inputs. Data types supported: QASYMM8_SIGNED/QASYMM8/F16/F32.
- * @param[in] weights The 4d weights with dimensions [width, height, IFM, OFM]. Data type supported: Same as @p input.
- * @param[in] bias (Optional) The biases have one dimension. Data type supported: Same as @p input.
+ * @param[in] weights The 4d weights with dimensions [width, height, IFM, OFM]. Data type supported: Same as @p input or QSYMM8_PER_CHANNEL if @p input is QASYMM8/QASYMM8_SIGNED.
+ * @param[in] bias (Optional) The biases have one dimension. Data type supported: Should match @p input data type, except for input of QASYMM8 and QASYMM8_SIGNED type where biases should be of S32 type
* @param[out] output Output tensor. The output has the same number of dimensions as the @p input.
* @param[in] deconv_info Contains padding and policies to be used in the deconvolution, this is described in @ref PadStrideInfo.
- * @param[in] weights_info (Optional) Weights information needed for @ref CLConvolutionLayer, specifies if the weights tensor has been reshaped with @ref CLWeightsReshapeKernel.
+ * @param[in] weights_info (Optional) Weights information needed for @ref CLConvolutionLayer, specifies if the weights tensor has been reshaped with @ref opencl::kernels::ClWeightsReshapeKernel.
*
*/
- void configure(ICLTensor *input, ICLTensor *weights, const ICLTensor *bias, ICLTensor *output, const PadStrideInfo &deconv_info, const WeightsInfo &weights_info = WeightsInfo());
+ void configure(ICLTensor *input,
+ ICLTensor *weights,
+ const ICLTensor *bias,
+ ICLTensor *output,
+ const PadStrideInfo &deconv_info,
+ const WeightsInfo &weights_info = WeightsInfo());
/** Set the input, weights, biases and output tensors.
*
* @param[in] compile_context The compile context to be used.
* @param[in,out] input Input tensor. 3 lower dimensions represent a single input, and an optional 4th dimension for batch of inputs. Data types supported: QASYMM8_SIGNED/QASYMM8/F16/F32.
- * @param[in] weights The 4d weights with dimensions [width, height, IFM, OFM]. Data type supported: Same as @p input.
- * @param[in] bias (Optional) The biases have one dimension. Data type supported: Same as @p input.
+ * @param[in] weights The 4d weights with dimensions [width, height, IFM, OFM]. Data type supported: Same as @p input or QSYMM8_PER_CHANNEL if @p input is QASYMM8/QASYMM8_SIGNED.
+ * @param[in] bias (Optional) The biases have one dimension. Data type supported: Should match @p input data type, except for input of QASYMM8 and QASYMM8_SIGNED type where biases should be of S32 type
* @param[out] output Output tensor. The output has the same number of dimensions as the @p input.
* @param[in] deconv_info Contains padding and policies to be used in the deconvolution, this is described in @ref PadStrideInfo.
- * @param[in] weights_info (Optional) Weights information needed for @ref CLConvolutionLayer, specifies if the weights tensor has been reshaped with @ref CLWeightsReshapeKernel.
+ * @param[in] weights_info (Optional) Weights information needed for @ref CLConvolutionLayer, specifies if the weights tensor has been reshaped with @ref opencl::kernels::ClWeightsReshapeKernel.
*
*/
- void configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *weights, const ICLTensor *bias, ICLTensor *output, const PadStrideInfo &deconv_info,
- const WeightsInfo &weights_info = WeightsInfo());
+ void configure(const CLCompileContext &compile_context,
+ ICLTensor *input,
+ ICLTensor *weights,
+ const ICLTensor *bias,
+ ICLTensor *output,
+ const PadStrideInfo &deconv_info,
+ const WeightsInfo &weights_info = WeightsInfo());
/** Static function to check if given info will lead to a valid configuration of @ref CLDeconvolutionLayer
*
* @param[in] input Input tensor info. 3 lower dimensions represent a single input, and an optional 4th dimension for batch of inputs. Data types supported: QASYMM8_SIGNED/QASYMM8/F16/F32.
- * @param[in] weights The 4d weights info with dimensions [width, height, IFM, OFM]. Data type supported: Same as @p input.
- * @param[in] bias (Optional) The biases have one dimension. Data type supported: Same as @p input.
+ * @param[in] weights The 4d weights info with dimensions [width, height, IFM, OFM]. Data type supported: Same as @p input or QSYMM8_PER_CHANNEL if @p input is QASYMM8/QASYMM8_SIGNED.
+ * @param[in] bias (Optional) The biases have one dimension. Data type supported: Should match @p input data type, except for input of QASYMM8 and QASYMM8_SIGNED type where biases should be of S32 type
* @param[in] output Output tensor info. The output has the same number of dimensions as the @p input.
* @param[in] deconv_info Contains padding and policies to be used in the deconvolution, this is described in @ref PadStrideInfo.
- * @param[in] weights_info (Optional) Weights information needed for @ref CLConvolutionLayer, specifies if the weights tensor has been reshaped with @ref CLWeightsReshapeKernel.
+ * @param[in] weights_info (Optional) Weights information needed for @ref CLConvolutionLayer, specifies if the weights tensor has been reshaped with @ref opencl::kernels::ClWeightsReshapeKernel.
*
* @return a status
*/
- static Status validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *bias, ITensorInfo *output, const PadStrideInfo &deconv_info,
- const WeightsInfo &weights_info = WeightsInfo());
+ static Status validate(const ITensorInfo *input,
+ const ITensorInfo *weights,
+ const ITensorInfo *bias,
+ ITensorInfo *output,
+ const PadStrideInfo &deconv_info,
+ const WeightsInfo &weights_info = WeightsInfo());
- static DeconvolutionMethod get_deconvolution_method(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *bias, ITensorInfo *output, const PadStrideInfo &deconv_info,
- const WeightsInfo &weights_info);
+ static DeconvolutionMethod get_deconvolution_method(const ITensorInfo *input,
+ const ITensorInfo *weights,
+ const ITensorInfo *bias,
+ ITensorInfo *output,
+ const PadStrideInfo &deconv_info,
+ const WeightsInfo &weights_info);
// Inherited methods overridden:
void run() override;
void prepare() override;
@@ -91,6 +125,9 @@ public:
private:
std::shared_ptr<IMemoryManager> _memory_manager;
std::unique_ptr<IFunction> _function;
+
+ struct Impl;
+ std::unique_ptr<Impl> _impl;
};
} // namespace arm_compute
#endif /* ARM_COMPUTE_CLDECONVOLUTIONLAYER_H */
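
Usage sketch (not part of this patch): the public flow is unchanged by this diff beyond formatting and the per-channel quantized weights support; a minimal F32 deconvolution with illustrative shapes, where the output side for zero padding is (in - 1) * stride + kernel = (4 - 1) * 2 + 3 = 9.

#include "arm_compute/runtime/CL/CLScheduler.h"
#include "arm_compute/runtime/CL/CLTensor.h"
#include "arm_compute/runtime/CL/functions/CLDeconvolutionLayer.h"

using namespace arm_compute;

void deconv_sketch()
{
    CLScheduler::get().default_init();

    // NCHW: src [W, H, IFM, N], weights [W, H, IFM, OFM].
    CLTensor src, weights, bias, dst;
    src.allocator()->init(TensorInfo(TensorShape(4U, 4U, 8U, 1U), 1, DataType::F32));
    weights.allocator()->init(TensorInfo(TensorShape(3U, 3U, 8U, 4U), 1, DataType::F32));
    bias.allocator()->init(TensorInfo(TensorShape(4U), 1, DataType::F32));
    dst.allocator()->init(TensorInfo(TensorShape(9U, 9U, 4U, 1U), 1, DataType::F32));

    const PadStrideInfo deconv_info(2, 2, 0, 0); // stride 2, no padding

    CLDeconvolutionLayer deconv;
    deconv.configure(&src, &weights, &bias, &dst, deconv_info);

    src.allocator()->allocate();
    weights.allocator()->allocate();
    bias.allocator()->allocate();
    dst.allocator()->allocate();
    deconv.run();
    CLScheduler::get().sync();
}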
diff --git a/arm_compute/runtime/CL/functions/CLDeconvolutionLayerUpsample.h b/arm_compute/runtime/CL/functions/CLDeconvolutionLayerUpsample.h
index 2d3dde1ea0..5a2abafe79 100644
--- a/arm_compute/runtime/CL/functions/CLDeconvolutionLayerUpsample.h
+++ b/arm_compute/runtime/CL/functions/CLDeconvolutionLayerUpsample.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2017-2021 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -24,21 +24,23 @@
#ifndef ARM_COMPUTE_CLDECONVOLUTIONLAYERUPSAMPLE_H
#define ARM_COMPUTE_CLDECONVOLUTIONLAYERUPSAMPLE_H
-#include "arm_compute/runtime/IFunction.h"
-
-#include "arm_compute/core/CL/kernels/CLDeconvolutionLayerUpsampleKernel.h"
-#include "arm_compute/core/CL/kernels/CLMemsetKernel.h"
#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/CL/functions/CLFill.h"
#include "arm_compute/runtime/IFunction.h"
+#include <memory>
+
namespace arm_compute
{
// Forward declarations
+class CLDeconvolutionLayerUpsampleKernel;
+class CLCompileContext;
class ICLTensor;
+class ITensorInfo;
/** Basic function to execute deconvolution upsample on OpenCL. This function calls the following OpenCL kernels and functions:
*
- * -# @ref CLMemsetKernel
+ * -# @ref CLFill
* -# @ref CLDeconvolutionLayerUpsampleKernel
*/
class CLDeconvolutionLayerUpsample : public IFunction
@@ -55,10 +57,19 @@ public:
/** Allow instances of this class to be moved */
CLDeconvolutionLayerUpsample &operator=(CLDeconvolutionLayerUpsample &&) = default;
/** Default destructor */
- virtual ~CLDeconvolutionLayerUpsample() = default;
+ ~CLDeconvolutionLayerUpsample();
/** Initialize the function's source, destination, interpolation type and border_mode.
*
+ * Valid data layouts:
+ * - NHWC
+ * - NCHW
+ *
+ * Valid data type configurations:
+ * |src |dst |
+ * |:--------------|:--------------|
+ * |All |All |
+ *
* @param[in, out] input Source tensor. Data type supported: All.
* @param[out] output Destination tensor. Data type supported: same as @p input.
* @param[in] info Contains padding and policies to be used in the deconvolution.
@@ -71,7 +82,8 @@ public:
* @param[out] output Destination tensor. Data type supported: same as @p input.
* @param[in] info Contains padding and policies to be used in the deconvolution.
*/
- void configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, const PadStrideInfo &info);
+ void
+ configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, const PadStrideInfo &info);
/** Static function to check if given info will lead to a valid configuration of @ref CLDeconvolutionLayerUpsample
*
* @param[in] input Source tensor info. Data type supported: All.
@@ -86,9 +98,9 @@ public:
void run() override;
private:
- CLDeconvolutionLayerUpsampleKernel _upsample;
- CLMemsetKernel _memset;
- ICLTensor *_output;
+ std::unique_ptr<CLDeconvolutionLayerUpsampleKernel> _upsample;
+ CLFill _fill;
+ ICLTensor *_output;
};
} // namespace arm_compute
#endif /* ARM_COMPUTE_CLDECONVOLUTIONLAYERUPSAMPLE_H */
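
Usage sketch (not part of this patch): this helper is normally driven by the deconvolution function itself, which also derives the scattered output shape. To keep the sketch safe, it uses a degenerate stride-1 configuration where the fill-then-scatter is effectively a copy and dst matches src; the shape relationship for larger strides is deliberately left out.

#include "arm_compute/runtime/CL/CLScheduler.h"
#include "arm_compute/runtime/CL/CLTensor.h"
#include "arm_compute/runtime/CL/functions/CLDeconvolutionLayerUpsample.h"

using namespace arm_compute;

void upsample_sketch()
{
    CLScheduler::get().default_init();

    CLTensor src, dst;
    src.allocator()->init(TensorInfo(TensorShape(4U, 4U, 2U), 1, DataType::F32));
    dst.allocator()->init(TensorInfo(TensorShape(4U, 4U, 2U), 1, DataType::F32));

    CLDeconvolutionLayerUpsample upsample;
    upsample.configure(&src, &dst, PadStrideInfo(1, 1, 0, 0)); // stride 1: fill + 1:1 scatter

    src.allocator()->allocate();
    dst.allocator()->allocate();
    upsample.run();
    CLScheduler::get().sync();
}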
diff --git a/arm_compute/runtime/CL/functions/CLDepthConvertLayer.h b/arm_compute/runtime/CL/functions/CLDepthConvertLayer.h
index 910b9eac51..3e7ca8830b 100644
--- a/arm_compute/runtime/CL/functions/CLDepthConvertLayer.h
+++ b/arm_compute/runtime/CL/functions/CLDepthConvertLayer.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2016-2020 ARM Limited.
+ * Copyright (c) 2016-2021 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -25,31 +25,49 @@
#define ARM_COMPUTE_CLDEPTHCONVERT_H
#include "arm_compute/core/Types.h"
-#include "arm_compute/runtime/CL/ICLSimpleFunction.h"
+#include "arm_compute/runtime/IFunction.h"
-#include <cstdint>
+#include <memory>
namespace arm_compute
{
+class CLCompileContext;
class ICLTensor;
+class ITensorInfo;
-/** Basic function to run @ref CLDepthConvertLayerKernel. */
-class CLDepthConvertLayer : public ICLSimpleFunction
+/** Basic function to run @ref opencl::kernels::ClCastKernel */
+class CLDepthConvertLayer : public IFunction
{
public:
+ /** Constructor */
+ CLDepthConvertLayer();
+ /** Destructor */
+ ~CLDepthConvertLayer();
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ CLDepthConvertLayer(const CLDepthConvertLayer &) = delete;
+ /** Default move constructor */
+ CLDepthConvertLayer(CLDepthConvertLayer &&);
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ CLDepthConvertLayer &operator=(const CLDepthConvertLayer &) = delete;
+ /** Default move assignment operator */
+ CLDepthConvertLayer &operator=(CLDepthConvertLayer &&);
/** Initialize the function's source, destination
*
- * Input data type must be different than output data type.
+ * Valid data layouts:
+ * - All
*
- * Valid conversions Input -> Output :
+ * Valid data type configurations:
+ * |src |dst |
+ * |:--------------|:-------------------------------------|
+ * |U8 | S8, U16, S16, U32, S32, F16, F32 |
+ * |U16 | U8, S8, S16, U32, S32, F16, F32 |
+ * |S16 | U8, S8, U16, U32, S32, F16, F32 |
+ * |U32 | U8, S8, U16, S16, S32, F16, F32 |
+ * |S32 | U8, S8, U16, S16, U32, F16, F32 |
+ * |F16 | U8, S8, U16, S16, U32, F32 |
+ * |F32 | U8, S8, U16, S16, U32, F16 |
*
- * - U8 -> S8, U16, S16, U32, S32, F16, F32
- * - U16 -> U8, S8, S16, U32, S32, F16, F32
- * - S16 -> U8, S8, U16, U32, S32, F16, F32
- * - U32 -> U8, S8, U16, S16, S32, F16, F32
- * - S32 -> U8, S8, U16, S16, U32, F16, F32
- * - F16 -> U8, S8, U16, S16, U32, F32
- * - F32 -> U8, S8, U16, S16, U32, F16
+ * Input data type must be different than output data type.
*
* @param[in] input The input tensor to convert. Data types supported: U8/S8/U16/S16/U32/S32/F16/F32.
* @param[out] output The output tensor. Data types supported: U8/S8/U16/S16/U32/S32/F16/F32.
@@ -77,7 +95,11 @@ public:
* @param[in] policy Conversion policy.
* @param[in] shift Value for down/up conversions. Must be 0 <= shift < 8.
*/
- void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, ConvertPolicy policy, uint32_t shift);
+ void configure(const CLCompileContext &compile_context,
+ const ICLTensor *input,
+ ICLTensor *output,
+ ConvertPolicy policy,
+ uint32_t shift);
/** Static function to check if given info will lead to a valid configuration of @ref CLDepthConvertLayer
*
* @param[in] input Source tensor info. Data types supported: U8/S8/U16/S16/U32/S32/F16/F32.
@@ -88,6 +110,13 @@ public:
* @return a status
*/
static Status validate(const ITensorInfo *input, const ITensorInfo *output, ConvertPolicy policy, uint32_t shift);
+
+ // Inherited methods overridden:
+ void run() override;
+
+private:
+ struct Impl;
+ std::unique_ptr<Impl> _impl;
};
} // namespace arm_compute
#endif /*ARM_COMPUTE_CLDEPTHCONVERT_H*/
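
Usage sketch (not part of this patch): a U8 -> F32 conversion under the ported interface; per the table above the source and destination types must differ, and shift must satisfy 0 <= shift < 8.

#include "arm_compute/runtime/CL/CLScheduler.h"
#include "arm_compute/runtime/CL/CLTensor.h"
#include "arm_compute/runtime/CL/functions/CLDepthConvertLayer.h"

using namespace arm_compute;

void depth_convert_sketch()
{
    CLScheduler::get().default_init();

    CLTensor src, dst;
    src.allocator()->init(TensorInfo(TensorShape(16U, 16U), 1, DataType::U8));
    dst.allocator()->init(TensorInfo(TensorShape(16U, 16U), 1, DataType::F32));

    CLDepthConvertLayer convert;
    convert.configure(&src, &dst, ConvertPolicy::SATURATE, 0 /* shift */);

    src.allocator()->allocate();
    dst.allocator()->allocate();
    convert.run();
    CLScheduler::get().sync();
}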
diff --git a/arm_compute/runtime/CL/functions/CLDepthToSpaceLayer.h b/arm_compute/runtime/CL/functions/CLDepthToSpaceLayer.h
index dbf5898319..14d0a7ec7c 100644
--- a/arm_compute/runtime/CL/functions/CLDepthToSpaceLayer.h
+++ b/arm_compute/runtime/CL/functions/CLDepthToSpaceLayer.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019-2020 ARM Limited.
+ * Copyright (c) 2019-2021 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -29,7 +29,9 @@
namespace arm_compute
{
+class CLCompileContext;
class ICLTensor;
+class ITensorInfo;
/** Basic function to run @ref CLDepthToSpaceLayerKernel. */
class CLDepthToSpaceLayer : public ICLSimpleFunction
@@ -37,6 +39,15 @@ class CLDepthToSpaceLayer : public ICLSimpleFunction
public:
/** Set the input and output tensors.
*
+ * Valid data layouts:
+ * - NHWC
+ * - NCHW
+ *
+ * Valid data type configurations:
+ * |src |dst |
+ * |:--------------|:--------------|
+ * |All |All |
+ *
* @param[in] input Tensor input. Supported tensor rank: 4. Data types supported: All.
* @param[out] output Tensor output. Data types supported: same as @p input
* @param[in] block_shape Block shape value.
@@ -49,7 +60,8 @@ public:
* @param[out] output Tensor output. Data types supported: same as @p input
* @param[in] block_shape Block shape value.
*/
- void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, int32_t block_shape);
+ void
+ configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, int32_t block_shape);
/** Static function to check if given info will lead to a valid configuration of @ref CLDepthToSpaceLayer.
*
* @param[in] input Tensor input info. Supported tensor rank: 4. Data types supported: All.
@@ -60,5 +72,5 @@ public:
*/
static Status validate(const ITensorInfo *input, const ITensorInfo *output, int32_t block_shape);
};
-}
+} // namespace arm_compute
#endif /* ARM_COMPUTE_CLDEPTHTOSPACELAYER_H */
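For reference, a minimal sketch of the call sequence for CLDepthToSpaceLayer. The 4x4x8 NCHW input and block_shape of 2 are assumptions chosen so the 8 channels redistribute into 2x2 spatial blocks, yielding an 8x8x2 output:

    #include "arm_compute/runtime/CL/CLFunctions.h"
    #include "arm_compute/runtime/CL/CLScheduler.h"
    #include "arm_compute/runtime/CL/CLTensor.h"

    using namespace arm_compute;

    int main()
    {
        CLScheduler::get().default_init();

        // Assumed shapes (default NCHW layout): W=4, H=4, C=8 in; W=8, H=8, C=2 out.
        CLTensor src, dst;
        src.allocator()->init(TensorInfo(TensorShape(4U, 4U, 8U), 1, DataType::F32));
        dst.allocator()->init(TensorInfo(TensorShape(8U, 8U, 2U), 1, DataType::F32));

        CLDepthToSpaceLayer d2s;
        d2s.configure(&src, &dst, /* block_shape */ 2); // channels -> 2x2 spatial blocks

        src.allocator()->allocate();
        dst.allocator()->allocate();
        d2s.run();
        CLScheduler::get().sync();
        return 0;
    }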
diff --git a/arm_compute/runtime/CL/functions/CLDepthwiseConvolutionLayer.h b/arm_compute/runtime/CL/functions/CLDepthwiseConvolutionLayer.h
index f15271b63c..2c0fa7aa22 100644
--- a/arm_compute/runtime/CL/functions/CLDepthwiseConvolutionLayer.h
+++ b/arm_compute/runtime/CL/functions/CLDepthwiseConvolutionLayer.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2017-2021, 2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -24,13 +24,8 @@
#ifndef ARM_COMPUTE_CLDEPTHWISECONVOLUTION_H
#define ARM_COMPUTE_CLDEPTHWISECONVOLUTION_H
-#include "arm_compute/core/CL/kernels/CLDepthwiseConvolutionLayer3x3NCHWKernel.h"
-#include "arm_compute/core/CL/kernels/CLDepthwiseConvolutionLayer3x3NHWCKernel.h"
-#include "arm_compute/core/CL/kernels/CLDepthwiseConvolutionLayerNativeKernel.h"
-#include "arm_compute/core/CL/kernels/CLDepthwiseConvolutionLayerReshapeWeightsKernel.h"
-#include "arm_compute/core/CL/kernels/CLFillBorderKernel.h"
-#include "arm_compute/core/CL/kernels/ICLDepthwiseConvolutionLayer3x3Kernel.h"
#include "arm_compute/core/Types.h"
+#include "arm_compute/function_info/ActivationLayerInfo.h"
#include "arm_compute/runtime/CL/CLTensor.h"
#include "arm_compute/runtime/CL/functions/CLPermute.h"
#include "arm_compute/runtime/IFunction.h"
@@ -38,9 +33,15 @@
namespace arm_compute
{
+class CLCompileContext;
+class CLDepthwiseConvolutionLayerNativeKernel;
class ICLTensor;
/** Function to execute a depthwise convolution
+ *
+ * -# @ref CLDepthwiseConvolutionLayerNativeKernel
+ * -# @ref CLPermute (if the data layout is NCHW)
+ *
*/
class CLDepthwiseConvolutionLayer : public IFunction
{
@@ -55,288 +56,105 @@ public:
CLDepthwiseConvolutionLayer &operator=(const CLDepthwiseConvolutionLayer &) = delete;
/** Default move assignment operator */
CLDepthwiseConvolutionLayer &operator=(CLDepthwiseConvolutionLayer &&) = default;
+ /** Default destructor */
+ ~CLDepthwiseConvolutionLayer();
/** Initialize the function's source, destination, weights and convolution information.
*
- * @param[in, out] input Source tensor. Data type supported: QASYMM8/FP16/FP32. Data layout supported: NHWC, NCHW
+ * Valid data layouts:
+ * - NHWC
+ * - NCHW
+ *
+ * Valid data type configurations:
+ * |src0 |src1 |src2 |dst |
+ * |:--------------|:------------------|:------|:--------------|
+ * |F16 |F16 |F16 |F16 |
+ * |F32 |F32 |F32 |F32 |
+ * |QASYMM8 |QASYMM8 |S32 |QASYMM8 |
+ * |QASYMM8 |QSYMM8_PER_CHANNEL |S32 |QASYMM8 |
+ * |QASYMM8_SIGNED |QASYMM8_SIGNED |S32 |QASYMM8_SIGNED |
+ * |QASYMM8_SIGNED |QSYMM8_PER_CHANNEL |S32 |QASYMM8_SIGNED |
+ *
+ * @param[in] compile_context The compile context to be used.
+ * @param[in, out] input Source tensor. Data type supported: QASYMM8/QASYMM8_SIGNED/FP16/FP32. Data layout supported: NHWC, NCHW
* @param[in] weights Weights tensor. These are 3D tensors with shape [kernel_x, kernel_y, IFM].
- * Data type supported: Same as @p input or QASYMM8/QSYMM8_PER_CHANNEL when @p input is QASYMM8.
+ * Data type supported: Same as @p input or QASYMM8/QASYMM8_SIGNED/QSYMM8_PER_CHANNEL when @p input is QASYMM8.
* @param[in] biases Biases tensor. A 1D tensor with shape [IFM]. Must be nullptr if not needed.
- * Data type supported: Same as @p input, S32 when input is QASYMM8.
- * @param[out] output Destination tensor. Data type supported: same as @p input.
+ * Data type supported: Same as @p input, S32 when input is QASYMM8/QASYMM8_SIGNED.
+ * @param[out] output Destination tensor. Pass in nullptr or @p input for in-place operation. Data type supported: same as @p input.
* @param[in] conv_info Padding and stride information to use for the convolution.
* @param[in] depth_multiplier (Optional) Multiplier to apply to the input's depth in order to retrieve the output's depth. Defaults to 1.
* @param[in] act_info (Optional) Activation layer information in case of a fused activation.
* @param[in] dilation (Optional) Dilation, in elements, across x and y. Defaults to (1, 1).
+ *
+ * @note For in-place support, please check @ref CLDepthwiseConvolutionLayerNativeKernel
*/
- void configure(ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const PadStrideInfo &conv_info, unsigned int depth_multiplier = 1,
- ActivationLayerInfo act_info = ActivationLayerInfo(), const Size2D &dilation = Size2D(1U, 1U));
+ void configure(const CLCompileContext &compile_context,
+ ICLTensor *input,
+ const ICLTensor *weights,
+ const ICLTensor *biases,
+ ICLTensor *output,
+ const PadStrideInfo &conv_info,
+ unsigned int depth_multiplier = 1,
+ ActivationLayerInfo act_info = ActivationLayerInfo(),
+ const Size2D &dilation = Size2D(1U, 1U));
+
/** Initialize the function's source, destination, weights and convolution information.
*
- * @param[in] compile_context The compile context to be used.
- * @param[in, out] input Source tensor. Data type supported: QASYMM8/FP16/FP32. Data layout supported: NHWC, NCHW
- * @param[in] weights Weights tensor. These are 3D tensors with shape [kernel_x, kernel_y, IFM].
- * Data type supported: Same as @p input or QASYMM8/QSYMM8_PER_CHANNEL when @p input is QASYMM8.
- * @param[in] biases Biases tensor. A 1D tensor with shape [IFM]. Must be nullptr if not needed.
- * Data type supported: Same as @p input, S32 when input is QASYMM8.
- * @param[out] output Destination tensor. Data type supported: same as @p input.
- * @param[in] conv_info Padding and stride information to use for the convolution.
- * @param[in] depth_multiplier (Optional) Multiplier to apply to the input's depth in order to retrieve the output's depth. Defaults to 1.
- * @param[in] act_info (Optional) Activation layer information in case of a fused activation.
- * @param[in] dilation (Optional) Dilation, in elements, across x and y. Defaults to (1, 1).
+ * Similar to @ref CLDepthwiseConvolutionLayer::configure()
*/
- void configure(const CLCompileContext &compile_context, ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const PadStrideInfo &conv_info,
- unsigned int depth_multiplier = 1, ActivationLayerInfo act_info = ActivationLayerInfo(), const Size2D &dilation = Size2D(1U, 1U));
+ void configure(ICLTensor *input,
+ const ICLTensor *weights,
+ const ICLTensor *biases,
+ ICLTensor *output,
+ const PadStrideInfo &conv_info,
+ unsigned int depth_multiplier = 1,
+ ActivationLayerInfo act_info = ActivationLayerInfo(),
+ const Size2D &dilation = Size2D(1U, 1U));
/** Static function to check if given info will lead to a valid configuration of @ref CLDepthwiseConvolutionLayer
*
- * @param[in] input Source tensor info. Data type supported: QASYMM8/FP16/FP32. Data layout supported: NHWC, NCHW
- * @param[in] weights Weights tensor info. These are 3D tensors with shape [kernel_x, kernel_y, IFM].
- * Data type supported: Same as @p input or QASYMM8/QSYMM8_PER_CHANNEL when @p input is QASYMM8.
- * @param[in] biases Biases tensor info. A 1D tensor with shape [IFM]. Must be nullptr if not needed.
- * Data type supported: Same as @p input, S32 when input is QASYMM8.
- * @param[in] output Destination tensor. Data type supported: same as @p input.
- * @param[in] conv_info Padding and stride information to use for the convolution.
- * @param[in] depth_multiplier (Optional) Multiplier to apply to the input's depth in order to retrieve the output's depth. Defaults to 1.
- * @param[in] act_info (Optional) Activation layer information in case of a fused activation. Only RELU, BOUNDED_RELU and LU_BOUNDED_RELU for 3x3 QASYMM8 supported.
- * @param[in] dilation (Optional) Dilation, in elements, across x and y. Defaults to (1, 1).
+ * Similar to @ref CLDepthwiseConvolutionLayer::configure()
*
* @return a status
*/
- static Status validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info, unsigned int depth_multiplier = 1,
- ActivationLayerInfo act_info = ActivationLayerInfo(), const Size2D &dilation = Size2D(1U, 1U));
+ static Status validate(const ITensorInfo *input,
+ const ITensorInfo *weights,
+ const ITensorInfo *biases,
+ const ITensorInfo *output,
+ const PadStrideInfo &conv_info,
+ unsigned int depth_multiplier = 1,
+ ActivationLayerInfo act_info = ActivationLayerInfo(),
+ const Size2D &dilation = Size2D(1U, 1U));
// Inherited methods overridden:
void run() override;
void prepare() override;
-private:
- /** Static function to choose the best depthwise convolution function for @ref CLDepthwiseConvolutionLayer
- *
- * @param[in] input Source tensor info. Data type supported: QASYMM8/FP16/FP32. Data layout supported: NHWC, NCHW
- * @param[in] weights Weights tensor info. These are 3D tensors with shape [kernel_x, kernel_y, IFM].
- * Data type supported: Same as @p input or QASYMM8/QSYMM8_PER_CHANNEL when @p input is QASYMM8.
- * @param[in] biases Biases tensor info. A 1D tensor with shape [IFM]. Must be nullptr if not needed.
- * Data type supported: Same as @p input, S32 when input is QASYMM8.
- * @param[in] output Destination tensor. Data type supported: same as @p input.
- * @param[in] conv_info Padding and stride information to use for the convolution.
- * @param[in] depth_multiplier (Optional) Multiplier to apply to the input's depth in order to retrieve the output's depth. Defaults to 1.
- * @param[in] act_info (Optional) Activation layer information in case of a fused activation. Only RELU, BOUNDED_RELU and LU_BOUNDED_RELU for 3x3 QASYMM8 supported.
- * @param[in] dilation (Optional) Dilation, in elements, across x and y. Defaults to (1, 1).
- * @param[in] gpu_target (Optional) GPU target to validate the kernel for. Defaults to midgard.
- *
- * @return a Depthwise Convolution Function
- */
- static DepthwiseConvolutionFunction get_depthwiseconvolution_function(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output,
- const PadStrideInfo &conv_info, unsigned int depth_multiplier = 1,
- ActivationLayerInfo act_info = ActivationLayerInfo(), const Size2D &dilation = Size2D(1U, 1U), GPUTarget gpu_target = GPUTarget::MIDGARD);
-
- /** Basic function to execute a depthwise convolution for kernel size 3x3xC (when data layout NCHW) or Cx3x3 (when data layout NHWC). This function calls the following OpenCL kernels:
- *
- * -# @ref CLDepthwiseConvolutionLayer3x3NCHWKernel (if data_layout == NCHW)
- * -# @ref CLDepthwiseConvolutionLayer3x3NHWCKernel (if data_layout == NHWC)
- * -# @ref CLDepthwiseConvolutionLayerReshapeWeightsKernel (if data_layout == NHWC)
- * -# @ref CLFillBorderKernel (if pad_x or pad_y > 0)
- *
- */
- class CLDepthwiseConvolutionLayerInternal3x3 : public IFunction
+ void set_memory_group(std::shared_ptr<IMemoryManager> memory_manager)
{
- public:
- /** Default constructor */
- CLDepthwiseConvolutionLayerInternal3x3(std::shared_ptr<IMemoryManager> memory_manager = nullptr);
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- CLDepthwiseConvolutionLayerInternal3x3(const CLDepthwiseConvolutionLayerInternal3x3 &) = delete;
- /** Default move constructor */
- CLDepthwiseConvolutionLayerInternal3x3(CLDepthwiseConvolutionLayerInternal3x3 &&) = default;
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- CLDepthwiseConvolutionLayerInternal3x3 &operator=(const CLDepthwiseConvolutionLayerInternal3x3 &) = delete;
- /** Default move assignment operator */
- CLDepthwiseConvolutionLayerInternal3x3 &operator=(CLDepthwiseConvolutionLayerInternal3x3 &&) = default;
- /** Initialize the function's source, destination, conv and border_size.
- *
- * @param[in, out] input Source tensor. Data type supported: QASYMM8/F16/F32. (Written to only for border filling).
- * @param[in] weights Weights tensor. A 3D tensor with shape [3, 3, IFM].
- * Data type supported: Same as @p input or QASYMM8/QSYMM8_PER_CHANNEL when @p input is QASYMM8.
- * @param[in] biases Biases tensor. A 1D tensor with shape [IFM]. Must be nullptr if not needed.
- * Data type supported: Same as @p input.
- * @param[out] output Destination tensor. Data type supported: same as @p input.
- * @param[in] conv_info Padding and stride information to use for the convolution.
- * @param[in] depth_multiplier (Optional) Multiplier to apply to the input's depth in order to retrieve the output's depth. Defaults to 1.
- * @param[in] act_info (Optional) Activation layer information in case of a fused activation. Only RELU, BOUNDED_RELU and LU_BOUNDED_RELU for 3x3 QASYMM8 supported.
- * @param[in] dilation (Optional) Dilation, in elements, across x and y. Defaults to (1, 1).
- */
- void configure(ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const PadStrideInfo &conv_info, unsigned int depth_multiplier = 1,
- ActivationLayerInfo act_info = ActivationLayerInfo(), const Size2D &dilation = Size2D(1U, 1U));
- /** Initialize the function's source, destination, conv and border_size.
- *
- * @param[in] compile_context The compile context to be used.
- * @param[in, out] input Source tensor. Data type supported: QASYMM8/F16/F32. (Written to only for border filling).
- * @param[in] weights Weights tensor. A 3D tensor with shape [3, 3, IFM].
- * Data type supported: Same as @p input or QASYMM8/QSYMM8_PER_CHANNEL when @p input is QASYMM8.
- * @param[in] biases Biases tensor. A 1D tensor with shape [IFM]. Must be nullptr if not needed.
- * Data type supported: Same as @p input.
- * @param[out] output Destination tensor. Data type supported: same as @p input.
- * @param[in] conv_info Padding and stride information to use for the convolution.
- * @param[in] depth_multiplier (Optional) Multiplier to apply to the input's depth in order to retrieve the output's depth. Defaults to 1.
- * @param[in] act_info (Optional) Activation layer information in case of a fused activation. Only RELU, BOUNDED_RELU and LU_BOUNDED_RELU for 3x3 QASYMM8 supported.
- * @param[in] dilation (Optional) Dilation, in elements, across x and y. Defaults to (1, 1).
- */
- void configure(const CLCompileContext &compile_context, ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const PadStrideInfo &conv_info,
- unsigned int depth_multiplier = 1, ActivationLayerInfo act_info = ActivationLayerInfo(), const Size2D &dilation = Size2D(1U, 1U));
-
- /** Static function to check if given info will lead to a valid configuration of @ref CLDepthwiseConvolutionLayer3x3
- *
- * @param[in] input Source tensor info. Data type supported: QASYMM8 for all layouts, F16/F32 for NCHW.
- * @param[in] weights Weights tensor info. A 3D tensor with shape [3, 3, IFM].
- * Data type supported: Same as @p input or QASYMM8/QSYMM8_PER_CHANNEL when @p input is QASYMM8.
- * @param[in] biases Biases tensor info. A 1D tensor with shape [IFM]. Must be nullptr if not needed.
- * Data type supported: Same as @p input, S32 when input is QASYMM8.
- * @param[in] output Destination tensor. Data type supported: same as @p input.
- * @param[in] conv_info Padding and stride information to use for the convolution.
- * @param[in] depth_multiplier (Optional) Multiplier to apply to the input's depth in order to retrieve the output's depth. Defaults to 1.
- * @param[in] act_info (Optional) Activation layer information in case of a fused activation. Only RELU, BOUNDED_RELU and LU_BOUNDED_RELU for 3x3 QASYMM8 supported.
- * @param[in] dilation (Optional) Dilation, in elements, across x and y. Defaults to (1, 1).
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info, unsigned int depth_multiplier = 1,
- ActivationLayerInfo act_info = ActivationLayerInfo(), GPUTarget gpu_target = GPUTarget::MIDGARD, const Size2D &dilation = Size2D(1U, 1U));
-
- // Inherited methods overriden:
- void run() override;
- void prepare() override;
-
- void set_memory_group(std::shared_ptr<IMemoryManager> memory_manager)
- {
- _memory_group = MemoryGroup(std::move(memory_manager));
- };
-
- private:
- MemoryGroup _memory_group;
- std::unique_ptr<ICLDepthwiseConvolutionLayer3x3Kernel> _kernel;
- CLFillBorderKernel _border_handler;
- CLPermute _permute_input_to_nchw;
- CLPermute _permute_weights_to_nchw;
- CLPermute _permute_output_to_nhwc;
- CLDepthwiseConvolutionLayerReshapeWeightsKernel _reshape_weights;
- CLTensor _permuted_input;
- CLTensor _permuted_weights;
- CLTensor _permuted_output;
- CLTensor _output_multipliers;
- CLTensor _output_shifts;
- const ITensor *_original_weights;
- const ITensor *_input;
- const ITensor *_output;
- bool _needs_permute;
- bool _needs_weights_reshape;
- bool _is_prepared;
- bool _is_quantized;
+ _memory_group = MemoryGroup(std::move(memory_manager));
};
- /** Basic function to execute a generic depthwise convolution. This function calls the following OpenCL kernels:
- *
- * -# @ref CLDepthwiseConvolutionLayerNativeKernel
- * -# @ref CLPermute (x 3) if the data layout is NCHW
- *
- */
- class CLDepthwiseConvolutionLayerGeneric : public IFunction
- {
- public:
- /** Default constructor */
- CLDepthwiseConvolutionLayerGeneric(std::shared_ptr<IMemoryManager> memory_manager = nullptr);
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- CLDepthwiseConvolutionLayerGeneric(const CLDepthwiseConvolutionLayerGeneric &) = delete;
- /** Default move constructor */
- CLDepthwiseConvolutionLayerGeneric(CLDepthwiseConvolutionLayerGeneric &&) = default;
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- CLDepthwiseConvolutionLayerGeneric &operator=(const CLDepthwiseConvolutionLayerGeneric &) = delete;
- /** Default move assignment operator */
- CLDepthwiseConvolutionLayerGeneric &operator=(CLDepthwiseConvolutionLayerGeneric &&) = default;
- /** Initialize the function's source, destination, weights and convolution information.
- *
- * @param[in, out] input Source tensor. Data type supported: QASYMM8/QASYMM8_SIGNED/F32. (Written to only for border filling).
- * @param[in] weights Weights tensor. These are 3D tensors with shape [kernel_x, kernel_y, IFM].
- * Data type supported: Same as @p input or QASYMM8/QASYMM8_SIGNED/QSYMM8_PER_CHANNEL when @p input is QASYMM8.
- * @param[in] biases Biases tensor. A 1D tensor with shape [IFM]. Must be nullptr if not needed.
- * Data type supported: Same as @p input, S32 when input is QASYMM8/QASYMM8_SIGNED.
- * @param[out] output Destination tensor. Data type supported: same as @p input.
- * @param[in] conv_info Padding and stride information to use for the convolution.
- * @param[in] depth_multiplier (Optional) Multiplier to apply to the input's depth in order to retrieve the output's depth. Defaults to 1.
- * @param[in] act_info (Optional) Activation layer information in case of a fused activation.
- * @param[in] dilation (Optional) Dilation, in elements, across x and y. Defaults to (1, 1).
- */
- void configure(ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const PadStrideInfo &conv_info,
- unsigned int depth_multiplier = 1, const ActivationLayerInfo &act_info = ActivationLayerInfo(), const Size2D &dilation = Size2D(1U, 1U));
- /** Initialize the function's source, destination, weights and convolution information.
- *
- * @param[in] compile_context The compile context to be used.
- * @param[in, out] input Source tensor. Data type supported: QASYMM8/QASYMM8_SIGNED/F32. (Written to only for border filling).
- * @param[in] weights Weights tensor. These are 3D tensors with shape [kernel_x, kernel_y, IFM].
- * Data type supported: Same as @p input or QASYMM8/QASYMM8_SIGNED/QSYMM8_PER_CHANNEL when @p input is QASYMM8.
- * @param[in] biases Biases tensor. A 1D tensor with shape [IFM]. Must be nullptr if not needed.
- * Data type supported: Same as @p input, S32 when input is QASYMM8/QASYMM8_SIGNED.
- * @param[out] output Destination tensor. Data type supported: same as @p input.
- * @param[in] conv_info Padding and stride information to use for the convolution.
- * @param[in] depth_multiplier (Optional) Multiplier to apply to the input's depth in order to retrieve the output's depth. Defaults to 1.
- * @param[in] act_info (Optional) Activation layer information in case of a fused activation.
- * @param[in] dilation (Optional) Dilation, in elements, across x and y. Defaults to (1, 1).
- */
- void configure(const CLCompileContext &compile_context, ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const PadStrideInfo &conv_info,
- unsigned int depth_multiplier = 1, const ActivationLayerInfo &act_info = ActivationLayerInfo(), const Size2D &dilation = Size2D(1U, 1U));
-
- /** Static function to check if given info will lead to a valid configuration of @ref CLDepthwiseConvolutionLayerGeneric
- *
- * @param[in] input Source tensor info. Data type supported: QASYMM8/QASYMM8_SIGNED/F32.
- * @param[in] weights Weights tensor info. These are 3D tensors with shape [kernel_x, kernel_y, IFM].
- * Data type supported: Same as @p input or QASYMM8/QASYMM8_SIGNED/QSYMM8_PER_CHANNEL when @p input is QASYMM8.
- * @param[in] biases Biases tensor info. A 1D tensor with shape [IFM]. Must be nullptr if not needed.
- * Data type supported: Same as @p input, S32 when input is QASYMM8/QASYMM8_SIGNED.
- * @param[in] output Destination tensor. Data type supported: same as @p input.
- * @param[in] conv_info Padding and stride information to use for the convolution.
- * @param[in] depth_multiplier (Optional) Multiplier to apply to the input's depth in order to retrieve the output's depth. Defaults to 1.
- * @param[in] act_info (Optional) Activation layer information in case of a fused activation.
- * @param[in] dilation (Optional) Dilation, in elements, across x and y. Defaults to (1, 1).
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info,
- unsigned int depth_multiplier = 1, const ActivationLayerInfo &act_info = ActivationLayerInfo(), const Size2D &dilation = Size2D(1U, 1U));
-
- // Inherited methods overriden:
- void run() override;
- void prepare() override;
-
- void set_memory_group(std::shared_ptr<IMemoryManager> memory_manager)
- {
- _memory_group = MemoryGroup(std::move(memory_manager));
- };
-
- private:
- MemoryGroup _memory_group;
-
- CLDepthwiseConvolutionLayerNativeKernel _dwc_native_kernel;
- CLPermute _permute_input_to_nhwc;
- CLPermute _permute_weights_to_nhwc;
- CLPermute _permute_output_to_nchw;
-
- CLTensor _permuted_input;
- CLTensor _permuted_weights;
- CLTensor _permuted_output;
- CLTensor _output_multipliers;
- CLTensor _output_shifts;
- const ITensor *_original_weights;
- const ITensor *_input;
- const ITensor *_output;
-
- bool _needs_permute;
- bool _is_prepared;
- bool _is_quantized;
- };
-
- std::shared_ptr<IMemoryManager> _memory_manager;
-
- DepthwiseConvolutionFunction _depth_conv_func;
- CLDepthwiseConvolutionLayerInternal3x3 _func_3x3;
- CLDepthwiseConvolutionLayerGeneric _func_generic;
+private:
+ MemoryGroup _memory_group;
+
+ std::unique_ptr<CLDepthwiseConvolutionLayerNativeKernel> _dwc_native_kernel;
+ CLPermute _permute_input_to_nhwc;
+ CLPermute _permute_weights_to_nhwc;
+ CLPermute _permute_output_to_nchw;
+
+ CLTensor _permuted_input;
+ CLTensor _permuted_weights;
+ CLTensor _permuted_output;
+ CLTensor _output_multipliers;
+ CLTensor _output_shifts;
+ const ITensor *_original_weights;
+ const ITensor *_input;
+ const ITensor *_output;
+
+ bool _needs_permute;
+ bool _is_prepared;
+ bool _is_quantized;
};
} // namespace arm_compute
#endif /*ARM_COMPUTE_CLDEPTHWISECONVOLUTION_H */
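A hedged sketch of the simplified depthwise interface above, using assumed F32 NCHW shapes: a 32x32 input with 8 channels, a 3x3 kernel, stride 1 and padding 1 so the spatial size is preserved. Shapes and values are illustrative only:

    #include "arm_compute/runtime/CL/CLFunctions.h"
    #include "arm_compute/runtime/CL/CLScheduler.h"
    #include "arm_compute/runtime/CL/CLTensor.h"

    using namespace arm_compute;

    int main()
    {
        CLScheduler::get().default_init();

        CLTensor src, weights, biases, dst;
        src.allocator()->init(TensorInfo(TensorShape(32U, 32U, 8U), 1, DataType::F32));
        weights.allocator()->init(TensorInfo(TensorShape(3U, 3U, 8U), 1, DataType::F32)); // [kernel_x, kernel_y, IFM]
        biases.allocator()->init(TensorInfo(TensorShape(8U), 1, DataType::F32));          // [IFM]
        dst.allocator()->init(TensorInfo(TensorShape(32U, 32U, 8U), 1, DataType::F32));

        const PadStrideInfo conv_info(1, 1, 1, 1); // stride 1, pad 1

        CLDepthwiseConvolutionLayer dwc;
        dwc.configure(&src, &weights, &biases, &dst, conv_info, /* depth_multiplier */ 1);

        for(auto *t : { &src, &weights, &biases, &dst })
        {
            t->allocator()->allocate();
        }

        dwc.run();
        CLScheduler::get().sync();
        return 0;
    }

For NCHW inputs the function inserts the CLPermute stages listed above internally; callers provide the tensors in their native layout.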
diff --git a/arm_compute/runtime/CL/functions/CLDequantizationLayer.h b/arm_compute/runtime/CL/functions/CLDequantizationLayer.h
index c0a0fcd988..b01fe9eb14 100644
--- a/arm_compute/runtime/CL/functions/CLDequantizationLayer.h
+++ b/arm_compute/runtime/CL/functions/CLDequantizationLayer.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2017-2021 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -24,21 +24,48 @@
#ifndef ARM_COMPUTE_CLDEQUANTIZATIONLAYER_H
#define ARM_COMPUTE_CLDEQUANTIZATIONLAYER_H
-#include "arm_compute/runtime/CL/ICLSimpleFunction.h"
-
#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/IFunction.h"
+
+#include <memory>
namespace arm_compute
{
// Forward declarations
+class CLCompileContext;
class ICLTensor;
+class ITensorInfo;
-/** Basic function to run @ref CLDequantizationLayerKernel that dequantizes an input tensor */
-class CLDequantizationLayer : public ICLSimpleFunction
+/** Basic function to run @ref opencl::ClDequantize that dequantizes an input tensor */
+class CLDequantizationLayer : public IFunction
{
public:
+ /** Default Constructor */
+ CLDequantizationLayer();
+ /** Default Destructor */
+ ~CLDequantizationLayer();
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ CLDequantizationLayer(const CLDequantizationLayer &) = delete;
+ /** Default move constructor */
+ CLDequantizationLayer(CLDequantizationLayer &&) = default;
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ CLDequantizationLayer &operator=(const CLDequantizationLayer &) = delete;
+ /** Default move assignment operator */
+ CLDequantizationLayer &operator=(CLDequantizationLayer &&) = default;
/** Set the input and output tensors.
*
+ * Valid data layouts:
+ * - All
+ *
+ * Valid data type configurations:
+ * |src |dst |
+ * |:------------------|:---------|
+ * |QASYMM8 |F16, F32 |
+ * |QASYMM8_SIGNED |F16, F32 |
+ * |QSYMM8_PER_CHANNEL |F16, F32 |
+ * |QSYMM8 |F16, F32 |
+ * |QSYMM16 |F16, F32 |
+ *
* @param[in] input Source tensor with at least 3 dimensions. The dimensions over the third will be interpreted as batches.
* Data types supported: QASYMM8/QASYMM8_SIGNED/QSYMM8_PER_CHANNEL/QSYMM8/QSYMM16.
* @param[out] output Destination tensor with the same dimensions of input. Data type supported: F16/F32.
@@ -60,6 +87,13 @@ public:
* @return a status
*/
static Status validate(const ITensorInfo *input, const ITensorInfo *output);
+
+ // Inherited methods overridden:
+ void run() override;
+
+private:
+ struct Impl;
+ std::unique_ptr<Impl> _impl;
};
} // namespace arm_compute
#endif /* ARM_COMPUTE_CLDEQUANTIZATIONLAYER_H */
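A minimal sketch of the pimpl-based CLDequantizationLayer. The QASYMM8 input with scale 0.1 and zero point 128 is an illustrative assumption:

    #include "arm_compute/runtime/CL/CLFunctions.h"
    #include "arm_compute/runtime/CL/CLScheduler.h"
    #include "arm_compute/runtime/CL/CLTensor.h"

    using namespace arm_compute;

    int main()
    {
        CLScheduler::get().default_init();

        // Assumed quantization parameters: scale 0.1, zero point 128 (illustrative only).
        CLTensor q_src, f_dst;
        q_src.allocator()->init(
            TensorInfo(TensorShape(16U, 16U, 3U), 1, DataType::QASYMM8, QuantizationInfo(0.1f, 128)));
        f_dst.allocator()->init(TensorInfo(TensorShape(16U, 16U, 3U), 1, DataType::F32));

        CLDequantizationLayer dequant;
        dequant.configure(&q_src, &f_dst); // dst = (src - zero_point) * scale

        q_src.allocator()->allocate();
        f_dst.allocator()->allocate();
        dequant.run();
        CLScheduler::get().sync();
        return 0;
    }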
diff --git a/arm_compute/runtime/CL/functions/CLDerivative.h b/arm_compute/runtime/CL/functions/CLDerivative.h
deleted file mode 100644
index 5875ceb86d..0000000000
--- a/arm_compute/runtime/CL/functions/CLDerivative.h
+++ /dev/null
@@ -1,72 +0,0 @@
-/*
- * Copyright (c) 2016-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CLDERIVATIVE_H
-#define ARM_COMPUTE_CLDERIVATIVE_H
-
-#include "arm_compute/core/Types.h"
-#include "arm_compute/runtime/CL/ICLSimpleFunction.h"
-
-#include <cstdint>
-
-namespace arm_compute
-{
-class ICLTensor;
-
-/** Basic function to execute first order derivative operator. This function calls the following CL kernels:
- *
- * -# @ref CLFillBorderKernel (executed if border_mode == CONSTANT or border_mode == REPLICATE)
- * -# @ref CLDerivativeKernel
- *
- */
-class CLDerivative : public ICLSimpleFunction
-{
-public:
- /** Initialise the function's source, destinations and border mode.
- *
- * @note At least one of output_x or output_y must be not NULL.
- *
- * @param[in,out] input Source tensor. Data types supported: U8. (Written to only for @p border_mode != UNDEFINED)
- * @param[out] output_x (optional) Destination tensor. Derivative along the X direction. Data types supported: S16.
- * @param[out] output_y (optional) Destination tensor. Derivative along the Y direction. Data types supported: S16.
- * @param[in] border_mode Border mode to use
- * @param[in] constant_border_value (Optional) Constant value to use for borders if border_mode is set to CONSTANT.
- *
- */
- void configure(ICLTensor *input, ICLTensor *output_x, ICLTensor *output_y, BorderMode border_mode, uint8_t constant_border_value = 0);
- /** Initialise the function's source, destinations and border mode.
- *
- * @note At least one of output_x or output_y must be not NULL.
- *
- * @param[in] compile_context The compile context to be used.
- * @param[in,out] input Source tensor. Data types supported: U8. (Written to only for @p border_mode != UNDEFINED)
- * @param[out] output_x (optional) Destination tensor. Derivative along the X direction. Data types supported: S16.
- * @param[out] output_y (optional) Destination tensor. Derivative along the Y direction. Data types supported: S16.
- * @param[in] border_mode Border mode to use
- * @param[in] constant_border_value (Optional) Constant value to use for borders if border_mode is set to CONSTANT.
- *
- */
- void configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output_x, ICLTensor *output_y, BorderMode border_mode, uint8_t constant_border_value = 0);
-};
-}
-#endif /* ARM_COMPUTE_CLDERIVATIVE_H */
diff --git a/arm_compute/runtime/CL/functions/CLDilate.h b/arm_compute/runtime/CL/functions/CLDilate.h
deleted file mode 100644
index cc84820f9f..0000000000
--- a/arm_compute/runtime/CL/functions/CLDilate.h
+++ /dev/null
@@ -1,64 +0,0 @@
-/*
- * Copyright (c) 2016-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CLDILATE_H
-#define ARM_COMPUTE_CLDILATE_H
-
-#include "arm_compute/core/Types.h"
-#include "arm_compute/runtime/CL/ICLSimpleFunction.h"
-
-#include <cstdint>
-
-namespace arm_compute
-{
-class ICLTensor;
-
-/** Basic function to execute dilate. This function calls the following OpenCL kernels:
-*
-* -# @ref CLFillBorderKernel (executed if border_mode == CONSTANT or border_mode == REPLICATE)
-* -# @ref CLDilateKernel
-*
-*/
-class CLDilate : public ICLSimpleFunction
-{
-public:
- /** Initialise the kernel's inputs, output and border mode.
- *
- * @param[in,out] input First tensor input. Data types supported: U8. (Written to only for @p border_mode != UNDEFINED)
- * @param[out] output Output tensor. Data types supported: U8.
- * @param[in] border_mode Border mode to use for the convolution.
- * @param[in] constant_border_value (Optional) Constant value to use for borders if border_mode is set to CONSTANT.
- */
- void configure(ICLTensor *input, ICLTensor *output, BorderMode border_mode, uint8_t constant_border_value = 0);
- /** Initialise the kernel's inputs, output and border mode.
- *
- * @param[in] compile_context The compile context to be used.
- * @param[in,out] input First tensor input. Data types supported: U8. (Written to only for @p border_mode != UNDEFINED)
- * @param[out] output Output tensor. Data types supported: U8.
- * @param[in] border_mode Border mode to use for the convolution.
- * @param[in] constant_border_value (Optional) Constant value to use for borders if border_mode is set to CONSTANT.
- */
- void configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, BorderMode border_mode, uint8_t constant_border_value = 0);
-};
-}
-#endif /*ARM_COMPUTE_CLDILATE_H */
diff --git a/arm_compute/runtime/CL/functions/CLDirectConvolutionLayer.h b/arm_compute/runtime/CL/functions/CLDirectConvolutionLayer.h
index 0c81ffa460..84900b03a3 100644
--- a/arm_compute/runtime/CL/functions/CLDirectConvolutionLayer.h
+++ b/arm_compute/runtime/CL/functions/CLDirectConvolutionLayer.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2017-2021, 2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -24,9 +24,8 @@
#ifndef ARM_COMPUTE_CLDIRECTCONVOLUTIONLAYER_H
#define ARM_COMPUTE_CLDIRECTCONVOLUTIONLAYER_H
-#include "arm_compute/core/CL/kernels/CLDirectConvolutionLayerKernel.h"
-#include "arm_compute/core/CL/kernels/CLFillBorderKernel.h"
#include "arm_compute/core/Types.h"
+#include "arm_compute/function_info/ActivationLayerInfo.h"
#include "arm_compute/runtime/CL/functions/CLActivationLayer.h"
#include "arm_compute/runtime/IFunction.h"
@@ -34,17 +33,41 @@
namespace arm_compute
{
+class CLCompileContext;
class ICLTensor;
+class ITensorInfo;
/** Basic function to execute a direct convolution.
*/
class CLDirectConvolutionLayer : public IFunction
{
public:
- /** Default constructor */
+ /** Constructor */
CLDirectConvolutionLayer();
+ /** Destructor */
+ ~CLDirectConvolutionLayer();
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ CLDirectConvolutionLayer(const CLDirectConvolutionLayer &) = delete;
+ /** Default move constructor */
+ CLDirectConvolutionLayer(CLDirectConvolutionLayer &&);
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ CLDirectConvolutionLayer &operator=(const CLDirectConvolutionLayer &) = delete;
+ /** Default move assignment operator */
+ CLDirectConvolutionLayer &operator=(CLDirectConvolutionLayer &&);
/** Set the input and output tensors.
*
+ * Valid data layouts:
+ * - NHWC
+ * - NCHW
+ *
+ * Valid data type configurations:
+ * |src0 |src1 |src2 |dst |
+ * |:--------------|:--------------|:------|:--------------|
+ * |F16 |F16 |F16 |F16 |
+ * |F32 |F32 |F32 |F32 |
+ * |QASYMM8 |QASYMM8 |S32 |QASYMM8 |
+ * |QASYMM8_SIGNED |QASYMM8_SIGNED |S32 |QASYMM8_SIGNED |
+ *
* @param[in] input Source tensor. 3 lower dimensions represent a single input [width, height, IFM],
* while every optional dimension from 4 and above represent a batch of inputs.
* Data types supported: QASYMM8_SIGNED/QASYMM8/F16/F32.
@@ -56,7 +79,12 @@ public:
* @param[in] conv_info Contains padding and stride information described in @ref PadStrideInfo.
* @param[in] act_info (Optional) Activation layer information in case of a fused activation.
*/
- void configure(ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const PadStrideInfo &conv_info, const ActivationLayerInfo &act_info = ActivationLayerInfo());
+ void configure(ICLTensor *input,
+ const ICLTensor *weights,
+ const ICLTensor *biases,
+ ICLTensor *output,
+ const PadStrideInfo &conv_info,
+ const ActivationLayerInfo &act_info = ActivationLayerInfo());
/** Set the input and output tensors.
*
* @param[in] compile_context The compile context to be used.
@@ -71,7 +99,12 @@ public:
* @param[in] conv_info Contains padding and stride information described in @ref PadStrideInfo.
* @param[in] act_info (Optional) Activation layer information in case of a fused activation.
*/
- void configure(const CLCompileContext &compile_context, ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const PadStrideInfo &conv_info,
+ void configure(const CLCompileContext &compile_context,
+ ICLTensor *input,
+ const ICLTensor *weights,
+ const ICLTensor *biases,
+ ICLTensor *output,
+ const PadStrideInfo &conv_info,
const ActivationLayerInfo &act_info = ActivationLayerInfo());
/** Static function to check if given info will lead to a valid configuration of @ref CLDirectConvolutionLayer
*
@@ -88,18 +121,19 @@ public:
*
* @return a status
*/
- static Status validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info,
+ static Status validate(const ITensorInfo *input,
+ const ITensorInfo *weights,
+ const ITensorInfo *biases,
+ const ITensorInfo *output,
+ const PadStrideInfo &conv_info,
const ActivationLayerInfo &act_info = ActivationLayerInfo());
// Inherited methods overridden:
void run() override;
private:
- CLDirectConvolutionLayerKernel _direct_conv_kernel;
- CLFillBorderKernel _input_border_handler;
- CLActivationLayer _activationlayer_function;
-
- bool _is_activationlayer_enabled;
+ struct Impl;
+ std::unique_ptr<Impl> _impl;
};
-}
+} // namespace arm_compute
#endif /* ARM_COMPUTE_CLDIRECTCONVOLUTIONLAYER_H */
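As an illustration of the fused-activation path, a minimal sketch assuming F32 NCHW tensors and a hypothetical 3x3, 4-in/16-out kernel with same padding:

    #include "arm_compute/runtime/CL/CLFunctions.h"
    #include "arm_compute/runtime/CL/CLScheduler.h"
    #include "arm_compute/runtime/CL/CLTensor.h"

    using namespace arm_compute;

    int main()
    {
        CLScheduler::get().default_init();

        // Assumed shapes: [width, height, IFM] input, [kx, ky, IFM, OFM] weights.
        CLTensor src, weights, biases, dst;
        src.allocator()->init(TensorInfo(TensorShape(32U, 32U, 4U), 1, DataType::F32));
        weights.allocator()->init(TensorInfo(TensorShape(3U, 3U, 4U, 16U), 1, DataType::F32));
        biases.allocator()->init(TensorInfo(TensorShape(16U), 1, DataType::F32));
        dst.allocator()->init(TensorInfo(TensorShape(32U, 32U, 16U), 1, DataType::F32));

        CLDirectConvolutionLayer conv;
        conv.configure(&src, &weights, &biases, &dst, PadStrideInfo(1, 1, 1, 1),
                       ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU)); // fused ReLU

        for(auto *t : { &src, &weights, &biases, &dst })
        {
            t->allocator()->allocate();
        }
        conv.run();
        CLScheduler::get().sync();
        return 0;
    }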
diff --git a/arm_compute/runtime/CL/functions/CLDirectDeconvolutionLayer.h b/arm_compute/runtime/CL/functions/CLDirectDeconvolutionLayer.h
index 1fed460e69..14384a09b5 100644
--- a/arm_compute/runtime/CL/functions/CLDirectDeconvolutionLayer.h
+++ b/arm_compute/runtime/CL/functions/CLDirectDeconvolutionLayer.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019-2020 ARM Limited.
+ * Copyright (c) 2019-2021 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -24,12 +24,11 @@
#ifndef ARM_COMPUTE_CLDIRECTDECONVOLUTIONLAYER_H
#define ARM_COMPUTE_CLDIRECTDECONVOLUTIONLAYER_H
+#include "arm_compute/runtime/CL/CLTensor.h"
#include "arm_compute/runtime/CL/functions/CLConvolutionLayer.h"
#include "arm_compute/runtime/CL/functions/CLDeconvolutionLayerUpsample.h"
#include "arm_compute/runtime/CL/functions/CLReverse.h"
#include "arm_compute/runtime/CL/functions/CLTranspose.h"
-
-#include "arm_compute/runtime/CL/CLTensor.h"
#include "arm_compute/runtime/IFunction.h"
#include "arm_compute/runtime/IMemoryManager.h"
#include "arm_compute/runtime/MemoryGroup.h"
@@ -87,47 +86,75 @@ public:
CLDirectDeconvolutionLayer &operator=(CLDirectDeconvolutionLayer &&) = default;
/** Set the input, weights, biases and output tensors.
*
+ * Valid data layouts:
+ * - NHWC
+ * - NCHW
+ *
+ * Valid data type configurations:
+ * |src0 |src1 |src2 |dst |
+ * |:--------------|:------------------|:------|:--------------|
+ * |F16 |F16 |F16 |F16 |
+ * |F32 |F32 |F32 |F32 |
+ * |QASYMM8 |QASYMM8 |S32 |QASYMM8 |
+ * |QASYMM8_SIGNED |QASYMM8_SIGNED |S32 |QASYMM8_SIGNED |
+ * |QASYMM8 |QSYMM8_PER_CHANNEL |S32 |QASYMM8 |
+ * |QASYMM8_SIGNED |QSYMM8_PER_CHANNEL |S32 |QASYMM8_SIGNED |
+ *
* @param[in,out] input Input tensor. 3 lower dimensions represent a single input, and an optional 4th dimension for batch of inputs.
* Data types supported: QASYMM8_SIGNED/QASYMM8/F16/F32.
- * @param[in] weights The 4d weights with dimensions [width, height, IFM, OFM]. Data type supported: Same as @p input.
+ * @param[in] weights The 4d weights with dimensions [width, height, IFM, OFM]. Data type supported: Same as @p input or QSYMM8_PER_CHANNEL if @p input is QASYMM8/QASYMM8_SIGNED.
* @param[in] bias (Optional) The biases have one dimension.
* Data type supported: Should match @p input data type, except for input of QASYMM8 and QASYMM8_SIGNED type where biases should be of S32 type
* @param[out] output Output tensor. The output has the same number of dimensions as the @p input.
* @param[in] info Contains padding and policies to be used in the deconvolution; this is described in @ref PadStrideInfo.
- * @param[in] weights_info (Optional) Weights information needed for @ref CLConvolutionLayer, specifies if the weights tensor has been reshaped with @ref CLWeightsReshapeKernel.
+ * @param[in] weights_info (Optional) Weights information needed for @ref CLConvolutionLayer, specifies if the weights tensor has been reshaped with @ref opencl::kernels::ClWeightsReshapeKernel.
*
*/
- void configure(ICLTensor *input, ICLTensor *weights, const ICLTensor *bias, ICLTensor *output, const PadStrideInfo &info, const WeightsInfo &weights_info = WeightsInfo());
+ void configure(ICLTensor *input,
+ ICLTensor *weights,
+ const ICLTensor *bias,
+ ICLTensor *output,
+ const PadStrideInfo &info,
+ const WeightsInfo &weights_info = WeightsInfo());
/** Set the input, weights, biases and output tensors.
*
* @param[in] compile_context The compile context to be used.
* @param[in,out] input Input tensor. 3 lower dimensions represent a single input, and an optional 4th dimension for batch of inputs.
* Data types supported: QASYMM8_SIGNED/QASYMM8/F16/F32.
- * @param[in] weights The 4d weights with dimensions [width, height, IFM, OFM]. Data type supported: Same as @p input.
+ * @param[in] weights The 4d weights with dimensions [width, height, IFM, OFM]. Data type supported: Same as @p input or QSYMM8_PER_CHANNEL if @p input is QASYMM8/QASYMM8_SIGNED.
* @param[in] bias (Optional) The biases have one dimension.
* Data type supported: Should match @p input data type, except for input of QASYMM8 and QASYMM8_SIGNED type where biases should be of S32 type
* @param[out] output Output tensor. The output has the same number of dimensions as the @p input.
* @param[in] info Contains padding and policies to be used in the deconvolution; this is described in @ref PadStrideInfo.
- * @param[in] weights_info (Optional) Weights information needed for @ref CLConvolutionLayer, specifies if the weights tensor has been reshaped with @ref CLWeightsReshapeKernel.
+ * @param[in] weights_info (Optional) Weights information needed for @ref CLConvolutionLayer, specifies if the weights tensor has been reshaped with @ref opencl::kernels::ClWeightsReshapeKernel.
*
*/
- void configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *weights, const ICLTensor *bias, ICLTensor *output, const PadStrideInfo &info,
- const WeightsInfo &weights_info = WeightsInfo());
+ void configure(const CLCompileContext &compile_context,
+ ICLTensor *input,
+ ICLTensor *weights,
+ const ICLTensor *bias,
+ ICLTensor *output,
+ const PadStrideInfo &info,
+ const WeightsInfo &weights_info = WeightsInfo());
/** Static function to check if given info will lead to a valid configuration of @ref CLDirectDeconvolutionLayer
*
* @param[in] input Input tensor info. 3 lower dimensions represent a single input, and an optional 4th dimension for batch of inputs.
* Data types supported: QASYMM8_SIGNED/QASYMM8/F16/F32.
- * @param[in] weights The 4d weights info with dimensions [width, height, IFM, OFM]. Data type supported: Same as @p input.
+ * @param[in] weights The 4d weights info with dimensions [width, height, IFM, OFM]. Data type supported: Same as @p input or QSYMM8_PER_CHANNEL if @p input is QASYMM8/QASYMM8_SIGNED.
* @param[in] bias (Optional) The biases have one dimension.
* Data type supported: Should match @p input data type, except for input of QASYMM8 and QASYMM8_SIGNED type where biases should be of S32 type
* @param[in] output Output tensor info. The output has the same number of dimensions as the @p input.
* @param[in] info Contains padding and policies to be used in the deconvolution; this is described in @ref PadStrideInfo.
- * @param[in] weights_info (Optional) Weights information needed for @ref CLConvolutionLayer, specifies if the weights tensor has been reshaped with @ref CLWeightsReshapeKernel.
+ * @param[in] weights_info (Optional) Weights information needed for @ref CLConvolutionLayer, specifies if the weights tensor has been reshaped with @ref opencl::kernels::ClWeightsReshapeKernel.
*
* @return a status
*/
- static Status validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *bias, ITensorInfo *output, const PadStrideInfo &info,
- const WeightsInfo &weights_info = WeightsInfo());
+ static Status validate(const ITensorInfo *input,
+ const ITensorInfo *weights,
+ const ITensorInfo *bias,
+ ITensorInfo *output,
+ const PadStrideInfo &info,
+ const WeightsInfo &weights_info = WeightsInfo());
// Inherited methods overridden:
void run() override;
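A minimal sketch of the deconvolution call, with assumed shapes chosen so the arithmetic is easy to check: an 8x8x4 input, a 4x4 kernel, stride 2 and pad 1 give out = (8 - 1) * 2 + 4 - 2 * 1 = 16:

    #include "arm_compute/runtime/CL/CLFunctions.h"
    #include "arm_compute/runtime/CL/CLScheduler.h"
    #include "arm_compute/runtime/CL/CLTensor.h"

    using namespace arm_compute;

    int main()
    {
        CLScheduler::get().default_init();

        CLTensor src, weights, bias, dst;
        src.allocator()->init(TensorInfo(TensorShape(8U, 8U, 4U), 1, DataType::F32));
        weights.allocator()->init(TensorInfo(TensorShape(4U, 4U, 4U, 16U), 1, DataType::F32)); // [w, h, IFM, OFM]
        bias.allocator()->init(TensorInfo(TensorShape(16U), 1, DataType::F32));
        dst.allocator()->init(TensorInfo(TensorShape(16U, 16U, 16U), 1, DataType::F32));

        CLDirectDeconvolutionLayer deconv;
        deconv.configure(&src, &weights, &bias, &dst, PadStrideInfo(2, 2, 1, 1));

        for(auto *t : { &src, &weights, &bias, &dst })
        {
            t->allocator()->allocate();
        }
        deconv.run();
        CLScheduler::get().sync();
        return 0;
    }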
diff --git a/arm_compute/runtime/CL/functions/CLElementwiseOperations.h b/arm_compute/runtime/CL/functions/CLElementwiseOperations.h
index 8c656ed8bc..13844c98a1 100644
--- a/arm_compute/runtime/CL/functions/CLElementwiseOperations.h
+++ b/arm_compute/runtime/CL/functions/CLElementwiseOperations.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018-2020 ARM Limited.
+ * Copyright (c) 2018-2021, 2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -24,111 +24,291 @@
#ifndef ARM_COMPUTE_CLELEMENTWISEOPERATIONS_H
#define ARM_COMPUTE_CLELEMENTWISEOPERATIONS_H
-#include "arm_compute/core/Types.h"
-#include "arm_compute/runtime/CL/ICLSimpleFunction.h"
+#include "arm_compute/function_info/ActivationLayerInfo.h"
+#include "arm_compute/runtime/CL/ICLOperator.h"
+#include "arm_compute/runtime/IFunction.h"
namespace arm_compute
{
class ICLTensor;
+class CLCompileContext;
+class ITensorInfo;
-/** Basic function to run @ref CLSaturatedArithmeticOperationKernel for addition
+/** Basic function to run @ref opencl::kernels::ClSaturatedArithmeticKernel for addition
*
- * @note The tensor data type for the inputs must be U8/QASYMM8/S16/QSYMM16/S32/U32/F16/F32.
+ * @note The tensor data type for the inputs must be U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/S32/F16/F32.
* @note The function performs an arithmetic addition between two tensors.
*/
-class CLArithmeticAddition : public ICLSimpleFunction
+class CLArithmeticAddition : public IFunction
{
public:
+ /** Default Constructor */
+ CLArithmeticAddition();
+ /** Default Destructor */
+ ~CLArithmeticAddition();
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ CLArithmeticAddition(const CLArithmeticAddition &) = delete;
+ /** Default move constructor */
+ CLArithmeticAddition(CLArithmeticAddition &&);
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ CLArithmeticAddition &operator=(const CLArithmeticAddition &) = delete;
+ /** Default move assignment operator */
+ CLArithmeticAddition &operator=(CLArithmeticAddition &&);
/** Initialise the kernel's inputs, output and conversion policy.
*
- * @param[in, out] input1 First tensor input. Data types supported: U8/QASYMM8/S16/QSYMM16/S32/U32/F16/F32.
+ * Valid data layouts:
+ * - All
+ *
+ * Valid data type configurations:
+ * |src0 |src1 |dst |
+ * |:--------------|:--------------|:--------------|
+ * |QASYMM8 |QASYMM8 |QASYMM8 |
+ * |QASYMM8_SIGNED |QASYMM8_SIGNED |QASYMM8_SIGNED |
+ * |QSYMM16        |QSYMM16        |QSYMM16        |
+ * |U8 |U8 |U8 |
+ * |U8 |U8 |S16 |
+ * |U8 |S16 |S16 |
+ * |S16 |U8 |S16 |
+ * |S16 |S16 |S16 |
+ * |S32 |S32 |S32 |
+ * |F16 |F16 |F16 |
+ * |F32 |F32 |F32 |
+ *
+ * @param[in, out] input1 First tensor input. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/S32/F16/F32.
* The input tensor is [in, out] because its TensorInfo might be modified inside the kernel in case of broadcasting of dimension 0.
- * @param[in, out] input2 Second tensor input. Data types supported: U8, QASYMM8 (only if @p input1 is QASYMM8), QSYMM16 (only if @p input1 is QSYMM16), S16/F16/F32.
+ * @param[in, out] input2 Second tensor input. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/S32/F16/F32.
* The input tensor is [in, out] because its TensorInfo might be modified inside the kernel in case of broadcasting of dimension 0.
- * @param[out] output Output tensor. Data types supported: U8 (Only if both inputs are U8), QASYMM8 (only if both inputs are QASYMM8), QSYMM16 (only if both inputs is QSYMM16), S16/F16/F32.
+ * @param[out] output Output tensor. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/S32/F16/F32.
* @param[in] policy Policy to use to handle overflow.
* @param[in] act_info (Optional) Activation layer information in case of a fused activation.
*/
- void configure(ICLTensor *input1, ICLTensor *input2, ICLTensor *output, ConvertPolicy policy, const ActivationLayerInfo &act_info = ActivationLayerInfo());
+ void configure(ICLTensor *input1,
+ ICLTensor *input2,
+ ICLTensor *output,
+ ConvertPolicy policy,
+ const ActivationLayerInfo &act_info = ActivationLayerInfo());
/** Initialise the kernel's inputs, output and conversion policy.
*
+ * Valid configurations (Input1,Input2) -> Output :
+ *
+ * - (U8,U8) -> U8
+ * - (U8,U8) -> S16
+ * - (S16,U8) -> S16
+ * - (U8,S16) -> S16
+ * - (S16,S16) -> S16
+ * - (S32,S32) -> S32
+ * - (F16,F16) -> F16
+ * - (F32,F32) -> F32
+ * - (QASYMM8,QASYMM8) -> QASYMM8
+ * - (QASYMM8_SIGNED,QASYMM8_SIGNED) -> QASYMM8_SIGNED
+ * - (QSYMM16,QSYMM16) -> QSYMM16
+ *
* @param[in] compile_context The compile context to be used.
- * @param[in, out] input1 First tensor input. Data types supported: U8/QASYMM8/S16/QSYMM16/S32/U32/F16/F32.
+ * @param[in, out] input1 First tensor input. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/S32/F16/F32.
* The input tensor is [in, out] because its TensorInfo might be modified inside the kernel in case of broadcasting of dimension 0.
- * @param[in, out] input2 Second tensor input. Data types supported: U8, QASYMM8 (only if @p input1 is QASYMM8), QSYMM16 (only if @p input1 is QSYMM16), S16/F16/F32.
+ * @param[in, out] input2 Second tensor input. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/S32/F16/F32.
* The input tensor is [in, out] because its TensorInfo might be modified inside the kernel in case of broadcasting of dimension 0.
- * @param[out] output Output tensor. Data types supported: U8 (Only if both inputs are U8), QASYMM8 (only if both inputs are QASYMM8), QSYMM16 (only if both inputs is QSYMM16), S16/F16/F32.
+ * @param[out] output Output tensor. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/S32/F16/F32.
* @param[in] policy Policy to use to handle overflow.
* @param[in] act_info (Optional) Activation layer information in case of a fused activation.
*/
- void configure(const CLCompileContext &compile_context, ICLTensor *input1, ICLTensor *input2, ICLTensor *output, ConvertPolicy policy, const ActivationLayerInfo &act_info = ActivationLayerInfo());
- /** Static function to check if given info will lead to a valid configuration of @ref CLSaturatedArithmeticOperationKernel for addition
+ void configure(const CLCompileContext &compile_context,
+ const ICLTensor *input1,
+ const ICLTensor *input2,
+ ICLTensor *output,
+ ConvertPolicy policy,
+ const ActivationLayerInfo &act_info = ActivationLayerInfo());
+ /** Static function to check if given info will lead to a valid configuration of @ref opencl::kernels::ClSaturatedArithmeticKernel for addition
*
- * @param[in] input1 First tensor input info. Data types supported: U8/QASYMM8/S16/QSYMM16/S32/U32/F16/F32.
- * @param[in] input2 Second tensor input info. Data types supported: U8, QASYMM8 (only if @p input1 is QASYMM8), QSYMM16 (only if @p input1 is QSYMM16), S16/F16/F32.
- * @param[in] output Output tensor info. Data types supported: U8 (Only if both inputs are U8), QASYMM8 (only if both inputs are QASYMM8), QSYMM16 (only if both inputs is QSYMM16), S16/F16/F32.
+ * Valid configurations (Input1,Input2) -> Output :
+ *
+ * - (U8,U8) -> U8
+ * - (U8,U8) -> S16
+ * - (S16,U8) -> S16
+ * - (U8,S16) -> S16
+ * - (S16,S16) -> S16
+ * - (S32,S32) -> S32
+ * - (F16,F16) -> F16
+ * - (F32,F32) -> F32
+ * - (QASYMM8,QASYMM8) -> QASYMM8
+ * - (QASYMM8_SIGNED,QASYMM8_SIGNED) -> QASYMM8_SIGNED
+ * - (QSYMM16,QSYMM16) -> QSYMM16
+ *
+ * @param[in] input1 First tensor input info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/S32/F16/F32.
+ * @param[in] input2 Second tensor input info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/S32/F16/F32.
+ * @param[in] output Output tensor info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/S32/F16/F32.
* @param[in] policy Policy to use to handle overflow.
* @param[in] act_info (Optional) Activation layer information in case of a fused activation.
*
* @return a status
*/
- static Status validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, ConvertPolicy policy, const ActivationLayerInfo &act_info = ActivationLayerInfo());
+ static Status validate(const ITensorInfo *input1,
+ const ITensorInfo *input2,
+ const ITensorInfo *output,
+ ConvertPolicy policy,
+ const ActivationLayerInfo &act_info = ActivationLayerInfo());
+
+ // Inherited methods overridden:
+ void run() override;
+
+private:
+ struct Impl;
+ std::unique_ptr<Impl> _impl;
};
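Before the subtraction counterpart below, a minimal sketch of CLArithmeticAddition; the 16x16 F32 operands and the SATURATE policy are illustrative assumptions (see the table above for the supported type combinations):

    #include "arm_compute/runtime/CL/CLFunctions.h"
    #include "arm_compute/runtime/CL/CLScheduler.h"
    #include "arm_compute/runtime/CL/CLTensor.h"

    using namespace arm_compute;

    int main()
    {
        CLScheduler::get().default_init();

        // Assumed shapes: two 16x16 F32 operands, F32 result.
        CLTensor a, b, sum;
        for(auto *t : { &a, &b, &sum })
        {
            t->allocator()->init(TensorInfo(TensorShape(16U, 16U), 1, DataType::F32));
        }

        CLArithmeticAddition add;
        add.configure(&a, &b, &sum, ConvertPolicy::SATURATE); // saturate instead of wrapping on overflow

        for(auto *t : { &a, &b, &sum })
        {
            t->allocator()->allocate();
        }
        add.run();
        CLScheduler::get().sync();
        return 0;
    }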
-/** Basic function to run @ref CLSaturatedArithmeticOperationKernel for subtraction
+/** Basic function to run @ref opencl::kernels::ClSaturatedArithmeticKernel for subtraction
*
- * @note The tensor data type for the inputs must be U8/QASYMM8/S16/S32/U32/F16/F32.
+ * @note The tensor data type for the inputs must be U8/QASYMM8/QASYMM8_SIGNED/S16/S32/F16/F32.
* @note The function performs an arithmetic subtraction between two tensors.
*/
-class CLArithmeticSubtraction : public ICLSimpleFunction
+class CLArithmeticSubtraction : public IFunction
{
public:
+ /** Default Constructor */
+ CLArithmeticSubtraction();
+ /** Default Destructor */
+ ~CLArithmeticSubtraction();
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ CLArithmeticSubtraction(const CLArithmeticSubtraction &) = delete;
+ /** Default move constructor */
+ CLArithmeticSubtraction(CLArithmeticSubtraction &&);
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ CLArithmeticSubtraction &operator=(const CLArithmeticSubtraction &) = delete;
+ /** Default move assignment operator */
+ CLArithmeticSubtraction &operator=(CLArithmeticSubtraction &&);
/** Initialise the kernel's inputs, output and conversion policy.
*
- * @param[in, out] input1 First tensor input. Data types supported: U8/QASYMM8/S16/S32/U32/F16/F32.
+ * Valid data layouts:
+ * - All
+ *
+ * Valid data type configurations:
+ * |src0 |src1 |dst |
+ * |:--------------|:--------------|:--------------|
+ * |QASYMM8 |QASYMM8 |QASYMM8 |
+ * |QASYMM8_SIGNED |QASYMM8_SIGNED |QASYMM8_SIGNED |
+ * |QSYMM16        |QSYMM16        |QSYMM16        |
+ * |U8 |U8 |U8 |
+ * |U8 |U8 |S16 |
+ * |U8 |S16 |S16 |
+ * |S16 |U8 |S16 |
+ * |S16 |S16 |S16 |
+ * |S32 |S32 |S32 |
+ * |F16 |F16 |F16 |
+ * |F32 |F32 |F32 |
+ *
+ * @param[in, out] input1 First tensor input. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/S32/F16/F32.
* The input tensor is [in, out] because its TensorInfo might be modified inside the kernel in case of broadcasting of dimension 0.
- * @param[in, out] input2 Second tensor input. Data types supported: U8, QASYMM8 (only if @p input1 is QASYMM8), S16/F16/F32.
+ * @param[in, out] input2 Second tensor input. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/S32/F16/F32.
* The input tensor is [in, out] because its TensorInfo might be modified inside the kernel in case of broadcasting of dimension 0.
- * @param[out] output Output tensor. Data types supported: U8 (Only if both inputs are U8), QASYMM8 (only if both inputs are QASYMM8), S16/F16/F32.
+ * @param[out] output Output tensor. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/S32/F16/F32.
* @param[in] policy Policy to use to handle overflow.
* @param[in] act_info (Optional) Activation layer information in case of a fused activation.
*/
- void configure(ICLTensor *input1, ICLTensor *input2, ICLTensor *output, ConvertPolicy policy, const ActivationLayerInfo &act_info = ActivationLayerInfo());
+ void configure(const ICLTensor *input1,
+ const ICLTensor *input2,
+ ICLTensor *output,
+ ConvertPolicy policy,
+ const ActivationLayerInfo &act_info = ActivationLayerInfo());
/** Initialise the kernel's inputs, output and conversion policy.
*
+ * Valid configurations (Input1,Input2) -> Output:
+ *
+ * - (U8,U8) -> U8
+ * - (U8,U8) -> S16
+ * - (S16,U8) -> S16
+ * - (U8,S16) -> S16
+ * - (S16,S16) -> S16
+ * - (S32,S32) -> S32
+ * - (F16,F16) -> F16
+ * - (F32,F32) -> F32
+ * - (QASYMM8,QASYMM8) -> QASYMM8
+ * - (QASYMM8_SIGNED,QASYMM8_SIGNED) -> QASYMM8_SIGNED
+ * - (QSYMM16,QSYMM16) -> QSYMM16
+ *
* @param[in] compile_context The compile context to be used.
- * @param[in, out] input1 First tensor input. Data types supported: U8/QASYMM8/S16/S32/U32/F16/F32.
+ * @param[in, out] input1 First tensor input. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/S32/F16/F32.
* The input tensor is [in, out] because its TensorInfo might be modified inside the kernel in case of broadcasting of dimension 0.
- * @param[in, out] input2 Second tensor input. Data types supported: U8, QASYMM8 (only if @p input1 is QASYMM8), S16/F16/F32.
+ * @param[in, out] input2 Second tensor input. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/S32/F16/F32.
* The input tensor is [in, out] because its TensorInfo might be modified inside the kernel in case of broadcasting of dimension 0.
- * @param[out] output Output tensor. Data types supported: U8 (Only if both inputs are U8), QASYMM8 (only if both inputs are QASYMM8), S16/F16/F32.
+ * @param[out] output Output tensor. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/S32/F16/F32.
* @param[in] policy Policy to use to handle overflow.
* @param[in] act_info (Optional) Activation layer information in case of a fused activation.
*/
- void configure(const CLCompileContext &compile_context, ICLTensor *input1, ICLTensor *input2, ICLTensor *output, ConvertPolicy policy, const ActivationLayerInfo &act_info = ActivationLayerInfo());
- /** Static function to check if given info will lead to a valid configuration of @ref CLSaturatedArithmeticOperationKernel for subtraction
+ void configure(const CLCompileContext &compile_context,
+ const ICLTensor *input1,
+ const ICLTensor *input2,
+ ICLTensor *output,
+ ConvertPolicy policy,
+ const ActivationLayerInfo &act_info = ActivationLayerInfo());
+ /** Static function to check if given info will lead to a valid configuration of @ref opencl::kernels::ClSaturatedArithmeticKernel for subtraction
+ *
+ * Valid configurations (Input1,Input2) -> Output:
+ *
+ * - (U8,U8) -> U8
+ * - (U8,U8) -> S16
+ * - (S16,U8) -> S16
+ * - (U8,S16) -> S16
+ * - (S16,S16) -> S16
+ * - (S32,S32) -> S32
+ * - (F16,F16) -> F16
+ * - (F32,F32) -> F32
+ * - (QASYMM8,QASYMM8) -> QASYMM8
+ * - (QASYMM8_SIGNED,QASYMM8_SIGNED) -> QASYMM8_SIGNED
+ * - (QSYMM16,QSYMM16) -> QSYMM16
*
- * @param[in] input1 First tensor input info. Data types supported: U8/QASYMM8/S16/S32/U32/F16/F32.
- * @param[in] input2 Second tensor input info. Data types supported: U8, QASYMM8 (only if @p input1 is QASYMM8), S16/F16/F32.
- * @param[in] output Output tensor info. Data types supported: U8 (Only if both inputs are U8), QASYMM8 ( only if both inputs are QASYMM8), S16/F16/F32.
+ * @param[in] input1 First tensor input info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/S32/F16/F32.
+ * @param[in] input2 Second tensor input info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/S32/F16/F32.
+ * @param[in] output Output tensor info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/S32/F16/F32.
* @param[in] policy Policy to use to handle overflow.
* @param[in] act_info (Optional) Activation layer information in case of a fused activation.
*
* @return a status
*/
- static Status validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, ConvertPolicy policy, const ActivationLayerInfo &act_info = ActivationLayerInfo());
+ static Status validate(const ITensorInfo *input1,
+ const ITensorInfo *input2,
+ const ITensorInfo *output,
+ ConvertPolicy policy,
+ const ActivationLayerInfo &act_info = ActivationLayerInfo());
+
+ // Inherited methods overridden:
+ void run() override;
+
+private:
+ struct Impl;
+ std::unique_ptr<Impl> _impl;
};
-/** Basic function to run @ref CLSaturatedArithmeticOperationKernel for division
+/** Basic function to run @ref opencl::kernels::ClSaturatedArithmeticKernel for division
*
* @note The tensor data type for the inputs must be F16/F32.
* @note The function performs an arithmetic division between two tensors.
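+ *
+ * Example (illustrative sketch, not upstream documentation; a, b and dst follow the
+ * CLTensor workflow shown for CLArithmeticSubtraction above):
+ * @code
+ * CLArithmeticDivision div_op;
+ * div_op.configure(&a, &b, &dst); // dst[i] = a[i] / b[i], F16/F32 only
+ * div_op.run();
+ * @endcode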
*/
-class CLArithmeticDivision : public ICLSimpleFunction
+class CLArithmeticDivision : public IFunction
{
public:
+ /** Default Constructor */
+ CLArithmeticDivision();
+ /** Default Destructor */
+ ~CLArithmeticDivision();
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ CLArithmeticDivision(const CLArithmeticDivision &) = delete;
+ /** Default move constructor */
+ CLArithmeticDivision(CLArithmeticDivision &&);
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ CLArithmeticDivision &operator=(const CLArithmeticDivision &) = delete;
+ /** Default move assignment operator */
+ CLArithmeticDivision &operator=(CLArithmeticDivision &&);
/** Initialise the kernel's inputs, output.
*
+ * Valid data layouts:
+ * - All
+ *
+ * Valid data type configurations:
+ * |src0 |src1 |dst |
+ * |:--------------|:--------------|:--------------|
+ * |F16 |F16 |F16 |
+ * |F32 |F32 |F32 |
+ *
* @param[in, out] input1 First tensor input. Data types supported: F16/F32.
* The input tensor is [in, out] because its TensorInfo might be modified inside the kernel in case of broadcasting of dimension 0.
* @param[in, out] input2 Second tensor input. Same as @p input1.
@@ -136,7 +316,10 @@ public:
* @param[out] output Output tensor. Data types supported: Same as @p input1.
* @param[in] act_info (Optional) Activation layer information in case of a fused activation.
*/
- void configure(ICLTensor *input1, ICLTensor *input2, ICLTensor *output, const ActivationLayerInfo &act_info = ActivationLayerInfo());
+ void configure(ICLTensor *input1,
+ ICLTensor *input2,
+ ICLTensor *output,
+ const ActivationLayerInfo &act_info = ActivationLayerInfo());
/** Initialise the kernel's inputs, output.
*
* @param[in] compile_context The compile context to be used.
@@ -147,7 +330,11 @@ public:
* @param[out] output Output tensor. Data types supported: Same as @p input1.
* @param[in] act_info (Optional) Activation layer information in case of a fused activation.
*/
- void configure(const CLCompileContext &compile_context, ICLTensor *input1, ICLTensor *input2, ICLTensor *output, const ActivationLayerInfo &act_info = ActivationLayerInfo());
+ void configure(const CLCompileContext &compile_context,
+ const ICLTensor *input1,
+ const ICLTensor *input2,
+ ICLTensor *output,
+ const ActivationLayerInfo &act_info = ActivationLayerInfo());
/** Static function to check if given info will lead to a valid configuration of @ref CLArithmeticDivision
*
* @param[in] input1 First tensor input info. Data types supported: F16/F32.
@@ -157,142 +344,306 @@ public:
*
* @return a status
*/
- static Status validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, const ActivationLayerInfo &act_info = ActivationLayerInfo());
+ static Status validate(const ITensorInfo *input1,
+ const ITensorInfo *input2,
+ const ITensorInfo *output,
+ const ActivationLayerInfo &act_info = ActivationLayerInfo());
+
+ // Inherited methods overridden:
+ void run() override;
+
+private:
+ struct Impl;
+ std::unique_ptr<Impl> _impl;
};
-/** Basic function to run @ref CLArithmeticOperationKernel for max
+/** Basic function to run @ref opencl::kernels::ClArithmeticKernel for max
*
 * @note The tensor data type for the inputs must be U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/S32/U32/F16/F32.
* @note The function performs a max operation between two tensors.
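+ *
+ * Example (illustrative sketch, not upstream documentation; a, b and dst follow the
+ * CLTensor workflow shown for CLArithmeticSubtraction above, and CLElementwiseMin and
+ * CLElementwiseSquaredDiff are driven the same way):
+ * @code
+ * CLElementwiseMax max_op;
+ * max_op.configure(&a, &b, &dst); // dst[i] = max(a[i], b[i])
+ * max_op.run();
+ * @endcode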
*/
-class CLElementwiseMax : public ICLSimpleFunction
+class CLElementwiseMax : public IFunction
{
public:
+ /** Default Constructor */
+ CLElementwiseMax();
+ /** Default Destructor */
+ ~CLElementwiseMax();
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ CLElementwiseMax(const CLElementwiseMax &) = delete;
+ /** Default move constructor */
+ CLElementwiseMax(CLElementwiseMax &&);
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ CLElementwiseMax &operator=(const CLElementwiseMax &) = delete;
+ /** Default move assignment operator */
+ CLElementwiseMax &operator=(CLElementwiseMax &&);
/** Initialise the kernel's inputs, output and conversion policy.
*
- * @param[in, out] input1 First tensor input. Data types supported: U8/QASYMM8/S16/QSYMM16/S32/U32/F16/F32.
+ * Valid data layouts:
+ * - All
+ *
+ * Valid data type configurations:
+ * |src0 |src1 |dst |
+ * |:--------------|:--------------|:--------------|
+ * |QASYMM8 |QASYMM8 |QASYMM8 |
+ * |QASYMM8_SIGNED |QASYMM8_SIGNED |QASYMM8_SIGNED |
+ * |QSYMM16        |QSYMM16        |QSYMM16        |
+ * |U8 |U8 |U8 |
+ * |S16 |S16 |S16 |
+ * |S32 |S32 |S32 |
+ * |U32 |U32 |U32 |
+ * |F16 |F16 |F16 |
+ * |F32 |F32 |F32 |
+ *
+ * @param[in, out] input1 First tensor input. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/S32/U32/F16/F32.
* The input tensor is [in, out] because its TensorInfo might be modified inside the kernel in case of broadcasting of dimension 0.
- * @param[in, out] input2 Second tensor input. Data types supported: U8, QASYMM8 (only if @p input1 is QASYMM8), S16, QSYMM16 (only if @p input1 is QSYMM16), F16/F32.
+ * @param[in, out] input2 Second tensor input. Data types supported: same as @p input1.
* The input tensor is [in, out] because its TensorInfo might be modified inside the kernel in case of broadcasting of dimension 0.
- * @param[out] output Output tensor. Data types supported: U8 (Only if both inputs are U8), QASYMM8 (only if both inputs are QASYMM8), S16, QSYMM16 (only if both inputs are QSYMM16), F16/F32.
+ * @param[out] output Output tensor. Data types supported: same as @p input1.
* @param[in] act_info (Optional) Activation layer information in case of a fused activation.
*/
- void configure(ICLTensor *input1, ICLTensor *input2, ICLTensor *output, const ActivationLayerInfo &act_info = ActivationLayerInfo());
+ void configure(ICLTensor *input1,
+ ICLTensor *input2,
+ ICLTensor *output,
+ const ActivationLayerInfo &act_info = ActivationLayerInfo());
/** Initialise the kernel's inputs, output and conversion policy.
*
* @param[in] compile_context The compile context to be used.
- * @param[in, out] input1 First tensor input. Data types supported: U8/QASYMM8/S16/QSYMM16/S32/U32/F16/F32.
+ * @param[in, out] input1 First tensor input. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/S32/U32/F16/F32.
* The input tensor is [in, out] because its TensorInfo might be modified inside the kernel in case of broadcasting of dimension 0.
- * @param[in, out] input2 Second tensor input. Data types supported: U8, QASYMM8 (only if @p input1 is QASYMM8), S16, QSYMM16 (only if @p input1 is QSYMM16), F16/F32.
+ * @param[in, out] input2 Second tensor input. Data types supported: same as @p input1.
* The input tensor is [in, out] because its TensorInfo might be modified inside the kernel in case of broadcasting of dimension 0.
- * @param[out] output Output tensor. Data types supported: U8 (Only if both inputs are U8), QASYMM8 (only if both inputs are QASYMM8), S16, QSYMM16 (only if both inputs are QSYMM16), F16/F32.
+ * @param[out] output Output tensor. Data types supported: same as @p input1.
* @param[in] act_info (Optional) Activation layer information in case of a fused activation.
*/
- void configure(const CLCompileContext &compile_context, ICLTensor *input1, ICLTensor *input2, ICLTensor *output, const ActivationLayerInfo &act_info = ActivationLayerInfo());
- /** Static function to check if given info will lead to a valid configuration of @ref CLArithmeticOperationKernel for max
+ void configure(const CLCompileContext &compile_context,
+ ICLTensor *input1,
+ ICLTensor *input2,
+ ICLTensor *output,
+ const ActivationLayerInfo &act_info = ActivationLayerInfo());
+ /** Static function to check if given info will lead to a valid configuration of @ref opencl::kernels::ClArithmeticKernel for max
*
- * @param[in] input1 First tensor input info. Data types supported: U8/QASYMM8/S16/QSYMM16/S32/U32/F16/F32.
- * @param[in] input2 Second tensor input info. Data types supported: U8, QASYMM8 (only if @p input1 is QASYMM8), S16, QSYMM16 (only if @p input1 is QSYMM16), F16/F32.
- * @param[in] output Output tensor info. Data types supported: U8 (Only if both inputs are U8), QASYMM8 ( only if both inputs are QASYMM8), S16, QSYMM16 (only if both inputs are QSYMM16), F16/F32.
+ * @param[in] input1 First tensor input info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/S32/U32/F16/F32.
+ * @param[in] input2 Second tensor input info. Data types supported: same as @p input1.
+ * @param[in] output Output tensor info. Data types supported: same as @p input1.
* @param[in] act_info (Optional) Activation layer information in case of a fused activation.
*
* @return a status
*/
- static Status validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, const ActivationLayerInfo &act_info = ActivationLayerInfo());
+ static Status validate(const ITensorInfo *input1,
+ const ITensorInfo *input2,
+ const ITensorInfo *output,
+ const ActivationLayerInfo &act_info = ActivationLayerInfo());
+
+ // Inherited methods overridden:
+ void run() override;
+
+private:
+ struct Impl;
+ std::unique_ptr<Impl> _impl;
};
-/** Basic function to run @ref CLArithmeticOperationKernel for min
+/** Basic function to run @ref opencl::kernels::ClArithmeticKernel for min
*
 * @note The tensor data type for the inputs must be U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/S32/U32/F16/F32.
 * @note The function performs a min operation between two tensors.
*/
-class CLElementwiseMin : public ICLSimpleFunction
+class CLElementwiseMin : public IFunction
{
public:
+ /** Default Constructor */
+ CLElementwiseMin();
+ /** Default Destructor */
+ ~CLElementwiseMin();
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ CLElementwiseMin(const CLElementwiseMin &) = delete;
+ /** Default move constructor */
+ CLElementwiseMin(CLElementwiseMin &&);
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ CLElementwiseMin &operator=(const CLElementwiseMin &) = delete;
+ /** Default move assignment operator */
+ CLElementwiseMin &operator=(CLElementwiseMin &&);
/** Initialise the kernel's inputs, output and conversion policy.
*
- * @param[in, out] input1 First tensor input. Data types supported: U8/QASYMM8/S16/QSYMM16/S32/U32/F16/F32.
+ * Valid data layouts:
+ * - All
+ *
+ * Valid data type configurations:
+ * |src0 |src1 |dst |
+ * |:--------------|:--------------|:--------------|
+ * |QASYMM8 |QASYMM8 |QASYMM8 |
+ * |QASYMM8_SIGNED |QASYMM8_SIGNED |QASYMM8_SIGNED |
+ * |QSYMM16        |QSYMM16        |QSYMM16        |
+ * |U8 |U8 |U8 |
+ * |S16 |S16 |S16 |
+ * |S32 |S32 |S32 |
+ * |U32 |U32 |U32 |
+ * |F16 |F16 |F16 |
+ * |F32 |F32 |F32 |
+ *
+ * @param[in, out] input1 First tensor input. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/S32/U32/F16/F32.
* The input tensor is [in, out] because its TensorInfo might be modified inside the kernel in case of broadcasting of dimension 0.
- * @param[in, out] input2 Second tensor input. Data types supported: U8, QASYMM8 (only if @p input1 is QASYMM8), S16, QSYMM16 (only if @p input1 is QSYMM16), F16/F32.
+ * @param[in, out] input2 Second tensor input. Data types supported: same as @p input1.
* The input tensor is [in, out] because its TensorInfo might be modified inside the kernel in case of broadcasting of dimension 0.
- * @param[out] output Output tensor. Data types supported: U8 (Only if both inputs are U8), QASYMM8 (only if both inputs are QASYMM8), S16, QSYMM16 (only if both inputs are QSYMM16), F16/F32.
+ * @param[out] output Output tensor. Data types supported: same as @p input1.
* @param[in] act_info (Optional) Activation layer information in case of a fused activation.
*/
- void configure(ICLTensor *input1, ICLTensor *input2, ICLTensor *output, const ActivationLayerInfo &act_info = ActivationLayerInfo());
+ void configure(ICLTensor *input1,
+ ICLTensor *input2,
+ ICLTensor *output,
+ const ActivationLayerInfo &act_info = ActivationLayerInfo());
/** Initialise the kernel's inputs, output and conversion policy.
*
* @param[in] compile_context The compile context to be used.
- * @param[in, out] input1 First tensor input. Data types supported: U8/QASYMM8/S16/QSYMM16/S32/U32/F16/F32.
+ * @param[in, out] input1 First tensor input. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/S32/U32/F16/F32.
* The input tensor is [in, out] because its TensorInfo might be modified inside the kernel in case of broadcasting of dimension 0.
- * @param[in, out] input2 Second tensor input. Data types supported: U8, QASYMM8 (only if @p input1 is QASYMM8), S16, QSYMM16 (only if @p input1 is QSYMM16), F16/F32.
+ * @param[in, out] input2 Second tensor input. Data types supported: same as @p input1.
* The input tensor is [in, out] because its TensorInfo might be modified inside the kernel in case of broadcasting of dimension 0.
- * @param[out] output Output tensor. Data types supported: U8 (Only if both inputs are U8), QASYMM8 (only if both inputs are QASYMM8), S16, QSYMM16 (only if both inputs are QSYMM16), F16/F32.
+ * @param[out] output Output tensor. Data types supported: same as @p input1.
* @param[in] act_info (Optional) Activation layer information in case of a fused activation.
*/
- void configure(const CLCompileContext &compile_context, ICLTensor *input1, ICLTensor *input2, ICLTensor *output, const ActivationLayerInfo &act_info = ActivationLayerInfo());
- /** Static function to check if given info will lead to a valid configuration of @ref CLArithmeticOperationKernel for min
+ void configure(const CLCompileContext &compile_context,
+ ICLTensor *input1,
+ ICLTensor *input2,
+ ICLTensor *output,
+ const ActivationLayerInfo &act_info = ActivationLayerInfo());
+ /** Static function to check if given info will lead to a valid configuration of @ref opencl::kernels::ClArithmeticKernel for min
*
- * @param[in] input1 First tensor input info. Data types supported: U8/QASYMM8/S16/QSYMM16/S32/U32/F16/F32.
- * @param[in] input2 Second tensor input info. Data types supported: U8, QASYMM8 (only if @p input1 is QASYMM8), S16, QSYMM16 (only if @p input1 is QSYMM16), F16/F32.
- * @param[in] output Output tensor info. Data types supported: U8 (Only if both inputs are U8), QASYMM8 ( only if both inputs are QASYMM8), S16, QSYMM16 (only if both inputs are QSYMM16), F16/F32.
+ * @param[in] input1 First tensor input info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/S32/U32/F16/F32.
+ * @param[in] input2 Second tensor input info. Data types supported: same as @p input1.
+ * @param[in] output Output tensor info. Data types supported: same as @p input1.
* @param[in] act_info (Optional) Activation layer information in case of a fused activation.
*
* @return a status
*/
- static Status validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, const ActivationLayerInfo &act_info = ActivationLayerInfo());
+ static Status validate(const ITensorInfo *input1,
+ const ITensorInfo *input2,
+ const ITensorInfo *output,
+ const ActivationLayerInfo &act_info = ActivationLayerInfo());
+
+ // Inherited methods overridden:
+ void run() override;
+
+private:
+ struct Impl;
+ std::unique_ptr<Impl> _impl;
};
-/** Basic function to run @ref CLArithmeticOperationKernel for squared difference
+/** Basic function to run @ref opencl::kernels::ClArithmeticKernel for squared difference
*
 * @note The tensor data type for the inputs must be U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/F16/F32.
 * @note The function performs a squared difference operation between two tensors (i.e., out[i] = (in1[i] - in2[i])^2).
*/
-class CLElementwiseSquaredDiff : public ICLSimpleFunction
+class CLElementwiseSquaredDiff : public IFunction
{
public:
+ /** Default Constructor */
+ CLElementwiseSquaredDiff();
+ /** Default Destructor */
+ ~CLElementwiseSquaredDiff();
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ CLElementwiseSquaredDiff(const CLElementwiseSquaredDiff &) = delete;
+ /** Default move constructor */
+ CLElementwiseSquaredDiff(CLElementwiseSquaredDiff &&);
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ CLElementwiseSquaredDiff &operator=(const CLElementwiseSquaredDiff &) = delete;
+ /** Default move assignment operator */
+ CLElementwiseSquaredDiff &operator=(CLElementwiseSquaredDiff &&);
/** Initialise the kernel's inputs, output and conversion policy.
*
- * @param[in, out] input1 First tensor input. Data types supported: U8/QASYMM8/S16/QSYMM16/F16/F32.
+ * Valid data layouts:
+ * - All
+ *
+ * Valid data type configurations:
+ * |src0 |src1 |dst |
+ * |:--------------|:--------------|:--------------|
+ * |QASYMM8 |QASYMM8 |QASYMM8 |
+ * |QASYMM8_SIGNED |QASYMM8_SIGNED |QASYMM8_SIGNED |
+ * |QSYMM16        |QSYMM16        |QSYMM16        |
+ * |U8 |U8 |U8 |
+ * |S16 |S16 |S16 |
+ * |F16 |F16 |F16 |
+ * |F32 |F32 |F32 |
+ *
+ * @param[in, out] input1 First tensor input. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/F16/F32.
* The input tensor is [in, out] because its TensorInfo might be modified inside the kernel in case of broadcasting of dimension 0.
- * @param[in, out] input2 Second tensor input. Data types supported: U8, QASYMM8 (only if @p input1 is QASYMM8), S16, QSYMM16 (only if @p input1 is QSYMM16), F16/F32.
+ * @param[in, out] input2 Second tensor input. Data types supported: same as @p input1.
* The input tensor is [in, out] because its TensorInfo might be modified inside the kernel in case of broadcasting of dimension 0.
- * @param[out] output Output tensor. Data types supported: U8 (Only if both inputs are U8), QASYMM8 (only if both inputs are QASYMM8), S16, QSYMM16 (only if both inputs are QSYMM16), F16/F32.
+ * @param[out] output Output tensor. Data types supported: same as @p input1.
* @param[in] act_info (Optional) Activation layer information in case of a fused activation.
*/
- void configure(ICLTensor *input1, ICLTensor *input2, ICLTensor *output, const ActivationLayerInfo &act_info = ActivationLayerInfo());
+ void configure(ICLTensor *input1,
+ ICLTensor *input2,
+ ICLTensor *output,
+ const ActivationLayerInfo &act_info = ActivationLayerInfo());
/** Initialise the kernel's inputs, output and conversion policy.
*
* @param[in] compile_context The compile context to be used.
- * @param[in, out] input1 First tensor input. Data types supported: U8/QASYMM8/S16/QSYMM16/F16/F32.
+ * @param[in, out] input1 First tensor input. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/F16/F32.
* The input tensor is [in, out] because its TensorInfo might be modified inside the kernel in case of broadcasting of dimension 0.
- * @param[in, out] input2 Second tensor input. Data types supported: U8, QASYMM8 (only if @p input1 is QASYMM8), S16, QSYMM16 (only if @p input1 is QSYMM16), F16/F32.
+ * @param[in, out] input2 Second tensor input. Data types supported: same as @p input1.
* The input tensor is [in, out] because its TensorInfo might be modified inside the kernel in case of broadcasting of dimension 0.
- * @param[out] output Output tensor. Data types supported: U8 (Only if both inputs are U8), QASYMM8 (only if both inputs are QASYMM8), S16, QSYMM16 (only if both inputs are QSYMM16), F16/F32.
+ * @param[out] output Output tensor. Data types supported: same as @p input1.
* @param[in] act_info (Optional) Activation layer information in case of a fused activation.
*/
- void configure(const CLCompileContext &compile_context, ICLTensor *input1, ICLTensor *input2, ICLTensor *output, const ActivationLayerInfo &act_info = ActivationLayerInfo());
- /** Static function to check if given info will lead to a valid configuration of @ref CLArithmeticOperationKernel for squared difference
+ void configure(const CLCompileContext &compile_context,
+ ICLTensor *input1,
+ ICLTensor *input2,
+ ICLTensor *output,
+ const ActivationLayerInfo &act_info = ActivationLayerInfo());
+ /** Static function to check if given info will lead to a valid configuration of @ref opencl::kernels::ClArithmeticKernel for squared difference
*
- * @param[in] input1 First tensor input info. Data types supported: U8/QASYMM8/S16/QSYMM16/F16/F32.
- * @param[in] input2 Second tensor input info. Data types supported: U8, QASYMM8 (only if @p input1 is QASYMM8), S16, QSYMM16 (only if @p input1 is QSYMM16), F16/F32.
- * @param[in] output Output tensor info. Data types supported: U8 (Only if both inputs are U8), QASYMM8 ( only if both inputs are QASYMM8), S16, QSYMM16 (only if both inputs are QSYMM16), F16/F32.
+ * @param[in] input1 First tensor input info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/F16/F32.
+ * @param[in] input2 Second tensor input info. Data types supported: same as @p input1.
+ * @param[in] output Output tensor info. Data types supported: same as @p input1.
* @param[in] act_info (Optional) Activation layer information in case of a fused activation.
*
* @return a status
*/
- static Status validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, const ActivationLayerInfo &act_info = ActivationLayerInfo());
+ static Status validate(const ITensorInfo *input1,
+ const ITensorInfo *input2,
+ const ITensorInfo *output,
+ const ActivationLayerInfo &act_info = ActivationLayerInfo());
+
+ // Inherited methods overridden:
+ void run() override;
+
+private:
+ struct Impl;
+ std::unique_ptr<Impl> _impl;
};
-/** Basic function to run @ref CLArithmeticOperationKernel for power
+/** Basic function to run @ref opencl::kernels::ClArithmeticKernel for power
*
* @note The tensor data type for the inputs must be F16/F32.
* @note The function performs an elementwise power of in1 to in2 (i.e., out[i] = in1[i] ^ in2[i])
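+ *
+ * Example (illustrative sketch, not upstream documentation; a, b and dst are F16/F32
+ * CLTensor objects prepared as in the examples above):
+ * @code
+ * CLElementwisePower pow_op;
+ * pow_op.configure(&a, &b, &dst); // dst[i] = a[i] ^ b[i]
+ * pow_op.run();
+ * @endcode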
*/
-class CLElementwisePower : public ICLSimpleFunction
+class CLElementwisePower : public IFunction
{
public:
+ /** Default Constructor */
+ CLElementwisePower();
+ /** Default Destructor */
+ ~CLElementwisePower();
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ CLElementwisePower(const CLElementwisePower &) = delete;
+ /** Default move constructor */
+ CLElementwisePower(CLElementwisePower &&);
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ CLElementwisePower &operator=(const CLElementwisePower &) = delete;
+ /** Default move assignment operator */
+ CLElementwisePower &operator=(CLElementwisePower &&);
/** Initialise the kernel's inputs, output and conversion policy.
*
+ * Valid data layouts:
+ * - All
+ *
+ * Valid data type configurations:
+ * |src0 |src1 |dst |
+ * |:--------------|:--------------|:--------------|
+ * |F16 |F16 |F16 |
+ * |F32 |F32 |F32 |
+ *
* @param[in, out] input1 First tensor input. Data types supported: F16/F32.
* The input tensor is [in, out] because its TensorInfo might be modified inside the kernel in case of broadcasting of dimension 0.
* @param[in, out] input2 Second tensor input. Data types supported: F16/F32.
@@ -300,7 +651,10 @@ public:
 * @param[out] output Output tensor. Data types supported: F16/F32.
* @param[in] act_info (Optional) Activation layer information in case of a fused activation.
*/
- void configure(ICLTensor *input1, ICLTensor *input2, ICLTensor *output, const ActivationLayerInfo &act_info = ActivationLayerInfo());
+ void configure(ICLTensor *input1,
+ ICLTensor *input2,
+ ICLTensor *output,
+ const ActivationLayerInfo &act_info = ActivationLayerInfo());
/** Initialise the kernel's inputs, output and conversion policy.
*
* @param[in] compile_context The compile context to be used.
@@ -311,8 +665,12 @@ public:
 * @param[out] output Output tensor. Data types supported: F16/F32.
* @param[in] act_info (Optional) Activation layer information in case of a fused activation.
*/
- void configure(const CLCompileContext &compile_context, ICLTensor *input1, ICLTensor *input2, ICLTensor *output, const ActivationLayerInfo &act_info = ActivationLayerInfo());
- /** Static function to check if given info will lead to a valid configuration of @ref CLArithmeticOperationKernel for power
+ void configure(const CLCompileContext &compile_context,
+ ICLTensor *input1,
+ ICLTensor *input2,
+ ICLTensor *output,
+ const ActivationLayerInfo &act_info = ActivationLayerInfo());
+ /** Static function to check if given info will lead to a valid configuration of @ref opencl::kernels::ClArithmeticKernel for power
*
* @param[in] input1 First tensor input info. Data types supported: F16/F32.
* @param[in] input2 Second tensor input info. Data types supported: F16/F32.
@@ -321,7 +679,17 @@ public:
*
* @return a status
*/
- static Status validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, const ActivationLayerInfo &act_info = ActivationLayerInfo());
+ static Status validate(const ITensorInfo *input1,
+ const ITensorInfo *input2,
+ const ITensorInfo *output,
+ const ActivationLayerInfo &act_info = ActivationLayerInfo());
+
+ // Inherited methods overridden:
+ void run() override;
+
+private:
+ struct Impl;
+ std::unique_ptr<Impl> _impl;
};
} // namespace arm_compute
#endif /* ARM_COMPUTE_CLELEMENTWISEOPERATIONS_H */
diff --git a/arm_compute/runtime/CL/functions/CLElementWiseUnaryLayer.h b/arm_compute/runtime/CL/functions/CLElementwiseUnaryLayer.h
index 19729b61cc..d186b70d93 100644
--- a/arm_compute/runtime/CL/functions/CLElementWiseUnaryLayer.h
+++ b/arm_compute/runtime/CL/functions/CLElementwiseUnaryLayer.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018-2020 ARM Limited.
+ * Copyright (c) 2018-2021 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -24,18 +24,44 @@
#ifndef ARM_COMPUTE_CLELEMENTWISEUNARYLAYER_H
#define ARM_COMPUTE_CLELEMENTWISEUNARYLAYER_H
-#include "arm_compute/runtime/CL/ICLSimpleFunction.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/IFunction.h"
+
+#include <memory>
namespace arm_compute
{
+class CLCompileContext;
class ICLTensor;
+class ITensorInfo;
/** Basic function to perform inverse square root on an input tensor. */
-class CLRsqrtLayer : public ICLSimpleFunction
+class CLRsqrtLayer : public IFunction
{
public:
+ /** Default Constructor */
+ CLRsqrtLayer();
+ /** Default Destructor */
+ ~CLRsqrtLayer();
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ CLRsqrtLayer(const CLRsqrtLayer &) = delete;
+ /** Default move constructor */
+ CLRsqrtLayer(CLRsqrtLayer &&);
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ CLRsqrtLayer &operator=(const CLRsqrtLayer &) = delete;
+ /** Default move assignment operator */
+ CLRsqrtLayer &operator=(CLRsqrtLayer &&);
/** Initialize the function
*
+ * Valid data layouts:
+ * - All
+ *
+ * Valid data type configurations:
+ * |src |dst |
+ * |:--------------|:--------------|
+ * |F16 |F16 |
+ * |F32 |F32 |
+ *
* @param[in] input Input tensor. Data types supported: F16/F32.
* @param[out] output Output tensor. Data types supported: same as @p input.
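+ *
+ * Example (illustrative sketch, not upstream documentation; CLExpLayer, CLNegLayer,
+ * CLSinLayer, CLLogLayer, CLAbsLayer and CLRoundLayer follow the same pattern):
+ * @code
+ * CLTensor src, dst;
+ * src.allocator()->init(TensorInfo(TensorShape(8U, 8U), 1, DataType::F32));
+ * dst.allocator()->init(TensorInfo(TensorShape(8U, 8U), 1, DataType::F32));
+ * CLRsqrtLayer rsqrt;
+ * rsqrt.configure(&src, &dst); // dst[i] = 1 / sqrt(src[i])
+ * src.allocator()->allocate();
+ * dst.allocator()->allocate();
+ * rsqrt.run();
+ * @endcode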
*/
@@ -55,14 +81,42 @@ public:
* @return a status
*/
static Status validate(const ITensorInfo *input, const ITensorInfo *output);
+
+ // Inherited methods overridden:
+ void run() override;
+
+private:
+ struct Impl;
+ std::unique_ptr<Impl> _impl;
};
/** Basic function to perform exponential on an input tensor. */
-class CLExpLayer : public ICLSimpleFunction
+class CLExpLayer : public IFunction
{
public:
+ /** Default Constructor */
+ CLExpLayer();
+ /** Default Destructor */
+ ~CLExpLayer();
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ CLExpLayer(const CLExpLayer &) = delete;
+ /** Default move constructor */
+ CLExpLayer(CLExpLayer &&);
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ CLExpLayer &operator=(const CLExpLayer &) = delete;
+ /** Default move assignment operator */
+ CLExpLayer &operator=(CLExpLayer &&);
/** Initialize the function
*
+ * Valid data layouts:
+ * - All
+ *
+ * Valid data type configurations:
+ * |src |dst |
+ * |:--------------|:--------------|
+ * |F16 |F16 |
+ * |F32 |F32 |
+ *
* @param[in] input Input tensor. Data types supported: F16/F32.
* @param[out] output Output tensor. Data types supported: same as @p input.
*/
@@ -82,41 +136,98 @@ public:
* @return a status
*/
static Status validate(const ITensorInfo *input, const ITensorInfo *output);
+
+ // Inherited methods overridden:
+ void run() override;
+
+private:
+ struct Impl;
+ std::unique_ptr<Impl> _impl;
};
/** Basic function to negate an input tensor. */
-class CLNegLayer : public ICLSimpleFunction
+class CLNegLayer : public IFunction
{
public:
+ /** Default Constructor */
+ CLNegLayer();
+ /** Default Destructor */
+ ~CLNegLayer();
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ CLNegLayer(const CLNegLayer &) = delete;
+ /** Default move constructor */
+ CLNegLayer(CLNegLayer &&);
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ CLNegLayer &operator=(const CLNegLayer &) = delete;
+ /** Default move assignment operator */
+ CLNegLayer &operator=(CLNegLayer &&);
/** Initialize the function
*
- * @param[in] input Input tensor. Data types supported: F16/F32.
+ * Valid data layouts:
+ * - All
+ *
+ * Valid data type configurations:
+ * |src |dst |
+ * |:--------------|:--------------|
+ * |F16 |F16 |
+ * |F32 |F32 |
+ * |S32 |S32 |
+ *
+ * @param[in] input Input tensor. Data types supported: F16/F32/S32.
* @param[out] output Output tensor. Data types supported: same as @p input.
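+ *
+ * Example (illustrative sketch, not upstream documentation; shows the S32 path that
+ * CLNegLayer supports in addition to F16/F32):
+ * @code
+ * CLTensor src, dst;
+ * src.allocator()->init(TensorInfo(TensorShape(8U), 1, DataType::S32));
+ * dst.allocator()->init(TensorInfo(TensorShape(8U), 1, DataType::S32));
+ * CLNegLayer neg;
+ * neg.configure(&src, &dst); // dst[i] = -src[i]
+ * src.allocator()->allocate();
+ * dst.allocator()->allocate();
+ * neg.run();
+ * @endcode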
*/
void configure(const ICLTensor *input, ICLTensor *output);
/** Initialize the function
*
* @param[in] compile_context The compile context to be used.
- * @param[in] input Input tensor. Data types supported: F16/F32.
+ * @param[in] input Input tensor. Data types supported: F16/F32/S32.
* @param[out] output Output tensor. Data types supported: same as @p input.
*/
void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output);
/** Static function to check if given info will lead to a valid configuration of @ref CLNegLayer
*
- * @param[in] input First tensor input info. Data types supported: F16/F32.
+ * @param[in] input First tensor input info. Data types supported: F16/F32/S32.
* @param[in] output Output tensor info. Data types supported: Same as @p input.
*
* @return a status
*/
static Status validate(const ITensorInfo *input, const ITensorInfo *output);
+
+ // Inherited methods overridden:
+ void run() override;
+
+private:
+ struct Impl;
+ std::unique_ptr<Impl> _impl;
};
/** Basic function to calculate sine of an input tensor. */
-class CLSinLayer : public ICLSimpleFunction
+class CLSinLayer : public IFunction
{
public:
+ /** Default Constructor */
+ CLSinLayer();
+ /** Default Destructor */
+ ~CLSinLayer();
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ CLSinLayer(const CLSinLayer &) = delete;
+ /** Default move constructor */
+ CLSinLayer(CLSinLayer &&);
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ CLSinLayer &operator=(const CLSinLayer &) = delete;
+ /** Default move assignment operator */
+ CLSinLayer &operator=(CLSinLayer &&);
/** Initialize the function
*
+ * Valid data layouts:
+ * - All
+ *
+ * Valid data type configurations:
+ * |src |dst |
+ * |:--------------|:--------------|
+ * |F16 |F16 |
+ * |F32 |F32 |
+ *
* @param[in] input Input tensor. Data types supported: F16/F32.
* @param[out] output Output tensor. Data types supported: same as @p input.
*/
@@ -136,14 +247,42 @@ public:
* @return a status
*/
static Status validate(const ITensorInfo *input, const ITensorInfo *output);
+
+ // Inherited methods overridden:
+ void run() override;
+
+private:
+ struct Impl;
+ std::unique_ptr<Impl> _impl;
};
/** Basic function to perform elementwise log on an input tensor. */
-class CLLogLayer : public ICLSimpleFunction
+class CLLogLayer : public IFunction
{
public:
+ /** Default Constructor */
+ CLLogLayer();
+ /** Default Destructor */
+ ~CLLogLayer();
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ CLLogLayer(const CLLogLayer &) = delete;
+ /** Default move constructor */
+ CLLogLayer(CLLogLayer &&);
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ CLLogLayer &operator=(const CLLogLayer &) = delete;
+ /** Default move assignment operator */
+ CLLogLayer &operator=(CLLogLayer &&);
/** Initialize the function
*
+ * Valid data layouts:
+ * - All
+ *
+ * Valid data type configurations:
+ * |src |dst |
+ * |:--------------|:--------------|
+ * |F16 |F16 |
+ * |F32 |F32 |
+ *
* @param[in] input Input tensor. Data types supported: F16/F32.
* @param[out] output Output tensor. Data types supported: same as @p input.
*/
@@ -163,14 +302,42 @@ public:
* @return a status
*/
static Status validate(const ITensorInfo *input, const ITensorInfo *output);
+
+ // Inherited methods overridden:
+ void run() override;
+
+private:
+ struct Impl;
+ std::unique_ptr<Impl> _impl;
};
/** Basic function to get the absolute value of an input tensor. */
-class CLAbsLayer : public ICLSimpleFunction
+class CLAbsLayer : public IFunction
{
public:
+ /** Default Constructor */
+ CLAbsLayer();
+ /** Default Destructor */
+ ~CLAbsLayer();
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ CLAbsLayer(const CLAbsLayer &) = delete;
+ /** Default move constructor */
+ CLAbsLayer(CLAbsLayer &&);
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ CLAbsLayer &operator=(const CLAbsLayer &) = delete;
+ /** Default move assignment operator */
+ CLAbsLayer &operator=(CLAbsLayer &&);
/** Initialize the function
*
+ * Valid data layouts:
+ * - All
+ *
+ * Valid data type configurations:
+ * |src |dst |
+ * |:--------------|:--------------|
+ * |F16 |F16 |
+ * |F32 |F32 |
+ *
* @param[in] input Input tensor. Data types supported: F16/F32.
* @param[out] output Output tensor. Data types supported: same as @p input.
*/
@@ -190,14 +357,42 @@ public:
* @return a status
*/
static Status validate(const ITensorInfo *input, const ITensorInfo *output);
+
+ // Inherited methods overridden:
+ void run() override;
+
+private:
+ struct Impl;
+ std::unique_ptr<Impl> _impl;
};
/** Basic function to get the round (to the nearest even) value of an input tensor. */
-class CLRoundLayer : public ICLSimpleFunction
+class CLRoundLayer : public IFunction
{
public:
+ /** Default Constructor */
+ CLRoundLayer();
+ /** Default Destructor */
+ ~CLRoundLayer();
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ CLRoundLayer(const CLRoundLayer &) = delete;
+ /** Default move constructor */
+ CLRoundLayer(CLRoundLayer &&);
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ CLRoundLayer &operator=(const CLRoundLayer &) = delete;
+ /** Default move assignment operator */
+ CLRoundLayer &operator=(CLRoundLayer &&);
/** Initialize the function
*
+ * Valid data layouts:
+ * - All
+ *
+ * Valid data type configurations:
+ * |src |dst |
+ * |:--------------|:--------------|
+ * |F16 |F16 |
+ * |F32 |F32 |
+ *
* @param[in] input Input tensor. Data types supported: F16/F32.
* @param[out] output Output tensor. Data types supported: same as @p input.
*/
@@ -217,6 +412,13 @@ public:
* @return a status
*/
static Status validate(const ITensorInfo *input, const ITensorInfo *output);
+
+ // Inherited methods overridden:
+ void run() override;
+
+private:
+ struct Impl;
+ std::unique_ptr<Impl> _impl;
};
} // namespace arm_compute
#endif /* ARM_COMPUTE_CLELEMENTWISEUNARYLAYER_H */
diff --git a/arm_compute/runtime/CL/functions/CLEqualizeHistogram.h b/arm_compute/runtime/CL/functions/CLEqualizeHistogram.h
deleted file mode 100644
index d907cfb092..0000000000
--- a/arm_compute/runtime/CL/functions/CLEqualizeHistogram.h
+++ /dev/null
@@ -1,79 +0,0 @@
-/*
- * Copyright (c) 2016-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CLEQUALIZEHISTOGRAM_H
-#define ARM_COMPUTE_CLEQUALIZEHISTOGRAM_H
-
-#include "arm_compute/core/CL/kernels/CLHistogramKernel.h"
-#include "arm_compute/core/CL/kernels/CLTableLookupKernel.h"
-#include "arm_compute/runtime/CL/CLDistribution1D.h"
-#include "arm_compute/runtime/CL/CLLut.h"
-#include "arm_compute/runtime/IFunction.h"
-
-#include <cstdint>
-
-namespace arm_compute
-{
-class ICLTensor;
-using ICLImage = ICLTensor;
-
-/** Basic function to execute histogram equalization. This function calls the following CL kernels:
- *
- * -# @ref CLHistogramKernel
- * -# @ref CLTableLookupKernel
- *
- */
-class CLEqualizeHistogram : public IFunction
-{
-public:
- /** Default Constructor. */
- CLEqualizeHistogram();
- /** Initialise the kernel's inputs.
- *
- * @param[in] input Input image. Data types supported: U8.
- * @param[out] output Output of same data type with equalized brightness and contrast.
- */
- void configure(const ICLImage *input, ICLImage *output);
- /** Initialise the kernel's inputs.
- *
- * @param[in] compile_context The compile context to be used.
- * @param[in] input Input image. Data types supported: U8.
- * @param[out] output Output of same data type with equalized brightness and contrast.
- */
- void configure(const CLCompileContext &compile_context, const ICLImage *input, ICLImage *output);
-
- // Inherited methods overridden:
- void run() override;
-
-private:
- CLHistogramKernel _histogram_kernel; /**< Kernel that calculates the histogram of input. */
- CLHistogramBorderKernel _border_histogram_kernel; /**< Kernel that calculates the histogram on the borders. */
- CLTableLookupKernel _map_histogram_kernel; /**< Kernel that maps the input to output using the lut. */
- CLDistribution1D _hist; /**< Distribution that holds the histogram of the input image. */
- CLDistribution1D _cum_dist; /**< Distribution that holds the cummulative distribution of the input histogram. */
- CLLut _cd_lut; /**< Holds the equalization lookuptable. */
- static const uint32_t max_range = 256; /**< Histogram range of the internal histograms. */
- static const uint32_t nr_bins = 256; /**< Histogram bins of the internal histograms. */
-};
-}
-#endif /*ARM_COMPUTE_CLEQUALIZEHISTOGRAM_H */
diff --git a/arm_compute/runtime/CL/functions/CLErode.h b/arm_compute/runtime/CL/functions/CLErode.h
deleted file mode 100644
index 57f701cce2..0000000000
--- a/arm_compute/runtime/CL/functions/CLErode.h
+++ /dev/null
@@ -1,64 +0,0 @@
-/*
- * Copyright (c) 2016-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CLERODE_H
-#define ARM_COMPUTE_CLERODE_H
-
-#include "arm_compute/core/Types.h"
-#include "arm_compute/runtime/CL/ICLSimpleFunction.h"
-
-#include <cstdint>
-
-namespace arm_compute
-{
-class ICLTensor;
-
-/** Basic function to execute erode. This function calls the following OpenCL kernels:
-*
-* -# @ref CLFillBorderKernel (executed if border_mode == CONSTANT or border_mode == REPLICATE)
-* -# @ref CLErodeKernel
-*
-*/
-class CLErode : public ICLSimpleFunction
-{
-public:
- /** Initialise the kernel's inputs, output and border mode
- *
- * @param[in,out] input First tensor input. Data types supported: U8. (Written to only for @p border_mode != UNDEFINED)
- * @param[out] output Output tensor. Data types supported: U8.
- * @param[in] border_mode Border mode to use for the convolution.
- * @param[in] constant_border_value (Optional) Constant value to use for borders if border_mode is set to CONSTANT.
- */
- void configure(ICLTensor *input, ICLTensor *output, BorderMode border_mode, uint8_t constant_border_value = 0);
- /** Initialise the kernel's inputs, output and border mode
- *
- * @param[in] compile_context The compile context to be used.
- * @param[in,out] input First tensor input. Data types supported: U8. (Written to only for @p border_mode != UNDEFINED)
- * @param[out] output Output tensor. Data types supported: U8.
- * @param[in] border_mode Border mode to use for the convolution.
- * @param[in] constant_border_value (Optional) Constant value to use for borders if border_mode is set to CONSTANT.
- */
- void configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, BorderMode border_mode, uint8_t constant_border_value = 0);
-};
-}
-#endif /*ARM_COMPUTE_CLERODE_H */
diff --git a/arm_compute/runtime/CL/functions/CLFFT1D.h b/arm_compute/runtime/CL/functions/CLFFT1D.h
index da153225c8..49ecf3c260 100644
--- a/arm_compute/runtime/CL/functions/CLFFT1D.h
+++ b/arm_compute/runtime/CL/functions/CLFFT1D.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019-2020 ARM Limited.
+ * Copyright (c) 2019-2021 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -24,18 +24,17 @@
#ifndef ARM_COMPUTE_CLFFT1D_H
#define ARM_COMPUTE_CLFFT1D_H
-#include "arm_compute/runtime/IFunction.h"
-
-#include "arm_compute/core/CL/kernels/CLFFTDigitReverseKernel.h"
-#include "arm_compute/core/CL/kernels/CLFFTRadixStageKernel.h"
-#include "arm_compute/core/CL/kernels/CLFFTScaleKernel.h"
#include "arm_compute/runtime/CL/CLTensor.h"
#include "arm_compute/runtime/FunctionDescriptors.h"
+#include "arm_compute/runtime/IFunction.h"
#include "arm_compute/runtime/MemoryGroup.h"
namespace arm_compute
{
// Forward declaration
+class CLFFTDigitReverseKernel;
+class CLFFTRadixStageKernel;
+class CLFFTScaleKernel;
class ICLTensor;
/** Basic function to execute one dimensional FFT. This function calls the following OpenCL kernels:
@@ -49,9 +48,28 @@ class CLFFT1D : public IFunction
public:
/** Default Constructor */
CLFFT1D(std::shared_ptr<IMemoryManager> memory_manager = nullptr);
+ /** Prevent instances of this class from being copied */
+ CLFFT1D(const CLFFT1D &) = delete;
+ /** Prevent instances of this class from being copied */
+ CLFFT1D &operator=(const CLFFT1D &) = delete;
+ /** Default move constructor */
+ CLFFT1D(CLFFT1D &&) = default;
+ /** Default move assignment operator */
+ CLFFT1D &operator=(CLFFT1D &&) = default;
+ /** Default destructor */
+ ~CLFFT1D();
/** Initialise the function's source, destinations and border mode.
*
- * @param[in] input Source tensor. Data types supported: F32.
+ * Valid data layouts:
+ * - All
+ *
+ * Valid data type configurations:
+ * |src |dst |
+ * |:------|:------|
+ * |F32 |F32 |
+ * |F16 |F16 |
+ *
+ * @param[in] input Source tensor. Data types supported: F16/F32.
* @param[out] output Destination tensor. Data types and data layouts supported: Same as @p input.
* @param[in] config FFT related configuration
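+ *
+ * Example (illustrative sketch, not upstream documentation; assumes src and dst are
+ * initialised CLTensor objects):
+ * @code
+ * FFT1DInfo cfg;
+ * cfg.axis      = 0;                     // transform along the first dimension
+ * cfg.direction = FFTDirection::Forward;
+ * CLFFT1D fft;
+ * fft.configure(&src, &dst, cfg);
+ * fft.run();
+ * @endcode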
*/
@@ -59,14 +77,17 @@ public:
/** Initialise the function's source, destinations and border mode.
*
* @param[in] compile_context The compile context to be used.
- * @param[in] input Source tensor. Data types supported: F32.
+ * @param[in] input Source tensor. Data types supported: F16/F32.
* @param[out] output Destination tensor. Data types and data layouts supported: Same as @p input.
* @param[in] config FFT related configuration
*/
- void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, const FFT1DInfo &config);
+ void configure(const CLCompileContext &compile_context,
+ const ICLTensor *input,
+ ICLTensor *output,
+ const FFT1DInfo &config);
/** Static function to check if given info will lead to a valid configuration of @ref CLFFT1D.
*
- * @param[in] input Source tensor info. Data types supported: F32.
+ * @param[in] input Source tensor info. Data types supported: F16/F32.
* @param[in] output Destination tensor info. Data types and data layouts supported: Same as @p input.
* @param[in] config FFT related configuration
*
@@ -78,14 +99,14 @@ public:
void run() override;
protected:
- MemoryGroup _memory_group;
- CLFFTDigitReverseKernel _digit_reverse_kernel;
- std::vector<CLFFTRadixStageKernel> _fft_kernels;
- CLFFTScaleKernel _scale_kernel;
- CLTensor _digit_reversed_input;
- CLTensor _digit_reverse_indices;
- unsigned int _num_ffts;
- bool _run_scale;
+ MemoryGroup _memory_group;
+ std::unique_ptr<CLFFTDigitReverseKernel> _digit_reverse_kernel;
+ std::vector<std::unique_ptr<CLFFTRadixStageKernel>> _fft_kernels;
+ std::unique_ptr<CLFFTScaleKernel> _scale_kernel;
+ CLTensor _digit_reversed_input;
+ CLTensor _digit_reverse_indices;
+ unsigned int _num_ffts;
+ bool _run_scale;
};
} // namespace arm_compute
#endif /*ARM_COMPUTE_CLFFT1D_H */
diff --git a/arm_compute/runtime/CL/functions/CLFFT2D.h b/arm_compute/runtime/CL/functions/CLFFT2D.h
index a113f20f91..b7d15f1602 100644
--- a/arm_compute/runtime/CL/functions/CLFFT2D.h
+++ b/arm_compute/runtime/CL/functions/CLFFT2D.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019-2020 ARM Limited.
+ * Copyright (c) 2019-2021 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -24,11 +24,10 @@
#ifndef ARM_COMPUTE_CLFFT2D_H
#define ARM_COMPUTE_CLFFT2D_H
-#include "arm_compute/runtime/IFunction.h"
-
#include "arm_compute/runtime/CL/CLTensor.h"
#include "arm_compute/runtime/CL/functions/CLFFT1D.h"
#include "arm_compute/runtime/FunctionDescriptors.h"
+#include "arm_compute/runtime/IFunction.h"
#include "arm_compute/runtime/MemoryGroup.h"
namespace arm_compute
@@ -46,9 +45,28 @@ class CLFFT2D : public IFunction
public:
/** Default Constructor */
CLFFT2D(std::shared_ptr<IMemoryManager> memory_manager = nullptr);
+ /** Prevent instances of this class from being copied */
+ CLFFT2D(const CLFFT2D &) = delete;
+ /** Prevent instances of this class from being copied */
+ CLFFT2D &operator=(const CLFFT2D &) = delete;
+ /** Default move constructor */
+ CLFFT2D(CLFFT2D &&) = default;
+ /** Default move assignment operator */
+ CLFFT2D &operator=(CLFFT2D &&) = default;
+ /** Default destructor */
+ ~CLFFT2D();
/** Initialise the function's source, destinations and border mode.
*
- * @param[in] input Source tensor. Data types supported: F32.
+ * Valid data layouts:
+ * - All
+ *
+ * Valid data type configurations:
+ * |src |dst |
+ * |:------|:------|
+ * |F32 |F32 |
+ * |F16 |F16 |
+ *
+ * @param[in] input Source tensor. Data types supported: F16/F32.
* @param[out] output Destination tensor. Data types and data layouts supported: Same as @p input.
* @param[in] config FFT related configuration
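+ *
+ * Example (illustrative sketch, not upstream documentation; assumes src and dst are
+ * initialised CLTensor objects):
+ * @code
+ * FFT2DInfo cfg;
+ * cfg.axis0 = 0; // first transformed axis
+ * cfg.axis1 = 1; // second transformed axis
+ * CLFFT2D fft2d;
+ * fft2d.configure(&src, &dst, cfg);
+ * fft2d.run();
+ * @endcode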
*/
@@ -56,14 +74,17 @@ public:
/** Initialise the function's source, destinations and border mode.
*
* @param[in] compile_context The compile context to be used.
- * @param[in] input Source tensor. Data types supported: F32.
+ * @param[in] input Source tensor. Data types supported: F16/F32.
* @param[out] output Destination tensor. Data types and data layouts supported: Same as @p input.
* @param[in] config FFT related configuration
*/
- void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, const FFT2DInfo &config);
+ void configure(const CLCompileContext &compile_context,
+ const ICLTensor *input,
+ ICLTensor *output,
+ const FFT2DInfo &config);
/** Static function to check if given info will lead to a valid configuration of @ref CLFFT2D.
*
- * @param[in] input Source tensor info. Data types supported: F32.
+ * @param[in] input Source tensor info. Data types supported: F16/F32.
* @param[in] output Destination tensor info. Data types and data layouts supported: Same as @p input.
* @param[in] config FFT related configuration
*
diff --git a/arm_compute/runtime/CL/functions/CLFFTConvolutionLayer.h b/arm_compute/runtime/CL/functions/CLFFTConvolutionLayer.h
index 740731950e..ed78bbb7a7 100644
--- a/arm_compute/runtime/CL/functions/CLFFTConvolutionLayer.h
+++ b/arm_compute/runtime/CL/functions/CLFFTConvolutionLayer.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019-2020 ARM Limited.
+ * Copyright (c) 2019-2021 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -24,8 +24,6 @@
#ifndef ARM_COMPUTE_CLFFTCONVOLUTIONLAYER_H
#define ARM_COMPUTE_CLFFTCONVOLUTIONLAYER_H
-#include "arm_compute/runtime/IFunction.h"
-
#include "arm_compute/core/Types.h"
#include "arm_compute/runtime/CL/functions/CLActivationLayer.h"
#include "arm_compute/runtime/CL/functions/CLElementwiseOperations.h"
@@ -37,6 +35,7 @@
#include "arm_compute/runtime/CL/functions/CLReshapeLayer.h"
#include "arm_compute/runtime/CL/functions/CLReverse.h"
#include "arm_compute/runtime/CL/functions/CLSlice.h"
+#include "arm_compute/runtime/IFunction.h"
namespace arm_compute
{
@@ -71,55 +70,86 @@ public:
CLFFTConvolutionLayer &operator=(CLFFTConvolutionLayer &&) = default;
/** Set the input and output tensors.
*
+ * Valid data layouts:
+ * - All
+ *
+ * Valid data type configurations:
+ * |src |dst |
+ * |:------|:------|
+ * |F32 |F32 |
+ * |F16 |F16 |
+ *
 * @note: This function only works with square kernels (of any size) and unit strides, for both NCHW and NHWC data layouts
*
- * @param[in] input Source tensor. 3 lower dimensions represent a single input [width, height, IFM],
- * while every optional dimension from 4 and above represent a batch of inputs.
- * Data types supported: F32.
- * @param[in] weights Weights tensor. Weights are 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM]. Data type supported:Same as @p input.
- * @param[in] biases Biases tensor. Shared biases supported. Biases are 1D tensor with dimensions [OFM].Data type supported: Same as @p input
- * @param[out] output Destination tensor. 3 lower dimensions represent a single output [width, height, OFM], while the rest represent batch of outputs.
- * Data types supported: Same as @p input.
- * @param[in] conv_info Contains padding and stride information described in @ref PadStrideInfo.
- * @param[in] act_info (Optional) Activation layer information in case of a fused activation.
+ * @param[in] input Source tensor. 3 lower dimensions represent a single input [width, height, IFM],
+ * while every optional dimension from 4 and above represents a batch of inputs.
+ * Data types supported: F16/F32.
+ * @param[in] weights Weights tensor. Weights are a 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM]. Data type supported: Same as @p input.
+ * @param[in] biases Biases tensor. Shared biases supported. Biases are a 1D tensor with dimensions [OFM]. Data type supported: Same as @p input.
+ * @param[out] output Destination tensor. 3 lower dimensions represent a single output [width, height, OFM], while the rest represent batch of outputs.
+ * Data types supported: Same as @p input.
+ * @param[in] conv_info Contains padding and stride information described in @ref PadStrideInfo.
+ * @param[in] act_info (Optional) Activation layer information in case of a fused activation.
+ * @param[in] enable_fast_math (Optional) Enable fast math computation. When enabled, the function may dispatch the fastest available
+ * implementation, which can reduce accuracy. Default is false.
*/
- void configure(ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const PadStrideInfo &conv_info,
- const ActivationLayerInfo &act_info = ActivationLayerInfo());
+ void configure(ICLTensor *input,
+ const ICLTensor *weights,
+ const ICLTensor *biases,
+ ICLTensor *output,
+ const PadStrideInfo &conv_info,
+ const ActivationLayerInfo &act_info = ActivationLayerInfo(),
+ bool enable_fast_math = false);
/** Set the input and output tensors.
*
 * @note: This function only works with square kernel sizes and unit strides for both NCHW and NHWC data layouts
*
- * @param[in] compile_context The compile context to be used.
- * @param[in] input Source tensor. 3 lower dimensions represent a single input [width, height, IFM],
- * while every optional dimension from 4 and above represent a batch of inputs.
- * Data types supported: F32.
- * @param[in] weights Weights tensor. Weights are 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM]. Data type supported:Same as @p input.
- * @param[in] biases Biases tensor. Shared biases supported. Biases are 1D tensor with dimensions [OFM].Data type supported: Same as @p input
- * @param[out] output Destination tensor. 3 lower dimensions represent a single output [width, height, OFM], while the rest represent batch of outputs.
- * Data types supported: Same as @p input.
- * @param[in] conv_info Contains padding and stride information described in @ref PadStrideInfo.
- * @param[in] act_info (Optional) Activation layer information in case of a fused activation.
+ * @param[in] compile_context The compile context to be used.
+ * @param[in] input Source tensor. 3 lower dimensions represent a single input [width, height, IFM],
+ * while every optional dimension from 4 and above represents a batch of inputs.
+ * Data types supported: F16/F32.
+ * @param[in] weights Weights tensor. Weights are a 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM]. Data type supported: Same as @p input.
+ * @param[in] biases Biases tensor. Shared biases supported. Biases are a 1D tensor with dimensions [OFM]. Data type supported: Same as @p input.
+ * @param[out] output Destination tensor. 3 lower dimensions represent a single output [width, height, OFM], while the rest represent batch of outputs.
+ * Data types supported: Same as @p input.
+ * @param[in] conv_info Contains padding and stride information described in @ref PadStrideInfo.
+ * @param[in] act_info (Optional) Activation layer information in case of a fused activation.
+ * @param[in] enable_fast_math (Optional) Enable fast math computation. When enabled, the function may dispatch the fastest available
+ * implementation, which can reduce accuracy. Default is false.
*/
- void configure(const CLCompileContext &compile_context, ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const PadStrideInfo &conv_info,
- const ActivationLayerInfo &act_info = ActivationLayerInfo());
+ void configure(const CLCompileContext &compile_context,
+ ICLTensor *input,
+ const ICLTensor *weights,
+ const ICLTensor *biases,
+ ICLTensor *output,
+ const PadStrideInfo &conv_info,
+ const ActivationLayerInfo &act_info = ActivationLayerInfo(),
+ bool enable_fast_math = false);
/** Static function to check if given info will lead to a valid configuration of @ref CLFFTConvolutionLayer
*
 * @note: This function only works with square kernel sizes and unit strides for both NCHW and NHWC data layouts
*
- * @param[in] input Source tensor. 3 lower dimensions represent a single input [width, height, IFM],
- * while every optional dimension from 4 and above represent a batch of inputs.
- * Data types supported: F32.
- * @param[in] weights Weights tensor. Weights are 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM]. Data type supported:Same as @p input.
- * @param[in] biases Biases tensor. Shared biases supported. Biases are 1D tensor with dimensions [OFM].Data type supported: Same as @p input
- * @param[out] output Destination tensor. 3 lower dimensions represent a single output [width, height, OFM], while the rest represent batch of outputs.
- * Data types supported: Same as @p input.
- * @param[in] conv_info Contains padding and stride information described in @ref PadStrideInfo.
- * @param[in] act_info (Optional) Activation layer information in case of a fused activation.
+ * @param[in] input Source tensor. 3 lower dimensions represent a single input [width, height, IFM],
+ * while every optional dimension from 4 and above represents a batch of inputs.
+ * Data types supported: F16/F32.
+ * @param[in] weights Weights tensor. Weights are a 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM]. Data type supported: Same as @p input.
+ * @param[in] biases Biases tensor. Shared biases supported. Biases are a 1D tensor with dimensions [OFM]. Data type supported: Same as @p input.
+ * @param[out] output Destination tensor. 3 lower dimensions represent a single output [width, height, OFM], while the rest represent batch of outputs.
+ * Data types supported: Same as @p input.
+ * @param[in] conv_info Contains padding and stride information described in @ref PadStrideInfo.
+ * @param[in] act_info (Optional) Activation layer information in case of a fused activation.
+ * @param[in] enable_fast_math (Optional) Enable fast math computation. When enabled, the function may dispatch the fastest available
+ * implementation, which can reduce accuracy. Default is false.
*
* @return a status
*/
- static Status validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info,
- const ActivationLayerInfo &act_info = ActivationLayerInfo());
+ static Status validate(const ITensorInfo *input,
+ const ITensorInfo *weights,
+ const ITensorInfo *biases,
+ const ITensorInfo *output,
+ const PadStrideInfo &conv_info,
+ const ActivationLayerInfo &act_info = ActivationLayerInfo(),
+ bool enable_fast_math = false);
// Inherited methods overridden:
void run() override;
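
A rough sketch of opting into the new enable_fast_math path; tensor initialisation is omitted and all tensor names are placeholders, not part of this patch:

    // Hypothetical sketch: FFT-based convolution with fast math enabled.
    // input/weights/biases/output are assumed to be initialised CLTensors,
    // with a square kernel and unit strides as the @note above requires.
    CLFFTConvolutionLayer fft_conv;
    fft_conv.configure(&input, &weights, &biases, &output,
                       PadStrideInfo(1, 1, 2, 2),       // stride 1x1, pad 2 for a 5x5 kernel
                       ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU),
                       /* enable_fast_math = */ true);  // may trade accuracy for speed
    fft_conv.run();
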
diff --git a/arm_compute/runtime/CL/functions/CLFastCorners.h b/arm_compute/runtime/CL/functions/CLFastCorners.h
deleted file mode 100644
index 1dc87d6a38..0000000000
--- a/arm_compute/runtime/CL/functions/CLFastCorners.h
+++ /dev/null
@@ -1,105 +0,0 @@
-/*
- * Copyright (c) 2016-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CLFASTCORNERS_H
-#define ARM_COMPUTE_CLFASTCORNERS_H
-
-#include "arm_compute/core/CL/OpenCL.h"
-#include "arm_compute/core/CL/kernels/CLFastCornersKernel.h"
-#include "arm_compute/core/Types.h"
-#include "arm_compute/core/Window.h"
-#include "arm_compute/runtime/CL/CLArray.h"
-#include "arm_compute/runtime/CL/CLTensor.h"
-#include "arm_compute/runtime/CL/functions/CLNonMaximaSuppression3x3.h"
-#include "arm_compute/runtime/IFunction.h"
-#include "arm_compute/runtime/IMemoryManager.h"
-#include "arm_compute/runtime/MemoryGroup.h"
-
-#include <cstdint>
-#include <memory>
-
-namespace arm_compute
-{
-class ICLTensor;
-using ICLImage = ICLTensor;
-
-/** Basic function to execute fast corners. This function calls the following CL kernels:
- *
- * -# @ref CLFastCornersKernel
- * -# @ref CLNonMaximaSuppression3x3Kernel (executed if nonmax_suppression == true)
- * -# @ref CLCopyToArrayKernel
- *
- */
-class CLFastCorners : public IFunction
-{
-public:
- /** Constructor */
- CLFastCorners(std::shared_ptr<IMemoryManager> memory_manager = nullptr);
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- CLFastCorners(const CLFastCorners &) = delete;
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- const CLFastCorners &operator=(const CLFastCorners &) = delete;
- /** Initialize the function's source, destination, conv and border_mode.
- *
- * @param[in] input Source image. Data types supported: U8.
- * @param[in] threshold Threshold on difference between intensity of the central pixel and pixels on Bresenham's circle of radius 3.
- * @param[in] nonmax_suppression If true, non-maximum suppression is applied to detected corners before being placed in the array.
- * @param[out] corners Array of keypoints to store the results.
- * @param[in,out] num_corners Record number of corners in the array
- * @param[in] border_mode Strategy to use for borders.
- * @param[in] constant_border_value (Optional) Constant value to use for borders if border_mode is set to CONSTANT.
- */
- void configure(const ICLImage *input, float threshold, bool nonmax_suppression, ICLKeyPointArray *corners, unsigned int *num_corners,
- BorderMode border_mode, uint8_t constant_border_value = 0);
- /** Initialize the function's source, destination, conv and border_mode.
- *
- * @param[in] compile_context The compile context to be used.
- * @param[in] input Source image. Data types supported: U8.
- * @param[in] threshold Threshold on difference between intensity of the central pixel and pixels on Bresenham's circle of radius 3.
- * @param[in] nonmax_suppression If true, non-maximum suppression is applied to detected corners before being placed in the array.
- * @param[out] corners Array of keypoints to store the results.
- * @param[in,out] num_corners Record number of corners in the array
- * @param[in] border_mode Strategy to use for borders.
- * @param[in] constant_border_value (Optional) Constant value to use for borders if border_mode is set to CONSTANT.
- */
- void configure(const CLCompileContext &compile_context, const ICLImage *input, float threshold, bool nonmax_suppression, ICLKeyPointArray *corners, unsigned int *num_corners,
- BorderMode border_mode, uint8_t constant_border_value = 0);
- // Inherited methods overridden:
- void run() override;
-
-private:
- MemoryGroup _memory_group;
- CLFastCornersKernel _fast_corners_kernel;
- CLNonMaximaSuppression3x3 _suppr_func;
- CLCopyToArrayKernel _copy_array_kernel;
- CLImage _output;
- CLImage _suppr;
- Window _win;
- bool _non_max;
- unsigned int *_num_corners;
- cl::Buffer _num_buffer;
- ICLKeyPointArray *_corners;
- uint8_t _constant_border_value;
-};
-}
-#endif /*ARM_COMPUTE_CLFASTCORNERS_H */
diff --git a/arm_compute/runtime/CL/functions/CLFill.h b/arm_compute/runtime/CL/functions/CLFill.h
index bb1216054f..be1059761a 100644
--- a/arm_compute/runtime/CL/functions/CLFill.h
+++ b/arm_compute/runtime/CL/functions/CLFill.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019-2020 ARM Limited.
+ * Copyright (c) 2019-2021, 2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -26,29 +26,74 @@
#include "arm_compute/core/PixelValue.h"
#include "arm_compute/core/Types.h"
-#include "arm_compute/runtime/CL/ICLSimpleFunction.h"
+#include "arm_compute/core/Window.h"
+#include "arm_compute/runtime/IFunction.h"
+
+#include <memory>
namespace arm_compute
{
+class CLCompileContext;
class ICLTensor;
-/** Function to run @ref CLMemsetKernel to fill a tensor with a scalar value */
-class CLFill : public ICLSimpleFunction
+/** Basic function to run @ref opencl::kernels::ClFillKernel */
+class CLFill : public IFunction
{
public:
- /** Initialize the function
+ /** Constructor */
+ CLFill();
+ /** Destructor */
+ ~CLFill();
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ CLFill(const CLFill &) = delete;
+ /** Default move constructor */
+ CLFill(CLFill &&);
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ CLFill &operator=(const CLFill &) = delete;
+ /** Default move assignment operator */
+ CLFill &operator=(CLFill &&);
+ /** Initialize the kernel's tensor and filling value
+ *
+ * Valid data layouts:
+ * - All
*
- * @param[in,out] tensor Source tensor. Data types supported: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32
- * @param[in] constant_value Constant value to use to fill tensor.
+ * Valid data type configurations:
+ * |src |dst |
+ * |:--------------|:--------------|
+ * |All |All |
+ *
+ * @param[in,out] tensor Input tensor to fill. Supported data types: All.
+ * @param[in] constant_value The value used to fill the planes of the tensor
+ * @param[in] window Window to be used when setting only part of a tensor. Default is nullptr.
*/
- void configure(ICLTensor *tensor, PixelValue constant_value);
- /** Initialize the function
+ void configure(ICLTensor *tensor, const PixelValue &constant_value, Window *window = nullptr);
+ /** Initialise the kernel's tensor and filling value
*
* @param[in] compile_context The compile context to be used.
- * @param[in,out] tensor Source tensor. Data types supported: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32
- * @param[in] constant_value Constant value to use to fill tensor.
+ * @param[in,out] tensor Input tensor to fill. Supported data types: All.
+ * @param[in] constant_value The value used to fill the planes of the tensor
+ * @param[in] window Window to be used when setting only part of a tensor. Default is nullptr.
+ */
+ void configure(const CLCompileContext &compile_context,
+ ICLTensor *tensor,
+ const PixelValue &constant_value,
+ Window *window = nullptr);
+ /** Static function to check if given info will lead to a valid configuration of @ref CLFill
+ *
+ * @param[in] tensor Source tensor info. Data types supported: All.
+ * @param[in] constant_value The value used to fill the planes of the tensor
+ * @param[in] window Window to be used when setting only part of a tensor. Default is nullptr.
+ *
+ * @return a status
*/
- void configure(const CLCompileContext &compile_context, ICLTensor *tensor, PixelValue constant_value);
+ static Status validate(const ITensorInfo *tensor, const PixelValue &constant_value, Window *window = nullptr);
+
+ // Inherited methods overridden:
+ void run() override;
+
+private:
+ struct Impl;
+ std::unique_ptr<Impl> _impl;
};
} // namespace arm_compute
#endif /*ARM_COMPUTE_CLFILL_H */
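
A minimal sketch of the reworked interface, including the new optional Window argument; shapes and fill values are illustrative assumptions, not part of this patch:

    // Hypothetical sketch: fill a whole tensor, then overwrite a sub-window.
    CLTensor t;
    t.allocator()->init(TensorInfo(TensorShape(16U, 16U), 1, DataType::F32));
    t.allocator()->allocate();

    CLFill fill_all;
    fill_all.configure(&t, PixelValue(0.f));            // window = nullptr: whole tensor
    fill_all.run();

    Window win;
    win.set(Window::DimX, Window::Dimension(0, 8));     // first 8 columns only
    win.set(Window::DimY, Window::Dimension(0, 16));
    CLFill fill_part;
    fill_part.configure(&t, PixelValue(1.f), &win);
    fill_part.run();
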
diff --git a/arm_compute/runtime/CL/functions/CLFillBorder.h b/arm_compute/runtime/CL/functions/CLFillBorder.h
deleted file mode 100644
index 250806b1d7..0000000000
--- a/arm_compute/runtime/CL/functions/CLFillBorder.h
+++ /dev/null
@@ -1,58 +0,0 @@
-/*
- * Copyright (c) 2016-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CLFILLBORDER_H
-#define ARM_COMPUTE_CLFILLBORDER_H
-
-#include "arm_compute/core/PixelValue.h"
-#include "arm_compute/core/Types.h"
-#include "arm_compute/runtime/CL/ICLSimpleFunction.h"
-
-namespace arm_compute
-{
-class ICLTensor;
-
-/** Basic function to run @ref CLFillBorderKernel */
-class CLFillBorder : public ICLSimpleFunction
-{
-public:
- /** Initialize the function
- *
- * @param[in,out] tensor Source tensor. Data types supported: U8/S16
- * @param[in] border_width The border width
- * @param[in] border_mode Strategy to use for borders.
- * @param[in] constant_border_value (Optional) Constant value to use for borders if border_mode is set to CONSTANT.
- */
- void configure(ICLTensor *tensor, unsigned int border_width, BorderMode border_mode, const PixelValue &constant_border_value = PixelValue());
- /** Initialize the function
- *
- * @param[in] compile_context The compile context to be used.
- * @param[in,out] tensor Source tensor. Data types supported: U8/S16
- * @param[in] border_width The border width
- * @param[in] border_mode Strategy to use for borders.
- * @param[in] constant_border_value (Optional) Constant value to use for borders if border_mode is set to CONSTANT.
- */
- void configure(const CLCompileContext &compile_context, ICLTensor *tensor, unsigned int border_width, BorderMode border_mode, const PixelValue &constant_border_value = PixelValue());
-};
-}
-#endif /*ARM_COMPUTE_FILLBORDER_H */
diff --git a/arm_compute/runtime/CL/functions/CLFlattenLayer.h b/arm_compute/runtime/CL/functions/CLFlattenLayer.h
index 98cf49af48..182e97dc67 100644
--- a/arm_compute/runtime/CL/functions/CLFlattenLayer.h
+++ b/arm_compute/runtime/CL/functions/CLFlattenLayer.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2017-2021 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -25,22 +25,40 @@
#define ARM_COMPUTE_CLFLATTENLAYER_H
#include "arm_compute/core/Types.h"
-#include "arm_compute/runtime/CL/ICLSimpleFunction.h"
+#include "arm_compute/runtime/IFunction.h"
+#include <memory>
namespace arm_compute
{
+class CLCompileContext;
class ICLTensor;
+class ITensorInfo;
-/** Basic function to execute flatten. This function calls the following OpenCL kernel:
-*
-* -# @ref CLFlattenLayerKernel
-*
-*/
-class CLFlattenLayer : public ICLSimpleFunction
+/** Basic function to execute flatten */
+class CLFlattenLayer : public IFunction
{
public:
+ CLFlattenLayer();
+ /** Destructor */
+ ~CLFlattenLayer();
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ CLFlattenLayer(const CLFlattenLayer &) = delete;
+ /** Default move constructor */
+ CLFlattenLayer(CLFlattenLayer &&);
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ CLFlattenLayer &operator=(const CLFlattenLayer &) = delete;
+ /** Default move assignment operator */
+ CLFlattenLayer &operator=(CLFlattenLayer &&);
/** Initialise the kernel's input and output.
*
+ * Valid data layouts:
+ * - All
+ *
+ * Valid data type configurations:
+ * |src |dst |
+ * |:--------------|:--------------|
+ * |All |All |
+ *
* @param[in] input First input tensor to flatten with at least 3 dimensions.
* The dimensions above the third will be interpreted as batches. Data types supported: All.
* @param[out] output Output tensor with shape [w*h*d, input_batches] where:
@@ -66,6 +84,13 @@ public:
* @return a status
*/
static Status validate(const ITensorInfo *input, const ITensorInfo *output);
+
+ // Inherited methods overridden:
+ void run() override;
+
+private:
+ struct Impl;
+ std::unique_ptr<Impl> _impl;
};
} // namespace arm_compute
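
A short usage sketch of the pImpl-based flatten function; the input shape is an illustrative assumption:

    // Hypothetical sketch: flatten [8, 8, 3, 2] into [8*8*3, 2] = [192, 2].
    CLTensor input, output;
    input.allocator()->init(TensorInfo(TensorShape(8U, 8U, 3U, 2U), 1, DataType::F32));
    CLFlattenLayer flatten;
    flatten.configure(&input, &output);                 // output info is inferred
    input.allocator()->allocate();
    output.allocator()->allocate();
    flatten.run();
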
diff --git a/arm_compute/runtime/CL/functions/CLFloor.h b/arm_compute/runtime/CL/functions/CLFloor.h
index 2844a5642b..4d3d704857 100644
--- a/arm_compute/runtime/CL/functions/CLFloor.h
+++ b/arm_compute/runtime/CL/functions/CLFloor.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2017-2021 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -24,20 +24,44 @@
#ifndef ARM_COMPUTE_CLFLOOR_H
#define ARM_COMPUTE_CLFLOOR_H
-#include "arm_compute/runtime/CL/ICLSimpleFunction.h"
-
#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/IFunction.h"
+
+#include <memory>
namespace arm_compute
{
+class CLCompileContext;
class ICLTensor;
+class ITensorInfo;
-/** Basic function to run @ref CLFloorKernel */
-class CLFloor : public ICLSimpleFunction
+/** Basic function to run @ref opencl::kernels::ClFloorKernel */
+class CLFloor : public IFunction
{
public:
+ /** Constructor */
+ CLFloor();
+ /** Destructor */
+ ~CLFloor();
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ CLFloor(const CLFloor &) = delete;
+ /** Default move constructor */
+ CLFloor(CLFloor &&);
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ CLFloor &operator=(const CLFloor &) = delete;
+ /** Default move assignment operator */
+ CLFloor &operator=(CLFloor &&);
 /** Set the source and destination of the kernel
*
+ * Valid data layouts:
+ * - All
+ *
+ * Valid data type configurations:
+ * |src |dst |
+ * |:------|:------|
+ * |F32 |F32 |
+ * |F16 |F16 |
+ *
* @param[in] input Source tensor. Data type supported: F16/F32.
* @param[out] output Destination tensor. Same as @p input
*/
@@ -57,6 +81,13 @@ public:
* @return a status
*/
static Status validate(const ITensorInfo *input, const ITensorInfo *output);
+
+ // Inherited methods overridden:
+ void run() override;
+
+private:
+ struct Impl;
+ std::unique_ptr<Impl> _impl;
};
-}
+} // namespace arm_compute
#endif /* ARM_COMPUTE_CLFLOOR_H */
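
A sketch pairing the static validate() with the function itself, under assumed shapes; not part of this patch:

    // Hypothetical sketch: validate first, then run an element-wise floor.
    CLTensor src, dst;
    src.allocator()->init(TensorInfo(TensorShape(32U), 1, DataType::F32));
    dst.allocator()->init(TensorInfo(TensorShape(32U), 1, DataType::F32));
    if(CLFloor::validate(src.info(), dst.info()).error_code() == ErrorCode::OK)
    {
        CLFloor floor_fn;
        floor_fn.configure(&src, &dst);
        src.allocator()->allocate();
        dst.allocator()->allocate();
        floor_fn.run();
    }
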
diff --git a/arm_compute/runtime/CL/functions/CLFullyConnectedLayer.h b/arm_compute/runtime/CL/functions/CLFullyConnectedLayer.h
index 188117f674..9fd0b4aaef 100644
--- a/arm_compute/runtime/CL/functions/CLFullyConnectedLayer.h
+++ b/arm_compute/runtime/CL/functions/CLFullyConnectedLayer.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2017-2021, 2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -24,113 +24,19 @@
#ifndef ARM_COMPUTE_CLFULLYCONNECTEDLAYER_H
#define ARM_COMPUTE_CLFULLYCONNECTEDLAYER_H
-#include "arm_compute/runtime/CL/ICLSimpleFunction.h"
-
-#include "arm_compute/core/CL/kernels/CLTransposeKernel.h"
+#include "arm_compute/function_info/FullyConnectedLayerInfo.h"
#include "arm_compute/runtime/CL/CLTensor.h"
-#include "arm_compute/runtime/CL/functions/CLConvertFullyConnectedWeights.h"
-#include "arm_compute/runtime/CL/functions/CLFlattenLayer.h"
-#include "arm_compute/runtime/CL/functions/CLGEMM.h"
-#include "arm_compute/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.h"
+#include "arm_compute/runtime/IFunction.h"
#include "arm_compute/runtime/IWeightsManager.h"
#include "arm_compute/runtime/MemoryGroup.h"
namespace arm_compute
{
-/** Basic function to reshape the weights of Fully Connected layer with OpenCL. This function calls the following kernels:
- *
- * -# @ref CLTransposeKernel
- *
- * @note The fully connected layer accepts "weights" tensors only with 2 dimensions.
- */
-class CLFullyConnectedLayerReshapeWeights : public ICLSimpleFunction
-{
-public:
- /** Set the input and output tensors.
- *
- * @param[in] input Weights tensor. The weights must be 2 dimensional. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32.
- * @param[out] output Destination tensor which stores the transposed input tensor. Data type supported: Same as @p input.
- */
- void configure(const ICLTensor *input, ICLTensor *output);
- /** Set the input and output tensors.
- *
- * @param[in] compile_context The compile context to be used.
- * @param[in] input Weights tensor. The weights must be 2 dimensional. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32.
- * @param[out] output Destination tensor which stores the transposed input tensor. Data type supported: Same as @p input.
- */
- void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output);
- /** Static function to check if given info will lead to a valid configuration of @ref CLFullyConnectedLayerReshapeWeights
- *
- * @param[in] input Weights tensor. The weights must be 2 dimensional. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32.
- * @param[in] output Destination tensor which stores the transposed input tensor. Data type supported: Same as @p input.
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *input, const ITensorInfo *output);
-};
-
-namespace weights_transformations
-{
-/** Basic function to manage the reshape weights generated from @ref CLFullyConnectedLayerReshapeWeights */
-class CLFullyConnectedLayerReshapeWeightsManaged : public ITransformWeights
-{
-public:
- //Inherited method override
- void run() override
- {
- _output.allocator()->allocate();
- _func.run();
- _reshape_run = true;
- }
-
- //Inherited method override
- void release() override
- {
- _output.allocator()->free();
- }
-
- //Inherited method override
- ICLTensor *get_weights() override
- {
- return &_output;
- }
-
- //Inherited method override
- uint32_t uid() override
- {
- return _uid;
- }
-
- /** Configures the @ref CLFullyConnectedLayerReshapeWeights function
- *
- * @param[in] input Source tensor. Data type supported: QASYMM8/QASYMM8_SIGNED/F16/F32.
- */
- void configure(const ICLTensor *input)
- {
- configure(CLKernelLibrary::get().get_compile_context(), input);
- }
- /** Configures the @ref CLFullyConnectedLayerReshapeWeights function
- *
- * @param[in] compile_context The compile context to be used.
- * @param[in] input Source tensor. Data type supported: QASYMM8/QASYMM8_SIGNED/F16/F32.
- */
- void configure(const CLCompileContext &compile_context, const ICLTensor *input)
- {
- _func.configure(compile_context, input, &_output);
- }
-
-private:
- static constexpr uint32_t _uid = 0x0;
- CLTensor _output{};
- CLFullyConnectedLayerReshapeWeights _func{};
-};
-} // namespace weights_transformations
-
/** Basic function to compute a Fully Connected layer on OpenCL. This function calls the following OpenCL kernels:
*
- * -# @ref CLIm2ColKernel (called when the input comes from a convolutional layer)
- * -# @ref CLFullyConnectedLayerReshapeWeights (if @p are_weights_reshaped is set to false and transpose_weights is set to true ) (called once)
- * -# @ref CLGEMMMatrixMultiplyKernel or @ref CLGEMMLowpMatrixMultiplyCore (if quantized asymmetric)
+ * -# @ref opencl::kernels::ClIm2ColKernel (called when the input comes from a convolutional layer)
+ * -# @ref CLTranspose (if @p are_weights_reshaped is set to false and @p transpose_weights is set to true) (called once)
+ * -# @ref opencl::ClGemm or @ref CLGEMMLowpMatrixMultiplyCore (if quantized asymmetric)
*
* @note The fully connected layer accepts "weights" tensors only with 2 dimensions.
*/
@@ -138,7 +44,10 @@ class CLFullyConnectedLayer : public IFunction
{
public:
/** Constructor */
- CLFullyConnectedLayer(std::shared_ptr<IMemoryManager> memory_manager = nullptr, IWeightsManager *weights_manager = nullptr);
+ CLFullyConnectedLayer(std::shared_ptr<IMemoryManager> memory_manager = nullptr,
+ IWeightsManager *weights_manager = nullptr);
+ /** Default destructor */
+ ~CLFullyConnectedLayer();
/** Prevent instances of this class from being copied (As this class contains pointers) */
CLFullyConnectedLayer(const CLFullyConnectedLayer &) = delete;
/** Default move constructor */
@@ -149,21 +58,17 @@ public:
CLFullyConnectedLayer &operator=(CLFullyConnectedLayer &&) = default;
/** Set the input and output tensors.
*
- * @param[in] input Source tensor. Data type supported: QASYMM8/QASYMM8_SIGNED/F16/F32.
- * @param[in] weights Weights tensor. The weights must be 2 dimensional.
- * If this function is called after a Convolution Layer, the (transposed) weights will have as many rows as the product of the first 3 input's dimensions.
- * If it is called after another FullyConnected Layer, the (transposed) weights will have as many rows as the input's first dimension.
- * Data type supported: Same as @p input.
- * @param[in] biases Bias tensor. Can be nullptr. Data type supported:Same as @p input.
- * @param[out] output Destination tensor. Its shape should be equal to the output of a matrix multiplication between:
- * - The output of im2col on the input and the (transposed) 2D weights, if the function is called after a Convolution Layer
- * - The input tensor and the (transposed) 2D weights, if the function is called after another FullyConnected Layer.
- * Data type supported: Same as @p input.
- * @param[in] fc_info (Optional) Fully connected layer additional info
- */
- void configure(const ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output,
- FullyConnectedLayerInfo fc_info = FullyConnectedLayerInfo());
- /** Set the input and output tensors.
+ * Valid data layouts:
+ * - NHWC
+ * - NCHW
+ *
+ * Valid data type configurations:
+ * |src0 |src1 |src2 |dst |
+ * |:--------------|:------------------|:------|:--------------|
+ * |F16 |F16 |F16 |F16 |
+ * |F32 |F32 |F32 |F32 |
+ * |QASYMM8 |QASYMM8 |S32 |QASYMM8 |
+ * |QASYMM8_SIGNED |QASYMM8_SIGNED |S32 |QASYMM8_SIGNED |
*
* @param[in] compile_context The compile context to be used.
* @param[in] input Source tensor. Data type supported: QASYMM8/QASYMM8_SIGNED/F16/F32.
@@ -178,25 +83,31 @@ public:
* Data type supported: Same as @p input.
* @param[in] fc_info (Optional) Fully connected layer additional info
*/
- void configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output,
+ void configure(const CLCompileContext &compile_context,
+ const ICLTensor *input,
+ const ICLTensor *weights,
+ const ICLTensor *biases,
+ ICLTensor *output,
+ FullyConnectedLayerInfo fc_info = FullyConnectedLayerInfo());
+ /** Set the input and output tensors.
+ *
+ * Similar to @ref CLFullyConnectedLayer
+ */
+ void configure(const ICLTensor *input,
+ const ICLTensor *weights,
+ const ICLTensor *biases,
+ ICLTensor *output,
FullyConnectedLayerInfo fc_info = FullyConnectedLayerInfo());
/** Static function to check if given info will lead to a valid configuration of @ref CLFullyConnectedLayer
*
- * @param[in] input Source tensor info. Data type supported: QASYMM8/QASYMM8_SIGNED/F16/F32.
- * @param[in] weights Weights tensor info. The weights must be 2 dimensional.
- * If this function is called after a Convolution Layer, the (transposed) weights will have as many rows as the product of the first 3 input's dimensions.
- * If it is called after another FullyConnected Layer, the (transposed) weights will have as many rows as the input's first dimension.
- * Data type supported: Same as @p input.
- * @param[in] biases Bias tensor info. Can be nullptr. Data type supported:Same as @p input.
- * @param[out] output Destination tensor info. Its shape should be equal to the output of a matrix multiplication between:
- * - The output of im2col on the input and the (transposed) 2D weights, if the function is called after a Convolution Layer
- * - The input tensor and the (transposed) 2D weights, if the function is called after another FullyConnected Layer.
- * Data type supported: Same as @p input.
- * @param[in] fc_info (Optional) Fully connected layer additional info
+ * Similar to @ref CLFullyConnectedLayer
*
* @return a status
*/
- static Status validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output,
+ static Status validate(const ITensorInfo *input,
+ const ITensorInfo *weights,
+ const ITensorInfo *biases,
+ const ITensorInfo *output,
FullyConnectedLayerInfo fc_info = FullyConnectedLayerInfo());
//Inherited methods override
@@ -204,28 +115,8 @@ public:
void prepare() override;
private:
- void configure_fc_fc(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *weights, const ICLTensor *bias, ICLTensor *output, const FullyConnectedLayerInfo &fc_info);
- void configure_conv_fc(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *weights, const ICLTensor *bias, ICLTensor *output, const FullyConnectedLayerInfo &fc_info);
- void configure_mm(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *weights, const ICLTensor *bias, ICLTensor *output, const FullyConnectedLayerInfo &fc_info);
-
- MemoryGroup _memory_group;
- IWeightsManager *_weights_manager;
- CLConvertFullyConnectedWeights _convert_weights;
- weights_transformations::CLConvertFullyConnectedWeightsManaged _convert_weights_managed;
- weights_transformations::CLFullyConnectedLayerReshapeWeightsManaged _reshape_weights_managed_function;
- CLFlattenLayer _flatten_layer;
- CLFullyConnectedLayerReshapeWeights _reshape_weights_function;
- CLGEMM _mm_gemm;
- CLGEMMLowpMatrixMultiplyCore _mm_gemmlowp;
- CLTensor _flatten_output;
- CLTensor _converted_weights_output;
- CLTensor _reshape_weights_output;
- bool _are_weights_converted;
- bool _are_weights_reshaped;
- bool _is_fc_after_conv;
- bool _is_quantized;
- bool _is_prepared;
- const ICLTensor *_original_weights;
+ struct Impl;
+ std::unique_ptr<Impl> _impl;
};
} // namespace arm_compute
#endif /* ARM_COMPUTE_CLFULLYCONNECTEDLAYER_H */
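
A minimal sketch of the simplified front end; the shapes follow the @param description above but are otherwise illustrative assumptions, not part of this patch:

    // Hypothetical sketch: fully connected layer, 128 features -> 10 outputs, batch of 4.
    CLTensor input, weights, biases, output;
    input.allocator()->init(TensorInfo(TensorShape(128U, 4U), 1, DataType::F32));
    weights.allocator()->init(TensorInfo(TensorShape(128U, 10U), 1, DataType::F32)); // transposed internally by default
    biases.allocator()->init(TensorInfo(TensorShape(10U), 1, DataType::F32));
    CLFullyConnectedLayer fc;
    fc.configure(&input, &weights, &biases, &output);   // default FullyConnectedLayerInfo
    // ... allocate and fill the tensors, then:
    fc.run();
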
diff --git a/arm_compute/runtime/CL/functions/CLFuseBatchNormalization.h b/arm_compute/runtime/CL/functions/CLFuseBatchNormalization.h
index 9057440fc6..2e777273cd 100644
--- a/arm_compute/runtime/CL/functions/CLFuseBatchNormalization.h
+++ b/arm_compute/runtime/CL/functions/CLFuseBatchNormalization.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018-2020 ARM Limited.
+ * Copyright (c) 2018-2021 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -24,14 +24,18 @@
#ifndef ARM_COMPUTE_CLFUSEBATCHNORMALIZATION_H
#define ARM_COMPUTE_CLFUSEBATCHNORMALIZATION_H
-#include "arm_compute/core/CL/kernels/CLFuseBatchNormalizationKernel.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/runtime/IFunction.h"
+#include <memory>
+
namespace arm_compute
{
// Forward declarations
+class CLCompileContext;
+class CLFuseBatchNormalizationKernel;
class ICLTensor;
+class ITensorInfo;
/** Basic function to fuse the batch normalization node to a preceding convolution node */
class CLFuseBatchNormalization : public IFunction
@@ -48,9 +52,19 @@ public:
/** Allow instances of this class to be moved */
CLFuseBatchNormalization &operator=(CLFuseBatchNormalization &&) = default;
/** Default destructor */
- ~CLFuseBatchNormalization() = default;
+ ~CLFuseBatchNormalization();
/** Set the input and output tensors.
*
+ * Valid data layouts:
+ * - NHWC
+ * - NCHW
+ *
+ * Valid data type configurations:
+ * |src |dst |
+ * |:--------------|:--------------|
+ * |F32 |F32 |
+ * |F16 |F16 |
+ *
* @param[in] input_weights Input weights tensor for convolution or depthwise convolution layer. Data type supported: F16/F32. Data layout supported: NCHW, NHWC
* @param[in] bn_mean Batch normalization layer mean tensor. Same as @p input_weights
* @param[in] bn_var Batch normalization layer variance tensor. Same as @p input_weights
@@ -64,9 +78,16 @@ public:
* @param[in] epsilon (Optional) Batch normalization layer epsilon parameter. Defaults to 0.001f.
* @param[in] fbn_type (Optional) Fused batch normalization type. Defaults to Convolution.
*/
- void configure(const ICLTensor *input_weights, const ICLTensor *bn_mean, const ICLTensor *bn_var, ICLTensor *fused_weights, ICLTensor *fused_bias,
- const ICLTensor *input_bias = nullptr, const ICLTensor *bn_beta = nullptr, const ICLTensor *bn_gamma = nullptr,
- float epsilon = 0.001f, FuseBatchNormalizationType fbn_type = FuseBatchNormalizationType::CONVOLUTION);
+ void configure(const ICLTensor *input_weights,
+ const ICLTensor *bn_mean,
+ const ICLTensor *bn_var,
+ ICLTensor *fused_weights,
+ ICLTensor *fused_bias,
+ const ICLTensor *input_bias = nullptr,
+ const ICLTensor *bn_beta = nullptr,
+ const ICLTensor *bn_gamma = nullptr,
+ float epsilon = 0.001f,
+ FuseBatchNormalizationType fbn_type = FuseBatchNormalizationType::CONVOLUTION);
/** Set the input and output tensors.
*
* @param[in] compile_context The compile context to be used.
@@ -83,9 +104,17 @@ public:
* @param[in] epsilon (Optional) Batch normalization layer epsilon parameter. Defaults to 0.001f.
* @param[in] fbn_type (Optional) Fused batch normalization type. Defaults to Convolution.
*/
- void configure(const CLCompileContext &compile_context, const ICLTensor *input_weights, const ICLTensor *bn_mean, const ICLTensor *bn_var, ICLTensor *fused_weights, ICLTensor *fused_bias,
- const ICLTensor *input_bias = nullptr, const ICLTensor *bn_beta = nullptr, const ICLTensor *bn_gamma = nullptr,
- float epsilon = 0.001f, FuseBatchNormalizationType fbn_type = FuseBatchNormalizationType::CONVOLUTION);
+ void configure(const CLCompileContext &compile_context,
+ const ICLTensor *input_weights,
+ const ICLTensor *bn_mean,
+ const ICLTensor *bn_var,
+ ICLTensor *fused_weights,
+ ICLTensor *fused_bias,
+ const ICLTensor *input_bias = nullptr,
+ const ICLTensor *bn_beta = nullptr,
+ const ICLTensor *bn_gamma = nullptr,
+ float epsilon = 0.001f,
+ FuseBatchNormalizationType fbn_type = FuseBatchNormalizationType::CONVOLUTION);
/** Static function to check if given info will lead to a valid configuration of @ref CLFuseBatchNormalization
*
* @param[in] input_weights Input weights tensor info for convolution or depthwise convolution layer. Data type supported: F16/F32. Data layout supported: NCHW, NHWC
@@ -103,16 +132,22 @@ public:
*
* @return a status
*/
- static Status validate(const ITensorInfo *input_weights, const ITensorInfo *bn_mean, const ITensorInfo *bn_var,
- const ITensorInfo *fused_weights, const ITensorInfo *fused_bias,
- const ITensorInfo *input_bias = nullptr, const ITensorInfo *bn_beta = nullptr, const ITensorInfo *bn_gamma = nullptr,
- float epsilon = 0.001f, FuseBatchNormalizationType fbn_type = FuseBatchNormalizationType::CONVOLUTION);
+ static Status validate(const ITensorInfo *input_weights,
+ const ITensorInfo *bn_mean,
+ const ITensorInfo *bn_var,
+ const ITensorInfo *fused_weights,
+ const ITensorInfo *fused_bias,
+ const ITensorInfo *input_bias = nullptr,
+ const ITensorInfo *bn_beta = nullptr,
+ const ITensorInfo *bn_gamma = nullptr,
+ float epsilon = 0.001f,
+ FuseBatchNormalizationType fbn_type = FuseBatchNormalizationType::CONVOLUTION);
// Inherited methods overridden:
void run() override;
private:
- CLFuseBatchNormalizationKernel _fuse_bn_kernel;
+ std::unique_ptr<CLFuseBatchNormalizationKernel> _fuse_bn_kernel;
};
} // namespace arm_compute
#endif /*ARM_COMPUTE_CLFUSEBATCHNORMALIZATION_H */
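
A sketch of fusing batch-normalization statistics into convolution weights ahead of time; tensor initialisation is elided and all tensor names are assumptions:

    // Hypothetical sketch: fold BN mean/variance into conv weights and bias.
    // conv_w: [kernel_x, kernel_y, IFM, OFM] F32; bn_mean/bn_var: [OFM] F32.
    CLFuseBatchNormalization fuse;
    fuse.configure(&conv_w, &bn_mean, &bn_var, &fused_w, &fused_b,
                   nullptr,                              // no original conv bias
                   nullptr, nullptr,                     // default beta = 0, gamma = 1
                   0.001f,                               // epsilon
                   FuseBatchNormalizationType::CONVOLUTION);
    fuse.run();
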
diff --git a/arm_compute/runtime/CL/functions/CLGEMM.h b/arm_compute/runtime/CL/functions/CLGEMM.h
index f5588112ae..f5e6aa1237 100644
--- a/arm_compute/runtime/CL/functions/CLGEMM.h
+++ b/arm_compute/runtime/CL/functions/CLGEMM.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2016-2020 ARM Limited.
+ * Copyright (c) 2016-2021, 2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -24,11 +24,7 @@
#ifndef ARM_COMPUTE_CLGEMM_H
#define ARM_COMPUTE_CLGEMM_H
-#include "arm_compute/core/CL/kernels/CLGEMMMatrixMultiplyKernel.h"
-#include "arm_compute/core/CL/kernels/CLGEMMMatrixMultiplyReshapedKernel.h"
-#include "arm_compute/core/CL/kernels/CLGEMMMatrixMultiplyReshapedOnlyRHSKernel.h"
-#include "arm_compute/core/CL/kernels/CLGEMMReshapeLHSMatrixKernel.h"
-#include "arm_compute/core/CL/kernels/CLGEMMReshapeRHSMatrixKernel.h"
+#include "arm_compute/function_info/GEMMInfo.h"
#include "arm_compute/runtime/CL/CLTensor.h"
#include "arm_compute/runtime/CL/CLTypes.h"
#include "arm_compute/runtime/IFunction.h"
@@ -36,79 +32,16 @@
#include "arm_compute/runtime/IWeightsManager.h"
#include "arm_compute/runtime/MemoryGroup.h"
+#include <memory>
+
namespace arm_compute
{
+// Forward declarations
+class CLCompileContext;
class ICLTensor;
+class ITensorInfo;
-namespace weights_transformations
-{
-/** Basic function to manage the reshape weights generated from @ref CLGEMMReshapeRHSMatrixKernel */
-class CLGEMMReshapeRHSMatrixKernelManaged : public ITransformWeights
-{
-public:
- //Inherited method override
- void run() override
- {
- _output.allocator()->allocate();
- CLScheduler::get().enqueue(_kernel, false);
- _reshape_run = true;
- }
-
- //Inherited method override
- void release() override
- {
- _output.allocator()->free();
- }
-
- //Inherited method override
- ICLTensor *get_weights() override
- {
- return &_output;
- }
-
- //Inherited method override
- uint32_t uid() override
- {
- return _uid;
- }
-
- /** Configures the @ref CLGEMMReshapeRHSMatrixKernel kernel
- *
- * @param[in] input Input tensor. Data types supported: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32
- * @param[in] info RHS matrix information to be used for reshaping.
- */
- void configure(const ICLTensor *input, GEMMRHSMatrixInfo info)
- {
- configure(CLKernelLibrary::get().get_compile_context(), input, info);
- }
-
- /** Configures the @ref CLGEMMReshapeRHSMatrixKernel kernel
- *
- * @param[in] compile_context The compile context to be used.
- * @param[in] input Input tensor. Data types supported: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32
- * @param[in] info RHS matrix information to be used for reshaping.
- */
- void configure(const CLCompileContext &compile_context, const ICLTensor *input, GEMMRHSMatrixInfo info)
- {
- _kernel.configure(compile_context, input, &_output, info);
- }
-
-private:
- static constexpr uint32_t _uid = 0x15;
- CLTensor _output{};
- CLGEMMReshapeRHSMatrixKernel _kernel{};
-};
-} // namespace weights_transformations
-
-/** Basic function to execute GEMM on OpenCL. This function calls the following OpenCL kernels:
- *
- * -# @ref CLGEMMReshapeLHSMatrixKernel (only if the RESHAPED_V1 is selected by the heuristic model)
- * -# @ref CLGEMMReshapeRHSMatrixKernel (only if either the RESHAPED_V1 or RESHAPED_ONLY_RHS is selected by the select_gemm_kernel method())
- * -# @ref CLGEMMMatrixMultiplyKernel (only if either the NATIVE or RESHAPED_V1 is selected by the select_gemm_kernel method())
- * -# @ref CLGEMMMatrixMultiplyReshapedKernel (only if RESHAPED_V1 is selected by the select_gemm_kernel method())
- * -# @ref CLGEMMMatrixMultiplyReshapedOnlyRHSKernel (only if RESHAPED_ONLY_RHS is selected by the select_gemm_kernel method())
- *
- */
+/** Basic function to execute GEMM on OpenCL */
class CLGEMM : public IFunction
{
public:
@@ -118,34 +51,26 @@ public:
* @param[in] weights_manager (Optional) Weights manager.
*/
CLGEMM(std::shared_ptr<IMemoryManager> memory_manager = nullptr, IWeightsManager *weights_manager = nullptr);
+ /** Default destructor */
+ ~CLGEMM();
/** Prevent instances of this class from being copied (As this class contains pointers) */
CLGEMM(const CLGEMM &) = delete;
/** Default move constructor */
- CLGEMM(CLGEMM &&) = default;
+ CLGEMM(CLGEMM &&);
/** Prevent instances of this class from being copied (As this class contains pointers) */
CLGEMM &operator=(const CLGEMM &) = delete;
/** Default move assignment operator */
- CLGEMM &operator=(CLGEMM &&) = default;
+ CLGEMM &operator=(CLGEMM &&);
/** Initialise the kernel's inputs and output
*
- * @note GEMM: General Matrix Multiply - [alpha * A * B + beta * C].
- *
- * @note All tensors must have the same data type.
+ * Valid data layouts:
+ * - All
*
- * @note Whilst the first input tensor can be a vector, the second input tensor must be at least a matrix
- *
- * @param[in] a First input tensor (Matrix or Vector A). Data types supported: F16/F32
- * @param[in] b Second input tensor (Matrix B). Data type supported: same as @p a.
- * @param[in] c Third input tensor (Matrix C). It can be a nullptr if just the multiplication between @p a and @p b is needed. Data type supported: same as @p a.
- * @param[out] output Output tensor. Data type supported: same as @p a
- * @param[in] alpha Weight of the matrix product
- * @param[in] beta Weight of matrix C
- * @param[in] gemm_info (Optional) Specifies if the matrix A and/or matrix B have been reshaped and
- * if the reshape of matrix B should happen only for the first run. GEMMInfo also contains information about the reshaping
- * in case matrix A and matrix B have been already transformed.
- */
- void configure(const ICLTensor *a, const ICLTensor *b, const ICLTensor *c, ICLTensor *output, float alpha, float beta, const GEMMInfo &gemm_info = GEMMInfo());
- /** Initialise the kernel's inputs and output
+ * Valid data type configurations:
+ * |src0 |src1 |src2 |dst |
+ * |:------------|:-----------|:---------|:--------------|
+ * |F32 |F32 |F32 |F32 |
+ * |F16 |F16 |F16 |F16 |
*
* @note GEMM: General Matrix Multiply - [alpha * A * B + beta * C].
*
@@ -153,6 +78,9 @@ public:
*
* @note Whilst the first input tensor can be a vector, the second input tensor must be at least a matrix
*
+ * @note Batched GEMM only allows RHS tensor's rank to be <= 3
+ * @note Batched GEMM only supports broadcasting cases where RHS rank < LHS rank but not the other way around
+ *
* @param[in] compile_context The compile context to be used.
* @param[in] a First input tensor (Matrix or Vector A). Data types supported: F16/F32
* @param[in] b Second input tensor (Matrix B). Data type supported: same as @p a.
@@ -161,57 +89,51 @@ public:
* @param[in] alpha Weight of the matrix product
* @param[in] beta Weight of matrix C
* @param[in] gemm_info (Optional) Specifies if the matrix A and/or matrix B have been reshaped and
- * if the reshape of matrix B should happen only for the first run. GEMMInfo also contains information about the reshaping
- * in case matrix A and matrix B have been already transformed.
+ * if the reshape of matrix B should happen only for the first run. GEMMInfo also contains information about the reshaping
+ * in case matrix A and matrix B have been already transformed.
+ */
+ void configure(const CLCompileContext &compile_context,
+ const ICLTensor *a,
+ const ICLTensor *b,
+ const ICLTensor *c,
+ ICLTensor *output,
+ float alpha,
+ float beta,
+ const GEMMInfo &gemm_info = GEMMInfo());
+
+ /** Initialise the kernel's inputs and output
+ *
+ * Similar to @ref CLGEMM::configure()
*/
- void configure(const CLCompileContext &compile_context, const ICLTensor *a, const ICLTensor *b, const ICLTensor *c, ICLTensor *output, float alpha, float beta, const GEMMInfo &gemm_info = GEMMInfo());
+ void configure(const ICLTensor *a,
+ const ICLTensor *b,
+ const ICLTensor *c,
+ ICLTensor *output,
+ float alpha,
+ float beta,
+ const GEMMInfo &gemm_info = GEMMInfo());
+
/** Static function to check if given info will lead to a valid configuration of @ref CLGEMM.
*
- * @param[in] a First input tensor info (Matrix or Vector A). Data types supported: F16/F32
- * @param[in] b Second input tensor info (Matrix B). Data type supported: same as @p a.
- * @param[in] c Third input tensor info (Matrix C). It can be a nullptr if just the multiplication between @p a and @p b is needed. Data type supported: same as @p a.
- * @param[in] output Output tensor info. Data type supported: same as @p a
- * @param[in] alpha Weight of the matrix product
- * @param[in] beta Weight of matrix C
- * @param[in] gemm_info (Optional) Specifies if the matrix A and/or matrix B have been reshaped and
- * if the reshape of matrix B should happen only for the first run
+ * Similar to @ref CLGEMM::configure()
*
* @return a status
*/
- static Status validate(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, const ITensorInfo *output, float alpha, float beta, const GEMMInfo &gemm_info = GEMMInfo());
+ static Status validate(const ITensorInfo *a,
+ const ITensorInfo *b,
+ const ITensorInfo *c,
+ const ITensorInfo *output,
+ float alpha,
+ float beta,
+ const GEMMInfo &gemm_info = GEMMInfo());
// Inherited methods overridden:
void run() override;
void prepare() override;
private:
- static CLGEMMKernelType select_gemm_kernel(unsigned int m, unsigned int n, unsigned int k, DataType data_type, bool reshape_b_only_on_first_run);
-
- void configure_native_v1(const CLCompileContext &compile_context, const ICLTensor *a, const ICLTensor *b, const ICLTensor *c, ICLTensor *output, float alpha, float beta, const GEMMInfo &gemm_info);
- void configure_reshaped_v1(const CLCompileContext &compile_context, const ICLTensor *a, const ICLTensor *b, const ICLTensor *c, ICLTensor *output, float alpha, float beta, const GEMMInfo &gemm_info);
- void configure_reshaped_v2(const CLCompileContext &compile_context, const ICLTensor *a, const ICLTensor *b, const ICLTensor *c, ICLTensor *output, float alpha, float beta, const GEMMInfo &gemm_info);
- void configure_reshaped_only_rhs(const CLCompileContext &compile_context, const ICLTensor *a, const ICLTensor *b, const ICLTensor *c, ICLTensor *output, float alpha, float beta,
- const GEMMInfo &gemm_info);
-
- static Status validate_native_v1(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, const ITensorInfo *output, float alpha, float beta, const GEMMInfo &gemm_info);
- static Status validate_reshaped_v1(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, const ITensorInfo *output, float alpha, float beta, const GEMMInfo &gemm_info);
- static Status validate_reshaped(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, const ITensorInfo *output, float alpha, float beta, const GEMMInfo &gemm_info);
- static Status validate_reshaped_only_rhs(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, const ITensorInfo *output, float alpha, float beta, const GEMMInfo &gemm_info);
-
- MemoryGroup _memory_group;
- IWeightsManager *_weights_manager;
- CLGEMMMatrixMultiplyKernel _mm_kernel;
- CLGEMMReshapeLHSMatrixKernel _reshape_lhs_kernel;
- CLGEMMReshapeRHSMatrixKernel _reshape_rhs_kernel;
- weights_transformations::CLGEMMReshapeRHSMatrixKernelManaged _reshape_rhs_kernel_managed;
- CLGEMMMatrixMultiplyReshapedKernel _mm_reshaped_kernel;
- CLGEMMMatrixMultiplyReshapedOnlyRHSKernel _mm_reshaped_only_rhs_kernel;
- CLTensor _tmp_a;
- CLTensor _tmp_b;
- const ICLTensor *_original_b;
- bool _reshape_b_only_on_first_run;
- bool _is_prepared;
- CLGEMMKernelType _gemm_kernel_type;
+ struct Impl;
+ std::unique_ptr<Impl> _impl;
};
} // namespace arm_compute
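
A minimal sketch of the slimmed-down CLGEMM front end computing alpha * A * B + beta * C; matrix sizes are illustrative assumptions, not part of this patch:

    // Hypothetical sketch: D = 1.0 * A(32x64) * B(64x16) + 0.5 * C(32x16).
    // TensorShape is (width, height), i.e. (columns, rows).
    CLTensor a, b, c, d;
    a.allocator()->init(TensorInfo(TensorShape(64U, 32U), 1, DataType::F32));
    b.allocator()->init(TensorInfo(TensorShape(16U, 64U), 1, DataType::F32));
    c.allocator()->init(TensorInfo(TensorShape(16U, 32U), 1, DataType::F32));
    CLGEMM gemm;
    gemm.configure(&a, &b, &c, &d, 1.f, 0.5f);          // alpha = 1, beta = 0.5
    // ... allocate and fill a, b, c; allocate d; then:
    gemm.run();
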
diff --git a/arm_compute/runtime/CL/functions/CLGEMMConvolutionLayer.h b/arm_compute/runtime/CL/functions/CLGEMMConvolutionLayer.h
index 6d1181eefe..70ceb1513b 100644
--- a/arm_compute/runtime/CL/functions/CLGEMMConvolutionLayer.h
+++ b/arm_compute/runtime/CL/functions/CLGEMMConvolutionLayer.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2017-2021, 2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -21,146 +21,28 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
-#ifndef ARM_COMPUTE_CLGEMMCONVOLUTIONLAYER_H
-#define ARM_COMPUTE_CLGEMMCONVOLUTIONLAYER_H
+#ifndef ACL_ARM_COMPUTE_RUNTIME_CL_FUNCTIONS_CLGEMMCONVOLUTIONLAYER_H
+#define ACL_ARM_COMPUTE_RUNTIME_CL_FUNCTIONS_CLGEMMCONVOLUTIONLAYER_H
-#include "arm_compute/runtime/IFunction.h"
-
-#include "arm_compute/core/CL/kernels/CLCol2ImKernel.h"
-#include "arm_compute/core/CL/kernels/CLIm2ColKernel.h"
-#include "arm_compute/core/CL/kernels/CLWeightsReshapeKernel.h"
-#include "arm_compute/core/Types.h"
+#include "arm_compute/function_info/ActivationLayerInfo.h"
#include "arm_compute/runtime/CL/CLTensor.h"
-#include "arm_compute/runtime/CL/functions/CLActivationLayer.h"
-#include "arm_compute/runtime/CL/functions/CLGEMM.h"
-#include "arm_compute/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.h"
+#include "arm_compute/runtime/CL/CLTypes.h"
+#include "arm_compute/runtime/IFunction.h"
#include "arm_compute/runtime/IMemoryManager.h"
-#include "arm_compute/runtime/ITransformWeights.h"
#include "arm_compute/runtime/IWeightsManager.h"
-#include "arm_compute/runtime/MemoryGroup.h"
#include <memory>
namespace arm_compute
{
+// Forward declarations
+class CLCompileContext;
class ICLTensor;
-
-/** Function to reshape and transpose the weights. This function calls the following kernels:
- * -# @ref CLWeightsReshapeKernel
- */
-class CLConvolutionLayerReshapeWeights : public IFunction
-{
-public:
- /** Constructor */
- CLConvolutionLayerReshapeWeights();
- /** Set the input and output tensors.
- *
- * @param[in] weights Weights tensor. Weights are 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM].
- * Data type supported: QASYMM8/QASYMM8_SIGNED/QSYMM8_PER_CHANNEL/F16/F32.
- * @param[in] biases Biases tensor. Shared biases supported. Biases are 1D tensor with dimensions [OFM]. Data type supported: Same as @p weights.
- * @param[out] output Destination tensor. Data types supported: Same as @p weights.
- * @param[in] num_groups (Optional) Number of groups when performing a grouped convolution. num_groups != 1 is only supported for NCHW data layout
- */
- void configure(const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, unsigned int num_groups = 1);
- /** Set the input and output tensors.
- *
- * @param[in] compile_context The compile context to be used.
- * @param[in] weights Weights tensor. Weights are 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM].
- * Data type supported: QASYMM8/QASYMM8_SIGNED/QSYMM8_PER_CHANNEL/F16/F32.
- * @param[in] biases Biases tensor. Shared biases supported. Biases are 1D tensor with dimensions [OFM]. Data type supported: Same as @p weights.
- * @param[out] output Destination tensor. Data types supported: Same as @p weights.
- * @param[in] num_groups (Optional) Number of groups when performing a grouped convolution. num_groups != 1 is only supported for NCHW data layout
- */
- void configure(const CLCompileContext &compile_context, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, unsigned int num_groups = 1);
- /** Static function to check if given info will lead to a valid configuration of @ref CLConvolutionLayerReshapeWeights
- *
- * @param[in] weights Weights tensor. Weights are 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM].
- * Data type supported: QASYMM8/QASYMM8_SIGNED/QSYMM8_PER_CHANNEL/F16/F32.
- * @param[in] biases Biases tensor. Shared biases supported. Biases are 1D tensor with dimensions [OFM]. Data type supported: Same as @p weights.
- * @param[in] output Destination tensor. Data types supported: Same as @p weights.
- * @param[in] num_groups (Optional) Number of groups when performing a grouped convolution. num_groups != 1 is only supported for NCHW data layout
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, unsigned int num_groups = 1);
- // Inherited methods overridden:
- void run() override;
-
-private:
- CLWeightsReshapeKernel _weights_reshape_kernel;
-};
-
-namespace weights_transformations
-{
-/** Basic function to manage the reshape weights generated from @ref CLConvolutionLayerReshapeWeights */
-class CLConvolutionLayerReshapeWeightsTransform : public ITransformWeights
-{
-public:
- /** Configures the @ref CLConvolutionLayerReshapeWeights function
- *
- * @param[in] input Input tensor. Data type supported: QASYMM8/QASYMM8_SIGNED/F16/F32.
- * @param[in] biases Biases tensor. Data type supported: Same as @p input.
- * @param[in] num_groups Number of groups when performing a grouped convolution.
- */
- void configure(const ICLTensor *input, const ICLTensor *biases, unsigned int num_groups)
- {
- configure(CLKernelLibrary::get().get_compile_context(), input, biases, num_groups);
- }
- /** Configures the @ref CLConvolutionLayerReshapeWeights function
- *
- * @param[in] compile_context The compile context to be used.
- * @param[in] input Input tensor. Data type supported: QASYMM8/QASYMM8_SIGNED/F16/F32.
- * @param[in] biases Biases tensor. Data type supported: Same as @p input.
- * @param[in] num_groups Number of groups when performing a grouped convolution.
- */
- void configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *biases, unsigned int num_groups)
- {
- _bias_bit = (biases != nullptr) ? 1 : 0;
- _num_groups = num_groups;
- _func.configure(compile_context, input, biases, &_output, num_groups);
- }
-
- //Inherited method override
- void run() override
- {
- _output.allocator()->allocate();
- _func.run();
- _reshape_run = true;
- }
-
- //Inherited method override
- ICLTensor *get_weights() override
- {
- return &_output;
- }
-
- //Inherited method override
- void release() override
- {
- _output.allocator()->free();
- }
-
- //Inherited method override
- uint32_t uid() override
- {
- return ((0x9) | (_bias_bit << 7) | (_num_groups << 8));
- }
-
-private:
- CLTensor _output{};
- CLConvolutionLayerReshapeWeights _func{};
- int32_t _bias_bit{ 0 };
- unsigned int _num_groups{ 0 };
-};
-} // namespace weights_transformations
+class ITensorInfo;
/** Basic function to compute the convolution layer. This function calls the following OpenCL kernels/functions:
*
- * -# @ref CLIm2ColKernel
- * -# @ref CLGEMM (if the data type is FP32 or FP16)
- * -# @ref CLGEMMLowpMatrixMultiplyCore (if the data type is QASYMM8)
- * -# @ref CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPoint (if the data type is QASYMM8)
- * -# @ref CLCol2ImKernel (if NCHW data layout)
+ * -# @ref opencl::ClGemmConv2d
*/
class CLGEMMConvolutionLayer : public IFunction
{
@@ -170,7 +52,8 @@ public:
* @param[in] memory_manager (Optional) Memory manager.
* @param[in] weights_manager (Optional) Weights manager.
*/
- CLGEMMConvolutionLayer(std::shared_ptr<IMemoryManager> memory_manager = nullptr, IWeightsManager *weights_manager = nullptr);
+ CLGEMMConvolutionLayer(std::shared_ptr<IMemoryManager> memory_manager = nullptr,
+ IWeightsManager *weights_manager = nullptr);
/** Prevent instances of this class from being copied (As this class contains pointers) */
CLGEMMConvolutionLayer(const CLGEMMConvolutionLayer &) = delete;
/** Default move constructor */
@@ -179,15 +62,31 @@ public:
CLGEMMConvolutionLayer &operator=(const CLGEMMConvolutionLayer &) = delete;
/** Default move assignment operator */
CLGEMMConvolutionLayer &operator=(CLGEMMConvolutionLayer &&) = default;
+ /** Default destructor */
+ ~CLGEMMConvolutionLayer();
/** Set the input and output tensors.
*
+ * Valid data layouts:
+ * - NHWC
+ * - NCHW
+ *
+ * Valid data type configurations:
+ * |src0 |src1 |src2 |dst |
+ * |:--------------|:------------------|:--------|:--------------|
+ * |F16 |F16 |F16 |F16 |
+ * |F32 |F32 |F32 |F32 |
+ * |QASYMM8 |QASYMM8 |S32 |QASYMM8 |
+ * |QASYMM8 |QSYMM8_PER_CHANNEL |S32 |QASYMM8 |
+ * |QASYMM8_SIGNED |QASYMM8_SIGNED |S32 |QASYMM8_SIGNED |
+ * |QASYMM8_SIGNED |QSYMM8_PER_CHANNEL |S32 |QASYMM8_SIGNED |
+ *
* @param[in] input Source tensor. 3 lower dimensions represent a single input [width, height, IFM],
* while every optional dimension from 4 and above represent a batch of inputs.
- * Data types supported: QASYMM8/F16/F32.
+ * Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32.
* @param[in] weights Weights tensor. Weights are 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM].
- * Data type supported: Same as @p input or QASYMM8/QSYMM8_PER_CHANNEL when @p input is QASYMM8.
+ * Data type supported: Same as @p input or QASYMM8/QSYMM8_PER_CHANNEL when @p input is QASYMM8 or QASYMM8_SIGNED/QSYMM8_PER_CHANNEL when @p input is QASYMM8_SIGNED.
* @param[in] biases Biases tensor. Shared biases supported. Biases are 1D tensor with dimensions [OFM].
- * Data type supported: Should match @p input data type, except for input of QASYMM8 type where biases should be of S32 type.
+ * Data type supported: Should match @p input data type, except for input of quantized type where biases should be of S32 type.
* @param[out] output Destination tensor. 3 lower dimensions represent a single output [width, height, OFM], while the rest represent batch of outputs.
* Data types supported: Same as @p input.
* @param[in] conv_info Contains padding and stride information described in @ref PadStrideInfo.
@@ -197,18 +96,25 @@ public:
* @param[in] act_info (Optional) Activation layer information in case of a fused activation.
* @param[in] num_groups (Optional) Number of groups when performing a grouped convolution. num_groups != 1 is only supported for NCHW data layout
*/
- void configure(const ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const PadStrideInfo &conv_info, const WeightsInfo &weights_info = WeightsInfo(),
- const Size2D &dilation = Size2D(1U, 1U), const ActivationLayerInfo &act_info = ActivationLayerInfo(), unsigned int num_groups = 1);
+ void configure(const ICLTensor *input,
+ const ICLTensor *weights,
+ const ICLTensor *biases,
+ ICLTensor *output,
+ const PadStrideInfo &conv_info,
+ const WeightsInfo &weights_info = WeightsInfo(),
+ const Size2D &dilation = Size2D(1U, 1U),
+ const ActivationLayerInfo &act_info = ActivationLayerInfo(),
+ unsigned int num_groups = 1);
/** Set the input and output tensors.
*
* @param[in] compile_context The compile context to be used.
* @param[in] input Source tensor. 3 lower dimensions represent a single input [width, height, IFM],
* while every optional dimension from 4 and above represent a batch of inputs.
- * Data types supported: QASYMM8/F16/F32.
+ * Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32.
* @param[in] weights Weights tensor. Weights are 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM].
- * Data type supported: Same as @p input or QASYMM8/QSYMM8_PER_CHANNEL when @p input is QASYMM8.
+ * Data type supported: Same as @p input or QASYMM8/QSYMM8_PER_CHANNEL when @p input is QASYMM8 or QASYMM8_SIGNED/QSYMM8_PER_CHANNEL when @p input is QASYMM8_SIGNED.
* @param[in] biases Biases tensor. Shared biases supported. Biases are 1D tensor with dimensions [OFM].
- * Data type supported: Should match @p input data type, except for input of QASYMM8 type where biases should be of S32 type.
+ * Data type supported: Should match @p input data type, except for input of quantized type where biases should be of S32 type.
* @param[out] output Destination tensor. 3 lower dimensions represent a single output [width, height, OFM], while the rest represent batch of outputs.
* Data types supported: Same as @p input.
* @param[in] conv_info Contains padding and stride information described in @ref PadStrideInfo.
@@ -218,18 +124,25 @@ public:
* @param[in] act_info (Optional) Activation layer information in case of a fused activation.
* @param[in] num_groups (Optional) Number of groups when performing a grouped convolution. num_groups != 1 is only supported for NCHW data layout
*/
- void configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const PadStrideInfo &conv_info,
- const WeightsInfo &weights_info = WeightsInfo(),
- const Size2D &dilation = Size2D(1U, 1U), const ActivationLayerInfo &act_info = ActivationLayerInfo(), unsigned int num_groups = 1);
+ void configure(const CLCompileContext &compile_context,
+ const ICLTensor *input,
+ const ICLTensor *weights,
+ const ICLTensor *biases,
+ ICLTensor *output,
+ const PadStrideInfo &conv_info,
+ const WeightsInfo &weights_info = WeightsInfo(),
+ const Size2D &dilation = Size2D(1U, 1U),
+ const ActivationLayerInfo &act_info = ActivationLayerInfo(),
+ unsigned int num_groups = 1);
/** Static function to check if given info will lead to a valid configuration of @ref CLGEMMConvolutionLayer.
*
* @param[in] input Source tensor. 3 lower dimensions represent a single input [width, height, IFM],
* while every optional dimension from 4 and above represent a batch of inputs.
- * Data types supported: QASYMM8/F16/F32.
+ * Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32.
* @param[in] weights Weights tensor. Weights are 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM].
- * Data type supported: Same as @p input or QASYMM8/QSYMM8_PER_CHANNEL when @p input is QASYMM8.
+ * Data type supported: Same as @p input or QASYMM8/QSYMM8_PER_CHANNEL when @p input is QASYMM8 or QASYMM8_SIGNED/QSYMM8_PER_CHANNEL when @p input is QASYMM8_SIGNED.
* @param[in] biases Biases tensor. Shared biases supported. Biases are 1D tensor with dimensions [OFM].
- * Data type supported: Should match @p input data type, except for input of QASYMM8 type where biases should be of S32 type.
+ * Data type supported: Should match @p input data type, except for input of quantized type where biases should be of S32 type.
* @param[out] output Destination tensor. 3 lower dimensions represent a single output [width, height, OFM], while the rest represent batch of outputs.
* Data types supported: Same as @p input.
* @param[in] conv_info Contains padding and stride information described in @ref PadStrideInfo.
@@ -241,70 +154,23 @@ public:
*
* @return a status
*/
- static Status validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info,
- const WeightsInfo &weights_info = WeightsInfo(), const Size2D &dilation = Size2D(1U, 1U), const ActivationLayerInfo &act_info = ActivationLayerInfo(), unsigned int num_groups = 1);
+ static Status validate(const ITensorInfo *input,
+ const ITensorInfo *weights,
+ const ITensorInfo *biases,
+ const ITensorInfo *output,
+ const PadStrideInfo &conv_info,
+ const WeightsInfo &weights_info = WeightsInfo(),
+ const Size2D &dilation = Size2D(1U, 1U),
+ const ActivationLayerInfo &act_info = ActivationLayerInfo(),
+ unsigned int num_groups = 1);
// Inherited methods overridden:
void run() override;
void prepare() override;
private:
- /** Configures the appropriate matrix multiply routine
- *
- * @param[in] compile_context The compile context to be used.
- * @param[in] input Input tensor. Data types supported: QASYMM8/F16/F32.
- * @param[in] weights Weights tensor. Data type supported: Same as @p input or QASYMM8/QSYMM8_PER_CHANNEL when @p input is QASYMM8.
- * @param[in] biases Biases tensor. Shared biases supported. Biases are 1D tensor with dimensions [OFM].
- * Data type supported: Should match @p input data type, except for input of QASYMM8 type where biases should be of S32 type.
- * @param[in, out] output Output tensor. Data types supported: Same as @p input,
- * except for input of QASYMM8 type where output should be of S32 type.
- * @param[in] gemmlowp_output_stage GEMMLowp output stage info
- * @param[in] gemm_3d_depth Depth of GEMM 3D
- * @param[in] act_info Activation to apply after the matrix multiplication
- */
- void configure_mm(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output,
- const GEMMLowpOutputStageInfo &gemmlowp_output_stage,
- int gemm_3d_depth, const ActivationLayerInfo &act_info);
- /** Static function to check if given info will lead to a valid configuration of @ref CLGEMMConvolutionLayer matrix multiply routines
- *
- * @param[in] input Input tensor info. Data types supported: QASYMM8/F16/F32.
- * @param[in] weights Weights tensor info. Data type supported: Same as @p input or QASYMM8/QSYMM8_PER_CHANNEL when @p input is QASYMM8.
- * @param[in] biases Biases tensor info. Shared biases supported. Biases are 1D tensor with dimensions [OFM].
- * Data type supported: Should match @p input data type, except for input of QASYMM8 type where biases should be of S32 type.
- * @param[in] output Output tensor info. Data types supported: Same as @p input,
- * except for input of QASYMM8 type where output should be of S32 type.
- * @param[in] gemmlowp_output_stage GEMMLowp output stage info
- * @param[in] gemm_3d_depth Depth of GEMM 3D
- * @param[in] skip_im2col Flag which specifies if im2col has to be skipped. i.e. 1x1 convolution with NHWC data layout.
- * @param[in] act_info Activation to apply after the matrix multiplication
- *
- * @return a status
- */
- static Status validate_mm(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const GEMMLowpOutputStageInfo &gemmlowp_output_stage,
- int gemm_3d_depth, bool skip_im2col, const ActivationLayerInfo &act_info);
-
-private:
- MemoryGroup _memory_group;
- IWeightsManager *_weights_manager;
- CLConvolutionLayerReshapeWeights _reshape_weights;
- weights_transformations::CLConvolutionLayerReshapeWeightsTransform _reshape_weights_managed;
- CLIm2ColKernel _im2col_kernel;
- CLGEMM _mm_gemm;
- CLGEMMLowpMatrixMultiplyCore _mm_gemmlowp;
- CLCol2ImKernel _col2im_kernel;
- CLActivationLayer _activationlayer_function;
-
- const ICLTensor *_original_weights;
-
- CLTensor _im2col_output;
- CLTensor _weights_reshaped;
- CLTensor _gemm_output;
-
- bool _skip_im2col;
- bool _skip_col2im;
- bool _is_quantized;
- bool _fuse_activation;
- bool _is_prepared;
+ struct Impl;
+ std::unique_ptr<Impl> _impl;
};
} // namespace arm_compute
-#endif /* ARM_COMPUTE_CLGEMMCONVOLUTIONLAYER_H */
+#endif // ACL_ARM_COMPUTE_RUNTIME_CL_FUNCTIONS_CLGEMMCONVOLUTIONLAYER_H
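For orientation, a minimal usage sketch of the refactored function follows; it is illustrative only and not part of this patch. It assumes the CL backend has been initialised and that the tensors are already shaped and allocated elsewhere; the function and tensor names are placeholders:

    #include "arm_compute/core/Error.h"
    #include "arm_compute/runtime/CL/CLScheduler.h"
    #include "arm_compute/runtime/CL/CLTensor.h"
    #include "arm_compute/runtime/CL/functions/CLGEMMConvolutionLayer.h"

    using namespace arm_compute;

    void run_conv_example(CLTensor &src, CLTensor &wei, CLTensor &bia, CLTensor &dst)
    {
        CLScheduler::get().default_init(); // one-off CL context/queue setup

        const PadStrideInfo    conv_info(1, 1, 0, 0); // stride 1x1, no padding
        CLGEMMConvolutionLayer conv;                  // state now lives behind the pimpl (_impl)

        // validate() mirrors configure() and returns a Status instead of asserting.
        const Status s = CLGEMMConvolutionLayer::validate(src.info(), wei.info(), bia.info(),
                                                          dst.info(), conv_info);
        if(s.error_code() == ErrorCode::OK)
        {
            conv.configure(&src, &wei, &bia, &dst, conv_info);
            conv.prepare();            // one-off weight transformations
            conv.run();                // enqueue the kernels
            CLScheduler::get().sync(); // wait for the queue to drain
        }
    }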
diff --git a/arm_compute/runtime/CL/functions/CLGEMMDeconvolutionLayer.h b/arm_compute/runtime/CL/functions/CLGEMMDeconvolutionLayer.h
index d8710a461f..3e8929c5ad 100644
--- a/arm_compute/runtime/CL/functions/CLGEMMDeconvolutionLayer.h
+++ b/arm_compute/runtime/CL/functions/CLGEMMDeconvolutionLayer.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019-2020 ARM Limited.
+ * Copyright (c) 2019-2021 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -24,9 +24,10 @@
#ifndef ARM_COMPUTE_CLGEMMDECONVOLUTIONLAYER_H
#define ARM_COMPUTE_CLGEMMDECONVOLUTIONLAYER_H
-#include "arm_compute/core/CL/kernels/CLDeconvolutionReshapeOutputKernel.h"
#include "arm_compute/runtime/CL/CLTensor.h"
#include "arm_compute/runtime/CL/functions/CLConvolutionLayer.h"
+#include "arm_compute/runtime/CL/functions/CLGEMM.h"
+#include "arm_compute/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.h"
#include "arm_compute/runtime/CL/functions/CLGEMMLowpOutputStage.h"
#include "arm_compute/runtime/CL/functions/CLPermute.h"
#include "arm_compute/runtime/CL/functions/CLReshapeLayer.h"
@@ -40,6 +41,7 @@
namespace arm_compute
{
+class CLDeconvolutionReshapeOutputKernel;
class ICLTensor;
/** Function to run the deconvolution layer through a call to GEMM.
*
@@ -89,8 +91,21 @@ public:
CLGEMMDeconvolutionLayer &operator=(const CLGEMMDeconvolutionLayer &) = delete;
/** Default move assignment operator */
CLGEMMDeconvolutionLayer &operator=(CLGEMMDeconvolutionLayer &&) = default;
+ /** Default destructor */
+ ~CLGEMMDeconvolutionLayer();
/** Set the input, weights, biases and output tensors.
*
+ * Valid data layouts:
+ * - NHWC
+ *
+ * Valid data type configurations:
+ * |src0 |src1 |src2 |dst |
+ * |:--------------|:------------------|:--------|:--------------|
+ * |F16 |F16 |F16 |F16 |
+ * |F32 |F32 |F32 |F32 |
+ * |QASYMM8 |QASYMM8 |S32 |QASYMM8 |
+ * |QASYMM8_SIGNED |QASYMM8_SIGNED |S32 |QASYMM8_SIGNED |
+ *
* @param[in,out] input Input tensor. 3 lower dimensions represent a single input, and an optional 4th dimension for batch of inputs.
* Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32. Data layout supported: NHWC
* @param[in] weights The 4d weights with dimensions [width, height, IFM, OFM]. Data type supported: Same as @p input. Data layout supported: same as @p input.
@@ -98,7 +113,11 @@ public:
* @param[out] output Output tensor. The output has the same number of dimensions as the @p input. Data layout supported: same as @p input.
* @param[in] deconv_info Contains padding and policies to be used in the deconvolution, this is described in @ref PadStrideInfo. This function supports only stride_x = weights.width && stride_y = weights.height. Moreover, padding is not supported.
*/
- void configure(const ICLTensor *input, const ICLTensor *weights, const ICLTensor *bias, ICLTensor *output, const PadStrideInfo &deconv_info);
+ void configure(const ICLTensor *input,
+ const ICLTensor *weights,
+ const ICLTensor *bias,
+ ICLTensor *output,
+ const PadStrideInfo &deconv_info);
/** Set the input, weights, biases and output tensors.
*
* @param[in] compile_context The compile context to be used.
@@ -109,7 +128,12 @@ public:
* @param[out] output Output tensor. The output has the same number of dimensions as the @p input. Data layout supported: same as @p input.
* @param[in] deconv_info Contains padding and policies to be used in the deconvolution, this is described in @ref PadStrideInfo. This function supports only stride_x = weights.width && stride_y = weights.height. Moreover, padding is not supported.
*/
- void configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *weights, const ICLTensor *bias, ICLTensor *output, const PadStrideInfo &deconv_info);
+ void configure(const CLCompileContext &compile_context,
+ const ICLTensor *input,
+ const ICLTensor *weights,
+ const ICLTensor *bias,
+ ICLTensor *output,
+ const PadStrideInfo &deconv_info);
/** Static function to check if given info will lead to a valid configuration of @ref CLDeconvolutionLayer
*
* @param[in] input Input tensor info. 3 lower dimensions represent a single input, and an optional 4th dimension for batch of inputs.
@@ -121,7 +145,11 @@ public:
*
* @return a status
*/
- static Status validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *bias, const ITensorInfo *output, const PadStrideInfo &deconv_info);
+ static Status validate(const ITensorInfo *input,
+ const ITensorInfo *weights,
+ const ITensorInfo *bias,
+ const ITensorInfo *output,
+ const PadStrideInfo &deconv_info);
// Inherited methods overridden:
void run() override;
@@ -130,15 +158,15 @@ public:
private:
MemoryGroup _memory_group;
- CLGEMM _mm_gemm;
- CLGEMMLowpMatrixMultiplyCore _mm_gemmlowp;
- CLGEMMLowpOutputStage _gemmlowp_output_stage;
- CLPermute _permute_input_to_nhwc;
- CLPermute _permute_weights_to_nhwc;
- CLReshapeLayer _reshape_weights;
- CLTranspose _transpose_weights;
- CLDeconvolutionReshapeOutputKernel _deconv_reshape;
- CLSlice _slice_gemm;
+ CLGEMM _mm_gemm;
+ CLGEMMLowpMatrixMultiplyCore _mm_gemmlowp;
+ CLGEMMLowpOutputStage _gemmlowp_output_stage;
+ CLPermute _permute_input_to_nhwc;
+ CLPermute _permute_weights_to_nhwc;
+ CLReshapeLayer _reshape_weights;
+ CLTranspose _transpose_weights;
+ std::unique_ptr<CLDeconvolutionReshapeOutputKernel> _deconv_reshape;
+ CLSlice _slice_gemm;
CLTensor _gemmlowp_final;
CLTensor _reshaped_weights;
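The stride/padding constraint carried by @p deconv_info above is easy to miss; a hedged configuration sketch (placeholder tensor names, assuming 2x2 weights so the stride must be 2x2 with no padding):

    #include "arm_compute/core/Error.h"
    #include "arm_compute/runtime/CL/CLTensor.h"
    #include "arm_compute/runtime/CL/functions/CLGEMMDeconvolutionLayer.h"

    using namespace arm_compute;

    void run_deconv_example(CLTensor &input, CLTensor &weights, CLTensor &bias, CLTensor &output)
    {
        // 2x2 weights => stride_x = stride_y = 2 and zero padding, per the constraint above.
        const PadStrideInfo      deconv_info(2, 2, 0, 0);
        CLGEMMDeconvolutionLayer deconv;
        if(CLGEMMDeconvolutionLayer::validate(input.info(), weights.info(), bias.info(),
                                              output.info(), deconv_info).error_code() == ErrorCode::OK)
        {
            deconv.configure(&input, &weights, &bias, &output, deconv_info);
            deconv.run();
        }
    }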
diff --git a/arm_compute/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.h b/arm_compute/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.h
index 6ac3cefb76..1b8e5dcc1d 100644
--- a/arm_compute/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.h
+++ b/arm_compute/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2017-2021, 2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -24,21 +24,19 @@
#ifndef ARM_COMPUTE_CLGEMMLOWPMATRIXMULTIPLYCORE_H
#define ARM_COMPUTE_CLGEMMLOWPMATRIXMULTIPLYCORE_H
-#include "arm_compute/core/CL/kernels/CLDepthConvertLayerKernel.h"
-#include "arm_compute/core/CL/kernels/CLGEMMLowpMatrixMultiplyNativeKernel.h"
-#include "arm_compute/core/CL/kernels/CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel.h"
-#include "arm_compute/core/CL/kernels/CLGEMMLowpOffsetContributionKernel.h"
-#include "arm_compute/core/CL/kernels/CLGEMMLowpOffsetContributionOutputStageKernel.h"
-#include "arm_compute/core/CL/kernels/CLGEMMLowpReductionKernel.h"
-#include "arm_compute/core/CL/kernels/CLGEMMReshapeRHSMatrixKernel.h"
+#include "arm_compute/function_info/GEMMInfo.h"
#include "arm_compute/runtime/CL/CLTensor.h"
#include "arm_compute/runtime/IFunction.h"
#include "arm_compute/runtime/MemoryGroup.h"
+#include <memory>
+
namespace arm_compute
{
+class CLCompileContext;
class IMemoryManager;
class ICLTensor;
+class ITensorInfo;
/** Basic function to execute GEMMLowpMatrixMultiplyCore on OpenCL. */
class CLGEMMLowpMatrixMultiplyCore : public IFunction
@@ -54,13 +52,35 @@ public:
CLGEMMLowpMatrixMultiplyCore &operator=(const CLGEMMLowpMatrixMultiplyCore &) = delete;
/** Default move assignment operator */
CLGEMMLowpMatrixMultiplyCore &operator=(CLGEMMLowpMatrixMultiplyCore &&) = default;
+ /** Default destructor */
+ ~CLGEMMLowpMatrixMultiplyCore();
/** Initialise the kernel's inputs, output
*
+ * Valid data layouts:
+ * - NHWC
+ * - NCHW
+ *
+ * Valid data type configurations:
+ * |src0 |src1 |src2 |dst |
+ * |:--------------|:------------------|:--------|:--------------|
+ * |QASYMM8 |QASYMM8 |S32 |QASYMM8 |
+ * |QASYMM8 |QSYMM8_PER_CHANNEL |S32 |QASYMM8 |
+ * |QASYMM8 |QSYMM8 |S32 |QASYMM8 |
+ * |QASYMM8 |QASYMM8 |S32 |S32 |
+ * |QASYMM8 |QSYMM8_PER_CHANNEL |S32 |S32 |
+ * |QASYMM8 |QSYMM8 |S32 |S32 |
+ * |QASYMM8_SIGNED |QASYMM8_SIGNED |S32 |QASYMM8_SIGNED |
+ * |QASYMM8_SIGNED |QSYMM8_PER_CHANNEL |S32 |QASYMM8_SIGNED |
+ * |QASYMM8_SIGNED |QSYMM8 |S32 |QASYMM8_SIGNED |
+ * |QASYMM8_SIGNED |QASYMM8_SIGNED |S32 |S32 |
+ * |QASYMM8_SIGNED |QSYMM8_PER_CHANNEL |S32 |S32 |
+ * |QASYMM8_SIGNED |QSYMM8 |S32 |S32 |
+ *
* @note GEMMLowp: low precision GEMM kernel. [A * B + C]
* This kernel performs the following computations:
*
- * -# Convert a values from QASYMM8 to int32 and add a_offset to each of them.
- * -# Convert b values from QASYMM8 to int32 and add b_offset to each of them.
+ * -# Convert a values from 8-bit quantized to int32 and add a_offset to each of them.
+ * -# Convert b values from 8-bit quantized to int32 and add b_offset to each of them.
* -# Compute the matrix product of the resulting a * b in int32.
* -# Quantize to uint8 if gemm_info.gemmlowp_output_stage != NONE
*
@@ -71,14 +91,18 @@ public:
* @param[in] gemm_info (Optional) Specifies if the matrix A and/or matrix B have been reshaped and
* if the reshape of matrix B should be executed only for the first run
*/
- void configure(const ICLTensor *a, const ICLTensor *b, const ICLTensor *c, ICLTensor *output, const GEMMInfo &gemm_info = GEMMInfo());
+ void configure(const ICLTensor *a,
+ const ICLTensor *b,
+ const ICLTensor *c,
+ ICLTensor *output,
+ const GEMMInfo &gemm_info = GEMMInfo());
/** Initialise the kernel's inputs, output
*
* @note GEMMLowp: low precision GEMM kernel. [A * B + C]
* This kernel performs the following computations:
*
- * -# Convert a values from QASYMM8 to int32 and add a_offset to each of them.
- * -# Convert b values from QASYMM8 to int32 and add b_offset to each of them.
+ * -# Convert a values from 8-bit quantized to int32 and add a_offset to each of them.
+ * -# Convert b values from 8-bit quantized to int32 and add b_offset to each of them.
* -# Compute the matrix product of the resulting a * b in int32.
* -# Quantize to uint8 if gemm_info.gemmlowp_output_stage != NONE
*
@@ -90,7 +114,12 @@ public:
* @param[in] gemm_info (Optional) Specifies if the matrix A and/or matrix B have been reshaped and
* if the reshape of matrix B should be executed only for the first run
*/
- void configure(const CLCompileContext &compile_context, const ICLTensor *a, const ICLTensor *b, const ICLTensor *c, ICLTensor *output, const GEMMInfo &gemm_info = GEMMInfo());
+ void configure(const CLCompileContext &compile_context,
+ const ICLTensor *a,
+ const ICLTensor *b,
+ const ICLTensor *c,
+ ICLTensor *output,
+ const GEMMInfo &gemm_info = GEMMInfo());
/** Static function to check if given info will lead to a valid configuration of @ref CLGEMMLowpMatrixMultiplyCore
*
* @param[in] a First input tensor info (Matrix A). Data type supported: QASYMM8.
@@ -102,47 +131,19 @@ public:
*
* @return a status
*/
- static Status validate(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, const ITensorInfo *output, const GEMMInfo &gemm_info = GEMMInfo());
+ static Status validate(const ITensorInfo *a,
+ const ITensorInfo *b,
+ const ITensorInfo *c,
+ const ITensorInfo *output,
+ const GEMMInfo &gemm_info = GEMMInfo());
// Inherited methods overridden:
void run() override;
void prepare() override;
private:
- MemoryGroup _memory_group;
-
- // Kernels used
- CLDepthConvertLayerKernel _weights_to_qasymm8;
- CLGEMMLowpMatrixMultiplyNativeKernel _mm_native_kernel;
- CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel _mm_reshaped_only_rhs_kernel;
- CLGEMMReshapeRHSMatrixKernel _mtx_b_reshape_kernel;
- CLGEMMLowpMatrixAReductionKernel _mtx_a_reduction_kernel;
- CLGEMMLowpMatrixBReductionKernel _mtx_b_reduction_kernel;
- CLGEMMLowpOffsetContributionKernel _offset_contribution_kernel;
- CLGEMMLowpOffsetContributionOutputStageKernel _offset_contribution_output_stage_kernel;
-
- // Temporary tensors
- CLTensor _qasymm8_weights;
- CLTensor _vector_sum_col;
- CLTensor _vector_sum_row;
- CLTensor _tmp_b;
- CLTensor _mm_result_s32;
- CLTensor _gemm_output_stage_multipliers;
- CLTensor _gemm_output_stage_shifts;
-
- // Tensor pointers
- const ICLTensor *_matrix_a;
- const ICLTensor *_original_b;
- const ICLTensor *_output;
-
- int32_t _a_offset;
- int32_t _b_offset;
- bool _is_gemm_reshaped;
- bool _reshape_b_only_on_first_run;
- bool _is_prepared;
- bool _run_output_stage;
- bool _convert_to_qasymm8;
- bool _run_offset_contribution;
+ struct Impl;
+ std::unique_ptr<Impl> _impl;
};
} // namespace arm_compute
-#endif /*ARM_COMPUTE_CLGEMMLOWPMATRIXMULTIPLYCORE_H */
\ No newline at end of file
+#endif /*ARM_COMPUTE_CLGEMMLOWPMATRIXMULTIPLYCORE_H */
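The four computation steps listed in the updated doc comment reduce to the following scalar reference. This is an illustrative sketch only, not the kernel implementation: the actual OpenCL kernels fuse the offset contributions instead of materialising converted matrices, and the inputs may be unsigned (QASYMM8) rather than the signed 8-bit shown here:

    #include <cstdint>

    // out[m][n] = sum_k (int32(a[m][k]) + a_offset) * (int32(b[k][n]) + b_offset)
    void gemmlowp_reference(const int8_t *a, const int8_t *b, int32_t *out,
                            int m_size, int n_size, int k_size,
                            int32_t a_offset, int32_t b_offset)
    {
        for(int m = 0; m < m_size; ++m)
        {
            for(int n = 0; n < n_size; ++n)
            {
                int32_t acc = 0;
                for(int k = 0; k < k_size; ++k)
                {
                    acc += (static_cast<int32_t>(a[m * k_size + k]) + a_offset) *
                           (static_cast<int32_t>(b[k * n_size + n]) + b_offset);
                }
                // Requantised to 8 bits afterwards when gemm_info.gemmlowp_output_stage != NONE.
                out[m * n_size + n] = acc;
            }
        }
    }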
diff --git a/arm_compute/runtime/CL/functions/CLGEMMLowpOutputStage.h b/arm_compute/runtime/CL/functions/CLGEMMLowpOutputStage.h
index 06cb759b16..ff9c872896 100644
--- a/arm_compute/runtime/CL/functions/CLGEMMLowpOutputStage.h
+++ b/arm_compute/runtime/CL/functions/CLGEMMLowpOutputStage.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2017-2021 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -24,8 +24,11 @@
#ifndef ARM_COMPUTE_CLGEMMLOWPOUTPUTSTAGE_H
#define ARM_COMPUTE_CLGEMMLOWPOUTPUTSTAGE_H
+#include "arm_compute/core/Error.h"
#include "arm_compute/runtime/CL/ICLSimpleFunction.h"
+#include <limits>
+
/** This file contains all available output stages for GEMMLowp on OpenCL.
*
* In gemmlowp, the "output stage" is the process that takes a final int32 accumulator value (the output of @ref CLGEMMLowpMatrixMultiplyCore),
@@ -36,389 +39,54 @@
namespace arm_compute
{
+class CLCompileContext;
class ITensor;
+class ICLTensor;
+class ITensorInfo;
+struct GEMMLowpOutputStageInfo;
-/** Basic function to execute CLGEMMLowpQuantizeDownInt32ToUint8Scale on OpenCL.
- *
- * CLGEMMLowpQuantizeDownInt32ToUint8Scale depends on 3 parameters: result_offset, result_mult_int, result_shift
- * The final result is:
- *
- * ((input[i][k] + result_offset) * result_mult_int) >> result_shift
- *
- * In case the bias tensor is provided, the final result is:
- *
- * ((input[i][k] + bias[k] + result_offset) * result_mult_int) >> result_shift
- *
- * This function calls the following OpenCL kernels:
- *
- * -# @ref CLGEMMLowpQuantizeDownInt32ScaleKernel
- *
- * @note The function accepts also 2 optional input arguments (min and max) which can be used to implement "rectified linear unit" activation functions
- * after the result is shifted right by result_shift
-*/
-class CLGEMMLowpQuantizeDownInt32ToUint8Scale : public ICLSimpleFunction
-{
-public:
- /** Initialise the kernel's inputs, output
- *
- * @param[in] input Input tensor. It is the output of @ref CLGEMMLowpMatrixMultiplyCore function. Data type supported: S32
- * @param[in] bias Biases tensor. Only shared biases supported and it can be a nullptr if the addition of biases is not required.
- * Biases are 1D tensor with dimensions [OFM]. Data type supported: Same as @p input.
- * @param[out] output Output tensor. Data type supported: QASYMM8
- * @param[in] result_offset Offset to be added to each element of the input matrix
- * @param[in] result_mult_int Value to be multiplied to each element of the input matrix when once the result_offset has been add
- * @param[in] result_shift Number of bits to shift right the result before converting back to QASYMM8
- * @param[in] min (Optional) Min value used to saturate down the output result before converting back to QASYMM8. Defaults to the minimum possible 32-bit signed integer.
- * @param[in] max (Optional) Max value used to saturate up the output result before converting back to QASYMM8,
- * Along with @p min, this value can be used to implement "rectified linear unit" activation functions. Defaults to the maximum possible 32-bit signed integer.
- */
- ARM_COMPUTE_DEPRECATED_REL(20.05)
- void configure(const ICLTensor *input, const ICLTensor *bias, ICLTensor *output, int result_offset, int result_mult_int, int result_shift, int min = std::numeric_limits<int32_t>::lowest(),
- int max = std::numeric_limits<int32_t>::max());
- /** Initialise the kernel's inputs, output
- *
- * @param[in] compile_context The compile context to be used.
- * @param[in] input Input tensor. It is the output of @ref CLGEMMLowpMatrixMultiplyCore function. Data type supported: S32
- * @param[in] bias Biases tensor. Only shared biases supported and it can be a nullptr if the addition of biases is not required.
- * Biases are 1D tensor with dimensions [OFM]. Data type supported: Same as @p input.
- * @param[out] output Output tensor. Data type supported: QASYMM8
- * @param[in] result_offset Offset to be added to each element of the input matrix
- * @param[in] result_mult_int Value to be multiplied to each element of the input matrix when once the result_offset has been add
- * @param[in] result_shift Number of bits to shift right the result before converting back to QASYMM8
- * @param[in] min (Optional) Min value used to saturate down the output result before converting back to QASYMM8. Defaults to the minimum possible 32-bit signed integer.
- * @param[in] max (Optional) Max value used to saturate up the output result before converting back to QASYMM8,
- * Along with @p min, this value can be used to implement "rectified linear unit" activation functions. Defaults to the maximum possible 32-bit signed integer.
- */
- ARM_COMPUTE_DEPRECATED_REL(20.05)
- void configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *bias, ICLTensor *output, int result_offset, int result_mult_int, int result_shift,
- int min = std::numeric_limits<int32_t>::lowest(), int max = std::numeric_limits<int32_t>::max());
- /** Static function to check if given info will lead to a valid configuration of @ref CLGEMMLowpQuantizeDownInt32ToUint8Scale
- *
- * @param[in] input Input tensor. It is the output of @ref CLGEMMLowpMatrixMultiplyCore function. Data type supported: S32
- * @param[in] bias Biases tensor. Only shared biases supported and it can be a nullptr if the addition of biases is not required.
- * Biases are 1D tensor with dimensions [OFM]. Data type supported: Same as @p input.
- * @param[in] output Output tensor. Data type supported: QASYMM8
- * @param[in] min (Optional) Min value used to saturate down the output result before converting back to QASYMM8. Defaults to the minimum possible 32-bit signed integer.
- * @param[in] max (Optional) Max value used to saturate up the output result before converting back to QASYMM8,
- * Along with @p min, this value can be used to implement "rectified linear unit" activation functions. Defaults to the maximum possible 32-bit signed integer.
- *
- * @return a status
- */
- ARM_COMPUTE_DEPRECATED_REL(20.05)
- static Status validate(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output, int min = std::numeric_limits<int32_t>::lowest(), int max = std::numeric_limits<int32_t>::max());
-};
-
-/** Basic function to execute CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPoint on OpenCL.
- *
- * CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPoint depends on 3 parameters:
- *
- * result_fixedpoint_multiplier, result_shift, result_offset_after_shift
- *
- * The final result is:
- *
- * (FixedPointMul(input[i][k], result_fixedpoint_multiplier) >> result_shift) + result_offset_after_shift
- *
- * where FixedPointMul(x, y) is the nearest integer to the following
- * mathematical expression, evaluated without overflow or intermediate rounding:
- *
- * (x * y) / 2^31
- *
- * For more information: https://github.com/google/gemmlowp/blob/master/public/output_stages.h#L68
- *
- * In case the bias tensor is provided, the final result is:
- *
- * ((FixedPointMul(input[i][k] + bias[k], result_fixedpoint_multiplier)) >> result_shift) + result_offset_after_shift
- *
- * This function calls the following OpenCL kernels:
- *
- * -# @ref CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel
- *
- * @note The function accepts also 2 optional input arguments (min and max) which can be used to implement "rectified linear unit" activation functions
- * after the result is shifted right by result_shift
-*/
-class CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPoint : public ICLSimpleFunction
-{
-public:
- /** Initialise the kernel's inputs, output
- *
- * @param[in] input Input tensor. Data type supported: S32
- * @param[in] bias Biases tensor. Only shared biases supported and it can be a nullptr if the biases addition is not required.
- * Biases are 1D tensor with dimensions [OFM]. Data type supported: Same as @p input.
- * @param[out] output Output tensor. Data type supported: QASYMM8
- * @param[in] result_fixedpoint_multiplier Fixed point value to be multiplied to each element of the input matrix when once the result_offset has been add
- * @param[in] result_shift Number of bits to shift right the result after the fixed point multiplication
- * @param[in] result_offset_after_shift Offset to be applied to result before converting it back to QASYMM8
- * @param[in] min (Optional) Min value used to saturate down the output result before converting back to QASYMM8. Defaults to the minimum possible 32-bit signed integer.
- * @param[in] max (Optional) Max value used to saturate up the output result before converting back to QASYMM8,
- * Along with @p min, this value can be used to implement "rectified linear unit" activation functions. Defaults to the maximum possible 32-bit signed integer.
- */
- void configure(const ICLTensor *input, const ICLTensor *bias, ICLTensor *output, int result_fixedpoint_multiplier, int result_shift, int result_offset_after_shift,
- int min = std::numeric_limits<int32_t>::lowest(), int max = std::numeric_limits<int32_t>::max());
- /** Initialise the kernel's inputs, output
- *
- * @param[in] compile_context The compile context to be used.
- * @param[in] input Input tensor. Data type supported: S32
- * @param[in] bias Biases tensor. Only shared biases supported and it can be a nullptr if the biases addition is not required.
- * Biases are 1D tensor with dimensions [OFM]. Data type supported: Same as @p input.
- * @param[out] output Output tensor. Data type supported: QASYMM8
- * @param[in] result_fixedpoint_multiplier Fixed point value to be multiplied to each element of the input matrix when once the result_offset has been add
- * @param[in] result_shift Number of bits to shift right the result after the fixed point multiplication
- * @param[in] result_offset_after_shift Offset to be applied to result before converting it back to QASYMM8
- * @param[in] min (Optional) Min value used to saturate down the output result before converting back to QASYMM8. Defaults to the minimum possible 32-bit signed integer.
- * @param[in] max (Optional) Max value used to saturate up the output result before converting back to QASYMM8,
- * Along with @p min, this value can be used to implement "rectified linear unit" activation functions. Defaults to the maximum possible 32-bit signed integer.
- */
- void configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *bias, ICLTensor *output, int result_fixedpoint_multiplier, int result_shift,
- int result_offset_after_shift,
- int min = std::numeric_limits<int32_t>::lowest(), int max = std::numeric_limits<int32_t>::max());
- /** Static function to check if given info will lead to a valid configuration of @ref CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPoint
- *
- * @param[in] input Input tensor. It is the output of @ref CLGEMMLowpMatrixMultiplyCore function. Data type supported: S32
- * @param[in] bias Biases tensor. Only shared biases supported and it can be a nullptr if the addition of biases is not required.
- * Biases are 1D tensor with dimensions [OFM]. Data type supported: Same as @p input.
- * @param[in] output Output tensor. Data type supported: QASYMM8
- * @param[in] min (Optional) Min value used to saturate down the output result before converting back to QASYMM8. Defaults to the minimum possible 32-bit signed integer.
- * @param[in] max (Optional) Max value used to saturate up the output result before converting back to QASYMM8,
- * Along with @p min, this value can be used to implement "rectified linear unit" activation functions. Defaults to the maximum possible 32-bit signed integer.
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output, int min = std::numeric_limits<int32_t>::lowest(), int max = std::numeric_limits<int32_t>::max());
-};
-
-/** Basic function to execute CLGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPoint on OpenCL.
- *
- * CLGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPoint depends on 3 parameters:
- *
- * result_fixedpoint_multiplier, result_shift, result_offset_after_shift
- *
- * The final result is:
- *
- * (FixedPointMul(input[i][k], result_fixedpoint_multiplier) >> result_shift) + result_offset_after_shift
- *
- * where FixedPointMul(x, y) is the nearest integer to the following
- * mathematical expression, evaluated without overflow or intermediate rounding:
- *
- * (x * y) / 2^31
- *
- * For more information: https://github.com/google/gemmlowp/blob/master/public/output_stages.h#L68
- *
- * In case the bias tensor is provided, the final result is:
- *
- * ((FixedPointMul(input[i][k] + bias[k], result_fixedpoint_multiplier)) >> result_shift) + result_offset_after_shift
- *
- * This function calls the following OpenCL kernels:
- *
- * -# @ref CLGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel
- *
- * @note The function accepts also 2 optional input arguments (min and max) which can be used to implement "rectified linear unit" activation functions
- * after the result is shifted right by result_shift
-*/
-class CLGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPoint : public ICLSimpleFunction
-{
-public:
- /** Initialise the kernel's inputs, output
- *
- * @param[in] input Input tensor. Data type supported: S32
- * @param[in] bias Biases tensor. Only shared biases supported and it can be a nullptr if the biases addition is not required.
- * Biases are 1D tensor with dimensions [OFM]. Data type supported: Same as @p input.
- * @param[out] output Output tensor. Data type supported: QASYMM8_SIGNED
- * @param[in] result_fixedpoint_multiplier Fixed point value to be multiplied to each element of the input matrix when once the result_offset has been add
- * @param[in] result_shift Number of bits to shift right the result after the fixed point multiplication
- * @param[in] result_offset_after_shift Offset to be applied to result before converting it back to QASYMM8_SIGNED
- * @param[in] min (Optional) Min value used to saturate down the output result before converting back to QASYMM8_SIGNED. Defaults to the minimum possible 32-bit signed integer.
- * @param[in] max (Optional) Max value used to saturate up the output result before converting back to QASYMM8_SIGNED. Defaults to 0
- * Along with @p min, this value can be used to implement "rectified linear unit" activation functions. Defaults to the maximum possible 32-bit signed integer.
- */
- void configure(const ICLTensor *input, const ICLTensor *bias, ICLTensor *output, int result_fixedpoint_multiplier, int result_shift, int result_offset_after_shift,
- int min = std::numeric_limits<int32_t>::lowest(), int max = std::numeric_limits<int32_t>::max());
- /** Initialise the kernel's inputs, output
- *
- * @param[in] compile_context The compile context to be used.
- * @param[in] input Input tensor. Data type supported: S32
- * @param[in] bias Biases tensor. Only shared biases supported and it can be a nullptr if the biases addition is not required.
- * Biases are 1D tensor with dimensions [OFM]. Data type supported: Same as @p input.
- * @param[out] output Output tensor. Data type supported: QASYMM8_SIGNED
- * @param[in] result_fixedpoint_multiplier Fixed point value to be multiplied to each element of the input matrix when once the result_offset has been add
- * @param[in] result_shift Number of bits to shift right the result after the fixed point multiplication
- * @param[in] result_offset_after_shift Offset to be applied to result before converting it back to QASYMM8_SIGNED
- * @param[in] min (Optional) Min value used to saturate down the output result before converting back to QASYMM8_SIGNED. Defaults to the minimum possible 32-bit signed integer.
- * @param[in] max (Optional) Max value used to saturate up the output result before converting back to QASYMM8_SIGNED. Defaults to 0
- * Along with @p min, this value can be used to implement "rectified linear unit" activation functions. Defaults to the maximum possible 32-bit signed integer.
- */
- void configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *bias, ICLTensor *output, int result_fixedpoint_multiplier, int result_shift,
- int result_offset_after_shift,
- int min = std::numeric_limits<int32_t>::lowest(), int max = std::numeric_limits<int32_t>::max());
- /** Static function to check if given info will lead to a valid configuration of @ref CLGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPoint
- *
- * @param[in] input Input tensor. It is the output of @ref CLGEMMLowpMatrixMultiplyCore function. Data type supported: S32
- * @param[in] bias Biases tensor. Only shared biases supported and it can be a nullptr if the addition of biases is not required.
- * Biases are 1D tensor with dimensions [OFM]. Data type supported: Same as @p input.
- * @param[in] output Output tensor. Data type supported: QASYMM8_SIGNED
- * @param[in] min (Optional) Min value used to saturate down the output result before converting back to QASYMM8_SIGNED. Defaults to the minimum possible 32-bit signed integer.
- * @param[in] max (Optional) Max value used to saturate up the output result before converting back to QASYMM8_SIGNED. Defaults to 0
- * Along with @p min, this value can be used to implement "rectified linear unit" activation functions. Defaults to the maximum possible 32-bit signed integer.
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output, int min = std::numeric_limits<int32_t>::lowest(), int max = std::numeric_limits<int32_t>::max());
-};
-
-/** Basic function to execute CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFloat on OpenCL.
- *
- * This function calls the following OpenCL kernels:
- *
- * -# @ref CLGEMMLowpQuantizeDownInt32ScaleByFloatKernel
- *
- * @note The function accepts also 2 optional input arguments (min and max) which can be used to implement "rectified linear unit" activation functions
- * after the result is shifted right by result_shift
-*/
-class CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFloat : public ICLSimpleFunction
-{
-public:
- /** Initialise the kernel's inputs, output
- *
- * @param[in] input Input tensor. Data type supported: S32
- * @param[in] bias Biases tensor. Only shared biases supported and it can be a nullptr if the biases addition is not required.
- * Biases are 1D tensor with dimensions [OFM]. Data type supported: Same as @p input.
- * @param[out] output Output tensor. Data type supported: QASYMM8
- * @param[in] multiplier Float multiplier to be multiplied to each element of the input matrix
- * @param[in] offset Offset to be applied to result before converting it back to QASYMM8
- * @param[in] min (Optional) Min value used to saturate down the output result before converting back to QASYMM8. Defaults to the minimum possible 32-bit signed integer.
- * @param[in] max (Optional) Max value used to saturate up the output result before converting back to QASYMM8,
- * Along with @p min, this value can be used to implement "rectified linear unit" activation functions. Defaults to the maximum possible 32-bit signed integer.
- */
- ARM_COMPUTE_DEPRECATED_REL(20.05)
- void configure(const ICLTensor *input, const ICLTensor *bias, ICLTensor *output, float multiplier, int offset, int min = std::numeric_limits<int32_t>::lowest(),
- int max = std::numeric_limits<int32_t>::max());
- /** Initialise the kernel's inputs, output
- *
- * @param[in] compile_context The compile context to be used.
- * @param[in] input Input tensor. Data type supported: S32
- * @param[in] bias Biases tensor. Only shared biases supported and it can be a nullptr if the biases addition is not required.
- * Biases are 1D tensor with dimensions [OFM]. Data type supported: Same as @p input.
- * @param[out] output Output tensor. Data type supported: QASYMM8
- * @param[in] multiplier Float multiplier to be multiplied to each element of the input matrix
- * @param[in] offset Offset to be applied to result before converting it back to QASYMM8
- * @param[in] min (Optional) Min value used to saturate down the output result before converting back to QASYMM8. Defaults to the minimum possible 32-bit signed integer.
- * @param[in] max (Optional) Max value used to saturate up the output result before converting back to QASYMM8,
- * Along with @p min, this value can be used to implement "rectified linear unit" activation functions. Defaults to the maximum possible 32-bit signed integer.
- */
- ARM_COMPUTE_DEPRECATED_REL(20.05)
- void configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *bias, ICLTensor *output, float multiplier, int offset,
- int min = std::numeric_limits<int32_t>::lowest(),
- int max = std::numeric_limits<int32_t>::max());
- /** Static function to check if given info will lead to a valid configuration of @ref CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPoint
- *
- * @param[in] input Input tensor. It is the output of @ref CLGEMMLowpMatrixMultiplyCore function. Data type supported: S32
- * @param[in] bias Biases tensor. Only shared biases supported and it can be a nullptr if the addition of biases is not required.
- * Biases are 1D tensor with dimensions [OFM]. Data type supported: Same as @p input.
- * @param[in] output Output tensor. Data type supported: QASYMM8
- * @param[in] min (Optional) Min value used to saturate down the output result before converting back to QASYMM8. Defaults to the minimum possible 32-bit signed integer.
- * @param[in] max (Optional) Max value used to saturate up the output result before converting back to QASYMM8,
- * Along with @p min, this value can be used to implement "rectified linear unit" activation functions. Defaults to the maximum possible 32-bit signed integer.
- *
- * @return a status
- */
- ARM_COMPUTE_DEPRECATED_REL(20.05)
- static Status validate(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output, int min = std::numeric_limits<int32_t>::lowest(), int max = std::numeric_limits<int32_t>::max());
-};
-/** Basic function to execute CLGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPoint on OpenCL.
- *
- * CLGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPoint depends on 2 parameters:
- *
- * result_fixedpoint_multiplier, result_shift
- *
- * The final result is:
- *
- * (FixedPointMul(input[i][k], result_fixedpoint_multiplier) >> result_shift)
- *
- * where FixedPointMul(x, y) is the nearest integer to the following
- * mathematical expression, evaluated without overflow or intermediate rounding:
- *
- * (x * y) / 2^31
- *
- * For more information: https://github.com/google/gemmlowp/blob/master/public/output_stages.h#L68
- *
- * In case the bias tensor is provided, the final result is:
- *
- * ((FixedPointMul(input[i][k] + bias[k], result_fixedpoint_multiplier)) >> result_shift) + result_offset_after_shift
- *
- * This function calls the following NEON kernels:
- *
- * -# @ref CLGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel
- *
- * @note The function accepts also 2 optional input arguments (min and max) which can be used to implement "rectified linear unit" activation functions
- * after the result is shifted right by result_shift
-*/
-class CLGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPoint : public ICLSimpleFunction
-{
-public:
- /** Initialise the kernel's inputs, output
- *
- * @param[in] input Input tensor. Data type supported: S32
- * @param[in] bias Biases tensor. Only shared biases supported and it can be a nullptr if the biases addition is not required.
- * Biases are 1D tensor with dimensions [OFM]. Data type supported: Same as @p input.
- * @param[out] output Output tensor. Data type supported: QSYMM16
- * @param[in] result_fixedpoint_multiplier Fixed point value to be multiplied to each element of the input matrix when once the result_offset has been add
- * @param[in] result_shift Number of bits to shift right the result after the fixed point multiplication
- * @param[in] min (Optional) Min value used to saturate down the output result before converting back to QSYMM16. Defaults to the minimum possible 32-bit signed integer.
- * @param[in] max (Optional) Max value used to saturate up the output result before converting back to QSYMM16.
- * Along with @p min, this value can be used to implement "rectified linear unit" activation functions. Defaults to the maximum possible 32-bit signed integer.
- */
- void configure(const ICLTensor *input, const ICLTensor *bias, ICLTensor *output, int result_fixedpoint_multiplier, int result_shift, int min = std::numeric_limits<int32_t>::lowest(),
- int max = std::numeric_limits<int32_t>::max());
- /** Initialise the kernel's inputs, output
- *
- * @param[in] compile_context The compile context to be used.
- * @param[in] input Input tensor. Data type supported: S32
- * @param[in] bias Biases tensor. Only shared biases supported and it can be a nullptr if the biases addition is not required.
- * Biases are 1D tensor with dimensions [OFM]. Data type supported: Same as @p input.
- * @param[out] output Output tensor. Data type supported: QSYMM16
- * @param[in] result_fixedpoint_multiplier Fixed point value to be multiplied to each element of the input matrix when once the result_offset has been add
- * @param[in] result_shift Number of bits to shift right the result after the fixed point multiplication
- * @param[in] min (Optional) Min value used to saturate down the output result before converting back to QSYMM16. Defaults to the minimum possible 32-bit signed integer.
- * @param[in] max (Optional) Max value used to saturate up the output result before converting back to QSYMM16.
- * Along with @p min, this value can be used to implement "rectified linear unit" activation functions. Defaults to the maximum possible 32-bit signed integer.
- */
- void configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *bias, ICLTensor *output, int result_fixedpoint_multiplier, int result_shift,
- int min = std::numeric_limits<int32_t>::lowest(), int max = std::numeric_limits<int32_t>::max());
- /** Static function to check if given info will lead to a valid configuration of @ref CLGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPoint
- *
- * @param[in] input Input tensor info. It is the output of @ref CLGEMMLowpMatrixMultiplyCore function. Data type supported: S32
- * @param[in] bias Biases tensor info. Only shared biases supported and it can be a nullptr if the addition of biases is not required.
- * Biases are 1D tensor with dimensions [OFM]. Data type supported: Same as @p input.
- * @param[in] output Output tensor info. Data type supported: QSYMM16
- * @param[in] min (Optional) Min value used to saturate down the output result before converting back to QSYMM16. Defaults to the minimum possible 32-bit signed integer.
- * @param[in] max (Optional) Max value used to saturate up the output result before converting back to QSYMM16,
- * Along with @p min, this value can be used to implement "rectified linear unit" activation functions. Defaults to the maximum possible 32-bit signed integer.
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output, int min = std::numeric_limits<int32_t>::lowest(), int max = std::numeric_limits<int32_t>::max());
-};
/** Basic function to execute GEMMLowpQuantizeDown kernels on CL.
*
* This function calls the following CL kernels:
*
- * -# @ref CLGEMMLowpQuantizeDownInt32ScaleKernel
- * -# @ref CLGEMMLowpQuantizeDownInt32ScaleByFloatKernel
- * -# @ref CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel
- * -# @ref CLGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel
- * -# @ref CLGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel
+ * -# @ref opencl::kernels::ClGemmLowpQuantizeDownInt32ScaleKernel
+ * -# @ref opencl::kernels::ClGemmLowpQuantizeDownInt32ScaleByFloatKernel
+ * -# @ref opencl::kernels::ClGemmLowpQuantizeDownInt32ScaleByFixedPointKernel
*/
-class CLGEMMLowpOutputStage : public ICLSimpleFunction
+class CLGEMMLowpOutputStage : public IFunction
{
public:
+ CLGEMMLowpOutputStage();
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ CLGEMMLowpOutputStage(const CLGEMMLowpOutputStage &) = delete;
+ /** Default move constructor */
+ CLGEMMLowpOutputStage(CLGEMMLowpOutputStage &&);
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ CLGEMMLowpOutputStage &operator=(const CLGEMMLowpOutputStage &) = delete;
+ /** Default move assignment operator */
+ CLGEMMLowpOutputStage &operator=(CLGEMMLowpOutputStage &&);
+ /** Default destructor */
+ ~CLGEMMLowpOutputStage();
/** Initialise the kernel's inputs, output
*
+ * Valid data layouts:
+ * - All
+ *
+ * Valid data type configurations:
+ * |src0 |src1 |dst |
+ * |:--------------|:-------------|:-------------|
+ * |S32 |S32 |QASYMM8 |
+ * |S32 |S32 |QASYMM8_SIGNED|
+ * |S32 |S32 |QSYMM16 |
+ *
* @param[in] input Input tensor. Data type supported: S32
* @param[in] bias Biases tensor. Only shared biases supported and it can be a nullptr if the biases addition is not required.
* Biases are 1D tensor with dimensions [OFM]. Data type supported: Same as @p input.
- * @param[out] output Output tensor. Data type supported: QASYMM8/QASYMM8_SIGNED
+ * @param[out] output Output tensor. Data type supported: QASYMM8/QASYMM8_SIGNED/QSYMM16
* @param[in] info GEMMLowp output stage metadata.
*/
- void configure(const ICLTensor *input, const ICLTensor *bias, ICLTensor *output, const GEMMLowpOutputStageInfo &info);
+ void
+ configure(const ICLTensor *input, const ICLTensor *bias, ICLTensor *output, const GEMMLowpOutputStageInfo &info);
/** Initialise the kernel's inputs, output
*
* @param[in] compile_context The compile context to be used.
@@ -428,8 +96,12 @@ public:
* @param[out] output Output tensor. Data type supported: QASYMM8/QASYMM8_SIGNED
* @param[in] info GEMMLowp output stage metadata.
*/
- void configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *bias, ICLTensor *output, const GEMMLowpOutputStageInfo &info);
- /** Static function to check if given info will lead to a valid configuration of @ref CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel
+ void configure(const CLCompileContext &compile_context,
+ const ICLTensor *input,
+ const ICLTensor *bias,
+ ICLTensor *output,
+ const GEMMLowpOutputStageInfo &info);
+ /** Static function to check if given info will lead to a valid configuration of @ref opencl::kernels::ClGemmLowpQuantizeDownInt32ScaleByFixedPointKernel
*
* @param[in] input Input tensor. It is the output of @ref CLGEMMLowpMatrixMultiplyCore function. Data type supported: S32
* @param[in] bias Biases tensor. Only shared biases supported and it can be a nullptr if the addition of biases is not required.
@@ -439,7 +111,17 @@ public:
*
* @return a status
*/
- static Status validate(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output, const GEMMLowpOutputStageInfo &info);
+ static Status validate(const ITensorInfo *input,
+ const ITensorInfo *bias,
+ const ITensorInfo *output,
+ const GEMMLowpOutputStageInfo &info);
+
+ // Inherited methods overridden:
+ void run() override;
+
+private:
+ struct Impl;
+ std::unique_ptr<Impl> _impl;
};
} // namespace arm_compute
-#endif /*ARM_COMPUTE_CLGEMMLOWPOUTPUTSTAGE_H */
\ No newline at end of file
+#endif /*ARM_COMPUTE_CLGEMMLOWPOUTPUTSTAGE_H */
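For reference, a minimal usage sketch of the reworked run()-based interface (illustrative only; the tensor shapes and requantization parameters are assumptions, not part of this patch):

// Requantize the S32 accumulators of a GEMMLowp matrix multiplication down to QASYMM8.
#include "arm_compute/core/Error.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
#include "arm_compute/runtime/CL/CLTensor.h"
#include "arm_compute/runtime/CL/functions/CLGEMMLowpOutputStage.h"

using namespace arm_compute;

void requantize_sketch()
{
    CLScheduler::get().default_init();

    CLTensor acc, bias, dst;
    acc.allocator()->init(TensorInfo(TensorShape(32U, 16U), 1, DataType::S32));
    bias.allocator()->init(TensorInfo(TensorShape(32U), 1, DataType::S32));
    dst.allocator()->init(TensorInfo(TensorShape(32U, 16U), 1, DataType::QASYMM8));

    GEMMLowpOutputStageInfo info{};
    info.type                = GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT;
    info.gemmlowp_multiplier = 1073741824; // assumed fixed-point multiplier
    info.gemmlowp_shift      = 3;          // assumed result shift
    info.gemmlowp_offset     = 10;         // assumed output zero point
    info.gemmlowp_min_bound  = 0;
    info.gemmlowp_max_bound  = 255;
    info.output_data_type    = DataType::QASYMM8;

    CLGEMMLowpOutputStage stage;
    ARM_COMPUTE_ERROR_THROW_ON(CLGEMMLowpOutputStage::validate(acc.info(), bias.info(), dst.info(), info));
    stage.configure(&acc, &bias, &dst, info);

    acc.allocator()->allocate();
    bias.allocator()->allocate();
    dst.allocator()->allocate();

    stage.run(); // dispatches the selected ClGemmLowpQuantizeDown* kernel
}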
diff --git a/arm_compute/runtime/CL/functions/CLGather.h b/arm_compute/runtime/CL/functions/CLGather.h
index dcd9efc6e0..360c8757b6 100644
--- a/arm_compute/runtime/CL/functions/CLGather.h
+++ b/arm_compute/runtime/CL/functions/CLGather.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018-2020 ARM Limited.
+ * Copyright (c) 2018-2021, 2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -25,11 +25,14 @@
#ifndef ARM_COMPUTE_CLGATHER_H
#define ARM_COMPUTE_CLGATHER_H
+#include "arm_compute/core/Error.h"
#include "arm_compute/runtime/CL/ICLSimpleFunction.h"
namespace arm_compute
{
+class CLCompileContext;
class ICLTensor;
+class ITensorInfo;
/** Basic function to run @ref CLGatherKernel */
class CLGather : public ICLSimpleFunction
@@ -37,8 +40,16 @@ class CLGather : public ICLSimpleFunction
public:
/** Initialise the kernel's inputs and outputs
*
+ * Valid data layouts:
+ * - All
+ *
+ * Valid data type configurations:
+ * |src |dst |
+ * |:--------------|:--------------|
+ * |All |All |
+ *
* @param[in] input Source tensor. Supported tensor rank: up to 4. Data type supported: All.
- * @param[in] indices Indices tensor. Supported tensor rank: up to 1. Must be one of the following types: U32/S32. Each value must be in range [0, input.shape[@p axis])
+ * @param[in] indices Indices tensor. Supported tensor rank: up to 1. Must be one of the following types: U32/S32. Each value must be in range [0, input.shape[@p axis]), otherwise the result will become unpredictable.
* @param[out] output Destination tensor. Data type supported: Same as @p input
* @param[in] axis (Optional) The axis in @p input to gather @p indices from. Defaults to 0
*/
@@ -47,22 +58,27 @@ public:
*
* @param[in] compile_context The compile context to be used.
* @param[in] input Source tensor. Supported tensor rank: up to 4. Data type supported: All.
- * @param[in] indices Indices tensor. Supported tensor rank: up to 1. Must be one of the following types: U32/S32. Each value must be in range [0, input.shape[@p axis])
+ * @param[in] indices Indices tensor. Supported tensor rank: up to 1. Must be one of the following types: U32/S32. Each value must be in range [0, input.shape[@p axis]), otherwise the result will become unpredictable.
* @param[out] output Destination tensor. Data type supported: Same as @p input
* @param[in] axis (Optional) The axis in @p input to gather @p indices from. Defaults to 0
*/
- void configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *indices, ICLTensor *output, int axis = 0);
+ void configure(const CLCompileContext &compile_context,
+ const ICLTensor *input,
+ const ICLTensor *indices,
+ ICLTensor *output,
+ int axis = 0);
/** Static function to check if given info will lead to a valid configuration of @ref CLGatherKernel
*
* @param[in] input Source tensor info. Supported tensor rank: up to 4. Data type supported: All.
- * @param[in] indices Indices tensor info. Supported tensor rank: up to 4. Must be one of the following types: U32/S32. Each value must be in range [0, input.shape[@p axis])
+ * @param[in] indices Indices tensor info. Supported tensor rank: up to 4. Must be one of the following types: U32/S32. Each value must be in range [0, input.shape[@p axis]), otherwise the result will become unpredictable.
* @param[in] output Destination tensor info. Data type supported: Same as @p input
* @param[in] axis (Optional) The axis in @p input to gather @p indices from. Defaults to 0
*
* @return a status
*/
- static Status validate(const ITensorInfo *input, const ITensorInfo *indices, const ITensorInfo *output, int axis = 0);
+ static Status
+ validate(const ITensorInfo *input, const ITensorInfo *indices, const ITensorInfo *output, int axis = 0);
};
} // namespace arm_compute
#endif /* ARM_COMPUTE_CLGATHER_H */
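A minimal usage sketch of the updated interface (illustrative only; the shapes, axis and data types below are assumptions, not part of this patch):

// Gather 4 of the 100 rows of a 2D tensor along axis 1 using U32 indices.
#include "arm_compute/core/Error.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
#include "arm_compute/runtime/CL/CLTensor.h"
#include "arm_compute/runtime/CL/functions/CLGather.h"

using namespace arm_compute;

void gather_sketch()
{
    CLScheduler::get().default_init();

    CLTensor src, indices, dst;
    src.allocator()->init(TensorInfo(TensorShape(8U, 100U), 1, DataType::F32));
    indices.allocator()->init(TensorInfo(TensorShape(4U), 1, DataType::U32));
    dst.allocator()->init(TensorInfo(TensorShape(8U, 4U), 1, DataType::F32));

    CLGather gather;
    ARM_COMPUTE_ERROR_THROW_ON(CLGather::validate(src.info(), indices.info(), dst.info(), 1));
    gather.configure(&src, &indices, &dst, 1); // each index value must be in [0, 100)

    src.allocator()->allocate();
    indices.allocator()->allocate();
    dst.allocator()->allocate();

    gather.run();
}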
diff --git a/arm_compute/runtime/CL/functions/CLGaussian3x3.h b/arm_compute/runtime/CL/functions/CLGaussian3x3.h
deleted file mode 100644
index f1906cde92..0000000000
--- a/arm_compute/runtime/CL/functions/CLGaussian3x3.h
+++ /dev/null
@@ -1,64 +0,0 @@
-/*
- * Copyright (c) 2016-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CLGAUSSIAN3X3_H
-#define ARM_COMPUTE_CLGAUSSIAN3X3_H
-
-#include "arm_compute/core/Types.h"
-#include "arm_compute/runtime/CL/ICLSimpleFunction.h"
-
-#include <cstdint>
-
-namespace arm_compute
-{
-class ICLTensor;
-
-/** Basic function to execute gaussian filter 3x3. This function calls the following OpenCL kernels:
- *
- * -# @ref CLFillBorderKernel (executed if border_mode == CONSTANT or border_mode == REPLICATE)
- * -# @ref CLGaussian3x3Kernel
- *
- */
-class CLGaussian3x3 : public ICLSimpleFunction
-{
-public:
- /** Initialise the function's source, destinations and border mode.
- *
- * @param[in,out] input Source tensor. Data types supported: U8. (Written to only for @p border_mode != UNDEFINED)
- * @param[out] output Destination tensor, Data types supported: U8.
- * @param[in] border_mode Border mode to use for the convolution.
- * @param[in] constant_border_value (Optional) Constant value to use for borders if border_mode is set to CONSTANT.
- */
- void configure(ICLTensor *input, ICLTensor *output, BorderMode border_mode, uint8_t constant_border_value = 0);
- /** Initialise the function's source, destinations and border mode.
- *
- * @param[in] compile_context The compile context to be used.
- * @param[in,out] input Source tensor. Data types supported: U8. (Written to only for @p border_mode != UNDEFINED)
- * @param[out] output Destination tensor, Data types supported: U8.
- * @param[in] border_mode Border mode to use for the convolution.
- * @param[in] constant_border_value (Optional) Constant value to use for borders if border_mode is set to CONSTANT.
- */
- void configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, BorderMode border_mode, uint8_t constant_border_value = 0);
-};
-}
-#endif /*ARM_COMPUTE_CLGAUSSIAN3X3_H */
diff --git a/arm_compute/runtime/CL/functions/CLGaussian5x5.h b/arm_compute/runtime/CL/functions/CLGaussian5x5.h
deleted file mode 100644
index d4ed772342..0000000000
--- a/arm_compute/runtime/CL/functions/CLGaussian5x5.h
+++ /dev/null
@@ -1,86 +0,0 @@
-/*
- * Copyright (c) 2016-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CLGAUSSIAN5X5_H
-#define ARM_COMPUTE_CLGAUSSIAN5X5_H
-
-#include "arm_compute/core/CL/kernels/CLFillBorderKernel.h"
-#include "arm_compute/core/CL/kernels/CLGaussian5x5Kernel.h"
-#include "arm_compute/core/Types.h"
-#include "arm_compute/runtime/CL/CLTensor.h"
-#include "arm_compute/runtime/IFunction.h"
-#include "arm_compute/runtime/IMemoryManager.h"
-#include "arm_compute/runtime/MemoryGroup.h"
-
-#include <cstdint>
-#include <memory>
-
-namespace arm_compute
-{
-class ICLTensor;
-
-/** Basic function to execute gaussian filter 5x5. This function calls the following OpenCL kernels:
- *
- * -# @ref CLFillBorderKernel (executed if border_mode == CONSTANT or border_mode == REPLICATE)
- * -# @ref CLGaussian5x5HorKernel
- * -# @ref CLGaussian5x5VertKernel
- *
- */
-class CLGaussian5x5 : public IFunction
-{
-public:
- /** Default Constructor.
- *
- * @param[in] memory_manager (Optional) Memory manager.
- */
- CLGaussian5x5(std::shared_ptr<IMemoryManager> memory_manager = nullptr);
- /** Initialise the function's source, destinations and border mode.
- *
- * @param[in,out] input Source tensor. Data types supported: U8. (Written to only for @p border_mode != UNDEFINED)
- * @param[out] output Destination tensor, Data types supported: U8.
- * @param[in] border_mode Border mode to use for the convolution.
- * @param[in] constant_border_value (Optional) Constant value to use for borders if border_mode is set to CONSTANT.
- */
- void configure(ICLTensor *input, ICLTensor *output, BorderMode border_mode, uint8_t constant_border_value = 0);
- /** Initialise the function's source, destinations and border mode.
- *
- * @param[in] compile_context The compile context to be used.
- * @param[in,out] input Source tensor. Data types supported: U8. (Written to only for @p border_mode != UNDEFINED)
- * @param[out] output Destination tensor, Data types supported: U8.
- * @param[in] border_mode Border mode to use for the convolution.
- * @param[in] constant_border_value (Optional) Constant value to use for borders if border_mode is set to CONSTANT.
- */
- void configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, BorderMode border_mode, uint8_t constant_border_value = 0);
-
- // Inherited methods overridden:
- void run() override;
-
-protected:
- MemoryGroup _memory_group; /**< Function's memory group */
- CLGaussian5x5HorKernel _kernel_hor; /**< Horizontal pass kernel */
- CLGaussian5x5VertKernel _kernel_vert; /**< Vertical pass kernel */
- CLFillBorderKernel _border_handler; /**< Kernel to handle image borders */
- CLImage _tmp; /**< Temporary buffer */
-};
-}
-#endif /*ARM_COMPUTE_CLGAUSSIAN5X5_H */
diff --git a/arm_compute/runtime/CL/functions/CLGaussianPyramid.h b/arm_compute/runtime/CL/functions/CLGaussianPyramid.h
deleted file mode 100644
index a75a4d1028..0000000000
--- a/arm_compute/runtime/CL/functions/CLGaussianPyramid.h
+++ /dev/null
@@ -1,131 +0,0 @@
-/*
- * Copyright (c) 2017-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CLGAUSSIANPYRAMID_H
-#define ARM_COMPUTE_CLGAUSSIANPYRAMID_H
-
-#include "arm_compute/core/CL/kernels/CLGaussianPyramidKernel.h"
-
-#include "arm_compute/core/CL/kernels/CLScaleKernel.h"
-#include "arm_compute/core/IPyramid.h"
-#include "arm_compute/core/Types.h"
-#include "arm_compute/runtime/CL/CLPyramid.h"
-#include "arm_compute/runtime/CL/functions/CLGaussian5x5.h"
-#include "arm_compute/runtime/IFunction.h"
-
-#include <cstdint>
-#include <memory>
-
-namespace arm_compute
-{
-class ICLTensor;
-
-/** Common interface for all Gaussian pyramid functions */
-class CLGaussianPyramid : public IFunction
-{
-public:
- /** Constructor */
- CLGaussianPyramid();
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- CLGaussianPyramid(const CLGaussianPyramid &) = delete;
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- CLGaussianPyramid &operator=(const CLGaussianPyramid &) = delete;
- /** Allow instances of this class to be moved */
- CLGaussianPyramid(CLGaussianPyramid &&) = default;
- /** Allow instances of this class to be moved */
- CLGaussianPyramid &operator=(CLGaussianPyramid &&) = default;
- /** Default destructor */
- virtual ~CLGaussianPyramid() = default;
- /** Initialise the function's source, destinations and border mode.
- *
- * @param[in, out] input Source tensor. Data types supported: U8. (Written to only for @p border_mode != UNDEFINED)
- * @param[out] pyramid Destination pyramid tensors, Data types supported at each level: U8.
- * @param[in] border_mode Border mode to use.
- * @param[in] constant_border_value (Optional) Constant value to use for borders if border_mode is set to CONSTANT.
- *
- */
- virtual void configure(ICLTensor *input, CLPyramid *pyramid, BorderMode border_mode, uint8_t constant_border_value = 0) = 0;
- /** Initialise the function's source, destinations and border mode.
- *
- * @param[in] compile_context The compile context to be used.
- * @param[in, out] input Source tensor. Data types supported: U8. (Written to only for @p border_mode != UNDEFINED)
- * @param[out] pyramid Destination pyramid tensors, Data types supported at each level: U8.
- * @param[in] border_mode Border mode to use.
- * @param[in] constant_border_value (Optional) Constant value to use for borders if border_mode is set to CONSTANT.
- *
- */
- virtual void configure(const CLCompileContext &compile_context, ICLTensor *input, CLPyramid *pyramid, BorderMode border_mode, uint8_t constant_border_value = 0) = 0;
-
-protected:
- ICLTensor *_input;
- CLPyramid *_pyramid;
- CLPyramid _tmp;
-};
-
-/** Basic function to execute gaussian pyramid with HALF scale factor. This function calls the following OpenCL kernels:
- *
- * -# @ref CLFillBorderKernel (executed if border_mode == CONSTANT or border_mode == REPLICATE)
- * -# @ref CLGaussianPyramidHorKernel
- * -# @ref CLGaussianPyramidVertKernel
- */
-class CLGaussianPyramidHalf : public CLGaussianPyramid
-{
-public:
- /** Constructor */
- CLGaussianPyramidHalf();
-
- // Inherited methods overridden:
- void configure(ICLTensor *input, CLPyramid *pyramid, BorderMode border_mode, uint8_t constant_border_value) override;
- void configure(const CLCompileContext &compile_context, ICLTensor *input, CLPyramid *pyramid, BorderMode border_mode, uint8_t constant_border_value) override;
- void run() override;
-
-private:
- std::vector<CLFillBorderKernel> _horizontal_border_handler;
- std::vector<CLFillBorderKernel> _vertical_border_handler;
- std::vector<CLGaussianPyramidHorKernel> _horizontal_reduction;
- std::vector<CLGaussianPyramidVertKernel> _vertical_reduction;
-};
-
-/** Basic function to execute gaussian pyramid with ORB scale factor. This function calls the following OpenCL kernels and functions:
- *
- * -# @ref CLFillBorderKernel (executed if border_mode == CONSTANT or border_mode == REPLICATE)
- * -# @ref CLGaussian5x5
- * -# @ref CLScaleKernel
- */
-class CLGaussianPyramidOrb : public CLGaussianPyramid
-{
-public:
- /** Constructor */
- CLGaussianPyramidOrb();
-
- // Inherited methods overridden:
- void configure(ICLTensor *input, CLPyramid *pyramid, BorderMode border_mode, uint8_t constant_border_value) override;
- void configure(const CLCompileContext &compile_context, ICLTensor *input, CLPyramid *pyramid, BorderMode border_mode, uint8_t constant_border_value) override;
- void run() override;
-
-private:
- std::vector<CLGaussian5x5> _gauss5x5;
- std::vector<CLScaleKernel> _scale_nearest;
-};
-}
-#endif /*ARM_COMPUTE_CLGAUSSIANPYRAMID_H */
diff --git a/arm_compute/runtime/CL/functions/CLGenerateProposalsLayer.h b/arm_compute/runtime/CL/functions/CLGenerateProposalsLayer.h
index 91b30fabcb..3a201e79b0 100644
--- a/arm_compute/runtime/CL/functions/CLGenerateProposalsLayer.h
+++ b/arm_compute/runtime/CL/functions/CLGenerateProposalsLayer.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019-2020 ARM Limited.
+ * Copyright (c) 2019-2021 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -23,35 +23,40 @@
*/
#ifndef ARM_COMPUTE_CLGENERATEPROPOSALSLAYER_H
#define ARM_COMPUTE_CLGENERATEPROPOSALSLAYER_H
-#include "arm_compute/core/CL/kernels/CLBoundingBoxTransformKernel.h"
-#include "arm_compute/core/CL/kernels/CLDequantizationLayerKernel.h"
-#include "arm_compute/core/CL/kernels/CLGenerateProposalsLayerKernel.h"
-#include "arm_compute/core/CL/kernels/CLPadLayerKernel.h"
-#include "arm_compute/core/CL/kernels/CLPermuteKernel.h"
-#include "arm_compute/core/CL/kernels/CLQuantizationLayerKernel.h"
-#include "arm_compute/core/CL/kernels/CLReshapeLayerKernel.h"
+
#include "arm_compute/core/Types.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
#include "arm_compute/runtime/CL/CLTensor.h"
+#include "arm_compute/runtime/CL/functions/CLPermute.h"
+#include "arm_compute/runtime/CL/functions/CLReshapeLayer.h"
#include "arm_compute/runtime/CPP/CPPScheduler.h"
#include "arm_compute/runtime/CPP/functions/CPPBoxWithNonMaximaSuppressionLimit.h"
#include "arm_compute/runtime/IFunction.h"
#include "arm_compute/runtime/MemoryGroup.h"
+#include <memory>
+
namespace arm_compute
{
+class CLCompileContext;
+class CLBoundingBoxTransformKernel;
+class CLDequantizationLayer;
+class CLComputeAllAnchorsKernel;
+class CLPadLayerKernel;
+class CLQuantizationLayer;
class ICLTensor;
+class ITensorInfo;
/** Basic function to generate proposals for an RPN (Region Proposal Network)
*
* This function calls the following OpenCL kernels:
- * -# @ref CLComputeAllAnchors
+ * -# @ref CLComputeAllAnchorsKernel
* -# @ref CLPermute x 2
* -# @ref CLReshapeLayer x 2
* -# @ref CLBoundingBoxTransform
* -# @ref CLPadLayerKernel
- * -# @ref CLDequantizationLayerKernel x 2
- * -# @ref CLQuantizationLayerKernel
+ * -# @ref CLDequantizationLayer x 2
+ * -# @ref CLQuantizationLayer
* And the following CPP functions:
* -# @ref CPPBoxWithNonMaximaSuppressionLimit
*/
@@ -67,9 +72,21 @@ public:
CLGenerateProposalsLayer(const CLGenerateProposalsLayer &) = delete;
/** Prevent instances of this class from being copied (As this class contains pointers) */
CLGenerateProposalsLayer &operator=(const CLGenerateProposalsLayer &) = delete;
+ /** Default destructor */
+ ~CLGenerateProposalsLayer();
/** Set the input and output tensors.
*
+ * Valid data layouts:
+ * - All
+ *
+ * Valid data type configurations:
+ * |src0 |src1 |src2 |dst |
+ * |:--------------|:------------------|:--------|:--------------|
+ * |F16 |F16 |F16 |F16 |
+ * |F32 |F32 |F32 |F32 |
+ * |QASYMM8 |QSYMM8 |QSYMM16 |QASYMM8 |
+ *
* @param[in] scores Scores from convolution layer of size (W, H, A), where H and W are the height and width of the feature map, and A is the number of anchors.
* Data types supported: QASYMM8/F16/F32
* @param[in] deltas Bounding box deltas from convolution layer of size (W, H, 4*A). Data types supported: Same as @p scores
@@ -83,7 +100,12 @@ public:
* @note Only single image prediction is supported. Height and Width (and scale) of the image will be contained in the @ref GenerateProposalsInfo struct.
* @note Proposals contains all the proposals. Of those, only the first num_valid_proposals are valid.
*/
- void configure(const ICLTensor *scores, const ICLTensor *deltas, const ICLTensor *anchors, ICLTensor *proposals, ICLTensor *scores_out, ICLTensor *num_valid_proposals,
+ void configure(const ICLTensor *scores,
+ const ICLTensor *deltas,
+ const ICLTensor *anchors,
+ ICLTensor *proposals,
+ ICLTensor *scores_out,
+ ICLTensor *num_valid_proposals,
const GenerateProposalsInfo &info);
/** Set the input and output tensors.
*
@@ -101,8 +123,14 @@ public:
* @note Only single image prediction is supported. Height and Width (and scale) of the image will be contained in the @ref GenerateProposalsInfo struct.
* @note Proposals contains all the proposals. Of those, only the first num_valid_proposals are valid.
*/
- void configure(const CLCompileContext &compile_context, const ICLTensor *scores, const ICLTensor *deltas, const ICLTensor *anchors, ICLTensor *proposals, ICLTensor *scores_out,
- ICLTensor *num_valid_proposals, const GenerateProposalsInfo &info);
+ void configure(const CLCompileContext &compile_context,
+ const ICLTensor *scores,
+ const ICLTensor *deltas,
+ const ICLTensor *anchors,
+ ICLTensor *proposals,
+ ICLTensor *scores_out,
+ ICLTensor *num_valid_proposals,
+ const GenerateProposalsInfo &info);
/** Static function to check if given info will lead to a valid configuration of @ref CLGenerateProposalsLayer
*
@@ -118,7 +146,11 @@ public:
*
* @return a Status
*/
- static Status validate(const ITensorInfo *scores, const ITensorInfo *deltas, const ITensorInfo *anchors, const ITensorInfo *proposals, const ITensorInfo *scores_out,
+ static Status validate(const ITensorInfo *scores,
+ const ITensorInfo *deltas,
+ const ITensorInfo *anchors,
+ const ITensorInfo *proposals,
+ const ITensorInfo *scores_out,
const ITensorInfo *num_valid_proposals,
const GenerateProposalsInfo &info);
@@ -130,16 +162,16 @@ private:
MemoryGroup _memory_group;
// OpenCL kernels
- CLPermuteKernel _permute_deltas_kernel;
- CLReshapeLayerKernel _flatten_deltas_kernel;
- CLPermuteKernel _permute_scores_kernel;
- CLReshapeLayerKernel _flatten_scores_kernel;
- CLComputeAllAnchorsKernel _compute_anchors_kernel;
- CLBoundingBoxTransformKernel _bounding_box_kernel;
- CLPadLayerKernel _pad_kernel;
- CLDequantizationLayerKernel _dequantize_anchors;
- CLDequantizationLayerKernel _dequantize_deltas;
- CLQuantizationLayerKernel _quantize_all_proposals;
+ CLPermute _permute_deltas;
+ CLReshapeLayer _flatten_deltas;
+ CLPermute _permute_scores;
+ CLReshapeLayer _flatten_scores;
+ std::unique_ptr<CLComputeAllAnchorsKernel> _compute_anchors_kernel;
+ std::unique_ptr<CLBoundingBoxTransformKernel> _bounding_box_kernel;
+ std::unique_ptr<CLPadLayerKernel> _pad_kernel;
+ std::unique_ptr<CLDequantizationLayer> _dequantize_anchors;
+ std::unique_ptr<CLDequantizationLayer> _dequantize_deltas;
+ std::unique_ptr<CLQuantizationLayer> _quantize_all_proposals;
// CPP functions
CPPBoxWithNonMaximaSuppressionLimit _cpp_nms;
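A minimal usage sketch of the function after the port to operator-backed members (illustrative only; the feature-map geometry and GenerateProposalsInfo parameters are assumptions, not part of this patch):

// Generate proposals for a 16x16 F32 feature map with 3 anchors per location.
#include "arm_compute/runtime/CL/CLScheduler.h"
#include "arm_compute/runtime/CL/CLTensor.h"
#include "arm_compute/runtime/CL/functions/CLGenerateProposalsLayer.h"

using namespace arm_compute;

void generate_proposals_sketch()
{
    CLScheduler::get().default_init();

    constexpr unsigned int W = 16, H = 16, A = 3;

    CLTensor scores, deltas, anchors, proposals, scores_out, num_valid;
    scores.allocator()->init(TensorInfo(TensorShape(W, H, A), 1, DataType::F32));
    deltas.allocator()->init(TensorInfo(TensorShape(W, H, 4 * A), 1, DataType::F32));
    anchors.allocator()->init(TensorInfo(TensorShape(4U, A), 1, DataType::F32));
    proposals.allocator()->init(TensorInfo(TensorShape(5U, W * H * A), 1, DataType::F32));
    scores_out.allocator()->init(TensorInfo(TensorShape(W * H * A), 1, DataType::F32));
    num_valid.allocator()->init(TensorInfo(TensorShape(1U), 1, DataType::U32));

    // Assumed 256x256 input image, unit scale, feature stride of 16.
    const GenerateProposalsInfo info(256.f, 256.f, 1.f, 1.f / 16.f);

    CLGenerateProposalsLayer rpn;
    rpn.configure(&scores, &deltas, &anchors, &proposals, &scores_out, &num_valid, info);

    for(CLTensor *t : { &scores, &deltas, &anchors, &proposals, &scores_out, &num_valid })
    {
        t->allocator()->allocate();
    }

    rpn.run();
}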
diff --git a/arm_compute/runtime/CL/functions/CLHOGDescriptor.h b/arm_compute/runtime/CL/functions/CLHOGDescriptor.h
deleted file mode 100644
index 71280c898a..0000000000
--- a/arm_compute/runtime/CL/functions/CLHOGDescriptor.h
+++ /dev/null
@@ -1,88 +0,0 @@
-/*
- * Copyright (c) 2017-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CLHOGDESCRIPTOR_H
-#define ARM_COMPUTE_CLHOGDESCRIPTOR_H
-
-#include "arm_compute/core/CL/kernels/CLHOGDescriptorKernel.h"
-#include "arm_compute/core/Types.h"
-#include "arm_compute/runtime/CL/CLTensor.h"
-#include "arm_compute/runtime/CL/functions/CLHOGGradient.h"
-#include "arm_compute/runtime/IFunction.h"
-#include "arm_compute/runtime/IMemoryManager.h"
-#include "arm_compute/runtime/MemoryGroup.h"
-
-#include <memory>
-
-namespace arm_compute
-{
-class IHOG;
-/** Basic function to calculate HOG descriptor. This function calls the following OpenCL kernels:
- *
- * -# @ref CLHOGGradient
- * -# @ref CLHOGOrientationBinningKernel
- * -# @ref CLHOGBlockNormalizationKernel
- *
- */
-class CLHOGDescriptor : public IFunction
-{
-public:
- /** Default constructor */
- CLHOGDescriptor(std::shared_ptr<IMemoryManager> memory_manager = nullptr);
- /** Initialise the function's source, destination, HOG data-object and border mode
- *
- * @param[in, out] input Input tensor. Data type supported: U8
- * (Written to only for @p border_mode != UNDEFINED)
- * @param[out] output Output tensor which stores the HOG descriptor. DataType supported: F32. The number of channels is equal to the number of histogram bins per block
- * @param[in] hog HOG data object which describes the HOG descriptor
- * @param[in] border_mode Border mode to use.
- * @param[in] constant_border_value (Optional) Constant value to use for borders if border_mode is set to CONSTANT.
- */
- void configure(ICLTensor *input, ICLTensor *output, const IHOG *hog, BorderMode border_mode, uint8_t constant_border_value = 0);
- /** Initialise the function's source, destination, HOG data-object and border mode
- *
- * @param[in] compile_context The compile context to be used.
- * @param[in, out] input Input tensor. Data type supported: U8
- * (Written to only for @p border_mode != UNDEFINED)
- * @param[out] output Output tensor which stores the HOG descriptor. DataType supported: F32. The number of channels is equal to the number of histogram bins per block
- * @param[in] hog HOG data object which describes the HOG descriptor
- * @param[in] border_mode Border mode to use.
- * @param[in] constant_border_value (Optional) Constant value to use for borders if border_mode is set to CONSTANT.
- */
- void configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, const IHOG *hog, BorderMode border_mode, uint8_t constant_border_value = 0);
-
- // Inherited method overridden:
- void run() override;
-
-private:
- MemoryGroup _memory_group;
- CLHOGGradient _gradient;
- CLHOGOrientationBinningKernel _orient_bin;
- CLHOGBlockNormalizationKernel _block_norm;
- CLTensor _mag;
- CLTensor _phase;
- CLTensor _hog_space;
-};
-}
-
-#endif /* ARM_COMPUTE_CLHOGDESCRIPTOR_H */
diff --git a/arm_compute/runtime/CL/functions/CLHOGDetector.h b/arm_compute/runtime/CL/functions/CLHOGDetector.h
deleted file mode 100644
index c2bdc15b35..0000000000
--- a/arm_compute/runtime/CL/functions/CLHOGDetector.h
+++ /dev/null
@@ -1,94 +0,0 @@
-/*
- * Copyright (c) 2017-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CLHOGDETECTOR_H
-#define ARM_COMPUTE_CLHOGDETECTOR_H
-
-#include "arm_compute/core/CL/OpenCL.h"
-#include "arm_compute/core/CL/kernels/CLHOGDetectorKernel.h"
-#include "arm_compute/core/IHOG.h"
-#include "arm_compute/runtime/IFunction.h"
-
-namespace arm_compute
-{
-/** Basic function to execute HOG detector based on linear SVM. This function calls the following OpenCL kernel:
- *
- * -# @ref CLHOGDetectorKernel
- *
- */
-class CLHOGDetector : public IFunction
-{
-public:
- /** Default constructor */
- CLHOGDetector();
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- CLHOGDetector(const CLHOGDetector &) = delete;
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- CLHOGDetector &operator=(const CLHOGDetector &) = delete;
- /** Allow instances of this class to be moved */
- CLHOGDetector(CLHOGDetector &&) = default;
- /** Allow instances of this class to be moved */
- CLHOGDetector &operator=(CLHOGDetector &&) = default;
- /** Default destructor */
- ~CLHOGDetector() = default;
- /** Initialise the kernel's input, output, HOG data object, detection window stride, threshold and index class
- *
- * @attention The function does not reset the number of values in @ref IDetectionWindowArray so it is caller's responsibility to clear it.
- *
- * @param[in] input Input tensor. It is the output of @ref CLHOGDescriptor. Data type supported: F32
- * @param[in] hog HOG data-object that describes the HOG descriptor
- * @param[out] detection_windows Array of @ref DetectionWindow used to store the detected objects
- * @param[in] detection_window_stride Distance in pixels between 2 consecutive detection windows in x and y directions.
- * It must be multiple of the block stride stored in hog
- * @param[in] threshold (Optional) Threshold for the distance between features and SVM classifying plane
- * @param[in] idx_class (Optional) Index of the class used for evaluating which class the detection window belongs to
- */
- void configure(const ICLTensor *input, const ICLHOG *hog, ICLDetectionWindowArray *detection_windows, const Size2D &detection_window_stride, float threshold = 0.0f, size_t idx_class = 0);
- /** Initialise the kernel's input, output, HOG data object, detection window stride, threshold and index class
- *
- * @attention The function does not reset the number of values in @ref IDetectionWindowArray so it is caller's responsibility to clear it.
- *
- * @param[in] compile_context The compile context to be used.
- * @param[in] input Input tensor. It is the output of @ref CLHOGDescriptor. Data type supported: F32
- * @param[in] hog HOG data-object that describes the HOG descriptor
- * @param[out] detection_windows Array of @ref DetectionWindow used to store the detected objects
- * @param[in] detection_window_stride Distance in pixels between 2 consecutive detection windows in x and y directions.
- * It must be multiple of the block stride stored in hog
- * @param[in] threshold (Optional) Threshold for the distance between features and SVM classifying plane
- * @param[in] idx_class (Optional) Index of the class used for evaluating which class the detection window belongs to
- */
- void configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLHOG *hog, ICLDetectionWindowArray *detection_windows, const Size2D &detection_window_stride,
- float threshold = 0.0f,
- size_t idx_class = 0);
-
- // Inherited methods overridden:
- void run() override;
-
-private:
- CLHOGDetectorKernel _hog_detector_kernel;
- ICLDetectionWindowArray *_detection_windows;
- cl::Buffer _num_detection_windows;
-};
-}
-
-#endif /* ARM_COMPUTE_CLHOGDETECTOR_H */
diff --git a/arm_compute/runtime/CL/functions/CLHOGGradient.h b/arm_compute/runtime/CL/functions/CLHOGGradient.h
deleted file mode 100644
index 450a4a6045..0000000000
--- a/arm_compute/runtime/CL/functions/CLHOGGradient.h
+++ /dev/null
@@ -1,89 +0,0 @@
-/*
- * Copyright (c) 2017-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CLHOGGRADIENT_H
-#define ARM_COMPUTE_CLHOGGRADIENT_H
-
-#include "arm_compute/core/CL/ICLKernel.h"
-
-#include "arm_compute/core/CL/kernels/CLMagnitudePhaseKernel.h"
-#include "arm_compute/core/Types.h"
-#include "arm_compute/runtime/CL/CLTensor.h"
-#include "arm_compute/runtime/CL/functions/CLDerivative.h"
-#include "arm_compute/runtime/IFunction.h"
-#include "arm_compute/runtime/IMemoryManager.h"
-#include "arm_compute/runtime/MemoryGroup.h"
-
-#include <cstdint>
-#include <memory>
-
-namespace arm_compute
-{
-/** Basic function to calculate the gradient for HOG. This function calls the following OpenCL kernels:
- *
- * -# @ref CLDerivative
- * -# @ref CLMagnitudePhaseKernel
- *
- */
-class CLHOGGradient : public IFunction
-{
-public:
- /** Default constructor */
- CLHOGGradient(std::shared_ptr<IMemoryManager> memory_manager = nullptr);
- /** Initialise the function's source, destinations, phase type and border mode
- *
- * @param[in, out] input Input tensor. Data type supported: U8.
- * (Written to only for @p border_mode != UNDEFINED)
- * @param[out] output_magnitude Output tensor (magnitude). Data type supported: U16.
- * @param[out] output_phase Output tensor.(phase). Format supported: U8
- * @param[in] phase_type Type of @ref PhaseType
- * @param[in] border_mode Border mode to use
- * @param[in] constant_border_value (Optional) Constant value to use for borders if border_mode is set to CONSTANT.
- */
- void configure(ICLTensor *input, ICLTensor *output_magnitude, ICLTensor *output_phase, PhaseType phase_type, BorderMode border_mode, uint8_t constant_border_value = 0);
- /** Initialise the function's source, destinations, phase type and border mode
- *
- * @param[in] compile_context The compile context to be used.
- * @param[in, out] input Input tensor. Data type supported: U8.
- * (Written to only for @p border_mode != UNDEFINED)
- * @param[out] output_magnitude Output tensor (magnitude). Data type supported: U16.
- * @param[out] output_phase Output tensor.(phase). Format supported: U8
- * @param[in] phase_type Type of @ref PhaseType
- * @param[in] border_mode Border mode to use
- * @param[in] constant_border_value (Optional) Constant value to use for borders if border_mode is set to CONSTANT.
- */
- void configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output_magnitude, ICLTensor *output_phase, PhaseType phase_type, BorderMode border_mode,
- uint8_t constant_border_value = 0);
-
- // Inherited method overridden:
- void run() override;
-
-private:
- MemoryGroup _memory_group;
- CLDerivative _derivative;
- CLMagnitudePhaseKernel _mag_phase;
- CLTensor _gx;
- CLTensor _gy;
-};
-}
-#endif /*ARM_COMPUTE_CLHOGGRADIENT_H */
diff --git a/arm_compute/runtime/CL/functions/CLHOGMultiDetection.h b/arm_compute/runtime/CL/functions/CLHOGMultiDetection.h
deleted file mode 100644
index 3d22ff69ee..0000000000
--- a/arm_compute/runtime/CL/functions/CLHOGMultiDetection.h
+++ /dev/null
@@ -1,131 +0,0 @@
-/*
- * Copyright (c) 2017-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CLHOGMULTIDETECTION_H
-#define ARM_COMPUTE_CLHOGMULTIDETECTION_H
-
-#include "arm_compute/core/CL/ICLArray.h"
-#include "arm_compute/core/CL/ICLMultiHOG.h"
-#include "arm_compute/core/CL/kernels/CLHOGDescriptorKernel.h"
-#include "arm_compute/core/CPP/kernels/CPPDetectionWindowNonMaximaSuppressionKernel.h"
-#include "arm_compute/runtime/CL/CLTensor.h"
-#include "arm_compute/runtime/CL/functions/CLHOGDetector.h"
-#include "arm_compute/runtime/CL/functions/CLHOGGradient.h"
-#include "arm_compute/runtime/IFunction.h"
-#include "arm_compute/runtime/IMemoryManager.h"
-#include "arm_compute/runtime/MemoryGroup.h"
-
-#include <memory>
-
-namespace arm_compute
-{
-/** Basic function to detect multiple objects (or the same object at different scales) on the same input image using HOG. This function calls the following kernels:
- *
- * -# @ref CLHOGGradient
- * -# @ref CLHOGOrientationBinningKernel
- * -# @ref CLHOGBlockNormalizationKernel
- * -# @ref CLHOGDetector
- * -# @ref CPPDetectionWindowNonMaximaSuppressionKernel (executed if non_maxima_suppression == true)
- *
- * @note This implementation works if all the HOG data-objects within the IMultiHOG container have the same:
- * -# Phase type
- -# Normalization type
- -# L2 hysteresis threshold if the normalization type is L2HYS_NORM
- *
- */
-class CLHOGMultiDetection : public IFunction
-{
-public:
- /** Default constructor */
- CLHOGMultiDetection(std::shared_ptr<IMemoryManager> memory_manager = nullptr);
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- CLHOGMultiDetection(const CLHOGMultiDetection &) = delete;
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- CLHOGMultiDetection &operator=(const CLHOGMultiDetection &) = delete;
- /** Initialise the function's source, destination, detection window strides, border mode, threshold and non-maxima suppression
- *
- * @param[in, out] input Input tensor. Data type supported: U8
- * (Written to only for @p border_mode != UNDEFINED)
- * @param[in] multi_hog Container of multiple HOG data object. Each HOG data object describes one HOG model to detect.
- * This container should store the HOG data-objects in descending or ascending cell_size width order.
- * This will help to understand if the HOG descriptor computation can be skipped for some HOG data-objects
- * @param[out] detection_windows Array of @ref DetectionWindow used for locating the detected objects
- * @param[in] detection_window_strides Array of @ref Size2D used to specify the distance in pixels between 2 consecutive detection windows in x and y directions for each HOG data-object
- * The dimension of this array must be the same of multi_hog->num_models()
- * The i-th detection_window_stride of this array must be multiple of the block_stride stored in the i-th multi_hog array
- * @param[in] border_mode Border mode to use.
- * @param[in] constant_border_value (Optional) Constant value to use for borders if border_mode is set to CONSTANT.
- * @param[in] threshold (Optional) Threshold for the distance between features and SVM classifying plane
- * @param[in] non_maxima_suppression (Optional) Flag to specify whether the non-maxima suppression is required or not.
- * True if the non-maxima suppression stage has to be computed
- * @param[in] min_distance (Optional) Radial Euclidean distance to use for the non-maxima suppression stage
- *
- */
- void configure(ICLTensor *input, const ICLMultiHOG *multi_hog, ICLDetectionWindowArray *detection_windows, ICLSize2DArray *detection_window_strides, BorderMode border_mode,
- uint8_t constant_border_value = 0, float threshold = 0.0f, bool non_maxima_suppression = false, float min_distance = 1.0f);
- /** Initialise the function's source, destination, detection window strides, border mode, threshold and non-maxima suppression
- *
- * @param[in] compile_context The compile context to be used.
- * @param[in, out] input Input tensor. Data type supported: U8
- * (Written to only for @p border_mode != UNDEFINED)
- * @param[in] multi_hog Container of multiple HOG data object. Each HOG data object describes one HOG model to detect.
- * This container should store the HOG data-objects in descending or ascending cell_size width order.
- * This will help to understand if the HOG descriptor computation can be skipped for some HOG data-objects
- * @param[out] detection_windows Array of @ref DetectionWindow used for locating the detected objects
- * @param[in] detection_window_strides Array of @ref Size2D used to specify the distance in pixels between 2 consecutive detection windows in x and y directions for each HOG data-object
- * The dimension of this array must be the same of multi_hog->num_models()
- * The i-th detection_window_stride of this array must be multiple of the block_stride stored in the i-th multi_hog array
- * @param[in] border_mode Border mode to use.
- * @param[in] constant_border_value (Optional) Constant value to use for borders if border_mode is set to CONSTANT.
- * @param[in] threshold (Optional) Threshold for the distance between features and SVM classifying plane
- * @param[in] non_maxima_suppression (Optional) Flag to specify whether the non-maxima suppression is required or not.
- * True if the non-maxima suppression stage has to be computed
- * @param[in] min_distance (Optional) Radial Euclidean distance to use for the non-maxima suppression stage
- *
- */
- void configure(const CLCompileContext &compile_context, ICLTensor *input, const ICLMultiHOG *multi_hog, ICLDetectionWindowArray *detection_windows, ICLSize2DArray *detection_window_strides,
- BorderMode border_mode, uint8_t constant_border_value = 0, float threshold = 0.0f, bool non_maxima_suppression = false, float min_distance = 1.0f);
-
- // Inherited method overridden:
- void run() override;
-
-private:
- MemoryGroup _memory_group;
- CLHOGGradient _gradient_kernel;
- std::vector<CLHOGOrientationBinningKernel> _orient_bin_kernel;
- std::vector<CLHOGBlockNormalizationKernel> _block_norm_kernel;
- std::vector<CLHOGDetector> _hog_detect_kernel;
- CPPDetectionWindowNonMaximaSuppressionKernel _non_maxima_kernel;
- std::vector<CLTensor> _hog_space;
- std::vector<CLTensor> _hog_norm_space;
- ICLDetectionWindowArray *_detection_windows;
- CLTensor _mag;
- CLTensor _phase;
- bool _non_maxima_suppression;
- size_t _num_orient_bin_kernel;
- size_t _num_block_norm_kernel;
- size_t _num_hog_detect_kernel;
-};
-}
-
-#endif /* ARM_COMPUTE_CLHOGMULTIDETECTION_H */
\ No newline at end of file
diff --git a/arm_compute/runtime/CL/functions/CLHarrisCorners.h b/arm_compute/runtime/CL/functions/CLHarrisCorners.h
deleted file mode 100644
index 2d0e78b00e..0000000000
--- a/arm_compute/runtime/CL/functions/CLHarrisCorners.h
+++ /dev/null
@@ -1,124 +0,0 @@
-/*
- * Copyright (c) 2016-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CLHARRISCORNERS_H
-#define ARM_COMPUTE_CLHARRISCORNERS_H
-
-#include "arm_compute/runtime/IFunction.h"
-
-#include "arm_compute/core/CL/ICLArray.h"
-#include "arm_compute/core/CL/kernels/CLFillBorderKernel.h"
-#include "arm_compute/core/CL/kernels/CLHarrisCornersKernel.h"
-#include "arm_compute/core/NEON/kernels/NEHarrisCornersKernel.h"
-#include "arm_compute/core/Types.h"
-#include "arm_compute/runtime/CL/CLTensor.h"
-#include "arm_compute/runtime/CL/functions/CLNonMaximaSuppression3x3.h"
-#include "arm_compute/runtime/IMemoryManager.h"
-#include "arm_compute/runtime/MemoryGroup.h"
-
-#include <cstdint>
-#include <memory>
-
-namespace arm_compute
-{
-class ICLTensor;
-using ICLImage = ICLTensor;
-
-/** Basic function to execute harris corners detection. This function calls the following CL and NEON kernels and functions:
- *
- * @note Requires CPU support for the kernels: CPPCornerCandidatesKernel and CPPSortEuclideanDistanceKernel.
- *
- * -# @ref CLSobel3x3 (if gradient_size == 3) or<br/>
- * @ref CLSobel5x5 (if gradient_size == 5) or<br/>
- * @ref CLSobel7x7 (if gradient_size == 7)
- * -# @ref CLFillBorderKernel
- * -# @ref CLHarrisScoreKernel
- * -# @ref CLNonMaximaSuppression3x3
- * -# @ref CPPCornerCandidatesKernel
- * -# @ref CPPSortEuclideanDistanceKernel
- */
-class CLHarrisCorners : public IFunction
-{
-public:
- /** Constructor */
- CLHarrisCorners(std::shared_ptr<IMemoryManager> memory_manager = nullptr);
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- CLHarrisCorners(const CLHarrisCorners &) = delete;
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- const CLHarrisCorners &operator=(const CLHarrisCorners &) = delete;
- /** Initialize the function's source, destination, conv and border_mode.
- *
- * @param[in,out] input Source image. Data types supported: U8. (Written to only for @p border_mode != UNDEFINED)
- * @param[in] threshold Minimum threshold with which to eliminate Harris Corner scores (computed using the normalized Sobel kernel).
- * @param[in] min_dist Radial Euclidean distance for the euclidean distance stage.
- * @param[in] sensitivity Sensitivity threshold k from the Harris-Stephens equation
- * @param[in] gradient_size The gradient window size to use on the input. The implementation supports 3, 5, and 7
- * @param[in] block_size The block window size used to compute the Harris Corner score. The implementation supports 3, 5, and 7.
- * @param[out] corners Array of keypoints to store the results.
- * @param[in] border_mode Border mode to use
- * @param[in] constant_border_value (Optional) Constant value to use for borders if border_mode is set to CONSTANT.
- * @param[in] use_fp16 (Optional) If true the FP16 kernels will be used. If false F32 kernels are used.
- */
- void configure(ICLImage *input, float threshold, float min_dist, float sensitivity,
- int32_t gradient_size, int32_t block_size, ICLKeyPointArray *corners,
- BorderMode border_mode, uint8_t constant_border_value = 0, bool use_fp16 = false);
- /** Initialize the function's source, destination, conv and border_mode.
- *
- * @param[in] compile_context The compile context to be used.
- * @param[in,out] input Source image. Data types supported: U8. (Written to only for @p border_mode != UNDEFINED)
- * @param[in] threshold Minimum threshold with which to eliminate Harris Corner scores (computed using the normalized Sobel kernel).
- * @param[in] min_dist Radial Euclidean distance for the euclidean distance stage.
- * @param[in] sensitivity Sensitivity threshold k from the Harris-Stephens equation
- * @param[in] gradient_size The gradient window size to use on the input. The implementation supports 3, 5, and 7
- * @param[in] block_size The block window size used to compute the Harris Corner score. The implementation supports 3, 5, and 7.
- * @param[out] corners Array of keypoints to store the results.
- * @param[in] border_mode Border mode to use
- * @param[in] constant_border_value (Optional) Constant value to use for borders if border_mode is set to CONSTANT.
- * @param[in] use_fp16 (Optional) If true the FP16 kernels will be used. If false F32 kernels are used.
- */
- void configure(const CLCompileContext &compile_context, ICLImage *input, float threshold, float min_dist, float sensitivity,
- int32_t gradient_size, int32_t block_size, ICLKeyPointArray *corners,
- BorderMode border_mode, uint8_t constant_border_value = 0, bool use_fp16 = false);
-
- // Inherited methods overridden:
- void run() override;
-
-private:
- MemoryGroup _memory_group; /**< Function's memory group */
- std::unique_ptr<IFunction> _sobel; /**< Sobel function */
- CLHarrisScoreKernel _harris_score; /**< Harris score kernel */
- CLNonMaximaSuppression3x3 _non_max_suppr; /**< Non-maxima suppression function */
- CPPCornerCandidatesKernel _candidates; /**< Sort kernel */
- CPPSortEuclideanDistanceKernel _sort_euclidean; /**< Euclidean distance kernel */
- CLFillBorderKernel _border_gx; /**< Border handler before running harris score */
- CLFillBorderKernel _border_gy; /**< Border handler before running harris score */
- CLImage _gx; /**< Source image - Gx component */
- CLImage _gy; /**< Source image - Gy component */
- CLImage _score; /**< Source image - Harris score */
- CLImage _nonmax; /**< Source image - Non-Maxima suppressed image */
- std::vector<InternalKeypoint> _corners_list; /**< Array of InternalKeypoint. It stores the potential corner candidates */
- int32_t _num_corner_candidates; /**< Number of potential corner candidates */
- ICLKeyPointArray *_corners; /**< Output corners array */
-};
-}
-#endif /*ARM_COMPUTE_CLHARRISCORNERS_H */
diff --git a/arm_compute/runtime/CL/functions/CLHistogram.h b/arm_compute/runtime/CL/functions/CLHistogram.h
deleted file mode 100644
index 6d34dd7060..0000000000
--- a/arm_compute/runtime/CL/functions/CLHistogram.h
+++ /dev/null
@@ -1,74 +0,0 @@
-/*
- * Copyright (c) 2016-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CLHISTOGRAM_H
-#define ARM_COMPUTE_CLHISTOGRAM_H
-
-#include "arm_compute/core/CL/kernels/CLHistogramKernel.h"
-#include "arm_compute/runtime/IFunction.h"
-
-namespace arm_compute
-{
-class ICLDistribution1D;
-class ICLTensor;
-
-/** Basic function to execute histogram. This function calls the following OpenCL kernels:
- *
- * -# @ref CLHistogramKernel
- * -# @ref CLHistogramBorderKernel
- *
- */
-class CLHistogram : public IFunction
-{
-public:
- /*
- * @ Default constructor
- */
- CLHistogram();
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- CLHistogram(const CLHistogram &) = delete;
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- const CLHistogram &operator=(const CLHistogram &) = delete;
- /** Initialize the function
- *
- * @param[in] input Source image. Data types supported: U8
- * @param[out] output Output distribution.
- */
- void configure(const ICLImage *input, ICLDistribution1D *output);
- /** Initialize the function
- *
- * @param[in] compile_context The compile context to be used.
- * @param[in] input Source image. Data types supported: U8
- * @param[out] output Output distribution.
- */
- void configure(const CLCompileContext &compile_context, const ICLImage *input, ICLDistribution1D *output);
-
- // Inherited methods overridden:
- void run() override;
-
-private:
- CLHistogramKernel _kernel; /**< kernel to run */
- CLHistogramBorderKernel _kernel_border; /**< Border kernel to run */
-};
-}
-#endif /*ARM_COMPUTE_CLHISTOGRAM_H */
diff --git a/arm_compute/runtime/CL/functions/CLIndirectConvolutionLayer.h b/arm_compute/runtime/CL/functions/CLIndirectConvolutionLayer.h
new file mode 100644
index 0000000000..91952af5dc
--- /dev/null
+++ b/arm_compute/runtime/CL/functions/CLIndirectConvolutionLayer.h
@@ -0,0 +1,135 @@
+/*
+ * Copyright (c) 2022-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CLINDIRECTCONVOLUTIONLAYER_H
+#define ARM_COMPUTE_CLINDIRECTCONVOLUTIONLAYER_H
+
+#include "arm_compute/core/Types.h"
+#include "arm_compute/function_info/ActivationLayerInfo.h"
+#include "arm_compute/runtime/IFunction.h"
+
+#include <memory>
+
+namespace arm_compute
+{
+class CLCompileContext;
+class ICLTensor;
+class ITensorInfo;
+
+/** Basic function to run the indirect convolution.
+ */
+class CLIndirectConvolutionLayer : public IFunction
+{
+public:
+ /** Constructor */
+ CLIndirectConvolutionLayer();
+ /** Destructor */
+ ~CLIndirectConvolutionLayer();
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ CLIndirectConvolutionLayer(const CLIndirectConvolutionLayer &) = delete;
+ /** Default move constructor */
+ CLIndirectConvolutionLayer(CLIndirectConvolutionLayer &&);
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ CLIndirectConvolutionLayer &operator=(const CLIndirectConvolutionLayer &) = delete;
+ /** Default move assignment operator */
+ CLIndirectConvolutionLayer &operator=(CLIndirectConvolutionLayer &&);
+ /** Set the input and output tensors.
+ *
+ * Valid data layouts:
+ * - NHWC
+ *
+ * Valid data type configurations:
+ * |src0 |src1 |src2 |dst |
+ * |:--------------|:--------------|:------|:--------------|
+ * |F16 |F16 |F16 |F16 |
+ * |F32 |F32 |F32 |F32 |
+ *
+ * @param[in] input Source tensor. 3 lower dimensions represent a single input,
+ * while every optional dimension from 4 and above represents a batch of inputs.
+ * Data types supported: F16/F32.
+ * @param[in] weights Weights tensor. Weights are a 4D tensor. Data type supported: Same as @p input.
+ * @param[in] biases Biases tensor. Shared biases supported. Biases are a 1D tensor with dimensions [OFM].
+ * Data type supported: Should match @p input data type.
+ * @param[out] output Destination tensor. 3 lower dimensions represent a single output, while the rest represent a batch of outputs.
+ * Data types supported: Same as @p input.
+ * @param[in] conv_info Contains padding and stride information described in @ref PadStrideInfo.
+ * @param[in] act_info (Optional) Activation layer information in case of a fused activation.
+ */
+ void configure(ICLTensor *input,
+ const ICLTensor *weights,
+ const ICLTensor *biases,
+ ICLTensor *output,
+ const PadStrideInfo &conv_info,
+ const ActivationLayerInfo &act_info = ActivationLayerInfo());
+ /** Set the input and output tensors.
+ *
+ * @param[in] compile_context The compile context to be used.
+ * @param[in] input Source tensor. 3 lower dimensions represent a single input,
+ * while every optional dimension from 4 and above represents a batch of inputs.
+ * Data types supported: F16/F32.
+ * @param[in] weights Weights tensor. Weights are a 4D tensor. Data type supported: Same as @p input.
+ * @param[in] biases Biases tensor. Shared biases supported. Biases are a 1D tensor with dimensions [OFM].
+ * Data type supported: Should match @p input data type.
+ * @param[out] output Destination tensor. 3 lower dimensions represent a single output, while the rest represent a batch of outputs.
+ * Data types supported: Same as @p input.
+ * @param[in] conv_info Contains padding and stride information described in @ref PadStrideInfo.
+ * @param[in] act_info (Optional) Activation layer information in case of a fused activation.
+ */
+ void configure(const CLCompileContext &compile_context,
+ ICLTensor *input,
+ const ICLTensor *weights,
+ const ICLTensor *biases,
+ ICLTensor *output,
+ const PadStrideInfo &conv_info,
+ const ActivationLayerInfo &act_info = ActivationLayerInfo());
+ /** Static function to check if given info will lead to a valid configuration of @ref CLIndirectConvolutionLayer
+ *
+ * @param[in] input Source tensor. 3 lower dimensions represent a single input,
+ * while every optional dimension from 4 and above represents a batch of inputs.
+ * Data types supported: F16/F32.
+ * @param[in] weights Weights tensor. Weights are a 4D tensor. Data type supported: Same as @p input.
+ * @param[in] biases Biases tensor. Shared biases supported. Biases are a 1D tensor with dimensions [OFM].
+ * Data type supported: Should match @p input data type.
+ * @param[in] output Destination tensor. 3 lower dimensions represent a single output, while the rest represent a batch of outputs.
+ * Data types supported: Same as @p input.
+ * @param[in] conv_info Contains padding and stride information described in @ref PadStrideInfo.
+ * @param[in] act_info (Optional) Activation layer information in case of a fused activation.
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *input,
+ const ITensorInfo *weights,
+ const ITensorInfo *biases,
+ const ITensorInfo *output,
+ const PadStrideInfo &conv_info,
+ const ActivationLayerInfo &act_info = ActivationLayerInfo());
+
+ // Inherited methods overridden:
+ void run() override;
+
+private:
+ struct Impl;
+ std::unique_ptr<Impl> _impl;
+};
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_CLINDIRECTCONVOLUTIONLAYER_H */
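The new header follows the library's standard configure/validate/run lifecycle. A minimal usage sketch against this interface follows; the tensor shapes, the NHWC dimension ordering (channels in dimension 0), the assumed 3x3 kernel and the scheduler boilerplate are illustrative choices, not anything this patch prescribes.

```cpp
#include "arm_compute/core/Error.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
#include "arm_compute/runtime/CL/CLTensor.h"
#include "arm_compute/runtime/CL/functions/CLIndirectConvolutionLayer.h"

#include <initializer_list>

using namespace arm_compute;

int main()
{
    CLScheduler::get().default_init(); // Create the default OpenCL context and queue

    const unsigned int ifm = 16, ofm = 32, width = 64, height = 64; // illustrative sizes

    // NHWC places channels in dimension 0 of the TensorShape.
    TensorInfo src_info(TensorShape(ifm, width, height), 1, DataType::F32);
    TensorInfo wei_info(TensorShape(ifm, 3U, 3U, ofm), 1, DataType::F32); // assumed 3x3 kernel
    TensorInfo bia_info(TensorShape(ofm), 1, DataType::F32);
    TensorInfo dst_info(TensorShape(ofm, width, height), 1, DataType::F32);
    src_info.set_data_layout(DataLayout::NHWC);
    wei_info.set_data_layout(DataLayout::NHWC);
    dst_info.set_data_layout(DataLayout::NHWC);

    CLTensor src, weights, biases, dst;
    src.allocator()->init(src_info);
    weights.allocator()->init(wei_info);
    biases.allocator()->init(bia_info);
    dst.allocator()->init(dst_info);

    const PadStrideInfo conv_info(1, 1, 1, 1); // stride 1, pad 1: spatial size preserved

    // validate() mirrors configure() and reports problems without touching the device.
    Status status = CLIndirectConvolutionLayer::validate(src.info(), weights.info(),
                                                         biases.info(), dst.info(), conv_info);
    ARM_COMPUTE_ERROR_THROW_ON(status);

    CLIndirectConvolutionLayer conv;
    conv.configure(&src, &weights, &biases, &dst, conv_info);

    for(CLTensor *t : { &src, &weights, &biases, &dst })
        t->allocator()->allocate(); // Back each tensor with an OpenCL buffer

    conv.run(); // Enqueue the kernels
    CLScheduler::get().sync();
    return 0;
}
```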
diff --git a/arm_compute/runtime/CL/functions/CLInstanceNormalizationLayer.h b/arm_compute/runtime/CL/functions/CLInstanceNormalizationLayer.h
index 4614b90c70..98d215dd4b 100644
--- a/arm_compute/runtime/CL/functions/CLInstanceNormalizationLayer.h
+++ b/arm_compute/runtime/CL/functions/CLInstanceNormalizationLayer.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019-2020 ARM Limited.
+ * Copyright (c) 2019-2021 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -24,24 +24,57 @@
#ifndef ARM_COMPUTE_CLINSTANCENORMALIZATIONLAYER_H
#define ARM_COMPUTE_CLINSTANCENORMALIZATIONLAYER_H
-#include "arm_compute/runtime/CL/ICLSimpleFunction.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/runtime/CL/CLTensor.h"
+#include "arm_compute/runtime/IFunction.h"
+
+#include <memory>
namespace arm_compute
{
+class CLCompileContext;
class ICLTensor;
+class ITensorInfo;
+class ICLKernel;
+class CLRuntimeContext;
 /** Basic function to perform an instance normalization.
*
* This function runs the following kernels:
* -# @ref CLInstanceNormalizationLayerKernel
*/
-class CLInstanceNormalizationLayer : public ICLSimpleFunction
+class CLInstanceNormalizationLayer : public IFunction
{
public:
- /** Default constructor */
- CLInstanceNormalizationLayer();
+ /** Constructor
+ *
+ * @param[in] ctx Runtime context to be used by the function
+ */
+ CLInstanceNormalizationLayer(CLRuntimeContext *ctx = nullptr);
+
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ CLInstanceNormalizationLayer(const CLInstanceNormalizationLayer &) = delete;
+ /** Default move constructor */
+ CLInstanceNormalizationLayer(CLInstanceNormalizationLayer &&) = default;
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ CLInstanceNormalizationLayer &operator=(const CLInstanceNormalizationLayer &) = delete;
+ /** Default move assignment operator */
+ CLInstanceNormalizationLayer &operator=(CLInstanceNormalizationLayer &&) = default;
+ /** Default destructor */
+ ~CLInstanceNormalizationLayer();
+
/** Set the input and output tensors.
*
+ * Valid data layouts:
+ * - NHWC
+ * - NCHW
+ *
+ * Valid data type configurations:
+ * |src |dst |
+ * |:--------|:---------|
+ * |F16 |F16 |
+ * |F32 |F32 |
+ *
* @param[in, out] input Source tensor. In case of @p output tensor = nullptr this tensor will store the result of the normalization.
* Data types supported: F16/F32. Data layout supported: NHWC, NCHW
* @param[out] output Destination tensor. Data types and data layouts supported: same as @p input.
@@ -50,7 +83,12 @@ public:
* @param[in] epsilon (Optional) Lower bound value for the normalization. Defaults to 1e-12
* @param[in] use_mixed_precision (Optional) Use mixed precision in case of FP16 execution
*/
- void configure(ICLTensor *input, ICLTensor *output, float gamma = 1.0f, float beta = 0.0f, float epsilon = 1e-12f, bool use_mixed_precision = true);
+ void configure(ICLTensor *input,
+ ICLTensor *output,
+ float gamma = 1.0f,
+ float beta = 0.0f,
+ float epsilon = 1e-12f,
+ bool use_mixed_precision = true);
/** Set the input and output tensors.
*
* @param[in] compile_context The compile context to be used.
@@ -62,7 +100,13 @@ public:
* @param[in] epsilon (Optional) Lower bound value for the normalization. Defaults to 1e-12
* @param[in] use_mixed_precision (Optional) Use mixed precision in case of FP16 execution
*/
- void configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, float gamma = 1.0f, float beta = 0.0f, float epsilon = 1e-12f, bool use_mixed_precision = true);
+ void configure(const CLCompileContext &compile_context,
+ ICLTensor *input,
+ ICLTensor *output,
+ float gamma = 1.0f,
+ float beta = 0.0f,
+ float epsilon = 1e-12f,
+ bool use_mixed_precision = true);
/** Static function to check if given info will lead to a valid configuration of @ref CLInstanceNormalizationLayer.
*
@@ -75,7 +119,19 @@ public:
*
* @return a status
*/
- static Status validate(const ITensorInfo *input, const ITensorInfo *output, float gamma = 1.0f, float beta = 0.0f, float epsilon = 1e-12f, bool use_mixed_precision = true);
+ static Status validate(const ITensorInfo *input,
+ const ITensorInfo *output,
+ float gamma = 1.0f,
+ float beta = 0.0f,
+ float epsilon = 1e-12f,
+ bool use_mixed_precision = true);
+ void run() override;
+
+private:
+ std::unique_ptr<ICLKernel> _inst_norm_kernel; /**< Kernel to run */
+ std::unique_ptr<ICLKernel> _mean_var_kernel; /**< Kernel to run */
+ CLTensor _mean_var_tensor;
+ CLRuntimeContext *_ctx; /**< Context to use */
};
} // namespace arm_compute
#endif /* ARM_COMPUTE_CLINSTANCENORMALIZATIONLAYER_H */
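Since the function no longer derives from ICLSimpleFunction, it is configured and run like any other IFunction. A minimal sketch, assuming F32 NCHW tensors of an arbitrary shape and the default (null) runtime context:

```cpp
#include "arm_compute/runtime/CL/CLScheduler.h"
#include "arm_compute/runtime/CL/CLTensor.h"
#include "arm_compute/runtime/CL/functions/CLInstanceNormalizationLayer.h"

using namespace arm_compute;

int main()
{
    CLScheduler::get().default_init();

    // 32x32 spatial, 8 channels, batch of 2 (NCHW); the shape is illustrative.
    const TensorInfo info(TensorShape(32U, 32U, 8U, 2U), 1, DataType::F32);
    CLTensor src, dst;
    src.allocator()->init(info);
    dst.allocator()->init(info);

    CLInstanceNormalizationLayer inst_norm; // nullptr context: use the global scheduler
    // gamma scales, beta shifts, epsilon keeps the variance denominator positive.
    inst_norm.configure(&src, &dst, 1.0f /* gamma */, 0.0f /* beta */, 1e-12f /* epsilon */);

    src.allocator()->allocate();
    dst.allocator()->allocate();

    inst_norm.run();
    CLScheduler::get().sync();
    return 0;
}
```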
diff --git a/arm_compute/runtime/CL/functions/CLIntegralImage.h b/arm_compute/runtime/CL/functions/CLIntegralImage.h
deleted file mode 100644
index 1ea189bf33..0000000000
--- a/arm_compute/runtime/CL/functions/CLIntegralImage.h
+++ /dev/null
@@ -1,67 +0,0 @@
-/*
- * Copyright (c) 2016-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CLINTEGRALIMAGE_H
-#define ARM_COMPUTE_CLINTEGRALIMAGE_H
-
-#include "arm_compute/core/CL/kernels/CLIntegralImageKernel.h"
-#include "arm_compute/runtime/IFunction.h"
-
-namespace arm_compute
-{
-class ICLTensor;
-
-/** Basic function to execute integral image. This function calls the following OpenCL kernels:
- *
- * -# @ref CLIntegralImageHorKernel
- * -# @ref CLIntegralImageVertKernel
- *
- */
-class CLIntegralImage : public IFunction
-{
-public:
- /** Default Constructor. */
- CLIntegralImage();
- /** Initialise the function's source, destinations and border mode.
- *
- * @param[in] input Source tensor. Data types supported: U8.
- * @param[out] output Destination tensor, Data types supported: U32.
- */
- void configure(const ICLTensor *input, ICLTensor *output);
- /** Initialise the function's source, destinations and border mode.
- *
- * @param[in] compile_context The compile context to be used.
- * @param[in] input Source tensor. Data types supported: U8.
- * @param[out] output Destination tensor, Data types supported: U32.
- */
- void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output);
-
- // Inherited methods overridden:
- void run() override;
-
-protected:
- CLIntegralImageHorKernel _integral_hor; /**< Integral Image Horizontal kernel */
- CLIntegralImageVertKernel _integral_vert; /**< Integral Image Vertical kernel */
-};
-}
-#endif /*ARM_COMPUTE_CLINTEGRALIMAGE_H */
diff --git a/arm_compute/runtime/CL/functions/CLL2NormalizeLayer.h b/arm_compute/runtime/CL/functions/CLL2NormalizeLayer.h
index 91c547b2cc..a8b356a708 100644
--- a/arm_compute/runtime/CL/functions/CLL2NormalizeLayer.h
+++ b/arm_compute/runtime/CL/functions/CLL2NormalizeLayer.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2017-2021 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -24,11 +24,10 @@
#ifndef ARM_COMPUTE_CLL2NORMALIZELAYER_H
#define ARM_COMPUTE_CLL2NORMALIZELAYER_H
-#include "arm_compute/core/CL/kernels/CLL2NormalizeLayerKernel.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/runtime/CL/CLTensor.h"
-#include "arm_compute/runtime/CL/ICLSimpleFunction.h"
#include "arm_compute/runtime/CL/functions/CLReductionOperation.h"
+#include "arm_compute/runtime/CL/ICLSimpleFunction.h"
#include "arm_compute/runtime/IMemoryManager.h"
#include "arm_compute/runtime/MemoryGroup.h"
@@ -37,7 +36,10 @@
namespace arm_compute
{
+class CLCompileContext;
+class CLL2NormalizeLayerKernel;
class ICLTensor;
+class ITensorInfo;
/** Basic function to perform a L2 normalization on a given axis.
*
@@ -50,9 +52,29 @@ class CLL2NormalizeLayer : public IFunction
public:
/** Constructor */
CLL2NormalizeLayer(std::shared_ptr<IMemoryManager> memory_manager = nullptr);
+ /** Default Destructor */
+ ~CLL2NormalizeLayer();
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ CLL2NormalizeLayer(const CLL2NormalizeLayer &) = delete;
+ /** Default move constructor */
+ CLL2NormalizeLayer(CLL2NormalizeLayer &&) = default;
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ CLL2NormalizeLayer &operator=(const CLL2NormalizeLayer &) = delete;
+ /** Default move assignment operator */
+ CLL2NormalizeLayer &operator=(CLL2NormalizeLayer &&) = default;
/** Set the input and output tensors.
*
+ * Valid data layouts:
+ * - NHWC
+ * - NCHW
+ *
+ * Valid data type configurations:
+ * |src |dst |
+ * |:--------|:---------|
+ * |F16 |F16 |
+ * |F32 |F32 |
+ *
* @param[in] input Source tensor. Data types supported: F16/F32. Data layouts supported: NCHW/NHWC.
* @param[out] output Destination tensor. Data types and data layouts supported: Same as @p input.
 * @param[in] axis Axis along which to reduce. Negative values wrap around. Maximum supported actual reduction axis: 2
@@ -67,7 +89,8 @@ public:
 * @param[in] axis Axis along which to reduce. Negative values wrap around. Maximum supported actual reduction axis: 2
* @param[in] epsilon (Optional) Lower bound value for the normalization.
*/
- void configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, int axis, float epsilon = 1e-12f);
+ void configure(
+ const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, int axis, float epsilon = 1e-12f);
/** Static function to check if given info will lead to a valid configuration of @ref CLL2NormalizeLayer.
*
@@ -84,10 +107,10 @@ public:
void run() override;
private:
- MemoryGroup _memory_group;
- CLReductionOperation _reduce_func;
- CLL2NormalizeLayerKernel _normalize_kernel;
- CLTensor _sumsq;
+ MemoryGroup _memory_group;
+ CLReductionOperation _reduce_func;
+ std::unique_ptr<CLL2NormalizeLayerKernel> _normalize_kernel;
+ CLTensor _sumsq;
};
-}
+} // namespace arm_compute
#endif /*ARM_COMPUTE_CLL2NORMALIZELAYER_H */
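Usage is unchanged by this refactor; only the kernel member moved behind a unique_ptr. A minimal sketch of normalizing along axis 0, with an illustrative 2D F32 shape:

```cpp
#include "arm_compute/runtime/CL/CLScheduler.h"
#include "arm_compute/runtime/CL/CLTensor.h"
#include "arm_compute/runtime/CL/functions/CLL2NormalizeLayer.h"

using namespace arm_compute;

int main()
{
    CLScheduler::get().default_init();

    const TensorInfo info(TensorShape(128U, 64U), 1, DataType::F32);
    CLTensor src, dst;
    src.allocator()->init(info);
    dst.allocator()->init(info);

    CLL2NormalizeLayer l2_norm;
    // Reduce along axis 0; epsilon bounds the sum of squares away from zero.
    l2_norm.configure(&src, &dst, 0, 1e-12f);

    src.allocator()->allocate();
    dst.allocator()->allocate();

    l2_norm.run();
    CLScheduler::get().sync();
    return 0;
}
```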
diff --git a/arm_compute/runtime/CL/functions/CLLSTMLayer.h b/arm_compute/runtime/CL/functions/CLLSTMLayer.h
index a29513aaae..fe494991af 100644
--- a/arm_compute/runtime/CL/functions/CLLSTMLayer.h
+++ b/arm_compute/runtime/CL/functions/CLLSTMLayer.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018-2020 ARM Limited.
+ * Copyright (c) 2018-2021 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -24,30 +24,35 @@
#ifndef ARM_COMPUTE_CLLSTMLAYER_H
#define ARM_COMPUTE_CLLSTMLAYER_H
-#include "arm_compute/runtime/IFunction.h"
-
-#include "arm_compute/core/CL/kernels/CLActivationLayerKernel.h"
-#include "arm_compute/core/CL/kernels/CLCopyKernel.h"
-#include "arm_compute/core/CL/kernels/CLElementwiseOperationKernel.h"
-#include "arm_compute/core/CL/kernels/CLMemsetKernel.h"
-#include "arm_compute/core/CL/kernels/CLPixelWiseMultiplicationKernel.h"
-#include "arm_compute/core/CL/kernels/CLWidthConcatenate2TensorsKernel.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/runtime/CL/CLTensor.h"
+#include "arm_compute/runtime/CL/functions/CLActivationLayer.h"
#include "arm_compute/runtime/CL/functions/CLConcatenateLayer.h"
+#include "arm_compute/runtime/CL/functions/CLCopy.h"
#include "arm_compute/runtime/CL/functions/CLElementwiseOperations.h"
+#include "arm_compute/runtime/CL/functions/CLFill.h"
#include "arm_compute/runtime/CL/functions/CLFullyConnectedLayer.h"
#include "arm_compute/runtime/CL/functions/CLGEMM.h"
#include "arm_compute/runtime/CL/functions/CLMeanStdDevNormalizationLayer.h"
+#include "arm_compute/runtime/CL/functions/CLPixelWiseMultiplication.h"
+#include "arm_compute/runtime/common/LSTMParams.h"
+#include "arm_compute/runtime/IFunction.h"
#include "arm_compute/runtime/IMemoryManager.h"
#include "arm_compute/runtime/MemoryGroup.h"
-#include "arm_compute/runtime/common/LSTMParams.h"
#include <memory>
namespace arm_compute
{
+class CLCompileContext;
class ICLTensor;
+namespace opencl
+{
+namespace kernels
+{
+class ClTransposeKernel;
+}
+} // namespace opencl
/** This function performs a single time step in a Long Short-Term Memory (LSTM) layer.
*
@@ -57,8 +62,27 @@ class CLLSTMLayer : public IFunction
public:
/** Default constructor */
CLLSTMLayer(std::shared_ptr<IMemoryManager> memory_manager = nullptr);
+ /** Prevent instances of this class from being copied */
+ CLLSTMLayer(const CLLSTMLayer &) = delete;
+ /** Prevent instances of this class from being copied */
+ CLLSTMLayer &operator=(const CLLSTMLayer &) = delete;
+ /** Prevent instances of this class to be moved */
+ CLLSTMLayer(CLLSTMLayer &&) = delete;
+ /** Prevent instances of this class to be moved */
+ CLLSTMLayer &operator=(CLLSTMLayer &&) = delete;
+ /** Default destructor */
+ ~CLLSTMLayer();
/** Initialize function's tensors.
*
+ * Valid data layouts:
+ * - All
+ *
+ * Valid data type configurations:
+ * |src0 - src13 | dst0 - dst3 |
+ * |:------------|:------------|
+ * |F16 |F16 |
+ * |F32 |F32 |
+ *
* @param[in] input Source tensor. Input is a 2D tensor with dimensions [input_size, batch_size]. Data types supported: F16/F32.
* @param[in] input_to_forget_weights 2D weights tensor with dimensions [input_size, num_units]. Data type supported: Same as @p input.
* @param[in] input_to_cell_weights 2D weights tensor with dimensions [input_size, num_units]. Data type supported: Same as @p input.
@@ -95,13 +119,26 @@ public:
* @param[in] projection_threshold (Optional) The clipping threshold for the output from the projection layer, such that values are bound within [-proj_clip, proj_clip].
* If set to 0.0f then clipping is disabled.
*/
- void configure(const ICLTensor *input,
- const ICLTensor *input_to_forget_weights, const ICLTensor *input_to_cell_weights, const ICLTensor *input_to_output_weights,
- const ICLTensor *recurrent_to_forget_weights, const ICLTensor *recurrent_to_cell_weights, const ICLTensor *recurrent_to_output_weights,
- const ICLTensor *forget_gate_bias, const ICLTensor *cell_bias, const ICLTensor *output_gate_bias,
- const ICLTensor *output_state_in, const ICLTensor *cell_state_in,
- ICLTensor *scratch_buffer, ICLTensor *output_state_out, ICLTensor *cell_state_out, ICLTensor *output,
- const LSTMParams<ICLTensor> &lstm_params, const ActivationLayerInfo &activation_info, float cell_threshold = 0.f, float projection_threshold = 0.f);
+ void configure(const ICLTensor *input,
+ const ICLTensor *input_to_forget_weights,
+ const ICLTensor *input_to_cell_weights,
+ const ICLTensor *input_to_output_weights,
+ const ICLTensor *recurrent_to_forget_weights,
+ const ICLTensor *recurrent_to_cell_weights,
+ const ICLTensor *recurrent_to_output_weights,
+ const ICLTensor *forget_gate_bias,
+ const ICLTensor *cell_bias,
+ const ICLTensor *output_gate_bias,
+ const ICLTensor *output_state_in,
+ ICLTensor *cell_state_in,
+ ICLTensor *scratch_buffer,
+ ICLTensor *output_state_out,
+ ICLTensor *cell_state_out,
+ ICLTensor *output,
+ const LSTMParams<ICLTensor> &lstm_params,
+ const ActivationLayerInfo &activation_info,
+ float cell_threshold = 0.f,
+ float projection_threshold = 0.f);
/** Initialize function's tensors.
*
* @param[in] compile_context The compile context to be used.
@@ -141,13 +178,27 @@ public:
* @param[in] projection_threshold (Optional) The clipping threshold for the output from the projection layer, such that values are bound within [-proj_clip, proj_clip].
* If set to 0.0f then clipping is disabled.
*/
- void configure(const CLCompileContext &compile_context, const ICLTensor *input,
- const ICLTensor *input_to_forget_weights, const ICLTensor *input_to_cell_weights, const ICLTensor *input_to_output_weights,
- const ICLTensor *recurrent_to_forget_weights, const ICLTensor *recurrent_to_cell_weights, const ICLTensor *recurrent_to_output_weights,
- const ICLTensor *forget_gate_bias, const ICLTensor *cell_bias, const ICLTensor *output_gate_bias,
- const ICLTensor *output_state_in, const ICLTensor *cell_state_in,
- ICLTensor *scratch_buffer, ICLTensor *output_state_out, ICLTensor *cell_state_out, ICLTensor *output,
- const LSTMParams<ICLTensor> &lstm_params, const ActivationLayerInfo &activation_info, float cell_threshold = 0.f, float projection_threshold = 0.f);
+ void configure(const CLCompileContext &compile_context,
+ const ICLTensor *input,
+ const ICLTensor *input_to_forget_weights,
+ const ICLTensor *input_to_cell_weights,
+ const ICLTensor *input_to_output_weights,
+ const ICLTensor *recurrent_to_forget_weights,
+ const ICLTensor *recurrent_to_cell_weights,
+ const ICLTensor *recurrent_to_output_weights,
+ const ICLTensor *forget_gate_bias,
+ const ICLTensor *cell_bias,
+ const ICLTensor *output_gate_bias,
+ const ICLTensor *output_state_in,
+ ICLTensor *cell_state_in,
+ ICLTensor *scratch_buffer,
+ ICLTensor *output_state_out,
+ ICLTensor *cell_state_out,
+ ICLTensor *output,
+ const LSTMParams<ICLTensor> &lstm_params,
+ const ActivationLayerInfo &activation_info,
+ float cell_threshold = 0.f,
+ float projection_threshold = 0.f);
/** Static function to check if given info will lead to a valid configuration of @ref CLLSTMLayer
*
@@ -189,103 +240,117 @@ public:
*
* @return a status
*/
- static Status validate(const ITensorInfo *input,
- const ITensorInfo *input_to_forget_weights, const ITensorInfo *input_to_cell_weights, const ITensorInfo *input_to_output_weights,
- const ITensorInfo *recurrent_to_forget_weights, const ITensorInfo *recurrent_to_cell_weights, const ITensorInfo *recurrent_to_output_weights,
- const ITensorInfo *forget_gate_bias, const ITensorInfo *cell_bias, const ITensorInfo *output_gate_bias,
- const ITensorInfo *output_state_in, const ITensorInfo *cell_state_in,
- const ITensorInfo *scratch_buffer, const ITensorInfo *output_state_out, const ITensorInfo *cell_state_out, const ITensorInfo *output,
- const LSTMParams<ITensorInfo> &lstm_params, const ActivationLayerInfo &activation_info, float cell_threshold = 0.f, float projection_threshold = 0.f);
+ static Status validate(const ITensorInfo *input,
+ const ITensorInfo *input_to_forget_weights,
+ const ITensorInfo *input_to_cell_weights,
+ const ITensorInfo *input_to_output_weights,
+ const ITensorInfo *recurrent_to_forget_weights,
+ const ITensorInfo *recurrent_to_cell_weights,
+ const ITensorInfo *recurrent_to_output_weights,
+ const ITensorInfo *forget_gate_bias,
+ const ITensorInfo *cell_bias,
+ const ITensorInfo *output_gate_bias,
+ const ITensorInfo *output_state_in,
+ const ITensorInfo *cell_state_in,
+ const ITensorInfo *scratch_buffer,
+ const ITensorInfo *output_state_out,
+ const ITensorInfo *cell_state_out,
+ const ITensorInfo *output,
+ const LSTMParams<ITensorInfo> &lstm_params,
+ const ActivationLayerInfo &activation_info,
+ float cell_threshold = 0.f,
+ float projection_threshold = 0.f);
// Inherited methods overridden:
void run() override;
void prepare() override;
private:
- MemoryGroup _memory_group;
- CLFullyConnectedLayer _fully_connected_input_gate;
- CLArithmeticAddition _accum_input_gate1;
- CLSaturatedArithmeticOperationKernel _subtract_input_gate;
- CLPixelWiseMultiplicationKernel _pixelwise_mul_input_gate;
- CLActivationLayerKernel _activation_input_gate;
- CLFullyConnectedLayer _fully_connected_forget_gate;
- CLArithmeticAddition _accum_forget_gate1;
- CLPixelWiseMultiplicationKernel _pixelwise_mul_forget_gate;
- CLActivationLayerKernel _activation_forget_gate;
- CLFullyConnectedLayer _fully_connected_cell_state;
- CLGEMM _gemm_cell_state1;
- CLTransposeKernel _transpose_cell_state;
- CLSaturatedArithmeticOperationKernel _accum_cell_state1;
- CLSaturatedArithmeticOperationKernel _accum_cell_state2;
- CLPixelWiseMultiplicationKernel _pixelwise_mul_cell_state1;
- CLActivationLayerKernel _activation_cell_state;
- CLActivationLayerKernel _cell_clip;
- CLPixelWiseMultiplicationKernel _pixelwise_mul_cell_state2;
- CLFullyConnectedLayer _fully_connected_output;
- CLPixelWiseMultiplicationKernel _pixelwise_mul_output_state1;
- CLArithmeticAddition _accum_output1;
- CLActivationLayerKernel _activation_output;
- CLActivationLayerKernel _activation_output_state;
- CLPixelWiseMultiplicationKernel _pixelwise_mul_output_state2;
- CLFullyConnectedLayer _fully_connected_output_state;
- CLActivationLayerKernel _projection_clip;
- CLCopyKernel _copy_cell_state;
- CLCopyKernel _copy_output;
- CLConcatenateLayer _concat_scratch_buffer;
- CLWidthConcatenate2TensorsKernel _concat_inputs_forget_gate;
- CLWidthConcatenate2TensorsKernel _concat_weights_forget_gate;
- CLWidthConcatenate2TensorsKernel _concat_weights_input_gate;
- CLWidthConcatenate2TensorsKernel _concat_weights_output;
- CLMemsetKernel _ones_memset_kernel;
- CLMeanStdDevNormalizationLayer _mean_std_norm_input_gate;
- CLPixelWiseMultiplicationKernel _pixelwise_mul_input_gate_coeff;
- CLSaturatedArithmeticOperationKernel _accum_input_gate_bias;
- CLMeanStdDevNormalizationLayer _mean_std_norm_forget_gate;
- CLPixelWiseMultiplicationKernel _pixelwise_mul_forget_gate_coeff;
- CLSaturatedArithmeticOperationKernel _accum_forget_gate_bias;
- CLMeanStdDevNormalizationLayer _mean_std_norm_cell_gate;
- CLPixelWiseMultiplicationKernel _pixelwise_mul_cell_gate_coeff;
- CLSaturatedArithmeticOperationKernel _accum_cell_gate_bias;
- CLMeanStdDevNormalizationLayer _mean_std_norm_output_gate;
- CLPixelWiseMultiplicationKernel _pixelwise_mul_output_gate_coeff;
- CLSaturatedArithmeticOperationKernel _accum_output_gate_bias;
- CLTensor _input_gate_out1;
- CLTensor _input_gate_out2;
- CLTensor _input_gate_out3;
- CLTensor _input_gate_out4;
- CLTensor _forget_gate_out1;
- CLTensor _forget_gate_out2;
- CLTensor _forget_gate_out3;
- CLTensor _forget_gate_out4;
- CLTensor _forget_gate_out5;
- CLTensor _forget_gate_out6;
- CLTensor _cell_state_out1;
- CLTensor _cell_state_out2;
- CLTensor _cell_state_out3;
- CLTensor _cell_state_out4;
- CLTensor _cell_state_out5;
- CLTensor _output1;
- CLTensor _output2;
- CLTensor _output3;
- CLTensor _output4;
- CLTensor _cell_state_activation;
- CLTensor _output_state1;
- CLTensor _ones;
- CLTensor _input_layer_norm_out1;
- CLTensor _input_layer_norm_out2;
- CLTensor _forget_layer_norm_out1;
- CLTensor _forget_layer_norm_out2;
- CLTensor _cell_layer_norm_out1;
- CLTensor _cell_layer_norm_out2;
- CLTensor _output_layer_norm_out1;
- CLTensor _output_layer_norm_out2;
- bool _run_peephole_opt;
- bool _run_cifg_opt;
- bool _perform_cell_clipping;
- bool _has_projection_weights;
- bool _perform_projection_clipping;
- bool _is_prepared;
- bool _is_layer_norm_lstm;
+ MemoryGroup _memory_group;
+ CLFullyConnectedLayer _fully_connected_input_gate;
+ CLArithmeticAddition _accum_input_gate1;
+ CLArithmeticSubtraction _subtract_input_gate;
+ CLPixelWiseMultiplication _pixelwise_mul_input_gate;
+ CLActivationLayer _activation_input_gate;
+ CLFullyConnectedLayer _fully_connected_forget_gate;
+ CLArithmeticAddition _accum_forget_gate1;
+ CLPixelWiseMultiplication _pixelwise_mul_forget_gate;
+ CLActivationLayer _activation_forget_gate;
+ CLFullyConnectedLayer _fully_connected_cell_state;
+ CLGEMM _gemm_cell_state1;
+ std::unique_ptr<opencl::kernels::ClTransposeKernel> _transpose_cell_state;
+ CLArithmeticAddition _accum_cell_state1;
+ CLArithmeticAddition _accum_cell_state2;
+ CLPixelWiseMultiplication _pixelwise_mul_cell_state1;
+ CLActivationLayer _activation_cell_state;
+ CLActivationLayer _cell_clip;
+ CLPixelWiseMultiplication _pixelwise_mul_cell_state2;
+ CLFullyConnectedLayer _fully_connected_output;
+ CLPixelWiseMultiplication _pixelwise_mul_output_state1;
+ CLArithmeticAddition _accum_output1;
+ CLActivationLayer _activation_output;
+ CLActivationLayer _activation_output_state;
+ CLPixelWiseMultiplication _pixelwise_mul_output_state2;
+ CLFullyConnectedLayer _fully_connected_output_state;
+ CLActivationLayer _projection_clip;
+ CLCopy _copy_cell_state;
+ CLCopy _copy_output;
+ CLConcatenateLayer _concat_scratch_buffer;
+ CLConcatenateLayer _concat_inputs_forget_gate;
+ CLConcatenateLayer _concat_weights_forget_gate;
+ CLConcatenateLayer _concat_weights_input_gate;
+ CLConcatenateLayer _concat_weights_output;
+ CLFill _ones_fill;
+ CLMeanStdDevNormalizationLayer _mean_std_norm_input_gate;
+ CLPixelWiseMultiplication _pixelwise_mul_input_gate_coeff;
+ CLArithmeticAddition _accum_input_gate_bias;
+ CLMeanStdDevNormalizationLayer _mean_std_norm_forget_gate;
+ CLPixelWiseMultiplication _pixelwise_mul_forget_gate_coeff;
+ CLArithmeticAddition _accum_forget_gate_bias;
+ CLMeanStdDevNormalizationLayer _mean_std_norm_cell_gate;
+ CLPixelWiseMultiplication _pixelwise_mul_cell_gate_coeff;
+ CLArithmeticAddition _accum_cell_gate_bias;
+ CLMeanStdDevNormalizationLayer _mean_std_norm_output_gate;
+ CLPixelWiseMultiplication _pixelwise_mul_output_gate_coeff;
+ CLArithmeticAddition _accum_output_gate_bias;
+ CLTensor _input_gate_out1;
+ CLTensor _input_gate_out2;
+ CLTensor _input_gate_out3;
+ CLTensor _input_gate_out4;
+ CLTensor _forget_gate_out1;
+ CLTensor _forget_gate_out2;
+ CLTensor _forget_gate_out3;
+ CLTensor _forget_gate_out4;
+ CLTensor _forget_gate_out5;
+ CLTensor _forget_gate_out6;
+ CLTensor _cell_state_out1;
+ CLTensor _cell_state_out2;
+ CLTensor _cell_state_out3;
+ CLTensor _cell_state_out4;
+ CLTensor _cell_state_out5;
+ CLTensor _output1;
+ CLTensor _output2;
+ CLTensor _output3;
+ CLTensor _output4;
+ CLTensor _cell_state_activation;
+ CLTensor _output_state1;
+ CLTensor _ones;
+ CLTensor _input_layer_norm_out1;
+ CLTensor _input_layer_norm_out2;
+ CLTensor _forget_layer_norm_out1;
+ CLTensor _forget_layer_norm_out2;
+ CLTensor _cell_layer_norm_out1;
+ CLTensor _cell_layer_norm_out2;
+ CLTensor _output_layer_norm_out1;
+ CLTensor _output_layer_norm_out2;
+ bool _run_peephole_opt;
+ bool _run_cifg_opt;
+ bool _perform_cell_clipping;
+ bool _has_projection_weights;
+ bool _perform_projection_clipping;
+ bool _is_prepared;
+ bool _is_layer_norm_lstm;
+ const ICLTensor *_recurrent_to_cell_weights{nullptr};
};
} // namespace arm_compute
#endif /* ARM_COMPUTE_CLLSTMLAYER_H */
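The reflowed configure() signature is easier to read but still takes sixteen tensors plus the LSTM parameters. Below is a sketch of driving one time step through it, assuming an F32 cell with none of the optional features enabled (so a default-constructed LSTMParams and, on the assumption that the scratch buffer concatenates the four gates when CIFG is off, a scratch width of 4 * num_units); all shapes and the TANH activation are illustrative.

```cpp
#include "arm_compute/core/Types.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
#include "arm_compute/runtime/CL/CLTensor.h"
#include "arm_compute/runtime/CL/functions/CLLSTMLayer.h"
#include "arm_compute/runtime/common/LSTMParams.h"

#include <initializer_list>

using namespace arm_compute;

int main()
{
    CLScheduler::get().default_init();

    const unsigned int input_size = 32, num_units = 64, batch_size = 1;

    const TensorInfo in_w(TensorShape(input_size, num_units), 1, DataType::F32);  // input weights
    const TensorInfo rec_w(TensorShape(num_units, num_units), 1, DataType::F32);  // recurrent weights
    const TensorInfo bias(TensorShape(num_units), 1, DataType::F32);
    const TensorInfo state(TensorShape(num_units, batch_size), 1, DataType::F32);

    CLTensor input, i2f, i2c, i2o, r2f, r2c, r2o, b_f, b_c, b_o;
    CLTensor out_state_in, cell_state_in, scratch, out_state_out, cell_state_out, output;

    input.allocator()->init(TensorInfo(TensorShape(input_size, batch_size), 1, DataType::F32));
    for(CLTensor *t : { &i2f, &i2c, &i2o }) t->allocator()->init(in_w);
    for(CLTensor *t : { &r2f, &r2c, &r2o }) t->allocator()->init(rec_w);
    for(CLTensor *t : { &b_f, &b_c, &b_o }) t->allocator()->init(bias);
    for(CLTensor *t : { &out_state_in, &cell_state_in, &out_state_out, &cell_state_out, &output })
        t->allocator()->init(state);
    // Assumed: four gates concatenated into the scratch buffer when CIFG is disabled.
    scratch.allocator()->init(TensorInfo(TensorShape(4 * num_units, batch_size), 1, DataType::F32));

    CLLSTMLayer lstm;
    lstm.configure(&input, &i2f, &i2c, &i2o, &r2f, &r2c, &r2o, &b_f, &b_c, &b_o,
                   &out_state_in, &cell_state_in, &scratch,
                   &out_state_out, &cell_state_out, &output,
                   LSTMParams<ICLTensor>(), // no CIFG/peephole/projection/layer norm
                   ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::TANH));

    for(CLTensor *t : { &input, &i2f, &i2c, &i2o, &r2f, &r2c, &r2o, &b_f, &b_c, &b_o,
                        &out_state_in, &cell_state_in, &scratch,
                        &out_state_out, &cell_state_out, &output })
        t->allocator()->allocate();

    lstm.run(); // One time step; feed the output states back in to process a sequence.
    CLScheduler::get().sync();
    return 0;
}
```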
diff --git a/arm_compute/runtime/CL/functions/CLLSTMLayerQuantized.h b/arm_compute/runtime/CL/functions/CLLSTMLayerQuantized.h
index 082fdb4499..8c116b1482 100644
--- a/arm_compute/runtime/CL/functions/CLLSTMLayerQuantized.h
+++ b/arm_compute/runtime/CL/functions/CLLSTMLayerQuantized.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019-2020 ARM Limited.
+ * Copyright (c) 2019-2021 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -35,7 +35,6 @@
#include "arm_compute/runtime/CL/functions/CLQuantizationLayer.h"
#include "arm_compute/runtime/CL/functions/CLSlice.h"
#include "arm_compute/runtime/CL/functions/CLTranspose.h"
-
#include "arm_compute/runtime/common/LSTMParams.h"
namespace arm_compute
@@ -47,16 +46,16 @@ class ICLTensor;
*
* This function calls the following CL functions/kernels:
*
- * -# @ref CLGEMMLowpMatrixMultiplyCore Quantized matrix multiplication core. Accumulators are 32-bit integers
- * -# @ref CLGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPoint Convert 32-bit integers into QSYMM16
- * -# @ref CLTranspose Matrix transpose
- * -# @ref CLConcatenateLayer Tensor concatenation
- * -# @ref CLActivationLayer Activation functions (tanh and logistic)
- * -# @ref CLArithmeticAddition Elementwise addition
- * -# @ref CLPixelWiseMultiplication Elementwise multiplication
- * -# @ref CLSlice Tensor slicing
- * -# @ref CLDequantizationLayer Dequantize into float
- * -# @ref CLQuantizationLayer Quantize from float
+ * -# @ref CLGEMMLowpMatrixMultiplyCore Quantized matrix multiplication core. Accumulators are 32-bit integers
+ * -# @ref CLGEMMLowpOutputStage Convert 32-bit integers into QSYMM16
+ * -# @ref CLTranspose Matrix transpose
+ * -# @ref CLConcatenateLayer Tensor concatenation
+ * -# @ref CLActivationLayer Activation functions (tanh and logistic)
+ * -# @ref CLArithmeticAddition Elementwise addition
+ * -# @ref CLPixelWiseMultiplication Elementwise multiplication
+ * -# @ref CLSlice Tensor slicing
+ * -# @ref CLDequantizationLayer Dequantize into float
+ * -# @ref CLQuantizationLayer Quantize from float
* */
class CLLSTMLayerQuantized : public IFunction
{
@@ -73,6 +72,14 @@ public:
CLLSTMLayerQuantized &operator=(CLLSTMLayerQuantized &&) = default;
/** Initialize function's tensors.
*
+ * Valid data layouts:
+ * - All
+ *
+ * Valid data type configurations:
+ * |src0 - src8 |src9 - src12 |src13 |src14 |dst0 |dst1 |
+ * |:-----------|:------------|:-------|:------|:------|:------|
+ * |QASYMM8 |S32 |QSYMM16 |QASYMM8|QSYMM16|QASYMM8|
+ *
* @param[in] input Source tensor. Input is a 2D tensor with dimensions [input_size, batch_size]. Data types supported: QASYMM8.
* @param[in] input_to_input_weights 2D weights tensor with dimensions [input_size, output_size]. Data type supported: Same as @p input.
* @param[in] input_to_forget_weights 2D weights tensor with dimensions [input_size, output_size]. Data type supported: Same as @p input.
@@ -92,11 +99,22 @@ public:
 * @param[out] output_state_out Destination tensor. Output is a 2D tensor with dimensions [output_size, batch_size]. Data types supported: Same as @p input.
*/
void configure(const ICLTensor *input,
- const ICLTensor *input_to_input_weights, const ICLTensor *input_to_forget_weights, const ICLTensor *input_to_cell_weights, const ICLTensor *input_to_output_weights,
- const ICLTensor *recurrent_to_input_weights, const ICLTensor *recurrent_to_forget_weights, const ICLTensor *recurrent_to_cell_weights, const ICLTensor *recurrent_to_output_weights,
- const ICLTensor *input_gate_bias, const ICLTensor *forget_gate_bias, const ICLTensor *cell_bias, const ICLTensor *output_gate_bias,
- ICLTensor *cell_state_in, const ICLTensor *output_state_in,
- ICLTensor *cell_state_out, ICLTensor *output_state_out);
+ const ICLTensor *input_to_input_weights,
+ const ICLTensor *input_to_forget_weights,
+ const ICLTensor *input_to_cell_weights,
+ const ICLTensor *input_to_output_weights,
+ const ICLTensor *recurrent_to_input_weights,
+ const ICLTensor *recurrent_to_forget_weights,
+ const ICLTensor *recurrent_to_cell_weights,
+ const ICLTensor *recurrent_to_output_weights,
+ const ICLTensor *input_gate_bias,
+ const ICLTensor *forget_gate_bias,
+ const ICLTensor *cell_bias,
+ const ICLTensor *output_gate_bias,
+ ICLTensor *cell_state_in,
+ const ICLTensor *output_state_in,
+ ICLTensor *cell_state_out,
+ ICLTensor *output_state_out);
/** Initialize function's tensors.
*
* @param[in] compile_context The compile context to be used.
@@ -118,12 +136,24 @@ public:
* @param[out] cell_state_out Destination tensor. Output is a 2D tensor with dimensions [output_size, batch_size]. Data type supported: QSYMM16.
 * @param[out] output_state_out Destination tensor. Output is a 2D tensor with dimensions [output_size, batch_size]. Data types supported: Same as @p input.
*/
- void configure(const CLCompileContext &compile_context, const ICLTensor *input,
- const ICLTensor *input_to_input_weights, const ICLTensor *input_to_forget_weights, const ICLTensor *input_to_cell_weights, const ICLTensor *input_to_output_weights,
- const ICLTensor *recurrent_to_input_weights, const ICLTensor *recurrent_to_forget_weights, const ICLTensor *recurrent_to_cell_weights, const ICLTensor *recurrent_to_output_weights,
- const ICLTensor *input_gate_bias, const ICLTensor *forget_gate_bias, const ICLTensor *cell_bias, const ICLTensor *output_gate_bias,
- ICLTensor *cell_state_in, const ICLTensor *output_state_in,
- ICLTensor *cell_state_out, ICLTensor *output_state_out);
+ void configure(const CLCompileContext &compile_context,
+ const ICLTensor *input,
+ const ICLTensor *input_to_input_weights,
+ const ICLTensor *input_to_forget_weights,
+ const ICLTensor *input_to_cell_weights,
+ const ICLTensor *input_to_output_weights,
+ const ICLTensor *recurrent_to_input_weights,
+ const ICLTensor *recurrent_to_forget_weights,
+ const ICLTensor *recurrent_to_cell_weights,
+ const ICLTensor *recurrent_to_output_weights,
+ const ICLTensor *input_gate_bias,
+ const ICLTensor *forget_gate_bias,
+ const ICLTensor *cell_bias,
+ const ICLTensor *output_gate_bias,
+ ICLTensor *cell_state_in,
+ const ICLTensor *output_state_in,
+ ICLTensor *cell_state_out,
+ ICLTensor *output_state_out);
/** Static function to check if given info will lead to a valid configuration of @ref CLLSTMLayerQuantized
*
@@ -148,11 +178,22 @@ public:
* @return a status
*/
static Status validate(const ITensorInfo *input,
- const ITensorInfo *input_to_input_weights, const ITensorInfo *input_to_forget_weights, const ITensorInfo *input_to_cell_weights, const ITensorInfo *input_to_output_weights,
- const ITensorInfo *recurrent_to_input_weights, const ITensorInfo *recurrent_to_forget_weights, const ITensorInfo *recurrent_to_cell_weights, const ITensorInfo *recurrent_to_output_weights,
- const ITensorInfo *input_gate_bias, const ITensorInfo *forget_gate_bias, const ITensorInfo *cell_bias, const ITensorInfo *output_gate_bias,
- const ITensorInfo *cell_state_in, const ITensorInfo *output_state_in,
- const ITensorInfo *cell_state_out, const ITensorInfo *output_state_out);
+ const ITensorInfo *input_to_input_weights,
+ const ITensorInfo *input_to_forget_weights,
+ const ITensorInfo *input_to_cell_weights,
+ const ITensorInfo *input_to_output_weights,
+ const ITensorInfo *recurrent_to_input_weights,
+ const ITensorInfo *recurrent_to_forget_weights,
+ const ITensorInfo *recurrent_to_cell_weights,
+ const ITensorInfo *recurrent_to_output_weights,
+ const ITensorInfo *input_gate_bias,
+ const ITensorInfo *forget_gate_bias,
+ const ITensorInfo *cell_bias,
+ const ITensorInfo *output_gate_bias,
+ const ITensorInfo *cell_state_in,
+ const ITensorInfo *output_state_in,
+ const ITensorInfo *cell_state_out,
+ const ITensorInfo *output_state_out);
// Inherited methods overridden:
void run() override;
@@ -162,30 +203,30 @@ private:
MemoryGroup _memory_group;
// Functions used
- CLGEMMLowpMatrixMultiplyCore _gemmlowp;
- CLGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPoint _output_stage;
- CLTranspose _transpose_weights;
- CLConcatenateLayer _concat_input_weights;
- CLConcatenateLayer _concat_recurrent_weights;
- CLConcatenateLayer _concat_weights;
- CLConcatenateLayer _concat_inputs;
- CLConcatenateLayer _concat_bias;
- CLActivationLayer _sigmoid_forget_gate;
- CLActivationLayer _sigmoid_input_gate;
- CLActivationLayer _sigmoid_output_gate;
- CLActivationLayer _tanh_modulation_gate;
- CLActivationLayer _tanh_output_state;
- CLArithmeticAddition _add_cell_state_tmps;
- CLArithmeticAddition _add2;
- CLPixelWiseMultiplication _mul_forget_gate_cell_state;
- CLPixelWiseMultiplication _mul_input_gate_input_mod_gate;
- CLPixelWiseMultiplication _mul_output_state_tmp_output_gate;
- CLSlice _slice_input_tensor;
- CLSlice _slice_forget_tensor;
- CLSlice _slice_cell_tensor;
- CLSlice _slice_output_tensor;
- CLDequantizationLayer _dequantize;
- CLQuantizationLayer _quantize;
+ CLGEMMLowpMatrixMultiplyCore _gemmlowp;
+ CLGEMMLowpOutputStage _output_stage;
+ CLTranspose _transpose_weights;
+ CLConcatenateLayer _concat_input_weights;
+ CLConcatenateLayer _concat_recurrent_weights;
+ CLConcatenateLayer _concat_weights;
+ CLConcatenateLayer _concat_inputs;
+ CLConcatenateLayer _concat_bias;
+ CLActivationLayer _sigmoid_forget_gate;
+ CLActivationLayer _sigmoid_input_gate;
+ CLActivationLayer _sigmoid_output_gate;
+ CLActivationLayer _tanh_modulation_gate;
+ CLActivationLayer _tanh_output_state;
+ CLArithmeticAddition _add_cell_state_tmps;
+ CLArithmeticAddition _add2;
+ CLPixelWiseMultiplication _mul_forget_gate_cell_state;
+ CLPixelWiseMultiplication _mul_input_gate_input_mod_gate;
+ CLPixelWiseMultiplication _mul_output_state_tmp_output_gate;
+ CLSlice _slice_input_tensor;
+ CLSlice _slice_forget_tensor;
+ CLSlice _slice_cell_tensor;
+ CLSlice _slice_output_tensor;
+ CLDequantizationLayer _dequantize;
+ CLQuantizationLayer _quantize;
// Tensor pointers
const ICLTensor *_input_to_input_weights;
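The data type table added above pins down the quantization scheme: QASYMM8 activations and weights, S32 biases, QSYMM16 cell state. A sketch of building matching TensorInfo objects follows; the scale and zero-point values are purely illustrative and would normally come from the quantized model.

```cpp
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Types.h"

using namespace arm_compute;

int main()
{
    // Illustrative scales/offsets only; real values come from model quantization.
    const QuantizationInfo q_input(1.f / 128.f, 128); // QASYMM8: scale, zero point
    const QuantizationInfo q_cell(32.f / 65536.f, 0); // QSYMM16 cell state, symmetric

    // src0 - src8: QASYMM8 activations and weights.
    TensorInfo input_info(TensorShape(32U, 1U), 1, DataType::QASYMM8, q_input);
    // src9 - src12: 32-bit integer biases.
    TensorInfo bias_info(TensorShape(64U), 1, DataType::S32);
    // src13 / dst0: QSYMM16 cell state in/out.
    TensorInfo cell_state_info(TensorShape(64U, 1U), 1, DataType::QSYMM16, q_cell);

    (void)input_info; (void)bias_info; (void)cell_state_info;
    return 0;
}
```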
diff --git a/arm_compute/runtime/CL/functions/CLLaplacianPyramid.h b/arm_compute/runtime/CL/functions/CLLaplacianPyramid.h
deleted file mode 100644
index 49a87baaf2..0000000000
--- a/arm_compute/runtime/CL/functions/CLLaplacianPyramid.h
+++ /dev/null
@@ -1,98 +0,0 @@
-/*
- * Copyright (c) 2017-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CLLAPLACIANPYRAMID_H
-#define ARM_COMPUTE_CLLAPLACIANPYRAMID_H
-
-#include "arm_compute/core/Types.h"
-#include "arm_compute/runtime/CL/CLPyramid.h"
-#include "arm_compute/runtime/CL/functions/CLDepthConvertLayer.h"
-#include "arm_compute/runtime/CL/functions/CLElementwiseOperations.h"
-#include "arm_compute/runtime/CL/functions/CLGaussian5x5.h"
-#include "arm_compute/runtime/CL/functions/CLGaussianPyramid.h"
-#include "arm_compute/runtime/IFunction.h"
-
-#include <cstddef>
-#include <cstdint>
-#include <memory>
-
-namespace arm_compute
-{
-class ICLTensor;
-
-/** Basic function to execute laplacian pyramid. This function calls the following OpenCL kernels and functions:
- *
- * -# @ref CLGaussianPyramidHalf
- * -# @ref CLGaussian5x5
- * -# @ref CLArithmeticSubtraction
- *
- * First a Gaussian pyramid is created. Then, for each level i, the corresponding tensor I(i) is blurred with the Gaussian 5x5 filter, and the
- * difference between the two tensors is the corresponding level L(i) of the Laplacian pyramid:
- * L(i) = I(i) - Gaussian5x5(I(i))
- * Level 0 always has the same first two dimensions as the input tensor.
-*/
-class CLLaplacianPyramid : public IFunction
-{
-public:
- /** Constructor */
- CLLaplacianPyramid();
- /** Initialise the function's source, destinations and border mode.
- *
- * @param[in] input Source tensor. Data types supported: U8.
- * @param[out] pyramid Destination pyramid tensors, Data types supported at each level: S16.
- * @param[out] output The lowest resolution tensor necessary to reconstruct the input tensor from the pyramid. Data types supported: S16.
- * The first two dimensions of this tensor must match the first two dimensions of the tensor in the last level of the pyramid, that is:
- * output.width = input.width / pow(2, pyramid_levels - 1) and output.height = input.height / pow(2, pyramid_levels - 1)
- * @param[in] border_mode Border mode to use.
- * @param[in] constant_border_value (Optional) Constant value to use for borders if border_mode is set to CONSTANT.
- *
- */
- void configure(ICLTensor *input, CLPyramid *pyramid, ICLTensor *output, BorderMode border_mode, uint8_t constant_border_value);
- /** Initialise the function's source, destinations and border mode.
- *
- * @param[in] compile_context The compile context to be used.
- * @param[in] input Source tensor. Data types supported: U8.
- * @param[out] pyramid Destination pyramid tensors, Data types supported at each level: S16.
- * @param[out] output The lowest resolution tensor necessary to reconstruct the input tensor from the pyramid. Data types supported: S16.
- * The first two dimensions of this tensor must match the first two dimensions of the tensor in the last level of the pyramid, that is:
- * output.width = input.width / pow(2, pyramid_levels - 1) and output.height = input.height / pow(2, pyramid_levels - 1)
- * @param[in] border_mode Border mode to use.
- * @param[in] constant_border_value (Optional) Constant value to use for borders if border_mode is set to CONSTANT.
- *
- */
- void configure(const CLCompileContext &compile_context, ICLTensor *input, CLPyramid *pyramid, ICLTensor *output, BorderMode border_mode, uint8_t constant_border_value);
-
- // Inherited methods overridden:
- void run() override;
-
-private:
- size_t _num_levels;
- CLGaussianPyramidHalf _gaussian_pyr_function;
- std::vector<CLGaussian5x5> _convf;
- std::vector<CLArithmeticSubtraction> _subf;
- CLDepthConvertLayer _depth_function;
- CLPyramid _gauss_pyr;
- CLPyramid _conv_pyr;
-};
-}
-#endif /*ARM_COMPUTE_CLLAPLACIANPYRAMID_H */
diff --git a/arm_compute/runtime/CL/functions/CLLaplacianReconstruct.h b/arm_compute/runtime/CL/functions/CLLaplacianReconstruct.h
deleted file mode 100644
index 2c7afde7de..0000000000
--- a/arm_compute/runtime/CL/functions/CLLaplacianReconstruct.h
+++ /dev/null
@@ -1,107 +0,0 @@
-/*
- * Copyright (c) 2017-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CLLAPLACIANRECONSTRUCT_H
-#define ARM_COMPUTE_CLLAPLACIANRECONSTRUCT_H
-
-#include "arm_compute/core/Types.h"
-#include "arm_compute/runtime/CL/CLPyramid.h"
-#include "arm_compute/runtime/CL/functions/CLDepthConvertLayer.h"
-#include "arm_compute/runtime/CL/functions/CLElementwiseOperations.h"
-#include "arm_compute/runtime/CL/functions/CLScale.h"
-#include "arm_compute/runtime/IFunction.h"
-
-#include <cstdint>
-#include <memory>
-
-namespace arm_compute
-{
-class ICLTensor;
-using ICLImage = ICLTensor;
-
-/** Basic function to execute laplacian reconstruction. This function calls the following OpenCL kernels and functions:
- *
- * -# @ref CLArithmeticAddition
- * -# @ref CLScale
- * -# @ref CLDepthConvertLayer
- *
- * This function reconstructs the original image from a Laplacian Image Pyramid.
- *
- * The input image is added to the last level of the Laplacian pyramid L(n-2), the resulting image is upsampled to the
- * resolution of the next pyramid level.
- *
- * I(n-2) = upsample(input + L(n-1))
- *
- * For each pyramid level i, except i=0 and i=n-1:
- * I(i-1) = upsample(I(i) + L(i))
- *
- * output = I(0) + L(0)
-*/
-class CLLaplacianReconstruct : public IFunction
-{
-public:
- /** Constructor */
- CLLaplacianReconstruct();
- /** Initialise the function's source, destinations and border mode.
- *
- * The Output image must have the same size as the first level of the pyramid.
- * The Input image must have the same size as the last level of the pyramid.
- *
- * The idea is to reconstruct the original hi-res image from a low-res representation of it and the Laplacian pyramid.
- *
- * @param[in] pyramid Laplacian pyramid tensors, Data types supported at each level: S16.
- * @param[in] input Source tensor. Data types supported: S16.
- * @param[out] output Output tensor. Data types supported: U8.
- * @param[in] border_mode Border mode to use for the convolution.
- * @param[in] constant_border_value (Optional) Constant value to use for borders if border_mode is set to CONSTANT.
- *
- */
- void configure(const CLPyramid *pyramid, ICLTensor *input, ICLTensor *output, BorderMode border_mode, uint8_t constant_border_value);
- /** Initialise the function's source, destinations and border mode.
- *
- * The Output image must have the same size as the first level of the pyramid.
- * The Input image must have the same size as the last level of the pyramid.
- *
- * The idea is to reconstruct the original hi-res image from a low-res representation of it and the Laplacian pyramid.
- *
- * @param[in] compile_context The compile context to be used.
- * @param[in] pyramid Laplacian pyramid tensors, Data types supported at each level: S16.
- * @param[in] input Source tensor. Data types supported: S16.
- * @param[out] output Output tensor. Data types supported: U8.
- * @param[in] border_mode Border mode to use for the convolution.
- * @param[in] constant_border_value (Optional) Constant value to use for borders if border_mode is set to CONSTANT.
- *
- */
- void configure(const CLCompileContext &compile_context, const CLPyramid *pyramid, ICLTensor *input, ICLTensor *output, BorderMode border_mode, uint8_t constant_border_value);
-
- // Inherited methods overridden:
- void run() override;
-
-private:
- CLPyramid _tmp_pyr;
- std::vector<CLArithmeticAddition> _addf;
- std::vector<CLScale> _scalef;
- CLDepthConvertLayer _depthf;
-};
-}
-#endif /*ARM_COMPUTE_CLLAPLACIANRECONSTRUCT_H */
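Together with the CLLaplacianPyramid removal above, this deletes both halves of an analysis/synthesis pair. For reference only, the round trip the two deleted comments describe, restated in one place (same level notation; n is the number of pyramid levels):

```latex
\begin{align*}
\text{analysis:}  \quad & L(i) = I(i) - \mathrm{Gaussian5x5}(I(i)) \\
\text{synthesis:} \quad & I(n-2) = \mathrm{upsample}(\mathrm{input} + L(n-1)) \\
                        & I(i-1) = \mathrm{upsample}(I(i) + L(i)) \\
\text{output:}    \quad & \mathrm{output} = I(0) + L(0)
\end{align*}
```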
diff --git a/arm_compute/runtime/CL/functions/CLLocallyConnectedLayer.h b/arm_compute/runtime/CL/functions/CLLocallyConnectedLayer.h
deleted file mode 100644
index 7a43eab478..0000000000
--- a/arm_compute/runtime/CL/functions/CLLocallyConnectedLayer.h
+++ /dev/null
@@ -1,121 +0,0 @@
-/*
- * Copyright (c) 2017-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CLLOCALLYCONNECTEDLAYER_H
-#define ARM_COMPUTE_CLLOCALLYCONNECTEDLAYER_H
-
-#include "arm_compute/runtime/IFunction.h"
-
-#include "arm_compute/core/CL/kernels/CLCol2ImKernel.h"
-#include "arm_compute/core/CL/kernels/CLIm2ColKernel.h"
-#include "arm_compute/core/CL/kernels/CLLocallyConnectedMatrixMultiplyKernel.h"
-#include "arm_compute/core/CL/kernels/CLWeightsReshapeKernel.h"
-#include "arm_compute/core/Types.h"
-#include "arm_compute/runtime/CL/CLTensor.h"
-#include "arm_compute/runtime/IMemoryManager.h"
-#include "arm_compute/runtime/MemoryGroup.h"
-
-#include <memory>
-
-namespace arm_compute
-{
-class ICLTensor;
-
-/** Basic function to compute the locally connected layer. This function calls the following OpenCL kernels:
- *
- * -# @ref CLWeightsReshapeKernel (executed only once for each configuration)
- * -# @ref CLIm2ColKernel
- * -# @ref CLLocallyConnectedMatrixMultiplyKernel
- * -# @ref CLCol2ImKernel
- */
-class CLLocallyConnectedLayer : public IFunction
-{
-public:
- /** Default constructor */
- CLLocallyConnectedLayer(std::shared_ptr<IMemoryManager> memory_manager = nullptr);
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- CLLocallyConnectedLayer(const CLLocallyConnectedLayer &) = delete;
- /** Default move constructor */
- CLLocallyConnectedLayer(CLLocallyConnectedLayer &&) = default;
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- CLLocallyConnectedLayer &operator=(const CLLocallyConnectedLayer &) = delete;
- /** Default move assignment operator */
- CLLocallyConnectedLayer &operator=(CLLocallyConnectedLayer &&) = default;
- /** Set the input and output tensors.
- *
- * @param[in] input Source tensor. 3 lower dimensions represent a single input [width, height, IFM],
- * while every optional dimension from 4 and above represent a batch of inputs.
- * Data types supported: F32.
- * @param[in] weights Weights tensor. Weights are 5D tensor with dimensions [kernel_x, kernel_y, IFM, OFM, num_patches]. Data type supported: Same as @p input.
- * @param[in] biases Biases tensor. Shared biases supported. Biases are 2D tensor with dimensions [OFM, num_patches]. Data type supported: Same as @p input.
- * @param[out] output Destination tensor. 3 lower dimensions represent a single output [width, height, OFM], while the rest represent batch of outputs.
- * Data types supported: Same as @p input.
- * @param[in] conv_info Contains padding and stride information described in @ref PadStrideInfo.
- */
- void configure(const ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const PadStrideInfo &conv_info);
- /** Set the input and output tensors.
- *
- * @param[in] compile_context The compile context to be used.
- * @param[in] input Source tensor. 3 lower dimensions represent a single input [width, height, IFM],
- * while every optional dimension from 4 and above represent a batch of inputs.
- * Data types supported: F32.
- * @param[in] weights Weights tensor. Weights are 5D tensor with dimensions [kernel_x, kernel_y, IFM, OFM, num_patches]. Data type supported: Same as @p input.
- * @param[in] biases Biases tensor. Shared biases supported. Biases are 2D tensor with dimensions [OFM, num_patches]. Data type supported: Same as @p input.
- * @param[out] output Destination tensor. 3 lower dimensions represent a single output [width, height, OFM], while the rest represent batch of outputs.
- * Data types supported: Same as @p input.
- * @param[in] conv_info Contains padding and stride information described in @ref PadStrideInfo.
- */
- void configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const PadStrideInfo &conv_info);
- /** Static function to check if given info will lead to a valid configuration of @ref CLLocallyConnectedLayer
- *
- * @param[in] input Input tensor info. 3 lower dimensions represent a single input [width, height, IFM],
- * while every optional dimension from 4 and above represent a batch of inputs.
- * Data types supported: F32.
- * @param[in] weights Weights tensor info. Weights are 5D tensor with dimensions [kernel_x, kernel_y, IFM, OFM, num_patches]. Data type supported:Same as @p input.
- * @param[in] biases Biases tensor info. Shared biases supported. Biases are 2D tensor with dimensions [OFM, num_patches]. Data type supported:Same as @p input.
- * @param[in] output Output tensor info. 3 lower dimensions represent a single output [width, height, OFM], while the rest represent batch of outputs.
- * Data types supported: Same as @p input.
- * @param[in] conv_info Contains padding and stride information described in @ref PadStrideInfo.
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info);
-
- // Inherited methods overridden:
- void run() override;
- void prepare() override;
-
-private:
- MemoryGroup _memory_group;
- CLIm2ColKernel _input_im2col_kernel;
- CLWeightsReshapeKernel _weights_reshape_kernel;
- CLLocallyConnectedMatrixMultiplyKernel _mm_kernel;
- CLCol2ImKernel _output_col2im_kernel;
- CLTensor _input_im2col_reshaped;
- CLTensor _weights_reshaped;
- CLTensor _gemm_output;
- bool _is_prepared;
- const ICLTensor *_original_weights;
-};
-}
-#endif /* ARM_COMPUTE_CLLOCALLYCONNECTEDLAYER_H */
diff --git a/arm_compute/runtime/CL/functions/CLLogicalAnd.h b/arm_compute/runtime/CL/functions/CLLogicalAnd.h
new file mode 100644
index 0000000000..4ff488782a
--- /dev/null
+++ b/arm_compute/runtime/CL/functions/CLLogicalAnd.h
@@ -0,0 +1,130 @@
+/*
+ * Copyright (c) 2020-2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CLLOGICALAND_H
+#define ARM_COMPUTE_CLLOGICALAND_H
+
+#include "arm_compute/core/Error.h"
+#include "arm_compute/runtime/CL/ICLOperator.h"
+#include "arm_compute/runtime/IFunction.h"
+
+namespace arm_compute
+{
+class CLCompileContext;
+class ICLTensor;
+class ITensorInfo;
+
+/** Basic function to run @ref arm_compute::opencl::kernels::ClLogicalBinaryKernel.
+ *
+ * @note The tensor data type for the inputs must be U8.
+ * @note The function performs a logical AND operation using the two input tensors.
+ */
+class CLLogicalAnd : public IFunction
+{
+public:
+ /** Default Constructor */
+ CLLogicalAnd();
+ /** Default Destructor */
+ ~CLLogicalAnd();
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ CLLogicalAnd(const CLLogicalAnd &) = delete;
+ /** Default move constructor */
+ CLLogicalAnd(CLLogicalAnd &&);
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ CLLogicalAnd &operator=(const CLLogicalAnd &) = delete;
+ /** Default move assignment operator */
+ CLLogicalAnd &operator=(CLLogicalAnd &&);
+ /** Initialize the function
+ *
+ * Valid data layouts:
+ * - All
+ *
+ * Valid data type configurations:
+ * |src0 |src1 |dst |
+ * |:--------------|:-------------|:------------|
+ * |U8 |U8 |U8 |
+ *
+ * @param[in] input1 Input tensor. Data types supported: U8.
+ * @param[in] input2 Input tensor. Data types supported: same as @p input1.
+ * @param[out] output Output tensor. Data types supported: same as @p input1.
+ */
+ void configure(ICLTensor *input1, ICLTensor *input2, ICLTensor *output);
+ /** Initialize the function
+ *
+ * @param[in] compile_context The compile context to be used.
+ * @param[in] input1 Input tensor. Data types supported: U8.
+ * @param[in] input2 Input tensor. Data types supported: same as @p input1.
+ * @param[out] output Output tensor. Data types supported: same as @p input1.
+ */
+ void configure(const CLCompileContext &compile_context, ICLTensor *input1, ICLTensor *input2, ICLTensor *output);
+ /** Static function to check if given info will lead to a valid configuration
+ *
+ * @param[in] input1 First tensor input info. Data types supported: U8.
+ * @param[in] input2 Second tensor input info. Data types supported: same as @p input1.
+ * @param[in] output Output tensor info. Data types supported: same as @p input1.
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output);
+
+ // Inherited methods overridden:
+ void run() override;
+
+private:
+ struct Impl;
+ std::unique_ptr<Impl> _impl;
+};
+
+namespace experimental
+{
+class CLLogicalAnd : public ICLOperator
+{
+public:
+ /** Default Constructor */
+ CLLogicalAnd() = default;
+ /** Initialise the kernel's inputs and output.
+ *
+ * @param[in] compile_context The compile context to be used.
+ * @param[in, out] input1 First tensor input. Data types supported: U8.
+ * The input tensor is [in, out] because its TensorInfo might be modified inside the kernel in case of broadcasting of dimension 0.
+ * @param[in, out] input2 Second tensor input. Data types supported: same as @p input1.
+ * The input tensor is [in, out] because its TensorInfo might be modified inside the kernel in case of broadcasting of dimension 0.
+ * @param[out] output Output tensor. Data types supported: same as @p input1.
+ */
+ void
+ configure(const CLCompileContext &compile_context, ITensorInfo *input1, ITensorInfo *input2, ITensorInfo *output);
+ /** Static function to check if given info will lead to a valid configuration of @ref arm_compute::opencl::kernels::ClLogicalBinaryKernel
+ *
+ * @param[in] input1 First tensor input info. Data types supported: U8.
+ * @param[in] input2 Second tensor input info. Data types supported: same as @p input1.
+ * @param[in] output Output tensor info. Data types supported: same as @p input1.
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output);
+ // Inherited methods overridden:
+ void run(ITensorPack &tensors) override;
+};
+} // namespace experimental
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_CLLOGICALAND_H */
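A minimal usage sketch of the new function-level CLLogicalAnd API. This is a sketch only: the scheduler/tensor setup follows the library's usual runtime pattern, and the 16x16 U8 shape is purely illustrative.

    #include "arm_compute/core/TensorInfo.h"
    #include "arm_compute/runtime/CL/CLScheduler.h"
    #include "arm_compute/runtime/CL/CLTensor.h"
    #include "arm_compute/runtime/CL/functions/CLLogicalAnd.h"

    using namespace arm_compute;

    void logical_and_example()
    {
        CLScheduler::get().default_init(); // Context/queue for the default OpenCL device

        CLTensor a, b, out;
        const TensorInfo info(TensorShape(16U, 16U), 1, DataType::U8); // U8 is the only supported type
        a.allocator()->init(info);
        b.allocator()->init(info);
        out.allocator()->init(info);

        CLLogicalAnd logical_and;
        logical_and.configure(&a, &b, &out); // Configure before allocating so padding can be set up

        a.allocator()->allocate();
        b.allocator()->allocate();
        out.allocator()->allocate();

        logical_and.run();         // Enqueue the kernel
        CLScheduler::get().sync(); // Wait for completion
    }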
diff --git a/arm_compute/core/CL/kernels/CLDequantizationLayerKernel.h b/arm_compute/runtime/CL/functions/CLLogicalNot.h
index bb154f1a5b..c7d9db93d7 100644
--- a/arm_compute/core/CL/kernels/CLDequantizationLayerKernel.h
+++ b/arm_compute/runtime/CL/functions/CLLogicalNot.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2020-2021 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -21,59 +21,74 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
-#ifndef ARM_COMPUTE_CLDEQUANTIZATIONLAYERKERNEL_H
-#define ARM_COMPUTE_CLDEQUANTIZATIONLAYERKERNEL_H
+#ifndef ARM_COMPUTE_CLLOGICALNOT_H
+#define ARM_COMPUTE_CLLOGICALNOT_H
-#include "arm_compute/core/CL/ICLKernel.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/IFunction.h"
+
+#include <memory>
namespace arm_compute
{
+class CLCompileContext;
class ICLTensor;
+class ITensorInfo;
-/** Interface for the dequantization layer kernel. */
-class CLDequantizationLayerKernel : public ICLKernel
+/** Basic function to perform a logical NOT operation
+ *
+ * @note The tensor data type for the input must be U8.
+ * @note The function performs a logical NOT operation on the input tensor.
+ */
+class CLLogicalNot : public IFunction
{
public:
- /** Default constructor */
- CLDequantizationLayerKernel();
+ /** Default Constructor */
+ CLLogicalNot();
+ /** Default Destructor */
+ ~CLLogicalNot();
/** Prevent instances of this class from being copied (As this class contains pointers) */
- CLDequantizationLayerKernel(const CLDequantizationLayerKernel &) = delete;
+ CLLogicalNot(const CLLogicalNot &) = delete;
+ /** Default move constructor */
+ CLLogicalNot(CLLogicalNot &&);
/** Prevent instances of this class from being copied (As this class contains pointers) */
- CLDequantizationLayerKernel &operator=(const CLDequantizationLayerKernel &) = delete;
- /** Default Move Constructor. */
- CLDequantizationLayerKernel(CLDequantizationLayerKernel &&) = default;
+ CLLogicalNot &operator=(const CLLogicalNot &) = delete;
/** Default move assignment operator */
- CLDequantizationLayerKernel &operator=(CLDequantizationLayerKernel &&) = default;
- /** Default destructor */
- ~CLDequantizationLayerKernel() = default;
- /** Set the input, output, min and max.
+ CLLogicalNot &operator=(CLLogicalNot &&);
+ /** Initialize the function
*
- * @param[in] input Source tensor. Data types supported: QASYMM8/QASYMM8_SIGNED/QSYMM8_PER_CHANNEL/QSYMM8/QSYMM16.
- * @param[out] output Destination tensor. Data types supported: F16/F32.
+ * Valid data layouts:
+ * - All
+ *
+ * Valid data type configurations:
+ * |src |dst |
+ * |:--------------|:------------|
+ * |U8 |U8 |
+ *
+ * @param[in] input Input tensor. Data types supported: U8.
+ * @param[out] output Output tensor. Data types supported: same as @p input.
*/
void configure(const ICLTensor *input, ICLTensor *output);
- /** Set the input, output, min and max.
+ /** Initialize the function
*
* @param[in] compile_context The compile context to be used.
- * @param[in] input Source tensor. Data types supported: QASYMM8/QASYMM8_SIGNED/QSYMM8_PER_CHANNEL/QSYMM8/QSYMM16.
- * @param[out] output Destination tensor. Data types supported: F16/F32.
+ * @param[in] input Input tensor. Data types supported: U8.
+ * @param[out] output Output tensor. Data types supported: same as @p input.
*/
void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output);
- /** Static function to check if given info will lead to a valid configuration of @ref CLDequantizationLayerKernel
+ /** Static function to check if given info will lead to a valid configuration
*
- * @param[in] input Input tensor info. Data types supported: QASYMM8/QASYMM8_SIGNED/QSYMM8_PER_CHANNEL/QSYMM8/QSYMM16.
- * @param[in] output Output tensor info. Data types supported: F16/F32.
+ * @param[in] input Tensor input info. Data types supported: U8.
+ * @param[in] output Output tensor info. Data types supported: same as @p input.
*
* @return a status
*/
static Status validate(const ITensorInfo *input, const ITensorInfo *output);
-
- // Inherited methods overridden:
- void run(const Window &window, cl::CommandQueue &queue) override;
+ void run() override;
private:
- const ICLTensor *_input;
- ICLTensor *_output;
+ struct Impl;
+ std::unique_ptr<Impl> _impl;
};
} // namespace arm_compute
-#endif /*ARM_COMPUTE_CLDEQUANTIZATIONLAYERKERNEL_H */
+#endif /* ARM_COMPUTE_CLLOGICALNOT_H */
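Since the logical functions only accept U8, the static validate() hook is a natural guard before configuring. A sketch, assuming src and dst were already initialised and allocated by the caller:

    #include "arm_compute/runtime/CL/CLTensor.h"
    #include "arm_compute/runtime/CL/functions/CLLogicalNot.h"

    using namespace arm_compute;

    bool logical_not_checked(CLTensor &src, CLTensor &dst)
    {
        // Reject unsupported configurations (e.g. non-U8 data) without touching any OpenCL state
        const Status status = CLLogicalNot::validate(src.info(), dst.info());
        if(!bool(status))
        {
            return false;
        }

        CLLogicalNot logical_not;
        logical_not.configure(&src, &dst);
        logical_not.run();
        return true;
    }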
diff --git a/arm_compute/runtime/CL/functions/CLLogicalOr.h b/arm_compute/runtime/CL/functions/CLLogicalOr.h
new file mode 100644
index 0000000000..64b6d83177
--- /dev/null
+++ b/arm_compute/runtime/CL/functions/CLLogicalOr.h
@@ -0,0 +1,130 @@
+/*
+ * Copyright (c) 2020-2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CLLOGICALOR_H
+#define ARM_COMPUTE_CLLOGICALOR_H
+
+#include "arm_compute/core/Error.h"
+#include "arm_compute/runtime/CL/ICLOperator.h"
+#include "arm_compute/runtime/IFunction.h"
+
+namespace arm_compute
+{
+class CLCompileContext;
+class ICLTensor;
+class ITensorInfo;
+
+/** Basic function to run @ref arm_compute::opencl::kernels::ClLogicalBinaryKernel.
+ *
+ * @note The tensor data type for the inputs must be U8.
+ * @note The function performs a logical OR operation using the two input tensors.
+ */
+class CLLogicalOr : public IFunction
+{
+public:
+ /** Default Constructor */
+ CLLogicalOr();
+ /** Default Destructor */
+ ~CLLogicalOr();
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ CLLogicalOr(const CLLogicalOr &) = delete;
+ /** Default move constructor */
+ CLLogicalOr(CLLogicalOr &&);
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ CLLogicalOr &operator=(const CLLogicalOr &) = delete;
+ /** Default move assignment operator */
+ CLLogicalOr &operator=(CLLogicalOr &&);
+ /** Initialize the function
+ *
+ * Valid data layouts:
+ * - All
+ *
+ * Valid data type configurations:
+ * |src0 |src1 |dst |
+ * |:--------------|:-------------|:------------|
+ * |U8 |U8 |U8 |
+ *
+ * @param[in] input1 Input tensor. Data types supported: U8.
+ * @param[in] input2 Input tensor. Data types supported: same as @p input1.
+ * @param[out] output Output tensor. Data types supported: same as @p input1.
+ */
+ void configure(ICLTensor *input1, ICLTensor *input2, ICLTensor *output);
+ /** Initialize the function
+ *
+ * @param[in] compile_context The compile context to be used.
+ * @param[in] input1 Input tensor. Data types supported: U8.
+ * @param[in] input2 Input tensor. Data types supported: same as @p input1.
+ * @param[out] output Output tensor. Data types supported: same as @p input1.
+ */
+ void configure(const CLCompileContext &compile_context, ICLTensor *input1, ICLTensor *input2, ICLTensor *output);
+ /** Static function to check if given info will lead to a valid configuration
+ *
+ * @param[in] input1 First tensor input info. Data types supported: U8.
+ * @param[in] input2 Second tensor input info. Data types supported: same as @p input1.
+ * @param[in] output Output tensor info. Data types supported: same as @p input1.
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output);
+
+ // Inherited methods overridden:
+ void run() override;
+
+private:
+ struct Impl;
+ std::unique_ptr<Impl> _impl;
+};
+
+namespace experimental
+{
+class CLLogicalOr : public ICLOperator
+{
+public:
+ /** Default Constructor */
+ CLLogicalOr() = default;
+ /** Initialise the kernel's inputs and output.
+ *
+ * @param[in] compile_context The compile context to be used.
+ * @param[in, out] input1 First tensor input. Data types supported: U8.
+ * The input tensor is [in, out] because its TensorInfo might be modified inside the kernel in case of broadcasting of dimension 0.
+ * @param[in, out] input2 Second tensor input. Data types supported: same as @p input1.
+ * The input tensor is [in, out] because its TensorInfo might be modified inside the kernel in case of broadcasting of dimension 0.
+ * @param[out] output Output tensor. Data types supported: same as @p input1.
+ */
+ void
+ configure(const CLCompileContext &compile_context, ITensorInfo *input1, ITensorInfo *input2, ITensorInfo *output);
+ /** Static function to check if given info will lead to a valid configuration of @ref arm_compute::opencl::kernels::ClLogicalBinaryKernel
+ *
+ * @param[in] input1 First tensor input info. Data types supported: U8.
+ * @param[in] input2 Second tensor input info. Data types supported: same as @p input1.
+ * @param[in] output Output tensor info. Data types supported: same as @p input1.
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output);
+ // Inherited methods overridden:
+ void run(ITensorPack &tensors) override;
+};
+} // namespace experimental
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_CLLOGICALOR_H */
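The experimental class exposes the operator-level flow: configuration happens on ITensorInfo only, and concrete tensors are bound at each run() through an ITensorPack. A sketch, assuming the default compile context from CLKernelLibrary, tensors already allocated, and the ACL_SRC_0/ACL_SRC_1/ACL_DST slot ids used elsewhere in the library:

    #include "arm_compute/core/CL/CLKernelLibrary.h"
    #include "arm_compute/core/ITensorPack.h"
    #include "arm_compute/core/experimental/Types.h"
    #include "arm_compute/runtime/CL/CLTensor.h"
    #include "arm_compute/runtime/CL/functions/CLLogicalOr.h"

    using namespace arm_compute;

    void logical_or_operator_style(CLTensor &src0, CLTensor &src1, CLTensor &dst)
    {
        experimental::CLLogicalOr op;
        op.configure(CLKernelLibrary::get().get_compile_context(), src0.info(), src1.info(), dst.info());

        // Tensors are bound only at execution time, so one configured operator
        // can be reused with different tensors of matching TensorInfo
        ITensorPack pack;
        pack.add_tensor(TensorType::ACL_SRC_0, &src0);
        pack.add_tensor(TensorType::ACL_SRC_1, &src1);
        pack.add_tensor(TensorType::ACL_DST, &dst);
        op.run(pack);
    }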
diff --git a/arm_compute/runtime/CL/functions/CLMagnitude.h b/arm_compute/runtime/CL/functions/CLMagnitude.h
deleted file mode 100644
index e52ab240e4..0000000000
--- a/arm_compute/runtime/CL/functions/CLMagnitude.h
+++ /dev/null
@@ -1,57 +0,0 @@
-/*
- * Copyright (c) 2016-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CLMAGNITUDE_H
-#define ARM_COMPUTE_CLMAGNITUDE_H
-
-#include "arm_compute/core/Types.h"
-#include "arm_compute/runtime/CL/ICLSimpleFunction.h"
-
-namespace arm_compute
-{
-class ICLTensor;
-
-/** Basic function to run @ref CLMagnitudePhaseKernel. */
-class CLMagnitude : public ICLSimpleFunction
-{
-public:
- /** Initialise the kernel's inputs.
- *
- * @param[in] input1 First tensor input. Data types supported: S16.
- * @param[in] input2 Second tensor input. Data types supported: S16.
- * @param[out] output Output tensor. Data types supported: S16.
- * @param[in] mag_type (Optional) Magnitude calculation type. Default: L2NORM.
- */
- void configure(const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output, MagnitudeType mag_type = MagnitudeType::L2NORM);
- /** Initialise the kernel's inputs.
- *
- * @param[in] compile_context The compile context to be used.
- * @param[in] input1 First tensor input. Data types supported: S16.
- * @param[in] input2 Second tensor input. Data types supported: S16.
- * @param[out] output Output tensor. Data types supported: S16.
- * @param[in] mag_type (Optional) Magnitude calculation type. Default: L2NORM.
- */
- void configure(const CLCompileContext &compile_context, const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output, MagnitudeType mag_type = MagnitudeType::L2NORM);
-};
-}
-#endif /*ARM_COMPUTE_CLMAGNITUDE_H */
diff --git a/arm_compute/runtime/CL/functions/CLMatMul.h b/arm_compute/runtime/CL/functions/CLMatMul.h
new file mode 100644
index 0000000000..9c9939b9d0
--- /dev/null
+++ b/arm_compute/runtime/CL/functions/CLMatMul.h
@@ -0,0 +1,134 @@
+/*
+ * Copyright (c) 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ACL_ARM_COMPUTE_RUNTIME_CL_FUNCTIONS_CLMATMUL
+#define ACL_ARM_COMPUTE_RUNTIME_CL_FUNCTIONS_CLMATMUL
+
+#include "arm_compute/core/Types.h"
+#include "arm_compute/function_info/ActivationLayerInfo.h"
+#include "arm_compute/runtime/IFunction.h"
+
+#include <memory>
+
+namespace arm_compute
+{
+// Forward declarations of used types instead of including their headers, which can help minimize compile time
+class CLCompileContext;
+class ICLTensor;
+class ITensorInfo;
+class MatMulInfo;
+class Status;
+
+/** Settings for MatMul OpenCL implementation */
+class GpuMatMulSettings
+{
+public:
+ /* Placeholder for operator parity between CPU/GPU */
+};
+
+/** Basic function to execute MatMul (Matrix Multiplication) on OpenCL */
+class CLMatMul : public IFunction
+{
+public:
+ /** Default constructor. */
+ CLMatMul();
+ /** Default destructor */
+ ~CLMatMul();
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ CLMatMul(const CLMatMul &) = delete;
+ /** Default move constructor */
+ CLMatMul(CLMatMul &&);
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ CLMatMul &operator=(const CLMatMul &) = delete;
+ /** Default move assignment operator */
+ CLMatMul &operator=(CLMatMul &&);
+ /** Initialise the kernel's inputs and output
+ *
+ * Valid data layouts:
+ * - All
+ *
+ * Valid data type configurations:
+ * |lhs |rhs |dst |
+ * |:--------------|:--------------|:--------------|
+ * |F32 |F32 |F32 |
+ * |F16 |F16 |F16 |
+ * |QASYMM8_SIGNED |QASYMM8_SIGNED |QASYMM8_SIGNED |
+ * |QASYMM8 |QASYMM8 |QASYMM8 |
+ *
+ * @note BatchMatMul: Batched Matrix Multiply - [A * B]. Multiplies all slices (a slice is an element of a batch) of Tensors A and B
+ *       and stores the result in the dst tensor of the same batch size.
+ *       Batch here is the number of slices from A and B multiplied at a time; do not confuse it with the batch dimension 'N' of NHWC/NCHW.
+ *       For NHWC, for example, the batch is H * N, and in general it is the product of all dimensions above the two matrix dimensions.
+ * @note All tensors must have the same data type.
+ *
+ * @param[in] compile_context The compile context to be used.
+ * @param[in] lhs Left-hand side tensor info containing the input activations as Matrix A. Data types supported: F16/F32/QASYMM8_SIGNED/QASYMM8.
+ * @param[in] rhs Right-hand side tensor info containing the input weights as Matrix B. Data types supported: same as @p lhs.
+ * @param[out] dst Output tensor to store the result of the batched matrix multiplication. Data types supported: same as @p lhs.
+ * @param[in] matmul_info Contains MatMul operation information described in @ref MatMulInfo.
+ * @param[in] settings Contains flags for function-level settings
+ * @param[in] act_info (Optional) Contains activation function and lower and upper bound values for bounded activation functions.
+ */
+ void configure(const CLCompileContext &compile_context,
+ ICLTensor *rhs,
+ ICLTensor *lhs,
+ ICLTensor *dst,
+ const MatMulInfo &matmul_info,
+ const GpuMatMulSettings &settings = GpuMatMulSettings{},
+ const ActivationLayerInfo &act_info = ActivationLayerInfo{});
+ /** Initialise the kernel's inputs and output
+ *
+ * Similar to @ref CLMatMul::configure()
+ */
+ void configure(ICLTensor *lhs,
+ ICLTensor *rhs,
+ ICLTensor *dst,
+ const MatMulInfo &matmul_info,
+ const GpuMatMulSettings &settings = GpuMatMulSettings{},
+ const ActivationLayerInfo &act_info = ActivationLayerInfo{});
+ /** Static function to check if given info will lead to a valid configuration of @ref CLMatMul.
+ *
+ *
+ * @note All tensors must have the same data type.
+ *
+ * @param[in] lhs Left-hand side (Matrix A) tensor info. Data types supported: F16/F32/QASYMM8_SIGNED/QASYMM8.
+ * @param[in] rhs Right-hand side (Matrix B) tensor info. Data types supported: same as @p lhs.
+ * @param[in] output Output tensor info to store the result of the batched matrix multiplication. Data types supported: same as @p lhs.
+ * @param[in] matmul_info Contains MatMul operation information described in @ref MatMulInfo.
+ * @param[in] act_info (Optional) Contains activation function and lower and upper bound values for bounded activation functions.
+ */
+ static Status validate(const ITensorInfo *lhs,
+ const ITensorInfo *rhs,
+ const ITensorInfo *output,
+ const MatMulInfo &matmul_info,
+ const ActivationLayerInfo &act_info = ActivationLayerInfo{});
+ // Inherited methods overridden:
+ void run() override;
+
+private:
+ struct Impl;
+ std::unique_ptr<Impl> _impl;
+};
+} // namespace arm_compute
+
+#endif /* ACL_ARM_COMPUTE_RUNTIME_CL_FUNCTIONS_CLMATMUL */
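A configuration sketch for the new function. Shapes follow ACL's [dim0, dim1, ...] ordering, i.e. LHS [K, M], RHS [N, K], DST [N, M], and are purely illustrative; a default MatMulInfo leaves both operands untransposed. The overload without a compile context is used, whose parameter order matches the documentation (note the compile-context overload declares rhs before lhs):

    #include "arm_compute/core/TensorInfo.h"
    #include "arm_compute/function_info/MatMulInfo.h"
    #include "arm_compute/runtime/CL/CLScheduler.h"
    #include "arm_compute/runtime/CL/CLTensor.h"
    #include "arm_compute/runtime/CL/functions/CLMatMul.h"

    using namespace arm_compute;

    void matmul_example()
    {
        CLScheduler::get().default_init();

        CLTensor lhs, rhs, dst;
        lhs.allocator()->init(TensorInfo(TensorShape(32U, 8U), 1, DataType::F32));  // [K, M] = [32, 8]
        rhs.allocator()->init(TensorInfo(TensorShape(16U, 32U), 1, DataType::F32)); // [N, K] = [16, 32]
        dst.allocator()->init(TensorInfo(TensorShape(16U, 8U), 1, DataType::F32));  // [N, M] = [16, 8]

        MatMulInfo mm_info; // No transpose on either operand by default
        CLMatMul matmul;
        matmul.configure(&lhs, &rhs, &dst, mm_info); // settings and act_info keep their defaults

        lhs.allocator()->allocate();
        rhs.allocator()->allocate();
        dst.allocator()->allocate();

        matmul.run();
        CLScheduler::get().sync();
    }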
diff --git a/arm_compute/runtime/CL/functions/CLMaxUnpoolingLayer.h b/arm_compute/runtime/CL/functions/CLMaxUnpoolingLayer.h
new file mode 100644
index 0000000000..2d2f064b4c
--- /dev/null
+++ b/arm_compute/runtime/CL/functions/CLMaxUnpoolingLayer.h
@@ -0,0 +1,125 @@
+/*
+ * Copyright (c) 2020-2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CLMAXUNPOOLINGLAYER_H
+#define ARM_COMPUTE_CLMAXUNPOOLINGLAYER_H
+
+#include "arm_compute/core/Error.h"
+#include "arm_compute/runtime/CL/functions/CLFill.h"
+#include "arm_compute/runtime/IFunction.h"
+
+#include <memory>
+
+namespace arm_compute
+{
+class CLCompileContext;
+class ICLTensor;
+class ITensorInfo;
+class CLMaxUnpoolingLayerKernel;
+struct PoolingLayerInfo;
+
+/** Function to perform MaxUnpooling. This function calls the following OpenCL kernels:
+ *
+ * -# @ref CLFill
+ * -# @ref CLMaxUnpoolingLayerKernel
+ */
+class CLMaxUnpoolingLayer : public IFunction
+{
+public:
+ /** Constructor */
+ CLMaxUnpoolingLayer();
+ /** Prevent instances of this class from being copied */
+ CLMaxUnpoolingLayer(const CLMaxUnpoolingLayer &) = delete;
+ /** Prevent instances of this class from being copied */
+ CLMaxUnpoolingLayer &operator=(const CLMaxUnpoolingLayer &) = delete;
+ /** Default destructor */
+ ~CLMaxUnpoolingLayer();
+ /** Set the input and output tensors.
+ *
+ * Valid data layouts:
+ * - NHWC
+ * - NCHW
+ *
+ * Valid data type configurations:
+ * |src |dst |
+ * |:--------------|:--------------|
+ * |QASYMM8 |QASYMM8 |
+ * |QASYMM8_SIGNED |QASYMM8_SIGNED |
+ * |F16 |F16 |
+ * |F32 |F32 |
+ *
+ * @note Output shape must be equal to the shape of the original input to pool.
+ *
+ * @param[in] input Source tensor. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32.
+ * @param[in] indices Tensor containing the offsets at which to store the input elements in the output tensor.
+ * @ref CLPoolingLayer with indices should precede this function in order to
+ * properly reconstruct the output tensor.
+ * The shape of this tensor has to be equal to the input tensor shape. Data type supported: U32.
+ * @param[out] output Destination tensor. Data types supported: Same as @p input.
+ * @param[in] pool_info Contains pooling operation information described in @ref PoolingLayerInfo.
+ */
+ void configure(ICLTensor *input, ICLTensor *indices, ICLTensor *output, const PoolingLayerInfo &pool_info);
+ /** Set the input and output tensors.
+ *
+ * @note Output shape must be equal to the shape of the original input to pool.
+ *
+ * @param[in] compile_context The compile context to be used.
+ * @param[in] input Source tensor. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32.
+ * @param[in] indices Tensor containing the offsets at which to store the input elements in the output tensor.
+ * @ref CLPoolingLayer with indices should precede this function in order to
+ * properly reconstruct the output tensor.
+ * The shape of this tensor has to be equal to the input tensor shape. Data type supported: U32.
+ * @param[out] output Destination tensor. Data types supported: Same as @p input.
+ * @param[in] pool_info Contains pooling operation information described in @ref PoolingLayerInfo.
+ */
+ void configure(const CLCompileContext &compile_context,
+ ICLTensor *input,
+ ICLTensor *indices,
+ ICLTensor *output,
+ const PoolingLayerInfo &pool_info);
+ /** Static function to check if given info will lead to a valid configuration of @ref CLMaxUnpoolingLayer
+ *
+ * @param[in] input Source tensor info. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32.
+ * @param[in] indices TensorInfo associated with the tensor containing the offsets at which to store the input elements in the output tensor.
+ * @ref CLPoolingLayer with indices should precede this function in order to
+ * properly reconstruct the output tensor.
+ * The shape of this tensor has to be equal to the input tensor shape. Data type supported: U32.
+ * @param[in] output Destination tensor info. Data types supported: Same as @p input.
+ * @param[in] pool_info Contains pooling operation information described in @ref PoolingLayerInfo.
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *input,
+ const ITensorInfo *indices,
+ const ITensorInfo *output,
+ const PoolingLayerInfo &pool_info);
+
+ // Inherited methods overridden:
+ void run() override;
+
+private:
+ CLFill _fill;
+ std::unique_ptr<CLMaxUnpoolingLayerKernel> _unpooling_layer_kernel;
+};
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_CLMAXUNPOOLINGLAYER_H */
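A sketch of the pooling/unpooling round trip the notes above describe. The 2x2/stride-2 NHWC max pool is illustrative, and all four tensors are assumed to be initialised with compatible shapes and allocated after configuration:

    #include "arm_compute/core/Types.h"
    #include "arm_compute/runtime/CL/CLScheduler.h"
    #include "arm_compute/runtime/CL/CLTensor.h"
    #include "arm_compute/runtime/CL/functions/CLMaxUnpoolingLayer.h"
    #include "arm_compute/runtime/CL/functions/CLPoolingLayer.h"

    using namespace arm_compute;

    void max_unpool_round_trip(CLTensor &src, CLTensor &pooled, CLTensor &indices, CLTensor &restored)
    {
        const PoolingLayerInfo pool_info(PoolingType::MAX, 2, DataLayout::NHWC, PadStrideInfo(2, 2, 0, 0));

        // Max-pool first, requesting the optional U32 argmax indices
        CLPoolingLayer pool;
        pool.configure(&src, &pooled, pool_info, &indices);

        // Unpool: scatter the pooled values back to their argmax positions, zero-fill elsewhere;
        // 'restored' must have the shape of the original 'src'
        CLMaxUnpoolingLayer unpool;
        unpool.configure(&pooled, &indices, &restored, pool_info);

        pool.run();
        unpool.run();
        CLScheduler::get().sync();
    }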
diff --git a/arm_compute/runtime/CL/functions/CLMeanStdDev.h b/arm_compute/runtime/CL/functions/CLMeanStdDev.h
deleted file mode 100644
index 561ac04f1d..0000000000
--- a/arm_compute/runtime/CL/functions/CLMeanStdDev.h
+++ /dev/null
@@ -1,102 +0,0 @@
-/*
- * Copyright (c) 2016-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CLMEANSTDDEV_H
-#define ARM_COMPUTE_CLMEANSTDDEV_H
-
-#include "arm_compute/core/CL/OpenCL.h"
-#include "arm_compute/core/CL/kernels/CLFillBorderKernel.h"
-#include "arm_compute/core/CL/kernels/CLMeanStdDevKernel.h"
-#include "arm_compute/runtime/CL/functions/CLReductionOperation.h"
-#include "arm_compute/runtime/IFunction.h"
-#include "arm_compute/runtime/IMemoryManager.h"
-#include "arm_compute/runtime/MemoryGroup.h"
-
-namespace arm_compute
-{
-/** Basic function to execute mean and standard deviation by calling @ref CLMeanStdDevKernel */
-class CLMeanStdDev : public IFunction
-{
-public:
- /** Default Constructor. */
- CLMeanStdDev(std::shared_ptr<IMemoryManager> memory_manager = nullptr);
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- CLMeanStdDev(const CLMeanStdDev &) = delete;
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- CLMeanStdDev &operator=(const CLMeanStdDev &) = delete;
- /** Allow instances of this class to be moved */
- CLMeanStdDev(CLMeanStdDev &&) = default;
- /** Allow instances of this class to be moved */
- CLMeanStdDev &operator=(CLMeanStdDev &&) = default;
- /** Default destructor */
- ~CLMeanStdDev() = default;
- /** Initialise the kernel's inputs and outputs.
- *
- * @param[in, out] input Input image. Data types supported: U8/F16/F32. (Written to only for border filling)
- * @param[out] mean Output average pixel value.
- * @param[out] stddev (Optional) Output standard deviation of pixel values.
- */
- void configure(ICLImage *input, float *mean, float *stddev = nullptr);
- /** Initialise the kernel's inputs and outputs.
- *
- * @param[in] compile_context The compile context to be used.
- * @param[in, out] input Input image. Data types supported: U8/F16/F32. (Written to only for border filling)
- * @param[out] mean Output average pixel value.
- * @param[out] stddev (Optional) Output standard deviation of pixel values.
- */
- void configure(const CLCompileContext &compile_context, ICLImage *input, float *mean, float *stddev = nullptr);
- /** Static function to check if given info will lead to a valid configuration of @ref CLMeanStdDev
- *
- * @param[in] input Input image. Data types supported: U8/F16/F32.
- * @param[in] mean Output average pixel value.
- * @param[in] stddev (Optional) Output standard deviation of pixel values.
- *
- * @return a status
- */
- static Status validate(ITensorInfo *input, float *mean, float *stddev = nullptr);
-
- // Inherited methods overridden:
- void run() override;
-
-private:
- template <typename T>
- void run_float();
- void run_int();
-
- MemoryGroup _memory_group; /**< Function's memory group */
- DataType _data_type; /**< Input data type. */
- unsigned int _num_pixels; /**< Number of image's pixels. */
- bool _run_stddev; /**< Flag for knowing if we should run stddev reduction function. */
- CLReductionOperation _reduction_operation_mean; /**< Reduction operation function for computing mean value. */
- CLReductionOperation _reduction_operation_stddev; /**< Reduction operation function for computing standard deviation. */
- CLTensor _reduction_output_mean; /**< Reduction operation output tensor for mean value. */
- CLTensor _reduction_output_stddev; /**< Reduction operation output tensor for standard deviation value. */
- float *_mean; /**< Pointer that holds the mean value. */
- float *_stddev; /**< Pointer that holds the standard deviation value. */
- CLMeanStdDevKernel _mean_stddev_kernel; /**< Kernel that standard deviation calculation. */
- CLFillBorderKernel _fill_border_kernel; /**< Kernel that fills the border with zeroes. */
- cl::Buffer _global_sum; /**< Variable that holds the global sum among calls in order to ease reduction */
- cl::Buffer _global_sum_squared; /**< Variable that holds the global sum of squared values among calls in order to ease reduction */
-};
-}
-#endif /*ARM_COMPUTE_CLMEANSTDDEV_H */
diff --git a/arm_compute/runtime/CL/functions/CLMeanStdDevNormalizationLayer.h b/arm_compute/runtime/CL/functions/CLMeanStdDevNormalizationLayer.h
index e39a5908b8..951db3e419 100644
--- a/arm_compute/runtime/CL/functions/CLMeanStdDevNormalizationLayer.h
+++ b/arm_compute/runtime/CL/functions/CLMeanStdDevNormalizationLayer.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019-2020 ARM Limited.
+ * Copyright (c) 2019-2021 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -29,7 +29,9 @@
namespace arm_compute
{
+class CLCompileContext;
class ICLTensor;
+class ITensorInfo;
/** Basic function to execute mean and standard deviation normalization by calling @ref CLMeanStdDevNormalizationKernel */
class CLMeanStdDevNormalizationLayer : public ICLSimpleFunction
@@ -37,6 +39,16 @@ class CLMeanStdDevNormalizationLayer : public ICLSimpleFunction
public:
/** Initialise the function's input and outputs.
*
+ * Valid data layouts:
+ * - NHWC
+ * - NCHW
+ *
+ * Valid data type configurations:
+ * |src |dst |
+ * |:--------|:---------|
+ * |F32 |F32 |
+ * |F16 |F16 |
+ *
* @note If the output tensor is a nullptr, the normalization will be performed in-place.
*
* @param[in, out] input Input tensor with 2 dimensions. Data types supported: F16/F32.
@@ -53,7 +65,10 @@ public:
* @param[out] output (Optional) Destination tensor. It can be nullptr in case of in-place computation. Data type supported: same as @p input
* @param[in] epsilon (Optional) Small float to avoid division by zero in case of zero standard deviation. Defaults to 1e-8.
*/
- void configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output = nullptr, float epsilon = 1e-8f);
+ void configure(const CLCompileContext &compile_context,
+ ICLTensor *input,
+ ICLTensor *output = nullptr,
+ float epsilon = 1e-8f);
/** Static function to check if given info will lead to a valid configuration of @ref CLMeanStdDevNormalizationKernel
*
* @param[in] input Source tensor info with 2 dimensions. In case of @p output tensor info = nullptr,
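The in-place path mentioned in the note above, where the output argument is omitted so normalization overwrites the input and epsilon keeps its 1e-8 default. A sketch, assuming a 2D F16/F32 tensor that is already initialised and allocated:

    #include "arm_compute/runtime/CL/CLTensor.h"
    #include "arm_compute/runtime/CL/functions/CLMeanStdDevNormalizationLayer.h"

    using namespace arm_compute;

    void normalize_rows_in_place(CLTensor &features) // 2D tensor, e.g. [width, batch]
    {
        CLMeanStdDevNormalizationLayer msd_norm;
        msd_norm.configure(&features); // output = nullptr -> in-place, epsilon = 1e-8f
        msd_norm.run();
    }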
diff --git a/arm_compute/runtime/CL/functions/CLMedian3x3.h b/arm_compute/runtime/CL/functions/CLMedian3x3.h
deleted file mode 100644
index f3bb2832ef..0000000000
--- a/arm_compute/runtime/CL/functions/CLMedian3x3.h
+++ /dev/null
@@ -1,64 +0,0 @@
-/*
- * Copyright (c) 2016-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CLMEDIAN3X3_H
-#define ARM_COMPUTE_CLMEDIAN3X3_H
-
-#include "arm_compute/core/Types.h"
-#include "arm_compute/runtime/CL/ICLSimpleFunction.h"
-
-#include <cstdint>
-
-namespace arm_compute
-{
-class ICLTensor;
-
-/** Basic function to execute median filter. This function calls the following OpenCL kernels:
- *
- * -# @ref CLFillBorderKernel (executed if border_mode == CONSTANT or border_mode == REPLICATE)
- * -# @ref CLMedian3x3Kernel
- *
- */
-class CLMedian3x3 : public ICLSimpleFunction
-{
-public:
- /** Initialise the function's source, destinations and border mode.
- *
- * @param[in,out] input Source tensor. Data types supported: U8. (Written to only for @p border_mode != UNDEFINED)
- * @param[out] output Destination tensor, Data types supported: U8.
- * @param[in] border_mode Border mode to use for the convolution.
- * @param[in] constant_border_value (Optional) Constant value to use for borders if border_mode is set to CONSTANT.
- */
- void configure(ICLTensor *input, ICLTensor *output, BorderMode border_mode, uint8_t constant_border_value = 0);
- /** Initialise the function's source, destinations and border mode.
- *
- * @param[in] compile_context The compile context to be used.
- * @param[in,out] input Source tensor. Data types supported: U8. (Written to only for @p border_mode != UNDEFINED)
- * @param[out] output Destination tensor, Data types supported: U8.
- * @param[in] border_mode Border mode to use for the convolution.
- * @param[in] constant_border_value (Optional) Constant value to use for borders if border_mode is set to CONSTANT.
- */
- void configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, BorderMode border_mode, uint8_t constant_border_value = 0);
-};
-}
-#endif /*ARM_COMPUTE_CLMEDIAN3X3_H */
diff --git a/arm_compute/runtime/CL/functions/CLMinMaxLocation.h b/arm_compute/runtime/CL/functions/CLMinMaxLocation.h
deleted file mode 100644
index e9e3bd910c..0000000000
--- a/arm_compute/runtime/CL/functions/CLMinMaxLocation.h
+++ /dev/null
@@ -1,102 +0,0 @@
-/*
- * Copyright (c) 2017-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CLMINMAXLOCATION_H
-#define ARM_COMPUTE_CLMINMAXLOCATION_H
-
-#include "arm_compute/core/CL/kernels/CLMinMaxLocationKernel.h"
-#include "arm_compute/runtime/CL/CLArray.h"
-#include "arm_compute/runtime/IFunction.h"
-
-namespace arm_compute
-{
-class ICLTensor;
-using ICLImage = ICLTensor;
-
-/** Basic function to execute min and max location. This function calls the following OpenCL kernels:
- *
- * -# @ref CLMinMaxKernel
- * -# @ref CLMinMaxLocationKernel
- */
-class CLMinMaxLocation : public IFunction
-{
-public:
- /** Constructor */
- CLMinMaxLocation();
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- CLMinMaxLocation(const CLMinMaxLocation &) = delete;
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- CLMinMaxLocation &operator=(const CLMinMaxLocation &) = delete;
- /** Allow instances of this class to be moved */
- CLMinMaxLocation(CLMinMaxLocation &&) = default;
- /** Allow instances of this class to be moved */
- CLMinMaxLocation &operator=(CLMinMaxLocation &&) = default;
- /** Initialise the kernel's inputs and outputs.
- *
- * @note When locations of min and max occurrences are requested, the reported number of locations is limited to the given array size.
- *
- * @param[in] input Input image. Data types supported: U8/S16/F32.
- * @param[out] min Minimum value of image. Data types supported: S32 if input type is U8/S16, F32 if input type is F32.
- * @param[out] max Maximum value of image. Data types supported: S32 if input type is U8/S16, F32 if input type is F32.
- * @param[out] min_loc (Optional) Array of Coordinates2D used to store minimum value locations.
- * @param[out] max_loc (Optional) Array of Coordinates2D used to store maximum value locations.
- * @param[out] min_count (Optional) Number of minimum value encounters.
- * @param[out] max_count (Optional) Number of maximum value encounters.
- */
- void configure(const ICLImage *input, void *min, void *max,
- CLCoordinates2DArray *min_loc = nullptr, CLCoordinates2DArray *max_loc = nullptr,
- uint32_t *min_count = nullptr, uint32_t *max_count = nullptr);
- /** Initialise the kernel's inputs and outputs.
- *
- * @note When locations of min and max occurrences are requested, the reported number of locations is limited to the given array size.
- *
- * @param[in] compile_context The compile context to be used.
- * @param[in] input Input image. Data types supported: U8/S16/F32.
- * @param[out] min Minimum value of image. Data types supported: S32 if input type is U8/S16, F32 if input type is F32.
- * @param[out] max Maximum value of image. Data types supported: S32 if input type is U8/S16, F32 if input type is F32.
- * @param[out] min_loc (Optional) Array of Coordinates2D used to store minimum value locations.
- * @param[out] max_loc (Optional) Array of Coordinates2D used to store maximum value locations.
- * @param[out] min_count (Optional) Number of minimum value encounters.
- * @param[out] max_count (Optional) Number of maximum value encounters.
- */
- void configure(const CLCompileContext &compile_context, const ICLImage *input, void *min, void *max,
- CLCoordinates2DArray *min_loc = nullptr, CLCoordinates2DArray *max_loc = nullptr,
- uint32_t *min_count = nullptr, uint32_t *max_count = nullptr);
-
- // Inherited methods overridden:
- void run() override;
-
-private:
- CLMinMaxKernel _min_max_kernel; /**< Kernel that performs min/max */
- CLMinMaxLocationKernel _min_max_loc_kernel; /**< Kernel that counts min/max occurrences and identifies their positions */
- cl::Buffer _min_max_vals; /**< Buffer to collect min, max values */
- cl::Buffer _min_max_count_vals; /**< Buffer to collect min, max values */
- void *_min; /**< Minimum value. */
- void *_max; /**< Maximum value. */
- uint32_t *_min_count; /**< Minimum value occurrences. */
- uint32_t *_max_count; /**< Maximum value occurrences. */
- CLCoordinates2DArray *_min_loc; /**< Minimum value occurrences coordinates. */
- CLCoordinates2DArray *_max_loc; /**< Maximum value occurrences coordinates. */
-};
-}
-#endif /*ARM_COMPUTE_CLMINMAXLOCATION_H */
diff --git a/arm_compute/runtime/CL/functions/CLNonLinearFilter.h b/arm_compute/runtime/CL/functions/CLNonLinearFilter.h
deleted file mode 100644
index 79f73ea370..0000000000
--- a/arm_compute/runtime/CL/functions/CLNonLinearFilter.h
+++ /dev/null
@@ -1,75 +0,0 @@
-/*
- * Copyright (c) 2017-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CLNONLINEARFILTER_H
-#define ARM_COMPUTE_CLNONLINEARFILTER_H
-
-#include "arm_compute/core/Types.h"
-#include "arm_compute/runtime/CL/ICLSimpleFunction.h"
-
-#include <cstdint>
-
-namespace arm_compute
-{
-class ICLTensor;
-
-/** Basic function to execute non linear filter. This function calls the following OpenCL kernels:
- *
- * -# @ref CLFillBorderKernel (executed if border_mode == CONSTANT or border_mode == REPLICATE)
- * -# @ref CLNonLinearFilterKernel
- *
- * @note Supported mask dimensions squares of sizes 3, 5
- */
-class CLNonLinearFilter : public ICLSimpleFunction
-{
-public:
- /** Initialize the function's source, destination, conv and border_mode.
- *
- * @param[in,out] input Source tensor. Data types supported: U8. (Written to only for @p border_mode != UNDEFINED)
- * @param[out] output Destination tensor. Data types supported: U8
- * @param[in] function Non linear function to perform
- * @param[in] mask_size Mask size. Supported sizes: 3, 5
- * @param[in] pattern Mask pattern
- * @param[in] mask The given mask. Will be used only if pattern is specified to PATTERN_OTHER
- * @param[in] border_mode Strategy to use for borders.
- * @param[in] constant_border_value (Optional) Constant value to use for borders if border_mode is set to CONSTANT.
- */
- void configure(ICLTensor *input, ICLTensor *output, NonLinearFilterFunction function, unsigned int mask_size, MatrixPattern pattern, const uint8_t *mask,
- BorderMode border_mode, uint8_t constant_border_value = 0);
- /** Initialize the function's source, destination, conv and border_mode.
- *
- * @param[in] compile_context The compile context to be used.
- * @param[in,out] input Source tensor. Data types supported: U8. (Written to only for @p border_mode != UNDEFINED)
- * @param[out] output Destination tensor. Data types supported: U8
- * @param[in] function Non linear function to perform
- * @param[in] mask_size Mask size. Supported sizes: 3, 5
- * @param[in] pattern Mask pattern
- * @param[in] mask The given mask. Will be used only if pattern is specified to PATTERN_OTHER
- * @param[in] border_mode Strategy to use for borders.
- * @param[in] constant_border_value (Optional) Constant value to use for borders if border_mode is set to CONSTANT.
- */
- void configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, NonLinearFilterFunction function, unsigned int mask_size, MatrixPattern pattern, const uint8_t *mask,
- BorderMode border_mode, uint8_t constant_border_value = 0);
-};
-}
-#endif /*ARM_COMPUTE_CLNONLINEARFILTER_H */
diff --git a/arm_compute/runtime/CL/functions/CLNonMaximaSuppression3x3.h b/arm_compute/runtime/CL/functions/CLNonMaximaSuppression3x3.h
deleted file mode 100644
index e2c0c4f814..0000000000
--- a/arm_compute/runtime/CL/functions/CLNonMaximaSuppression3x3.h
+++ /dev/null
@@ -1,67 +0,0 @@
-/*
- * Copyright (c) 2016-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CLNONMAXIMASUPPRESSION3X3_H
-#define ARM_COMPUTE_CLNONMAXIMASUPPRESSION3X3_H
-
-#include "arm_compute/core/Types.h"
-#include "arm_compute/runtime/CL/ICLSimpleFunction.h"
-
-namespace arm_compute
-{
-class ICLTensor;
-
-/** Basic function to execute non-maxima suppression over a 3x3 window. This function calls the following CL kernels:
- *
- * -# @ref CLFillBorderKernel (executed if border_mode == CONSTANT or border_mode == REPLICATE)
- * -# @ref CLNonMaximaSuppression3x3Kernel
- */
-class CLNonMaximaSuppression3x3 : public ICLSimpleFunction
-{
-public:
- /** Initialise the function's source, destinations and border mode.
- *
- * @note The implementation supports just 2 border modes: UNDEFINED and CONSTANT
- * The constant values used with CONSTANT border mode is 0
- *
- * @param[in,out] input Source tensor. Data types supported: U8, F32. (Written to only for @p border_mode != UNDEFINED)
- * @param[out] output Destination for the Non-Maxima suppressions 3x3. Data types supported: same as @p input.
- * @param[in] border_mode Border mode to use for non-maxima suppression.
- * The implementation supports just 2 border modes: UNDEFINED and CONSTANT
- */
- void configure(ICLTensor *input, ICLTensor *output, BorderMode border_mode);
- /** Initialise the function's source, destinations and border mode.
- *
- * @note The implementation supports just 2 border modes: UNDEFINED and CONSTANT
- * The constant values used with CONSTANT border mode is 0
- *
- * @param[in] compile_context The compile context to be used.
- * @param[in,out] input Source tensor. Data types supported: U8, F32. (Written to only for @p border_mode != UNDEFINED)
- * @param[out] output Destination for the Non-Maxima suppressions 3x3. Data types supported: same as @p input.
- * @param[in] border_mode Border mode to use for non-maxima suppression.
- * The implementation supports just 2 border modes: UNDEFINED and CONSTANT
- */
- void configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, BorderMode border_mode);
-};
-}
-#endif /* ARM_COMPUTE_CLNONMAXIMASUPPRESSION3X3_H */
diff --git a/arm_compute/runtime/CL/functions/CLNormalizationLayer.h b/arm_compute/runtime/CL/functions/CLNormalizationLayer.h
index 07bb62c7d7..10fd8ed4c6 100644
--- a/arm_compute/runtime/CL/functions/CLNormalizationLayer.h
+++ b/arm_compute/runtime/CL/functions/CLNormalizationLayer.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2017-2021 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -24,18 +24,19 @@
#ifndef ARM_COMPUTE_CLNORMALIZATIONLAYER_H
#define ARM_COMPUTE_CLNORMALIZATIONLAYER_H
+#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/CL/CLTensor.h"
#include "arm_compute/runtime/IFunction.h"
-#include "arm_compute/core/CL/kernels/CLFillBorderKernel.h"
-#include "arm_compute/core/CL/kernels/CLNormalizationLayerKernel.h"
-#include "arm_compute/core/CL/kernels/CLPixelWiseMultiplicationKernel.h"
-#include "arm_compute/runtime/CL/CLTensor.h"
-
-#include "arm_compute/core/Types.h"
+#include <memory>
namespace arm_compute
{
+class CLCompileContext;
+class CLFillBorderKernel;
+class CLNormalizationLayerKernel;
class ICLTensor;
+class ITensorInfo;
/** Basic function to compute a normalization layer. This function calls the following CL kernels:
*
@@ -48,8 +49,28 @@ class CLNormalizationLayer : public IFunction
public:
/** Default constructor */
CLNormalizationLayer();
+ /** Prevent instances of this class from being copied */
+ CLNormalizationLayer(const CLNormalizationLayer &) = delete;
+ /** Prevent instances of this class from being copied */
+ CLNormalizationLayer &operator=(const CLNormalizationLayer &) = delete;
+    /** Prevent instances of this class from being moved */
+ CLNormalizationLayer(CLNormalizationLayer &&) = delete;
+    /** Prevent instances of this class from being moved */
+ CLNormalizationLayer &operator=(CLNormalizationLayer &&) = delete;
+ /** Default destructor */
+ ~CLNormalizationLayer();
/** Set the input and output tensors.
*
+ * Valid data layouts:
+ * - NHWC
+ * - NCHW
+ *
+ * Valid data type configurations:
+ * |src |dst |
+ * |:--------|:---------|
+ * |F32 |F32 |
+ * |F16 |F16 |
+ *
* @param[in, out] input Source tensor. 3 lower dims represent a single input with dimensions [width, height, IFM],
* and an optional 4th dimension for batch of inputs. Data types supported: F16/F32 (Written to by the border handler).
* Data layouts supported: NCHW/NHWC.
@@ -68,7 +89,10 @@ public:
* Data types supported: same as @p input. Data layouts supported: same as @p input.
* @param[in] norm_info Normalization layer information like the normalization type, normalization size and other parameters.
*/
- void configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, const NormalizationLayerInfo &norm_info);
+ void configure(const CLCompileContext &compile_context,
+ ICLTensor *input,
+ ICLTensor *output,
+ const NormalizationLayerInfo &norm_info);
/** Static function to check if given info will lead to a valid configuration of @ref CLNormalizationLayer
*
* @param[in] input Source tensor. 3 lower dims represent a single input with dimensions [width, height, IFM],
@@ -79,14 +103,15 @@ public:
*
* @return a status
*/
- static Status validate(const ITensorInfo *input, const ITensorInfo *output, const NormalizationLayerInfo &norm_info);
+ static Status
+ validate(const ITensorInfo *input, const ITensorInfo *output, const NormalizationLayerInfo &norm_info);
// Inherited methods overridden:
void run() override;
private:
- CLNormalizationLayerKernel _norm_kernel; /**< Normalization layer kernel to run */
- CLFillBorderKernel _border_handler; /**< Kernel to handle borders */
+ std::unique_ptr<CLNormalizationLayerKernel> _norm_kernel; /**< Normalization layer kernel to run */
+ std::unique_ptr<CLFillBorderKernel> _border_handler; /**< Kernel to handle borders */
};
-}
+} // namespace arm_compute
#endif /* ARM_COMPUTE_CLNORMALIZATIONLAYER_H */
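
The public API of CLNormalizationLayer is unchanged by this refactor (the kernels simply move behind unique_ptr members), so existing call sites keep working. A minimal usage sketch; the shape and NormalizationLayerInfo parameters are illustrative assumptions:

    #include "arm_compute/core/Error.h"
    #include "arm_compute/core/TensorInfo.h"
    #include "arm_compute/runtime/CL/CLScheduler.h"
    #include "arm_compute/runtime/CL/CLTensor.h"
    #include "arm_compute/runtime/CL/functions/CLNormalizationLayer.h"

    using namespace arm_compute;

    void normalization_example()
    {
        CLScheduler::get().default_init();

        CLTensor src, dst;
        const TensorInfo info(TensorShape(56U, 56U, 64U), 1, DataType::F32); // [W, H, IFM]
        src.allocator()->init(info);
        dst.allocator()->init(info);

        const NormalizationLayerInfo norm_info(NormType::CROSS_MAP, 5U);

        // validate() checks the configuration without touching device memory
        ARM_COMPUTE_ERROR_THROW_ON(CLNormalizationLayer::validate(src.info(), dst.info(), norm_info));

        CLNormalizationLayer norm;
        norm.configure(&src, &dst, norm_info);

        src.allocator()->allocate();
        dst.allocator()->allocate();
        norm.run();
    }
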
diff --git a/arm_compute/runtime/CL/functions/CLNormalizePlanarYUVLayer.h b/arm_compute/runtime/CL/functions/CLNormalizePlanarYUVLayer.h
index 5dd3760d3d..cdccc16a51 100644
--- a/arm_compute/runtime/CL/functions/CLNormalizePlanarYUVLayer.h
+++ b/arm_compute/runtime/CL/functions/CLNormalizePlanarYUVLayer.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018-2020 ARM Limited.
+ * Copyright (c) 2018-2020, 2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -21,8 +21,8 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
-#ifndef ARM_COMPUTE_CLNORMALIZEPLANARYUVLAYER_H
-#define ARM_COMPUTE_CLNORMALIZEPLANARYUVLAYER_H
+#ifndef ACL_ARM_COMPUTE_RUNTIME_CL_FUNCTIONS_CLNORMALIZEPLANARYUVLAYER_H
+#define ACL_ARM_COMPUTE_RUNTIME_CL_FUNCTIONS_CLNORMALIZEPLANARYUVLAYER_H
#include "arm_compute/core/Types.h"
#include "arm_compute/runtime/CL/ICLSimpleFunction.h"
@@ -31,7 +31,9 @@
namespace arm_compute
{
+class CLCompileContext;
class ICLTensor;
+class ITensorInfo;
/** Basic function to run @ref CLNormalizePlanarYUVLayerKernel
*
@@ -42,6 +44,18 @@ class CLNormalizePlanarYUVLayer : public ICLSimpleFunction
public:
/** Set the input and output tensors.
*
+ * Valid data layouts:
+ * - NHWC
+ * - NCHW
+ *
+ * Valid data type configurations:
+ * |src |dst |
+ * |:--------------|:--------------|
+ * |F32 |F32 |
+ * |F16 |F16 |
+ * |QASYMM8 |QASYMM8 |
+ * |QASYMM8_SIGNED |QASYMM8_SIGNED |
+ *
* @param[in] input Source tensor. 3 lower dimensions represent a single input with dimensions [width, height, channels].
* Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32.
 * @param[out] output Destination feature tensor. Data type supported: same as @p input
@@ -60,7 +74,11 @@ public:
* @param[in] std Standard deviation values tensor. 1 dimension with size equal to the number of input channels.
* Data types supported: Same as @p input
*/
- void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, const ICLTensor *mean, const ICLTensor *std);
+ void configure(const CLCompileContext &compile_context,
+ const ICLTensor *input,
+ ICLTensor *output,
+ const ICLTensor *mean,
+ const ICLTensor *std);
/** Static function to check if given info will lead to a valid configuration of @ref CLNormalizePlanarYUVLayer
*
* @param[in] input Source tensor info. 3 lower dimensions represent a single input with dimensions [width, height, channels].
@@ -72,7 +90,8 @@ public:
*
* @return a status
*/
- static Status validate(const ITensorInfo *input, const ITensorInfo *output, const ITensorInfo *mean, const ITensorInfo *std);
+ static Status
+ validate(const ITensorInfo *input, const ITensorInfo *output, const ITensorInfo *mean, const ITensorInfo *std);
};
} // namespace arm_compute
-#endif /* ARM_COMPUTE_CLNORMALIZEPLANARYUVLAYER_H */
+#endif // ACL_ARM_COMPUTE_RUNTIME_CL_FUNCTIONS_CLNORMALIZEPLANARYUVLAYER_H
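
A minimal usage sketch of CLNormalizePlanarYUVLayer; the mean and std tensors are 1-D with one value per input channel, and the shapes below are illustrative assumptions:

    #include "arm_compute/core/TensorInfo.h"
    #include "arm_compute/runtime/CL/CLScheduler.h"
    #include "arm_compute/runtime/CL/CLTensor.h"
    #include "arm_compute/runtime/CL/functions/CLNormalizePlanarYUVLayer.h"

    using namespace arm_compute;

    void normalize_planar_yuv_example()
    {
        CLScheduler::get().default_init();

        CLTensor src, dst, mean, std_dev;
        src.allocator()->init(TensorInfo(TensorShape(112U, 112U, 3U), 1, DataType::F32)); // [W, H, C]
        dst.allocator()->init(TensorInfo(TensorShape(112U, 112U, 3U), 1, DataType::F32));
        mean.allocator()->init(TensorInfo(TensorShape(3U), 1, DataType::F32));    // one value per channel
        std_dev.allocator()->init(TensorInfo(TensorShape(3U), 1, DataType::F32)); // one value per channel

        CLNormalizePlanarYUVLayer norm;
        norm.configure(&src, &dst, &mean, &std_dev);

        for(auto *t : { &src, &dst, &mean, &std_dev })
        {
            t->allocator()->allocate();
        }
        norm.run();
    }
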
diff --git a/arm_compute/runtime/CL/functions/CLOpticalFlow.h b/arm_compute/runtime/CL/functions/CLOpticalFlow.h
deleted file mode 100644
index 12d0583384..0000000000
--- a/arm_compute/runtime/CL/functions/CLOpticalFlow.h
+++ /dev/null
@@ -1,138 +0,0 @@
-/*
- * Copyright (c) 2017-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CLOPTICALFLOW_H
-#define ARM_COMPUTE_CLOPTICALFLOW_H
-
-#include "arm_compute/core/CL/kernels/CLLKTrackerKernel.h"
-
-#include "arm_compute/core/IArray.h"
-#include "arm_compute/core/Types.h"
-#include "arm_compute/runtime/CL/CLArray.h"
-#include "arm_compute/runtime/CL/CLTensor.h"
-#include "arm_compute/runtime/CL/functions/CLScharr3x3.h"
-#include "arm_compute/runtime/IFunction.h"
-#include "arm_compute/runtime/IMemoryManager.h"
-#include "arm_compute/runtime/MemoryGroup.h"
-
-#include <cstddef>
-#include <cstdint>
-#include <memory>
-
-namespace arm_compute
-{
-class CLPyramid;
-
-/** OpenCL Array of Internal Keypoints */
-using CLLKInternalKeypointArray = CLArray<CLLKInternalKeypoint>;
-/** OpenCL Array of Coefficient Tables */
-using CLCoefficientTableArray = CLArray<CLCoefficientTable>;
-/** OpenCL Array of Old Values */
-using CLOldValueArray = CLArray<CLOldValue>;
-
-/** Basic function to execute optical flow. This function calls the following OpenCL kernels and functions:
- *
- * -# @ref CLScharr3x3
- * -# @ref CLLKTrackerInitKernel
- * -# @ref CLLKTrackerStage0Kernel
- * -# @ref CLLKTrackerStage1Kernel
- * -# @ref CLLKTrackerFinalizeKernel
- */
-class CLOpticalFlow : public IFunction
-{
-public:
- /** Default constructor */
- CLOpticalFlow(std::shared_ptr<IMemoryManager> memory_manager = nullptr);
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- CLOpticalFlow(const CLOpticalFlow &) = delete;
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- CLOpticalFlow &operator=(const CLOpticalFlow &) = delete;
- /** Allow instances of this class to be moved */
- CLOpticalFlow(CLOpticalFlow &&) = default;
- /** Allow instances of this class to be moved */
- CLOpticalFlow &operator=(CLOpticalFlow &&) = default;
- /** Initialise the function input and output
- *
-     * @param[in]  old_pyramid           Pointer to the pyramid for the old tensor. Data types supported: U8
-     * @param[in]  new_pyramid           Pointer to the pyramid for the new tensor. Data types supported: U8
- * @param[in] old_points Pointer to the IKeyPointArray storing old key points
- * @param[in] new_points_estimates Pointer to the IKeyPointArray storing new estimates key points
- * @param[out] new_points Pointer to the IKeyPointArray storing new key points
- * @param[in] termination The criteria to terminate the search of each keypoint.
- * @param[in] epsilon The error for terminating the algorithm
-     * @param[in]  num_iterations        The maximum number of iterations before terminating the algorithm
- * @param[in] window_dimension The size of the window on which to perform the algorithm
- * @param[in] use_initial_estimate The flag to indicate whether the initial estimated position should be used
- * @param[in] border_mode The border mode applied at scharr kernel stage
- * @param[in] constant_border_value (Optional) Constant value to use for borders if border_mode is set to CONSTANT
- *
- */
- void configure(const CLPyramid *old_pyramid, const CLPyramid *new_pyramid,
- const ICLKeyPointArray *old_points, const ICLKeyPointArray *new_points_estimates, ICLKeyPointArray *new_points,
- Termination termination, float epsilon, size_t num_iterations, size_t window_dimension, bool use_initial_estimate,
- BorderMode border_mode, uint8_t constant_border_value = 0);
- /** Initialise the function input and output
- *
- * @param[in] compile_context The compile context to be used.
-     * @param[in]  old_pyramid           Pointer to the pyramid for the old tensor. Data types supported: U8
-     * @param[in]  new_pyramid           Pointer to the pyramid for the new tensor. Data types supported: U8
- * @param[in] old_points Pointer to the IKeyPointArray storing old key points
- * @param[in] new_points_estimates Pointer to the IKeyPointArray storing new estimates key points
- * @param[out] new_points Pointer to the IKeyPointArray storing new key points
- * @param[in] termination The criteria to terminate the search of each keypoint.
- * @param[in] epsilon The error for terminating the algorithm
-     * @param[in]  num_iterations        The maximum number of iterations before terminating the algorithm
- * @param[in] window_dimension The size of the window on which to perform the algorithm
- * @param[in] use_initial_estimate The flag to indicate whether the initial estimated position should be used
- * @param[in] border_mode The border mode applied at scharr kernel stage
- * @param[in] constant_border_value (Optional) Constant value to use for borders if border_mode is set to CONSTANT
- *
- */
- void configure(const CLCompileContext &compile_context, const CLPyramid *old_pyramid, const CLPyramid *new_pyramid,
- const ICLKeyPointArray *old_points, const ICLKeyPointArray *new_points_estimates, ICLKeyPointArray *new_points,
- Termination termination, float epsilon, size_t num_iterations, size_t window_dimension, bool use_initial_estimate,
- BorderMode border_mode, uint8_t constant_border_value = 0);
-
- // Inherited methods overridden:
- void run() override;
-
-private:
- MemoryGroup _memory_group;
- std::vector<CLLKTrackerInitKernel> _tracker_init_kernel;
- std::vector<CLLKTrackerStage0Kernel> _tracker_stage0_kernel;
- std::vector<CLLKTrackerStage1Kernel> _tracker_stage1_kernel;
- CLLKTrackerFinalizeKernel _tracker_finalize_kernel;
- std::vector<CLScharr3x3> _func_scharr;
- std::vector<CLTensor> _scharr_gx;
- std::vector<CLTensor> _scharr_gy;
- const ICLKeyPointArray *_old_points;
- const ICLKeyPointArray *_new_points_estimates;
- ICLKeyPointArray *_new_points;
- std::unique_ptr<CLLKInternalKeypointArray> _old_points_internal;
- std::unique_ptr<CLLKInternalKeypointArray> _new_points_internal;
- std::unique_ptr<CLCoefficientTableArray> _coefficient_table;
- std::unique_ptr<CLOldValueArray> _old_values;
- size_t _num_levels;
-};
-}
-#endif /*ARM_COMPUTE_CLOPTICALFLOW_H */
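
For the record, driving the removed CLOpticalFlow looked roughly like the following. This is a sketch against a pre-removal checkout; pyramid levels, image size, and keypoint counts are illustrative, and the pyramids and old_points would still need to be filled before run():

    #include "arm_compute/core/TensorInfo.h"
    #include "arm_compute/runtime/CL/CLArray.h"
    #include "arm_compute/runtime/CL/CLPyramid.h"
    #include "arm_compute/runtime/CL/CLScheduler.h"
    #include "arm_compute/runtime/CL/functions/CLOpticalFlow.h"

    using namespace arm_compute;

    void optical_flow_example()
    {
        CLScheduler::get().default_init();

        // Two U8 image pyramids (previous and current frame), 3 half-scale levels
        PyramidInfo pyramid_info(3, SCALE_PYRAMID_HALF, TensorShape(640U, 480U), Format::U8);
        CLPyramid old_pyramid, new_pyramid;
        old_pyramid.init(pyramid_info);
        new_pyramid.init(pyramid_info);

        const size_t    num_keypoints = 100;
        CLKeyPointArray old_points(num_keypoints), new_points_estimates(num_keypoints), new_points(num_keypoints);

        CLOpticalFlow lk;
        lk.configure(&old_pyramid, &new_pyramid, &old_points, &new_points_estimates, &new_points,
                     Termination::TERM_CRITERIA_BOTH, 0.01f /* epsilon */, 10 /* max iterations */,
                     5 /* window dimension */, false /* use_initial_estimate */, BorderMode::UNDEFINED);

        old_pyramid.allocate();
        new_pyramid.allocate();
        // ... fill the pyramids and old_points, then:
        lk.run();
    }
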
diff --git a/arm_compute/runtime/CL/functions/CLPReluLayer.h b/arm_compute/runtime/CL/functions/CLPReluLayer.h
index 74fa86a320..b123d0e46e 100644
--- a/arm_compute/runtime/CL/functions/CLPReluLayer.h
+++ b/arm_compute/runtime/CL/functions/CLPReluLayer.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019-2020 ARM Limited.
+ * Copyright (c) 2019-2021 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -24,22 +24,46 @@
#ifndef ARM_COMPUTE_CLPRELULAYER_H
#define ARM_COMPUTE_CLPRELULAYER_H
-#include "arm_compute/core/Types.h"
-#include "arm_compute/runtime/CL/ICLSimpleFunction.h"
+#include "arm_compute/runtime/CL/ICLOperator.h"
+#include "arm_compute/runtime/IFunction.h"
namespace arm_compute
{
+class CLCompileContext;
class ICLTensor;
-
-/** Basic function to run @ref CLArithmeticOperationKernel for PRELU
+class ITensorInfo;
+/** Basic function to run @ref opencl::kernels::ClArithmeticKernel for PRELU
*
* @note The function implements an activation layer with the PRELU activation function.
*/
-class CLPReluLayer : public ICLSimpleFunction
+class CLPReluLayer : public IFunction
{
public:
+ /** Default Constructor */
+ CLPReluLayer();
+ /** Default Destructor */
+ ~CLPReluLayer();
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ CLPReluLayer(const CLPReluLayer &) = delete;
+ /** Default move constructor */
+ CLPReluLayer(CLPReluLayer &&);
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ CLPReluLayer &operator=(const CLPReluLayer &) = delete;
+ /** Default move assignment operator */
+ CLPReluLayer &operator=(CLPReluLayer &&);
/** Set the input and output tensor.
*
+ * Valid data layouts:
+ * - All
+ *
+ * Valid data type configurations:
+ * |src |dst |
+ * |:--------------|:--------------|
+ * |QASYMM8 |QASYMM8 |
+ * |QASYMM8_SIGNED |QASYMM8_SIGNED |
+ * |F16 |F16 |
+ * |F32 |F32 |
+ *
* @note If the output tensor is a nullptr or is equal to the input, the activation function will be performed in-place
*
* @param[in] input Source tensor. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32.
@@ -66,6 +90,13 @@ public:
* @return a status
*/
static Status validate(const ITensorInfo *input, const ITensorInfo *alpha, const ITensorInfo *output);
+
+ // Inherited methods overridden:
+ void run() override;
+
+private:
+ struct Impl;
+ std::unique_ptr<Impl> _impl;
};
} // namespace arm_compute
#endif /* ARM_COMPUTE_CLPRELULAYER_H */
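
Call sites are unaffected by CLPReluLayer's move from ICLSimpleFunction to the Impl-based IFunction: configure, validate, and run keep their signatures. A minimal sketch; the per-channel alpha shape is an illustrative assumption (size-1 dimensions broadcast):

    #include "arm_compute/core/TensorInfo.h"
    #include "arm_compute/runtime/CL/CLScheduler.h"
    #include "arm_compute/runtime/CL/CLTensor.h"
    #include "arm_compute/runtime/CL/functions/CLPReluLayer.h"

    using namespace arm_compute;

    void prelu_example()
    {
        CLScheduler::get().default_init();

        CLTensor src, alpha, dst;
        src.allocator()->init(TensorInfo(TensorShape(32U, 32U, 16U), 1, DataType::F32));
        alpha.allocator()->init(TensorInfo(TensorShape(1U, 1U, 16U), 1, DataType::F32)); // one slope per channel
        dst.allocator()->init(TensorInfo(TensorShape(32U, 32U, 16U), 1, DataType::F32));

        CLPReluLayer prelu;
        prelu.configure(&src, &alpha, &dst);

        src.allocator()->allocate();
        alpha.allocator()->allocate();
        dst.allocator()->allocate();
        prelu.run();
    }
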
diff --git a/arm_compute/runtime/CL/functions/CLPadLayer.h b/arm_compute/runtime/CL/functions/CLPadLayer.h
index 82d7205381..89e693bd92 100644
--- a/arm_compute/runtime/CL/functions/CLPadLayer.h
+++ b/arm_compute/runtime/CL/functions/CLPadLayer.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018-2020 ARM Limited.
+ * Copyright (c) 2018-2021 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -24,19 +24,22 @@
#ifndef ARM_COMPUTE_CLPADLAYER_H
#define ARM_COMPUTE_CLPADLAYER_H
-#include "arm_compute/core/CL/kernels/CLCopyKernel.h"
-#include "arm_compute/core/CL/kernels/CLPadLayerKernel.h"
+#include "arm_compute/core/Error.h"
#include "arm_compute/runtime/CL/CLTensor.h"
+#include "arm_compute/runtime/CL/functions/CLCopy.h"
+#include "arm_compute/runtime/CL/functions/CLPermute.h"
#include "arm_compute/runtime/IFunction.h"
namespace arm_compute
{
+class CLCompileContext;
+class CLPadLayerKernel;
class ICLTensor;
/** Basic function to pad a tensor. This function calls the following OpenCL functions/kernels:
*
* -# @ref CLPadLayerKernel if there is padding to be added
- * -# @ref CLCopyKernel otherwise
+ * -# @ref CLCopy otherwise
*/
class CLPadLayer : public IFunction
{
@@ -51,9 +54,20 @@ public:
CLPadLayer &operator=(const CLPadLayer &) = delete;
/** Default move assignment operator */
CLPadLayer &operator=(CLPadLayer &&) = default;
+ /** Default destructor */
+ ~CLPadLayer();
/** Initialize the function
*
+ * Valid data layouts:
+ * - NHWC
+ * - NCHW
+ *
+ * Valid data type configurations:
+ * |src |dst |
+ * |:--------|:---------|
+ * |All |All |
+ *
* @param[in] input Source tensor. Data types supported: All.
* @param[out] output Output tensor. Data type supported: same as @p input
* @param[in] padding The padding for each spatial dimension of the input tensor. The pair padding[i]
@@ -62,7 +76,11 @@ public:
* @param[in] mode (Optional) Controls whether the padding should be filled with @p constant_value using CONSTANT,
* or reflect the input, either including the border values (SYMMETRIC) or not (REFLECT).
*/
- void configure(ICLTensor *input, ICLTensor *output, const PaddingList &padding, PixelValue constant_value = PixelValue(), PaddingMode mode = PaddingMode::CONSTANT);
+ void configure(ICLTensor *input,
+ ICLTensor *output,
+ const PaddingList &padding,
+ PixelValue constant_value = PixelValue(),
+ PaddingMode mode = PaddingMode::CONSTANT);
/** Initialize the function
*
* @param[in] compile_context The compile context to be used.
@@ -74,8 +92,12 @@ public:
* @param[in] mode (Optional) Controls whether the padding should be filled with @p constant_value using CONSTANT,
* or reflect the input, either including the border values (SYMMETRIC) or not (REFLECT).
*/
- void configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, const PaddingList &padding, PixelValue constant_value = PixelValue(),
- PaddingMode mode = PaddingMode::CONSTANT);
+ void configure(const CLCompileContext &compile_context,
+ ICLTensor *input,
+ ICLTensor *output,
+ const PaddingList &padding,
+ PixelValue constant_value = PixelValue(),
+ PaddingMode mode = PaddingMode::CONSTANT);
/** Static function to check if given info will lead to a valid configuration of @ref CLPadLayer.
*
@@ -87,7 +109,11 @@ public:
* @param[in] mode (Optional) Controls whether the padding should be filled with @p constant_value using CONSTANT,
* or reflect the input, either including the border values (SYMMETRIC) or not (REFLECT).
*/
- static Status validate(const ITensorInfo *input, const ITensorInfo *output, const PaddingList &padding, PixelValue constant_value = PixelValue(), PaddingMode mode = PaddingMode::CONSTANT);
+ static Status validate(const ITensorInfo *input,
+ const ITensorInfo *output,
+ const PaddingList &padding,
+ PixelValue constant_value = PixelValue(),
+ PaddingMode mode = PaddingMode::CONSTANT);
// Inherited methods overridden:
void run() override;
@@ -95,9 +121,9 @@ public:
private:
void configure_reflect_mode(ICLTensor *input, ICLTensor *output);
- CLPadLayerKernel _pad_kernel;
- CLCopyKernel _copy_kernel;
- bool _perform_pad;
+ std::unique_ptr<CLPadLayerKernel> _pad_kernel;
+ CLCopy _copy;
+ bool _perform_pad;
};
} // namespace arm_compute
#endif /*ARM_COMPUTE_PADLAYER_H */
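
A usage sketch for CLPadLayer; the padding amounts and shapes are illustrative. Note that the destination must already account for the requested growth in each padded dimension:

    #include "arm_compute/core/TensorInfo.h"
    #include "arm_compute/runtime/CL/CLScheduler.h"
    #include "arm_compute/runtime/CL/CLTensor.h"
    #include "arm_compute/runtime/CL/functions/CLPadLayer.h"

    using namespace arm_compute;

    void pad_example()
    {
        CLScheduler::get().default_init();

        CLTensor src, dst;
        src.allocator()->init(TensorInfo(TensorShape(32U, 32U, 3U), 1, DataType::F32));
        // Output grows by the requested padding: W: 32+1+1, H: 32+2+2
        dst.allocator()->init(TensorInfo(TensorShape(34U, 36U, 3U), 1, DataType::F32));

        const PaddingList padding = { { 1U, 1U }, { 2U, 2U } }; // {before, after} per spatial dimension

        CLPadLayer pad;
        pad.configure(&src, &dst, padding, PixelValue(0.f), PaddingMode::CONSTANT);

        src.allocator()->allocate();
        dst.allocator()->allocate();
        pad.run();
    }
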
diff --git a/arm_compute/runtime/CL/functions/CLPermute.h b/arm_compute/runtime/CL/functions/CLPermute.h
index 37e651cfbb..7ac0bf6b9c 100644
--- a/arm_compute/runtime/CL/functions/CLPermute.h
+++ b/arm_compute/runtime/CL/functions/CLPermute.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018-2020 ARM Limited.
+ * Copyright (c) 2018-2021 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -25,20 +25,43 @@
#define ARM_COMPUTE_CLPERMUTE_H
#include "arm_compute/core/Types.h"
-#include "arm_compute/runtime/CL/ICLSimpleFunction.h"
+#include "arm_compute/runtime/IFunction.h"
-#include <cstdint>
+#include <memory>
namespace arm_compute
{
+class CLCompileContext;
class ICLTensor;
+class ITensorInfo;
-/** Basic function to execute an @ref CLPermuteKernel. */
-class CLPermute : public ICLSimpleFunction
+/** Basic function to execute an @ref opencl::kernels::ClPermuteKernel. */
+class CLPermute : public IFunction
{
public:
+ /** Constructor */
+ CLPermute();
+ /** Destructor */
+ ~CLPermute();
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ CLPermute(const CLPermute &) = delete;
+ /** Default move constructor */
+ CLPermute(CLPermute &&) = default;
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ CLPermute &operator=(const CLPermute &) = delete;
+ /** Default move assignment operator */
+ CLPermute &operator=(CLPermute &&) = default;
/** Set the input and output tensors.
*
+ * Valid data layouts:
+ * - NHWC
+ * - NCHW
+ *
+ * Valid data type configurations:
+ * |src |dst |
+ * |:--------------|:--------------|
+ * |All |All |
+ *
* @note Arbitrary permutation vectors are supported with rank not greater than 4
*
* @param[in] input The input tensor to permute. Data types supported: All.
@@ -55,7 +78,10 @@ public:
* @param[in] output The output tensor. Data types supported: Same as @p input
* @param[in] perm Permutation vector
*/
- void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, const PermutationVector &perm);
+ void configure(const CLCompileContext &compile_context,
+ const ICLTensor *input,
+ ICLTensor *output,
+ const PermutationVector &perm);
/** Static function to check if given info will lead to a valid configuration of @ref CLPermute.
*
* @note Arbitrary permutation vectors are supported with rank not greater than 4
@@ -67,6 +93,13 @@ public:
* @return a status
*/
static Status validate(const ITensorInfo *input, const ITensorInfo *output, const PermutationVector &perm);
+
+ // Inherited methods overridden:
+ void run() override;
+
+private:
+ struct Impl;
+ std::unique_ptr<Impl> _impl;
};
} // namespace arm_compute
#endif /*ARM_COMPUTE_CLPERMUTE_H */
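
A usage sketch of an NCHW-to-NHWC permute; the shapes are illustrative, and (2, 0, 1) is the conventional NCHW-to-NHWC permutation vector in this library:

    #include "arm_compute/core/TensorInfo.h"
    #include "arm_compute/runtime/CL/CLScheduler.h"
    #include "arm_compute/runtime/CL/CLTensor.h"
    #include "arm_compute/runtime/CL/functions/CLPermute.h"

    using namespace arm_compute;

    void permute_example()
    {
        CLScheduler::get().default_init();

        CLTensor src, dst;
        // NCHW source: shape dimensions are ordered (W, H, C)
        src.allocator()->init(TensorInfo(TensorShape(32U, 32U, 16U), 1, DataType::F32));
        // NCHW -> NHWC re-orders the dimensions to (C, W, H)
        dst.allocator()->init(TensorInfo(TensorShape(16U, 32U, 32U), 1, DataType::F32));

        CLPermute permute;
        permute.configure(&src, &dst, PermutationVector(2U, 0U, 1U)); // NCHW -> NHWC

        src.allocator()->allocate();
        dst.allocator()->allocate();
        permute.run();
    }
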
diff --git a/arm_compute/runtime/CL/functions/CLPhase.h b/arm_compute/runtime/CL/functions/CLPhase.h
deleted file mode 100644
index f993906fe2..0000000000
--- a/arm_compute/runtime/CL/functions/CLPhase.h
+++ /dev/null
@@ -1,57 +0,0 @@
-/*
- * Copyright (c) 2016-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CLPHASE_H
-#define ARM_COMPUTE_CLPHASE_H
-
-#include "arm_compute/core/Types.h"
-#include "arm_compute/runtime/CL/ICLSimpleFunction.h"
-
-namespace arm_compute
-{
-class ICLTensor;
-
-/** Basic function to execute an @ref CLMagnitudePhaseKernel. */
-class CLPhase : public ICLSimpleFunction
-{
-public:
- /** Initialise the kernel's inputs, output.
- *
- * @param[in] input1 First tensor input. Data types supported: S16.
- * @param[in] input2 Second tensor input. Data types supported: S16.
- * @param[out] output Output tensor. Data types supported: U8.
- * @param[in] phase_type (Optional) Phase calculation type. Default: SIGNED.
- */
- void configure(const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output, PhaseType phase_type = PhaseType::SIGNED);
- /** Initialise the kernel's inputs, output.
- *
- * @param[in] compile_context The compile context to be used.
- * @param[in] input1 First tensor input. Data types supported: S16.
- * @param[in] input2 Second tensor input. Data types supported: S16.
- * @param[out] output Output tensor. Data types supported: U8.
- * @param[in] phase_type (Optional) Phase calculation type. Default: SIGNED.
- */
- void configure(const CLCompileContext &compile_context, const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output, PhaseType phase_type = PhaseType::SIGNED);
-};
-}
-#endif /*ARM_COMPUTE_CLPHASE_H */
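
The removed CLPhase took two S16 gradient tensors and produced a U8 phase image. A sketch against a pre-removal checkout, with an illustrative image size:

    #include "arm_compute/core/TensorInfo.h"
    #include "arm_compute/runtime/CL/CLScheduler.h"
    #include "arm_compute/runtime/CL/CLTensor.h"
    #include "arm_compute/runtime/CL/functions/CLPhase.h"

    using namespace arm_compute;

    void phase_example()
    {
        CLScheduler::get().default_init();

        CLTensor gx, gy, phase;
        gx.allocator()->init(TensorInfo(TensorShape(640U, 480U), Format::S16));
        gy.allocator()->init(TensorInfo(TensorShape(640U, 480U), Format::S16));
        phase.allocator()->init(TensorInfo(TensorShape(640U, 480U), Format::U8));

        CLPhase ph;
        ph.configure(&gx, &gy, &phase, PhaseType::SIGNED);

        for(auto *t : { &gx, &gy, &phase })
        {
            t->allocator()->allocate();
        }
        ph.run();
    }
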
diff --git a/arm_compute/runtime/CL/functions/CLPixelWiseMultiplication.h b/arm_compute/runtime/CL/functions/CLPixelWiseMultiplication.h
index 8b0ee70f12..f3e5cf9bd3 100644
--- a/arm_compute/runtime/CL/functions/CLPixelWiseMultiplication.h
+++ b/arm_compute/runtime/CL/functions/CLPixelWiseMultiplication.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2016-2020 ARM Limited.
+ * Copyright (c) 2016-2021, 2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -24,70 +24,100 @@
#ifndef ARM_COMPUTE_CLPIXELWISEMULTIPLICATION_H
#define ARM_COMPUTE_CLPIXELWISEMULTIPLICATION_H
-#include "arm_compute/core/Types.h"
-#include "arm_compute/runtime/CL/ICLSimpleFunction.h"
+#include "arm_compute/core/Rounding.h"
+#include "arm_compute/function_info/ActivationLayerInfo.h"
+#include "arm_compute/runtime/CL/ICLOperator.h"
+#include "arm_compute/runtime/IFunction.h"
namespace arm_compute
{
// Forward declaration
+class CLCompileContext;
class ICLTensor;
+class ITensorInfo;
-/** Basic function to run @ref CLPixelWiseMultiplicationKernel. */
-class CLPixelWiseMultiplication : public ICLSimpleFunction
+/** Basic function to run @ref opencl::ClMul. */
+class CLPixelWiseMultiplication : public IFunction
{
public:
+ /** Default Constructor */
+ CLPixelWiseMultiplication();
+ /** Default Destructor */
+ ~CLPixelWiseMultiplication();
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ CLPixelWiseMultiplication(const CLPixelWiseMultiplication &) = delete;
+ /** Default move constructor */
+ CLPixelWiseMultiplication(CLPixelWiseMultiplication &&);
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ CLPixelWiseMultiplication &operator=(const CLPixelWiseMultiplication &) = delete;
+ /** Default move assignment operator */
+ CLPixelWiseMultiplication &operator=(CLPixelWiseMultiplication &&);
    /** Initialise the kernel's inputs, output and conversion policy.
*
- * @param[in, out] input1 An input tensor. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/F16/F32.
+ * Valid data layouts:
+ * - All
+ *
+ * Valid data type configurations:
+ * |src0 |src1 |dst |
+ * |:--------------|:--------------|:--------------|
+ * |QASYMM8 |QASYMM8 |QASYMM8 |
+ * |QASYMM8_SIGNED |QASYMM8_SIGNED |QASYMM8_SIGNED |
+     * |QSYMM16        |QSYMM16        |QSYMM16        |
+ * |QSYMM16 |QSYMM16 |S32 |
+ * |U8 |U8 |U8 |
+ * |U8 |U8 |S16 |
+ * |U8 |S16 |S16 |
+ * |S16 |U8 |S16 |
+ * |S16 |S16 |S16 |
+ * |F16 |F16 |F16 |
+ * |F32 |F32 |F32 |
+ * |S32 |S32 |S32 |
+ *
+ * @param[in, out] input1 An input tensor. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/F16/F32/S32
* The input tensor is [in, out] because its TensorInfo might be modified inside the kernel in case of broadcasting of dimension 0.
- * @param[in, out] input2 An input tensor. Data types supported: same as @p input1.
+ * @param[in, out] input2 An input tensor. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/F16/F32/S32
* The input tensor is [in, out] because its TensorInfo might be modified inside the kernel in case of broadcasting of dimension 0.
- * @param[out] output The output tensor. Data types supported:
- * - U8, only if both input are U8
- * - QASYMM8, only if both inputs are QASYMM8
- * - QASYMM8_SIGNED, only if both inputs are QASYMM8_SIGNED
- * - S16
- * - QSYMM16, only if both inputs are QSYMM16
- * - S32, only if both inputs are QSYMM16
- * - F16
- * - F32
+ * @param[out] output The output tensor. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/F16/F32/S32
* @param[in] scale Scale to apply after multiplication.
* Scale must be positive and its value must be either 1/255 or 1/2^n where n is between 0 and 15.
* @param[in] overflow_policy Overflow policy. Supported overflow policies: Wrap, Saturate
* @param[in] rounding_policy Rounding policy. Supported rounding modes: to zero, to nearest even.
* @param[in] act_info (Optional) Activation layer information in case of a fused activation.
*/
- void configure(ICLTensor *input1, ICLTensor *input2, ICLTensor *output, float scale,
- ConvertPolicy overflow_policy, RoundingPolicy rounding_policy, const ActivationLayerInfo &act_info = ActivationLayerInfo());
+ void configure(ICLTensor *input1,
+ ICLTensor *input2,
+ ICLTensor *output,
+ float scale,
+ ConvertPolicy overflow_policy,
+ RoundingPolicy rounding_policy,
+ const ActivationLayerInfo &act_info = ActivationLayerInfo());
    /** Initialise the kernel's inputs, output and conversion policy.
*
* @param[in] compile_context The compile context to be used.
- * @param[in, out] input1 An input tensor. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/F16/F32.
+ * @param[in, out] input1 An input tensor. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/F16/F32/S32
* The input tensor is [in, out] because its TensorInfo might be modified inside the kernel in case of broadcasting of dimension 0.
- * @param[in, out] input2 An input tensor. Data types supported: same as @p input1.
+ * @param[in, out] input2 An input tensor. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/F16/F32/S32
* The input tensor is [in, out] because its TensorInfo might be modified inside the kernel in case of broadcasting of dimension 0.
- * @param[out] output The output tensor, Data types supported: same as @p input1. Note: U8 requires both inputs to be U8.
+ * @param[out] output The output tensor. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/F16/F32/S32
* @param[in] scale Scale to apply after multiplication.
* Scale must be positive and its value must be either 1/255 or 1/2^n where n is between 0 and 15.
* @param[in] overflow_policy Overflow policy. Supported overflow policies: Wrap, Saturate
* @param[in] rounding_policy Rounding policy. Supported rounding modes: to zero, to nearest even.
* @param[in] act_info (Optional) Activation layer information in case of a fused activation.
*/
- void configure(const CLCompileContext &compile_context, ICLTensor *input1, ICLTensor *input2, ICLTensor *output, float scale,
- ConvertPolicy overflow_policy, RoundingPolicy rounding_policy, const ActivationLayerInfo &act_info = ActivationLayerInfo());
+ void configure(const CLCompileContext &compile_context,
+ ICLTensor *input1,
+ ICLTensor *input2,
+ ICLTensor *output,
+ float scale,
+ ConvertPolicy overflow_policy,
+ RoundingPolicy rounding_policy,
+ const ActivationLayerInfo &act_info = ActivationLayerInfo());
/** Static function to check if given info will lead to a valid configuration of @ref CLPixelWiseMultiplication
*
* @param[in] input1 An input tensor info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/F16/F32.
- * @param[in] input2 An input tensor info. Data types supported: same as @p input1.
- * @param[in] output The output tensor info. Data types supported:
- * - U8, only if both input are U8
- * - QASYMM8, only if both inputs are QASYMM8
- * - QASYMM8_SIGNED, only if both inputs are QASYMM8_SIGNED
- * - S16
- * - QSYMM16, only if both inputs are QSYMM16
- * - S32, only if both inputs are QSYMM16
- * - F16
- * - F32
+ * @param[in] input2 An input tensor info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/F16/F32.
+ * @param[in] output The output tensor info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/F16/F32.
* @param[in] scale Scale to apply after multiplication.
* Scale must be positive and its value must be either 1/255 or 1/2^n where n is between 0 and 15.
* @param[in] overflow_policy Overflow policy. Supported overflow policies: Wrap, Saturate
@@ -96,43 +126,84 @@ public:
*
* @return a status
*/
- static Status validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, float scale,
- ConvertPolicy overflow_policy, RoundingPolicy rounding_policy, const ActivationLayerInfo &act_info = ActivationLayerInfo());
+ static Status validate(const ITensorInfo *input1,
+ const ITensorInfo *input2,
+ const ITensorInfo *output,
+ float scale,
+ ConvertPolicy overflow_policy,
+ RoundingPolicy rounding_policy,
+ const ActivationLayerInfo &act_info = ActivationLayerInfo());
+
+ // Inherited methods overridden:
+ void run() override;
+
+private:
+ struct Impl;
+ std::unique_ptr<Impl> _impl;
};
-/** Basic function to run @ref CLComplexPixelWiseMultiplicationKernel. */
-class CLComplexPixelWiseMultiplication : public ICLSimpleFunction
+/** Basic function to run @ref opencl::ClComplexMul. */
+class CLComplexPixelWiseMultiplication : public IFunction
{
public:
+ /** Default Constructor */
+ CLComplexPixelWiseMultiplication();
+ /** Default Destructor */
+ ~CLComplexPixelWiseMultiplication();
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ CLComplexPixelWiseMultiplication(const CLComplexPixelWiseMultiplication &) = delete;
+ /** Default move constructor */
+ CLComplexPixelWiseMultiplication(CLComplexPixelWiseMultiplication &&);
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ CLComplexPixelWiseMultiplication &operator=(const CLComplexPixelWiseMultiplication &) = delete;
+ /** Default move assignment operator */
+ CLComplexPixelWiseMultiplication &operator=(CLComplexPixelWiseMultiplication &&);
/** Initialise the kernel's inputs, output.
*
- * @param[in, out] input1 An input tensor. Data types supported: F32. Number of channels supported: 2.
+ * @param[in, out] input1 An input tensor. Data types supported: F16/F32. Number of channels supported: 2.
* The input tensor is [in, out] because its TensorInfo might be modified inside the kernel in case of broadcasting of dimension 0.
* @param[in, out] input2 An input tensor. Data types supported: same as @p input1. Number of channels supported: same as @p input1.
* The input tensor is [in, out] because its TensorInfo might be modified inside the kernel in case of broadcasting of dimension 0.
* @param[out] output The output tensor, Data types supported: same as @p input1. Number of channels supported: same as @p input1.
* @param[in] act_info (Optional) Activation layer information in case of a fused activation.
*/
- void configure(ICLTensor *input1, ICLTensor *input2, ICLTensor *output, const ActivationLayerInfo &act_info = ActivationLayerInfo());
+ void configure(ICLTensor *input1,
+ ICLTensor *input2,
+ ICLTensor *output,
+ const ActivationLayerInfo &act_info = ActivationLayerInfo());
/** Initialise the kernel's inputs, output.
*
* @param[in] compile_context The compile context to be used.
- * @param[in, out] input1 An input tensor. Data types supported: F32. Number of channels supported: 2.
+ * @param[in, out] input1 An input tensor. Data types supported: F16/F32. Number of channels supported: 2.
* The input tensor is [in, out] because its TensorInfo might be modified inside the kernel in case of broadcasting of dimension 0.
* @param[in, out] input2 An input tensor. Data types supported: same as @p input1. Number of channels supported: same as @p input1.
* The input tensor is [in, out] because its TensorInfo might be modified inside the kernel in case of broadcasting of dimension 0.
* @param[out] output The output tensor, Data types supported: same as @p input1. Number of channels supported: same as @p input1.
* @param[in] act_info (Optional) Activation layer information in case of a fused activation.
*/
- void configure(const CLCompileContext &compile_context, ICLTensor *input1, ICLTensor *input2, ICLTensor *output, const ActivationLayerInfo &act_info = ActivationLayerInfo());
+ void configure(const CLCompileContext &compile_context,
+ ICLTensor *input1,
+ ICLTensor *input2,
+ ICLTensor *output,
+ const ActivationLayerInfo &act_info = ActivationLayerInfo());
/** Static function to check if given info will lead to a valid configuration of @ref CLComplexPixelWiseMultiplication
*
- * @param[in] input1 An input tensor info. Data types supported: F32. Number of channels supported: 2.
+ * @param[in] input1 An input tensor info. Data types supported: F16/F32. Number of channels supported: 2.
* @param[in] input2 An input tensor info. Data types supported: same as @p input1. Number of channels supported: same as @p input1.
* @param[in] output The output tensor info, Data types supported: same as @p input1. Number of channels supported: same as @p input1.
* @param[in] act_info (Optional) Activation layer information in case of a fused activation.
*/
- static Status validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, const ActivationLayerInfo &act_info = ActivationLayerInfo());
+ static Status validate(const ITensorInfo *input1,
+ const ITensorInfo *input2,
+ const ITensorInfo *output,
+ const ActivationLayerInfo &act_info = ActivationLayerInfo());
+
+ // Inherited methods overridden:
+ void run() override;
+
+private:
+ struct Impl;
+ std::unique_ptr<Impl> _impl;
};
} // namespace arm_compute
#endif /*ARM_COMPUTE_CLPIXELWISEMULTIPLICATION_H */
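
The configure/validate signatures of CLPixelWiseMultiplication only change in formatting, so callers are untouched. A minimal F32 sketch; note the documented constraint that scale be 1/255 or 1/2^n with n in [0, 15] (1.f corresponds to n = 0):

    #include "arm_compute/core/TensorInfo.h"
    #include "arm_compute/runtime/CL/CLScheduler.h"
    #include "arm_compute/runtime/CL/CLTensor.h"
    #include "arm_compute/runtime/CL/functions/CLPixelWiseMultiplication.h"

    using namespace arm_compute;

    void pixelwise_mul_example()
    {
        CLScheduler::get().default_init();

        CLTensor a, b, out;
        const TensorInfo info(TensorShape(64U, 64U), 1, DataType::F32);
        a.allocator()->init(info);
        b.allocator()->init(info);
        out.allocator()->init(info);

        CLPixelWiseMultiplication mul;
        mul.configure(&a, &b, &out, 1.f /* scale */, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO);

        a.allocator()->allocate();
        b.allocator()->allocate();
        out.allocator()->allocate();
        mul.run();
    }
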
diff --git a/arm_compute/runtime/CL/functions/CLPooling3dLayer.h b/arm_compute/runtime/CL/functions/CLPooling3dLayer.h
new file mode 100644
index 0000000000..1c69148771
--- /dev/null
+++ b/arm_compute/runtime/CL/functions/CLPooling3dLayer.h
@@ -0,0 +1,106 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CLPOOLING3DLAYER_H
+#define ARM_COMPUTE_CLPOOLING3DLAYER_H
+
+#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/IFunction.h"
+
+#include <memory>
+
+namespace arm_compute
+{
+class CLCompileContext;
+class ICLTensor;
+class ITensorInfo;
+
+/** Basic function to run @ref opencl::ClPool3d */
+class CLPooling3dLayer : public IFunction
+{
+public:
+ /** Default Constructor */
+ CLPooling3dLayer();
+ /** Default Destructor */
+ ~CLPooling3dLayer();
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ CLPooling3dLayer(const CLPooling3dLayer &) = delete;
+ /** Default move constructor */
+ CLPooling3dLayer(CLPooling3dLayer &&) = default;
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ CLPooling3dLayer &operator=(const CLPooling3dLayer &) = delete;
+ /** Default move assignment operator */
+ CLPooling3dLayer &operator=(CLPooling3dLayer &&) = default;
+ /** Set the input and output tensors.
+ *
+ * Valid data layouts:
+ * - NDHWC
+ *
+ * Valid data type configurations:
+ * |src |dst |
+ * |:--------------|:--------------|
+ * |F16 |F16 |
+ * |F32 |F32 |
+ * |QASYMM8 |QASYMM8 |
+ * |QASYMM8_SIGNED |QASYMM8_SIGNED |
+ *
+     * @note Source tensor is padded with -inf for MAX pooling and 0 otherwise.
+ * Cases where pooling region is completely outside input tensor are not supported
+ *
+ * @note Asymmetric padding is not supported when dimension rounding type == CEIL.
+ *
+ * @param[in,out] input Source tensor. Data types supported: F16/F32/QASYMM8/QASYMM8_SIGNED.
+ * @param[out] output Destination tensor. Data types supported: Same as @p input.
+ * @param[in] pool_info Contains 3d pooling operation information described in @ref Pooling3dLayerInfo.
+ */
+ void configure(const ICLTensor *input, ICLTensor *output, const Pooling3dLayerInfo &pool_info);
+ /** Set the input and output tensors.
+ *
+ * @param[in] compile_context The compile context to be used.
+ * @param[in,out] input Source tensor. Data types supported: F16/F32/QASYMM8/QASYMM8_SIGNED.
+ * @param[out] output Destination tensor. Data types supported: Same as @p input.
+ * @param[in] pool_info Contains 3d pooling operation information described in @ref Pooling3dLayerInfo.
+ */
+ void configure(const CLCompileContext &compile_context,
+ const ICLTensor *input,
+ ICLTensor *output,
+ const Pooling3dLayerInfo &pool_info);
+ /** Static function to check if given info will lead to a valid configuration of @ref CLPooling3dLayer
+ *
+ * @param[in] input Source tensor info. Data types supported: F16/F32/QASYMM8/QASYMM8_SIGNED.
+ * @param[in] output Destination tensor info. Data types supported: Same as @p input.
+ * @param[in] pool_info Contains 3d pooling operation information described in @ref Pooling3dLayerInfo.
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *input, const ITensorInfo *output, const Pooling3dLayerInfo &pool_info);
+
+ // Inherited methods overridden:
+ void run() override;
+
+private:
+ struct Impl;
+ std::unique_ptr<Impl> _impl;
+};
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_CLPOOLING3DLAYER_H */
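
A usage sketch of the new CLPooling3dLayer; the shapes are illustrative, and the Pooling3dLayerInfo field names used below (pool_type, pool_size, stride) are assumptions based on the struct as shipped, not part of this header:

    #include "arm_compute/core/TensorInfo.h"
    #include "arm_compute/runtime/CL/CLScheduler.h"
    #include "arm_compute/runtime/CL/CLTensor.h"
    #include "arm_compute/runtime/CL/functions/CLPooling3dLayer.h"

    using namespace arm_compute;

    void pooling3d_example()
    {
        CLScheduler::get().default_init();

        CLTensor src, dst;
        // NDHWC: shape dimensions are ordered (C, W, H, D)
        src.allocator()->init(TensorInfo(TensorShape(16U, 8U, 8U, 8U), 1, DataType::F32, DataLayout::NDHWC));
        dst.allocator()->init(TensorInfo(TensorShape(16U, 4U, 4U, 4U), 1, DataType::F32, DataLayout::NDHWC));

        Pooling3dLayerInfo pool_info;
        pool_info.pool_type = PoolingType::AVG;
        pool_info.pool_size = Size3D(2, 2, 2);
        pool_info.stride    = Size3D(2, 2, 2);

        CLPooling3dLayer pool;
        pool.configure(&src, &dst, pool_info);

        src.allocator()->allocate();
        dst.allocator()->allocate();
        pool.run();
    }
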
diff --git a/arm_compute/runtime/CL/functions/CLPoolingLayer.h b/arm_compute/runtime/CL/functions/CLPoolingLayer.h
index 7d646ab268..3dbdf8aeea 100644
--- a/arm_compute/runtime/CL/functions/CLPoolingLayer.h
+++ b/arm_compute/runtime/CL/functions/CLPoolingLayer.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2017-2021 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -24,31 +24,57 @@
#ifndef ARM_COMPUTE_CLPOOLINGLAYER_H
#define ARM_COMPUTE_CLPOOLINGLAYER_H
-#include "arm_compute/runtime/CL/ICLSimpleFunction.h"
-
-#include "arm_compute/core/Error.h"
#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/IFunction.h"
+
+#include <memory>
namespace arm_compute
{
+class CLCompileContext;
class ICLTensor;
+class ITensorInfo;
-/** Basic function to simulate a pooling layer with the specified pooling operation. This function calls the following OpenCL kernels:
- *
- * -# @ref CLFillBorderKernel (executed if padding size is different from zero)
- * -# @ref CLPoolingLayerKernel
- */
-class CLPoolingLayer : public ICLSimpleFunction
+/** Basic function to run @ref opencl::ClPool2d */
+class CLPoolingLayer : public IFunction
{
public:
+ /** Default Constructor */
+ CLPoolingLayer();
+ /** Default Destructor */
+ ~CLPoolingLayer();
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ CLPoolingLayer(const CLPoolingLayer &) = delete;
+ /** Default move constructor */
+ CLPoolingLayer(CLPoolingLayer &&) = default;
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ CLPoolingLayer &operator=(const CLPoolingLayer &) = delete;
+ /** Default move assignment operator */
+ CLPoolingLayer &operator=(CLPoolingLayer &&) = default;
/** Set the input and output tensors.
*
+ * Valid data layouts:
+ * - NHWC
+ * - NCHW
+ *
+ * Valid data type configurations:
+ * |src |dst |
+ * |:--------------|:--------------|
+ * |QASYMM8 |QASYMM8 |
+ * |QASYMM8_SIGNED |QASYMM8_SIGNED |
+ * |F16 |F16 |
+ * |F32 |F32 |
+ *
+ * @note Source tensor is padded with -inf for MAX pooling and 0 otherwise
+ * Cases where pooling region is completely outside input tensor are not supported
+ *
* @param[in,out] input Source tensor. (Written to only when padding != 0) Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32.
* @param[out] output Destination tensor. Data types supported: Same as @p input.
* @param[in] pool_info Contains pooling operation information described in @ref PoolingLayerInfo.
* @param[out] indices (optional) The indices of the maximal values. Data type supported: U32.
*/
- void configure(ICLTensor *input, ICLTensor *output, const PoolingLayerInfo &pool_info, ICLTensor *indices = nullptr);
+ void
+ configure(ICLTensor *input, ICLTensor *output, const PoolingLayerInfo &pool_info, ICLTensor *indices = nullptr);
/** Set the input and output tensors.
*
* @param[in] compile_context The compile context to be used.
@@ -57,7 +83,11 @@ public:
* @param[in] pool_info Contains pooling operation information described in @ref PoolingLayerInfo.
* @param[out] indices (optional) The indices of the maximal values. Data type supported: U32.
*/
- void configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, const PoolingLayerInfo &pool_info, ICLTensor *indices = nullptr);
+ void configure(const CLCompileContext &compile_context,
+ ICLTensor *input,
+ ICLTensor *output,
+ const PoolingLayerInfo &pool_info,
+ ICLTensor *indices = nullptr);
/** Static function to check if given info will lead to a valid configuration of @ref CLPoolingLayer
*
* @param[in] input Source tensor info. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32.
@@ -67,7 +97,17 @@ public:
*
* @return a status
*/
- static Status validate(const ITensorInfo *input, const ITensorInfo *output, const PoolingLayerInfo &pool_info, const ITensorInfo *indices = nullptr);
+ static Status validate(const ITensorInfo *input,
+ const ITensorInfo *output,
+ const PoolingLayerInfo &pool_info,
+ const ITensorInfo *indices = nullptr);
+
+ // Inherited methods overridden:
+ void run() override;
+
+private:
+ struct Impl;
+ std::unique_ptr<Impl> _impl;
};
} // namespace arm_compute
#endif /* ARM_COMPUTE_CLPOOLINGLAYER_H */
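
The CLPoolingLayer API is preserved across the move to opencl::ClPool2d. A minimal MAX-pooling sketch with illustrative shapes (NCHW, pool size 2, stride 2, no padding):

    #include "arm_compute/core/TensorInfo.h"
    #include "arm_compute/runtime/CL/CLScheduler.h"
    #include "arm_compute/runtime/CL/CLTensor.h"
    #include "arm_compute/runtime/CL/functions/CLPoolingLayer.h"

    using namespace arm_compute;

    void pooling_example()
    {
        CLScheduler::get().default_init();

        CLTensor src, dst;
        src.allocator()->init(TensorInfo(TensorShape(32U, 32U, 16U), 1, DataType::F32));
        dst.allocator()->init(TensorInfo(TensorShape(16U, 16U, 16U), 1, DataType::F32));

        const PoolingLayerInfo pool_info(PoolingType::MAX, 2, DataLayout::NCHW, PadStrideInfo(2, 2, 0, 0));

        CLPoolingLayer pool;
        pool.configure(&src, &dst, pool_info);

        src.allocator()->allocate();
        dst.allocator()->allocate();
        pool.run();
    }
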
diff --git a/arm_compute/runtime/CL/functions/CLPriorBoxLayer.h b/arm_compute/runtime/CL/functions/CLPriorBoxLayer.h
index d39e4112f9..4ede906baa 100644
--- a/arm_compute/runtime/CL/functions/CLPriorBoxLayer.h
+++ b/arm_compute/runtime/CL/functions/CLPriorBoxLayer.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018-2020 ARM Limited.
+ * Copyright (c) 2018-2021 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -24,13 +24,16 @@
#ifndef ARM_COMPUTE_CLPRIORBOXLAYER_H
#define ARM_COMPUTE_CLPRIORBOXLAYER_H
-#include "arm_compute/core/CL/kernels/CLPriorBoxLayerKernel.h"
+#include "arm_compute/core/CL/OpenCL.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/runtime/CL/ICLSimpleFunction.h"
namespace arm_compute
{
+class CLCompileContext;
+class CLPriorBoxLayerKernel;
class ICLTensor;
+class ITensorInfo;
/** Basic function to run @ref CLPriorBoxLayerKernel. */
class CLPriorBoxLayer : public ICLSimpleFunction
@@ -40,6 +43,15 @@ public:
CLPriorBoxLayer();
/** Set the input and output tensors.
*
+ * Valid data layouts:
+ * - NHWC
+ * - NCHW
+ *
+ * Valid data type configurations:
+ * |src0 |src1 |dst |
+ * |:--------|:--------|:--------|
+ * |F32 |F32 |F32 |
+ *
* @param[in] input1 First source tensor. Data types supported: F32. Data layouts supported: NCHW/NHWC.
* @param[in] input2 Second source tensor. Data types and layouts supported: same as @p input1
* @param[out] output Destination tensor. Output dimensions are [W * H * num_priors * 4, 2]. Data types and layouts supported: same as @p input1
@@ -54,7 +66,11 @@ public:
* @param[out] output Destination tensor. Output dimensions are [W * H * num_priors * 4, 2]. Data types and layouts supported: same as @p input1
* @param[in] info Prior box layer info.
*/
- void configure(const CLCompileContext &compile_context, const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output, const PriorBoxLayerInfo &info);
+ void configure(const CLCompileContext &compile_context,
+ const ICLTensor *input1,
+ const ICLTensor *input2,
+ ICLTensor *output,
+ const PriorBoxLayerInfo &info);
/** Static function to check if given info will lead to a valid configuration of @ref CLPriorBoxLayer
*
* @param[in] input1 First source tensor info. Data types supported: F32. Data layouts supported: NCHW/NHWC.
@@ -64,12 +80,15 @@ public:
*
* @return a status
*/
- static Status validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, const PriorBoxLayerInfo &info);
+ static Status validate(const ITensorInfo *input1,
+ const ITensorInfo *input2,
+ const ITensorInfo *output,
+ const PriorBoxLayerInfo &info);
private:
cl::Buffer _min;
cl::Buffer _max;
cl::Buffer _aspect_ratios;
};
-} // arm_compute
+} // namespace arm_compute
#endif /* ARM_COMPUTE_CLPRIORBOXLAYER_H */
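
A usage sketch for CLPriorBoxLayer; the min sizes, variances, and offset passed to PriorBoxLayerInfo are illustrative SSD-style values, and the output shape below assumes num_priors == 1 for this configuration:

    #include "arm_compute/core/TensorInfo.h"
    #include "arm_compute/runtime/CL/CLScheduler.h"
    #include "arm_compute/runtime/CL/CLTensor.h"
    #include "arm_compute/runtime/CL/functions/CLPriorBoxLayer.h"

    using namespace arm_compute;

    void prior_box_example()
    {
        CLScheduler::get().default_init();

        CLTensor feature_map, image, priors;
        feature_map.allocator()->init(TensorInfo(TensorShape(19U, 19U, 256U), 1, DataType::F32));
        image.allocator()->init(TensorInfo(TensorShape(300U, 300U, 3U), 1, DataType::F32));

        const PriorBoxLayerInfo info({ 64.f }, { 0.1f, 0.1f, 0.2f, 0.2f }, 0.5f /* offset */);

        // Output dimensions are [W * H * num_priors * 4, 2]
        priors.allocator()->init(TensorInfo(TensorShape(19U * 19U * 4U, 2U), 1, DataType::F32));

        CLPriorBoxLayer prior_box;
        prior_box.configure(&feature_map, &image, &priors, info);

        for(auto *t : { &feature_map, &image, &priors })
        {
            t->allocator()->allocate();
        }
        prior_box.run();
    }
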
diff --git a/arm_compute/runtime/CL/functions/CLQLSTMLayer.h b/arm_compute/runtime/CL/functions/CLQLSTMLayer.h
index 67e8bc7751..3e76da086f 100644
--- a/arm_compute/runtime/CL/functions/CLQLSTMLayer.h
+++ b/arm_compute/runtime/CL/functions/CLQLSTMLayer.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2020 ARM Limited.
+ * Copyright (c) 2020-2021 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -24,35 +24,42 @@
#ifndef ARM_COMPUTE_CLQLSTMLAYER_H
#define ARM_COMPUTE_CLQLSTMLAYER_H
-#include "arm_compute/core/CL/kernels/CLCopyKernel.h"
-#include "arm_compute/core/CL/kernels/CLElementwiseOperationKernel.h"
-#include "arm_compute/core/CL/kernels/CLGEMMLowpReductionKernel.h"
-#include "arm_compute/core/CL/kernels/CLPixelWiseMultiplicationKernel.h"
-#include "arm_compute/core/CL/kernels/CLQLSTMLayerNormalizationKernel.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/runtime/CL/functions/CLActivationLayer.h"
+#include "arm_compute/runtime/CL/functions/CLCopy.h"
+#include "arm_compute/runtime/CL/functions/CLElementwiseOperations.h"
#include "arm_compute/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.h"
#include "arm_compute/runtime/CL/functions/CLGEMMLowpOutputStage.h"
+#include "arm_compute/runtime/CL/functions/CLPixelWiseMultiplication.h"
#include "arm_compute/runtime/CL/functions/CLTranspose.h"
-
#include "arm_compute/runtime/common/LSTMParams.h"
namespace arm_compute
{
// Forward declarations
+class CLCompileContext;
class ICLTensor;
+class CLQLSTMLayerNormalizationKernel;
+class ITensorInfo;
+namespace opencl
+{
+namespace kernels
+{
+class ClGemmLowpMatrixAReductionKernel;
+} // namespace kernels
+} // namespace opencl
/** Basic function to run @ref CLQLSTMLayer
*
* This function calls the following CL functions/kernels:
*
* -# @ref CLActivationLayer Activation functions (tanh and logistic)
- * -# @ref CLCopyKernel Copy kernel for copying output_state_out to output
- * -# @ref CLSaturatedArithmeticOperationKernel Elementwise addition and subtraction
+ * -# @ref CLCopy Copy function for copying output_state_out to output
+ * -# @ref CLArithmeticAddition Elementwise addition and subtraction
* -# @ref CLGEMMLowpMatrixMultiplyCore Quantized matrix multiplication core. Accumulators are 32-bit integers
- * -# @ref CLGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPoint Convert 32-bit integers into QSYMM16
- * -# @ref CLGEMMLowpMatrixAReductionKernel For precomputing effective biases to use
- * -# @ref CLPixelWiseMultiplicationKernel Elementwise multiplication
+ * -# @ref CLGEMMLowpOutputStage Convert 32-bit integers into QSYMM16
+ * -# @ref opencl::kernels::ClGemmLowpMatrixAReductionKernel For precomputing effective biases to use
+ * -# @ref CLPixelWiseMultiplication Elementwise multiplication
* -# @ref CLTranspose Transpose function for reshaping the weights
* */
class CLQLSTMLayer : public IFunction
@@ -68,8 +75,18 @@ public:
CLQLSTMLayer &operator=(const CLQLSTMLayer &) = delete;
/** Default move assignment operator */
CLQLSTMLayer &operator=(CLQLSTMLayer &&) = default;
+ /** Default destructor */
+ ~CLQLSTMLayer();
/** Initialize function's tensors.
*
+ * Valid data layouts:
+ * - All
+ *
+ * Valid data type configurations:
+     * |src0          |src1 - src6  |src7 - src9  |src10  |src11         |dst0   |dst1 - dst2       |
+ * |:-------------|:------------|:------------|:------|:-------------|:------|:-----------------|
+ * |QASYMM8_SIGNED|QASYMM8 |S32 |QSYMM16|QASYMM8_SIGNED|QSYMM16|QASYMM8_SIGNED |
+ *
* @param[in] input Source tensor. Input is a 2D tensor with dimensions [input_size, batch_size]. Data types supported: QASYMM8_SIGNED.
* @param[in] input_to_forget_weights 2D weights tensor with dimensions [input_size, num_units]. Data type supported: QSYMM8.
* @param[in] input_to_cell_weights 2D weights tensor with dimensions [input_size, num_units]. Data type supported: QSYMM8.
@@ -109,12 +126,21 @@ public:
* projection_threshold (Optional) The clipping threshold for the output from the projection layer, such that values are bound within
* [-proj_clip, proj_clip]. If set to 0.0 then clipping is disabled.
*/
- void configure(const ICLTensor *input,
- const ICLTensor *input_to_forget_weights, const ICLTensor *input_to_cell_weights, const ICLTensor *input_to_output_weights,
- const ICLTensor *recurrent_to_forget_weights, const ICLTensor *recurrent_to_cell_weights, const ICLTensor *recurrent_to_output_weights,
- const ICLTensor *forget_gate_bias, const ICLTensor *cell_bias, const ICLTensor *output_gate_bias,
- const ICLTensor *cell_state_in, const ICLTensor *output_state_in,
- ICLTensor *cell_state_out, ICLTensor *output_state_out, ICLTensor *output,
+ void configure(const ICLTensor *input,
+ const ICLTensor *input_to_forget_weights,
+ const ICLTensor *input_to_cell_weights,
+ const ICLTensor *input_to_output_weights,
+ const ICLTensor *recurrent_to_forget_weights,
+ const ICLTensor *recurrent_to_cell_weights,
+ const ICLTensor *recurrent_to_output_weights,
+ const ICLTensor *forget_gate_bias,
+ const ICLTensor *cell_bias,
+ const ICLTensor *output_gate_bias,
+ ICLTensor *cell_state_in,
+ ICLTensor *output_state_in,
+ ICLTensor *cell_state_out,
+ ICLTensor *output_state_out,
+ ICLTensor *output,
const LSTMParams<ICLTensor> &lstm_params);
/** Initialize function's tensors.
@@ -159,12 +185,22 @@ public:
* projection_threshold (Optional) The clipping threshold for the output from the projection layer, such that values are bound within
* [-proj_clip, proj_clip]. If set to 0.0 then clipping is disabled.
*/
- void configure(const CLCompileContext &compile_context, const ICLTensor *input,
- const ICLTensor *input_to_forget_weights, const ICLTensor *input_to_cell_weights, const ICLTensor *input_to_output_weights,
- const ICLTensor *recurrent_to_forget_weights, const ICLTensor *recurrent_to_cell_weights, const ICLTensor *recurrent_to_output_weights,
- const ICLTensor *forget_gate_bias, const ICLTensor *cell_bias, const ICLTensor *output_gate_bias,
- const ICLTensor *cell_state_in, const ICLTensor *output_state_in,
- ICLTensor *cell_state_out, ICLTensor *output_state_out, ICLTensor *output,
+ void configure(const CLCompileContext &compile_context,
+ const ICLTensor *input,
+ const ICLTensor *input_to_forget_weights,
+ const ICLTensor *input_to_cell_weights,
+ const ICLTensor *input_to_output_weights,
+ const ICLTensor *recurrent_to_forget_weights,
+ const ICLTensor *recurrent_to_cell_weights,
+ const ICLTensor *recurrent_to_output_weights,
+ const ICLTensor *forget_gate_bias,
+ const ICLTensor *cell_bias,
+ const ICLTensor *output_gate_bias,
+ ICLTensor *cell_state_in,
+ ICLTensor *output_state_in,
+ ICLTensor *cell_state_out,
+ ICLTensor *output_state_out,
+ ICLTensor *output,
const LSTMParams<ICLTensor> &lstm_params);
/** Static function to check if given info will lead to a valid configuration of @ref CLQLSTMLayer
@@ -209,12 +245,21 @@ public:
* [-proj_clip, proj_clip]. If set to 0.0 then clipping is disabled.
* @return a status
*/
- static Status validate(const ITensorInfo *input,
- const ITensorInfo *input_to_forget_weights, const ITensorInfo *input_to_cell_weights, const ITensorInfo *input_to_output_weights,
- const ITensorInfo *recurrent_to_forget_weights, const ITensorInfo *recurrent_to_cell_weights, const ITensorInfo *recurrent_to_output_weights,
- const ITensorInfo *forget_gate_bias, const ITensorInfo *cell_bias, const ITensorInfo *output_gate_bias,
- const ITensorInfo *cell_state_in, const ITensorInfo *output_state_in,
- const ITensorInfo *cell_state_out, const ITensorInfo *output_state_out, const ITensorInfo *output,
+ static Status validate(const ITensorInfo *input,
+ const ITensorInfo *input_to_forget_weights,
+ const ITensorInfo *input_to_cell_weights,
+ const ITensorInfo *input_to_output_weights,
+ const ITensorInfo *recurrent_to_forget_weights,
+ const ITensorInfo *recurrent_to_cell_weights,
+ const ITensorInfo *recurrent_to_output_weights,
+ const ITensorInfo *forget_gate_bias,
+ const ITensorInfo *cell_bias,
+ const ITensorInfo *output_gate_bias,
+ const ITensorInfo *cell_state_in,
+ const ITensorInfo *output_state_in,
+ const ITensorInfo *cell_state_out,
+ const ITensorInfo *output_state_out,
+ const ITensorInfo *output,
const LSTMParams<ITensorInfo> &lstm_params);
// Inherited methods overridden:
@@ -248,10 +293,18 @@ private:
 * @param[in] mm_res_info Tensor info to be used to initialize matrix multiplication result tensor.
*
*/
- void configure_mm(const CLCompileContext &compile_context, CLGEMMLowpMatrixMultiplyCore &mm, CLGEMMLowpOutputStage &outstage, GEMMLowpOutputStageInfo &gemmlowp_info,
- const ICLTensor *mm_input, const ICLTensor *mm_weights, const ICLTensor *bias, CLTensor *mm_res,
- CLTensor *outstage_res, float gemmlowp_scale,
- const TensorInfo &mm_res_info, const TensorInfo &outstage_tensor_info);
+ void configure_mm(const CLCompileContext &compile_context,
+ CLGEMMLowpMatrixMultiplyCore &mm,
+ CLGEMMLowpOutputStage &outstage,
+ GEMMLowpOutputStageInfo &gemmlowp_info,
+ const ICLTensor *mm_input,
+ const ICLTensor *mm_weights,
+ const ICLTensor *bias,
+ CLTensor *mm_res,
+ CLTensor *outstage_res,
+ float gemmlowp_scale,
+ const TensorInfo &mm_res_info,
+ const TensorInfo &outstage_tensor_info);
MemoryGroup _memory_group{};
@@ -260,8 +313,8 @@ private:
{
static constexpr uint32_t max_dimension_supported = 2;
- ICLTensor *_src{ nullptr };
- ICLTensor *_dst{ nullptr };
+ ICLTensor *_src{nullptr};
+ ICLTensor *_dst{nullptr};
size_t _row_size{};
Window _window{};
@@ -285,72 +338,72 @@ private:
};
// Functions used
- CLTranspose _transpose_input_to_forget_weights{};
- CLTranspose _transpose_input_to_cell_weights{};
- CLTranspose _transpose_input_to_output_weights{};
- CLTranspose _transpose_input_to_input_weights{};
- CLTranspose _transpose_recurrent_to_forget_weights{};
- CLTranspose _transpose_recurrent_to_cell_weights{};
- CLTranspose _transpose_recurrent_to_output_weights{};
- CLTranspose _transpose_recurrent_to_input_weights{};
- CLTranspose _transpose_projection_weights{};
- CLGEMMLowpMatrixAReductionKernel _input_to_input_reduction{};
- CLGEMMLowpMatrixAReductionKernel _recurrent_to_input_reduction{};
- CLGEMMLowpMatrixAReductionKernel _input_to_forget_reduction{};
- CLGEMMLowpMatrixAReductionKernel _recurrent_to_forget_reduction{};
- CLGEMMLowpMatrixAReductionKernel _input_to_cell_reduction{};
- CLGEMMLowpMatrixAReductionKernel _recurrent_to_cell_reduction{};
- CLGEMMLowpMatrixAReductionKernel _input_to_output_reduction{};
- CLGEMMLowpMatrixAReductionKernel _recurrent_to_output_reduction{};
- CLGEMMLowpMatrixAReductionKernel _projection_reduction{};
- CLSaturatedArithmeticOperationKernel _projection_bias_add{};
- CLGEMMLowpMatrixMultiplyCore _mm_input_to_forget{};
- CLGEMMLowpMatrixMultiplyCore _mm_recurrent_to_forget{};
- CLPixelWiseMultiplicationKernel _pixelwise_mul_cell_to_forget{};
- CLGEMMLowpOutputStage _input_to_forget_outstage{};
- CLGEMMLowpOutputStage _recurrent_to_forget_outstage{};
- CLGEMMLowpOutputStage _cell_to_forget_outstage{};
- CLSaturatedArithmeticOperationKernel _accumulate_input_recurrent_forget{};
- CLSaturatedArithmeticOperationKernel _accumulate_cell_forget{};
- CLActivationLayer _forget_gate_sigmoid{};
- CLGEMMLowpMatrixMultiplyCore _mm_input_to_cell{};
- CLGEMMLowpOutputStage _input_to_cell_outstage{};
- CLGEMMLowpMatrixMultiplyCore _mm_recurrent_to_cell{};
- CLGEMMLowpOutputStage _recurrent_to_cell_outstage{};
- CLSaturatedArithmeticOperationKernel _accumulate_input_recurrent_modulation{};
- CLActivationLayer _cell_gate_tanh{};
- CLSaturatedArithmeticOperationKernel _input_gate_sub{};
- CLGEMMLowpMatrixMultiplyCore _mm_input_to_input{};
- CLGEMMLowpOutputStage _input_to_input_outstage{};
- CLGEMMLowpMatrixMultiplyCore _mm_recurrent_to_input{};
- CLGEMMLowpOutputStage _recurrent_to_input_outstage{};
- CLSaturatedArithmeticOperationKernel _accumulate_input_recurrent_input{};
- CLPixelWiseMultiplicationKernel _pixelwise_mul_cell_to_input{};
- CLGEMMLowpOutputStage _cell_to_input_outstage{};
- CLSaturatedArithmeticOperationKernel _accumulate_cell_input{};
- CLActivationLayer _input_gate_sigmoid{};
- CLPixelWiseMultiplicationKernel _pixelwise_mul_forget_cell{};
- CLPixelWiseMultiplicationKernel _pixelwise_mul_input_cell{};
- CLSaturatedArithmeticOperationKernel _add_forget_cell{};
- CLActivationLayer _cell_clip{};
- CLGEMMLowpMatrixMultiplyCore _mm_input_to_output{};
- CLGEMMLowpOutputStage _input_to_output_outstage{};
- CLGEMMLowpMatrixMultiplyCore _mm_recurrent_to_output{};
- CLGEMMLowpOutputStage _recurrent_to_output_outstage{};
- CLSaturatedArithmeticOperationKernel _accumulate_input_recurrent_output{};
- CLPixelWiseMultiplicationKernel _pixelwise_mul_cell_to_output{};
- CLGEMMLowpOutputStage _cell_to_output_outstage{};
- CLSaturatedArithmeticOperationKernel _accumulate_cell_to_output{};
- CLActivationLayer _output_gate_sigmoid{};
- CLActivationLayer _hidden_tanh{};
- CLPixelWiseMultiplicationKernel _pixelwise_mul_hidden{};
- CLGEMMLowpOutputStage _hidden_outstage{};
- CLGEMMLowpMatrixMultiplyCore _mm_projection{};
- CLGEMMLowpOutputStage _projection_outstage{};
- CLSaturatedArithmeticOperationKernel _accumulate_projection{};
- CLActivationLayer _projection_clip{};
- std::array<CLQLSTMLayerNormalizationKernel, _layer_norm_count> _layer_norms{ {} };
- CLCopyKernel _copy_output{};
+ CLTranspose _transpose_input_to_forget_weights{};
+ CLTranspose _transpose_input_to_cell_weights{};
+ CLTranspose _transpose_input_to_output_weights{};
+ CLTranspose _transpose_input_to_input_weights{};
+ CLTranspose _transpose_recurrent_to_forget_weights{};
+ CLTranspose _transpose_recurrent_to_cell_weights{};
+ CLTranspose _transpose_recurrent_to_output_weights{};
+ CLTranspose _transpose_recurrent_to_input_weights{};
+ CLTranspose _transpose_projection_weights{};
+ std::unique_ptr<opencl::kernels::ClGemmLowpMatrixAReductionKernel> _input_to_input_reduction;
+ std::unique_ptr<opencl::kernels::ClGemmLowpMatrixAReductionKernel> _recurrent_to_input_reduction;
+ std::unique_ptr<opencl::kernels::ClGemmLowpMatrixAReductionKernel> _input_to_forget_reduction;
+ std::unique_ptr<opencl::kernels::ClGemmLowpMatrixAReductionKernel> _recurrent_to_forget_reduction;
+ std::unique_ptr<opencl::kernels::ClGemmLowpMatrixAReductionKernel> _input_to_cell_reduction;
+ std::unique_ptr<opencl::kernels::ClGemmLowpMatrixAReductionKernel> _recurrent_to_cell_reduction;
+ std::unique_ptr<opencl::kernels::ClGemmLowpMatrixAReductionKernel> _input_to_output_reduction;
+ std::unique_ptr<opencl::kernels::ClGemmLowpMatrixAReductionKernel> _recurrent_to_output_reduction;
+ std::unique_ptr<opencl::kernels::ClGemmLowpMatrixAReductionKernel> _projection_reduction;
+ CLArithmeticAddition _projection_bias_add{};
+ CLGEMMLowpMatrixMultiplyCore _mm_input_to_forget{};
+ CLGEMMLowpMatrixMultiplyCore _mm_recurrent_to_forget{};
+ CLPixelWiseMultiplication _pixelwise_mul_cell_to_forget{};
+ CLGEMMLowpOutputStage _input_to_forget_outstage{};
+ CLGEMMLowpOutputStage _recurrent_to_forget_outstage{};
+ CLGEMMLowpOutputStage _cell_to_forget_outstage{};
+ CLArithmeticAddition _accumulate_input_recurrent_forget{};
+ CLArithmeticAddition _accumulate_cell_forget{};
+ CLActivationLayer _forget_gate_sigmoid{};
+ CLGEMMLowpMatrixMultiplyCore _mm_input_to_cell{};
+ CLGEMMLowpOutputStage _input_to_cell_outstage{};
+ CLGEMMLowpMatrixMultiplyCore _mm_recurrent_to_cell{};
+ CLGEMMLowpOutputStage _recurrent_to_cell_outstage{};
+ CLArithmeticAddition _accumulate_input_recurrent_modulation{};
+ CLActivationLayer _cell_gate_tanh{};
+ CLArithmeticSubtraction _input_gate_sub{};
+ CLGEMMLowpMatrixMultiplyCore _mm_input_to_input{};
+ CLGEMMLowpOutputStage _input_to_input_outstage{};
+ CLGEMMLowpMatrixMultiplyCore _mm_recurrent_to_input{};
+ CLGEMMLowpOutputStage _recurrent_to_input_outstage{};
+ CLArithmeticAddition _accumulate_input_recurrent_input{};
+ CLPixelWiseMultiplication _pixelwise_mul_cell_to_input{};
+ CLGEMMLowpOutputStage _cell_to_input_outstage{};
+ CLArithmeticAddition _accumulate_cell_input{};
+ CLActivationLayer _input_gate_sigmoid{};
+ CLPixelWiseMultiplication _pixelwise_mul_forget_cell{};
+ CLPixelWiseMultiplication _pixelwise_mul_input_cell{};
+ CLArithmeticAddition _add_forget_cell{};
+ CLActivationLayer _cell_clip{};
+ CLGEMMLowpMatrixMultiplyCore _mm_input_to_output{};
+ CLGEMMLowpOutputStage _input_to_output_outstage{};
+ CLGEMMLowpMatrixMultiplyCore _mm_recurrent_to_output{};
+ CLGEMMLowpOutputStage _recurrent_to_output_outstage{};
+ CLArithmeticAddition _accumulate_input_recurrent_output{};
+ CLPixelWiseMultiplication _pixelwise_mul_cell_to_output{};
+ CLGEMMLowpOutputStage _cell_to_output_outstage{};
+ CLArithmeticAddition _accumulate_cell_to_output{};
+ CLActivationLayer _output_gate_sigmoid{};
+ CLActivationLayer _hidden_tanh{};
+ CLPixelWiseMultiplication _pixelwise_mul_hidden{};
+ CLGEMMLowpOutputStage _hidden_outstage{};
+ CLGEMMLowpMatrixMultiplyCore _mm_projection{};
+ CLGEMMLowpOutputStage _projection_outstage{};
+ CLArithmeticAddition _accumulate_projection{};
+ CLActivationLayer _projection_clip{};
+ std::array<std::unique_ptr<CLQLSTMLayerNormalizationKernel>, _layer_norm_count> _layer_norms;
+ CLCopy _copy_output;
TensorCopyKernel _projection_bias_copy{};
TensorCopyKernel _projection_output_to_accumulate_copy{};
@@ -358,18 +411,18 @@ private:
TensorCopyKernel _hidden_to_output_copy{};
// Tensor pointers
- const ICLTensor *_input_to_input_weights{ nullptr };
- const ICLTensor *_recurrent_to_input_weights{ nullptr };
- const ICLTensor *_projection_bias{ nullptr };
- const ICLTensor *_input_to_forget_weights{ nullptr };
- const ICLTensor *_input_to_cell_weights{ nullptr };
- const ICLTensor *_input_to_output_weights{ nullptr };
- const ICLTensor *_recurrent_to_forget_weights{ nullptr };
- const ICLTensor *_recurrent_to_cell_weights{ nullptr };
- const ICLTensor *_recurrent_to_output_weights{ nullptr };
- const ICLTensor *_projection_weights{ nullptr };
- std::array<const ICLTensor *, _layer_norm_count> _layer_norm_weights{ {} };
- std::array<const ICLTensor *, _layer_norm_count> _layer_norm_bias{ {} };
+ const ICLTensor *_input_to_input_weights{nullptr};
+ const ICLTensor *_recurrent_to_input_weights{nullptr};
+ const ICLTensor *_projection_bias{nullptr};
+ const ICLTensor *_input_to_forget_weights{nullptr};
+ const ICLTensor *_input_to_cell_weights{nullptr};
+ const ICLTensor *_input_to_output_weights{nullptr};
+ const ICLTensor *_recurrent_to_forget_weights{nullptr};
+ const ICLTensor *_recurrent_to_cell_weights{nullptr};
+ const ICLTensor *_recurrent_to_output_weights{nullptr};
+ const ICLTensor *_projection_weights{nullptr};
+ std::array<const ICLTensor *, _layer_norm_count> _layer_norm_weights{{}};
+ std::array<const ICLTensor *, _layer_norm_count> _layer_norm_bias{{}};
using LayerNormIndexType = typename std::underlying_type<LayerNormGate>::type;
inline LayerNormIndexType getGateIndex(LayerNormGate g)
@@ -399,100 +452,81 @@ private:
inline CLQLSTMLayerNormalizationKernel &get_layer_norm(LayerNormGate g)
{
- return _layer_norms[getGateIndex(g)];
- }
-
- inline void configure_layer_norm(LayerNormGate g, const ICLTensor *in)
- {
- ARM_COMPUTE_ERROR_ON(!_has_layer_norm);
-
- CLTensor *out = &get_layer_norm_output(g);
- _memory_group.manage(out);
- out->allocator()->init(*(in->info()));
-
- get_layer_norm(g).configure(in, out, get_layer_norm_weight(g), get_layer_norm_bias(g));
+ return *_layer_norms[getGateIndex(g)];
}
- inline static Status validate_layer_norm(const ITensorInfo &in, const ITensorInfo &weight, const ITensorInfo &bias)
- {
- // Output quantization scale will be different, but ignored here
- // since it will be configured at configure() stage.
- const TensorInfo out
- {
- in
- };
- return CLQLSTMLayerNormalizationKernel::validate(&in, &out, &weight, &bias);
- }
+ inline void configure_layer_norm(LayerNormGate g, const ICLTensor *in);
+ inline static Status validate_layer_norm(const ITensorInfo &in, const ITensorInfo &weight, const ITensorInfo &bias);
// Temporary tensors
- CLTensor _input_to_forget_weights_transposed{ nullptr };
- CLTensor _input_to_cell_weights_transposed{ nullptr };
- CLTensor _input_to_output_weights_transposed{ nullptr };
- CLTensor _input_to_input_weights_transposed{ nullptr };
- CLTensor _recurrent_to_forget_weights_transposed{ nullptr };
- CLTensor _recurrent_to_cell_weights_transposed{ nullptr };
- CLTensor _recurrent_to_output_weights_transposed{ nullptr };
- CLTensor _recurrent_to_input_weights_transposed{ nullptr };
- CLTensor _projection_weights_transposed{ nullptr };
- CLTensor _input_to_input_eff_bias{ nullptr };
- CLTensor _recurrent_to_input_eff_bias{ nullptr };
- CLTensor _input_to_forget_eff_bias{ nullptr };
- CLTensor _recurrent_to_forget_eff_bias{ nullptr };
- CLTensor _input_to_cell_eff_bias{ nullptr };
- CLTensor _recurrent_to_cell_eff_bias{ nullptr };
- CLTensor _input_to_output_eff_bias{ nullptr };
- CLTensor _recurrent_to_output_eff_bias{ nullptr };
- CLTensor _projection_reduction_res{ nullptr };
- CLTensor _projection_eff_bias{ nullptr };
- CLTensor _mm_input_to_forget_res{ nullptr };
- CLTensor _mm_recurrent_to_forget_res{ nullptr };
- CLTensor _mul_cell_to_forget_res{ nullptr };
- CLTensor _input_to_forget_outstage_res{ nullptr };
- CLTensor _cell_to_forget_outstage_res{ nullptr };
- CLTensor _recurrent_to_forget_outstage_res{ nullptr };
- CLTensor _forget_gate{ nullptr };
- CLTensor _mm_input_to_cell_res{ nullptr };
- CLTensor _input_to_cell_outstage_res{ nullptr };
- CLTensor _mm_recurrent_to_cell_res{ nullptr };
- CLTensor _recurrent_to_cell_outstage_res{ nullptr };
- CLTensor _cell_gate{ nullptr };
- CLTensor _mul_input_cell_res{ nullptr };
- CLTensor _mm_input_to_input_res{ nullptr };
- CLTensor _input_to_input_outstage_res{ nullptr };
- CLTensor _mm_recurrent_to_input_res{ nullptr };
- CLTensor _mul_cell_to_input_res{ nullptr };
- CLTensor _cell_to_input_outstage_res{ nullptr };
- CLTensor _recurrent_to_input_outstage_res{ nullptr };
- CLTensor _input_gate{ nullptr };
- CLTensor _mm_input_to_output_res{ nullptr };
- CLTensor _input_to_output_outstage_res{ nullptr };
- CLTensor _mm_recurrent_to_output_res{ nullptr };
- CLTensor _mul_cell_to_output_res{ nullptr };
- CLTensor _cell_to_output_outstage_res{ nullptr };
- CLTensor _recurrent_to_output_outstage_res{ nullptr };
- CLTensor _output_gate{ nullptr };
- CLTensor _hidden_mul_res{ nullptr };
- CLTensor _hidden_gate{ nullptr };
- CLTensor _mm_projection_res{ nullptr };
- CLTensor _projection_outstage_res{ nullptr };
- CLTensor _projection_out_res{ nullptr };
- CLTensor _projection_accumulate_res{ nullptr };
- CLTensor _ones{ nullptr };
- std::array<CLTensor, _layer_norm_count> _layer_norm_output{ {} };
+ CLTensor _input_to_forget_weights_transposed{nullptr};
+ CLTensor _input_to_cell_weights_transposed{nullptr};
+ CLTensor _input_to_output_weights_transposed{nullptr};
+ CLTensor _input_to_input_weights_transposed{nullptr};
+ CLTensor _recurrent_to_forget_weights_transposed{nullptr};
+ CLTensor _recurrent_to_cell_weights_transposed{nullptr};
+ CLTensor _recurrent_to_output_weights_transposed{nullptr};
+ CLTensor _recurrent_to_input_weights_transposed{nullptr};
+ CLTensor _projection_weights_transposed{nullptr};
+ CLTensor _input_to_input_eff_bias{nullptr};
+ CLTensor _recurrent_to_input_eff_bias{nullptr};
+ CLTensor _input_to_forget_eff_bias{nullptr};
+ CLTensor _recurrent_to_forget_eff_bias{nullptr};
+ CLTensor _input_to_cell_eff_bias{nullptr};
+ CLTensor _recurrent_to_cell_eff_bias{nullptr};
+ CLTensor _input_to_output_eff_bias{nullptr};
+ CLTensor _recurrent_to_output_eff_bias{nullptr};
+ CLTensor _projection_reduction_res{nullptr};
+ CLTensor _projection_eff_bias{nullptr};
+ CLTensor _mm_input_to_forget_res{nullptr};
+ CLTensor _mm_recurrent_to_forget_res{nullptr};
+ CLTensor _mul_cell_to_forget_res{nullptr};
+ CLTensor _input_to_forget_outstage_res{nullptr};
+ CLTensor _cell_to_forget_outstage_res{nullptr};
+ CLTensor _recurrent_to_forget_outstage_res{nullptr};
+ CLTensor _forget_gate{nullptr};
+ CLTensor _mm_input_to_cell_res{nullptr};
+ CLTensor _input_to_cell_outstage_res{nullptr};
+ CLTensor _mm_recurrent_to_cell_res{nullptr};
+ CLTensor _recurrent_to_cell_outstage_res{nullptr};
+ CLTensor _cell_gate{nullptr};
+ CLTensor _mul_input_cell_res{nullptr};
+ CLTensor _mm_input_to_input_res{nullptr};
+ CLTensor _input_to_input_outstage_res{nullptr};
+ CLTensor _mm_recurrent_to_input_res{nullptr};
+ CLTensor _mul_cell_to_input_res{nullptr};
+ CLTensor _cell_to_input_outstage_res{nullptr};
+ CLTensor _recurrent_to_input_outstage_res{nullptr};
+ CLTensor _input_gate{nullptr};
+ CLTensor _mm_input_to_output_res{nullptr};
+ CLTensor _input_to_output_outstage_res{nullptr};
+ CLTensor _mm_recurrent_to_output_res{nullptr};
+ CLTensor _mul_cell_to_output_res{nullptr};
+ CLTensor _cell_to_output_outstage_res{nullptr};
+ CLTensor _recurrent_to_output_outstage_res{nullptr};
+ CLTensor _output_gate{nullptr};
+ CLTensor _hidden_mul_res{nullptr};
+ CLTensor _hidden_gate{nullptr};
+ CLTensor _mm_projection_res{nullptr};
+ CLTensor _projection_outstage_res{nullptr};
+ CLTensor _projection_out_res{nullptr};
+ CLTensor _projection_accumulate_res{nullptr};
+ CLTensor _ones{nullptr};
+ std::array<CLTensor, _layer_norm_count> _layer_norm_output{{}};
inline CLTensor &get_layer_norm_output(LayerNormGate g)
{
return _layer_norm_output[getGateIndex(g)];
}
- bool _is_prepared{ false };
- bool _has_cifg{ false };
- bool _has_cell_clipping{ false };
- bool _has_projection{ false };
- bool _has_projection_clipping{ false };
- bool _has_peephole{ false };
- bool _has_layer_norm{ false };
- bool _projection_tensor_copy_required{ false };
+ bool _is_prepared{false};
+ bool _has_cifg{false};
+ bool _has_cell_clipping{false};
+ bool _has_projection{false};
+ bool _has_projection_clipping{false};
+ bool _has_peephole{false};
+ bool _has_layer_norm{false};
+ bool _projection_tensor_copy_required{false};
};
} // namespace arm_compute
#endif /* ARM_COMPUTE_CLQLSTMLAYER_H */
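
Driving the interface above means wiring fifteen tensors plus an LSTMParams bundle into configure(), so it is usually worth running the static validate() first. A minimal sketch of that check follows, using the data types from the table above; the sizes, the quantization scales and the CIFG-only parameter set are illustrative assumptions, not values taken from this patch:

    #include "arm_compute/core/TensorInfo.h"
    #include "arm_compute/core/Types.h"
    #include "arm_compute/runtime/CL/functions/CLQLSTMLayer.h"
    #include "arm_compute/runtime/common/LSTMParams.h"

    using namespace arm_compute;

    Status check_qlstm(unsigned int input_size, unsigned int num_units, unsigned int batch_size)
    {
        // Data types follow the table above: QASYMM8_SIGNED activations,
        // QSYMM8 weights, S32 biases, QSYMM16 cell state. Scales are placeholders.
        const TensorInfo input(TensorShape(input_size, batch_size), 1, DataType::QASYMM8_SIGNED, QuantizationInfo(1.f / 128.f, 0));
        const TensorInfo in_w(TensorShape(input_size, num_units), 1, DataType::QSYMM8, QuantizationInfo(0.01f));
        const TensorInfo rec_w(TensorShape(num_units, num_units), 1, DataType::QSYMM8, QuantizationInfo(0.01f));
        const TensorInfo bias(TensorShape(num_units), 1, DataType::S32);
        const TensorInfo cell(TensorShape(num_units, batch_size), 1, DataType::QSYMM16, QuantizationInfo(1.f / 32768.f, 0));
        const TensorInfo state(TensorShape(num_units, batch_size), 1, DataType::QASYMM8_SIGNED, QuantizationInfo(1.f / 128.f, 0));

        LSTMParams<ITensorInfo> params; // defaults to CIFG: no input-gate tensors supplied
        params.set_matmul_scale_params(0.01f, 0.01f, 0.01f, 0.01f); // placeholder intermediate scales
        params.set_hidden_state_params(0, 1.f / 128.f);             // placeholder hidden-state quantization

        // Argument order matches the validate() declaration above.
        return CLQLSTMLayer::validate(&input, &in_w, &in_w, &in_w, &rec_w, &rec_w, &rec_w,
                                      &bias, &bias, &bias, &cell, &state, &cell, &state, &state,
                                      params);
    }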
diff --git a/arm_compute/runtime/CL/functions/CLQuantizationLayer.h b/arm_compute/runtime/CL/functions/CLQuantizationLayer.h
index f59e3b7919..6543496d93 100644
--- a/arm_compute/runtime/CL/functions/CLQuantizationLayer.h
+++ b/arm_compute/runtime/CL/functions/CLQuantizationLayer.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2017-2021 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -24,24 +24,52 @@
#ifndef ARM_COMPUTE_CLQUANTIZATIONLAYER_H
#define ARM_COMPUTE_CLQUANTIZATIONLAYER_H
-#include "arm_compute/runtime/CL/ICLSimpleFunction.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/IFunction.h"
+
+#include <memory>
namespace arm_compute
{
+class CLCompileContext;
class ICLTensor;
+class ITensorInfo;
/** Basic function to simulate a quantization layer. This function calls the following CL kernels:
*
- * @note The implementation supports only 3D input tensors.
+ * -# @ref opencl::ClQuantize
*
- * -# @ref CLQuantizationLayerKernel
+ * @note The implementation supports only 3D input tensors.
*
*/
-class CLQuantizationLayer : public ICLSimpleFunction
+class CLQuantizationLayer : public IFunction
{
public:
+ /** Default Constructor */
+ CLQuantizationLayer();
+ /** Default Destructor */
+ ~CLQuantizationLayer();
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ CLQuantizationLayer(const CLQuantizationLayer &) = delete;
+ /** Default move constructor */
+ CLQuantizationLayer(CLQuantizationLayer &&) = default;
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ CLQuantizationLayer &operator=(const CLQuantizationLayer &) = delete;
+ /** Default move assignment operator */
+ CLQuantizationLayer &operator=(CLQuantizationLayer &&) = default;
/** Set the input and output tensors.
*
+ * Valid data layouts:
+ * - All
+ *
+ * Valid data type configurations:
+ * |src |dst |
+ * |:------------------|:----------------------------------|
+ * |QASYMM8 |QASYMM8, QASYMM8_SIGNED, QASYMM16 |
+ * |QASYMM8_SIGNED |QASYMM8, QASYMM8_SIGNED, QASYMM16 |
+ * |F16 |QASYMM8, QASYMM8_SIGNED, QASYMM16 |
+ * |F32 |QASYMM8, QASYMM8_SIGNED, QASYMM16 |
+ *
 * @param[in]  input  Source tensor. The dimensions over the third will be interpreted as batches. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32.
* @param[out] output Destination tensor with the same dimensions of input. Data types supported: QASYMM8/QASYMM8_SIGNED/QASYMM16.
*
@@ -65,6 +93,13 @@ public:
* @return a status
*/
static Status validate(const ITensorInfo *input, const ITensorInfo *output);
+
+ // Inherited methods overridden:
+ void run() override;
+
+private:
+ struct Impl;
+ std::unique_ptr<Impl> _impl;
};
} //namespace arm_compute
#endif /* ARM_COMPUTE_CLQUANTIZATIONLAYER_H */
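
A minimal driver for the reworked function above; the shape and the destination quantization parameters are illustrative placeholders:

    #include "arm_compute/core/TensorInfo.h"
    #include "arm_compute/runtime/CL/CLScheduler.h"
    #include "arm_compute/runtime/CL/CLTensor.h"
    #include "arm_compute/runtime/CL/functions/CLQuantizationLayer.h"

    using namespace arm_compute;

    int main()
    {
        CLScheduler::get().default_init(); // create the default CL context and queue

        CLTensor src, dst;
        src.allocator()->init(TensorInfo(TensorShape(16U, 16U, 3U), 1, DataType::F32));
        dst.allocator()->init(TensorInfo(TensorShape(16U, 16U, 3U), 1, DataType::QASYMM8,
                                         QuantizationInfo(0.05f, 128))); // placeholder scale/offset

        CLQuantizationLayer quantize;
        quantize.configure(&src, &dst); // validate() can be called first with the two infos

        src.allocator()->allocate();
        dst.allocator()->allocate();

        quantize.run();
        CLScheduler::get().sync(); // wait for the enqueued kernel
        return 0;
    }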
diff --git a/arm_compute/runtime/CL/functions/CLRNNLayer.h b/arm_compute/runtime/CL/functions/CLRNNLayer.h
index 0291eb17a9..a8d835d04d 100644
--- a/arm_compute/runtime/CL/functions/CLRNNLayer.h
+++ b/arm_compute/runtime/CL/functions/CLRNNLayer.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018-2020 ARM Limited.
+ * Copyright (c) 2018-2021 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -24,12 +24,14 @@
#ifndef ARM_COMPUTE_CLRNN_LAYER_H
#define ARM_COMPUTE_CLRNN_LAYER_H
-#include "arm_compute/core/CL/kernels/CLActivationLayerKernel.h"
-#include "arm_compute/core/CL/kernels/CLCopyKernel.h"
-#include "arm_compute/core/CL/kernels/CLElementwiseOperationKernel.h"
-#include "arm_compute/runtime/CL/ICLSimpleFunction.h"
+#include "arm_compute/runtime/CL/functions/CLActivationLayer.h"
+#include "arm_compute/runtime/CL/functions/CLCopy.h"
+#include "arm_compute/runtime/CL/functions/CLElementwiseOperations.h"
#include "arm_compute/runtime/CL/functions/CLFullyConnectedLayer.h"
#include "arm_compute/runtime/CL/functions/CLGEMM.h"
+#include "arm_compute/runtime/CL/ICLSimpleFunction.h"
+
+#include <memory>
namespace arm_compute
{
@@ -41,8 +43,24 @@ class CLRNNLayer : public IFunction
public:
/** Default constructor */
CLRNNLayer(std::shared_ptr<IMemoryManager> memory_manager = nullptr);
+ /** Prevent instances of this class from being copied */
+ CLRNNLayer(const CLRNNLayer &) = delete;
+ /** Prevent instances of this class from being copied */
+ CLRNNLayer &operator=(const CLRNNLayer &) = delete;
+ /** Default destructor */
+ ~CLRNNLayer();
/** Initialize the function
*
+ * Valid data layouts:
+ * - NHWC
+ * - NCHW
+ *
+ * Valid data type configurations:
+ * |src0 |src1 |src2 |src3 |dst0 |dst1 |
+ * |:------|:------|:------|:------|:------|:------|
+ * |F16 |F16 |F16 |F16 |F16 |F16 |
+ * |F32 |F32 |F32 |F32 |F32 |F32 |
+ *
* @param[in] input Input is a 2-D tensor of shape [input_size, batch_size]. Data types supported: F16/F32
* @param[in] weights Weights tensor of shape [input_size, num_units] that multiplies the input. Data types supported: Same as @p input
* @param[in] recurrent_weights Weights tensor of shape [num_units, num_units] that multiplies the current 'state'. Data types supported: Same as @p input
@@ -51,7 +69,13 @@ public:
* @param[in,out] hidden_state Output tensor of shape [num_units, batch_size]. Data types supported: Same as @p input
* @param[in] info Activation layer parameter.
*/
- void configure(const ICLTensor *input, const ICLTensor *weights, const ICLTensor *recurrent_weights, const ICLTensor *bias, ICLTensor *hidden_state, ICLTensor *output, ActivationLayerInfo &info);
+ void configure(const ICLTensor *input,
+ const ICLTensor *weights,
+ const ICLTensor *recurrent_weights,
+ const ICLTensor *bias,
+ ICLTensor *hidden_state,
+ ICLTensor *output,
+ ActivationLayerInfo &info);
/** Initialize the function
*
* @param[in] compile_context The compile context to be used.
@@ -63,8 +87,14 @@ public:
* @param[in,out] hidden_state Output tensor of shape [num_units, batch_size]. Data types supported: Same as @p input
* @param[in] info Activation layer parameter.
*/
- void configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *weights, const ICLTensor *recurrent_weights, const ICLTensor *bias, ICLTensor *hidden_state,
- ICLTensor *output, ActivationLayerInfo &info);
+ void configure(const CLCompileContext &compile_context,
+ const ICLTensor *input,
+ const ICLTensor *weights,
+ const ICLTensor *recurrent_weights,
+ const ICLTensor *bias,
+ ICLTensor *hidden_state,
+ ICLTensor *output,
+ ActivationLayerInfo &info);
/** Initialize the function
*
* @param[in] input Input is a 2-D tensor of shape [input_size, batch_size]. Data types supported: F16/F32
@@ -77,7 +107,12 @@ public:
*
* @return a status
*/
- static Status validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *recurrent_weights, const ITensorInfo *bias, const ITensorInfo *hidden_state, const ITensorInfo *output,
+ static Status validate(const ITensorInfo *input,
+ const ITensorInfo *weights,
+ const ITensorInfo *recurrent_weights,
+ const ITensorInfo *bias,
+ const ITensorInfo *hidden_state,
+ const ITensorInfo *output,
const ActivationLayerInfo &info);
// Inherited methods overridden:
@@ -85,16 +120,16 @@ public:
void prepare() override;
private:
- MemoryGroup _memory_group;
- CLGEMM _gemm_state_f;
- CLSaturatedArithmeticOperationKernel _add_kernel;
- CLActivationLayerKernel _activation_kernel;
- CLFullyConnectedLayer _fully_connected_kernel;
- CLCopyKernel _copy_kernel;
- CLTensor _fully_connected_out;
- CLTensor _gemm_output;
- CLTensor _add_output;
- bool _is_prepared;
+ MemoryGroup _memory_group;
+ CLGEMM _gemm_state_f;
+ CLArithmeticAddition _add_kernel;
+ CLActivationLayer _activation;
+ CLFullyConnectedLayer _fully_connected_kernel;
+ CLCopy _copy;
+ CLTensor _fully_connected_out;
+ CLTensor _gemm_output;
+ CLTensor _add_output;
+ bool _is_prepared;
};
-}
+} // namespace arm_compute
#endif /* ARM_COMPUTE_CLRNN_LAYER_H */
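
One plausible way to exercise the updated CLRNNLayer interface; note that configure() takes the ActivationLayerInfo by non-const reference, so it must be an lvalue. Shapes are illustrative:

    #include "arm_compute/core/Types.h"
    #include "arm_compute/runtime/CL/CLScheduler.h"
    #include "arm_compute/runtime/CL/CLTensor.h"
    #include "arm_compute/runtime/CL/functions/CLRNNLayer.h"

    using namespace arm_compute;

    void run_rnn()
    {
        CLScheduler::get().default_init();

        const unsigned int input_size = 8, num_units = 4, batch = 2;
        CLTensor input, weights, recurrent_weights, bias, hidden_state, output;
        input.allocator()->init(TensorInfo(TensorShape(input_size, batch), 1, DataType::F32));
        weights.allocator()->init(TensorInfo(TensorShape(input_size, num_units), 1, DataType::F32));
        recurrent_weights.allocator()->init(TensorInfo(TensorShape(num_units, num_units), 1, DataType::F32));
        bias.allocator()->init(TensorInfo(TensorShape(num_units), 1, DataType::F32));
        hidden_state.allocator()->init(TensorInfo(TensorShape(num_units, batch), 1, DataType::F32));
        output.allocator()->init(TensorInfo(TensorShape(num_units, batch), 1, DataType::F32));

        ActivationLayerInfo act(ActivationLayerInfo::ActivationFunction::TANH); // lvalue required
        CLRNNLayer rnn;
        rnn.configure(&input, &weights, &recurrent_weights, &bias, &hidden_state, &output, act);

        for(CLTensor *t : {&input, &weights, &recurrent_weights, &bias, &hidden_state, &output})
            t->allocator()->allocate();

        rnn.run(); // hidden_state is updated in place, as documented above
        CLScheduler::get().sync();
    }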
diff --git a/arm_compute/runtime/CL/functions/CLROIAlignLayer.h b/arm_compute/runtime/CL/functions/CLROIAlignLayer.h
index b6defe6c7f..14d3476711 100644
--- a/arm_compute/runtime/CL/functions/CLROIAlignLayer.h
+++ b/arm_compute/runtime/CL/functions/CLROIAlignLayer.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018-2020 ARM Limited.
+ * Copyright (c) 2018-2021 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -25,12 +25,14 @@
#define ARM_COMPUTE_CLROIALIGNLAYER_H
#include "arm_compute/core/CL/ICLArray.h"
-#include "arm_compute/core/CL/kernels/CLROIPoolingLayerKernel.h"
#include "arm_compute/runtime/CL/ICLSimpleFunction.h"
namespace arm_compute
{
+class CLCompileContext;
class ICLTensor;
+class ROIPoolingLayerInfo;
+class ITensorInfo;
/** Basic function to run @ref CLROIAlignLayerKernel.
*
@@ -43,6 +45,17 @@ class CLROIAlignLayer : public ICLSimpleFunction
public:
/** Set the input and output tensors.
*
+ * Valid data layouts:
+ * - All
+ *
+ * Valid data type configurations:
+ * |src0 |src1 |dst |
+ * |:--------------|:--------------|:--------------|
+ * |F16 |F16 |F16 |
+ * |F32 |F32 |F32 |
+ * |QASYMM8 |QASYMM16 |QASYMM8 |
+ * |QASYMM8_SIGNED |QASYMM16 |QASYMM8_SIGNED |
+ *
* @param[in] input Source tensor. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32.
* @param[in] rois ROIs tensor, it is a 2D tensor of size [5, N] (where N is the number of ROIs) containing top left and bottom right corner
* as coordinate of an image and batch_id of ROI [ batch_id, x1, y1, x2, y2 ].
@@ -55,7 +68,8 @@ public:
* @note The z dimensions of @p output tensor and @p input tensor must be the same.
* @note The fourth dimension of @p output tensor must be the same as the number of elements in @p rois array.
*/
- void configure(const ICLTensor *input, const ICLTensor *rois, ICLTensor *output, const ROIPoolingLayerInfo &pool_info);
+ void
+ configure(const ICLTensor *input, const ICLTensor *rois, ICLTensor *output, const ROIPoolingLayerInfo &pool_info);
/** Set the input and output tensors.
*
* @param[in] compile_context The compile context to be used.
@@ -71,7 +85,11 @@ public:
* @note The z dimensions of @p output tensor and @p input tensor must be the same.
* @note The fourth dimension of @p output tensor must be the same as the number of elements in @p rois array.
*/
- void configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *rois, ICLTensor *output, const ROIPoolingLayerInfo &pool_info);
+ void configure(const CLCompileContext &compile_context,
+ const ICLTensor *input,
+ const ICLTensor *rois,
+ ICLTensor *output,
+ const ROIPoolingLayerInfo &pool_info);
/** Static function to check if given info will lead to a valid configuration of @ref CLROIAlignLayer
*
* @param[in] input Source tensor info. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32.
@@ -87,7 +105,10 @@ public:
*
* @return a Status
*/
- static Status validate(const ITensorInfo *input, const ITensorInfo *rois, ITensorInfo *output, const ROIPoolingLayerInfo &pool_info);
+ static Status validate(const ITensorInfo *input,
+ const ITensorInfo *rois,
+ ITensorInfo *output,
+ const ROIPoolingLayerInfo &pool_info);
};
} // namespace arm_compute
#endif /* ARM_COMPUTE_CLROIALIGNLAYER_H */
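
A short usage sketch matching the F32 row of the table above; the image size, ROI count and pooling parameters are made-up values:

    #include "arm_compute/core/TensorInfo.h"
    #include "arm_compute/core/Types.h"
    #include "arm_compute/runtime/CL/CLScheduler.h"
    #include "arm_compute/runtime/CL/CLTensor.h"
    #include "arm_compute/runtime/CL/functions/CLROIAlignLayer.h"

    using namespace arm_compute;

    void run_roi_align()
    {
        CLScheduler::get().default_init();

        const unsigned int num_rois = 4;
        CLTensor input, rois, output;
        input.allocator()->init(TensorInfo(TensorShape(16U, 16U, 3U), 1, DataType::F32));
        rois.allocator()->init(TensorInfo(TensorShape(5U, num_rois), 1, DataType::F32)); // [batch_id, x1, y1, x2, y2]
        output.allocator()->init(TensorInfo(TensorShape(7U, 7U, 3U, num_rois), 1, DataType::F32));

        CLROIAlignLayer roi_align;
        roi_align.configure(&input, &rois, &output, ROIPoolingLayerInfo(7U, 7U, 0.5f));

        for(CLTensor *t : {&input, &rois, &output})
            t->allocator()->allocate();

        roi_align.run();
        CLScheduler::get().sync();
    }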
diff --git a/arm_compute/runtime/CL/functions/CLROIPoolingLayer.h b/arm_compute/runtime/CL/functions/CLROIPoolingLayer.h
index 0376e7847c..86294596d2 100644
--- a/arm_compute/runtime/CL/functions/CLROIPoolingLayer.h
+++ b/arm_compute/runtime/CL/functions/CLROIPoolingLayer.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2017-2021 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -24,14 +24,15 @@
#ifndef ARM_COMPUTE_CLROIPOOLINGLAYER_H
#define ARM_COMPUTE_CLROIPOOLINGLAYER_H
+#include "arm_compute/core/Error.h"
#include "arm_compute/runtime/CL/ICLSimpleFunction.h"
-#include "arm_compute/core/CL/ICLArray.h"
-#include "arm_compute/core/CL/kernels/CLROIPoolingLayerKernel.h"
-
namespace arm_compute
{
+class CLCompileContext;
class ICLTensor;
+class ROIPoolingLayerInfo;
+class ITensorInfo;
/** Basic function to run @ref CLROIPoolingLayerKernel.
*
@@ -44,7 +45,17 @@ class CLROIPoolingLayer : public ICLSimpleFunction
public:
/** Set the input and output tensors.
*
- * @param[in] input Source tensor. Data types supported: F16/F32.
+ * Valid data layouts:
+ * - All
+ *
+ * Valid data type configurations:
+ * |src0 |src1 |dst |
+ * |:--------------|:--------------|:--------------|
+ * |F16 |U16 |F16 |
+ * |F32 |U16 |F32 |
+ * |QASYMM8 |U16 |QASYMM8 |
+ *
+ * @param[in] input Source tensor. Data types supported: F16/F32/QASYMM8
* @param[in] rois ROIs tensor, it is a 2D tensor of size [5, N] (where N is the number of ROIs) containing top left and bottom right corner
* as coordinate of an image and batch_id of ROI [ batch_id, x1, y1, x2, y2 ]. Data types supported: U16
* @param[out] output Destination tensor. Data types supported: Same as @p input.
@@ -55,11 +66,12 @@ public:
* @note The z dimensions of @p output tensor and @p input tensor must be the same.
* @note The fourth dimension of @p output tensor must be the same as the number of elements in @p rois array.
*/
- void configure(const ICLTensor *input, const ICLTensor *rois, ICLTensor *output, const ROIPoolingLayerInfo &pool_info);
+ void
+ configure(const ICLTensor *input, const ICLTensor *rois, ICLTensor *output, const ROIPoolingLayerInfo &pool_info);
/** Set the input and output tensors.
*
* @param[in] compile_context The compile context to be used.
- * @param[in] input Source tensor. Data types supported: F16/F32.
+ * @param[in] input Source tensor. Data types supported: F16/F32/QASYMM8
* @param[in] rois ROIs tensor, it is a 2D tensor of size [5, N] (where N is the number of ROIs) containing top left and bottom right corner
* as coordinate of an image and batch_id of ROI [ batch_id, x1, y1, x2, y2 ]. Data types supported: U16
* @param[out] output Destination tensor. Data types supported: Same as @p input.
@@ -70,7 +82,30 @@ public:
* @note The z dimensions of @p output tensor and @p input tensor must be the same.
* @note The fourth dimension of @p output tensor must be the same as the number of elements in @p rois array.
*/
- void configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *rois, ICLTensor *output, const ROIPoolingLayerInfo &pool_info);
+ void configure(const CLCompileContext &compile_context,
+ const ICLTensor *input,
+ const ICLTensor *rois,
+ const ICLTensor *output,
+ const ROIPoolingLayerInfo &pool_info);
+
+ /** Static function to check if given info will lead to a valid configuration of @ref CLROIPoolingLayer
+ *
+ * @param[in] input Source tensor info. Data types supported: QASYMM8/F16/F32.
+ * @param[in] rois ROIs tensor info. Data types supported: U16
+ * @param[in] output Destination tensor info. Data types supported: Same as @p input.
+ * @param[in] pool_info Contains pooling operation information described in @ref ROIPoolingLayerInfo.
+ *
+ * @note The x and y dimensions of @p output tensor must be the same as @p pool_info 's pooled
+ * width and pooled height.
+ * @note The z dimensions of @p output tensor and @p input tensor must be the same.
+ * @note The fourth dimension of @p output tensor must be the same as the number of elements in @p rois array.
+ *
+ * @return a Status
+ */
+ static Status validate(const ITensorInfo *input,
+ const ITensorInfo *rois,
+ ITensorInfo *output,
+ const ROIPoolingLayerInfo &pool_info);
};
-}
+} // namespace arm_compute
#endif /* ARM_COMPUTE_CLROIPOOLINGLAYER_H */
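
The newly added validate() lets the configuration be checked without touching any OpenCL state. A sketch with placeholder shapes, keeping the U16 ROI format documented above:

    #include "arm_compute/core/Error.h"
    #include "arm_compute/core/TensorInfo.h"
    #include "arm_compute/core/Types.h"
    #include "arm_compute/runtime/CL/functions/CLROIPoolingLayer.h"

    using namespace arm_compute;

    bool roi_pool_supported(unsigned int num_rois)
    {
        const TensorInfo input(TensorShape(16U, 16U, 3U), 1, DataType::F32);
        const TensorInfo rois(TensorShape(5U, num_rois), 1, DataType::U16); // [batch_id, x1, y1, x2, y2]
        TensorInfo output(TensorShape(7U, 7U, 3U, num_rois), 1, DataType::F32); // non-const, per the signature

        const Status s = CLROIPoolingLayer::validate(&input, &rois, &output, ROIPoolingLayerInfo(7U, 7U, 1.f));
        return s.error_code() == ErrorCode::OK;
    }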
diff --git a/arm_compute/runtime/CL/functions/CLRange.h b/arm_compute/runtime/CL/functions/CLRange.h
index 19e11bacd4..ed665bc398 100644
--- a/arm_compute/runtime/CL/functions/CLRange.h
+++ b/arm_compute/runtime/CL/functions/CLRange.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018-2020 ARM Limited.
+ * Copyright (c) 2018-2021 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -29,7 +29,9 @@
namespace arm_compute
{
+class CLCompileContext;
class ICLTensor;
+class ITensorInfo;
/** Basic function to run @ref CLRangeKernel
*
@@ -41,6 +43,22 @@ class CLRange : public ICLSimpleFunction
public:
/** Initialize the kernel's start, end, step and output tensor.
*
+ * Valid data layouts:
+ * - All
+ *
+ * Valid data type configurations:
+ * |dst |
+ * |:---------|
+ * |U8 |
+ * |S8 |
+ * |QASYMM8 |
+ * |U16 |
+ * |S16 |
+ * |U32 |
+ * |S32 |
+ * |F16 |
+ * |F32 |
+ *
* @param[out] output Output tensor. Data types supported: U8/S8/QASYMM8/U16/S16/U32/S32/F16/F32.
* @param[in] start The starting value of the sequence.
* @param[in] end The ending (not including) value of the sequence.
@@ -55,7 +73,8 @@ public:
* @param[in] end The ending (not including) value of the sequence.
* @param[in] step The gap between each pair of values in the sequence. Default is 1.
*/
- void configure(const CLCompileContext &compile_context, ICLTensor *output, float start, float end, float step = 1.f);
+ void
+ configure(const CLCompileContext &compile_context, ICLTensor *output, float start, float end, float step = 1.f);
/** Static function to check if given info will lead to a valid configuration of @ref CLRange
*
* @param[in] output Output tensor info. Data types supported: U8/S8/QASYMM8/U16/S16/U32/S32/F16/F32.
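
A minimal driver for CLRange; the destination must already be sized to hold (end - start) / step elements, ten in this illustrative example:

    #include "arm_compute/core/TensorInfo.h"
    #include "arm_compute/runtime/CL/CLScheduler.h"
    #include "arm_compute/runtime/CL/CLTensor.h"
    #include "arm_compute/runtime/CL/functions/CLRange.h"

    using namespace arm_compute;

    void run_range()
    {
        CLScheduler::get().default_init();

        CLTensor out; // 10 elements: (10 - 0) / 1
        out.allocator()->init(TensorInfo(TensorShape(10U), 1, DataType::F32));

        CLRange range;
        range.configure(&out, 0.f, 10.f, 1.f); // fills 0, 1, ..., 9

        out.allocator()->allocate();
        range.run();
        CLScheduler::get().sync();
    }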
diff --git a/arm_compute/runtime/CL/functions/CLReduceMean.h b/arm_compute/runtime/CL/functions/CLReduceMean.h
index 57ec48d690..640fe7cf1b 100644
--- a/arm_compute/runtime/CL/functions/CLReduceMean.h
+++ b/arm_compute/runtime/CL/functions/CLReduceMean.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018-2020 ARM Limited.
+ * Copyright (c) 2018-2021 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -24,10 +24,12 @@
#ifndef ARM_COMPUTE_CL_REDUCE_MEAN_H
#define ARM_COMPUTE_CL_REDUCE_MEAN_H
-#include "arm_compute/runtime/CL/ICLSimpleFunction.h"
+#include "arm_compute/runtime/CL/functions/CLDequantizationLayer.h"
#include "arm_compute/runtime/CL/functions/CLElementwiseOperations.h"
+#include "arm_compute/runtime/CL/functions/CLQuantizationLayer.h"
#include "arm_compute/runtime/CL/functions/CLReductionOperation.h"
#include "arm_compute/runtime/CL/functions/CLReshapeLayer.h"
+#include "arm_compute/runtime/CL/ICLSimpleFunction.h"
#include "arm_compute/runtime/IMemoryManager.h"
namespace arm_compute
@@ -43,6 +45,17 @@ public:
CLReduceMean(std::shared_ptr<IMemoryManager> memory_manager = nullptr);
/** Configure kernel
*
+ * Valid data layouts:
+ * - All
+ *
+ * Valid data type configurations:
+ * |src |dst |
+ * |:--------------|:--------------|
+ * |QASYMM8 |QASYMM8 |
+ * |QASYMM8_SIGNED |QASYMM8_SIGNED |
+ * |F16 |F16 |
+ * |F32 |F32 |
+ *
* @note Supported tensor rank: up to 4
*
* @param[in] input Source tensor. Data type supported: QASYMM8/QASYMM8_SIGNED/F16/F32
@@ -61,7 +74,11 @@ public:
* @param[in] keep_dims If positive, retains reduced dimensions with length 1.
* @param[out] output Destination tensor. Data type supported: Same as @p input
*/
- void configure(const CLCompileContext &compile_context, ICLTensor *input, const Coordinates &reduction_axis, bool keep_dims, ICLTensor *output);
+ void configure(const CLCompileContext &compile_context,
+ ICLTensor *input,
+ const Coordinates &reduction_axis,
+ bool keep_dims,
+ ICLTensor *output);
/** Static function to check if given info will lead to a valid configuration of @ref CLReduceMean
*
@@ -72,7 +89,8 @@ public:
*
* @return A status
*/
- static Status validate(const ITensorInfo *input, const Coordinates &reduction_axis, bool keep_dims, const ITensorInfo *output);
+ static Status
+ validate(const ITensorInfo *input, const Coordinates &reduction_axis, bool keep_dims, const ITensorInfo *output);
// Inherited methods overridden:
void run() override;
@@ -82,8 +100,13 @@ private:
std::vector<CLReductionOperation> _reduction_kernels;
std::vector<CLTensor> _reduced_outs;
CLReshapeLayer _reshape;
+ CLDequantizationLayer _dequant;
+ CLQuantizationLayer _requant;
int _reduction_ops;
bool _keep_dims;
+ bool _do_requant;
+ CLTensor _input_no_quant;
+ CLTensor _output_no_quant;
};
} // namespace arm_compute
#endif /* ARM_COMPUTE_CL_REDUCE_MEAN_H */
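
A sketch of reducing over width and height with keep_dims = true, which is what makes the [1, 1, 3] destination shape below line up; shapes are illustrative:

    #include "arm_compute/core/Coordinates.h"
    #include "arm_compute/core/TensorInfo.h"
    #include "arm_compute/runtime/CL/CLScheduler.h"
    #include "arm_compute/runtime/CL/CLTensor.h"
    #include "arm_compute/runtime/CL/functions/CLReduceMean.h"

    using namespace arm_compute;

    void run_reduce_mean()
    {
        CLScheduler::get().default_init();

        CLTensor src, dst;
        src.allocator()->init(TensorInfo(TensorShape(16U, 16U, 3U), 1, DataType::F32));
        dst.allocator()->init(TensorInfo(TensorShape(1U, 1U, 3U), 1, DataType::F32)); // keep_dims = true

        const Coordinates axes(0, 1); // mean over width and height
        CLReduceMean reduce_mean;
        reduce_mean.configure(&src, axes, /* keep_dims */ true, &dst);

        src.allocator()->allocate();
        dst.allocator()->allocate();
        reduce_mean.run();
        CLScheduler::get().sync();
    }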
diff --git a/arm_compute/runtime/CL/functions/CLReductionOperation.h b/arm_compute/runtime/CL/functions/CLReductionOperation.h
index 25cf655802..80068ac35c 100644
--- a/arm_compute/runtime/CL/functions/CLReductionOperation.h
+++ b/arm_compute/runtime/CL/functions/CLReductionOperation.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2017-2021 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -24,21 +24,19 @@
#ifndef ARM_COMPUTE_CLREDUCTIONOPERATION_H
#define ARM_COMPUTE_CLREDUCTIONOPERATION_H
-#include "arm_compute/core/CL/kernels/CLFillBorderKernel.h"
-#include "arm_compute/core/CL/kernels/CLReductionOperationKernel.h"
-#include "arm_compute/core/CL/kernels/CLReshapeLayerKernel.h"
-#include "arm_compute/core/Types.h"
#include "arm_compute/runtime/CL/CLTensor.h"
+#include "arm_compute/runtime/CL/functions/CLReshapeLayer.h"
#include "arm_compute/runtime/IFunction.h"
#include "arm_compute/runtime/IMemoryManager.h"
#include "arm_compute/runtime/MemoryGroup.h"
-#include <cstdint>
#include <memory>
-#include <vector>
namespace arm_compute
{
+// Forward declarations
+class CLCompileContext;
+class CLReductionOperationKernel;
class ICLTensor;
/** Perform reduction operation.
@@ -51,30 +49,58 @@ public:
* @param[in] memory_manager (Optional) Memory manager.
*/
CLReductionOperation(std::shared_ptr<IMemoryManager> memory_manager = nullptr);
+ /** Default Destructor */
+ ~CLReductionOperation();
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ CLReductionOperation(const CLReductionOperation &) = delete;
+ /** Default move constructor */
+ CLReductionOperation(CLReductionOperation &&) = default;
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ CLReductionOperation &operator=(const CLReductionOperation &) = delete;
+ /** Default move assignment operator */
+ CLReductionOperation &operator=(CLReductionOperation &&) = default;
/** Set the input and output tensors.
*
- * @param[in] input Source tensor. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32.
+ * Valid data layouts:
+ * - All
+ *
+ * Valid data type configurations:
+ * |src |dst |
+ * |:--------------|:--------------|
+ * |QASYMM8 |QASYMM8 |
+ * |QASYMM8_SIGNED |QASYMM8_SIGNED |
+ * |F16 |F16 |
+ * |F32 |F32 |
+ * |S32 |S32 |
+ *
+ * @param[in] input Source tensor. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32/S32.
* @param[out] output Destination tensor. Data types and data layouts supported: Same as @p input.
* @param[in] axis Axis along which to reduce. Supported reduction axis : 0, 1, 2, 3
* @param[in] op Reduction operation to perform. Operations supported: MEAN_SUM, PROD, SUM_SQUARE, SUM, MIN, MAX
* @param[in] keep_dims (Optional) Whether to keep the reduced dimension after the operation. Defaults to true.
*/
- void configure(ICLTensor *input, ICLTensor *output, unsigned int axis, ReductionOperation op, bool keep_dims = true);
+ void
+ configure(ICLTensor *input, ICLTensor *output, unsigned int axis, ReductionOperation op, bool keep_dims = true);
/** Set the input and output tensors.
*
* @param[in] compile_context The compile context to be used.
- * @param[in] input Source tensor. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32.
+ * @param[in] input Source tensor. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32/S32.
* @param[out] output Destination tensor. Data types and data layouts supported: Same as @p input.
* @param[in] axis Axis along which to reduce. Supported reduction axis : 0, 1, 2, 3
* @param[in] op Reduction operation to perform. Operations supported: MEAN_SUM, PROD, SUM_SQUARE, SUM, MIN, MAX
* @param[in] keep_dims (Optional) Whether to keep the reduced dimension after the operation. Defaults to true.
*/
- void configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, unsigned int axis, ReductionOperation op, bool keep_dims = true);
+ void configure(const CLCompileContext &compile_context,
+ ICLTensor *input,
+ ICLTensor *output,
+ unsigned int axis,
+ ReductionOperation op,
+ bool keep_dims = true);
/** Static function to check if given info will lead to a valid configuration of @ref CLReductionOperation.
*
- * @param[in] input Source tensor info. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32.
+ * @param[in] input Source tensor info. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32/S32.
* @param[in] output Destination tensor info. Data types and data layouts supported: Same as @p input.
* @param[in] axis Axis along which to reduce. Supported reduction axis : 0, 1, 2, 3
* @param[in] op Reduction operation to perform. Operations supported: MEAN_SUM, PROD, SUM_SQUARE, SUM, MIN, MAX
@@ -82,7 +108,11 @@ public:
*
* @return a status
*/
- static Status validate(const ITensorInfo *input, const ITensorInfo *output, unsigned int axis, ReductionOperation op, bool keep_dims = true);
+ static Status validate(const ITensorInfo *input,
+ const ITensorInfo *output,
+ unsigned int axis,
+ ReductionOperation op,
+ bool keep_dims = true);
// Inherited methods overridden:
void run() override;
@@ -90,16 +120,12 @@ public:
private:
ICLTensor *configure_intermediate_result_vector(ICLTensor *input, ICLTensor *output);
- MemoryGroup _memory_group;
- std::vector<CLTensor> _results_vector;
- std::vector<CLReductionOperationKernel> _reduction_kernels_vector;
- std::vector<CLFillBorderKernel> _border_handlers_vector;
- CLReshapeLayerKernel _reshape_kernel;
- ReductionOperation _op;
- unsigned int _num_of_stages;
- unsigned int _reduction_axis;
- bool _is_serial;
- bool _is_reshape_required;
+ MemoryGroup _memory_group;
+ CLTensor _unreshaped_output;
+ std::unique_ptr<CLReductionOperationKernel> _reduction_kernel;
+ CLReshapeLayer _reshape;
+ unsigned int _reduction_axis;
+ bool _is_reshape_required;
};
} // namespace arm_compute
-#endif /* ARM_COMPUTE_CLREDUCTIONOPERATION_H */
\ No newline at end of file
+#endif /* ARM_COMPUTE_CLREDUCTIONOPERATION_H */
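
A minimal driver for the slimmed-down function above, summing along axis 0 with keep_dims = true; shapes are illustrative:

    #include "arm_compute/core/TensorInfo.h"
    #include "arm_compute/core/Types.h"
    #include "arm_compute/runtime/CL/CLScheduler.h"
    #include "arm_compute/runtime/CL/CLTensor.h"
    #include "arm_compute/runtime/CL/functions/CLReductionOperation.h"

    using namespace arm_compute;

    void run_reduction()
    {
        CLScheduler::get().default_init();

        CLTensor src, dst;
        src.allocator()->init(TensorInfo(TensorShape(8U, 4U), 1, DataType::F32));
        dst.allocator()->init(TensorInfo(TensorShape(1U, 4U), 1, DataType::F32)); // axis 0 reduced, kept as 1

        CLReductionOperation reduce; // the optional IMemoryManager is defaulted
        reduce.configure(&src, &dst, /* axis */ 0U, ReductionOperation::SUM, /* keep_dims */ true);

        src.allocator()->allocate();
        dst.allocator()->allocate();
        reduce.run();
        CLScheduler::get().sync();
    }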
diff --git a/arm_compute/runtime/CL/functions/CLRemap.h b/arm_compute/runtime/CL/functions/CLRemap.h
deleted file mode 100644
index dc8a2c4ecf..0000000000
--- a/arm_compute/runtime/CL/functions/CLRemap.h
+++ /dev/null
@@ -1,73 +0,0 @@
-/*
- * Copyright (c) 2017-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CLREMAP_H
-#define ARM_COMPUTE_CLREMAP_H
-
-#include "arm_compute/core/Types.h"
-#include "arm_compute/runtime/CL/ICLSimpleFunction.h"
-
-#include <cstdint>
-
-namespace arm_compute
-{
-class ICLTensor;
-
-/** Basic function to execute remap. This function calls the following OpenCL kernels:
- *
- * -# @ref CLFillBorderKernel (executed if border_mode == CONSTANT or border_mode == REPLICATE)
- * -# @ref CLRemapKernel
- */
-class CLRemap : public ICLSimpleFunction
-{
-public:
- /** Initialise the function's sources, destination, interpolation policy and border mode.
- *
- * @param[in,out] input Source tensor. Data types supported: U8. (Written to only for @p border_mode != UNDEFINED)
- * @param[in] map_x Map for X coords. Data types supported: F32.
- * @param[in] map_y Map for Y coords. Data types supported: F32.
- * @param[out] output Output tensor. Data types supported: U8.
- * @param[in] policy Interpolation policy to use. Only NEAREST and BILINEAR are supported.
- * @param[in] border_mode Border mode to use on the input tensor.
- * @param[in] constant_border_value (Optional) Constant value to use for borders if border_mode is set to CONSTANT.
- *
- */
- void configure(ICLTensor *input, const ICLTensor *map_x, const ICLTensor *map_y, ICLTensor *output,
- InterpolationPolicy policy, BorderMode border_mode, uint8_t constant_border_value = 0);
- /** Initialise the function's sources, destination, interpolation policy and border mode.
- *
- * @param[in] compile_context The compile context to be used.
- * @param[in,out] input Source tensor. Data types supported: U8. (Written to only for @p border_mode != UNDEFINED)
- * @param[in] map_x Map for X coords. Data types supported: F32.
- * @param[in] map_y Map for Y coords. Data types supported: F32.
- * @param[out] output Output tensor. Data types supported: U8.
- * @param[in] policy Interpolation policy to use. Only NEAREST and BILINEAR are supported.
- * @param[in] border_mode Border mode to use on the input tensor.
- * @param[in] constant_border_value (Optional) Constant value to use for borders if border_mode is set to CONSTANT.
- *
- */
- void configure(const CLCompileContext &compile_context, ICLTensor *input, const ICLTensor *map_x, const ICLTensor *map_y, ICLTensor *output,
- InterpolationPolicy policy, BorderMode border_mode, uint8_t constant_border_value = 0);
-};
-}
-#endif /*ARM_COMPUTE_CLREMAP_H */
diff --git a/arm_compute/runtime/CL/functions/CLReorgLayer.h b/arm_compute/runtime/CL/functions/CLReorgLayer.h
index 8b245ab441..976b8f6213 100644
--- a/arm_compute/runtime/CL/functions/CLReorgLayer.h
+++ b/arm_compute/runtime/CL/functions/CLReorgLayer.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018-2020 ARM Limited.
+ * Copyright (c) 2018-2021 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -29,13 +29,24 @@
namespace arm_compute
{
+class CLCompileContext;
class ICLTensor;
+class ITensorInfo;
class CLReorgLayer : public ICLSimpleFunction
{
public:
/** Initialise the function's source and destination.
*
+ * Valid data layouts:
+ * - NHWC
+ * - NCHW
+ *
+ * Valid data type configurations:
+ * |src |dst |
+ * |:--------------|:--------------|
+ * |All |All |
+ *
* @param[in] input Source tensor. Data types supported: All.
* @param[out] output Destination tensor with tensor shape:
* [width_input / stride, height_input / stride, channels_input * stride * stride, batch_size]. This means the output has
diff --git a/arm_compute/runtime/CL/functions/CLReshapeLayer.h b/arm_compute/runtime/CL/functions/CLReshapeLayer.h
index e91c2c739b..dad90e6ba9 100644
--- a/arm_compute/runtime/CL/functions/CLReshapeLayer.h
+++ b/arm_compute/runtime/CL/functions/CLReshapeLayer.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2017-2021 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -24,18 +24,43 @@
#ifndef ARM_COMPUTE_CLRESHAPELAYER_H
#define ARM_COMPUTE_CLRESHAPELAYER_H
+#include "arm_compute/runtime/CL/ICLOperator.h"
#include "arm_compute/runtime/CL/ICLSimpleFunction.h"
+#include <memory>
+
namespace arm_compute
{
+class CLCompileContext;
class ICLTensor;
+class ITensorInfo;
-/** Basic function to run @ref CLReshapeLayerKernel */
-class CLReshapeLayer : public ICLSimpleFunction
+/** Basic function to run @ref opencl::kernels::ClReshapeKernel */
+class CLReshapeLayer : public IFunction
{
public:
+ /** Default Constructor */
+ CLReshapeLayer();
+ /** Default Destructor */
+ ~CLReshapeLayer();
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ CLReshapeLayer(const CLReshapeLayer &) = delete;
+ /** Default move constructor */
+ CLReshapeLayer(CLReshapeLayer &&);
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ CLReshapeLayer &operator=(const CLReshapeLayer &) = delete;
+ /** Default move assignment operator */
+ CLReshapeLayer &operator=(CLReshapeLayer &&);
/** Initialise the kernel's inputs and outputs
*
+ * Valid data layouts:
+ * - All
+ *
+ * Valid data type configurations:
+ * |src |dst |
+ * |:--------------|:--------------|
+ * |All |All |
+ *
* @param[in] input First tensor input. Data type supported: All
* @param[out] output Output tensor. Data type supported: Same as @p input
*/
@@ -56,6 +81,13 @@ public:
* @return a status
*/
static Status validate(const ITensorInfo *input, const ITensorInfo *output);
+
+ // Inherited methods overridden:
+ void run() override;
+
+private:
+ struct Impl;
+ std::unique_ptr<Impl> _impl;
};
-}
+} // namespace arm_compute
#endif /*ARM_COMPUTE_CLRESHAPELAYER_H */
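
A sketch of the validate-then-configure idiom with the reworked pimpl-based class above; the only real constraint is that source and destination agree on the element count:

    #include "arm_compute/core/Error.h"
    #include "arm_compute/core/TensorInfo.h"
    #include "arm_compute/runtime/CL/CLScheduler.h"
    #include "arm_compute/runtime/CL/CLTensor.h"
    #include "arm_compute/runtime/CL/functions/CLReshapeLayer.h"

    using namespace arm_compute;

    void run_reshape()
    {
        CLScheduler::get().default_init();

        CLTensor src, dst;
        src.allocator()->init(TensorInfo(TensorShape(8U, 2U), 1, DataType::F32));
        dst.allocator()->init(TensorInfo(TensorShape(16U), 1, DataType::F32)); // same element count

        const Status s = CLReshapeLayer::validate(src.info(), dst.info());
        if(s.error_code() != ErrorCode::OK)
            return;

        CLReshapeLayer reshape;
        reshape.configure(&src, &dst);

        src.allocator()->allocate();
        dst.allocator()->allocate();
        reshape.run();
        CLScheduler::get().sync();
    }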
diff --git a/arm_compute/runtime/CL/functions/CLReverse.h b/arm_compute/runtime/CL/functions/CLReverse.h
index 87ae34c89d..46229540b4 100644
--- a/arm_compute/runtime/CL/functions/CLReverse.h
+++ b/arm_compute/runtime/CL/functions/CLReverse.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018-2020 ARM Limited.
+ * Copyright (c) 2018-2021, 2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -21,14 +21,17 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
-#ifndef ARM_COMPUTE_CLREVERSE_H
-#define ARM_COMPUTE_CLREVERSE_H
+#ifndef ACL_ARM_COMPUTE_RUNTIME_CL_FUNCTIONS_CLREVERSE_H
+#define ACL_ARM_COMPUTE_RUNTIME_CL_FUNCTIONS_CLREVERSE_H
+#include "arm_compute/core/Error.h"
#include "arm_compute/runtime/CL/ICLSimpleFunction.h"
namespace arm_compute
{
+class CLCompileContext;
class ICLTensor;
+class ITensorInfo;
/** Basic function to run @ref CLReverseKernel */
class CLReverse : public ICLSimpleFunction
@@ -36,28 +39,44 @@ class CLReverse : public ICLSimpleFunction
public:
/** Initialize the function
*
- * @param[in] input Input tensor. Data types supported: All.
- * @param[out] output Output tensor. Data type supported: Same as @p input
- * @param[in] axis Axis tensor. Contains the indices of the dimensions to reverse. Data type supported: U32
+ * Valid data layouts:
+ * - All
+ *
+ * Valid data type configurations:
+ * |src0 |src1 |dst |
+ * |:--------------|:--------------|:--------------|
+ * |All |U32, S32 |All |
+ *
+ * @param[in] input Input tensor. Data types supported: All.
+ * @param[out] output Output tensor. Data type supported: Same as @p input
+ * @param[in] axis Axis tensor. Contains the indices of the dimensions to reverse. Data type supported: U32/S32
+     * @param[in]  use_inverted_axis Reverse the ACL axis indexing convention; if true, (inverted) axis = (tensor_rank - 1) - axis
*/
- void configure(const ICLTensor *input, ICLTensor *output, const ICLTensor *axis);
+ void configure(const ICLTensor *input, ICLTensor *output, const ICLTensor *axis, bool use_inverted_axis);
/** Initialize the function
*
- * @param[in] compile_context The compile context to be used.
- * @param[in] input Input tensor. Data types supported: All.
- * @param[out] output Output tensor. Data type supported: Same as @p input
- * @param[in] axis Axis tensor. Contains the indices of the dimensions to reverse. Data type supported: U32
+ * @param[in] compile_context The compile context to be used.
+ * @param[in] input Input tensor. Data types supported: All.
+ * @param[out] output Output tensor. Data type supported: Same as @p input
+ * @param[in] axis Axis tensor. Contains the indices of the dimensions to reverse. Data type supported: U32/S32
+     * @param[in]  use_inverted_axis Reverse the ACL axis indexing convention; if true, (inverted) axis = (tensor_rank - 1) - axis
*/
- void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, const ICLTensor *axis);
+ void configure(const CLCompileContext &compile_context,
+ const ICLTensor *input,
+ ICLTensor *output,
+ const ICLTensor *axis,
+ bool use_inverted_axis);
/** Static function to check if given info will lead to a valid configuration of @ref CLReverseKernel
*
- * @param[in] input Input tensor info. Data types supported: All.
- * @param[in] output Output tensor info. Data type supported: Same as @p input
- * @param[in] axis Axis tensor info. Contains the indices of the dimensions to reverse. Data type supported: U32
+ * @param[in] input Input tensor info. Data types supported: All.
+ * @param[in] output Output tensor info. Data type supported: Same as @p input
+ * @param[in] axis Axis tensor info. Contains the indices of the dimensions to reverse. Data type supported: U32/S32
+     * @param[in]  use_inverted_axis Reverse the ACL axis indexing convention; if true, (inverted) axis = (tensor_rank - 1) - axis
*
* @return a status
*/
- static Status validate(const ITensorInfo *input, const ITensorInfo *output, const ITensorInfo *axis);
+ static Status
+ validate(const ITensorInfo *input, const ITensorInfo *output, const ITensorInfo *axis, bool use_inverted_axis);
};
} // namespace arm_compute
-#endif /* ARM_COMPUTE_CLREVERSE_H */
+#endif // ACL_ARM_COMPUTE_RUNTIME_CL_FUNCTIONS_CLREVERSE_H
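
A sketch of the new four-argument configure(), including filling the axis tensor through map()/unmap(); the shapes and the choice of axis 0 are illustrative:

    #include "arm_compute/core/TensorInfo.h"
    #include "arm_compute/runtime/CL/CLScheduler.h"
    #include "arm_compute/runtime/CL/CLTensor.h"
    #include "arm_compute/runtime/CL/functions/CLReverse.h"

    #include <cstdint>

    using namespace arm_compute;

    void run_reverse()
    {
        CLScheduler::get().default_init();

        CLTensor src, dst, axis;
        src.allocator()->init(TensorInfo(TensorShape(8U, 4U), 1, DataType::F32));
        dst.allocator()->init(TensorInfo(TensorShape(8U, 4U), 1, DataType::F32));
        axis.allocator()->init(TensorInfo(TensorShape(1U), 1, DataType::U32)); // one dimension index

        CLReverse reverse;
        reverse.configure(&src, &dst, &axis, /* use_inverted_axis */ false);

        for(CLTensor *t : {&src, &dst, &axis})
            t->allocator()->allocate();

        axis.map(); // write the dimension index to reverse (here: dimension 0)
        *reinterpret_cast<uint32_t *>(axis.buffer()) = 0;
        axis.unmap();

        reverse.run();
        CLScheduler::get().sync();
    }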
diff --git a/arm_compute/runtime/CL/functions/CLScale.h b/arm_compute/runtime/CL/functions/CLScale.h
index f345995138..5c3824eb58 100644
--- a/arm_compute/runtime/CL/functions/CLScale.h
+++ b/arm_compute/runtime/CL/functions/CLScale.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2016-2020 ARM Limited.
+ * Copyright (c) 2016-2021 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -24,65 +24,87 @@
#ifndef ARM_COMPUTE_CLSCALE_H
#define ARM_COMPUTE_CLSCALE_H
+#include "arm_compute/core/KernelDescriptors.h"
#include "arm_compute/core/Types.h"
-#include "arm_compute/runtime/CL/ICLSimpleFunction.h"
+#include "arm_compute/runtime/IFunction.h"
-#include <cstdint>
+#include <memory>
namespace arm_compute
{
+class CLCompileContext;
class ICLTensor;
+class ITensorInfo;
-/** Basic function to run @ref CLScaleKernel */
-class CLScale : public ICLSimpleFunction
+/** Basic function to run @ref opencl::ClScale */
+class CLScale : public IFunction
{
public:
+ /** Default Constructor */
+ CLScale();
+ /** Default Destructor */
+ ~CLScale();
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ CLScale(const CLScale &) = delete;
+ /** Default move constructor */
+ CLScale(CLScale &&) = default;
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ CLScale &operator=(const CLScale &) = delete;
+ /** Default move assignment operator */
+ CLScale &operator=(CLScale &&) = default;
+
/** Initialize the function's source, destination, interpolation type and border_mode.
*
- * @param[in,out] input Source tensor. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/F16/F32. (Written to only for @p border_mode != UNDEFINED)
- * @param[out] output Destination tensor. Data types supported: Same as @p input
- * All but the lowest two dimensions must be the same size as in the input tensor, i.e. scaling is only performed within the XY-plane.
- * @param[in] policy The interpolation type.
- * @param[in] border_mode Strategy to use for borders.
- * @param[in] constant_border_value (Optional) Constant value to use for borders if border_mode is set to CONSTANT.
- * @param[in] sampling_policy (Optional) Sampling policy used by the interpolation. Defaults to @ref SamplingPolicy::CENTER
- * @param[in] use_padding (Optional) Is padding in use or not. Defaults to true.
- * @param[in] align_corners (Optional) Align corners of input and output, only affecting bilinear policy with TOP_LEFT sampling policy. Defaults to false.
+ * Valid data layouts:
+ * - NHWC
+ * - NCHW
+ *
+ * Valid data type configurations:
+ * |src |dst |
+ * |:--------------|:--------------|
+ * |QASYMM8 |QASYMM8 |
+ * |QASYMM8_SIGNED |QASYMM8_SIGNED |
+ * |F16 |F16 |
+ * |F32 |F32 |
+ * |U8 |U8 |
+ * |S16 |S16 |
+ *
+ * @param[in,out] input Source tensor. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/F16/F32. (Written to only for @p border_mode != UNDEFINED)
+ * @param[out] output Destination tensor. Data types supported: Same as @p input
+ * All but the lowest two dimensions must be the same size as in the input tensor, i.e. scaling is only performed within the XY-plane.
+ * @param[in] info @ref ScaleKernelInfo descriptor to be used to configure
*/
- void configure(ICLTensor *input, ICLTensor *output, InterpolationPolicy policy, BorderMode border_mode, PixelValue constant_border_value = PixelValue(),
- SamplingPolicy sampling_policy = SamplingPolicy::CENTER, bool use_padding = true, bool align_corners = false);
+ void configure(ICLTensor *input, ICLTensor *output, const ScaleKernelInfo &info);
/** Initialize the function's source, destination, interpolation type and border_mode.
*
- * @param[in] compile_context The compile context to be used.
- * @param[in,out] input Source tensor. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/F16/F32. (Written to only for @p border_mode != UNDEFINED)
- * @param[out] output Destination tensor. Data types supported: Same as @p input
- * All but the lowest two dimensions must be the same size as in the input tensor, i.e. scaling is only performed within the XY-plane.
- * @param[in] policy The interpolation type.
- * @param[in] border_mode Strategy to use for borders.
- * @param[in] constant_border_value (Optional) Constant value to use for borders if border_mode is set to CONSTANT.
- * @param[in] sampling_policy (Optional) Sampling policy used by the interpolation. Defaults to @ref SamplingPolicy::CENTER
- * @param[in] use_padding (Optional) Is padding in use or not. Defaults to true.
- * @param[in] align_corners (Optional) Align corners of input and output, only affecting bilinear policy with TOP_LEFT sampling policy. Defaults to false.
+ * @param[in] compile_context The compile context to be used.
+ * @param[in,out] input Source tensor. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/F16/F32. (Written to only for @p border_mode != UNDEFINED)
+ * @param[out] output Destination tensor. Data types supported: Same as @p input
+ * All but the lowest two dimensions must be the same size as in the input tensor, i.e. scaling is only performed within the XY-plane.
+ * @param[in] info @ref ScaleKernelInfo descriptor to be used to configure
*/
- void configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, InterpolationPolicy policy, BorderMode border_mode, PixelValue constant_border_value = PixelValue(),
- SamplingPolicy sampling_policy = SamplingPolicy::CENTER, bool use_padding = true, bool align_corners = false);
+ void configure(const CLCompileContext &compile_context,
+ ICLTensor *input,
+ ICLTensor *output,
+ const ScaleKernelInfo &info);
/** Static function to check if given info will lead to a valid configuration of @ref CLScale
*
- * @param[in] input Source tensor info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/F16/F32.
- * @param[in] output Output tensor info. Data type supported: Same as @p input
- * All but the lowest two dimensions must be the same size as in the input tensor, i.e. scaling is only performed within the XY-plane.
- * @param[in] policy The interpolation type.
- * @param[in] border_mode Strategy to use for borders.
- * @param[in] constant_border_value (Optional) Constant value to use for borders if border_mode is set to CONSTANT.
- * @param[in] sampling_policy (Optional) Sampling policy used by the interpolation. Defaults to @ref SamplingPolicy::CENTER
- * @param[in] use_padding (Optional) Is padding in use or not. Defaults to true.
- * @param[in] align_corners (Optional) Align corners of input and output, only affecting bilinear policy with TOP_LEFT sampling policy. Defaults to false.
+ * @param[in] input Source tensor info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/F16/F32.
+ * @param[in] output Output tensor info. Data type supported: Same as @p input
+ * All but the lowest two dimensions must be the same size as in the input tensor, i.e. scaling is only performed within the XY-plane.
+ * @param[in] info @ref ScaleKernelInfo descriptor to be used to validate
*
* @return a status
*/
- static Status validate(const ITensorInfo *input, const ITensorInfo *output, InterpolationPolicy policy, BorderMode border_mode, PixelValue constant_border_value = PixelValue(),
- SamplingPolicy sampling_policy = SamplingPolicy::CENTER, bool use_padding = true, bool align_corners = false);
+ static Status validate(const ITensorInfo *input, const ITensorInfo *output, const ScaleKernelInfo &info);
+
+ // Inherited methods overridden:
+ void run() override;
+
+private:
+ struct Impl;
+ std::unique_ptr<Impl> _impl;
};
-}
+} // namespace arm_compute
#endif /*ARM_COMPUTE_CLSCALE_H */
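The rewritten CLScale collapses the former argument list (policy, border mode, sampling policy, ...) into a single ScaleKernelInfo descriptor. A sketch of the new call, assuming ScaleKernelInfo is constructible from an interpolation policy and a border mode with the remaining fields defaulted:

#include "arm_compute/core/Error.h"
#include "arm_compute/core/KernelDescriptors.h"
#include "arm_compute/runtime/CL/CLTensor.h"
#include "arm_compute/runtime/CL/functions/CLScale.h"

using namespace arm_compute;

void run_scale(CLTensor &src, CLTensor &dst)
{
    // Interpolation and border handling now travel together in one descriptor.
    const ScaleKernelInfo info(InterpolationPolicy::BILINEAR, BorderMode::REPLICATE);

    ARM_COMPUTE_ERROR_THROW_ON(CLScale::validate(src.info(), dst.info(), info));

    CLScale scale;
    scale.configure(&src, &dst, info);
    scale.run();
}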
diff --git a/arm_compute/runtime/CL/functions/CLScatter.h b/arm_compute/runtime/CL/functions/CLScatter.h
new file mode 100644
index 0000000000..973953624e
--- /dev/null
+++ b/arm_compute/runtime/CL/functions/CLScatter.h
@@ -0,0 +1,110 @@
+/*
+ * Copyright (c) 2024 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef ACL_ARM_COMPUTE_RUNTIME_CL_FUNCTIONS_CLSCATTER_H
+#define ACL_ARM_COMPUTE_RUNTIME_CL_FUNCTIONS_CLSCATTER_H
+
+#include "arm_compute/core/Error.h"
+#include "arm_compute/runtime/IFunction.h"
+
+#include <memory>
+
+namespace arm_compute
+{
+class ICLTensor;
+class ITensorInfo;
+struct ScatterInfo;
+class CLCompileContext;
+
+/** Function to compute ScatterND Layer */
+class CLScatter : public IFunction
+{
+public:
+ /** Default Constructor */
+ CLScatter();
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ CLScatter(const CLScatter &) = delete;
+ /** Default move constructor */
+ CLScatter(CLScatter &&);
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ CLScatter &operator=(const CLScatter &) = delete;
+ /** Default move assignment operator */
+ CLScatter &operator=(CLScatter &&);
+ /** Default destructor */
+ ~CLScatter();
+ /** Initialise the kernel's inputs and outputs
+ *
+ * @note Negative indices are treated as out of bounds.
+ *
+ * Valid data layouts:
+ * - All
+ *
+ * @param[in] compile_context The compile context to be used.
+ * @param[in] src Source tensor. Values used to fill the output. Can be nullptr when zero initialization is enabled.
+ * @param[in] updates Tensor containing the values used to update the output tensor. Data types supported: same as @p src
+ * @param[in] indices Tensor containing the indices to change in the output tensor. Data types supported: S32
+ * @param[out] output Destination tensor. Data types supported: same as @p src.
+ * @param[in] info Scatter info object.
+ */
+ void configure(const CLCompileContext &compile_context,
+ const ICLTensor *src,
+ const ICLTensor *updates,
+ const ICLTensor *indices,
+ ICLTensor *output,
+ const ScatterInfo &info);
+ /** Initialise the kernel's inputs and output
+ *
+ * Similar to @ref CLScatter::configure()
+ */
+ void configure(const ICLTensor *src,
+ const ICLTensor *updates,
+ const ICLTensor *indices,
+ ICLTensor *output,
+ const ScatterInfo &info);
+ /** Static function to check if given info will lead to a valid configuration of @ref CLScatter
+ *
+ * @param[in] src Source tensor info.
+ * @param[in] updates Tensor info containing the values used to update the output tensor. Data types supported: same as @p src
+ * @param[in] indices Tensor info containing the indices to change in the output tensor. Data types supported: S32
+ * @param[in] output Destination tensor info. Data types supported: same as @p src.
+ * @param[in] info Scatter info describing the type of scatter.
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *src,
+ const ITensorInfo *updates,
+ const ITensorInfo *indices,
+ const ITensorInfo *output,
+ const ScatterInfo &info);
+
+ // Inherited methods overridden:
+ void run() override;
+
+private:
+ struct Impl;
+ std::unique_ptr<Impl> _impl;
+};
+} // namespace arm_compute
+
+#endif // ACL_ARM_COMPUTE_RUNTIME_CL_FUNCTIONS_CLSCATTER_H
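A validation-first sketch for the new CLScatter. The header above only forward-declares ScatterInfo, so the include path and constructor used below are assumptions:

#include "arm_compute/core/Error.h"
#include "arm_compute/runtime/CL/CLTensor.h"
#include "arm_compute/runtime/CL/functions/CLScatter.h"
#include "arm_compute/function_info/ScatterInfo.h" // assumed location of ScatterInfo

using namespace arm_compute;

void run_scatter(CLTensor &src, CLTensor &updates, CLTensor &indices, CLTensor &output)
{
    // Assumed ScatterInfo shape: a reduction function plus a zero-initialization flag.
    const ScatterInfo info(ScatterFunction::Update, /* zero_initialization */ false);

    ARM_COMPUTE_ERROR_THROW_ON(
        CLScatter::validate(src.info(), updates.info(), indices.info(), output.info(), info));

    CLScatter scatter;
    scatter.configure(&src, &updates, &indices, &output, info);
    scatter.run();
}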
diff --git a/arm_compute/runtime/CL/functions/CLScharr3x3.h b/arm_compute/runtime/CL/functions/CLScharr3x3.h
deleted file mode 100644
index b25b548eaa..0000000000
--- a/arm_compute/runtime/CL/functions/CLScharr3x3.h
+++ /dev/null
@@ -1,70 +0,0 @@
-/*
- * Copyright (c) 2016-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CLSCHARR3X3_H
-#define ARM_COMPUTE_CLSCHARR3X3_H
-
-#include "arm_compute/core/Types.h"
-#include "arm_compute/runtime/CL/ICLSimpleFunction.h"
-
-#include <cstdint>
-
-namespace arm_compute
-{
-class ICLTensor;
-
-/** Basic function to execute scharr 3x3 filter. This function calls the following OpenCL kernels:
- *
- * -# @ref CLFillBorderKernel (executed if border_mode == CONSTANT or border_mode == REPLICATE)
- * -# @ref CLScharr3x3Kernel
- *
- */
-class CLScharr3x3 : public ICLSimpleFunction
-{
-public:
- /** Initialise the function's source, destinations and border mode.
- *
- * @note At least one of output_x or output_y must be not NULL.
- *
- * @param[in,out] input Source tensor. Data types supported: U8. (Written to only for @p border_mode != UNDEFINED)
- * @param[out] output_x (optional) Destination for the Scharr 3x3 convolution along the X axis. Data types supported: S16.
- * @param[out] output_y (optional) Destination for the Scharr 3x3 convolution along the Y axis. Data types supported: S16.
- * @param[in] border_mode Border mode to use for the convolution.
- * @param[in] constant_border_value (Optional) Constant value to use for borders if border_mode is set to CONSTANT.
- */
- void configure(ICLTensor *input, ICLTensor *output_x, ICLTensor *output_y, BorderMode border_mode, uint8_t constant_border_value = 0);
- /** Initialise the function's source, destinations and border mode.
- *
- * @note At least one of output_x or output_y must be not NULL.
- *
- * @param[in] compile_context The compile context to be used.
- * @param[in,out] input Source tensor. Data types supported: U8. (Written to only for @p border_mode != UNDEFINED)
- * @param[out] output_x (optional) Destination for the Scharr 3x3 convolution along the X axis. Data types supported: S16.
- * @param[out] output_y (optional) Destination for the Scharr 3x3 convolution along the Y axis. Data types supported: S16.
- * @param[in] border_mode Border mode to use for the convolution.
- * @param[in] constant_border_value (Optional) Constant value to use for borders if border_mode is set to CONSTANT.
- */
- void configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output_x, ICLTensor *output_y, BorderMode border_mode, uint8_t constant_border_value = 0);
-};
-}
-#endif /*ARM_COMPUTE_CLSCHARR3X3_H */
diff --git a/arm_compute/runtime/CL/functions/CLSelect.h b/arm_compute/runtime/CL/functions/CLSelect.h
index 84d0997149..effcb58313 100644
--- a/arm_compute/runtime/CL/functions/CLSelect.h
+++ b/arm_compute/runtime/CL/functions/CLSelect.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018-2020 ARM Limited.
+ * Copyright (c) 2018-2021 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -24,14 +24,15 @@
#ifndef ARM_COMPUTE_CLSELECT_H
#define ARM_COMPUTE_CLSELECT_H
-#include "arm_compute/runtime/CL/ICLSimpleFunction.h"
-
#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/CL/ICLSimpleFunction.h"
namespace arm_compute
{
// Forward declarations
+class CLCompileContext;
class ICLTensor;
+class ITensorInfo;
/** Basic function to run @ref CLSelect */
class CLSelect : public ICLSimpleFunction
@@ -39,6 +40,14 @@ class CLSelect : public ICLSimpleFunction
public:
/** Initialise the kernel's inputs and output.
*
+ * Valid data layouts:
+ * - All
+ *
+ * Valid data type configurations:
+ * |src0 |src1 |src2 |dst |
+ * |:--------------|:--------------|:------|:--------------|
+ * |U8 |All |All |All |
+ *
* @param[in] c Condition input tensor. Data types supported: U8.
* @param[in] x First input tensor. Data types supported: All.
* @param[in] y Second input tensor. Data types supported: Same as @p x
@@ -53,7 +62,11 @@ public:
* @param[in] y Second input tensor. Data types supported: Same as @p x
* @param[out] output Output tensor. Data types supported: Same as @p x.
*/
- void configure(const CLCompileContext &compile_context, const ICLTensor *c, const ICLTensor *x, const ICLTensor *y, ICLTensor *output);
+ void configure(const CLCompileContext &compile_context,
+ const ICLTensor *c,
+ const ICLTensor *x,
+ const ICLTensor *y,
+ ICLTensor *output);
/** Static function to check if given info will lead to a valid configuration of @ref CLSelect
*
* @param[in] c Condition input tensor. Data types supported: U8.
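CLSelect keeps its simple-function shape; this patch only adjusts forward declarations and formatting. A minimal elementwise-select sketch (tensor names illustrative):

#include "arm_compute/runtime/CL/CLTensor.h"
#include "arm_compute/runtime/CL/functions/CLSelect.h"

using namespace arm_compute;

void run_select(CLTensor &c, CLTensor &x, CLTensor &y, CLTensor &output)
{
    CLSelect select;
    // output[i] = c[i] ? x[i] : y[i]; c is U8, x/y/output share one data type.
    select.configure(&c, &x, &y, &output);
    select.run();
}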
diff --git a/arm_compute/runtime/CL/functions/CLSlice.h b/arm_compute/runtime/CL/functions/CLSlice.h
index a8c6e1ff0b..7a274ded72 100644
--- a/arm_compute/runtime/CL/functions/CLSlice.h
+++ b/arm_compute/runtime/CL/functions/CLSlice.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018-2020 ARM Limited.
+ * Copyright (c) 2018-2021 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -24,19 +24,42 @@
#ifndef ARM_COMPUTE_CL_SLICE_H
#define ARM_COMPUTE_CL_SLICE_H
-#include "arm_compute/runtime/CL/ICLSimpleFunction.h"
+#include "arm_compute/runtime/CL/ICLOperator.h"
+#include "arm_compute/runtime/IFunction.h"
namespace arm_compute
{
// Forward Declarations
class ICLTensor;
+class CLCompileContext;
+class ITensorInfo;
/** Basic function to perform tensor slicing */
-class CLSlice : public ICLSimpleFunction
+class CLSlice : public IFunction
{
public:
+ /** Default Constructor */
+ CLSlice();
+ /** Default Destructor */
+ ~CLSlice();
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ CLSlice(const CLSlice &) = delete;
+ /** Default move constructor */
+ CLSlice(CLSlice &&);
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ CLSlice &operator=(const CLSlice &) = delete;
+ /** Default move assignment operator */
+ CLSlice &operator=(CLSlice &&);
/** Configure kernel
*
+ * Valid data layouts:
+ * - All
+ *
+ * Valid data type configurations:
+ * |src |dst |
+ * |:--------------|:--------------|
+ * |All |All |
+ *
* @note Supported tensor rank: up to 4
* @note Start indices must be non-negative. 0 <= starts[i]
* @note End coordinates can be negative, which represents the number of elements before the end of that dimension.
@@ -61,7 +84,11 @@ public:
* @param[in] starts The starts of the dimensions of the input tensor to be sliced. The length must be of rank(input).
* @param[in] ends The ends of the dimensions of the input tensor to be sliced. The length must be of rank(input).
*/
- void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, const Coordinates &starts, const Coordinates &ends);
+ void configure(const CLCompileContext &compile_context,
+ const ICLTensor *input,
+ ICLTensor *output,
+ const Coordinates &starts,
+ const Coordinates &ends);
/** Static function to check if given info will lead to a valid configuration of @ref CLSlice
*
@@ -77,7 +104,59 @@ public:
*
* @return A status
*/
- static Status validate(const ITensorInfo *input, const ITensorInfo *output, const Coordinates &starts, const Coordinates &ends);
+ static Status
+ validate(const ITensorInfo *input, const ITensorInfo *output, const Coordinates &starts, const Coordinates &ends);
+
+ // Inherited methods overridden:
+ void run() override;
+
+private:
+ struct Impl;
+ std::unique_ptr<Impl> _impl;
+};
+
+namespace experimental
+{
+/** Basic function to perform tensor slicing */
+class CLSlice : public ICLOperator
+{
+public:
+ /** Configure kernel
+ *
+ * @note Supported tensor rank: up to 4
+ * @note Start indices must be non-negative. 0 <= starts[i]
+ * @note End coordinates can be negative, which represents the number of elements before the end of that dimension.
+ * @note End indices are not inclusive unless negative.
+ *
+ * @param[in] compile_context The compile context to be used.
+ * @param[in] input Source tensor info. Data type supported: All.
+ * @param[out] output Destination tensor info. Data type supported: Same as @p input
+ * @param[in] starts The starts of the dimensions of the input tensor to be sliced. The length must be of rank(input).
+ * @param[in] ends The ends of the dimensions of the input tensor to be sliced. The length must be of rank(input).
+ */
+ void configure(const CLCompileContext &compile_context,
+ const ITensorInfo *input,
+ ITensorInfo *output,
+ const Coordinates &starts,
+ const Coordinates &ends);
+
+ /** Static function to check if given info will lead to a valid configuration of @ref CLSlice
+ *
+ * @note Supported tensor rank: up to 4
+ * @note Start indices must be non-negative. 0 <= starts[i]
+ * @note End coordinates can be negative, which represents the number of elements before the end of that dimension.
+ * @note End indices are not inclusive unless negative.
+ *
+ * @param[in] input Source tensor info. Data type supported: All
+ * @param[in] output Destination tensor info. Data type supported: Same as @p input
+ * @param[in] starts The starts of the dimensions of the input tensor to be sliced. The length must be of rank(input).
+ * @param[in] ends The ends of the dimensions of the input tensor to be sliced. The length must be of rank(input).
+ *
+ * @return A status
+ */
+ static Status
+ validate(const ITensorInfo *input, const ITensorInfo *output, const Coordinates &starts, const Coordinates &ends);
};
+} // namespace experimental
} // namespace arm_compute
#endif /* ARM_COMPUTE_CL_SLICE_H */
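CLSlice now splits into a runtime function and an experimental operator working on ITensorInfo. A sketch of the runtime path, assuming a rank-2 input whose output shape matches the slice (coordinates illustrative):

#include "arm_compute/core/Error.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/runtime/CL/CLTensor.h"
#include "arm_compute/runtime/CL/functions/CLSlice.h"

using namespace arm_compute;

void run_slice(CLTensor &input, CLTensor &output)
{
    const Coordinates starts(0, 1); // per-dimension start indices, non-negative
    const Coordinates ends(2, 3);   // exclusive unless negative

    ARM_COMPUTE_ERROR_THROW_ON(CLSlice::validate(input.info(), output.info(), starts, ends));

    CLSlice slice;
    slice.configure(&input, &output, starts, ends);
    slice.run();
}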
diff --git a/arm_compute/runtime/CL/functions/CLSobel3x3.h b/arm_compute/runtime/CL/functions/CLSobel3x3.h
deleted file mode 100644
index 24bc0cda43..0000000000
--- a/arm_compute/runtime/CL/functions/CLSobel3x3.h
+++ /dev/null
@@ -1,70 +0,0 @@
-/*
- * Copyright (c) 2016-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CLSOBEL3X3_H
-#define ARM_COMPUTE_CLSOBEL3X3_H
-
-#include "arm_compute/core/Types.h"
-#include "arm_compute/runtime/CL/ICLSimpleFunction.h"
-
-#include <cstdint>
-
-namespace arm_compute
-{
-class ICLTensor;
-
-/** Basic function to execute sobel 3x3 filter. This function calls the following OpenCL kernels:
- *
- * -# @ref CLFillBorderKernel (executed if border_mode == CONSTANT or border_mode == REPLICATE)
- * -# @ref CLSobel3x3Kernel
- *
- */
-class CLSobel3x3 : public ICLSimpleFunction
-{
-public:
- /** Initialise the function's source, destinations and border mode.
- *
- * @note At least one of output_x or output_y must be not NULL.
- *
- * @param[in,out] input Source tensor. Data types supported: U8. (Written to only for @p border_mode != UNDEFINED)
- * @param[out] output_x (optional) Destination for the Sobel 3x3 convolution along the X axis. Data types supported: S16.
- * @param[out] output_y (optional) Destination for the Sobel 3x3 convolution along the Y axis. Data types supported: S16.
- * @param[in] border_mode Border mode to use for the convolution.
- * @param[in] constant_border_value (Optional) Constant value to use for borders if border_mode is set to CONSTANT.
- */
- void configure(ICLTensor *input, ICLTensor *output_x, ICLTensor *output_y, BorderMode border_mode, uint8_t constant_border_value = 0);
- /** Initialise the function's source, destinations and border mode.
- *
- * @note At least one of output_x or output_y must be not NULL.
- *
- * @param[in] compile_context The compile context to be used.
- * @param[in,out] input Source tensor. Data types supported: U8. (Written to only for @p border_mode != UNDEFINED)
- * @param[out] output_x (optional) Destination for the Sobel 3x3 convolution along the X axis. Data types supported: S16.
- * @param[out] output_y (optional) Destination for the Sobel 3x3 convolution along the Y axis. Data types supported: S16.
- * @param[in] border_mode Border mode to use for the convolution.
- * @param[in] constant_border_value (Optional) Constant value to use for borders if border_mode is set to CONSTANT.
- */
- void configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output_x, ICLTensor *output_y, BorderMode border_mode, uint8_t constant_border_value = 0);
-};
-}
-#endif /*ARM_COMPUTE_CLSOBEL3X3_H */
diff --git a/arm_compute/runtime/CL/functions/CLSobel5x5.h b/arm_compute/runtime/CL/functions/CLSobel5x5.h
deleted file mode 100644
index bf266270c3..0000000000
--- a/arm_compute/runtime/CL/functions/CLSobel5x5.h
+++ /dev/null
@@ -1,93 +0,0 @@
-/*
- * Copyright (c) 2016-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CLSOBEL5X5_H
-#define ARM_COMPUTE_CLSOBEL5X5_H
-
-#include "arm_compute/core/CL/kernels/CLFillBorderKernel.h"
-#include "arm_compute/core/CL/kernels/CLSobel5x5Kernel.h"
-#include "arm_compute/core/Types.h"
-#include "arm_compute/runtime/CL/CLTensor.h"
-#include "arm_compute/runtime/IFunction.h"
-#include "arm_compute/runtime/IMemoryManager.h"
-#include "arm_compute/runtime/MemoryGroup.h"
-
-#include <cstdint>
-#include <memory>
-
-namespace arm_compute
-{
-class ICLTensor;
-
-/** Basic function to execute sobel 5x5 filter. This function calls the following OpenCL kernels:
- *
- * -# @ref CLFillBorderKernel (executed if border_mode == CONSTANT or border_mode == REPLICATE)
- * -# @ref CLSobel5x5HorKernel
- * -# @ref CLSobel5x5VertKernel
- *
- */
-class CLSobel5x5 : public IFunction
-{
-public:
- /** Default Constructor.
- *
- * @param[in] memory_manager (Optional) Memory manager.
- */
- CLSobel5x5(std::shared_ptr<IMemoryManager> memory_manager = nullptr);
- /** Initialise the function's source, destinations and border mode.
- *
- * @note At least one of output_x or output_y must be not NULL.
- *
- * @param[in,out] input Source tensor. Data types supported: U8. (Written to only for @p border_mode != UNDEFINED)
- * @param[out] output_x (optional) Destination for the Sobel 5x5 convolution along the X axis. Data types supported: S16.
- * @param[out] output_y (optional) Destination for the Sobel 5x5 convolution along the Y axis. Data types supported: S16.
- * @param[in] border_mode Border mode to use for the convolution.
- * @param[in] constant_border_value (Optional) Constant value to use for borders if border_mode is set to CONSTANT.
- */
- void configure(ICLTensor *input, ICLTensor *output_x, ICLTensor *output_y, BorderMode border_mode, uint8_t constant_border_value = 0);
- /** Initialise the function's source, destinations and border mode.
- *
- * @note At least one of output_x or output_y must be not NULL.
- *
- * @param[in] compile_context The compile context to be used.
- * @param[in,out] input Source tensor. Data types supported: U8. (Written to only for @p border_mode != UNDEFINED)
- * @param[out] output_x (optional) Destination for the Sobel 5x5 convolution along the X axis. Data types supported: S16.
- * @param[out] output_y (optional) Destination for the Sobel 5x5 convolution along the Y axis. Data types supported: S16.
- * @param[in] border_mode Border mode to use for the convolution.
- * @param[in] constant_border_value (Optional) Constant value to use for borders if border_mode is set to CONSTANT.
- */
- void configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output_x, ICLTensor *output_y, BorderMode border_mode, uint8_t constant_border_value = 0);
-
- // Inherited methods overridden:
- void run() override;
-
-protected:
- MemoryGroup _memory_group; /**< Function's memory group */
- CLSobel5x5HorKernel _sobel_hor; /**< Sobel Horizontal 5x5 kernel */
- CLSobel5x5VertKernel _sobel_vert; /**< Sobel Vertical 5x5 kernel */
- CLFillBorderKernel _border_handler; /**< Kernel to handle image borders */
- CLImage _tmp_x; /**< Temporary buffer for Sobel X */
- CLImage _tmp_y; /**< Temporary buffer for Sobel Y */
-};
-}
-#endif /*ARM_COMPUTE_CLSOBEL5X5_H */
diff --git a/arm_compute/runtime/CL/functions/CLSobel7x7.h b/arm_compute/runtime/CL/functions/CLSobel7x7.h
deleted file mode 100644
index 13932c704a..0000000000
--- a/arm_compute/runtime/CL/functions/CLSobel7x7.h
+++ /dev/null
@@ -1,93 +0,0 @@
-/*
- * Copyright (c) 2016-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CLSOBEL7X7_H
-#define ARM_COMPUTE_CLSOBEL7X7_H
-
-#include "arm_compute/core/CL/kernels/CLFillBorderKernel.h"
-#include "arm_compute/core/CL/kernels/CLSobel7x7Kernel.h"
-#include "arm_compute/core/Types.h"
-#include "arm_compute/runtime/CL/CLTensor.h"
-#include "arm_compute/runtime/IFunction.h"
-#include "arm_compute/runtime/IMemoryManager.h"
-#include "arm_compute/runtime/MemoryGroup.h"
-
-#include <cstdint>
-#include <memory>
-
-namespace arm_compute
-{
-class ICLTensor;
-
-/** Basic function to execute sobel 7x7 filter. This function calls the following OpenCL kernels:
- *
- * -# @ref CLFillBorderKernel (executed if border_mode == CONSTANT or border_mode == REPLICATE)
- * -# @ref CLSobel7x7HorKernel
- * -# @ref CLSobel7x7VertKernel
- *
- */
-class CLSobel7x7 : public IFunction
-{
-public:
- /** Default Constructor.
- *
- * @param[in] memory_manager (Optional) Memory manager.
- */
- CLSobel7x7(std::shared_ptr<IMemoryManager> memory_manager = nullptr);
- /** Initialise the function's source, destinations and border mode.
- *
- * @note At least one of output_x or output_y must be not NULL.
- *
- * @param[in,out] input Source tensor. Data types supported: U8. (Written to only for @p border_mode != UNDEFINED)
- * @param[out] output_x (optional) Destination for the Sobel 7x7 convolution along the X axis. Data types supported: S32.
- * @param[out] output_y (optional) Destination for the Sobel 7x7 convolution along the Y axis. Data types supported: S32.
- * @param[in] border_mode Border mode to use for the convolution.
- * @param[in] constant_border_value (Optional) Constant value to use for borders if border_mode is set to CONSTANT.
- */
- void configure(ICLTensor *input, ICLTensor *output_x, ICLTensor *output_y, BorderMode border_mode, uint8_t constant_border_value = 0);
- /** Initialise the function's source, destinations and border mode.
- *
- * @note At least one of output_x or output_y must be not NULL.
- *
- * @param[in] compile_context The compile context to be used.
- * @param[in,out] input Source tensor. Data types supported: U8. (Written to only for @p border_mode != UNDEFINED)
- * @param[out] output_x (optional) Destination for the Sobel 7x7 convolution along the X axis. Data types supported: S32.
- * @param[out] output_y (optional) Destination for the Sobel 7x7 convolution along the Y axis. Data types supported: S32.
- * @param[in] border_mode Border mode to use for the convolution.
- * @param[in] constant_border_value (Optional) Constant value to use for borders if border_mode is set to CONSTANT.
- */
- void configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output_x, ICLTensor *output_y, BorderMode border_mode, uint8_t constant_border_value = 0);
-
- // Inherited methods overridden:
- void run() override;
-
-protected:
- MemoryGroup _memory_group; /**< Function's memory group */
- CLSobel7x7HorKernel _sobel_hor; /**< Sobel Horizontal 7x7 kernel */
- CLSobel7x7VertKernel _sobel_vert; /**< Sobel Vertical 7x7 kernel */
- CLFillBorderKernel _border_handler; /**< Kernel to handle image borders */
- CLImage _tmp_x; /**< Temporary buffer for Sobel X */
- CLImage _tmp_y; /**< Temporary buffer for Sobel Y */
-};
-}
-#endif /*ARM_COMPUTE_CLSOBEL7X7_H */
diff --git a/arm_compute/runtime/CL/functions/CLSoftmaxLayer.h b/arm_compute/runtime/CL/functions/CLSoftmaxLayer.h
index fadbc430e6..68541e35c5 100644
--- a/arm_compute/runtime/CL/functions/CLSoftmaxLayer.h
+++ b/arm_compute/runtime/CL/functions/CLSoftmaxLayer.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2017-2021, 2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -21,13 +21,9 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
-#ifndef ARM_COMPUTE_CLSOFTMAXLAYER_H
-#define ARM_COMPUTE_CLSOFTMAXLAYER_H
+#ifndef ACL_ARM_COMPUTE_RUNTIME_CL_FUNCTIONS_CLSOFTMAXLAYER_H
+#define ACL_ARM_COMPUTE_RUNTIME_CL_FUNCTIONS_CLSOFTMAXLAYER_H
-#include "arm_compute/core/CL/kernels/CLFlattenLayerKernel.h"
-#include "arm_compute/core/CL/kernels/CLReshapeLayerKernel.h"
-#include "arm_compute/core/CL/kernels/CLSoftmaxLayerKernel.h"
-#include "arm_compute/runtime/CL/CLTensor.h"
#include "arm_compute/runtime/IFunction.h"
#include "arm_compute/runtime/IMemoryManager.h"
#include "arm_compute/runtime/MemoryGroup.h"
@@ -37,6 +33,8 @@
namespace arm_compute
{
class ICLTensor;
+class ITensorInfo;
+class CLCompileContext;
/** Basic function to compute a SoftmaxLayer.
*
@@ -44,12 +42,7 @@ class ICLTensor;
* @f[ out = exp((x - max(x)) * beta) / sum(exp((x - max(x)) * beta)) @f]
*
* Log Softmax is calculated by :
- * @f[ out = (x - max(x) * beta) - \sum{e^{x - max(x) * beta}} @f]
- *
- * This function runs the following kernels:
- * -# @ref CLLogits1DMaxKernel
- * -# @ref CLLogits1DShiftExpSumKernel
- * -# @ref CLLogits1DNormKernel
+ * @f[ out = ((x - max(x)) * beta) - log(\sum{e^{(x - max(x)) * beta}}) @f]
*/
template <bool IS_LOG = false>
class CLSoftmaxLayerGeneric : public IFunction
@@ -57,87 +50,63 @@ class CLSoftmaxLayerGeneric : public IFunction
public:
/** Constructor */
CLSoftmaxLayerGeneric(std::shared_ptr<IMemoryManager> memory_manager = nullptr);
+ /** Default destructor */
+ ~CLSoftmaxLayerGeneric();
/** Set the input and output tensors.
*
- * @param[in] input Source tensor. Data types supported: QASYMM8/F16/F32
+ * Valid data layouts:
+ * - All
+ *
+ * Valid data type configurations:
+ * |src |dst |
+ * |:--------------|:--------------|
+ * |QASYMM8 |QASYMM8 |
+ * |QASYMM8_SIGNED |QASYMM8_SIGNED |
+ * |F16 |F16 |
+ * |F32 |F32 |
+ *
+ * @param[in] input Source tensor. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32 for Softmax and F16/F32 for Log Softmax
* @param[out] output Destination tensor. Data types supported: same as @p input
* @param[in] beta (Optional) A scaling factor for the exponent. Defaults to 1.f
- * @param[in] axis (Optional) Reduction axis. It has the purpose of squashing the first @p axis
- * dimensions together. For instance, given a [4x4x4x4] image,
- * when @p axis is 2, the Softmax reduction will be applied on each of the [4x4] planes of the input image.
+ * @param[in] axis (Optional) The dimension in which to apply the function. E.g. for input of shape 4x5x6 and
+ * axis=1, softmax will be applied to 4x6=24 vectors of size 5. Defaults to 0
*/
- void configure(const ICLTensor *input, ICLTensor *output, float beta = 1.0f, size_t axis = 1);
+ void configure(const ICLTensor *input, ICLTensor *output, float beta = 1.0f, int32_t axis = 0);
/** Set the input and output tensors.
*
* @param[in] compile_context The compile context to be used.
- * @param[in] input Source tensor. Data types supported: QASYMM8/F16/F32
+ * @param[in] input Source tensor. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32 for Softmax and F16/F32 for Log Softmax
* @param[out] output Destination tensor. Data types supported: same as @p input
* @param[in] beta (Optional) A scaling factor for the exponent. Defaults to 1.f
- * @param[in] axis (Optional) Reduction axis. It has the purpose of squashing the first @p axis
- * dimensions together. For instance, given a [4x4x4x4] image,
- * when @p axis is 2, the Softmax reduction will be applied on each of the [4x4] planes of the input image.
+ * @param[in] axis (Optional) The dimension in which to apply the function. E.g. for input of shape 4x5x6 and
+ * axis=1, softmax will be applied to 4x6=24 vectors of size 5. Defaults to 0
*/
- void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, float beta = 1.0f, size_t axis = 1);
+ void configure(const CLCompileContext &compile_context,
+ const ICLTensor *input,
+ ICLTensor *output,
+ float beta = 1.0f,
+ int32_t axis = 0);
/** Static function to check if given info will lead to a valid configuration of @ref CLSoftmaxLayer
*
- * @param[in] input Source tensor. Data types supported: QASYMM8/F16/F32
+ * @param[in] input Source tensor. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32 for Softmax and F16/F32 for Log Softmax
* @param[in] output Destination tensor. Data types supported: same as @p input
* @param[in] beta (Optional) A scaling factor for the exponent. Defaults to 1.f
- * @param[in] axis (Optional) Reduction axis. It has the purpose of squashing the first @p axis
- * dimensions together. For instance, given a [4x4x4x4] image,
- * when @p axis is 2, the Softmax reduction will be applied on each of the [4x4] planes of the input image.
+ * @param[in] axis (Optional) The dimension in which to apply the function. E.g. for input of shape 4x5x6 and
+ * axis=1, softmax will be applied to 4x6=24 vectors of size 5. Defaults to 0
+ *
* @return a status
*/
- static Status validate(const ITensorInfo *input, const ITensorInfo *output, float beta = 1.0f, size_t axis = 1);
+ static Status validate(const ITensorInfo *input, const ITensorInfo *output, float beta = 1.0f, int32_t axis = 0);
// Inherited methods overridden:
void run() override;
private:
- /** Utility method to configure the kernels needed to flatten the input
- * tensor.
- *
- * @note This function changes the internal state of this class. In particular,
- * it initializes the kernel @p _flatten_kernel and the tensors @p _input_flat and
- * @p _output_flat
- *
- * @param[in] input Original source tensor.
- * @param[in] output Original destination tensor.
- * @param[in] axis (Optional) Reduction axis. It has the purpose of squashing the first @p axis
- * dimensions together. For instance, given a [4x4x4x4] image,
- * when @p axis is 2, the Softmax reduction will be applied on each of the [4x4] planes of the input image.
- */
- void configure_reshape_input_kernel(const ICLTensor *input, const ICLTensor *output, size_t axis);
- /** Utility method to configure the kernels needed to flatten the input
- * tensor.
- *
- * @note This function changes the internal state of this class. In particular,
- * it initializes the kernel @p _flatten_kernel and the tensors @p _input_flat and
- * @p _output_flat
- *
- * @param[in] compile_context The compile context to be used.
- * @param[in] input Original source tensor.
- * @param[in] output Original destination tensor.
- * @param[in] axis (Optional) Reduction axis. It has the purpose of squashing the first @p axis
- * dimensions together. For instance, given a [4x4x4x4] image,
- * when @p axis is 2, the Softmax reduction will be applied on each of the [4x4] planes of the input image.
- */
- void configure_reshape_input_kernel(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *output, size_t axis);
-
- MemoryGroup _memory_group;
- CLLogits1DMaxShiftExpSumKernel _max_shift_exp_sum_kernel;
- CLLogits1DNormKernel _norm_kernel;
- std::unique_ptr<ICLKernel> _flatten_kernel_ptr;
- CLReshapeLayerKernel _reshape_kernel;
- CLTensor _max;
- CLTensor _sum;
- CLTensor _tmp;
- CLTensor _input_flattened;
- CLTensor _output_flattened;
- bool _needs_flattening;
+ struct Impl;
+ std::unique_ptr<Impl> _impl;
};
using CLSoftmaxLayer = CLSoftmaxLayerGeneric<false>;
using CLLogSoftmaxLayer = CLSoftmaxLayerGeneric<true>;
} // namespace arm_compute
-#endif /* ARM_COMPUTE_CLSOFTMAXLAYER_H */
+#endif // ACL_ARM_COMPUTE_RUNTIME_CL_FUNCTIONS_CLSOFTMAXLAYER_H
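Note the axis semantics change above: the old size_t axis (default 1) squashed the first axis dimensions together, while the new int32_t axis (default 0) names the single dimension the function is applied along. A sketch:

#include "arm_compute/runtime/CL/CLTensor.h"
#include "arm_compute/runtime/CL/functions/CLSoftmaxLayer.h"

using namespace arm_compute;

void run_softmax(CLTensor &input, CLTensor &output)
{
    CLSoftmaxLayer softmax;
    // For a 4x5x6 input and axis = 1, 4x6 = 24 vectors of size 5 are
    // normalised independently.
    softmax.configure(&input, &output, /* beta */ 1.0f, /* axis */ 0);
    softmax.run();
}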
diff --git a/arm_compute/runtime/CL/functions/CLSpaceToBatchLayer.h b/arm_compute/runtime/CL/functions/CLSpaceToBatchLayer.h
index b8e2bdc4c6..191f4863d5 100644
--- a/arm_compute/runtime/CL/functions/CLSpaceToBatchLayer.h
+++ b/arm_compute/runtime/CL/functions/CLSpaceToBatchLayer.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018-2020 ARM Limited.
+ * Copyright (c) 2018-2021 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -24,20 +24,23 @@
#ifndef ARM_COMPUTE_CLSPACETOBATCHLAYER_H
#define ARM_COMPUTE_CLSPACETOBATCHLAYER_H
-#include "arm_compute/runtime/IFunction.h"
-
-#include "arm_compute/core/CL/kernels/CLMemsetKernel.h"
-#include "arm_compute/core/CL/kernels/CLSpaceToBatchLayerKernel.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/runtime/CL/CLTensor.h"
+#include "arm_compute/runtime/CL/functions/CLFill.h"
+#include "arm_compute/runtime/IFunction.h"
+
+#include <memory>
namespace arm_compute
{
+class CLCompileContext;
+class CLSpaceToBatchLayerKernel;
class ICLTensor;
+class ITensorInfo;
/** Basic function to spatial divide a tensor. This function calls the following OpenCL kernels/functions:
*
- * -# @ref CLMemsetKernel
+ * -# @ref CLFill
* -# @ref CLSpaceToBatchLayerKernel
*/
class CLSpaceToBatchLayer : public IFunction
@@ -54,12 +57,21 @@ public:
/** Allow instances of this class to be moved */
CLSpaceToBatchLayer &operator=(CLSpaceToBatchLayer &&) = default;
/** Default destructor */
- virtual ~CLSpaceToBatchLayer() = default;
+ ~CLSpaceToBatchLayer();
/** Set the input and output tensors.
*
+ * Valid data layouts:
+ * - NHWC
+ * - NCHW
+ *
+ * Valid data type configurations:
+ * |src0 |src1 |src2 |dst |
+ * |:---------|:---------|:---------|:---------|
+ * |All |S32 |S32 |All |
+ *
* @param[in] input Tensor input. Supported tensor rank: 4. Data types supported: All.
- * @param[in] block_shape 1-D tensor with shape [M]. Data types supported: S32
- * @param[in] paddings 2-D tensor with shape [2, M]. Data types supported: S32
+ * @param[in] block_shape 1-D tensor with shape [M]. Supported M: 2. Data types supported: S32
+ * @param[in] paddings 2-D tensor with shape [2, M] (First dimension is the fastest-changing dimension). Supported M: 2. Data types supported: S32
* @param[out] output Tensor output. Data types supported: same as @p input
*/
void configure(const ICLTensor *input, const ICLTensor *block_shape, const ICLTensor *paddings, ICLTensor *output);
@@ -67,63 +79,85 @@ public:
*
* @param[in] compile_context The compile context to be used.
* @param[in] input Tensor input. Supported tensor rank: 4. Data types supported: All.
- * @param[in] block_shape 1-D tensor with shape [M]. Data types supported: S32
- * @param[in] paddings 2-D tensor with shape [2, M]. Data types supported: S32
+ * @param[in] block_shape 1-D tensor with shape [M]. Supported M: 2. Data types supported: S32
+ * @param[in] paddings 2-D tensor with shape [2, M] (First dimension is the fastest-changing dimension). Supported M: 2. Data types supported: S32
* @param[out] output Tensor output. Data types supported: same as @p input
*/
- void configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *block_shape, const ICLTensor *paddings, ICLTensor *output);
+ void configure(const CLCompileContext &compile_context,
+ const ICLTensor *input,
+ const ICLTensor *block_shape,
+ const ICLTensor *paddings,
+ ICLTensor *output);
/** Set the input and output tensors. (Static block shape and paddings)
*
* @param[in] input Tensor input. Supported tensor rank: 4. Data types supported: All.
* @param[in] block_shape_x Block shape x value.
* @param[in] block_shape_y Block shape y value.
- * @param[in] padding_left The left padding of the output tensor.
- * @param[in] padding_right The right padding of the output tensor.
+ * @param[in] padding_left The padding at the beginning of every dimension of the output tensor.
+ * @param[in] padding_right The padding at the end of every dimension of the output tensor.
* @param[out] output Tensor output. Data types supported: same as @p input
*/
- void configure(const ICLTensor *input, const int block_shape_x, const int block_shape_y, const Size2D &padding_left, const Size2D &padding_right, ICLTensor *output);
+ void configure(const ICLTensor *input,
+ const int block_shape_x,
+ const int block_shape_y,
+ const Size2D &padding_left,
+ const Size2D &padding_right,
+ ICLTensor *output);
/** Set the input and output tensors. (Static block shape and paddings)
*
* @param[in] compile_context The compile context to be used.
* @param[in] input Tensor input. Supported tensor rank: 4. Data types supported: All.
* @param[in] block_shape_x Block shape x value.
* @param[in] block_shape_y Block shape y value.
- * @param[in] padding_left The left padding of the output tensor.
- * @param[in] padding_right The right padding of the output tensor.
+ * @param[in] padding_left The padding at the beginning of every dimension of the output tensor.
+ * @param[in] padding_right The padding at the end of every dimension of the output tensor.
* @param[out] output Tensor output. Data types supported: same as @p input
*/
- void configure(const CLCompileContext &compile_context, const ICLTensor *input, const int block_shape_x, const int block_shape_y, const Size2D &padding_left, const Size2D &padding_right,
- ICLTensor *output);
+ void configure(const CLCompileContext &compile_context,
+ const ICLTensor *input,
+ const int block_shape_x,
+ const int block_shape_y,
+ const Size2D &padding_left,
+ const Size2D &padding_right,
+ ICLTensor *output);
/** Static function to check if given info will lead to a valid configuration of @ref CLSpaceToBatchLayer
*
* @param[in] input Tensor input info. Supported tensor rank: 4. Data types supported: All.
- * @param[in] block_shape block shape tensor info with shape [M]. Data types supported: S32
- * @param[in] paddings paddings tensor info with shape [2, M]. Data types supported: S32
+ * @param[in] block_shape block shape tensor info with shape [M]. Supported M: 2. Data types supported: S32
+ * @param[in] paddings paddings tensor info with shape [2, M] (First dimension is the fastest-changing dimension). Supported M: 2. Data types supported: S32
* @param[out] output Tensor output info. Data types supported: same as @p input
*
* @return a status
*/
- static Status validate(const ITensorInfo *input, const ITensorInfo *block_shape, const ITensorInfo *paddings, const ITensorInfo *output);
+ static Status validate(const ITensorInfo *input,
+ const ITensorInfo *block_shape,
+ const ITensorInfo *paddings,
+ const ITensorInfo *output);
/** Static function to check if given info will lead to a valid configuration of @ref CLSpaceToBatchLayer (Static block shape and paddings)
*
* @param[in] input Tensor input info. Supported tensor rank: 4. Data types supported: All.
* @param[in] block_shape_x Block shape x value.
* @param[in] block_shape_y Block shape y value.
- * @param[in] padding_left The left padding of the output tensor.
- * @param[in] padding_right The right padding of the output tensor.
+ * @param[in] padding_left The padding at the beginning of every dimension of the output tensor.
+ * @param[in] padding_right The padding at the end of every dimension of the output tensor.
* @param[out] output Tensor output info. Data types supported: same as @p input
*
* @return a status
*/
- static Status validate(const ITensorInfo *input, const int block_shape_x, const int block_shape_y, const Size2D &padding_left, const Size2D &padding_right, const ITensorInfo *output);
+ static Status validate(const ITensorInfo *input,
+ const int block_shape_x,
+ const int block_shape_y,
+ const Size2D &padding_left,
+ const Size2D &padding_right,
+ const ITensorInfo *output);
// Inherited methods overridden:
void run() override;
private:
- CLSpaceToBatchLayerKernel _space_to_batch_kernel; /**< SpaceToBatch kernel to run */
- CLMemsetKernel _memset_kernel; /**< Memset kernel to run */
- bool _has_padding; /**< Flag to check if the output has padding */
+ std::unique_ptr<CLSpaceToBatchLayerKernel> _space_to_batch_kernel; /**< SpaceToBatch kernel to run */
+ CLFill _fill; /**< Fill function to run */
+ bool _has_padding; /**< Flag to check if the output has padding */
};
} // namespace arm_compute
#endif /* ARM_COMPUTE_CLSPACETOBATCHLAYER_H */
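A sketch of the static-shape overload, which now documents the padding pairs as start/end padding for every dimension (values illustrative):

#include "arm_compute/core/Types.h"
#include "arm_compute/runtime/CL/CLTensor.h"
#include "arm_compute/runtime/CL/functions/CLSpaceToBatchLayer.h"

using namespace arm_compute;

void run_space_to_batch(CLTensor &input, CLTensor &output)
{
    CLSpaceToBatchLayer s2b;
    // 2x2 block shape; the Size2D pairs give the padding at the beginning
    // and end of each spatial dimension.
    s2b.configure(&input, /* block_shape_x */ 2, /* block_shape_y */ 2,
                  Size2D(0, 0), Size2D(0, 0), &output);
    s2b.run();
}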
diff --git a/arm_compute/runtime/CL/functions/CLSpaceToDepthLayer.h b/arm_compute/runtime/CL/functions/CLSpaceToDepthLayer.h
index ac011dd998..1b0dfc2b74 100644
--- a/arm_compute/runtime/CL/functions/CLSpaceToDepthLayer.h
+++ b/arm_compute/runtime/CL/functions/CLSpaceToDepthLayer.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019-2020 ARM Limited.
+ * Copyright (c) 2019-2021 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -24,14 +24,17 @@
#ifndef ARM_COMPUTE_CLSPACETODEPTHLAYER_H
#define ARM_COMPUTE_CLSPACETODEPTHLAYER_H
+#include "arm_compute/core/Types.h"
#include "arm_compute/runtime/IFunction.h"
-#include "arm_compute/core/CL/kernels/CLSpaceToDepthLayerKernel.h"
-#include "arm_compute/core/Types.h"
+#include <memory>
namespace arm_compute
{
+class CLCompileContext;
+class CLSpaceToDepthLayerKernel;
class ICLTensor;
+class ITensorInfo;
/** Basic function to run @ref CLSpaceToDepthLayerKernel. */
class CLSpaceToDepthLayer : public IFunction
@@ -39,8 +42,27 @@ class CLSpaceToDepthLayer : public IFunction
public:
/** Default constructor */
CLSpaceToDepthLayer();
+ /** Prevent instances of this class from being copied */
+ CLSpaceToDepthLayer(const CLSpaceToDepthLayer &) = delete;
+ /** Prevent instances of this class from being copied */
+ CLSpaceToDepthLayer &operator=(const CLSpaceToDepthLayer &) = delete;
+ /** Prevent instances of this class from being moved */
+ CLSpaceToDepthLayer(CLSpaceToDepthLayer &&) = delete;
+ /** Prevent instances of this class from being moved */
+ CLSpaceToDepthLayer &operator=(CLSpaceToDepthLayer &&) = delete;
+ /** Default destructor */
+ ~CLSpaceToDepthLayer();
/** Set the input and output tensors.
*
+ * Valid data layouts:
+ * - NHWC
+ * - NCHW
+ *
+ * Valid data type configurations:
+ * |src |dst |
+ * |:--------------|:--------------|
+ * |All |All |
+ *
* @param[in] input Tensor input. Supported tensor rank: 4. Data types supported: All.
* @param[out] output Tensor output. Data types supported: same as @p input
* @param[in] block_shape Block shape value.
@@ -53,7 +75,8 @@ public:
* @param[out] output Tensor output. Data types supported: same as @p input
* @param[in] block_shape Block shape value.
*/
- void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, int32_t block_shape);
+ void
+ configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, int32_t block_shape);
/** Static function to check if given info will lead to a valid configuration of @ref CLSpaceToDepthLayer.
*
* @param[in] input Tensor input info. Supported tensor rank: 4. Data types supported: All.
@@ -68,7 +91,7 @@ public:
void run() override;
private:
- CLSpaceToDepthLayerKernel _space_to_depth_kernel; /**< CLSpaceToDepthLayerKernel to run */
+ std::unique_ptr<CLSpaceToDepthLayerKernel> _space_to_depth_kernel; /**< CLSpaceToDepthLayerKernel to run */
};
} // namespace arm_compute
#endif /* ARM_COMPUTE_CLSPACETODEPTHLAYER_H */
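A minimal sketch of the unchanged public interface (the kernel is now held behind a unique_ptr internally):

#include "arm_compute/runtime/CL/CLTensor.h"
#include "arm_compute/runtime/CL/functions/CLSpaceToDepthLayer.h"

using namespace arm_compute;

void run_space_to_depth(CLTensor &input, CLTensor &output)
{
    CLSpaceToDepthLayer s2d;
    s2d.configure(&input, &output, /* block_shape */ 2);
    s2d.run();
}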
diff --git a/arm_compute/runtime/CL/functions/CLSplit.h b/arm_compute/runtime/CL/functions/CLSplit.h
index 87265a4c00..8d13755212 100644
--- a/arm_compute/runtime/CL/functions/CLSplit.h
+++ b/arm_compute/runtime/CL/functions/CLSplit.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018-2020 ARM Limited.
+ * Copyright (c) 2018-2021 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -26,7 +26,6 @@
#include "arm_compute/core/CL/ICLTensor.h"
#include "arm_compute/core/Types.h"
-
#include "arm_compute/runtime/CL/functions/CLSlice.h"
#include "arm_compute/runtime/CPP/functions/CPPSplit.h"
#include "arm_compute/runtime/IFunction.h"
@@ -40,6 +39,18 @@ namespace arm_compute
class CLSplit : public CPPSplit<CLSlice, ICLTensor>
{
public:
+ /** CLSplit
+ *
+ * Valid data layouts:
+ * - All
+ *
+ * Valid data type configurations:
+ * |src |dst |
+ * |:--------------|:--------------|
+ * |All |All |
+ *
+ */
+
// Inherited methods overridden:
void run() override;
};
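CLSplit only documents its data-type table here; configuration is inherited from the CPPSplit base. A sketch assuming the base exposes configure(input, outputs, axis), which this header does not itself declare:

#include <vector>

#include "arm_compute/runtime/CL/CLTensor.h"
#include "arm_compute/runtime/CL/functions/CLSplit.h"

using namespace arm_compute;

void run_split(CLTensor &input, CLTensor &out0, CLTensor &out1)
{
    std::vector<ICLTensor *> outputs{&out0, &out1};

    CLSplit split;
    // Assumed inherited signature: split 'input' into outputs.size() equal
    // parts along 'axis'.
    split.configure(&input, outputs, /* axis */ 0);
    split.run();
}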
diff --git a/arm_compute/runtime/CL/functions/CLStackLayer.h b/arm_compute/runtime/CL/functions/CLStackLayer.h
index 9b204458c3..18745c8a4f 100644
--- a/arm_compute/runtime/CL/functions/CLStackLayer.h
+++ b/arm_compute/runtime/CL/functions/CLStackLayer.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018-2020 ARM Limited.
+ * Copyright (c) 2018-2021 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -27,14 +27,15 @@
#include "arm_compute/core/Types.h"
#include "arm_compute/runtime/IFunction.h"
-#include "arm_compute/core/CL/kernels/CLStackLayerKernel.h"
-
#include <memory>
#include <vector>
namespace arm_compute
{
+class CLCompileContext;
+class CLStackLayerKernel;
class ICLTensor;
+class ITensorInfo;
/** Basic function to stack tensors along an axis. This function calls the following kernel:
*
@@ -46,8 +47,26 @@ class CLStackLayer : public IFunction
public:
/** Default constructor */
CLStackLayer();
+ /** Prevent instances of this class from being copied */
+ CLStackLayer(const CLStackLayer &) = delete;
+ /** Prevent instances of this class from being copied */
+ CLStackLayer &operator=(const CLStackLayer &) = delete;
+ /** Prevent instances of this class from being moved */
+ CLStackLayer(CLStackLayer &&) = delete;
+ /** Prevent instances of this class from being moved */
+ CLStackLayer &operator=(CLStackLayer &&) = delete;
+ /** Default destructor */
+ ~CLStackLayer();
/** Initialise the kernel's inputs vector and output.
*
+ * Valid data layouts:
+ * - All
+ *
+ * Valid data type configurations:
+ * |src |dst |
+ * |:--------------|:--------------|
+ * |All |All |
+ *
* @note Supported input tensor rank: up to 4
*
* @param[in] input The vectors containing all the tensors with the same shape to stack. Data types supported: All.
@@ -66,7 +85,10 @@ public:
* Negative values wrap around
* @param[out] output Output tensor. Data types supported: Same as @p input.
*/
- void configure(const CLCompileContext &compile_context, const std::vector<ICLTensor *> &input, int axis, ICLTensor *output);
+ void configure(const CLCompileContext &compile_context,
+ const std::vector<ICLTensor *> &input,
+ int axis,
+ ICLTensor *output);
/** Static function to check if given info will lead to a valid configuration of @ref CLStackLayerKernel
*
* @note Supported input tensor rank: up to 4
@@ -84,9 +106,9 @@ public:
void run() override;
private:
- std::vector<ICLTensor *> _input;
- std::vector<CLStackLayerKernel> _stack_kernels;
- unsigned int _num_inputs;
+ std::vector<ICLTensor *> _input;
+ std::vector<std::unique_ptr<CLStackLayerKernel>> _stack_kernels;
+ unsigned int _num_inputs;
};
} // namespace arm_compute
#endif /* ARM_COMPUTE_CLSTACKLAYER_H */
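A sketch of the stacking call above, using the overload without a compile context. The inputs must share a shape and the output gains one dimension at the chosen axis; shapes are illustrative, and the output info is assumed to be derived during configure.

    #include "arm_compute/core/TensorInfo.h"
    #include "arm_compute/runtime/CL/CLTensor.h"
    #include "arm_compute/runtime/CL/functions/CLStackLayer.h"

    #include <vector>

    void stack_sketch()
    {
        using namespace arm_compute;
        CLTensor a, b, dst;
        const TensorInfo info(TensorShape(3U, 2U), 1, DataType::F32);
        a.allocator()->init(info);
        b.allocator()->init(info);

        std::vector<ICLTensor *> inputs{ &a, &b };
        CLStackLayer stack;
        stack.configure(inputs, 0, &dst); // stack along a new leading dimension

        a.allocator()->allocate();
        b.allocator()->allocate();
        dst.allocator()->allocate();
        stack.run();
    }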
diff --git a/arm_compute/runtime/CL/functions/CLStridedSlice.h b/arm_compute/runtime/CL/functions/CLStridedSlice.h
index bb2bc962d6..b1edc2481c 100644
--- a/arm_compute/runtime/CL/functions/CLStridedSlice.h
+++ b/arm_compute/runtime/CL/functions/CLStridedSlice.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018-2020 ARM Limited.
+ * Copyright (c) 2018-2021 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -24,7 +24,9 @@
#ifndef ARM_COMPUTE_CL_STRIDED_SLICE_H
#define ARM_COMPUTE_CL_STRIDED_SLICE_H
-#include "arm_compute/runtime/CL/ICLSimpleFunction.h"
+#include "arm_compute/runtime/CL/CLRuntimeContext.h"
+#include "arm_compute/runtime/CL/ICLOperator.h"
+#include "arm_compute/runtime/IFunction.h"
namespace arm_compute
{
@@ -32,11 +34,34 @@ namespace arm_compute
class ICLTensor;
/** Basic function to run @ref CLStridedSliceKernel */
-class CLStridedSlice : public ICLSimpleFunction
+class CLStridedSlice : public IFunction
{
public:
+ /** Constructor
+ *
+ * @param[in] ctx Runtime context to be used by the function
+ */
+ CLStridedSlice(CLRuntimeContext *ctx = nullptr);
+ /** Destructor */
+ ~CLStridedSlice();
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ CLStridedSlice(const CLStridedSlice &) = delete;
+ /** Default move constructor */
+ CLStridedSlice(CLStridedSlice &&);
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ CLStridedSlice &operator=(const CLStridedSlice &) = delete;
+ /** Default move assignment operator */
+ CLStridedSlice &operator=(CLStridedSlice &&);
/** Configure kernel
*
+ * Valid data layouts:
+ * - All
+ *
+ * Valid data type configurations:
+ * |src |dst |
+ * |:--------------|:--------------|
+ * |All |All |
+ *
* @note Supported tensor rank: up to 4
*
* @param[in] input Source tensor. Data type supported: All.
@@ -49,9 +74,14 @@ public:
* @param[in] shrink_axis_mask (Optional) If the ith bit of shrink_axis_mask is set, it implies that the ith specification shrinks the dimensionality by 1.
* A slice of size 1 starting from starts[i] in the dimension must be preserved.
*/
- void configure(const ICLTensor *input, ICLTensor *output,
- const Coordinates &starts, const Coordinates &ends, const BiStrides &strides,
- int32_t begin_mask = 0, int32_t end_mask = 0, int32_t shrink_axis_mask = 0);
+ void configure(const ICLTensor *input,
+ ICLTensor *output,
+ const Coordinates &starts,
+ const Coordinates &ends,
+ const BiStrides &strides,
+ int32_t begin_mask = 0,
+ int32_t end_mask = 0,
+ int32_t shrink_axis_mask = 0);
/** Configure kernel
*
* @note Supported tensor rank: up to 4
@@ -67,9 +97,15 @@ public:
* @param[in] shrink_axis_mask (Optional) If the ith bit of shrink_axis_mask is set, it implies that the ith specification shrinks the dimensionality by 1.
* A slice of size 1 starting from starts[i] in the dimension must be preserved.
*/
- void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output,
- const Coordinates &starts, const Coordinates &ends, const BiStrides &strides,
- int32_t begin_mask = 0, int32_t end_mask = 0, int32_t shrink_axis_mask = 0);
+ void configure(const CLCompileContext &compile_context,
+ const ICLTensor *input,
+ ICLTensor *output,
+ const Coordinates &starts,
+ const Coordinates &ends,
+ const BiStrides &strides,
+ int32_t begin_mask = 0,
+ int32_t end_mask = 0,
+ int32_t shrink_axis_mask = 0);
/** Static function to check if given info will lead to a valid configuration of @ref CLStridedSlice
*
@@ -85,9 +121,77 @@ public:
* @param[in] shrink_axis_mask (Optional) If the ith bit of shrink_axis_mask is set, it implies that the ith specification shrinks the dimensionality by 1.
* A slice of size 1 starting from starts[i] in the dimension must be preserved.
*/
- static Status validate(const ITensorInfo *input, const ITensorInfo *output,
- const Coordinates &starts, const Coordinates &ends, const BiStrides &strides,
- int32_t begin_mask = 0, int32_t end_mask = 0, int32_t shrink_axis_mask = 0);
+ static Status validate(const ITensorInfo *input,
+ const ITensorInfo *output,
+ const Coordinates &starts,
+ const Coordinates &ends,
+ const BiStrides &strides,
+ int32_t begin_mask = 0,
+ int32_t end_mask = 0,
+ int32_t shrink_axis_mask = 0);
+
+ // Inherited methods overridden:
+ void run() override;
+
+private:
+ struct Impl;
+ std::unique_ptr<Impl> _impl;
+};
+
+namespace experimental
+{
+/** Basic function to run @ref CLStridedSliceKernel */
+class CLStridedSlice : public ICLOperator
+{
+public:
+ /** Configure kernel
+ *
+ * @note Supported tensor rank: up to 4
+ *
+ * @param[in] compile_context The compile context to be used.
+ * @param[in] input Source tensor info. Data type supported: All.
+ * @param[out] output Destination tensor info. Data type supported: Same as @p input
+ * @param[in] starts The starts of the dimensions of the input tensor to be sliced. The length must be of rank(input).
+ * @param[in] ends The ends of the dimensions of the input tensor to be sliced. The length must be of rank(input).
+ * @param[in] strides The strides of the dimensions of the input tensor to be sliced. The length must be of rank(input).
+ * @param[in] begin_mask (Optional) If the ith bit of begin_mask is set, starts[i] is ignored and the fullest possible range in that dimension is used instead.
+ * @param[in] end_mask (Optional) If the ith bit of end_mask is set, ends[i] is ignored and the fullest possible range in that dimension is used instead.
+ * @param[in] shrink_axis_mask (Optional) If the ith bit of shrink_axis_mask is set, it implies that the ith specification shrinks the dimensionality by 1.
+ * A slice of size 1 starting from starts[i] in the dimension must be preserved.
+ */
+ void configure(const CLCompileContext &compile_context,
+ const ITensorInfo *input,
+ ITensorInfo *output,
+ const Coordinates &starts,
+ const Coordinates &ends,
+ const BiStrides &strides,
+ int32_t begin_mask = 0,
+ int32_t end_mask = 0,
+ int32_t shrink_axis_mask = 0);
+
+ /** Static function to check if given info will lead to a valid configuration of @ref CLStridedSlice
+ *
+ * @note Supported tensor rank: up to 4
+ *
+ * @param[in] input Source tensor info. Data type supported: All.
+ * @param[in] output Destination tensor info. Data type supported: Same as @p input
+ * @param[in] starts The starts of the dimensions of the input tensor to be sliced. The length must be of rank(input).
+ * @param[in] ends The ends of the dimensions of the input tensor to be sliced. The length must be of rank(input).
+ * @param[in] strides The strides of the dimensions of the input tensor to be sliced. The length must be of rank(input).
+ * @param[in] begin_mask (Optional) If the ith bit of begin_mask is set, starts[i] is ignored and the fullest possible range in that dimension is used instead.
+ * @param[in] end_mask (Optional) If the ith bit of end_mask is set, ends[i] is ignored and the fullest possible range in that dimension is used instead.
+ * @param[in] shrink_axis_mask (Optional) If the ith bit of shrink_axis_mask is set, it implies that the ith specification shrinks the dimensionality by 1.
+ * A slice of size 1 starting from starts[i] in the dimension must be preserved.
+ */
+ static Status validate(const ITensorInfo *input,
+ const ITensorInfo *output,
+ const Coordinates &starts,
+ const Coordinates &ends,
+ const BiStrides &strides,
+ int32_t begin_mask = 0,
+ int32_t end_mask = 0,
+ int32_t shrink_axis_mask = 0);
};
+} // namespace experimental
} // namespace arm_compute
#endif /* ARM_COMPUTE_CL_STRIDED_SLICE_H */
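The function-level signature keeps its semantics after this reshape; only the internals move behind an Impl. A sketch with illustrative coordinates, relying on the default masks (the experimental:: variant takes ITensorInfo pointers and targets operator-level use instead):

    #include "arm_compute/core/TensorInfo.h"
    #include "arm_compute/runtime/CL/CLTensor.h"
    #include "arm_compute/runtime/CL/functions/CLStridedSlice.h"

    void strided_slice_sketch()
    {
        using namespace arm_compute;
        CLTensor src, dst;
        src.allocator()->init(TensorInfo(TensorShape(8U, 8U), 1, DataType::F32));

        const Coordinates starts(0, 0);
        const Coordinates ends(4, 4);
        const BiStrides   strides(2, 2); // every other element in both dimensions

        CLStridedSlice slice; // default-constructed: no explicit CLRuntimeContext
        slice.configure(&src, &dst, starts, ends, strides); // begin/end/shrink masks default to 0

        src.allocator()->allocate();
        dst.allocator()->allocate();
        slice.run();
    }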
diff --git a/arm_compute/runtime/CL/functions/CLTableLookup.h b/arm_compute/runtime/CL/functions/CLTableLookup.h
deleted file mode 100644
index 1c11f076a3..0000000000
--- a/arm_compute/runtime/CL/functions/CLTableLookup.h
+++ /dev/null
@@ -1,55 +0,0 @@
-/*
- * Copyright (c) 2016-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CLTABLELOOKUP_H
-#define ARM_COMPUTE_CLTABLELOOKUP_H
-
-#include "arm_compute/runtime/CL/ICLSimpleFunction.h"
-
-namespace arm_compute
-{
-class ICLTensor;
-class ICLLut;
-
-/** Basic function to run @ref CLTableLookupKernel */
-class CLTableLookup : public ICLSimpleFunction
-{
-public:
- /** Initialise the kernel's inputs and output
- *
- * @param[in] input First tensor input. Data types supported: U8 and S16
- * @param[in] lut Input lookup table. Data types supported: U8 and S16
- * @param[out] output Output tensor. Data types supported: U8 and S16
- */
- void configure(const ICLTensor *input, const ICLLut *lut, ICLTensor *output);
- /** Initialise the kernel's inputs and output
- *
- * @param[in] compile_context The compile context to be used.
- * @param[in] input First tensor input. Data types supported: U8 and S16
- * @param[in] lut Input lookup table. Data types supported: U8 and S16
- * @param[out] output Output tensor. Data types supported: U8 and S16
- */
- void configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLLut *lut, ICLTensor *output);
-};
-}
-#endif /*ARM_COMPUTE_CLTABLELOOKUP_H */
diff --git a/arm_compute/runtime/CL/functions/CLThreshold.h b/arm_compute/runtime/CL/functions/CLThreshold.h
deleted file mode 100644
index d8ae6fbb34..0000000000
--- a/arm_compute/runtime/CL/functions/CLThreshold.h
+++ /dev/null
@@ -1,69 +0,0 @@
-/*
- * Copyright (c) 2016-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CLTHRESHOLD_H
-#define ARM_COMPUTE_CLTHRESHOLD_H
-
-#include "arm_compute/core/Types.h"
-#include "arm_compute/runtime/CL/ICLSimpleFunction.h"
-
-#include <cstdint>
-
-namespace arm_compute
-{
-class ICLTensor;
-
-/** Basic function to run @ref CLThresholdKernel */
-class CLThreshold : public ICLSimpleFunction
-{
-public:
- /** Initialise the function's source, destination, thresholds and threshold type
- *
- * @param[in] input First tensor input. Data types supported: U8.
- * @param[out] output Output tensor. Data types supported: U8.
- * @param[in] threshold Threshold. If upper threshold is specified, this will be used as the lower threshold.
- * @param[in] false_value Value to assign when the condition is false.
- * @param[in]  true_value  Value to assign when the condition is true.
- * @param[in] type Thresholding type. Can either be BINARY or RANGE.
- * @param[in] upper Upper threshold. Only used with RANGE thresholding
- */
- void configure(const ICLTensor *input, ICLTensor *output, uint8_t threshold,
- uint8_t false_value = 0, uint8_t true_value = 0,
- ThresholdType type = ThresholdType::BINARY, uint8_t upper = 0);
- /** Initialise the function's source, destination, thresholds and threshold type
- *
- * @param[in] compile_context The compile context to be used.
- * @param[in] input First tensor input. Data types supported: U8.
- * @param[out] output Output tensor. Data types supported: U8.
- * @param[in] threshold Threshold. If upper threshold is specified, this will be used as the lower threshold.
- * @param[in] false_value Value to assign when the condition is false.
- * @param[in]  true_value  Value to assign when the condition is true.
- * @param[in] type Thresholding type. Can either be BINARY or RANGE.
- * @param[in] upper Upper threshold. Only used with RANGE thresholding
- */
- void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, uint8_t threshold,
- uint8_t false_value = 0, uint8_t true_value = 0,
- ThresholdType type = ThresholdType::BINARY, uint8_t upper = 0);
-};
-}
-#endif /*ARM_COMPUTE_CLTHRESHOLD_H */
diff --git a/arm_compute/runtime/CL/functions/CLTile.h b/arm_compute/runtime/CL/functions/CLTile.h
index 0dad9ad89d..4c414670a5 100644
--- a/arm_compute/runtime/CL/functions/CLTile.h
+++ b/arm_compute/runtime/CL/functions/CLTile.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018-2020 ARM Limited.
+ * Copyright (c) 2018-2021 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -24,13 +24,14 @@
#ifndef ARM_COMPUTE_CLTILE_H
#define ARM_COMPUTE_CLTILE_H
-#include "arm_compute/runtime/CL/ICLSimpleFunction.h"
-
#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/CL/ICLSimpleFunction.h"
namespace arm_compute
{
+class CLCompileContext;
class ICLTensor;
+class ITensorInfo;
/** Basic function to run @ref CLTileKernel */
class CLTile : public ICLSimpleFunction
@@ -38,6 +39,14 @@ class CLTile : public ICLSimpleFunction
public:
/** Set the source, destination of the kernel
*
+ * Valid data layouts:
+ * - All
+ *
+ * Valid data type configurations:
+ * |src |dst |
+ * |:--------------|:--------------|
+ * |All |All |
+ *
* @param[in] input Source tensor. Data type supported: All.
* @param[in] multiples Contains the number of times the input tensor should be replicated on the given dimension.
* @param[out] output Destination tensor. Same as @p input
@@ -50,7 +59,10 @@ public:
* @param[in] multiples Contains the number of times the input tensor should be replicated on the given dimension.
* @param[out] output Destination tensor. Same as @p input
*/
- void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, const Multiples &multiples);
+ void configure(const CLCompileContext &compile_context,
+ const ICLTensor *input,
+ ICLTensor *output,
+ const Multiples &multiples);
/** Static function to check if given info will lead to a valid configuration of @ref CLTile
*
* @param[in] input Source tensor info. Data type supported: All.
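A sketch of the CLTile call documented above; Multiples holds one replication factor per dimension. Shapes and factors are illustrative, and the output info is assumed to be derived during configure.

    #include "arm_compute/core/TensorInfo.h"
    #include "arm_compute/runtime/CL/CLTensor.h"
    #include "arm_compute/runtime/CL/functions/CLTile.h"

    void tile_sketch()
    {
        using namespace arm_compute;
        CLTensor src, dst;
        src.allocator()->init(TensorInfo(TensorShape(4U, 3U), 1, DataType::F32));

        CLTile tile;
        tile.configure(&src, &dst, Multiples{ 2, 1 }); // duplicate along dimension 0 -> 8x3

        src.allocator()->allocate();
        dst.allocator()->allocate();
        tile.run();
    }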
diff --git a/arm_compute/runtime/CL/functions/CLTranspose.h b/arm_compute/runtime/CL/functions/CLTranspose.h
index b2fdcda5c4..9dc977fbeb 100644
--- a/arm_compute/runtime/CL/functions/CLTranspose.h
+++ b/arm_compute/runtime/CL/functions/CLTranspose.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2017-2021 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -24,22 +24,43 @@
#ifndef ARM_COMPUTE_CLTRANSPOSE_H
#define ARM_COMPUTE_CLTRANSPOSE_H
-#include "arm_compute/runtime/CL/ICLSimpleFunction.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/runtime/IFunction.h"
+
+#include <memory>
namespace arm_compute
{
+class CLCompileContext;
class ICLTensor;
+class ITensorInfo;
-/** Basic function to transpose a matrix on OpenCL. This function calls the following OpenCL kernel:
- *
- * -# @ref CLTransposeKernel
- *
- */
-class CLTranspose : public ICLSimpleFunction
+/** Basic function to execute an @ref opencl::kernels::ClTransposeKernel. */
+class CLTranspose : public IFunction
{
public:
+ /** Constructor */
+ CLTranspose();
+ /** Destructor */
+ ~CLTranspose();
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ CLTranspose(const CLTranspose &) = delete;
+ /** Default move constructor */
+ CLTranspose(CLTranspose &&) = default;
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ CLTranspose &operator=(const CLTranspose &) = delete;
+ /** Default move assignment operator */
+ CLTranspose &operator=(CLTranspose &&) = default;
/** Initialise the kernel's inputs and output
*
+ * Valid data layouts:
+ * - All
+ *
+ * Valid data type configurations:
+ * |src |dst |
+ * |:--------------|:--------------|
+ * |All |All |
+ *
* @param[in] input Input tensor. Data types supported: All.
* @param[out] output Output tensor. Data type supported: Same as @p input
*/
@@ -59,7 +80,14 @@ public:
* @return a status
*/
static Status validate(const ITensorInfo *input, const ITensorInfo *output);
+
+ // Inherited methods overridden:
+ void run() override;
+
+private:
+ struct Impl;
+ std::unique_ptr<Impl> _impl;
};
-}
+} // namespace arm_compute
#endif /* ARM_COMPUTE_CLTRANSPOSE_H */
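CLTranspose now keeps its state behind a pimpl, so the function object must outlive run(); the call pattern itself is unchanged. A sketch with illustrative shapes:

    #include "arm_compute/core/TensorInfo.h"
    #include "arm_compute/runtime/CL/CLTensor.h"
    #include "arm_compute/runtime/CL/functions/CLTranspose.h"

    void transpose_sketch()
    {
        using namespace arm_compute;
        CLTensor src, dst;
        src.allocator()->init(TensorInfo(TensorShape(8U, 2U), 1, DataType::F32));

        CLTranspose transpose;
        transpose.configure(&src, &dst); // dst becomes 2x8

        src.allocator()->allocate();
        dst.allocator()->allocate();
        transpose.run();
    }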
diff --git a/arm_compute/runtime/CL/functions/CLUnstack.h b/arm_compute/runtime/CL/functions/CLUnstack.h
index 777da692be..a6eee43177 100644
--- a/arm_compute/runtime/CL/functions/CLUnstack.h
+++ b/arm_compute/runtime/CL/functions/CLUnstack.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018-2020 ARM Limited.
+ * Copyright (c) 2018-2021 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -26,9 +26,8 @@
#include "arm_compute/core/CL/OpenCL.h"
#include "arm_compute/core/Types.h"
-#include "arm_compute/runtime/IFunction.h"
-
#include "arm_compute/runtime/CL/functions/CLStridedSlice.h"
+#include "arm_compute/runtime/IFunction.h"
#include <memory>
@@ -48,8 +47,16 @@ public:
CLUnstack();
/** Set the input, output and unstacking axis.
*
- * @param[in] input A tensor to be unstacked. Data type supported: U8/S8/QASYMM8/U16/S16/U32/S32/F16/F32.
- * @param[in,out] output_vector A vector of tensors. Data types supported: Same as @p input.
+ * Valid data layouts:
+ * - All
+ *
+ * Valid data type configurations:
+ * |src |dst |
+ * |:--------------|:--------------|
+ * |All |All |
+ *
+ * @param[in] input A tensor to be unstacked. Data type supported: All.
+ * @param[in,out] output_vector A vector of tensors. Data types supported: same as @p input.
* Note: The number of elements of the vector will be used as the number of slices to be taken from the axis.
* @param[in] axis The axis to unstack along. Valid values are [-R,R) where R is the input's rank. Negative values wrap around.
*
@@ -58,17 +65,20 @@ public:
/** Set the input, output and unstacking axis.
*
* @param[in] compile_context The compile context to be used.
- * @param[in] input A tensor to be unstacked. Data type supported: U8/S8/QASYMM8/U16/S16/U32/S32/F16/F32.
- * @param[in,out] output_vector A vector of tensors. Data types supported: Same as @p input.
+ * @param[in] input A tensor to be unstacked. Data type supported: All.
+ * @param[in,out] output_vector A vector of tensors. Data types supported: same as @p input.
* Note: The number of elements of the vector will be used as the number of slices to be taken from the axis.
* @param[in] axis The axis to unstack along. Valid values are [-R,R) where R is the input's rank. Negative values wrap around.
*
*/
- void configure(const CLCompileContext &compile_context, const ICLTensor *input, const std::vector<ICLTensor *> &output_vector, int axis);
+ void configure(const CLCompileContext &compile_context,
+ const ICLTensor *input,
+ const std::vector<ICLTensor *> &output_vector,
+ int axis);
/** Static function to check if given info will lead to a valid configuration of @ref CLUnstack
*
- * @param[in] input Input tensor info. Data type supported: U8/S8/QASYMM8/U16/S16/U32/S32/F16/F32
- * @param[in] output_vector Vector of output tensors' info. Data types supported: Same as @p input.
+ * @param[in] input Input tensor info. Data type supported: All.
+ * @param[in] output_vector Vector of output tensors' info. Data types supported: same as @p input.
* @param[in] axis The axis to unstack along. Valid values are [-R,R) where R is the input's rank. Negative values wrap around.
*
* @return a status
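A sketch of the widened CLUnstack interface above: the size of the output vector fixes how many slices are taken along the chosen axis. The axis and shapes are illustrative.

    #include "arm_compute/core/TensorInfo.h"
    #include "arm_compute/runtime/CL/CLTensor.h"
    #include "arm_compute/runtime/CL/functions/CLUnstack.h"

    #include <vector>

    void unstack_sketch()
    {
        using namespace arm_compute;
        CLTensor src, s0, s1;
        src.allocator()->init(TensorInfo(TensorShape(4U, 2U), 1, DataType::F32));

        std::vector<ICLTensor *> slices{ &s0, &s1 }; // two slices requested
        CLUnstack unstack;
        unstack.configure(&src, slices, 1); // unstack along dimension 1 (size 2)

        src.allocator()->allocate();
        s0.allocator()->allocate();
        s1.allocator()->allocate();
        unstack.run();
    }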
diff --git a/arm_compute/runtime/CL/functions/CLUpsampleLayer.h b/arm_compute/runtime/CL/functions/CLUpsampleLayer.h
deleted file mode 100644
index 5f4f57f824..0000000000
--- a/arm_compute/runtime/CL/functions/CLUpsampleLayer.h
+++ /dev/null
@@ -1,93 +0,0 @@
-/*
- * Copyright (c) 2018-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CLUPSAMPLELAYER_H
-#define ARM_COMPUTE_CLUPSAMPLELAYER_H
-
-#include "arm_compute/runtime/IFunction.h"
-
-#include "arm_compute/core/CL/kernels/CLUpsampleLayerKernel.h"
-#include "arm_compute/core/Types.h"
-#include "arm_compute/runtime/IFunction.h"
-
-namespace arm_compute
-{
-class ICLTensor;
-
-/** Basic function to run @ref CLUpsampleLayerKernel */
-class CLUpsampleLayer : public IFunction
-{
-public:
- /** Default constructor */
- CLUpsampleLayer();
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- CLUpsampleLayer(const CLUpsampleLayer &) = delete;
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- CLUpsampleLayer &operator=(const CLUpsampleLayer &) = delete;
- /** Allow instances of this class to be moved */
- CLUpsampleLayer(CLUpsampleLayer &&) = default;
- /** Allow instances of this class to be moved */
- CLUpsampleLayer &operator=(CLUpsampleLayer &&) = default;
- /** Default destructor */
- virtual ~CLUpsampleLayer() = default;
-
- /** Initialize the function's source, destination, interpolation type and border_mode.
- *
- * @param[in] input Source tensor. Data type supported: QASYMM8/QASYMM8_SIGNED/F16/F32.
- * @param[out] output Destination tensor. Data types supported: same as @p input.
- * @param[in] info Contains stride information described in @ref Size2D.
- * @param[in] upsampling_policy Defines the policy to fill the intermediate pixels.
- */
- void configure(ICLTensor *input, ICLTensor *output,
- const Size2D &info, const InterpolationPolicy upsampling_policy);
- /** Initialize the function's source, destination, interpolation type and border_mode.
- *
- * @param[in] compile_context The compile context to be used.
- * @param[in] input Source tensor. Data type supported: QASYMM8/QASYMM8_SIGNED/F16/F32.
- * @param[out] output Destination tensor. Data types supported: same as @p input.
- * @param[in] info Contains stride information described in @ref Size2D.
- * @param[in] upsampling_policy Defines the policy to fill the intermediate pixels.
- */
- void configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output,
- const Size2D &info, const InterpolationPolicy upsampling_policy);
- /** Static function to check if given info will lead to a valid configuration of @ref CLDeconvolutionLayerUpsample
- *
- * @param[in] input Source tensor info. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32.
- * @param[in] output Destination tensor info. Data types supported: same as @p input.
- * @param[in] info Contains stride information described in @ref Size2D.
- * @param[in] upsampling_policy Defines the policy to fill the intermediate pixels.
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *input, const ITensorInfo *output,
- const Size2D &info, const InterpolationPolicy upsampling_policy);
-
- // Inherited methods overridden:
- void run() override;
-
-private:
- CLUpsampleLayerKernel _upsample;
- ICLTensor *_output;
-};
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_CLUPSAMPLELAYER_H */
diff --git a/arm_compute/runtime/CL/functions/CLWarpAffine.h b/arm_compute/runtime/CL/functions/CLWarpAffine.h
deleted file mode 100644
index 1a2fe9d4d5..0000000000
--- a/arm_compute/runtime/CL/functions/CLWarpAffine.h
+++ /dev/null
@@ -1,66 +0,0 @@
-/*
- * Copyright (c) 2016-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CLWARPAFFINE_H
-#define ARM_COMPUTE_CLWARPAFFINE_H
-
-#include "arm_compute/core/Types.h"
-#include "arm_compute/runtime/CL/ICLSimpleFunction.h"
-
-#include <cstdint>
-
-namespace arm_compute
-{
-class ICLTensor;
-
-/** Basic function to run @ref CLWarpAffineKernel for AFFINE transformation */
-class CLWarpAffine : public ICLSimpleFunction
-{
-public:
- /** Initialize the function's source, destination, interpolation policy and border_mode.
- *
- * @param[in,out] input                 Source tensor. Data types supported: U8. (Written to only for @p border_mode != UNDEFINED)
- * @param[out]    output                Destination tensor. Data types supported: U8.
- * @param[in]     matrix                The affine matrix. Must be 2x3 of type float.
- *                                      The matrix argument requires 9 values; the last 3 values are ignored.
- * @param[in] policy The interpolation type.
- * @param[in] border_mode Strategy to use for borders.
- * @param[in] constant_border_value (Optional) Constant value to use for borders if border_mode is set to CONSTANT.
- */
- void configure(ICLTensor *input, ICLTensor *output, const std::array<float, 9> &matrix, InterpolationPolicy policy, BorderMode border_mode, uint8_t constant_border_value = 0);
- /** Initialize the function's source, destination, interpolation policy and border_mode.
- *
- * @param[in] compile_context The compile context to be used.
- * @param[in,out] input                 Source tensor. Data types supported: U8. (Written to only for @p border_mode != UNDEFINED)
- * @param[out]    output                Destination tensor. Data types supported: U8.
- * @param[in]     matrix                The affine matrix. Must be 2x3 of type float.
- *                                      The matrix argument requires 9 values; the last 3 values are ignored.
- * @param[in] policy The interpolation type.
- * @param[in] border_mode Strategy to use for borders.
- * @param[in] constant_border_value (Optional) Constant value to use for borders if border_mode is set to CONSTANT.
- */
- void configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, const std::array<float, 9> &matrix, InterpolationPolicy policy, BorderMode border_mode,
- uint8_t constant_border_value = 0);
-};
-}
-#endif /*ARM_COMPUTE_CLWARPAFFINE_H */
diff --git a/arm_compute/runtime/CL/functions/CLWarpPerspective.h b/arm_compute/runtime/CL/functions/CLWarpPerspective.h
deleted file mode 100644
index 5db9ec4cf0..0000000000
--- a/arm_compute/runtime/CL/functions/CLWarpPerspective.h
+++ /dev/null
@@ -1,64 +0,0 @@
-/*
- * Copyright (c) 2016-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CLWARPPERSPECTIVE_H
-#define ARM_COMPUTE_CLWARPPERSPECTIVE_H
-
-#include "arm_compute/core/Types.h"
-#include "arm_compute/runtime/CL/ICLSimpleFunction.h"
-
-#include <cstdint>
-
-namespace arm_compute
-{
-class ICLTensor;
-
-/** Basic function to run @ref CLWarpPerspectiveKernel for PERSPECTIVE transformation */
-class CLWarpPerspective : public ICLSimpleFunction
-{
-public:
- /** Initialize the function's source, destination, interpolation policy and border_mode.
- *
- * @param[in,out] input Source tensor. Data types supported: U8. (Written to only for @p border_mode != UNDEFINED)
- * @param[out] output Destination tensor. Data types supported: U8.
- * @param[in] matrix The perspective matrix. Must be 3x3 of type float.
- * @param[in] policy The interpolation type.
- * @param[in] border_mode Strategy to use for borders.
- * @param[in] constant_border_value (Optional) Constant value to use for borders if border_mode is set to CONSTANT.
- */
- void configure(ICLTensor *input, ICLTensor *output, const std::array<float, 9> &matrix, InterpolationPolicy policy, BorderMode border_mode, uint8_t constant_border_value = 0);
- /** Initialize the function's source, destination, interpolation policy and border_mode.
- *
- * @param[in] compile_context The compile context to be used.
- * @param[in,out] input Source tensor. Data types supported: U8. (Written to only for @p border_mode != UNDEFINED)
- * @param[out] output Destination tensor. Data types supported: U8.
- * @param[in] matrix The perspective matrix. Must be 3x3 of type float.
- * @param[in] policy The interpolation type.
- * @param[in] border_mode Strategy to use for borders.
- * @param[in] constant_border_value (Optional) Constant value to use for borders if border_mode is set to CONSTANT.
- */
- void configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, const std::array<float, 9> &matrix, InterpolationPolicy policy, BorderMode border_mode,
- uint8_t constant_border_value = 0);
-};
-}
-#endif /*ARM_COMPUTE_CLWARPPERSPECTIVE_H */
diff --git a/arm_compute/runtime/CL/functions/CLWinogradConvolutionLayer.h b/arm_compute/runtime/CL/functions/CLWinogradConvolutionLayer.h
index c1de5f15ce..efea9a1550 100644
--- a/arm_compute/runtime/CL/functions/CLWinogradConvolutionLayer.h
+++ b/arm_compute/runtime/CL/functions/CLWinogradConvolutionLayer.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018-2020 ARM Limited.
+ * Copyright (c) 2018-2021, 2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -24,30 +24,31 @@
#ifndef ARM_COMPUTE_CLWINOGRADCONVOLUTIONLAYER_H
#define ARM_COMPUTE_CLWINOGRADCONVOLUTIONLAYER_H
-#include "arm_compute/core/CL/kernels/CLWinogradFilterTransformKernel.h"
-#include "arm_compute/core/CL/kernels/CLWinogradOutputTransformKernel.h"
#include "arm_compute/core/Types.h"
-#include "arm_compute/runtime/CL/functions/CLGEMM.h"
-#include "arm_compute/runtime/CL/functions/CLWinogradInputTransform.h"
+#include "arm_compute/function_info/ActivationLayerInfo.h"
#include "arm_compute/runtime/IFunction.h"
+#include "arm_compute/runtime/IMemoryManager.h"
+
+#include <memory>
namespace arm_compute
{
+class CLCompileContext;
class ICLTensor;
+class ITensorInfo;
/** Basic function to execute Winograd-based convolution on OpenCL. This function calls the following OpenCL functions/kernels:
*
- * -# @ref CLWinogradInputTransform
- * -# @ref CLWinogradFilterTransformKernel (only once)
- * -# @ref CLGEMM
- * -# @ref CLWinogradOutputTransformKernel
+ * -# @ref opencl::ClWinogradConv2d
*
*/
class CLWinogradConvolutionLayer : public IFunction
{
public:
- /** Default constructor */
+ /** Default Constructor */
CLWinogradConvolutionLayer(std::shared_ptr<IMemoryManager> memory_manager = nullptr);
+ /** Default Destructor */
+ ~CLWinogradConvolutionLayer();
/** Prevent instances of this class from being copied (As this class contains pointers) */
CLWinogradConvolutionLayer(const CLWinogradConvolutionLayer &) = delete;
/** Default move constructor */
@@ -58,6 +59,16 @@ public:
CLWinogradConvolutionLayer &operator=(CLWinogradConvolutionLayer &&) = default;
/** Set the input and output tensors.
*
+ * Valid data layouts:
+ * - NHWC
+ * - NCHW
+ *
+ * Valid data type configurations:
+ * |src0 |src1 |src2 |dst |
+ * |:--------------|:--------------|:------|:--------------|
+ * |F16 |F16 |F16 |F16 |
+ * |F32 |F32 |F32 |F32 |
+ *
* @note: This function only works with 3x3,3x1,1x3,5x5,5x1,1x5,7x1 and 1x7 kernels along with unit strides for both NCHW and NHWC data layout
* @note Some Winograd configurations (i.e. F(4x4, 5x5)) are supported only with enable_fast_math = true
*
@@ -73,8 +84,13 @@ public:
 * @param[in]  enable_fast_math (Optional) Enable fast math computation. If this flag is set, the function may dispatch the fastest implementation
 *                              available, which can introduce a drop in accuracy. Default is false
*/
- void configure(ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const PadStrideInfo &conv_info,
- const ActivationLayerInfo &act_info = ActivationLayerInfo(), bool enable_fast_math = false);
+ void configure(ICLTensor *input,
+ const ICLTensor *weights,
+ const ICLTensor *biases,
+ ICLTensor *output,
+ const PadStrideInfo &conv_info,
+ const ActivationLayerInfo &act_info = ActivationLayerInfo(),
+ bool enable_fast_math = false);
/** Set the input and output tensors.
*
* @note: This function only works with 3x3,3x1,1x3,5x5,5x1,1x5,7x1 and 1x7 kernels along with unit strides for both NCHW and NHWC data layout
@@ -93,8 +109,14 @@ public:
 * @param[in]  enable_fast_math (Optional) Enable fast math computation. If this flag is set, the function may dispatch the fastest implementation
 *                              available, which can introduce a drop in accuracy. Default is false
*/
- void configure(const CLCompileContext &compile_context, ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const PadStrideInfo &conv_info,
- const ActivationLayerInfo &act_info = ActivationLayerInfo(), bool enable_fast_math = false);
+ void configure(const CLCompileContext &compile_context,
+ ICLTensor *input,
+ const ICLTensor *weights,
+ const ICLTensor *biases,
+ ICLTensor *output,
+ const PadStrideInfo &conv_info,
+ const ActivationLayerInfo &act_info = ActivationLayerInfo(),
+ bool enable_fast_math = false);
/** Static function to check if given info will lead to a valid configuration of @ref CLWinogradConvolutionLayer
*
* @note: This function only works with 3x3,3x1,1x3,5x5,5x1 and 1x5 kernels along with unit strides for both NCHW and NHWC data layout
@@ -114,24 +136,21 @@ public:
*
* @return a status
*/
- static Status validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info,
- const ActivationLayerInfo &act_info = ActivationLayerInfo(), bool enable_fast_math = false);
+ static Status validate(const ITensorInfo *input,
+ const ITensorInfo *weights,
+ const ITensorInfo *biases,
+ const ITensorInfo *output,
+ const PadStrideInfo &conv_info,
+ const ActivationLayerInfo &act_info = ActivationLayerInfo(),
+ bool enable_fast_math = false);
// Inherited methods overridden:
void run() override;
void prepare() override;
private:
- MemoryGroup _memory_group;
- CLGEMM _batched_mm;
- CLWinogradInputTransform _input_transform;
- CLWinogradFilterTransformKernel _filter_transform;
- CLWinogradOutputTransformKernel _output_transform;
- CLTensor _input0;
- CLTensor _input1;
- CLTensor _batched_mm_output;
- const ICLTensor *_original_weights;
- bool _is_prepared;
+ struct Impl;
+ std::unique_ptr<Impl> _impl;
};
} // namespace arm_compute
#endif /* ARM_COMPUTE_CLWINOGRADCONVOLUTIONLAYER_H */
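With the internals delegated to opencl::ClWinogradConv2d, the public call pattern is unchanged. A sketch using a 3x3 kernel with unit strides, per the notes above; shapes are illustrative and validate() is checked first as usual.

    #include "arm_compute/core/TensorInfo.h"
    #include "arm_compute/runtime/CL/CLScheduler.h"
    #include "arm_compute/runtime/CL/CLTensor.h"
    #include "arm_compute/runtime/CL/functions/CLWinogradConvolutionLayer.h"

    void winograd_sketch()
    {
        using namespace arm_compute;
        CLScheduler::get().default_init();

        CLTensor src, weights, biases, dst;
        src.allocator()->init(TensorInfo(TensorShape(16U, 16U, 8U), 1, DataType::F32));        // WxHxC
        weights.allocator()->init(TensorInfo(TensorShape(3U, 3U, 8U, 16U), 1, DataType::F32)); // 3x3, 8 in, 16 out
        biases.allocator()->init(TensorInfo(TensorShape(16U), 1, DataType::F32));
        dst.allocator()->init(TensorInfo(TensorShape(16U, 16U, 16U), 1, DataType::F32));

        const PadStrideInfo conv_info(1, 1, 1, 1); // unit strides, 'same' padding for 3x3

        const Status status = CLWinogradConvolutionLayer::validate(
            src.info(), weights.info(), biases.info(), dst.info(), conv_info);
        if(status.error_code() == ErrorCode::OK)
        {
            CLWinogradConvolutionLayer conv;
            conv.configure(&src, &weights, &biases, &dst, conv_info,
                           ActivationLayerInfo(), /*enable_fast_math=*/false);
            src.allocator()->allocate();
            weights.allocator()->allocate();
            biases.allocator()->allocate();
            dst.allocator()->allocate();
            conv.run();
            CLScheduler::get().sync();
        }
    }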
diff --git a/arm_compute/runtime/CL/functions/CLWinogradInputTransform.h b/arm_compute/runtime/CL/functions/CLWinogradInputTransform.h
deleted file mode 100644
index 11a402e51d..0000000000
--- a/arm_compute/runtime/CL/functions/CLWinogradInputTransform.h
+++ /dev/null
@@ -1,99 +0,0 @@
-/*
- * Copyright (c) 2018-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CLWINOGRADINPUTTRANSFORM_H
-#define ARM_COMPUTE_CLWINOGRADINPUTTRANSFORM_H
-
-#include "arm_compute/core/Types.h"
-#include "arm_compute/runtime/CL/ICLSimpleFunction.h"
-
-#include <cstdint>
-
-namespace arm_compute
-{
-class ICLTensor;
-
-/** Basic function to execute a @ref CLWinogradInputTransformKernel. */
-class CLWinogradInputTransform : public ICLSimpleFunction
-{
-public:
- /** Set the input and output tensors.
- *
- * @note Winograd input transform supports the following configurations for NCHW data layout
- * F(output tile, kernel size):F(2x2, 3x3), F(2x1, 3x1), F(1x2, 1x3),
- * F(4x4, 3x3), F(4x1, 3x1), F(1x4, 1x3),
- * F(4x4, 5x5), F(4x1, 5x1), F(1x4, 1x5)
- *
- * @note Winograd input transform supports the following configurations for NHWC data layout
- * F(output tile, kernel size):F(4x4, 3x3), F(4x1, 3x1), F(1x4, 1x3),
- * F(4x4, 5x5), F(4x1, 5x1), F(1x4, 1x5)
- *
- * Strides: only unit strides
- *
- * @param[in] input The input tensor to transform. Data types supported: F16,F32
- * @param[out] output        The output tensor. The shape for this tensor can be calculated using the utility function @p compute_winograd_input_transform_shape. Data types supported: Same as @p input
- * @param[in] winograd_info Contains Winograd's information described in @ref WinogradInfo.
- */
- void configure(ICLTensor *input, ICLTensor *output, const WinogradInfo &winograd_info);
- /** Set the input and output tensors.
- *
- * @note Winograd input transform supports the following configurations for NCHW data layout
- * F(output tile, kernel size):F(2x2, 3x3), F(2x1, 3x1), F(1x2, 1x3),
- * F(4x4, 3x3), F(4x1, 3x1), F(1x4, 1x3),
- * F(4x4, 5x5), F(4x1, 5x1), F(1x4, 1x5)
- *
- * @note Winograd input transform supports the following configurations for NHWC data layout
- * F(output tile, kernel size):F(4x4, 3x3), F(4x1, 3x1), F(1x4, 1x3),
- * F(4x4, 5x5), F(4x1, 5x1), F(1x4, 1x5)
- *
- * Strides: only unit strides
- *
- * @param[in] compile_context The compile context to be used.
- * @param[in] input The input tensor to transform. Data types supported: F16,F32
- * @param[out] output          The output tensor. The shape for this tensor can be calculated using the utility function @p compute_winograd_input_transform_shape. Data types supported: Same as @p input
- * @param[in] winograd_info Contains Winograd's information described in @ref WinogradInfo.
- */
- void configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, const WinogradInfo &winograd_info);
- /** Static function to check if given info will lead to a valid configuration of @ref CLWinogradInputTransform.
- *
- * @note Winograd input transform supports the following configurations for NCHW data layout
- * F(output tile, kernel size):F(2x2, 3x3), F(2x1, 3x1), F(1x2, 1x3),
- * F(4x4, 3x3), F(4x1, 3x1), F(1x4, 1x3),
- * F(4x4, 5x5), F(4x1, 5x1), F(1x4, 1x5)
- *
- * @note Winograd input transform supports the following configurations for NHWC data layout
- * F(output tile, kernel size):F(4x4, 3x3), F(4x1, 3x1), F(1x4, 1x3),
- * F(4x4, 5x5), F(4x1, 5x1), F(1x4, 1x5)
- *
- * Strides: only unit strides
- *
- * @param[in] input The input tensor to transform. Data types supported: F16,F32
- * @param[in] output The output tensor. The shape for this tensor can be calculated using the utility function @p compute_winograd_input_transform_shape. Data types supported: Same as @p input
- * @param[in] winograd_info Contains Winograd's information described in @ref WinogradInfo.
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *input, const ITensorInfo *output, const WinogradInfo &winograd_info);
-};
-}
-#endif /*ARM_COMPUTE_CLWINOGRADINPUTTRANSFORM_H */
diff --git a/arm_compute/runtime/CL/functions/CLYOLOLayer.h b/arm_compute/runtime/CL/functions/CLYOLOLayer.h
deleted file mode 100644
index e70d84b97e..0000000000
--- a/arm_compute/runtime/CL/functions/CLYOLOLayer.h
+++ /dev/null
@@ -1,81 +0,0 @@
-/*
- * Copyright (c) 2018-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CLYOLOLAYER_H
-#define ARM_COMPUTE_CLYOLOLAYER_H
-
-#include "arm_compute/runtime/CL/ICLSimpleFunction.h"
-
-#include "arm_compute/core/Types.h"
-
-namespace arm_compute
-{
-class ICLTensor;
-
-/** Basic function to run @ref CLYOLOLayerKernel that performs a partial activation on the input
- *
- * For each box, activate only:
- * - x and y position (channel 0 and 1 of each box)
- * - objectiveness (channel 4 of each box)
- * - classes (channel 5 to (classes + 5) of each box)
- */
-class CLYOLOLayer : public ICLSimpleFunction
-{
-public:
- /** Set the input and output tensor.
- *
- * @note If the output tensor is a nullptr or is equal to the input, the activation function will be performed in-place
- *
- * @param[in, out] input Source tensor. In case of @p output tensor = nullptr, this tensor will store the result
- * of the activation function. Data types supported: F16/F32.
- * @param[out] output Destination tensor. Data type supported: same as @p input
- * @param[in] act_info Activation layer parameters.
- * @param[in]      num_classes Number of classes to activate (must be a submultiple of @p input channels)
- */
- void configure(ICLTensor *input, ICLTensor *output, const ActivationLayerInfo &act_info, int32_t num_classes);
- /** Set the input and output tensor.
- *
- * @note If the output tensor is a nullptr or is equal to the input, the activation function will be performed in-place
- *
- * @param[in] compile_context The compile context to be used.
- * @param[in, out] input Source tensor. In case of @p output tensor = nullptr, this tensor will store the result
- * of the activation function. Data types supported: F16/F32.
- * @param[out] output Destination tensor. Data type supported: same as @p input
- * @param[in] act_info Activation layer parameters.
- * @param[in]      num_classes     Number of classes to activate (must be a submultiple of @p input channels)
- */
- void configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, const ActivationLayerInfo &act_info, int32_t num_classes);
- /** Static function to check if given info will lead to a valid configuration of @ref CLYOLOLayer
- *
- * @param[in] input Source tensor info. In case of @p output tensor info = nullptr, this tensor will store the result
- * of the activation function. Data types supported: F16/F32.
- * @param[in] output Destination tensor info. Data type supported: same as @p input
- * @param[in] act_info Activation layer information.
- * @param[in] num_classes Number of classes to activate (must be a submultiple of @p input channels)
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *input, const ITensorInfo *output, const ActivationLayerInfo &act_info, int32_t num_classes);
-};
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_CLYOLOLAYER_H */
diff --git a/arm_compute/runtime/CL/gemm/CLGEMMKernelSelection.h b/arm_compute/runtime/CL/gemm/CLGEMMKernelSelection.h
deleted file mode 100644
index 47d787445e..0000000000
--- a/arm_compute/runtime/CL/gemm/CLGEMMKernelSelection.h
+++ /dev/null
@@ -1,65 +0,0 @@
-/*
- * Copyright (c) 2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CLGEMMKERNELSELECTION_H
-#define ARM_COMPUTE_CLGEMMKERNELSELECTION_H
-
-#include "arm_compute/runtime/CL/ICLGEMMKernelSelection.h"
-#include "arm_compute/runtime/CL/gemm/CLGEMMKernelSelectionBifrost.h"
-#include "arm_compute/runtime/CL/gemm/CLGEMMKernelSelectionMidgard.h"
-#include "arm_compute/runtime/CL/gemm/CLGEMMKernelSelectionValhall.h"
-
-#include <memory>
-
-namespace arm_compute
-{
-namespace cl_gemm
-{
-/** CLGEMMKernelSelection factory class */
-class CLGEMMKernelSelectionFactory final
-{
-public:
- /** Static method to select the GEMM kernel accordingly with the GPU target and GEMM's dimensionality
- *
- * @param[in] gpu GPU target
- *
- * @return CLGEMMKernelSelection class
- */
- static std::unique_ptr<ICLGEMMKernelSelection> create(GPUTarget gpu)
- {
- switch(get_arch_from_target(gpu))
- {
- case GPUTarget::MIDGARD:
- return support::cpp14::make_unique<CLGEMMKernelSelectionMidgard>(gpu);
- case GPUTarget::BIFROST:
- return support::cpp14::make_unique<CLGEMMKernelSelectionBifrost>(gpu);
- case GPUTarget::VALHALL:
- return support::cpp14::make_unique<CLGEMMKernelSelectionValhall>(gpu);
- default:
- ARM_COMPUTE_ERROR("Not supported GPU target");
- }
- }
-};
-} // namespace cl_gemm
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_CLGEMMKERNELSELECTION_H */
diff --git a/arm_compute/runtime/CL/gemm/CLGEMMKernelSelectionBifrost.h b/arm_compute/runtime/CL/gemm/CLGEMMKernelSelectionBifrost.h
deleted file mode 100644
index 04906e317f..0000000000
--- a/arm_compute/runtime/CL/gemm/CLGEMMKernelSelectionBifrost.h
+++ /dev/null
@@ -1,55 +0,0 @@
-/*
- * Copyright (c) 2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CLGEMMKERNELSELECTIONBIFROST_H
-#define ARM_COMPUTE_CLGEMMKERNELSELECTIONBIFROST_H
-
-#include "arm_compute/runtime/CL/ICLGEMMKernelSelection.h"
-
-namespace arm_compute
-{
-namespace cl_gemm
-{
-/** Bifrost based OpenCL GEMMKernel selection */
-class CLGEMMKernelSelectionBifrost final : public ICLGEMMKernelSelection
-{
-public:
- /** Constructor
- *
- * @param[in] gpu GPU target
- */
- CLGEMMKernelSelectionBifrost(GPUTarget gpu);
-
- // Inherited overridden method
- CLGEMMKernelType select_kernel(const CLGEMMKernelSelectionParams &params) override;
-
-private:
- CLGEMMKernelType g76_f32(unsigned int m, unsigned int n, unsigned int k, bool is_rhs_constant);
- CLGEMMKernelType g71_f16(unsigned int m, unsigned int n, unsigned int k, bool is_rhs_constant);
- CLGEMMKernelType default_f32(unsigned int m, unsigned int n, unsigned int k, bool is_rhs_constant);
- CLGEMMKernelType default_f16(unsigned int m, unsigned int n, unsigned int k, bool is_rhs_constant);
- CLGEMMKernelType default_q8(unsigned int m, unsigned int n, unsigned int k, bool is_rhs_constant);
-};
-} // namespace cl_gemm
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_CLGEMMKERNELSELECTIONBIFROST_H */
diff --git a/arm_compute/runtime/CL/gemm/CLGEMMKernelSelectionMidgard.h b/arm_compute/runtime/CL/gemm/CLGEMMKernelSelectionMidgard.h
deleted file mode 100644
index c0bc4fc507..0000000000
--- a/arm_compute/runtime/CL/gemm/CLGEMMKernelSelectionMidgard.h
+++ /dev/null
@@ -1,53 +0,0 @@
-/*
- * Copyright (c) 2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CLGEMMKERNELSELECTIONMIDGARD_H
-#define ARM_COMPUTE_CLGEMMKERNELSELECTIONMIDGARD_H
-
-#include "arm_compute/runtime/CL/ICLGEMMKernelSelection.h"
-
-namespace arm_compute
-{
-namespace cl_gemm
-{
-/** Midgard based OpenCL GEMMKernel selection */
-class CLGEMMKernelSelectionMidgard final : public ICLGEMMKernelSelection
-{
-public:
- /** Constructor
- *
- * @param[in] gpu GPU target
- */
- CLGEMMKernelSelectionMidgard(GPUTarget gpu);
-
- // Inherited overridden method
- CLGEMMKernelType select_kernel(const CLGEMMKernelSelectionParams &params) override;
-
-private:
- CLGEMMKernelType default_f32(unsigned int m, unsigned int n, unsigned int k, bool is_rhs_constant);
- CLGEMMKernelType default_f16(unsigned int m, unsigned int n, unsigned int k, bool is_rhs_constant);
- CLGEMMKernelType default_q8(unsigned int m, unsigned int n, unsigned int k, bool is_rhs_constant);
-};
-} // namespace cl_gemm
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_CLGEMMKERNELSELECTIONMIDGARD_H */
diff --git a/arm_compute/runtime/CL/tuners/BifrostTuner.h b/arm_compute/runtime/CL/tuners/BifrostTuner.h
deleted file mode 100644
index 06f76cb459..0000000000
--- a/arm_compute/runtime/CL/tuners/BifrostTuner.h
+++ /dev/null
@@ -1,43 +0,0 @@
-/*
- * Copyright (c) 2018-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_TUNERS_BIFROST_TUNER_H
-#define ARM_COMPUTE_TUNERS_BIFROST_TUNER_H
-
-#include "arm_compute/runtime/CL/ICLTuner.h"
-
-namespace arm_compute
-{
-namespace tuners
-{
-/** Bifrost based OpenCL tuner implementation */
-class BifrostTuner final : public ICLTuner
-{
-public:
- // Inherited overriden methods
- void tune_kernel_static(ICLKernel &kernel) override;
- void tune_kernel_dynamic(ICLKernel &kernel) override;
-};
-} // namespace tuners
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_TUNERS_BIFROST_TUNER_H */
diff --git a/arm_compute/runtime/CL/tuners/CLLWSList.h b/arm_compute/runtime/CL/tuners/CLLWSList.h
deleted file mode 100644
index 61a1d793fd..0000000000
--- a/arm_compute/runtime/CL/tuners/CLLWSList.h
+++ /dev/null
@@ -1,213 +0,0 @@
-/*
- * Copyright (c) 2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CL_LWS_LIST_H
-#define ARM_COMPUTE_CL_LWS_LIST_H
-
-#include "arm_compute/core/CL/OpenCL.h"
-#include "arm_compute/core/Error.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/runtime/CL/CLTunerTypes.h"
-#include "support/ToolchainSupport.h"
-#include <memory>
-
-namespace arm_compute
-{
-namespace cl_tuner
-{
-constexpr unsigned int max_lws_supported_x{ 64u };
-constexpr unsigned int max_lws_supported_y{ 32u };
-constexpr unsigned int max_lws_supported_z{ 32u };
-
-/** Interface for LWS lists */
-class ICLLWSList
-{
-public:
- /** Constructor */
- ICLLWSList() = default;
- /** Copy Constructor */
- ICLLWSList(const ICLLWSList &) = default;
- /** Move Constructor */
- ICLLWSList(ICLLWSList &&) noexcept(true) = default;
- /** Assignment */
- ICLLWSList &operator=(const ICLLWSList &) = default;
- /** Move Assignment */
- ICLLWSList &operator=(ICLLWSList &&) noexcept(true) = default;
- /** Destructor */
- virtual ~ICLLWSList() = default;
-
- /** Return the LWS value at the given index.
- *
- * @return LWS value at the given index
- */
- virtual cl::NDRange operator[](size_t) = 0;
-
- /** LWS list size.
- *
- * @return LWS list size
- */
- virtual size_t size() = 0;
-};
-
-/** Non instantiable base class for LWS combinations that use Index2Cooard mapping */
-class CLLWSList : public ICLLWSList
-{
-protected:
- /* Shape of 3-D search space */
- TensorShape search_space_shape{ 0, 0, 0 };
-
- /** Constructor */
- CLLWSList() = default;
- /** Copy Constructor */
- CLLWSList(const CLLWSList &) = default;
- /** Move Constructor */
- CLLWSList(CLLWSList &&) noexcept(true) = default;
- /** Assignment */
- CLLWSList &operator=(const CLLWSList &) = default;
- /** Move Assignment */
- CLLWSList &operator=(CLLWSList &&) noexcept(true) = default;
- /** Destructor */
- virtual ~CLLWSList() = default;
-
- // Inherited methods overridden:
- virtual size_t size() override;
-};
-
-/** Exhaustive list of all possible LWS values */
-class CLLWSListExhaustive : public CLLWSList
-{
-public:
- /** Prevent default constructor calls */
- CLLWSListExhaustive() = delete;
- /** Constructor */
- CLLWSListExhaustive(const cl::NDRange &gws);
- /** Copy Constructor */
- CLLWSListExhaustive(const CLLWSListExhaustive &) = default;
- /** Move Constructor */
- CLLWSListExhaustive(CLLWSListExhaustive &&) noexcept(true) = default;
- /** Assignment */
- CLLWSListExhaustive &operator=(const CLLWSListExhaustive &) = default;
- /** Move Assignment */
- CLLWSListExhaustive &operator=(CLLWSListExhaustive &&) noexcept(true) = default;
- /** Destructor */
- ~CLLWSListExhaustive() = default;
-
- // Inherited methods overridden:
- cl::NDRange operator[](size_t) override;
-};
-
-/** A subset of LWS values that are either factors of gws when gws[2] < 16 or power of 2 */
-class CLLWSListNormal : public CLLWSList
-{
-public:
- /** Constructor */
- CLLWSListNormal(const cl::NDRange &gws);
- /** Copy Constructor */
- CLLWSListNormal(const CLLWSListNormal &) = default;
- /** Move Constructor */
- CLLWSListNormal(CLLWSListNormal &&) noexcept(true) = default;
- /** Assignment */
- CLLWSListNormal &operator=(const CLLWSListNormal &) = default;
- /** Move Assignment */
- CLLWSListNormal &operator=(CLLWSListNormal &&) noexcept(true) = default;
- /** Destructor */
- ~CLLWSListNormal() = default;
-
- // Inherited methods overridden:
- cl::NDRange operator[](size_t) override;
-
-protected:
- std::vector<unsigned int> _lws_x{};
- std::vector<unsigned int> _lws_y{};
- std::vector<unsigned int> _lws_z{};
-
- /** Prevent default constructor calls */
- CLLWSListNormal() = default;
-
-private:
- /** Utility function used to initialize the LWS values to test.
- * Only the LWS values which are power of 2 or satisfy the modulo conditions with GWS are taken into account by the CLTuner
- *
- * @param[in, out] lws Vector of LWS to test
- * @param[in] gws Size of the specific GWS
- * @param[in] lws_max Max LWS value allowed to be tested
- * @param[in] mod_let_one True if the results of the modulo operation between gws and the lws can be less than one.
- */
- void initialize_lws_values(std::vector<unsigned int> &lws, unsigned int gws, unsigned int lws_max, bool mod_let_one);
-};
-
-/** A minimal subset of LWS values that only have 1,2 and 4/8 */
-class CLLWSListRapid : public CLLWSListNormal
-{
-public:
- /** Prevent default constructor calls */
- CLLWSListRapid() = delete;
- /** Constructor */
- CLLWSListRapid(const cl::NDRange &gws);
- /** Copy Constructor */
- CLLWSListRapid(const CLLWSListRapid &) = default;
- /** Move Constructor */
- CLLWSListRapid(CLLWSListRapid &&) noexcept(true) = default;
- /** Assignment */
- CLLWSListRapid &operator=(const CLLWSListRapid &) = default;
- /** Move Assignment */
- CLLWSListRapid &operator=(CLLWSListRapid &&) noexcept(true) = default;
- /** Destructor */
- virtual ~CLLWSListRapid() = default;
-
-private:
- /** Utility function used to initialize the LWS values to test.
- * Only the LWS values that have 1,2 and 4/8 for each dimension are taken into account by the CLTuner
- *
- * @param[in, out] lws Vector of LWS to test
- * @param[in] lws_max Max LWS value allowed to be tested
- */
- void initialize_lws_values(std::vector<unsigned int> &lws, unsigned int lws_max);
-};
-
-/** Factory to construct an ICLLWSList object based on the CL tuner mode */
-class CLLWSListFactory final
-{
-public:
- /** Construct an ICLLWSList object for the given tuner mode and gws configuration.
- *
- * @return unique_ptr to the requested ICLLWSList implementation.
- */
- static std::unique_ptr<ICLLWSList> get_lws_list(CLTunerMode mode, const cl::NDRange &gws)
- {
- switch(mode)
- {
- case CLTunerMode::EXHAUSTIVE:
- return arm_compute::support::cpp14::make_unique<CLLWSListExhaustive>(gws);
- case CLTunerMode::NORMAL:
- return arm_compute::support::cpp14::make_unique<CLLWSListNormal>(gws);
- case CLTunerMode::RAPID:
- return arm_compute::support::cpp14::make_unique<CLLWSListRapid>(gws);
- default:
- return nullptr;
- }
- }
-};
-} // namespace cl_tuner
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_CL_LWS_LIST_H */
diff --git a/arm_compute/runtime/CL/tuners/CLTuningParametersList.h b/arm_compute/runtime/CL/tuners/CLTuningParametersList.h
new file mode 100644
index 0000000000..5f6d12b4a7
--- /dev/null
+++ b/arm_compute/runtime/CL/tuners/CLTuningParametersList.h
@@ -0,0 +1,90 @@
+/*
+ * Copyright (c) 2020-2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CL_TUNINGPARAMETERS_LIST_H
+#define ARM_COMPUTE_CL_TUNINGPARAMETERS_LIST_H
+
+#include "arm_compute/core/CL/OpenCL.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/runtime/CL/CLTunerTypes.h"
+#include "arm_compute/runtime/CL/CLTuningParams.h"
+
+#include "support/ToolchainSupport.h"
+
+#include <memory>
+
+namespace arm_compute
+{
+namespace cl_tuner
+{
+/** Interface for Tuning Parameters lists
+ *
+ * A tuning parameters list contains the set of candidate tuning parameters to evaluate.
+ * There are 3 tuner modes, each using its own list:
+ * - Exhaustive tuner mode is the slowest to tune with, but finds the fastest tuning parameters
+ * - Normal tuner mode balances tuning time against the quality of the tuning parameters found
+ * - Rapid tuner mode tunes fastest, but the resulting tuning parameters may be suboptimal
+ *
+ */
+class ICLTuningParametersList
+{
+public:
+ /** Constructor */
+ ICLTuningParametersList() = default;
+ /** Copy Constructor */
+ ICLTuningParametersList(const ICLTuningParametersList &) = default;
+ /** Move Constructor */
+ ICLTuningParametersList(ICLTuningParametersList &&) noexcept(true) = default;
+ /** Assignment */
+ ICLTuningParametersList &operator=(const ICLTuningParametersList &) = default;
+ /** Move Assignment */
+ ICLTuningParametersList &operator=(ICLTuningParametersList &&) noexcept(true) = default;
+ /** Destructor */
+ virtual ~ICLTuningParametersList() = default;
+
+ /** Return the tuning parameter values at the given index.
+ *
+ * @return tuning parameter values at the given index
+ */
+ virtual CLTuningParams operator[](size_t) = 0;
+
+ /** Tuning parameters list size.
+ *
+ * @return Tuning parameters list size
+ */
+ virtual size_t size() = 0;
+};
+
+/** Construct an ICLTuningParametersList object for the given tuner mode and gws configuration.
+ *
+ * @param[in] tuning_info Tuning info containing which parameters to tune and the tuner mode
+ * @param[in] gws Global worksize values
+ *
+ * @return unique_ptr to the requested ICLTuningParametersList implementation.
+ */
+std::unique_ptr<ICLTuningParametersList> get_tuning_parameters_list(CLTuningInfo tuning_info, const cl::NDRange &gws);
+
+} // namespace cl_tuner
+} // namespace arm_compute
+#endif /*ARM_COMPUTE_CL_TUNINGPARAMETERS_LIST_H */
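For orientation, a minimal sketch of driving the factory declared above, assuming CLTuningInfo default-constructs and exposes a tuner_mode field (per CLTunerTypes.h); the GWS values are illustrative only:

#include "arm_compute/runtime/CL/tuners/CLTuningParametersList.h"

using namespace arm_compute;

void enumerate_tuning_candidates()
{
    const cl::NDRange gws(256, 16, 1);           // illustrative global worksize
    CLTuningInfo tuning_info{};                  // assumption: default-constructible
    tuning_info.tuner_mode = CLTunerMode::RAPID; // assumption: field name from CLTunerTypes.h
    auto list = cl_tuner::get_tuning_parameters_list(tuning_info, gws);
    for(size_t i = 0; i < list->size(); ++i)
    {
        CLTuningParams candidate = (*list)[i]; // one LWS (and optionally WBSM) combination
        (void)candidate;                       // a real tuner would benchmark the kernel with it
    }
}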
diff --git a/arm_compute/runtime/CL/tuners/MidgardTuner.h b/arm_compute/runtime/CL/tuners/MidgardTuner.h
deleted file mode 100644
index 2609f328ce..0000000000
--- a/arm_compute/runtime/CL/tuners/MidgardTuner.h
+++ /dev/null
@@ -1,43 +0,0 @@
-/*
- * Copyright (c) 2018-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_TUNERS_MIDGARD_TUNER_H
-#define ARM_COMPUTE_TUNERS_MIDGARD_TUNER_H
-
-#include "arm_compute/runtime/CL/ICLTuner.h"
-
-namespace arm_compute
-{
-namespace tuners
-{
-/** Midgard based OpenCL tuner implementation */
-class MidgardTuner final : public ICLTuner
-{
-public:
- // Inherited overriden methods
- void tune_kernel_static(ICLKernel &kernel) override;
- void tune_kernel_dynamic(ICLKernel &kernel) override;
-};
-} // namespace tuners
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_TUNERS_MIDGARD_TUNER_H */
diff --git a/arm_compute/runtime/CPP/CPPFunctions.h b/arm_compute/runtime/CPP/CPPFunctions.h
index c196381ce0..07c1c7b340 100644
--- a/arm_compute/runtime/CPP/CPPFunctions.h
+++ b/arm_compute/runtime/CPP/CPPFunctions.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
diff --git a/arm_compute/runtime/CPP/CPPScheduler.h b/arm_compute/runtime/CPP/CPPScheduler.h
index c8de41bf20..7f70b5fa1f 100644
--- a/arm_compute/runtime/CPP/CPPScheduler.h
+++ b/arm_compute/runtime/CPP/CPPScheduler.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2016-2019 ARM Limited.
+ * Copyright (c) 2016-2021 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -24,13 +24,21 @@
#ifndef ARM_COMPUTE_CPPSCHEDULER_H
#define ARM_COMPUTE_CPPSCHEDULER_H
+#include "arm_compute/core/experimental/Types.h"
#include "arm_compute/runtime/IScheduler.h"
#include <memory>
namespace arm_compute
{
-/** C++11 implementation of a pool of threads to automatically split a kernel's execution among several threads. */
+/** C++11 implementation of a pool of threads to automatically split a kernel's execution among several threads.
+ *
+ * It has 2 scheduling modes: Linear or Fanout (please refer to the implementation for details).
+ * The mode is selected automatically based on the runtime environment. However, it can be forced via the
+ * environment variable ARM_COMPUTE_CPP_SCHEDULER_MODE, e.g.:
+ * ARM_COMPUTE_CPP_SCHEDULER_MODE=linear # Force the linear scheduling mode
+ * ARM_COMPUTE_CPP_SCHEDULER_MODE=fanout # Force the fanout scheduling mode
+ */
class CPPScheduler final : public IScheduler
{
public:
@@ -38,33 +46,20 @@ public:
CPPScheduler();
/** Default destructor */
~CPPScheduler();
- /** Sets the number of threads the scheduler will use to run the kernels.
- *
- * @param[in] num_threads If set to 0, then the maximum number of threads supported by C++11 will be used, otherwise the number of threads specified.
- */
- void set_num_threads(unsigned int num_threads) override;
- /** Returns the number of threads that the CPPScheduler has in his pool.
- *
- * @return Number of threads available in CPPScheduler.
- */
- unsigned int num_threads() const override;
/** Access the scheduler singleton
*
- * @note this method has been deprecated and will be remover in the upcoming releases
+ * @note this method has been deprecated and will be removed in future releases
* @return The scheduler
*/
static CPPScheduler &get();
- /** Multithread the execution of the passed kernel if possible.
- *
- * The kernel will run on a single thread if any of these conditions is true:
- * - ICPPKernel::is_parallelisable() returns false
- * - The scheduler has been initialized with only one thread.
- *
- * @param[in] kernel Kernel to execute.
- * @param[in] hints Hints for the scheduler.
- */
- void schedule(ICPPKernel *kernel, const Hints &hints) override;
+
+ // Inherited functions overridden
+ void set_num_threads(unsigned int num_threads) override;
+ void set_num_threads_with_affinity(unsigned int num_threads, BindFunc func) override;
+ unsigned int num_threads() const override;
+ void schedule(ICPPKernel *kernel, const Hints &hints) override;
+ void schedule_op(ICPPKernel *kernel, const Hints &hints, const Window &window, ITensorPack &tensors) override;
protected:
/** Will run the workloads in parallel using num_threads
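A minimal sketch of forcing the scheduling mode described above; it assumes a POSIX setenv and that the variable must be set before the scheduler's thread pool is first created:

#include "arm_compute/runtime/CPP/CPPScheduler.h"

#include <cstdlib>

int main()
{
    // Set the mode before any use of the scheduler (assumption: the variable
    // is read when the thread pool is created).
    setenv("ARM_COMPUTE_CPP_SCHEDULER_MODE", "fanout", 1);

    arm_compute::CPPScheduler &scheduler = arm_compute::CPPScheduler::get(); // deprecated accessor, see the note above
    scheduler.set_num_threads(4); // 0 requests the maximum supported by C++11
    return 0;
}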
diff --git a/arm_compute/runtime/CPP/ICPPSimpleFunction.h b/arm_compute/runtime/CPP/ICPPSimpleFunction.h
index 999e4363a9..49b5cd8d4a 100644
--- a/arm_compute/runtime/CPP/ICPPSimpleFunction.h
+++ b/arm_compute/runtime/CPP/ICPPSimpleFunction.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2019 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
diff --git a/arm_compute/runtime/CPP/functions/CPPBoxWithNonMaximaSuppressionLimit.h b/arm_compute/runtime/CPP/functions/CPPBoxWithNonMaximaSuppressionLimit.h
index b6f55b5bf8..9af4ed6208 100644
--- a/arm_compute/runtime/CPP/functions/CPPBoxWithNonMaximaSuppressionLimit.h
+++ b/arm_compute/runtime/CPP/functions/CPPBoxWithNonMaximaSuppressionLimit.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018-2020 ARM Limited.
+ * Copyright (c) 2018-2020 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -58,11 +58,19 @@ public:
* @param[out] classes The classes output tensor of size [N]. Data types supported: Same as @p scores_in
* @param[out] batch_splits_out (Optional) The batch splits output tensor. Data types supported: Same as @p scores_in
* @param[out] keeps (Optional) The keeps output tensor of size [N]. Data types supported: Same as @p scores_in
- * @param[in] keeps_size (Optional) Number of filtered indices per class tensor of size [num_classes]. Data types supported: Same as @p scores_in
+ * @param[in] keeps_size (Optional) Number of filtered indices per class tensor of size [num_classes]. Data types supported: U32.
* @param[in] info (Optional) BoxNMSLimitInfo information.
*/
- void configure(const ITensor *scores_in, const ITensor *boxes_in, const ITensor *batch_splits_in, ITensor *scores_out, ITensor *boxes_out, ITensor *classes,
- ITensor *batch_splits_out = nullptr, ITensor *keeps = nullptr, ITensor *keeps_size = nullptr, const BoxNMSLimitInfo info = BoxNMSLimitInfo());
+ void configure(const ITensor *scores_in,
+ const ITensor *boxes_in,
+ const ITensor *batch_splits_in,
+ ITensor *scores_out,
+ ITensor *boxes_out,
+ ITensor *classes,
+ ITensor *batch_splits_out = nullptr,
+ ITensor *keeps = nullptr,
+ ITensor *keeps_size = nullptr,
+ const BoxNMSLimitInfo info = BoxNMSLimitInfo());
/** Static function to check if given info will lead to a valid configuration of @ref CPPDetectionOutputLayer
*
* @param[in] scores_in The scores input tensor of size [count, num_classes]. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32
@@ -76,14 +84,21 @@ public:
* @param[in] classes The classes output tensor of size [N]. Data types supported: Same as @p scores_in
* @param[in] batch_splits_out (Optional) The batch splits output tensor. Data types supported: Same as @p scores_in
* @param[in] keeps (Optional) The keeps output tensor of size [N]. Data types supported: Same as @p scores_in
- * @param[in] keeps_size (Optional) Number of filtered indices per class tensor of size [num_classes]. Data types supported: Same as @p scores_in
+ * @param[in] keeps_size (Optional) Number of filtered indices per class tensor of size [num_classes]. Data types supported: U32.
* @param[in] info (Optional) BoxNMSLimitInfo information.
*
* @return a status
*/
- static Status validate(const ITensorInfo *scores_in, const ITensorInfo *boxes_in, const ITensorInfo *batch_splits_in, const ITensorInfo *scores_out, const ITensorInfo *boxes_out,
- const ITensorInfo *classes,
- const ITensorInfo *batch_splits_out = nullptr, const ITensorInfo *keeps = nullptr, const ITensorInfo *keeps_size = nullptr, const BoxNMSLimitInfo info = BoxNMSLimitInfo());
+ static Status validate(const ITensorInfo *scores_in,
+ const ITensorInfo *boxes_in,
+ const ITensorInfo *batch_splits_in,
+ const ITensorInfo *scores_out,
+ const ITensorInfo *boxes_out,
+ const ITensorInfo *classes,
+ const ITensorInfo *batch_splits_out = nullptr,
+ const ITensorInfo *keeps = nullptr,
+ const ITensorInfo *keeps_size = nullptr,
+ const BoxNMSLimitInfo info = BoxNMSLimitInfo());
// Inherited methods overridden:
void run() override;
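A minimal sketch of the reflowed configure() overload above, passing only the required tensors so the optional outputs keep their nullptr defaults (shapes and contents are assumptions):

#include "arm_compute/runtime/CPP/functions/CPPBoxWithNonMaximaSuppressionLimit.h"
#include "arm_compute/runtime/Tensor.h"

using namespace arm_compute;

void limit_boxes(Tensor &scores_in, Tensor &boxes_in, Tensor &batch_splits_in)
{
    Tensor scores_out, boxes_out, classes;
    CPPBoxWithNonMaximaSuppressionLimit box_nms;
    box_nms.configure(&scores_in, &boxes_in, &batch_splits_in,
                      &scores_out, &boxes_out, &classes); // batch_splits_out/keeps/keeps_size stay nullptr
    box_nms.run(); // all tensors must be allocated and the inputs filled first
}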
diff --git a/arm_compute/runtime/CPP/functions/CPPDetectionOutputLayer.h b/arm_compute/runtime/CPP/functions/CPPDetectionOutputLayer.h
index 5717201ec7..dc8c8e76ba 100644
--- a/arm_compute/runtime/CPP/functions/CPPDetectionOutputLayer.h
+++ b/arm_compute/runtime/CPP/functions/CPPDetectionOutputLayer.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018-2019 ARM Limited.
+ * Copyright (c) 2018-2019 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -24,9 +24,8 @@
#ifndef ARM_COMPUTE_CPP_DETECTION_OUTPUT_LAYER_H
#define ARM_COMPUTE_CPP_DETECTION_OUTPUT_LAYER_H
-#include "arm_compute/runtime/CPP/ICPPSimpleFunction.h"
-
#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/CPP/ICPPSimpleFunction.h"
namespace arm_compute
{
@@ -52,7 +51,11 @@ public:
*
* @note Output contains all the detections. Of those, only the ones selected by the valid region are valid.
*/
- void configure(const ITensor *input_loc, const ITensor *input_conf, const ITensor *input_priorbox, ITensor *output, DetectionOutputLayerInfo info = DetectionOutputLayerInfo());
+ void configure(const ITensor *input_loc,
+ const ITensor *input_conf,
+ const ITensor *input_priorbox,
+ ITensor *output,
+ DetectionOutputLayerInfo info = DetectionOutputLayerInfo());
/** Static function to check if given info will lead to a valid configuration of @ref CPPDetectionOutputLayer
*
* @param[in] input_loc The mbox location input tensor info. Data types supported: F32.
@@ -63,7 +66,10 @@ public:
*
* @return a status
*/
- static Status validate(const ITensorInfo *input_loc, const ITensorInfo *input_conf, const ITensorInfo *input_priorbox, const ITensorInfo *output,
+ static Status validate(const ITensorInfo *input_loc,
+ const ITensorInfo *input_conf,
+ const ITensorInfo *input_priorbox,
+ const ITensorInfo *output,
DetectionOutputLayerInfo info = DetectionOutputLayerInfo());
// Inherited methods overridden:
void run() override;
@@ -82,12 +88,12 @@ private:
int _num_priors;
int _num;
- std::vector<LabelBBox> _all_location_predictions;
+ std::vector<LabelBBox> _all_location_predictions;
std::vector<std::map<int, std::vector<float>>> _all_confidence_scores;
- std::vector<BBox> _all_prior_bboxes;
- std::vector<std::array<float, 4>> _all_prior_variances;
- std::vector<LabelBBox> _all_decode_bboxes;
- std::vector<std::map<int, std::vector<int>>> _all_indices;
+ std::vector<BBox> _all_prior_bboxes;
+ std::vector<std::array<float, 4>> _all_prior_variances;
+ std::vector<LabelBBox> _all_decode_bboxes;
+ std::vector<std::map<int, std::vector<int>>> _all_indices;
};
} // namespace arm_compute
#endif /* ARM_COMPUTE_CPP_DETECTION_OUTPUT_LAYER_H */
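Similarly, a minimal sketch of the reflowed CPPDetectionOutputLayer::configure() above (tensor shapes and allocation are assumptions):

#include "arm_compute/runtime/CPP/functions/CPPDetectionOutputLayer.h"
#include "arm_compute/runtime/Tensor.h"

using namespace arm_compute;

void detect(Tensor &input_loc, Tensor &input_conf, Tensor &input_priorbox)
{
    Tensor output;
    CPPDetectionOutputLayer detection;
    detection.configure(&input_loc, &input_conf, &input_priorbox, &output); // default DetectionOutputLayerInfo
    detection.run(); // allocate and fill the tensors before running
}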
diff --git a/arm_compute/runtime/CPP/functions/CPPDetectionPostProcessLayer.h b/arm_compute/runtime/CPP/functions/CPPDetectionPostProcessLayer.h
index cb74ca9077..a40e4f9ecb 100644
--- a/arm_compute/runtime/CPP/functions/CPPDetectionPostProcessLayer.h
+++ b/arm_compute/runtime/CPP/functions/CPPDetectionPostProcessLayer.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019-2020 ARM Limited.
+ * Copyright (c) 2019-2020 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -24,10 +24,9 @@
#ifndef ARM_COMPUTE_CPP_DETECTION_POSTPROCESS_H
#define ARM_COMPUTE_CPP_DETECTION_POSTPROCESS_H
-#include "arm_compute/runtime/CPP/ICPPSimpleFunction.h"
-
#include "arm_compute/core/Types.h"
#include "arm_compute/runtime/CPP/functions/CPPNonMaximumSuppression.h"
+#include "arm_compute/runtime/CPP/ICPPSimpleFunction.h"
#include "arm_compute/runtime/IMemoryManager.h"
#include "arm_compute/runtime/MemoryGroup.h"
#include "arm_compute/runtime/Tensor.h"
@@ -65,8 +64,14 @@ public:
*
* @note Output contains all the detections. Of those, only the ones selected by the valid region are valid.
*/
- void configure(const ITensor *input_box_encoding, const ITensor *input_score, const ITensor *input_anchors,
- ITensor *output_boxes, ITensor *output_classes, ITensor *output_scores, ITensor *num_detection, DetectionPostProcessLayerInfo info = DetectionPostProcessLayerInfo());
+ void configure(const ITensor *input_box_encoding,
+ const ITensor *input_score,
+ const ITensor *input_anchors,
+ ITensor *output_boxes,
+ ITensor *output_classes,
+ ITensor *output_scores,
+ ITensor *num_detection,
+ DetectionPostProcessLayerInfo info = DetectionPostProcessLayerInfo());
/** Static function to check if given info will lead to a valid configuration of @ref CPPDetectionPostProcessLayer
*
* @param[in] input_box_encoding The bounding box input tensor info. Data types supported: F32/QASYMM8/QASYMM8_SIGNED.
@@ -80,8 +85,13 @@ public:
*
* @return a status
*/
- static Status validate(const ITensorInfo *input_box_encoding, const ITensorInfo *input_class_score, const ITensorInfo *input_anchors,
- ITensorInfo *output_boxes, ITensorInfo *output_classes, ITensorInfo *output_scores, ITensorInfo *num_detection,
+ static Status validate(const ITensorInfo *input_box_encoding,
+ const ITensorInfo *input_class_score,
+ const ITensorInfo *input_anchors,
+ ITensorInfo *output_boxes,
+ ITensorInfo *output_classes,
+ ITensorInfo *output_scores,
+ ITensorInfo *num_detection,
DetectionPostProcessLayerInfo info = DetectionPostProcessLayerInfo());
// Inherited methods overridden:
void run() override;
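And a minimal sketch of the CPPDetectionPostProcessLayer::configure() overload above (shapes, data types and allocation are assumptions; the default DetectionPostProcessLayerInfo is used):

#include "arm_compute/runtime/CPP/functions/CPPDetectionPostProcessLayer.h"
#include "arm_compute/runtime/Tensor.h"

using namespace arm_compute;

void postprocess_detections(Tensor &box_encodings, Tensor &scores, Tensor &anchors)
{
    Tensor out_boxes, out_classes, out_scores, num_detection;
    CPPDetectionPostProcessLayer postprocess;
    postprocess.configure(&box_encodings, &scores, &anchors,
                          &out_boxes, &out_classes, &out_scores, &num_detection);
    postprocess.run(); // allocate all tensors and fill the inputs before running
}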
diff --git a/arm_compute/runtime/CPP/functions/CPPNonMaximumSuppression.h b/arm_compute/runtime/CPP/functions/CPPNonMaximumSuppression.h
index 4ad2fd0001..af6afc6029 100644
--- a/arm_compute/runtime/CPP/functions/CPPNonMaximumSuppression.h
+++ b/arm_compute/runtime/CPP/functions/CPPNonMaximumSuppression.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019 ARM Limited.
+ * Copyright (c) 2019 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -24,9 +24,8 @@
#ifndef ARM_COMPUTE_CPP_NONMAXIMUMSUPPRESSION_LAYER_H
#define ARM_COMPUTE_CPP_NONMAXIMUMSUPPRESSION_LAYER_H
-#include "arm_compute/runtime/CPP/ICPPSimpleFunction.h"
-
#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/CPP/ICPPSimpleFunction.h"
namespace arm_compute
{
@@ -48,7 +47,12 @@ public:
* @param[in] nms_threshold The threshold used in non maximum suppression.
*
*/
- void configure(const ITensor *bboxes, const ITensor *scores, ITensor *indices, unsigned int max_output_size, const float score_threshold, const float nms_threshold);
+ void configure(const ITensor *bboxes,
+ const ITensor *scores,
+ ITensor *indices,
+ unsigned int max_output_size,
+ const float score_threshold,
+ const float nms_threshold);
/** Static function to check if given arguments will lead to a valid configuration of @ref CPPNonMaximumSuppression
*
@@ -60,8 +64,12 @@ public:
* @param[in] nms_threshold The threshold used in non maximum suppression.
*
*/
- static Status validate(const ITensorInfo *bboxes, const ITensorInfo *scores, const ITensorInfo *indices, unsigned int max_output_size,
- const float score_threshold, const float nms_threshold);
+ static Status validate(const ITensorInfo *bboxes,
+ const ITensorInfo *scores,
+ const ITensorInfo *indices,
+ unsigned int max_output_size,
+ const float score_threshold,
+ const float nms_threshold);
};
} // namespace arm_compute
#endif /* ARM_COMPUTE_CPP_NONMAXIMUMSUPPRESSION_LAYER_H */
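A minimal usage sketch of the configure() signature above (box/score shapes and the threshold values are illustrative assumptions):

#include "arm_compute/runtime/CPP/functions/CPPNonMaximumSuppression.h"
#include "arm_compute/runtime/Tensor.h"

using namespace arm_compute;

void suppress(Tensor &bboxes, Tensor &scores, Tensor &indices)
{
    CPPNonMaximumSuppression nms;
    nms.configure(&bboxes, &scores, &indices,
                  /*max_output_size=*/100,
                  /*score_threshold=*/0.05f,
                  /*nms_threshold=*/0.5f);
    nms.run(); // tensors must be allocated and the inputs filled beforehand
}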
diff --git a/arm_compute/runtime/CPP/functions/CPPPermute.h b/arm_compute/runtime/CPP/functions/CPPPermute.h
index 5a6d8ea106..232da41b8e 100644
--- a/arm_compute/runtime/CPP/functions/CPPPermute.h
+++ b/arm_compute/runtime/CPP/functions/CPPPermute.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -24,9 +24,8 @@
#ifndef ARM_COMPUTE_CPPPERMUTE_H
#define ARM_COMPUTE_CPPPERMUTE_H
-#include "arm_compute/runtime/CPP/ICPPSimpleFunction.h"
-
#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/CPP/ICPPSimpleFunction.h"
namespace arm_compute
{
@@ -53,5 +52,5 @@ public:
*/
static Status validate(const ITensorInfo *input, const ITensorInfo *output, const PermutationVector &perm);
};
-}
+} // namespace arm_compute
#endif /* ARM_COMPUTE_CPPPERMUTE_H */
diff --git a/arm_compute/runtime/CPP/functions/CPPSplit.h b/arm_compute/runtime/CPP/functions/CPPSplit.h
index 6adcbc3323..9be081f5bb 100644
--- a/arm_compute/runtime/CPP/functions/CPPSplit.h
+++ b/arm_compute/runtime/CPP/functions/CPPSplit.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2020 ARM Limited.
+ * Copyright (c) 2020-2021,2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -29,9 +29,6 @@
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/core/utils/misc/ShapeCalculator.h"
-
-#include "support/ToolchainSupport.h"
-
#include "arm_compute/runtime/IFunction.h"
namespace arm_compute
@@ -41,14 +38,13 @@ template <typename SliceType, typename TensorInterfaceType = ITensor>
class CPPSplit : public IFunction
{
public:
- CPPSplit()
- : _outputs_vector(), _slice_functions(), _num_outputs(0)
+ CPPSplit() : _outputs_vector(), _slice_functions(), _num_outputs(0)
{
}
/** Static function to check if given info will lead to a valid configuration of @ref CPPSplit
*
- * @param[in] input The input tensor info. Data types supported: U8/S8/QASYMM8/U16/S16/U32/S32/F16/F32.
- * @param[in] outputs A vector containing the output tensors' info. Data types supported: Same as @p input.
+ * @param[in] input The input tensor info. Data types supported: All.
+ * @param[in] outputs A vector containing the output tensors' info. Data types supported: same as @p input.
* The output tensors should match the input tensor dimensions for all shape dimensions apart
* from the split dimension
* @param[in] axis Axis on which to split the input.
@@ -66,14 +62,16 @@ public:
unsigned int total_output_shape_size = 0;
// Sum the output sizes and fall back to evenly-sized splits if any are zero
- const bool using_split_shapes = std::none_of(outputs.begin(), outputs.end(), [&total_output_shape_size](ITensorInfo * info)
- {
- unsigned int output_shape_size = info->tensor_shape().total_size();
- total_output_shape_size += output_shape_size;
- return output_shape_size == 0;
- });
-
- if(using_split_shapes)
+ const bool using_split_shapes = std::none_of(outputs.begin(), outputs.end(),
+ [&total_output_shape_size](ITensorInfo *info)
+ {
+ unsigned int output_shape_size =
+ info->tensor_shape().total_size();
+ total_output_shape_size += output_shape_size;
+ return output_shape_size == 0;
+ });
+
+ if (using_split_shapes)
{
ARM_COMPUTE_RETURN_ERROR_ON(input->tensor_shape().total_size() != total_output_shape_size);
}
@@ -85,10 +83,10 @@ public:
// Validate output tensors
unsigned int axis_offset = 0;
- for(const auto &output : outputs)
+ for (const auto &output : outputs)
{
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(output);
- if(using_split_shapes)
+ if (using_split_shapes)
{
output_shape = output->tensor_shape();
ARM_COMPUTE_RETURN_ERROR_ON(output_shape.total_size() == 0);
@@ -99,14 +97,17 @@ public:
// Start/End coordinates
Coordinates start_coords;
Coordinates end_coords;
- for(unsigned int d = 0; d < output_shape.num_dimensions(); ++d)
+ for (unsigned int d = 0; d < output_shape.num_dimensions(); ++d)
{
end_coords.set(d, -1);
}
// Output auto initialization if not yet initialized
TensorInfo tmp_output_info = *output->clone();
- auto_init_if_empty(tmp_output_info, input->clone()->set_is_resizable(true).set_tensor_shape(output_shape));
+ if (tmp_output_info.tensor_shape().total_size() == 0)
+ {
+ tmp_output_info = input->clone()->set_is_resizable(true).set_tensor_shape(output_shape);
+ }
// Update coordinate on axis
start_coords.set(axis, axis_offset);
@@ -127,7 +128,8 @@ public:
* from the split dimension.
* @param[in] axis Axis on which to split the input.
*/
- void configure(const TensorInterfaceType *input, const std::vector<TensorInterfaceType *> &outputs, unsigned int axis)
+ void
+ configure(const TensorInterfaceType *input, const std::vector<TensorInterfaceType *> &outputs, unsigned int axis)
{
// Create Slice functions
_num_outputs = outputs.size();
@@ -135,17 +137,16 @@ public:
// Extract output tensor info
std::vector<ITensorInfo *> outputs_info;
- for(auto &output : outputs)
+ for (auto &output : outputs)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(output);
outputs_info.emplace_back(output->info());
}
// If any of the outputs have a zero size, fall-back to using evenly-sized output splits
- const bool outputs_have_sizes = std::none_of(outputs_info.begin(), outputs_info.end(), [](ITensorInfo * info)
- {
- return info->tensor_shape().total_size() == 0;
- });
+ const bool outputs_have_sizes =
+ std::none_of(outputs_info.begin(), outputs_info.end(),
+ [](ITensorInfo *info) { return info->tensor_shape().total_size() == 0; });
// Validate
ARM_COMPUTE_ERROR_THROW_ON(CPPSplit::validate(input->info(), outputs_info, axis));
@@ -153,12 +154,13 @@ public:
unsigned int axis_offset = 0;
unsigned int i = 0;
- for(const auto &output_info : outputs_info)
+ for (const auto &output_info : outputs_info)
{
// Get output shape
- TensorShape output_shape = (outputs_have_sizes ?
- output_info->tensor_shape() :
- arm_compute::misc::shape_calculator::compute_split_shape(input->info(), axis, _num_outputs));
+ TensorShape output_shape =
+ (outputs_have_sizes
+ ? output_info->tensor_shape()
+ : arm_compute::misc::shape_calculator::compute_split_shape(input->info(), axis, _num_outputs));
const size_t axis_split_step = output_shape[axis];
@@ -166,7 +168,7 @@ public:
Coordinates start_coords;
Coordinates end_coords;
- for(unsigned int d = 0; d < output_shape.num_dimensions(); ++d)
+ for (unsigned int d = 0; d < output_shape.num_dimensions(); ++d)
{
end_coords.set(d, -1);
}
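The evenly-sized fallback above can be exercised through a derived function; a minimal sketch via the NEON specialization (that NESplit derives from CPPSplit is an assumption about the rest of this tree; the empty output infos select the even-split path):

#include "arm_compute/runtime/NEON/functions/NESplit.h"
#include "arm_compute/runtime/Tensor.h"

#include <vector>

using namespace arm_compute;

void split_in_two(Tensor &input) // e.g. shape [8, 4]
{
    Tensor out0, out1; // empty infos -> two evenly-sized splits of shape [4, 4]
    std::vector<ITensor *> outputs{&out0, &out1};
    NESplit split;
    split.configure(&input, outputs, /*axis=*/0);
    split.run(); // allocate input and outputs before running
}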
diff --git a/arm_compute/runtime/CPP/functions/CPPTopKV.h b/arm_compute/runtime/CPP/functions/CPPTopKV.h
index c94e277312..232cbb3067 100644
--- a/arm_compute/runtime/CPP/functions/CPPTopKV.h
+++ b/arm_compute/runtime/CPP/functions/CPPTopKV.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019-2020 ARM Limited.
+ * Copyright (c) 2019-2020 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -24,9 +24,8 @@
#ifndef ARM_COMPUTE_CPPTOPKV_H
#define ARM_COMPUTE_CPPTOPKV_H
-#include "arm_compute/runtime/CPP/ICPPSimpleFunction.h"
-
#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/CPP/ICPPSimpleFunction.h"
namespace arm_compute
{
@@ -39,7 +38,7 @@ public:
/** Set the input and output of the kernel.
*
* @param[in] predictions A batch_size x classes tensor. Data types supported: F16/S32/F32/QASYMM8/QASYMM8_SIGNED
- * @param[in] targets A batch_size 1D tensor of class ids. Data types supported: S32
+ * @param[in] targets A batch_size 1D tensor of class ids. Data types supported: U32
* @param[out] output Computed precision at @p k as a bool 1D tensor. Data types supported: U8
* @param[in] k Number of top elements to look at for computing precision.
*/
@@ -48,13 +47,14 @@ public:
/** Static function to check if given info will lead to a valid configuration of @ref CPPTopKVKernel
*
* @param[in] predictions A batch_size x classes tensor info. Data types supported: F16/S32/F32/QASYMM8/QASYMM8_SIGNED
- * @param[in] targets A batch_size 1D tensor info of class ids. Data types supported: S32
+ * @param[in] targets A batch_size 1D tensor info of class ids. Data types supported: U32
* @param[in] output Computed precision at @p k as a bool 1D tensor info. Data types supported: U8
* @param[in] k Number of top elements to look at for computing precision.
*
* @return a status
*/
- static Status validate(const ITensorInfo *predictions, const ITensorInfo *targets, ITensorInfo *output, const unsigned int k);
+ static Status
+ validate(const ITensorInfo *predictions, const ITensorInfo *targets, ITensorInfo *output, const unsigned int k);
};
} // namespace arm_compute
#endif /* ARM_COMPUTE_CPPTOPKV_H */
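A minimal sketch of the top-k precision check documented above (the configure() parameter order mirroring validate() is an assumption, as are the shapes):

#include "arm_compute/runtime/CPP/functions/CPPTopKV.h"
#include "arm_compute/runtime/Tensor.h"

using namespace arm_compute;

void top5_hits(Tensor &predictions, Tensor &targets)
{
    Tensor output; // U8, one hit/miss flag per batch entry
    CPPTopKV topkv;
    topkv.configure(&predictions, &targets, &output, /*k=*/5); // targets hold U32 class ids per the docs above
    topkv.run();
}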
diff --git a/arm_compute/runtime/CPP/functions/CPPUpsample.h b/arm_compute/runtime/CPP/functions/CPPUpsample.h
index 3e1852bc30..3b0f997b17 100644
--- a/arm_compute/runtime/CPP/functions/CPPUpsample.h
+++ b/arm_compute/runtime/CPP/functions/CPPUpsample.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -24,9 +24,8 @@
#ifndef ARM_COMPUTE_CPPUPSAMPLE_H
#define ARM_COMPUTE_CPPUPSAMPLE_H
-#include "arm_compute/runtime/CPP/ICPPSimpleFunction.h"
-
#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/CPP/ICPPSimpleFunction.h"
namespace arm_compute
{
@@ -38,11 +37,11 @@ class CPPUpsample : public ICPPSimpleFunction
public:
/** Configure the upsample CPP kernel
*
- * @param[in] input The input tensor to upsample. Data types supported: F32/F16/QASYMM8/QASYMM8_SIGNED
- * @param[out] output The output tensor. Data types supported: Same as @p input
+ * @param[in] input The input tensor to upsample. Data types supported: All.
+ * @param[out] output The output tensor. Data types supported: same as @p input
* @param[in] info Padding information
*/
void configure(const ITensor *input, ITensor *output, const PadStrideInfo &info);
};
-}
+} // namespace arm_compute
#endif /* ARM_COMPUTE_CPPUPSAMPLE_H */
diff --git a/arm_compute/runtime/CPUUtils.h b/arm_compute/runtime/CPUUtils.h
deleted file mode 100644
index 76e48f8985..0000000000
--- a/arm_compute/runtime/CPUUtils.h
+++ /dev/null
@@ -1,44 +0,0 @@
-/*
- * Copyright (c) 2018-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_RUNTIME_CPU_UTILS_H
-#define ARM_COMPUTE_RUNTIME_CPU_UTILS_H
-
-namespace arm_compute
-{
-class CPUInfo;
-/** This function will try to detect the CPU configuration on the system and will fill
- * the cpuinfo object accordingly to reflect this.
- *
- * @param[out] cpuinfo @ref CPUInfo to be used to hold the system's cpu configuration.
- */
-void get_cpu_configuration(CPUInfo &cpuinfo);
-/** Some systems have both big and small cores, this fuction computes the minimum number of cores
- * that are exactly the same on the system. To maximize performance the library attempts to process
- * workloads concurrently using as many threads as big cores are available on the system.
- *
- * @return The minumum number of common cores.
- */
-unsigned int get_threads_hint();
-}
-#endif /* ARM_COMPUTE_RUNTIME_CPU_UTILS_H */
diff --git a/arm_compute/runtime/DeviceProperties.h b/arm_compute/runtime/DeviceProperties.h
deleted file mode 100644
index 03a5fa0aa8..0000000000
--- a/arm_compute/runtime/DeviceProperties.h
+++ /dev/null
@@ -1,41 +0,0 @@
-/*
- * Copyright (c) 2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_DEVICE_PROPERTIES_H
-#define ARM_COMPUTE_DEVICE_PROPERTIES_H
-
-#include "arm_compute/core/CPP/CPPTypes.h"
-
-namespace arm_compute
-{
-/** Device properties */
-struct DeviceProperties
-{
- std::string name{ "unknown" };
- CPUInfo cpu_info{}; // initialised upon creating in the constructor
-
- DeviceProperties();
-};
-
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_DEVICE_PROPERTIES_H */
diff --git a/arm_compute/runtime/Distribution1D.h b/arm_compute/runtime/Distribution1D.h
deleted file mode 100644
index a77e33371e..0000000000
--- a/arm_compute/runtime/Distribution1D.h
+++ /dev/null
@@ -1,56 +0,0 @@
-/*
- * Copyright (c) 2016-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_DISTRIBUTION1D_H
-#define ARM_COMPUTE_DISTRIBUTION1D_H
-
-#include "arm_compute/core/IDistribution1D.h"
-
-#include <cstddef>
-#include <cstdint>
-#include <memory>
-#include <vector>
-
-namespace arm_compute
-{
-/** Basic implementation of the 1D distribution interface */
-class Distribution1D : public IDistribution1D
-{
-public:
- /** Constructor: Creates a 1D Distribution of a consecutive interval [offset, offset + range - 1]
- * defined by a start offset and valid range, divided equally into num_bins parts.
- *
- * @param[in] num_bins The number of bins the distribution is divided in.
- * @param[in] offset The start of the values to use.
- * @param[in] range The total number of the consecutive values of the distribution interval.
- */
- Distribution1D(size_t num_bins, int32_t offset, uint32_t range);
-
- // Inherited methods overridden:
- uint32_t *buffer() const override;
-
-private:
- mutable std::vector<uint32_t> _data; /**< The distribution data. */
-};
-}
-#endif /* ARM_COMPUTE_DISTRIBUTION1D_H */
diff --git a/arm_compute/runtime/FunctionDescriptors.h b/arm_compute/runtime/FunctionDescriptors.h
index 3706d0859f..c3af17d6f2 100644
--- a/arm_compute/runtime/FunctionDescriptors.h
+++ b/arm_compute/runtime/FunctionDescriptors.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019-2020 ARM Limited.
+ * Copyright (c) 2019-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -21,8 +21,12 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
-#ifndef ARM_COMPUTE_RUNTIME_FUNCTION_DESCRIPTORS_H
-#define ARM_COMPUTE_RUNTIME_FUNCTION_DESCRIPTORS_H
+#ifndef ACL_ARM_COMPUTE_RUNTIME_FUNCTIONDESCRIPTORS_H
+#define ACL_ARM_COMPUTE_RUNTIME_FUNCTIONDESCRIPTORS_H
+
+#include "arm_compute/core/Types.h"
+#include "arm_compute/function_info/ActivationLayerInfo.h"
+
#include <utility>
namespace arm_compute
@@ -37,16 +41,73 @@ enum class FFTDirection
/** Descriptor used by the FFT1D function */
struct FFT1DInfo
{
- unsigned int axis{ 0 }; /**< Axis to run the FFT on. */
- FFTDirection direction{ FFTDirection::Forward }; /**< Direction of the FFT. */
+ unsigned int axis{0}; /**< Axis to run the FFT on. */
+ FFTDirection direction{FFTDirection::Forward}; /**< Direction of the FFT. */
};
/** Descriptor used by the FFT2D function */
struct FFT2DInfo
{
- unsigned int axis0{ 0 }; /**< Axis to run first pass on. If same, multiple transforms are performed on single axis*/
- unsigned int axis1{ 1 }; /**< Axis to run second pass on. If same, multiple transforms are performed on single axis*/
- FFTDirection direction{ FFTDirection::Forward }; /**< Direction of the FFT. */
+    unsigned int axis0{0}; /**< Axis to run the first pass on. If the same as axis1, multiple transforms are performed on a single axis */
+    unsigned int axis1{1}; /**< Axis to run the second pass on. If the same as axis0, multiple transforms are performed on a single axis */
+ FFTDirection direction{FFTDirection::Forward}; /**< Direction of the FFT. */
+};
+
+/** Descriptor used by the 2d Convolution function */
+struct Conv2dInfo
+{
+ Conv2dInfo() = default;
+
+ Conv2dInfo(const PadStrideInfo &conv_info,
+ const Size2D &dilation,
+ const ActivationLayerInfo &act_info,
+ bool enable_fast_math,
+ unsigned int num_groups,
+ const WeightsInfo &weights_info = WeightsInfo())
+ : conv_info(conv_info),
+ dilation(dilation),
+ act_info(act_info),
+ enable_fast_math(enable_fast_math),
+ num_groups(num_groups),
+ weights_info(weights_info)
+ {
+ }
+
+ PadStrideInfo conv_info{};
+ Size2D dilation{1U, 1U};
+ ActivationLayerInfo act_info{};
+ bool enable_fast_math{false};
+ unsigned int num_groups{1};
+ WeightsInfo weights_info{};
};
+
+/** Descriptor used by the 3d Convolution function */
+struct Conv3dInfo
+{
+ Conv3dInfo() = default;
+
+ Conv3dInfo(const Size3D &stride,
+ const Padding3D &padding,
+ const ActivationLayerInfo &act_info,
+ const Size3D &dilation,
+ const DimensionRoundingType &round_type,
+ bool enable_fast_math)
+ : stride(stride),
+ padding(padding),
+ act_info(act_info),
+ dilation(dilation),
+ round_type(round_type),
+ enable_fast_math(enable_fast_math)
+ {
+ }
+
+ Size3D stride{1U, 1U, 1U};
+ Padding3D padding{};
+ ActivationLayerInfo act_info{};
+ Size3D dilation{1U, 1U, 1U};
+ DimensionRoundingType round_type{};
+ bool enable_fast_math{false};
+};
+
} // namespace arm_compute
-#endif /* ARM_COMPUTE_RUNTIME_FUNCTION_DESCRIPTORS_H */
+#endif // ACL_ARM_COMPUTE_RUNTIME_FUNCTIONDESCRIPTORS_H
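A minimal sketch constructing the new Conv2dInfo descriptor above (the concrete stride, padding and activation choices are illustrative only):

#include "arm_compute/runtime/FunctionDescriptors.h"

using namespace arm_compute;

Conv2dInfo make_conv2d_info()
{
    return Conv2dInfo(PadStrideInfo(/*stride_x=*/1, /*stride_y=*/1, /*pad_x=*/1, /*pad_y=*/1),
                      Size2D(1U, 1U), // no dilation
                      ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU),
                      /*enable_fast_math=*/false,
                      /*num_groups=*/1); // weights_info defaults to WeightsInfo()
}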
diff --git a/arm_compute/runtime/GLES_COMPUTE/GCBufferAllocator.h b/arm_compute/runtime/GLES_COMPUTE/GCBufferAllocator.h
deleted file mode 100644
index 248c8716f0..0000000000
--- a/arm_compute/runtime/GLES_COMPUTE/GCBufferAllocator.h
+++ /dev/null
@@ -1,49 +0,0 @@
-/*
- * Copyright (c) 2018-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_GCBUFFERALLOCATOR_H
-#define ARM_COMPUTE_GCBUFFERALLOCATOR_H
-
-#include "arm_compute/runtime/IAllocator.h"
-
-#include "arm_compute/core/GLES_COMPUTE/OpenGLES.h"
-#include "arm_compute/runtime/GLES_COMPUTE/GCScheduler.h"
-
-#include <cstddef>
-
-namespace arm_compute
-{
-/** Default GLES buffer allocator implementation */
-class GCBufferAllocator final : public IAllocator
-{
-public:
- /** Default constructor */
- GCBufferAllocator() = default;
-
- // Inherited methods overridden:
- void *allocate(size_t size, size_t alignment) override;
- void free(void *ptr) override;
- std::unique_ptr<IMemoryRegion> make_region(size_t size, size_t alignment) override;
-};
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_GCBUFFERALLOCATOR_H */
diff --git a/arm_compute/runtime/GLES_COMPUTE/GCFunctions.h b/arm_compute/runtime/GLES_COMPUTE/GCFunctions.h
deleted file mode 100644
index 4c0d91c2c9..0000000000
--- a/arm_compute/runtime/GLES_COMPUTE/GCFunctions.h
+++ /dev/null
@@ -1,51 +0,0 @@
-/*
- * Copyright (c) 2017-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_GCFUNCTIONS_H
-#define ARM_COMPUTE_GCFUNCTIONS_H
-
-/* Header regrouping all the GLES compute functions */
-#include "arm_compute/runtime/GLES_COMPUTE/functions/GCAbsoluteDifference.h"
-#include "arm_compute/runtime/GLES_COMPUTE/functions/GCActivationLayer.h"
-#include "arm_compute/runtime/GLES_COMPUTE/functions/GCArithmeticAddition.h"
-#include "arm_compute/runtime/GLES_COMPUTE/functions/GCBatchNormalizationLayer.h"
-#include "arm_compute/runtime/GLES_COMPUTE/functions/GCConcatenateLayer.h"
-#include "arm_compute/runtime/GLES_COMPUTE/functions/GCConvolutionLayer.h"
-#include "arm_compute/runtime/GLES_COMPUTE/functions/GCDepthwiseConvolutionLayer.h"
-#include "arm_compute/runtime/GLES_COMPUTE/functions/GCDirectConvolutionLayer.h"
-#include "arm_compute/runtime/GLES_COMPUTE/functions/GCDropoutLayer.h"
-#include "arm_compute/runtime/GLES_COMPUTE/functions/GCFillBorder.h"
-#include "arm_compute/runtime/GLES_COMPUTE/functions/GCFullyConnectedLayer.h"
-#include "arm_compute/runtime/GLES_COMPUTE/functions/GCGEMM.h"
-#include "arm_compute/runtime/GLES_COMPUTE/functions/GCGEMMInterleave4x4.h"
-#include "arm_compute/runtime/GLES_COMPUTE/functions/GCGEMMTranspose1xW.h"
-#include "arm_compute/runtime/GLES_COMPUTE/functions/GCNormalizationLayer.h"
-#include "arm_compute/runtime/GLES_COMPUTE/functions/GCNormalizePlanarYUVLayer.h"
-#include "arm_compute/runtime/GLES_COMPUTE/functions/GCPixelWiseMultiplication.h"
-#include "arm_compute/runtime/GLES_COMPUTE/functions/GCPoolingLayer.h"
-#include "arm_compute/runtime/GLES_COMPUTE/functions/GCScale.h"
-#include "arm_compute/runtime/GLES_COMPUTE/functions/GCSoftmaxLayer.h"
-#include "arm_compute/runtime/GLES_COMPUTE/functions/GCTensorShift.h"
-#include "arm_compute/runtime/GLES_COMPUTE/functions/GCTranspose.h"
-
-#endif /* ARM_COMPUTE_GCFUNCTIONS_H */
diff --git a/arm_compute/runtime/GLES_COMPUTE/GCHelpers.h b/arm_compute/runtime/GLES_COMPUTE/GCHelpers.h
deleted file mode 100644
index 60c2153902..0000000000
--- a/arm_compute/runtime/GLES_COMPUTE/GCHelpers.h
+++ /dev/null
@@ -1,39 +0,0 @@
-/*
- * Copyright (c) 2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_GC_HELPERS_H
-#define ARM_COMPUTE_GC_HELPERS_H
-
-#include "arm_compute/core/GLES_COMPUTE/OpenGLES.h"
-
-namespace arm_compute
-{
-/** This function creates an OpenGL-ES context and a display.
- *
- * @return A std::tuple where the first element is the opengl display, the second element is the opengl context
- * and the third one an error code. The error code will be EGL_TRUE upon successful creation, otherwise
- * a value telling why the function failed.
- */
-std::tuple<EGLDisplay, EGLContext, EGLBoolean> create_opengl_display_and_context();
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_GC_HELPERS_H */
diff --git a/arm_compute/runtime/GLES_COMPUTE/GCMemory.h b/arm_compute/runtime/GLES_COMPUTE/GCMemory.h
deleted file mode 100644
index 0c145e167d..0000000000
--- a/arm_compute/runtime/GLES_COMPUTE/GCMemory.h
+++ /dev/null
@@ -1,85 +0,0 @@
-/*
- * Copyright (c) 2018-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_RUNTIME_GLES_COMPUTE_GCMEMORY_H
-#define ARM_COMPUTE_RUNTIME_GLES_COMPUTE_GCMEMORY_H
-
-#include "arm_compute/runtime/IMemory.h"
-
-#include "arm_compute/runtime/GLES_COMPUTE/GCMemoryRegion.h"
-
-#include <cstddef>
-#include <memory>
-
-namespace arm_compute
-{
-/** GLES implementation of memory object */
-class GCMemory : public IMemory
-{
-public:
- /** Default Constructor */
- GCMemory();
- /** Constructor
- *
- * @param[in] memory Memory to be imported
- */
- GCMemory(const std::shared_ptr<IGCMemoryRegion> &memory);
- /** Constructor
- *
- * @note Ownership of the memory is not transferred to this object.
- * Thus management (allocate/free) should be done by the client.
- *
- * @param[in] memory Memory to be imported
- */
- GCMemory(IGCMemoryRegion *memory);
- /** Allow instances of this class to be copied */
- GCMemory(const GCMemory &) = default;
- /** Allow instances of this class to be copy assigned */
- GCMemory &operator=(const GCMemory &) = default;
- /** Allow instances of this class to be moved */
- GCMemory(GCMemory &&) noexcept = default;
- /** Allow instances of this class to be move assigned */
- GCMemory &operator=(GCMemory &&) noexcept = default;
- /** GLES Region accessor
- *
- * @return Memory region
- */
- IGCMemoryRegion *gc_region();
- /** GLES Region accessor
- *
- * @return Memory region
- */
- IGCMemoryRegion *gc_region() const;
-
- // Inherited methods overridden:
- IMemoryRegion *region() final;
- IMemoryRegion *region() const final;
- void set_region(IMemoryRegion *region) final;
- void set_owned_region(std::unique_ptr<IMemoryRegion> region) final;
-
-private:
- IGCMemoryRegion *_region;
- std::shared_ptr<IGCMemoryRegion> _region_owned;
-};
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_RUNTIME_GLES_COMPUTE_GCMEMORY_H */
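
A short sketch of the two ownership modes documented above, assuming the GCBufferMemoryRegion type from the next header; the function itself is illustrative only:

#include "arm_compute/runtime/GLES_COMPUTE/GCMemory.h"
#include "arm_compute/runtime/GLES_COMPUTE/GCMemoryRegion.h"

#include <memory>

void gc_memory_ownership_sketch()
{
    using namespace arm_compute;

    // Owned region: lifetime is tied to the GCMemory object via shared_ptr.
    auto owned = std::make_shared<GCBufferMemoryRegion>(1024);
    GCMemory memory_owned(owned);

    // Imported region: GCMemory only observes the pointer, so allocation
    // and release remain the caller's responsibility.
    GCBufferMemoryRegion imported(1024);
    GCMemory memory_imported(&imported);
}
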
diff --git a/arm_compute/runtime/GLES_COMPUTE/GCMemoryRegion.h b/arm_compute/runtime/GLES_COMPUTE/GCMemoryRegion.h
deleted file mode 100644
index 90137c3e60..0000000000
--- a/arm_compute/runtime/GLES_COMPUTE/GCMemoryRegion.h
+++ /dev/null
@@ -1,108 +0,0 @@
-/*
- * Copyright (c) 2018-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_RUNTIME_GLES_COMPUTE_GC_MEMORY_REGION_H
-#define ARM_COMPUTE_RUNTIME_GLES_COMPUTE_GC_MEMORY_REGION_H
-
-#include "arm_compute/core/GLES_COMPUTE/OpenGLES.h"
-#include "arm_compute/runtime/IMemoryRegion.h"
-
-#include <cstddef>
-
-namespace arm_compute
-{
-/** GLES memory region interface */
-class IGCMemoryRegion : public IMemoryRegion
-{
-public:
- /** Constructor
- *
- * @param[in] size Region size
- */
- IGCMemoryRegion(size_t size);
- /** Default Destructor */
- virtual ~IGCMemoryRegion() = default;
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- IGCMemoryRegion(const IGCMemoryRegion &) = delete;
- /** Default move constructor */
- IGCMemoryRegion(IGCMemoryRegion &&) = default;
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- IGCMemoryRegion &operator=(const IGCMemoryRegion &) = delete;
- /** Default move assignment operator */
- IGCMemoryRegion &operator=(IGCMemoryRegion &&) = default;
- /** Returns the underlying OpenGL ES SSBO name
- *
- * @return OpenGL ES SSBO object name
- */
- const GLuint &gc_ssbo_name() const;
- /** Host/SVM pointer accessor
- *
- * @return Host/SVM pointer base
- */
- virtual void *ptr() = 0;
- /** Enqueue a map operation of the allocated buffer on the given queue.
- *
- * @param[in] blocking If true, then the mapping will be ready to use by the time
- * this method returns, else it is the caller's responsibility
- * to flush the queue and wait for the mapping operation to have completed before using the returned mapping pointer.
- *
- * @return The mapping address.
- */
- virtual void *map(bool blocking) = 0;
- /** Enqueue an unmap operation of the allocated buffer on the given queue.
- *
- * @note This method simply enqueues the unmap operation; it is the caller's responsibility to flush the queue and make sure the unmap is finished before
- * the memory is accessed by the device.
- *
- */
- virtual void unmap() = 0;
-
- // Inherited methods overridden :
- void *buffer() override;
- const void *buffer() const override;
-
-protected:
- void *_mapping;
- GLuint _ssbo_name;
-};
-
-/** GLES buffer memory region implementation */
-class GCBufferMemoryRegion final : public IGCMemoryRegion
-{
-public:
- /** Constructor
- *
- * @param[in] size Region size
- */
- GCBufferMemoryRegion(size_t size);
- /** Destructor */
- ~GCBufferMemoryRegion();
-
- // Inherited methods overridden :
- void *ptr() final;
- void *map(bool blocking) final;
- void unmap() final;
- std::unique_ptr<IMemoryRegion> extract_subregion(size_t offset, size_t size) final;
-};
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_RUNTIME_GLES_COMPUTE_GC_MEMORY_REGION_H */
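
The map/unmap contract above is easiest to see in a sketch; the driver function is hypothetical, the calls are as declared:

#include "arm_compute/runtime/GLES_COMPUTE/GCMemoryRegion.h"

void region_map_sketch()
{
    arm_compute::GCBufferMemoryRegion region(256);

    // Blocking map: the returned pointer is usable as soon as the call returns.
    void *ptr = region.map(true);
    if(ptr != nullptr)
    {
        // ... read or write the SSBO-backed storage through ptr ...
    }

    // unmap() only enqueues the operation; the caller must make sure it has
    // completed before the device touches the buffer again.
    region.unmap();
}
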
diff --git a/arm_compute/runtime/GLES_COMPUTE/GCRuntimeContext.h b/arm_compute/runtime/GLES_COMPUTE/GCRuntimeContext.h
deleted file mode 100644
index 1b7514ff34..0000000000
--- a/arm_compute/runtime/GLES_COMPUTE/GCRuntimeContext.h
+++ /dev/null
@@ -1,63 +0,0 @@
-/*
- * Copyright (c) 2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_GCRUNTIME_CONTEXT_H
-#define ARM_COMPUTE_GCRUNTIME_CONTEXT_H
-
-#include "arm_compute/core/GLES_COMPUTE/GCCoreRuntimeContext.h"
-#include "arm_compute/core/GLES_COMPUTE/GCKernelLibrary.h"
-#include "arm_compute/core/GLES_COMPUTE/OpenGLES.h"
-#include "arm_compute/runtime/GLES_COMPUTE/GCScheduler.h"
-#include "arm_compute/runtime/IScheduler.h"
-#include "arm_compute/runtime/RuntimeContext.h"
-
-namespace arm_compute
-{
-/** GLES runtime context */
-class GCRuntimeContext : public RuntimeContext
-{
-public:
- /** Default Constructor */
- GCRuntimeContext();
- /** Destructor */
- ~GCRuntimeContext() = default;
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- GCRuntimeContext(const GCRuntimeContext &) = delete;
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- GCRuntimeContext &operator=(const GCRuntimeContext &) = delete;
- /** GPU scheduler setter */
- void set_gpu_scheduler(GCScheduler *scheduler);
-
- // Inherited overridden methods
- GCScheduler *gpu_scheduler();
- GCKernelLibrary &kernel_library();
- GCCoreRuntimeContext *core_runtime_context();
-
-private:
- std::unique_ptr<GCScheduler> _gpu_owned_scheduler{ nullptr };
- GCScheduler *_gpu_scheduler{ nullptr };
- GCKernelLibrary _kernel_lib{};
- GCCoreRuntimeContext _core_context{};
-};
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_GCRUNTIME_CONTEXT_H */
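
A sketch of wiring the context to a scheduler, under the assumption (not guaranteed by the header alone) that injecting the global scheduler was a supported pattern:

#include "arm_compute/runtime/GLES_COMPUTE/GCRuntimeContext.h"
#include "arm_compute/runtime/GLES_COMPUTE/GCScheduler.h"

void runtime_context_sketch()
{
    arm_compute::GCRuntimeContext ctx;

    // By default the context owns a scheduler; an external one can be set
    // instead, in which case ownership stays with the caller.
    ctx.set_gpu_scheduler(&arm_compute::GCScheduler::get());

    arm_compute::GCScheduler *scheduler = ctx.gpu_scheduler();
    static_cast<void>(scheduler); // e.g. pass &ctx to functions such as GCActivationLayer
}
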
diff --git a/arm_compute/runtime/GLES_COMPUTE/GCScheduler.h b/arm_compute/runtime/GLES_COMPUTE/GCScheduler.h
deleted file mode 100644
index 5aac6b88dc..0000000000
--- a/arm_compute/runtime/GLES_COMPUTE/GCScheduler.h
+++ /dev/null
@@ -1,107 +0,0 @@
-/*
- * Copyright (c) 2017-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#ifndef ARM_COMPUTE_GCSCHEDULER_H
-#define ARM_COMPUTE_GCSCHEDULER_H
-
-#include "arm_compute/core/GLES_COMPUTE/IGCKernel.h"
-#include "arm_compute/core/Types.h"
-
-namespace arm_compute
-{
-// Forward declarations
-class IGCKernel;
-
-/** Provides global access to an OpenGL ES context and command queue. */
-class GCScheduler final
-{
-public:
- /** Constructor */
- GCScheduler();
- /** Destructor */
- ~GCScheduler();
- /** Prevent instances of this class from being copied */
- GCScheduler(const GCScheduler &) = delete;
- /** Prevent instances of this class from being copied */
- GCScheduler &operator=(const GCScheduler &) = delete;
- /** Access the scheduler singleton.
- *
- * @return The scheduler
- */
- static GCScheduler &get();
- /** Initialises the display and context used by the scheduler to default values
- * and sets a default device and kernel path for the @ref GCKernelLibrary.
- */
- void default_init();
- /** Initializes the context and display used by the Scheduler.
- *
- * @param[in] display Display to use
- * @param[in] ctx Context to use
- */
- void default_init_with_context(EGLDisplay display, EGLContext ctx);
- /** Schedule the execution of the passed kernel if possible.
- *
- * @param[in] kernel Kernel to execute.
- * @param[in] flush (Optional) Specifies if the command queue will be flushed after running the kernel.
- */
- void dispatch(IGCKernel &kernel, bool flush = true);
- /** Initialises the display and context to be used by the scheduler.
- *
- * @param[in] dpy The EGL display connection
- * @param[in] ctx The EGL rendering context
- */
- void init(EGLDisplay dpy, EGLContext ctx);
- /** Defines a barrier ordering memory transactions. */
- void memory_barrier();
- /** Get the target GPU.
- *
- * @return The target GPU.
- */
- GPUTarget get_target() const
- {
- return _target;
- }
- /** Accessor to set target GPU to be used by the scheduler.
- *
- * @param[in] target The target GPU.
- */
- void set_target(GPUTarget target)
- {
- _target = target;
- }
-
-private:
- /** Set up EGL context */
- void setup_context();
-
- /** Flag used to ensure the GLES symbols are initialised before the Scheduler is created */
- static std::once_flag _initialize_symbols;
-
- EGLDisplay _display; /**< Underlying EGL Display. */
- EGLContext _context; /**< Underlying EGL Context. */
-
- GPUTarget _target;
-};
-}
-#endif /* ARM_COMPUTE_GCSCHEDULER_H */
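
Typical driving sequence for the singleton, sketched against the header above; kernel dispatch is elided because it requires a configured IGCKernel:

#include "arm_compute/runtime/GLES_COMPUTE/GCScheduler.h"

void scheduler_init_sketch()
{
    using arm_compute::GCScheduler;

    // One-time initialisation with a default EGL display/context pair.
    GCScheduler::get().default_init();

    // Alternatively, reuse an existing EGL setup:
    //   GCScheduler::get().default_init_with_context(display, context);

    // ... configure functions/kernels, then GCScheduler::get().dispatch(kernel) ...

    // Order SSBO writes before any subsequent reads on the device.
    GCScheduler::get().memory_barrier();
}
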
diff --git a/arm_compute/runtime/GLES_COMPUTE/GCTensor.h b/arm_compute/runtime/GLES_COMPUTE/GCTensor.h
deleted file mode 100644
index 0565950fe0..0000000000
--- a/arm_compute/runtime/GLES_COMPUTE/GCTensor.h
+++ /dev/null
@@ -1,106 +0,0 @@
-/*
- * Copyright (c) 2017-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#ifndef ARM_COMPUTE_GCTENSOR_H
-#define ARM_COMPUTE_GCTENSOR_H
-
-#include "arm_compute/core/GLES_COMPUTE/IGCTensor.h"
-#include "arm_compute/runtime/GLES_COMPUTE/GCTensorAllocator.h"
-
-namespace arm_compute
-{
-class ITensorAllocator;
-class ITensorInfo;
-class IRuntimeContext;
-
-/** Basic implementation of the OpenGL ES tensor interface */
-class GCTensor : public IGCTensor, public IMemoryManageable
-{
-public:
- /** Default constructor
- *
- * @param[in] ctx (Optional) Pointer to the runtime context.
- *
- */
- GCTensor(IRuntimeContext *ctx = nullptr);
-
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- GCTensor(const GCTensor &) = delete;
-
- /** Prevent instances of this class from being copy assigned (As this class contains pointers) */
- GCTensor &operator=(const GCTensor &) = delete;
-
- /** Allow instances of this class to be moved */
- GCTensor(GCTensor &&) = default;
-
- /** Allow instances of this class to be moved */
- GCTensor &operator=(GCTensor &&) = default;
-
- /** Virtual destructor */
- virtual ~GCTensor() = default;
-
- /** Return a pointer to the tensor's allocator
- *
- * @return A pointer to the tensor's allocator
- */
- ITensorAllocator *allocator();
-
- /** Enqueue a map operation of the allocated buffer on the given queue.
- *
- * @param[in] blocking (Optional) If true, then the mapping will be ready to use by the time
- * this method returns, else it is the caller's responsibility
- * to flush the queue and wait for the mapping operation to have completed before using the returned mapping pointer.
- *
- * @return The mapping address.
- */
- void map(bool blocking = true);
-
- /** Enqueue an unmap operation of the allocated and mapped buffer on the given queue.
- *
- * @note This method simply enqueues the unmap operation; it is the caller's responsibility to flush the queue and make sure the unmap is finished before
- * the memory is accessed by the device.
- *
- */
- void unmap();
-
- // Inherited methods overridden:
- TensorInfo *info() const override;
- TensorInfo *info() override;
- uint8_t *buffer() const override;
- GLuint gc_buffer() const override;
- void associate_memory_group(IMemoryGroup *memory_group) override;
-
-protected:
- // Inherited methods overridden:
- uint8_t *do_map(bool blocking) override;
- void do_unmap() override;
-
-private:
- mutable GCTensorAllocator _allocator; /**< Instance of the OpenGL ES tensor allocator */
-};
-
-/** OpenGL ES Image */
-using GCImage = GCTensor;
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_GCTENSOR_H */
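
The allocate/map/unmap round trip for a GCTensor, sketched with an assumed 16x16 F16 shape; the shape and fill step are illustrative:

#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/TensorShape.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/runtime/GLES_COMPUTE/GCTensor.h"

#include <cstdint>

void tensor_map_sketch()
{
    using namespace arm_compute;

    GCTensor tensor;
    tensor.allocator()->init(TensorInfo(TensorShape(16U, 16U), 1, DataType::F16));
    tensor.allocator()->allocate();

    // Blocking map exposes a host-visible pointer into the SSBO.
    tensor.map(true);
    uint8_t *data = tensor.buffer();
    // ... fill the tensor through data ...
    static_cast<void>(data);
    tensor.unmap();
}
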
diff --git a/arm_compute/runtime/GLES_COMPUTE/GCTensorAllocator.h b/arm_compute/runtime/GLES_COMPUTE/GCTensorAllocator.h
deleted file mode 100644
index ee9861cbd1..0000000000
--- a/arm_compute/runtime/GLES_COMPUTE/GCTensorAllocator.h
+++ /dev/null
@@ -1,128 +0,0 @@
-/*
- * Copyright (c) 2017-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#ifndef ARM_COMPUTE_GCTENSORALLOCATOR_H
-#define ARM_COMPUTE_GCTENSORALLOCATOR_H
-
-#include "arm_compute/core/GLES_COMPUTE/OpenGLES.h"
-#include "arm_compute/runtime/GLES_COMPUTE/GCMemory.h"
-#include "arm_compute/runtime/ITensorAllocator.h"
-#include "arm_compute/runtime/MemoryGroup.h"
-
-#include <memory>
-
-namespace arm_compute
-{
-class GCTensor;
-
-/** Basic implementation of a GLES memory tensor allocator. */
-class GCTensorAllocator : public ITensorAllocator
-{
-public:
- /** Default constructor. */
- GCTensorAllocator(IMemoryManageable *owner = nullptr);
-
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- GCTensorAllocator(const GCTensorAllocator &) = delete;
-
- /** Prevent instances of this class from being copy assigned (As this class contains pointers) */
- GCTensorAllocator &operator=(const GCTensorAllocator &) = delete;
-
- /** Allow instances of this class to be moved */
- GCTensorAllocator(GCTensorAllocator &&) = default;
-
- /** Allow instances of this class to be moved */
- GCTensorAllocator &operator=(GCTensorAllocator &&) = default;
-
- /** Default destructor */
- ~GCTensorAllocator() = default;
-
- /** Returns a pointer to the mapped data.
- *
- * @return a pointer to the data.
- */
- uint8_t *data();
-
- /** Get the OpenGL ES buffer object name
- *
- * @return The buffer object name
- */
- GLuint get_gl_ssbo_name() const;
-
- /** Enqueue a map operation of the allocated buffer on the given queue.
- *
- * @param[in] blocking If true, then the mapping will be ready to use by the time
- * this method returns, else it is the caller's responsibility
- * to flush the queue and wait for the mapping operation to have completed before using the returned mapping pointer.
- *
- * @return The mapping address.
- */
- uint8_t *map(bool blocking);
-
- /** Enqueue an unmap operation of the allocated buffer on the given queue.
- *
- * @note This method simply enqueues the unmap operation; it is the caller's responsibility to flush the queue and make sure the unmap is finished before
- * the memory is accessed by the device.
- *
- */
- void unmap();
-
- /** Allocate size specified by TensorInfo of GLES memory.
- *
- * @note The tensor must not already be allocated when calling this function.
- *
- */
- void allocate() override;
-
- /** Free allocated GLES memory.
- *
- * @note The tensor must have been allocated when calling this function.
- *
- */
- void free() override;
-
- /** Associates the tensor with a memory group
- *
- * @param[in] associated_memory_group Memory group to associate the tensor with
- */
- void set_associated_memory_group(IMemoryGroup *associated_memory_group);
-
-protected:
- /** Call map() on the SSBO.
- *
- * @return A pointer to the beginning of the tensor's allocation.
- */
- uint8_t *lock() override;
-
- /** Call unmap() on the SSBO. */
- void unlock() override;
-
-private:
- IMemoryManageable *_owner; /**< Owner of the allocator */
- IMemoryGroup *_associated_memory_group; /**< Registered memory group */
- GCMemory _memory; /**< OpenGL ES memory */
- uint8_t *_mapping; /**< Pointer to the CPU mapping of the OpenGL ES buffer. */
-};
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_GCTENSORALLOCATOR_H */
diff --git a/arm_compute/runtime/GLES_COMPUTE/functions/GCAbsoluteDifference.h b/arm_compute/runtime/GLES_COMPUTE/functions/GCAbsoluteDifference.h
deleted file mode 100644
index 40e0c1a752..0000000000
--- a/arm_compute/runtime/GLES_COMPUTE/functions/GCAbsoluteDifference.h
+++ /dev/null
@@ -1,52 +0,0 @@
-/*
- * Copyright (c) 2017-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#ifndef ARM_COMPUTE_GCABSOLUTEDIFFERENCE_H
-#define ARM_COMPUTE_GCABSOLUTEDIFFERENCE_H
-
-#include "arm_compute/runtime/GLES_COMPUTE/IGCSimpleFunction.h"
-
-namespace arm_compute
-{
-class ITensor;
-
-/** Basic function to run @ref GCAbsoluteDifferenceKernel
- *
- * @note The tensor data types for the inputs must be U8.
- * @note The function calculates the absolute difference even when the two inputs have different tensor data types.
- */
-class GCAbsoluteDifference : public IGCSimpleFunction
-{
-public:
- /** Initialize the function
- *
- * @param[in] input1 First input tensor. Data types supported: U8
- * @param[in] input2 Second input tensor. Data types supported: U8
- * @param[out] output Output tensor. Data types supported: U8
- */
- void configure(const IGCTensor *input1, const IGCTensor *input2, IGCTensor *output);
-};
-}
-
-#endif /* ARM_COMPUTE_GCABSOLUTEDIFFERENCE_H */
diff --git a/arm_compute/runtime/GLES_COMPUTE/functions/GCActivationLayer.h b/arm_compute/runtime/GLES_COMPUTE/functions/GCActivationLayer.h
deleted file mode 100644
index fc7a0a0134..0000000000
--- a/arm_compute/runtime/GLES_COMPUTE/functions/GCActivationLayer.h
+++ /dev/null
@@ -1,67 +0,0 @@
-/*
- * Copyright (c) 2017-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_GCACTIVATIONLAYER_H
-#define ARM_COMPUTE_GCACTIVATIONLAYER_H
-
-#include "arm_compute/core/Types.h"
-#include "arm_compute/runtime/GLES_COMPUTE/IGCSimpleFunction.h"
-
-namespace arm_compute
-{
-// Forward declarations
-class IGCTensor;
-
-/** Basic function to run @ref GCActivationLayerKernel
- *
- * @note The function simulates an activation layer with the specified activation function.
- */
-class GCActivationLayer : public IGCSimpleFunction
-{
-public:
- /** Constructor
- *
- * @param[in, out] ctx Runtime context to be used by the function
- */
- explicit GCActivationLayer(GCRuntimeContext *ctx = nullptr);
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- GCActivationLayer(const GCActivationLayer &) = delete;
- /** Default move constructor */
- GCActivationLayer(GCActivationLayer &&) = default;
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- GCActivationLayer &operator=(const GCActivationLayer &) = delete;
- /** Default move assignment operator */
- GCActivationLayer &operator=(GCActivationLayer &&) = default;
- /** Set the input and output tensor.
- *
- * @note If the output tensor is nullptr, the activation function will be performed in-place
- *
- * @param[in, out] input Source tensor. In case of @p output tensor = nullptr, this tensor will store the result
- * of the activation function. Data types supported: F16/F32.
- * @param[out] output Destination tensor. Data type supported: same as @p input
- * @param[in] act_info Activation layer parameters.
- */
- void configure(IGCTensor *input, IGCTensor *output, ActivationLayerInfo act_info);
-};
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_GCACTIVATIONLAYER_H */
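
A sketch of the in-place mode called out in the note above; the caller-provided tensor is assumed to hold allocated F16/F32 data:

#include "arm_compute/runtime/GLES_COMPUTE/functions/GCActivationLayer.h"

void activation_sketch(arm_compute::IGCTensor *input)
{
    using namespace arm_compute;

    GCActivationLayer act;
    // Passing nullptr as the output applies the activation in-place on input.
    act.configure(input, nullptr,
                  ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU));
    act.run();
}
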
diff --git a/arm_compute/runtime/GLES_COMPUTE/functions/GCArithmeticAddition.h b/arm_compute/runtime/GLES_COMPUTE/functions/GCArithmeticAddition.h
deleted file mode 100644
index 65bbacf272..0000000000
--- a/arm_compute/runtime/GLES_COMPUTE/functions/GCArithmeticAddition.h
+++ /dev/null
@@ -1,64 +0,0 @@
-/*
- * Copyright (c) 2016-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_GCARITHMETICADDITION_H
-#define ARM_COMPUTE_GCARITHMETICADDITION_H
-
-#include "arm_compute/core/Types.h"
-#include "arm_compute/runtime/GLES_COMPUTE/IGCSimpleFunction.h"
-
-namespace arm_compute
-{
-class IGCTensor;
-
-/** Basic function to run @ref GCArithmeticAdditionKernel
- *
- * @note The tensor data type for the inputs must be F16.
- * @note The function performs an arithmetic addition between two tensors.
- */
-class GCArithmeticAddition : public IGCSimpleFunction
-{
-public:
- /** Initialise the kernel's inputs, output and conversion policy.
- *
- * @param[in] input1 First tensor input. Data types supported: F16.
- * @param[in] input2 Second tensor input. Data types supported: F16.
- * @param[out] output Output tensor. Data types supported: F16.
- * @param[in] policy Policy to use to handle overflow.
- * @param[in] act_info (Optional) Activation layer information in case of a fused activation. Currently not supported.
- */
- void configure(const IGCTensor *input1, const IGCTensor *input2, IGCTensor *output, ConvertPolicy policy, const ActivationLayerInfo &act_info = ActivationLayerInfo());
- /** Static function to check if given info will lead to a valid configuration of @ref GCArithmeticAddition
- *
- * @param[in] input1 First tensor input info. Data types supported: F16.
- * @param[in] input2 Second tensor input info. Data types supported: F16.
- * @param[in] output Output tensor info. Data types supported: F16.
- * @param[in] policy Policy to use to handle overflow.
- * @param[in] act_info (Optional) Activation layer information in case of a fused activation. Currently not supported.
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, ConvertPolicy policy, const ActivationLayerInfo &act_info = ActivationLayerInfo());
-};
-}
-#endif /* ARM_COMPUTE_GCARITHMETICADDITION_H */
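
The validate-then-configure pattern the static function enables, sketched with assumed pre-allocated F16 tensors:

#include "arm_compute/core/GLES_COMPUTE/IGCTensor.h"
#include "arm_compute/runtime/GLES_COMPUTE/functions/GCArithmeticAddition.h"

void addition_sketch(arm_compute::IGCTensor *a, arm_compute::IGCTensor *b, arm_compute::IGCTensor *out)
{
    using namespace arm_compute;

    // Check the configuration first: only F16 inputs are accepted.
    Status status = GCArithmeticAddition::validate(a->info(), b->info(), out->info(),
                                                   ConvertPolicy::SATURATE);
    if(bool(status))
    {
        GCArithmeticAddition add;
        add.configure(a, b, out, ConvertPolicy::SATURATE);
        add.run();
    }
}
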
diff --git a/arm_compute/runtime/GLES_COMPUTE/functions/GCBatchNormalizationLayer.h b/arm_compute/runtime/GLES_COMPUTE/functions/GCBatchNormalizationLayer.h
deleted file mode 100644
index 2d0aff53b6..0000000000
--- a/arm_compute/runtime/GLES_COMPUTE/functions/GCBatchNormalizationLayer.h
+++ /dev/null
@@ -1,69 +0,0 @@
-/*
- * Copyright (c) 2017-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_GCBATCHNORMALIZATIONLAYER_H
-#define ARM_COMPUTE_GCBATCHNORMALIZATIONLAYER_H
-
-#include "arm_compute/runtime/IFunction.h"
-
-#include "arm_compute/core/GLES_COMPUTE/kernels/GCBatchNormalizationLayerKernel.h"
-#include "arm_compute/core/Types.h"
-
-namespace arm_compute
-{
-class IGCTensor;
-
-/** Basic function to run @ref GCBatchNormalizationLayerKernel and simulate a batch normalization layer.
- *
- * Batch normalization is calculated by:
- * @f[ out_i = \gamma * (\frac{in_i - \mu_{B}}{\sqrt{\sigma^2_{B} + \epsilon}}) + \beta \equiv BN_{\gamma,\beta}(in_i) @f]
- *
- */
-class GCBatchNormalizationLayer : public IFunction
-{
-public:
- /** Default constructor */
- GCBatchNormalizationLayer();
- /** Set the input and output tensors.
- *
- * @param[in] input Source tensor. 3 lower dimensions represent a single input with dimensions [width, height, FM].
- * The rest are optional and used for representing batches. Data types supported: F16/F32.
- * @param[out] output Destination tensor. Output will have the same number of dimensions as input. Data type supported: same as @p input
- * @param[in] mean Mean values tensor. 1 dimension with size equal to the feature maps [FM]. Data types supported: Same as @p input
- * @param[in] var Variance values tensor. 1 dimension with size equal to the feature maps [FM]. Data types supported: Same as @p input
- * @param[in] beta Beta values tensor. 1 dimension with size equal to the feature maps [FM]. Data types supported: Same as @p input
- * @param[in] gamma Gamma values tensor. 1 dimension with size equal to the feature maps [FM]. Data types supported: Same as @p input
- * @param[in] epsilon Small value to avoid division by zero.
- * @param[in] act_info (Optional) Activation layer information in case of a fused activation. Only RELU, BOUNDED_RELU and LU_BOUNDED_RELU supported.
- */
- void configure(const IGCTensor *input, IGCTensor *output, const IGCTensor *mean, const IGCTensor *var, const IGCTensor *beta, const IGCTensor *gamma, float epsilon,
- ActivationLayerInfo act_info = ActivationLayerInfo());
-
- // Inherited methods overridden:
- void run() override;
-
-private:
- GCBatchNormalizationLayerKernel _norm_kernel; /**< BatchNormalization layer kernel to run */
-};
-}
-#endif /* ARM_COMPUTE_GCBATCHNORMALIZATIONLAYER_H */
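
A configure sketch matching the formula above; all tensors are assumed pre-allocated, and the epsilon value is illustrative:

#include "arm_compute/runtime/GLES_COMPUTE/functions/GCBatchNormalizationLayer.h"

void batch_norm_sketch(const arm_compute::IGCTensor *in, arm_compute::IGCTensor *out,
                       const arm_compute::IGCTensor *mean, const arm_compute::IGCTensor *var,
                       const arm_compute::IGCTensor *beta, const arm_compute::IGCTensor *gamma)
{
    using namespace arm_compute;

    GCBatchNormalizationLayer bn;
    // epsilon guards the sqrt(var + epsilon) denominator against zero variance.
    bn.configure(in, out, mean, var, beta, gamma, 1e-5f,
                 ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU));
    bn.run();
}
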
diff --git a/arm_compute/runtime/GLES_COMPUTE/functions/GCConcatenateLayer.h b/arm_compute/runtime/GLES_COMPUTE/functions/GCConcatenateLayer.h
deleted file mode 100644
index eb25dbfc9a..0000000000
--- a/arm_compute/runtime/GLES_COMPUTE/functions/GCConcatenateLayer.h
+++ /dev/null
@@ -1,68 +0,0 @@
-/*
- * Copyright (c) 2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_GCDEPTHCONCATENATELAYER_H
-#define ARM_COMPUTE_GCDEPTHCONCATENATELAYER_H
-
-#include "arm_compute/core/GLES_COMPUTE/OpenGLES.h"
-#include "arm_compute/core/GLES_COMPUTE/kernels/GCDepthConcatenateLayerKernel.h"
-#include "arm_compute/core/Types.h"
-#include "arm_compute/runtime/IFunction.h"
-
-#include <memory>
-#include <vector>
-
-namespace arm_compute
-{
-class IGCTensor;
-
-/** Basic function to concatenate tensors along a given axis. This function calls the following kernel:
- *
- * @note Only axis z is supported.
- * -# @ref GCDepthConcatenateLayerKernel
- */
-class GCConcatenateLayer : public IFunction
-{
-public:
- /** Default constructor */
- GCConcatenateLayer();
- /** Initialise the kernel's inputs vector and output.
- *
- * @note Preconditions on the input and output tensor dimensions differ depending on the concatenation axis.
- *
- * @param[in,out] inputs_vector The vector containing all the tensors to concatenate. Data types supported: F16/F32.
- * @param[out] output Output tensor. Data types supported: Same as @p input.
- * @param[in] axis Concatenation axis. Supported underlying concatenation axis is 2.
- */
- void configure(std::vector<IGCTensor *> inputs_vector, IGCTensor *output, size_t axis);
-
- // Inherited methods overridden:
- void run() override;
-
-private:
- std::vector<std::unique_ptr<IGCKernel>> _concat_kernels;
- unsigned int _num_inputs;
- unsigned int _axis;
-};
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_GCDEPTHCONCATENATELAYER_H */
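
A sketch of concatenating two feature maps along the only supported axis (z); tensor setup is assumed to have happened elsewhere:

#include "arm_compute/runtime/GLES_COMPUTE/functions/GCConcatenateLayer.h"

#include <vector>

void concat_sketch(arm_compute::IGCTensor *t0, arm_compute::IGCTensor *t1, arm_compute::IGCTensor *out)
{
    using namespace arm_compute;

    GCConcatenateLayer concat;
    // Axis 2 (depth) is the only axis the underlying kernel supports.
    concat.configure(std::vector<IGCTensor *>{ t0, t1 }, out, 2);
    concat.run();
}
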
diff --git a/arm_compute/runtime/GLES_COMPUTE/functions/GCConvolutionLayer.h b/arm_compute/runtime/GLES_COMPUTE/functions/GCConvolutionLayer.h
deleted file mode 100644
index 311b3e273b..0000000000
--- a/arm_compute/runtime/GLES_COMPUTE/functions/GCConvolutionLayer.h
+++ /dev/null
@@ -1,155 +0,0 @@
-/*
- * Copyright (c) 2017-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#ifndef ARM_COMPUTE_GCCONVOLUTIONLAYER_H
-#define ARM_COMPUTE_GCCONVOLUTIONLAYER_H
-
-#include "arm_compute/core/GLES_COMPUTE/kernels/GCCol2ImKernel.h"
-#include "arm_compute/core/GLES_COMPUTE/kernels/GCFillBorderKernel.h"
-#include "arm_compute/core/GLES_COMPUTE/kernels/GCIm2ColKernel.h"
-#include "arm_compute/core/GLES_COMPUTE/kernels/GCWeightsReshapeKernel.h"
-#include "arm_compute/core/Types.h"
-#include "arm_compute/runtime/GLES_COMPUTE/GCTensor.h"
-#include "arm_compute/runtime/GLES_COMPUTE/functions/GCActivationLayer.h"
-#include "arm_compute/runtime/GLES_COMPUTE/functions/GCGEMM.h"
-#include "arm_compute/runtime/IFunction.h"
-#include "arm_compute/runtime/MemoryGroup.h"
-
-#include <memory>
-
-namespace arm_compute
-{
-class IGCTensor;
-
-/** Function to reshape and transpose the weights. This function calls the following kernels:
- * -# @ref GCWeightsReshapeKernel
- */
-class GCConvolutionLayerReshapeWeights : public IFunction
-{
-public:
- /** Constructor */
- GCConvolutionLayerReshapeWeights();
- /** Set the input and output tensors.
- *
- * @param[in] weights Weights tensor. Weights are 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM].
- * Data type supported: F16/F32.
- * @param[in] biases Biases tensor. Shared biases supported. Biases are 1D tensor with dimensions [OFM]. Data type supported: Same as @p weights.
- * @param[out] output Destination tensor. Data types supported: Same as @p weights.
- */
- void configure(const IGCTensor *weights, const IGCTensor *biases, IGCTensor *output);
- // Inherited methods overridden:
- void run() override;
-
-private:
- GCWeightsReshapeKernel _weights_reshape_kernel;
-};
-
-/** Basic function to compute the convolution layer. This function calls the following GLES kernels:
- *
- * -# @ref GCWeightsReshapeKernel (executed only once for each configuration)
- * -# @ref GCGEMMTranspose1xWKernel (executed only once for each configuration)
- * -# @ref GCIm2ColKernel
- * -# @ref GCGEMMInterleave4x4Kernel
- * -# @ref GCCol2ImKernel
- */
-class GCConvolutionLayer : public IFunction
-{
-public:
- /** Default constructor */
- GCConvolutionLayer(std::shared_ptr<IMemoryManager> memory_manager = nullptr);
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- GCConvolutionLayer(const GCConvolutionLayer &) = delete;
- /** Default move constructor */
- GCConvolutionLayer(GCConvolutionLayer &&) = default;
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- GCConvolutionLayer &operator=(const GCConvolutionLayer &) = delete;
- /** Default move assignment operator */
- GCConvolutionLayer &operator=(GCConvolutionLayer &&) = default;
- /** Set the input and output tensors.
- *
- * @param[in] input Source tensor. 3 lower dimensions represent a single input [width, height, IFM],
- * while every optional dimension from 4 and above represent a batch of inputs.
- * Data types supported: F16/F32.
- * @param[in] weights Weights tensor. Weights are 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM]. Data type supported: Same as @p input.
- * @param[in] biases Biases tensor. Shared biases supported. Biases are 1D tensor with dimensions [OFM].
- * Data type supported: Should match @p input data type, except for input of QASYMM8 type where biases should be of S32 type.
- * @param[out] output Destination tensor. 3 lower dimensions represent a single output [width, height, OFM], while the rest represent batch of outputs.
- * Data types supported: Same as @p input.
- * @param[in] conv_info Contains padding and stride information described in @ref PadStrideInfo.
- * @param[in] weights_info Specifies if the weights tensor has been reshaped with GCWeightsReshapeKernel. If this is not part of the fully connected layer the weights
- * tensor has also been transposed with GCGEMMTranspose1xWKernel. Data type supported: Same as @p input.
- * @param[in] dilation (Optional) Dilation, in elements, across x and y. Defaults to (1, 1).
- * @param[in] act_info (Optional) Activation layer information in case of a fused activation.
- * @param[in] num_groups (Optional) Number of groups when performing a grouped convolution. num_groups != 1 is not supported
- */
- void configure(const IGCTensor *input, const IGCTensor *weights, const IGCTensor *biases, IGCTensor *output, const PadStrideInfo &conv_info,
- const WeightsInfo &weights_info = WeightsInfo(), const Size2D &dilation = Size2D(1U, 1U), const ActivationLayerInfo &act_info = ActivationLayerInfo(), unsigned int num_groups = 1);
-
- // Inherited methods overridden:
- void run() override;
- void prepare() override;
-
-private:
- /** Configures the appropriate matrix multiply routine
- *
- * @param input Input tensor. Data types supported: F16/F32.
- * @param weights Weights tensor. Data type supported: Same as @p input.
- * @param output Output tensor. Data types supported: Same as @p input.
- */
- void configure_mm(const IGCTensor *input, const IGCTensor *weights, IGCTensor *output);
- /** Static function to check if given info will lead to a valid configuration of @ref GCGEMMConvolutionLayer matrix multiply routines
- *
- * @param[in] input Input tensor. Data types supported: QASYMM8/F16/F32.
- * @param[in] weights Weights tensor. Data type supported: Same as @p input.
- * @param[in] output Output tensor. Data types supported: Same as @p input,
- * except for input of QASYMM8 type where output should be of S32 type.
- *
- * @return a status
- */
- static Status validate_mm(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *output);
-
-private:
- MemoryGroup _memory_group;
- GCConvolutionLayerReshapeWeights _reshape_weights;
- GCIm2ColKernel _input_im2col_kernel;
- GCGEMM _mm_gemm;
- GCCol2ImKernel _output_col2im_kernel;
- GCFillBorderKernel _fill_border;
- GCActivationLayer _activationlayer_function;
-
- const IGCTensor *_original_weights;
-
- GCTensor _input_im2col_reshaped;
- GCTensor _input_interleaved_reshaped;
- GCTensor _weights_reshaped;
- GCTensor _weights_transposed;
- GCTensor _gemm_output;
- GCTensor _tmp_output;
-
- bool _is_activationlayer_enabled;
- bool _is_prepared;
-};
-}
-
-#endif /* ARM_COMPUTE_GCCONVOLUTIONLAYER_H */
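
A configure/prepare/run sketch for the GEMM-based path described above; the 3x3, stride-1, pad-1 geometry is an assumption for illustration:

#include "arm_compute/runtime/GLES_COMPUTE/functions/GCConvolutionLayer.h"

void convolution_sketch(const arm_compute::IGCTensor *src, const arm_compute::IGCTensor *weights,
                        const arm_compute::IGCTensor *biases, arm_compute::IGCTensor *dst)
{
    using namespace arm_compute;

    GCConvolutionLayer conv;
    // Stride (1, 1) with padding (1, 1), i.e. "same" output size for 3x3 kernels.
    conv.configure(src, weights, biases, dst, PadStrideInfo(1, 1, 1, 1));

    conv.prepare(); // One-off weights reshape/transpose.
    conv.run();
}
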
diff --git a/arm_compute/runtime/GLES_COMPUTE/functions/GCDepthwiseConvolutionLayer.h b/arm_compute/runtime/GLES_COMPUTE/functions/GCDepthwiseConvolutionLayer.h
deleted file mode 100644
index e3f2c38446..0000000000
--- a/arm_compute/runtime/GLES_COMPUTE/functions/GCDepthwiseConvolutionLayer.h
+++ /dev/null
@@ -1,76 +0,0 @@
-/*
- * Copyright (c) 2017-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_GCDEPTHWISECONVOLUTION_H
-#define ARM_COMPUTE_GCDEPTHWISECONVOLUTION_H
-
-#include "arm_compute/core/GLES_COMPUTE/kernels/GCDepthwiseConvolutionLayer3x3Kernel.h"
-#include "arm_compute/core/GLES_COMPUTE/kernels/GCFillBorderKernel.h"
-#include "arm_compute/core/GLES_COMPUTE/kernels/GCTensorShiftKernel.h"
-#include "arm_compute/core/Types.h"
-#include "arm_compute/runtime/GLES_COMPUTE/functions/GCActivationLayer.h"
-#include "arm_compute/runtime/IFunction.h"
-
-namespace arm_compute
-{
-class IGCTensor;
-
-/** Basic function to execute a depthwise convolution for kernel size 3x3xC. This function calls the following OpenGLES kernels:
- *
- * -# @ref GCDepthwiseConvolutionLayer3x3Kernel
- * -# @ref GCFillBorderKernel (if pad_x or pad_y > 0)
- *
- */
-class GCDepthwiseConvolutionLayer3x3 : public IFunction
-{
-public:
- /** Default constructor */
- GCDepthwiseConvolutionLayer3x3();
- /** Initialize the function's source, destination, conv and border_size.
- *
- * @param[in, out] input Source tensor. Data type supported: F16. (Written to only for border filling).
- * @param[in] weights Weights tensor. A 3D tensor with shape [3, 3, IFM]. Data type supported: Same as @p input.
- * @param[in] biases Biases tensor. A 1D tensor with shape [IFM]. Must be nullptr if not needed.
- * Data type supported: Same as @p input.
- * @param[out] output Destination tensor. Data type supported: same as @p input.
- * @param[in] conv_info Padding and stride information to use for the convolution.
- * @param[in] depth_multiplier (Optional) Multiplier to apply to the input's depth in order to retrieve the output's depth. Defaults to 1.
- * @param[in] act_info (Optional) Activation layer information in case of a fused activation.
- * @param[in] dilation (Optional) Dilation, in elements, across x and y. Defaults to (1, 1). Currently supports (1,1) only.
- */
- void configure(IGCTensor *input, const IGCTensor *weights, const IGCTensor *biases, IGCTensor *output, const PadStrideInfo &conv_info,
- unsigned int depth_multiplier = 1, const ActivationLayerInfo &act_info = ActivationLayerInfo(), const Size2D &dilation = Size2D(1U, 1U));
-
- // Inherited methods overridden:
- void run() override final;
-
-private:
- std::unique_ptr<IGCKernel> _kernel;
- GCFillBorderKernel _border_handler;
- GCTensorShiftKernel _shift_handler;
- GCActivationLayer _activationlayer_function;
-
- bool _is_activationlayer_enabled;
-};
-}
-#endif /*ARM_COMPUTE_GCDEPTHWISECONVOLUTION_H */
diff --git a/arm_compute/runtime/GLES_COMPUTE/functions/GCDirectConvolutionLayer.h b/arm_compute/runtime/GLES_COMPUTE/functions/GCDirectConvolutionLayer.h
deleted file mode 100644
index cb62fef1d3..0000000000
--- a/arm_compute/runtime/GLES_COMPUTE/functions/GCDirectConvolutionLayer.h
+++ /dev/null
@@ -1,80 +0,0 @@
-/*
- * Copyright (c) 2017-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_GCDIRECTCONVOLUTIONLAYER_H
-#define ARM_COMPUTE_GCDIRECTCONVOLUTIONLAYER_H
-
-#include "arm_compute/core/GLES_COMPUTE/IGCKernel.h"
-#include "arm_compute/core/GLES_COMPUTE/kernels/GCDirectConvolutionLayerKernel.h"
-#include "arm_compute/core/GLES_COMPUTE/kernels/GCFillBorderKernel.h"
-#include "arm_compute/core/GLES_COMPUTE/kernels/GCTensorShiftKernel.h"
-#include "arm_compute/core/Types.h"
-#include "arm_compute/runtime/GLES_COMPUTE/functions/GCActivationLayer.h"
-#include "arm_compute/runtime/IFunction.h"
-
-#include <memory>
-
-namespace arm_compute
-{
-class IGCTensor;
-
-/** Basic function to execute direct convolution function. This function calls the following kernels:
- *
- * -# @ref GCDirectConvolutionLayerKernel
- * -# @ref GCFillBorderKernel
- * -# @ref GCTensorShiftKernel
- *
- * @note Supported kernel size: 1x1, 3x3, and 5x5
- * @note This OpenGL ES implementation works with stride_x = 1 and 2
- */
-class GCDirectConvolutionLayer : public IFunction
-{
-public:
- /** Default constructor */
- GCDirectConvolutionLayer();
- /** Set the input and output tensors.
- *
- * @param[in,out] input Source tensor. 3 lower dimensions represent a single input [width, height, IFM],
- * while every optional dimension from 4 and above represent a batch of inputs.
- * Data types supported: F16/F32.
- * input will be written to only if it is currently left aligned.
- * @param[in] weights Weights tensor. Weights are 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM]. Data type supported:Same as @p input.
- * @param[in] biases Biases tensor. Shared biases supported. Biases are 1D tensor with dimensions [OFM]. Data type supported:Same as @p input.
- * @param[out] output Destination tensor. 3 lower dimensions represent a single output [width, height, OFM], while the rest represent batch of outputs.
- * Data types supported: Same as @p input.
- * @param[in] conv_info Contains padding and stride information described in @ref PadStrideInfo.
- * @param[in] act_info (Optional) Activation layer information in case of a fused activation.
- */
- void configure(IGCTensor *input, const IGCTensor *weights, const IGCTensor *biases, IGCTensor *output, const PadStrideInfo &conv_info,
- const ActivationLayerInfo &act_info = ActivationLayerInfo());
-
- // Inherited methods overridden:
- void run() override final;
-
-private:
- std::unique_ptr<IGCKernel> _kernel;
- GCFillBorderKernel _border_handler;
- GCTensorShiftKernel _shift_handler;
-};
-}
-#endif /* ARM_COMPUTE_GCDIRECTCONVOLUTIONLAYER_H */
diff --git a/arm_compute/runtime/GLES_COMPUTE/functions/GCDropoutLayer.h b/arm_compute/runtime/GLES_COMPUTE/functions/GCDropoutLayer.h
deleted file mode 100644
index 2152616dd9..0000000000
--- a/arm_compute/runtime/GLES_COMPUTE/functions/GCDropoutLayer.h
+++ /dev/null
@@ -1,63 +0,0 @@
-/*
- * Copyright (c) 2017-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#ifndef ARM_COMPUTE_GCDROPOUTLAYER_H
-#define ARM_COMPUTE_GCDROPOUTLAYER_H
-
-#include "arm_compute/core/GLES_COMPUTE/kernels/GCDropoutLayerKernel.h"
-#include "arm_compute/runtime/IFunction.h"
-
-namespace arm_compute
-{
-class IGCTensor;
-/** Basic function to perform a dropout operation. This function calls the following kernel:
- *
- * -# @ref GCDropoutLayerKernel
- */
-class GCDropoutLayer : public IFunction
-{
-public:
- /** Constructor */
- GCDropoutLayer();
-
- /** Set the input and output tensors.
- *
- * @param[in] input Source tensor. Data type supported: F16/F32.
- * @param[out] mask Destination tensor. Data type supported: Same as @p input.
- * @param[out] output Destination tensor. Data type supported: Same as @p input.
- * @param[in] ratio Dropout ratio
- * @param[in] forward Forward or backward propagation
- *
- */
- void configure(const IGCTensor *input, IGCTensor *mask, IGCTensor *output, float ratio, bool forward);
-
- // Inherited methods overridden:
- void run() override;
-
-private:
- GCDropoutLayerKernel _dropout_kernel;
-};
-}
-
-#endif /* ARM_COMPUTE_GCDROPOUTLAYER_H */
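
A forward-pass sketch; the 0.4 ratio is illustrative, and mask/output are assumed to match the input shape:

#include "arm_compute/runtime/GLES_COMPUTE/functions/GCDropoutLayer.h"

void dropout_sketch(const arm_compute::IGCTensor *in, arm_compute::IGCTensor *mask, arm_compute::IGCTensor *out)
{
    using namespace arm_compute;

    GCDropoutLayer dropout;
    // forward = true zeroes activations with probability `ratio` and writes
    // the sampled mask so a backward pass can reuse it.
    dropout.configure(in, mask, out, 0.4f, true);
    dropout.run();
}
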
diff --git a/arm_compute/runtime/GLES_COMPUTE/functions/GCFillBorder.h b/arm_compute/runtime/GLES_COMPUTE/functions/GCFillBorder.h
deleted file mode 100644
index 87bb22db94..0000000000
--- a/arm_compute/runtime/GLES_COMPUTE/functions/GCFillBorder.h
+++ /dev/null
@@ -1,52 +0,0 @@
-/*
- * Copyright (c) 2017-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#ifndef ARM_COMPUTE_GCFILLBORDER_H
-#define ARM_COMPUTE_GCFILLBORDER_H
-
-#include "arm_compute/core/PixelValue.h"
-#include "arm_compute/core/Types.h"
-#include "arm_compute/runtime/GLES_COMPUTE/IGCSimpleFunction.h"
-
-namespace arm_compute
-{
-class ITensor;
-
-/** Basic function to run @ref GCFillBorderKernel */
-class GCFillBorder : public IGCSimpleFunction
-{
-public:
- /** Initialize the function
- *
- * @param[in,out] tensor Source tensor. Data types supported: F16/F32
- * @param[in] border_width The border width
- * @param[in] border_mode Strategy to use for borders.
- * @param[in] constant_border_value (Optional) Constant value to use for borders if border_mode is set to CONSTANT.
- */
- void configure(IGCTensor *tensor, unsigned int border_width, BorderMode border_mode,
- const PixelValue &constant_border_value = PixelValue());
-};
-}
-
-#endif /* ARM_COMPUTE_GCFILLBORDER_H */
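
A short hypothetical fragment showing how this function was configured (tensor setup as in the dropout sketch above):

    GCTensor tensor; // F16/F32 tensor, initialised and allocated beforehand
    GCFillBorder fill;
    // Fill a 2-pixel border with constant zeros; REPLICATE is another supported strategy.
    fill.configure(&tensor, 2 /* border_width */, BorderMode::CONSTANT, PixelValue(0.f));
    fill.run(); // run() is inherited from IGCSimpleFunction
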
diff --git a/arm_compute/runtime/GLES_COMPUTE/functions/GCFullyConnectedLayer.h b/arm_compute/runtime/GLES_COMPUTE/functions/GCFullyConnectedLayer.h
deleted file mode 100644
index 60967e9668..0000000000
--- a/arm_compute/runtime/GLES_COMPUTE/functions/GCFullyConnectedLayer.h
+++ /dev/null
@@ -1,110 +0,0 @@
-/*
- * Copyright (c) 2017-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_GCFULLYCONNECTEDLAYER_H
-#define ARM_COMPUTE_GCFULLYCONNECTEDLAYER_H
-
-#include "arm_compute/core/GLES_COMPUTE/kernels/GCGEMMMatrixAccumulateBiasesKernel.h"
-#include "arm_compute/core/GLES_COMPUTE/kernels/GCGEMMMatrixMultiplyKernel.h"
-#include "arm_compute/core/GLES_COMPUTE/kernels/GCIm2ColKernel.h"
-#include "arm_compute/core/GLES_COMPUTE/kernels/GCTransposeKernel.h"
-#include "arm_compute/runtime/GLES_COMPUTE/GCTensor.h"
-#include "arm_compute/runtime/GLES_COMPUTE/IGCSimpleFunction.h"
-#include "arm_compute/runtime/IWeightsManager.h"
-#include "arm_compute/runtime/MemoryGroup.h"
-
-namespace arm_compute
-{
-/** Basic function to reshape the weights of a Fully Connected layer with OpenGL ES. This function calls the following kernels:
- *
- * -# @ref GCTransposeKernel
- *
- * @note The fully connected layer accepts "weights" tensors only with 2 dimensions.
- */
-class GCFullyConnectedLayerReshapeWeights : public IGCSimpleFunction
-{
-public:
- /** Set the input and output tensors.
- *
- * @param[in] input Weights tensor. The weights must be 2 dimensional. Data types supported: F16/F32.
- * @param[out] output Destination tensor which stores the transposed input tensor. Data type supported: Same as @p input.
- */
- void configure(const IGCTensor *input, IGCTensor *output);
-};
-
-/** Basic function to compute a Fully Connected layer on OpenGL ES. This function calls the following OpenGL ES kernels:
- *
- * -# @ref GCIm2ColKernel (called when the input comes from a convolutional layer)
- * -# @ref GCFullyConnectedLayerReshapeWeights (if @p are_weights_reshaped is set to false and transpose_weights is set to true) (called once)
- * -# @ref GCGEMMMatrixMultiplyKernel
- * -# @ref GCGEMMMatrixAccumulateBiasesKernel (if @p biases is not equal to nullptr)
- *
- * @note The fully connected layer accepts "weights" tensors only with 2 dimensions.
- */
-class GCFullyConnectedLayer : public IFunction
-{
-public:
- /** Constructor */
- GCFullyConnectedLayer(std::shared_ptr<IMemoryManager> memory_manager = nullptr, IWeightsManager *weights_manager = nullptr);
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- GCFullyConnectedLayer(const GCFullyConnectedLayer &) = delete;
- /** Default move constructor */
- GCFullyConnectedLayer(GCFullyConnectedLayer &&) = default;
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- GCFullyConnectedLayer &operator=(const GCFullyConnectedLayer &) = delete;
- /** Default move assignment operator */
- GCFullyConnectedLayer &operator=(GCFullyConnectedLayer &&) = default;
- /** Set the input and output tensors.
- *
- * @param[in] input Source tensor. Data type supported: F16/F32.
- * @param[in] weights Weights tensor. The weights must be 2 dimensional. Data type supported: Same as @p input
- * @param[in]  biases  Bias tensor. It can be nullptr. Data type supported: Same as @p input.
- * @param[out] output Destination tensor. Data type supported: Same as @p input.
- * @param[in] fc_info (Optional) Fully connected layer additional info
- */
- void configure(const IGCTensor *input, const IGCTensor *weights, const IGCTensor *biases, IGCTensor *output,
- FullyConnectedLayerInfo fc_info = FullyConnectedLayerInfo());
-
-    // Inherited methods overridden:
- void run() override;
- void prepare() override;
-
-private:
- void configure_fc_fc(const IGCTensor *input, const IGCTensor *weights, IGCTensor *output);
- void configure_conv_fc(const IGCTensor *input, const IGCTensor *weights, IGCTensor *output);
-
- MemoryGroup _memory_group;
- IWeightsManager *_weights_manager;
- GCIm2ColKernel _im2col_kernel;
- GCFullyConnectedLayerReshapeWeights _reshape_weights_kernel;
- GCGEMMMatrixMultiplyKernel _mm_kernel;
- GCGEMMMatrixAccumulateBiasesKernel _accumulate_biases_kernel;
- GCTensor _im2col_output;
- GCTensor _reshape_weights_output;
- const IGCTensor *_original_weights;
- bool _are_weights_reshaped;
- bool _is_fc_after_conv;
- bool _accumulate_biases;
-};
-}
-#endif /* ARM_COMPUTE_GCFULLYCONNECTEDLAYER_H */
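
A hedged configuration sketch; the weight orientation depends on FullyConnectedLayerInfo::transpose_weights, so the shapes below are illustrative only:

    GCTensor src, weights, biases, dst;
    src.allocator()->init(TensorInfo(TensorShape(128U), 1, DataType::F32));           // 128 inputs
    weights.allocator()->init(TensorInfo(TensorShape(128U, 10U), 1, DataType::F32));  // 2D weights
    biases.allocator()->init(TensorInfo(TensorShape(10U), 1, DataType::F32));
    dst.allocator()->init(TensorInfo(TensorShape(10U), 1, DataType::F32));            // 10 outputs

    GCFullyConnectedLayer fc;
    fc.configure(&src, &weights, &biases, &dst); // default FullyConnectedLayerInfo
    // ... allocate tensors, fill src/weights/biases ...
    fc.run(); // one-off weight reshaping is handled via prepare() on first run
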
diff --git a/arm_compute/runtime/GLES_COMPUTE/functions/GCGEMM.h b/arm_compute/runtime/GLES_COMPUTE/functions/GCGEMM.h
deleted file mode 100644
index 4fc621eb13..0000000000
--- a/arm_compute/runtime/GLES_COMPUTE/functions/GCGEMM.h
+++ /dev/null
@@ -1,114 +0,0 @@
-/*
- * Copyright (c) 2017-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#ifndef ARM_COMPUTE_GCGEMM_H
-#define ARM_COMPUTE_GCGEMM_H
-
-#include "arm_compute/core/GLES_COMPUTE/kernels/GCGEMMInterleave4x4Kernel.h"
-#include "arm_compute/core/GLES_COMPUTE/kernels/GCGEMMMatrixAdditionKernel.h"
-#include "arm_compute/core/GLES_COMPUTE/kernels/GCGEMMMatrixMultiplyKernel.h"
-#include "arm_compute/core/GLES_COMPUTE/kernels/GCGEMMTranspose1xWKernel.h"
-#include "arm_compute/runtime/GLES_COMPUTE/GCTensor.h"
-#include "arm_compute/runtime/IFunction.h"
-#include "arm_compute/runtime/MemoryGroup.h"
-
-namespace arm_compute
-{
-class IGCTensor;
-
-/** Basic function to execute GEMM on OpenGL ES Compute. This function calls the following kernels:
- *
- * -# @ref GCGEMMInterleave4x4Kernel (if the output tensor is a matrix)
- * -# @ref GCGEMMTranspose1xWKernel (if the output tensor is a matrix)
- * -# @ref GCGEMMMatrixMultiplyKernel
- * -# @ref GCGEMMMatrixAdditionKernel (if c != nullptr and beta != 0.0)
- *
- */
-class GCGEMM : public IFunction
-{
-public:
- /** Default constructor. */
- GCGEMM(std::shared_ptr<IMemoryManager> memory_manager = nullptr);
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- GCGEMM(const GCGEMM &) = delete;
- /** Default move constructor */
- GCGEMM(GCGEMM &&) = default;
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- GCGEMM &operator=(const GCGEMM &) = delete;
- /** Default move assignment operator */
- GCGEMM &operator=(GCGEMM &&) = default;
- /** Initialise the kernel's inputs and output
- *
- * @note GEMM: General Matrix Multiply - [alpha * A * B + beta * C].
- *
- * @note All tensors must have the same data type.
- *
- * @note Whilst the first input tensor can be a vector, the second input tensor must be at least a matrix
- *
- * @param[in] a First input tensor (Matrix or Vector A). Data types supported: F32
- * @param[in] b Second input tensor (Matrix B). Data type supported: same as @p a.
- * @param[in] c Third input tensor (Matrix C). It can be a nullptr if just the multiplication between @p a and @p b is needed. Data type supported: same as @p a.
- * @param[out] output Output tensor. Data type supported: same as @p a
- * @param[in] alpha Weight of the matrix product
- * @param[in] beta Weight of matrix C
- * @param[in] gemm_info (Optional) Specifies if the matrix A and/or matrix B have been reshaped and
- * if the reshape of matrix B should happen only for the first run
- */
- void configure(const IGCTensor *a, const IGCTensor *b, const IGCTensor *c, IGCTensor *output, float alpha, float beta, const GEMMInfo &gemm_info = GEMMInfo());
- /** Static function to check if given info will lead to a valid configuration of @ref GCGEMM.
- *
- * @param[in] a First input tensor (Matrix or Vector A). Data types supported: F16/F32
- * @param[in] b Second input tensor (Matrix B). Data type supported: same as @p a.
- * @param[in] c Third input tensor (Matrix C). It can be a nullptr if just the multiplication between @p a and @p b is needed. Data type supported: same as @p a.
- * @param[in]  output    Output tensor info. Data type supported: same as @p a
- * @param[in] alpha Weight of the matrix product
- * @param[in] beta Weight of matrix C
- * @param[in] gemm_info (Optional) Specifies if the matrix A and/or matrix B have been reshaped and
- * if the reshape of matrix B should happen only for the first run
- *
- * @return a status
- */
-    static Status validate(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, const ITensorInfo *output, const float alpha, const float beta, const GEMMInfo &gemm_info = GEMMInfo());
-
- // Inherited methods overridden:
- void run() override;
- void prepare() override;
-
-private:
- MemoryGroup _memory_group;
- GCGEMMInterleave4x4Kernel _interleave_kernel;
- GCGEMMTranspose1xWKernel _transpose_kernel;
- GCGEMMMatrixMultiplyKernel _mm_kernel;
- GCGEMMMatrixAdditionKernel _ma_kernel;
- GCTensor _tmp_a;
- GCTensor _tmp_b;
- const IGCTensor *_original_b;
- bool _is_interleaved_transposed;
- bool _run_addition;
- bool _reshape_b_only_on_first_run;
- bool _is_prepared;
-};
-}
-
-#endif /* ARM_COMPUTE_GCGEMM_H */
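
To make the alpha/beta semantics concrete, a sketch computing output = 2*A*B + 0.5*C for A: 4x16, B: 16x8, C: 4x8 (TensorShape lists the width, i.e. column count, first; setup as in the earlier sketches):

    GCTensor a, b, c, dst;
    a.allocator()->init(TensorInfo(TensorShape(16U, 4U), 1, DataType::F32));  // A: 4 rows x 16 cols
    b.allocator()->init(TensorInfo(TensorShape(8U, 16U), 1, DataType::F32)); // B: 16 rows x 8 cols
    c.allocator()->init(TensorInfo(TensorShape(8U, 4U), 1, DataType::F32));  // C: 4 rows x 8 cols
    dst.allocator()->init(TensorInfo(TensorShape(8U, 4U), 1, DataType::F32));

    GCGEMM gemm;
    gemm.configure(&a, &b, &c, &dst, 2.0f /* alpha */, 0.5f /* beta */);
    // ... allocate and fill a, b, c ...
    gemm.run(); // prepare() runs once on the first invocation
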
diff --git a/arm_compute/runtime/GLES_COMPUTE/functions/GCGEMMInterleave4x4.h b/arm_compute/runtime/GLES_COMPUTE/functions/GCGEMMInterleave4x4.h
deleted file mode 100644
index d0e71b4e76..0000000000
--- a/arm_compute/runtime/GLES_COMPUTE/functions/GCGEMMInterleave4x4.h
+++ /dev/null
@@ -1,50 +0,0 @@
-/*
- * Copyright (c) 2017-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_GCGEMMINTERLEAVE4X4_H
-#define ARM_COMPUTE_GCGEMMINTERLEAVE4X4_H
-
-#include "arm_compute/runtime/GLES_COMPUTE/IGCSimpleFunction.h"
-
-namespace arm_compute
-{
-class ITensor;
-
-/** Basic function to execute GCGEMMInterleave4x4Kernel. This function calls the following OpenGL ES kernel:
- *
- * -# @ref GCGEMMInterleave4x4Kernel
- *
- */
-class GCGEMMInterleave4x4 : public IGCSimpleFunction
-{
-public:
- /** Initialise the kernel's inputs, output
- *
- * @param[in] input First input tensor. Data types supported: F32, F16
- * @param[out] output Output tensor. Data type supported: same as @p input
- */
- void configure(const IGCTensor *input, IGCTensor *output);
-};
-}
-
-#endif /* ARM_COMPUTE_GCGEMMINTERLEAVE4X4_H */
diff --git a/arm_compute/runtime/GLES_COMPUTE/functions/GCGEMMTranspose1xW.h b/arm_compute/runtime/GLES_COMPUTE/functions/GCGEMMTranspose1xW.h
deleted file mode 100644
index 782052bb65..0000000000
--- a/arm_compute/runtime/GLES_COMPUTE/functions/GCGEMMTranspose1xW.h
+++ /dev/null
@@ -1,47 +0,0 @@
-/*
- * Copyright (c) 2017-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_GCGEMMTRANSPOSE1XW_H
-#define ARM_COMPUTE_GCGEMMTRANSPOSE1XW_H
-
-#include "arm_compute/runtime/GLES_COMPUTE/IGCSimpleFunction.h"
-
-namespace arm_compute
-{
-/** Basic function to execute GCGEMMTranspose1xWKernel. This function calls the following OpenGL ES kernel:
- *
- * -# @ref GCGEMMTranspose1xWKernel
- *
- */
-class GCGEMMTranspose1xW : public IGCSimpleFunction
-{
-public:
- /** Initialise the kernel's inputs, output
- *
- * @param[in] input First input tensor. Data type supported: F32
- * @param[out] output Output tensor. Data type supported: same as @p input
- */
- void configure(const IGCTensor *input, IGCTensor *output);
-};
-}
-#endif /*ARM_COMPUTE_GCGEMMTRANSPOSE1XW_H */
diff --git a/arm_compute/runtime/GLES_COMPUTE/functions/GCNormalizationLayer.h b/arm_compute/runtime/GLES_COMPUTE/functions/GCNormalizationLayer.h
deleted file mode 100644
index 8d511aca7a..0000000000
--- a/arm_compute/runtime/GLES_COMPUTE/functions/GCNormalizationLayer.h
+++ /dev/null
@@ -1,73 +0,0 @@
-/*
- * Copyright (c) 2017-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_GCNORMALIZATIONLAYER_H
-#define ARM_COMPUTE_GCNORMALIZATIONLAYER_H
-
-#include "arm_compute/runtime/IFunction.h"
-
-#include "arm_compute/core/GLES_COMPUTE/kernels/GCFillBorderKernel.h"
-#include "arm_compute/core/GLES_COMPUTE/kernels/GCNormalizationLayerKernel.h"
-#include "arm_compute/core/GLES_COMPUTE/kernels/GCPixelWiseMultiplicationKernel.h"
-#include "arm_compute/runtime/GLES_COMPUTE/GCTensor.h"
-#include "arm_compute/runtime/MemoryGroup.h"
-
-#include "arm_compute/core/Types.h"
-
-namespace arm_compute
-{
-// Forward declarations
-class IGCTensor;
-
-/** Basic function to compute a normalization layer. This function calls the following OpenGL ES kernels:
- *
- * -# @ref GCPixelWiseMultiplicationKernel
- * -# @ref GCFillBorderKernel
- * -# @ref GCNormalizationLayerKernel
- */
-class GCNormalizationLayer : public IFunction
-{
-public:
- /** Default constructor */
- GCNormalizationLayer(std::shared_ptr<IMemoryManager> memory_manager = nullptr);
- /** Set the input and output tensors.
- *
- * @param[in] input Source tensor. 3 lower dims represent a single input with dimensions [width, height, IFM],
- * and an optional 4th dimension for batch of inputs. Data types supported: F32. Number of channels must be 1.
- * @param[out] output Destination tensor. Dimensions, data type and number of channels must match the input ones.
- * @param[in] norm_info Normalization layer information like the normalization type, normalization size and other parameters.
- */
- void configure(const IGCTensor *input, IGCTensor *output, const NormalizationLayerInfo &norm_info);
-
- // Inherited methods overridden:
- void run() override;
-
-private:
- MemoryGroup _memory_group;
-    GCTensor                        _squared_input;   /**< The intermediate buffer which stores the results of squaring the input */
- GCNormalizationLayerKernel _norm_kernel; /**< Normalization layer kernel to run */
- GCPixelWiseMultiplicationKernel _multiply_kernel; /**< Pixel multiplication kernel to run */
- GCFillBorderKernel _border_handler; /**< Kernel to handle borders */
-};
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_GCNORMALIZATIONLAYER_H */
diff --git a/arm_compute/runtime/GLES_COMPUTE/functions/GCNormalizePlanarYUVLayer.h b/arm_compute/runtime/GLES_COMPUTE/functions/GCNormalizePlanarYUVLayer.h
deleted file mode 100644
index cd35c73403..0000000000
--- a/arm_compute/runtime/GLES_COMPUTE/functions/GCNormalizePlanarYUVLayer.h
+++ /dev/null
@@ -1,75 +0,0 @@
-/*
- * Copyright (c) 2017-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_GCNORMALIZEPLANARYUVLAYER_H
-#define ARM_COMPUTE_GCNORMALIZEPLANARYUVLAYER_H
-
-#include "arm_compute/runtime/IFunction.h"
-
-#include "arm_compute/core/GLES_COMPUTE/kernels/GCNormalizePlanarYUVLayerKernel.h"
-#include "arm_compute/core/Types.h"
-
-namespace arm_compute
-{
-class IGCTensor;
-
-/** Basic function to run @ref GCNormalizePlanarYUVLayerKernel
- *
- * @note The function simulates a NormalizePlanarYUV layer.
- */
-class GCNormalizePlanarYUVLayer : public IFunction
-{
-public:
- /** Default constructor */
- GCNormalizePlanarYUVLayer();
- /** Set the input and output tensors.
- *
- * @param[in] input Source tensor. 3 lower dimensions represent a single input with dimensions [width, height, channels].
- * Data types supported: F16.
- * @param[out] output Destination tensor. Data type supported: same as @p input
- * @param[in] mean Mean values tensor. 1 dimension with size equal to the number of input channels. Data types supported: same as @p input
- * @param[in] std Standard deviation values tensor. 1 dimension with size equal to the number of input channels.
- * Data types supported: same as @p input
- */
- void configure(const IGCTensor *input, IGCTensor *output, const IGCTensor *mean, const IGCTensor *std);
-    /** Static function to check if given info will lead to a valid configuration of @ref GCNormalizePlanarYUVLayer
- *
- * @param[in] input Source tensor info. 3 lower dimensions represent a single input with dimensions [width, height, channels].
- * Data types supported: F16/F32.
- * @param[in]  output Destination tensor info. Data type supported: same as @p input
- * @param[in] mean Mean values tensor info. 1 dimension with size equal to the number of input channels. Data types supported: same as @p input
- * @param[in] std Standard deviation values tensor info. 1 dimension with size equal to the number of input channels.
- * Data types supported: same as @p input
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *input, const ITensorInfo *output, const ITensorInfo *mean, const ITensorInfo *std);
-
- // Inherited methods overridden:
- void run() override;
-
-private:
- GCNormalizePlanarYUVLayerKernel _norm_kernel; /**< NormalizePlanarYUV layer kernel to run */
-};
-}
-#endif /* ARM_COMPUTE_GCNORMALIZEPLANARYUVLAYER_H */
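
The kernel performs a per-channel normalisation (conceptually out = (in - mean) / std). A sketch of the validate-before-configure pattern the header supports:

    TensorInfo src(TensorShape(16U, 16U, 3U), 1, DataType::F16);
    TensorInfo dst(TensorShape(16U, 16U, 3U), 1, DataType::F16);
    TensorInfo mean(TensorShape(3U), 1, DataType::F16);    // one value per channel
    TensorInfo std_dev(TensorShape(3U), 1, DataType::F16); // one value per channel

    const Status status = GCNormalizePlanarYUVLayer::validate(&src, &dst, &mean, &std_dev);
    if(bool(status))
    {
        // Safe to call configure() on matching GCTensor objects.
    }
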
diff --git a/arm_compute/runtime/GLES_COMPUTE/functions/GCPixelWiseMultiplication.h b/arm_compute/runtime/GLES_COMPUTE/functions/GCPixelWiseMultiplication.h
deleted file mode 100644
index 201e131dd9..0000000000
--- a/arm_compute/runtime/GLES_COMPUTE/functions/GCPixelWiseMultiplication.h
+++ /dev/null
@@ -1,49 +0,0 @@
-/*
- * Copyright (c) 2017-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_GCPIXELWISEMULTIPLICATION_H
-#define ARM_COMPUTE_GCPIXELWISEMULTIPLICATION_H
-
-#include "arm_compute/core/Types.h"
-#include "arm_compute/runtime/GLES_COMPUTE/IGCSimpleFunction.h"
-
-namespace arm_compute
-{
-class IGCTensor;
-
-/** Basic function to run @ref GCPixelWiseMultiplicationKernel. */
-class GCPixelWiseMultiplication : public IGCSimpleFunction
-{
-public:
-    /** Initialise the kernel's inputs, output and scale factor.
- *
- * @param[in] input1 First tensor input. Data types supported: F32.
- * @param[in] input2 Second tensor input. Data types supported: Same as @p input1.
- * @param[out] output Output tensor. Data types supported: Same as @p input1.
- * @param[in] scale Scale to apply after multiplication. Must be a positive value.
- * @param[in] act_info (Optional) Activation layer information in case of a fused activation. Currently not supported.
- */
- void configure(const IGCTensor *input1, const IGCTensor *input2, IGCTensor *output, float scale, const ActivationLayerInfo &act_info = ActivationLayerInfo());
-};
-}
-#endif /*ARM_COMPUTE_GCPIXELWISEMULTIPLICATION_H */
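
The scale parameter multiplies the element-wise product, i.e. conceptually output = input1 * input2 * scale; a hypothetical fragment:

    GCTensor lhs, rhs, dst; // F32 tensors of identical shape, set up as in earlier sketches
    GCPixelWiseMultiplication mul;
    mul.configure(&lhs, &rhs, &dst, 1.0f / 255.0f); // scale the product down by 255
    mul.run();
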
diff --git a/arm_compute/runtime/GLES_COMPUTE/functions/GCPoolingLayer.h b/arm_compute/runtime/GLES_COMPUTE/functions/GCPoolingLayer.h
deleted file mode 100644
index b29f808f99..0000000000
--- a/arm_compute/runtime/GLES_COMPUTE/functions/GCPoolingLayer.h
+++ /dev/null
@@ -1,76 +0,0 @@
-/*
- * Copyright (c) 2017-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_GCPOOLINGLAYER_H
-#define ARM_COMPUTE_GCPOOLINGLAYER_H
-
-#include "arm_compute/core/GLES_COMPUTE/IGCKernel.h"
-#include "arm_compute/core/GLES_COMPUTE/kernels/GCFillBorderKernel.h"
-#include "arm_compute/core/GLES_COMPUTE/kernels/GCPoolingLayerKernel.h"
-#include "arm_compute/core/GLES_COMPUTE/kernels/GCTensorShiftKernel.h"
-#include "arm_compute/runtime/IFunction.h"
-
-#include "arm_compute/core/Error.h"
-#include "arm_compute/core/Types.h"
-
-namespace arm_compute
-{
-class IGCTensor;
-
-/** Basic function to simulate a pooling layer with the specified pooling operation. This function calls the following OpenGL ES kernels:
- *
- * -# @ref GCFillBorderKernel (executed if padding size is different from zero)
- * -# @ref GCPoolingLayerKernel
- */
-class GCPoolingLayer : public IFunction
-{
-public:
- GCPoolingLayer();
- /** Set the input and output tensors.
- *
- * @param[in,out] input Source tensor. (Written to only when padding != 0) Data types supported: F16/F32.
- * @param[out] output Destination tensor. Data types supported: Same as @p input.
- * @param[in] pool_info Contains pooling operation information described in @ref PoolingLayerInfo.
- * @param[out]    indices   (Optional) The indices of the maximal values. Data type supported: U32.
- */
- void configure(IGCTensor *input, IGCTensor *output, const PoolingLayerInfo &pool_info, IGCTensor *indices = nullptr);
- /** Static function to check if given info will lead to a valid configuration of @ref GCPoolingLayer
- *
- * @param[in] input Source tensor info. Data types supported: F16/F32.
- * @param[in] output Destination tensor info. Data types supported: Same as @p input.
- * @param[in] pool_info Contains pooling operation information described in @ref PoolingLayerInfo.
- * @param[in]  indices   (Optional) The indices of the maximal values. Data type supported: U32.
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *input, const ITensorInfo *output, const PoolingLayerInfo &pool_info, const ITensorInfo *indices = nullptr);
-
- void run() override final;
-
-private:
- std::unique_ptr<IGCKernel> _kernel;
- GCFillBorderKernel _border_handler;
- GCTensorShiftKernel _shift_handler;
-};
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_GCPOOLINGLAYER_H */
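
A sketch of a 2x2 max-pooling configuration, under the assumption that the PoolingLayerInfo constructor of the same library version (pool type, pool size, data layout, pad/stride) applies:

    GCTensor src, dst; // F32 feature maps with consistent shapes
    const PoolingLayerInfo pool_info(PoolingType::MAX, Size2D(2, 2), DataLayout::NCHW, PadStrideInfo(2, 2, 0, 0));
    if(bool(GCPoolingLayer::validate(src.info(), dst.info(), pool_info)))
    {
        GCPoolingLayer pool;
        pool.configure(&src, &dst, pool_info);
        pool.run();
    }
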
diff --git a/arm_compute/runtime/GLES_COMPUTE/functions/GCScale.h b/arm_compute/runtime/GLES_COMPUTE/functions/GCScale.h
deleted file mode 100644
index d4295d57fa..0000000000
--- a/arm_compute/runtime/GLES_COMPUTE/functions/GCScale.h
+++ /dev/null
@@ -1,56 +0,0 @@
-/*
- * Copyright (c) 2016-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_GCSCALE_H
-#define ARM_COMPUTE_GCSCALE_H
-
-#include "arm_compute/core/Types.h"
-#include "arm_compute/runtime/GLES_COMPUTE/IGCSimpleFunction.h"
-
-#include <cstdint>
-
-namespace arm_compute
-{
-class IGCTensor;
-
-/** Basic function to run @ref GCScaleKernel */
-class GCScale : public IGCSimpleFunction
-{
-public:
- /** Initialize the function's source, destination, interpolation type and border_mode.
- *
- * @param[in,out] input Source tensor. Data types supported: F16. (Written to only for @p border_mode != UNDEFINED)
- * @param[out] output Destination tensor. Data types supported: Same as @p input
- * All but the lowest two dimensions must be the same size as in the input tensor, i.e. scaling is only performed within the XY-plane.
- * @param[in] policy The interpolation type.
- * @param[in] border_mode Strategy to use for borders.
- * @param[in] constant_border_value (Optional) Constant value to use for borders if border_mode is set to CONSTANT.
- * @param[in] sampling_policy (Optional) Sampling policy used by the interpolation. Defaults to @ref SamplingPolicy::CENTER
- * @param[in]     use_padding           (Optional) Whether padding is used. Defaults to true.
- * @param[in] align_corners (Optional) Align corners of input and output, only affecting bilinear policy with TOP_LEFT sampling policy. Defaults to false.
- */
- void configure(IGCTensor *input, IGCTensor *output, InterpolationPolicy policy, BorderMode border_mode, PixelValue constant_border_value = PixelValue(),
- SamplingPolicy sampling_policy = SamplingPolicy::CENTER, bool use_padding = true, bool align_corners = false);
-};
-}
-#endif /*ARM_COMPUTE_GCSCALE_H */
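
A minimal fragment showing a bilinear rescale (only the X/Y dimensions of dst may differ from src):

    GCTensor src, dst; // F16 tensors; dst differs from src only in width/height
    GCScale scale;
    scale.configure(&src, &dst, InterpolationPolicy::BILINEAR, BorderMode::REPLICATE);
    scale.run();
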
diff --git a/arm_compute/runtime/GLES_COMPUTE/functions/GCSoftmaxLayer.h b/arm_compute/runtime/GLES_COMPUTE/functions/GCSoftmaxLayer.h
deleted file mode 100644
index 33faae5e06..0000000000
--- a/arm_compute/runtime/GLES_COMPUTE/functions/GCSoftmaxLayer.h
+++ /dev/null
@@ -1,77 +0,0 @@
-/*
- * Copyright (c) 2017-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_GCSOFTMAXLAYER_H
-#define ARM_COMPUTE_GCSOFTMAXLAYER_H
-
-#include "arm_compute/core/GLES_COMPUTE/kernels/GCSoftmaxLayerKernel.h"
-#include "arm_compute/runtime/GLES_COMPUTE/GCTensor.h"
-#include "arm_compute/runtime/IFunction.h"
-#include "arm_compute/runtime/MemoryGroup.h"
-
-namespace arm_compute
-{
-class IGCTensor;
-
-/** Basic function to compute a SoftmaxLayer.
- *
- * Softmax is calculated by :
- * @f[ out = exp(x - max(x)) / sum(exp(x - max(x))) @f]
- *
- * This function runs the following kernels:
- * -# @ref GCLogits1DMaxKernel
- * -# @ref GCLogits1DShiftExpSumKernel
- * -# @ref GCLogits1DNormKernel
- */
-class GCSoftmaxLayer : public IFunction
-{
-public:
- /** Constructor */
- GCSoftmaxLayer(std::shared_ptr<IMemoryManager> memory_manager = nullptr);
- /** Set the input and output tensors.
- *
- * @param[in] input Source tensor. Data types supported: F16/F32
- * @param[out] output Destination tensor. Data types supported: same as @p input
- * @param[in] beta (Optional) A scaling factor for the exponent. Only beta = 1 is supported
- * @param[in] axis (Optional) Reduction axis. It has the purpose of squashing the first @p axis
- * dimensions together. For instance, given a [4x4x4x4] image,
- * when @p axis is 2, the Softmax reduction will be applied on each of the [4x4] planes of the input image.
- *
- * @note The value of @p axis must always be 1 for GLES
- */
- void configure(const IGCTensor *input, IGCTensor *output, float beta = 1.0f, size_t axis = 1);
-
- // Inherited methods overridden:
- void run() override;
-
-private:
- MemoryGroup _memory_group;
- GCLogits1DMaxKernel _max_kernel;
- GCLogits1DShiftExpSumKernel _shift_exp_sum_kernel;
- GCLogits1DNormKernel _norm_kernel;
- GCTensor _max;
- GCTensor _sum;
- GCTensor _tmp;
-};
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_GCSOFTMAXLAYER_H */
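
To make the formula above concrete: for a row x = (1, 2, 3), max(x) = 3, so the shifted exponents are exp(-2) ≈ 0.135, exp(-1) ≈ 0.368 and exp(0) = 1; their sum is ≈ 1.503, giving out ≈ (0.090, 0.245, 0.665), which sums to 1 as expected. The max-subtraction only improves numerical stability; it does not change the result.
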
diff --git a/arm_compute/runtime/GLES_COMPUTE/functions/GCTensorShift.h b/arm_compute/runtime/GLES_COMPUTE/functions/GCTensorShift.h
deleted file mode 100644
index a1ac313c52..0000000000
--- a/arm_compute/runtime/GLES_COMPUTE/functions/GCTensorShift.h
+++ /dev/null
@@ -1,51 +0,0 @@
-/*
- * Copyright (c) 2017-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_GCTENSORSHIFT_H
-#define ARM_COMPUTE_GCTENSORSHIFT_H
-
-#include "arm_compute/core/GLES_COMPUTE/kernels/GCTensorShiftKernel.h"
-#include "arm_compute/core/Types.h"
-#include "arm_compute/runtime/GLES_COMPUTE/IGCSimpleFunction.h"
-
-#include <memory>
-
-namespace arm_compute
-{
-class IGCTensor;
-
-/** Basic function to shift the contents of a tensor. This function is used to fix an alignment issue on OpenGL ES:
- *
- * @note This alignment issue is introduced by the limits of compute shaders, which require 32/64/128-bit alignment for data access on OpenGL ES
- */
-class GCTensorShift : public IGCSimpleFunction
-{
-public:
- /** Initialise the kernel's input, output.
- *
- * @param[in,out] input Source tensor. Data types supported: F16/F32.
- */
- void configure(IGCTensor *input);
-};
-}
-#endif /* ARM_COMPUTE_GCTENSORSHIFT_H */
diff --git a/arm_compute/runtime/GLES_COMPUTE/functions/GCTranspose.h b/arm_compute/runtime/GLES_COMPUTE/functions/GCTranspose.h
deleted file mode 100644
index 633facf073..0000000000
--- a/arm_compute/runtime/GLES_COMPUTE/functions/GCTranspose.h
+++ /dev/null
@@ -1,50 +0,0 @@
-/*
- * Copyright (c) 2017-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_GCTRANSPOSE_H
-#define ARM_COMPUTE_GCTRANSPOSE_H
-
-#include "arm_compute/runtime/GLES_COMPUTE/IGCSimpleFunction.h"
-
-namespace arm_compute
-{
-class IGCTensor;
-
-/** Basic function to transpose a matrix on OpenGL ES. This function calls the following OpenGL ES kernel:
- *
- * -# @ref GCTransposeKernel
- *
- */
-class GCTranspose : public IGCSimpleFunction
-{
-public:
- /** Initialise the kernel's inputs and output
- *
- * @param[in] input Input tensor. Data types supported: F16/F32
- * @param[out] output Output tensor. Data type supported: Same as @p input
- */
- void configure(const IGCTensor *input, IGCTensor *output);
-};
-}
-
-#endif /* ARM_COMPUTE_GCTRANSPOSE_H */
diff --git a/arm_compute/runtime/HOG.h b/arm_compute/runtime/HOG.h
deleted file mode 100644
index 60581319b9..0000000000
--- a/arm_compute/runtime/HOG.h
+++ /dev/null
@@ -1,56 +0,0 @@
-/*
- * Copyright (c) 2016-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_HOG_H
-#define ARM_COMPUTE_HOG_H
-
-#include "arm_compute/core/HOGInfo.h"
-#include "arm_compute/core/IHOG.h"
-#include "arm_compute/core/Types.h"
-
-#include <memory>
-
-namespace arm_compute
-{
-/** CPU implementation of HOG data-object */
-class HOG : public IHOG
-{
-public:
- /** Default constructor */
- HOG();
- /** Allocate the HOG descriptor using the given HOG's metadata
- *
- * @param[in] input HOG's metadata used to allocate the HOG descriptor
- */
- void init(const HOGInfo &input);
-
- // Inherited method overridden:
- const HOGInfo *info() const override;
- float *descriptor() const override;
-
-private:
- HOGInfo _info;
- mutable std::vector<float> _descriptor;
-};
-}
-#endif /* ARM_COMPUTE_HOG_H */
diff --git a/arm_compute/runtime/IAllocator.h b/arm_compute/runtime/IAllocator.h
index c24eb33887..f8446db811 100644
--- a/arm_compute/runtime/IAllocator.h
+++ b/arm_compute/runtime/IAllocator.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2021 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -37,7 +37,6 @@ class IAllocator
public:
/** Default virtual destructor. */
virtual ~IAllocator() = default;
- // TODO (COMPMID-1088) : Change allocator and rest interfaces to use IMemoryRegion
/** Interface to be implemented by the child class to allocate bytes
*
* @param[in] size Size to allocate
@@ -57,5 +56,5 @@ public:
*/
virtual std::unique_ptr<IMemoryRegion> make_region(size_t size, size_t alignment) = 0;
};
-} // arm_compute
+} // namespace arm_compute
#endif /*ARM_COMPUTE_IALLOCATOR_H */
diff --git a/arm_compute/runtime/IAssetManager.h b/arm_compute/runtime/IAssetManager.h
index 98ad9175b1..b3829e8018 100644
--- a/arm_compute/runtime/IAssetManager.h
+++ b/arm_compute/runtime/IAssetManager.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019 ARM Limited.
+ * Copyright (c) 2019 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
diff --git a/arm_compute/runtime/IFunction.h b/arm_compute/runtime/IFunction.h
index 53ee5b537d..fb68dbbecf 100644
--- a/arm_compute/runtime/IFunction.h
+++ b/arm_compute/runtime/IFunction.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2016-2019 ARM Limited.
+ * Copyright (c) 2016-2021 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -34,7 +34,7 @@ public:
virtual ~IFunction() = default;
/** Run the kernels contained in the function
*
- * For NEON kernels:
+ * For CPU kernels:
* - Multi-threading is used for the kernels which are parallelisable.
* - By default std::thread::hardware_concurrency() threads are used.
*
@@ -58,5 +58,5 @@ public:
{
}
};
-}
+} // namespace arm_compute
#endif /*ARM_COMPUTE_IFUNCTION_H */
diff --git a/arm_compute/runtime/ILifetimeManager.h b/arm_compute/runtime/ILifetimeManager.h
index 0ff028f03f..2a0cffa86c 100644
--- a/arm_compute/runtime/ILifetimeManager.h
+++ b/arm_compute/runtime/ILifetimeManager.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2019 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
diff --git a/arm_compute/runtime/ILutAllocator.h b/arm_compute/runtime/ILutAllocator.h
deleted file mode 100644
index 2e95cd6c37..0000000000
--- a/arm_compute/runtime/ILutAllocator.h
+++ /dev/null
@@ -1,84 +0,0 @@
-/*
- * Copyright (c) 2016-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_ILUTALLOCATOR_H
-#define ARM_COMPUTE_ILUTALLOCATOR_H
-
-#include "arm_compute/core/Types.h"
-
-#include <cstddef>
-#include <cstdint>
-
-namespace arm_compute
-{
-/** Basic interface to allocate LUTs */
-class ILutAllocator
-{
-public:
- /** Default constructor */
- ILutAllocator();
- /** Default virtual destructor */
- virtual ~ILutAllocator() = default;
- /** Allow instances of this class to be move constructed */
- ILutAllocator(ILutAllocator &&) = default;
- /** Allow instances of this class to be moved */
- ILutAllocator &operator=(ILutAllocator &&) = default;
- /** Allocate an LUT of the requested number of elements and data_type.
- *
- * @param[in] num_elements Number of elements of the LUT.
- * @param[in] data_type Data type of each element.
- */
- void init(size_t num_elements, DataType data_type);
- /** Returns the total number of elements in the LUT.
- *
- * @return Total number of elements.
- */
- size_t num_elements() const;
- /** Returns the type of the LUT.
- *
- * @return The type of the LUT.
- */
- DataType type() const;
- /** Returns the total size in bytes of the LUT.
- *
- * @return Total size of the LUT in bytes.
- */
- size_t size() const;
-
-protected:
- /** Interface to be implemented by the child class to allocate the LUT. */
- virtual void allocate() = 0;
- /** Interface to be implemented by the child class to lock the memory allocation for the CPU to access.
- *
- * @return Pointer to a CPU mapping of the memory
- */
- virtual uint8_t *lock() = 0;
- /** Interface to be implemented by the child class to unlock the memory allocation after the CPU is done accessing it. */
- virtual void unlock() = 0;
-
-private:
- size_t _num_elements; /**< Number of elements allocated */
- DataType _data_type; /**< Data type of LUT elements. */
-};
-}
-#endif /* ARM_COMPUTE_ILUTALLOCATOR_H */
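
The three accessors of this interface are related by simple arithmetic: size() = num_elements() * element size of the data type. For example, a 256-entry U8 LUT occupies 256 * 1 = 256 bytes, while a 65536-entry S16 LUT occupies 65536 * 2 = 131072 bytes (128 KiB).
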
diff --git a/arm_compute/runtime/IMemory.h b/arm_compute/runtime/IMemory.h
index c4ae2b5f44..0aeb112a5e 100644
--- a/arm_compute/runtime/IMemory.h
+++ b/arm_compute/runtime/IMemory.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018-2019 ARM Limited.
+ * Copyright (c) 2018-2019 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
diff --git a/arm_compute/runtime/IMemoryGroup.h b/arm_compute/runtime/IMemoryGroup.h
index ef8c835727..77198dd29d 100644
--- a/arm_compute/runtime/IMemoryGroup.h
+++ b/arm_compute/runtime/IMemoryGroup.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2019 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -86,8 +86,7 @@ public:
*
* @param[in] memory_group Memory group to handle
*/
- explicit MemoryGroupResourceScope(IMemoryGroup &memory_group)
- : _memory_group(memory_group)
+ explicit MemoryGroupResourceScope(IMemoryGroup &memory_group) : _memory_group(memory_group)
{
_memory_group.acquire();
}
@@ -100,5 +99,5 @@ public:
private:
IMemoryGroup &_memory_group;
};
-} // arm_compute
+} // namespace arm_compute
#endif /*ARM_COMPUTE_IMEMORYGROUP_H */
diff --git a/arm_compute/runtime/IMemoryManager.h b/arm_compute/runtime/IMemoryManager.h
index ee6837a62e..42910edfda 100644
--- a/arm_compute/runtime/IMemoryManager.h
+++ b/arm_compute/runtime/IMemoryManager.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2019 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -65,5 +65,5 @@ public:
*/
virtual void clear() = 0;
};
-} // arm_compute
+} // namespace arm_compute
#endif /*ARM_COMPUTE_IMEMORYMANAGER_H */
diff --git a/arm_compute/runtime/IMemoryPool.h b/arm_compute/runtime/IMemoryPool.h
index 86db2ce9fc..0c112c8f35 100644
--- a/arm_compute/runtime/IMemoryPool.h
+++ b/arm_compute/runtime/IMemoryPool.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2019 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -60,5 +60,5 @@ public:
*/
virtual std::unique_ptr<IMemoryPool> duplicate() = 0;
};
-} // arm_compute
+} // namespace arm_compute
#endif /* ARM_COMPUTE_IMEMORYPOOL_H */
diff --git a/arm_compute/runtime/IMemoryRegion.h b/arm_compute/runtime/IMemoryRegion.h
index 1a7250bbe3..9431663e4e 100644
--- a/arm_compute/runtime/IMemoryRegion.h
+++ b/arm_compute/runtime/IMemoryRegion.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018-2020 ARM Limited.
+ * Copyright (c) 2018-2020 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -37,8 +37,7 @@ public:
*
* @param[in] size Region size
*/
- explicit IMemoryRegion(size_t size)
- : _size(size)
+ explicit IMemoryRegion(size_t size) : _size(size)
{
}
/** Virtual Destructor */
diff --git a/arm_compute/core/IMultiImage.h b/arm_compute/runtime/IOperator.h
index 3abdfed8a8..fd285160e9 100644
--- a/arm_compute/core/IMultiImage.h
+++ b/arm_compute/runtime/IOperator.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2016-2019 ARM Limited.
+ * Copyright (c) 2020 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -21,40 +21,44 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
-#ifndef ARM_COMPUTE_IMULTIIMAGE_H
-#define ARM_COMPUTE_IMULTIIMAGE_H
+#ifndef ARM_COMPUTE_IOPERATOR_H
+#define ARM_COMPUTE_IOPERATOR_H
+
+#include "arm_compute/core/experimental/Types.h"
+#include "arm_compute/runtime/IRuntimeContext.h"
+#include "arm_compute/runtime/Types.h"
namespace arm_compute
{
-class ITensor;
-using IImage = ITensor;
-class MultiImageInfo;
-
-/** Interface for multi-planar images */
-class IMultiImage
+class ITensorPack;
+namespace experimental
+{
+/** Base class for all async functions */
+class IOperator
{
public:
/** Destructor */
- virtual ~IMultiImage() = default;
- /** Interface to be implemented by the child class to return the multi-planar image's metadata
- *
- * @return A pointer to the image's metadata.
- */
- virtual const MultiImageInfo *info() const = 0;
- /** Return a pointer to the requested plane of the image.
+ virtual ~IOperator() = default;
+ /** Run the kernels contained in the function
*
- * @param[in] index The index of the wanted plane.
+ * @param[in] tensors Vector that contains the tensors to operate on.
*
- * @return A pointer pointed to the plane
*/
- virtual IImage *plane(unsigned int index) = 0;
- /** Return a constant pointer to the requested plane of the image.
+ virtual void run(ITensorPack &tensors) = 0;
+    /** Prepare the function for execution
+     *
+     * Any one-off pre-processing step required by the function is handled here
*
- * @param[in] index The index of the wanted plane.
+ * @param[in] constants Vector that contains the constants tensors.
*
- * @return A constant pointer pointed to the plane
+ * @note Prepare stage might not need all the function's buffers' backing memory to be available in order to execute
+ */
+ virtual void prepare(ITensorPack &constants) = 0;
+
+    /** Return the memory requirements of the workspace
*/
- virtual const IImage *plane(unsigned int index) const = 0;
+ virtual MemoryRequirements workspace() const = 0;
};
-}
-#endif /*ARM_COMPUTE_IMULTIIMAGE_H */
+} // namespace experimental
+} // namespace arm_compute
+#endif /*ARM_COMPUTE_IOPERATOR_H */
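
A hedged sketch of driving such an operator, assuming the ACL_SRC/ACL_DST tensor ids and ITensorPack::add_tensor() from the same release; run_operator and its arguments are illustrative names:

    #include "arm_compute/core/ITensorPack.h"
    #include "arm_compute/runtime/IOperator.h"

    using namespace arm_compute;

    void run_operator(experimental::IOperator &op, ITensor *src, ITensor *dst)
    {
        ITensorPack pack;
        pack.add_tensor(ACL_SRC, src); // tensors are identified by ids, not positions
        pack.add_tensor(ACL_DST, dst);
        op.prepare(pack); // one-off steps, e.g. weight transformations
        op.run(pack);     // stateless execution on the packed tensors
    }
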
diff --git a/arm_compute/runtime/IPoolManager.h b/arm_compute/runtime/IPoolManager.h
index c654f827ee..5f6d4ffbe5 100644
--- a/arm_compute/runtime/IPoolManager.h
+++ b/arm_compute/runtime/IPoolManager.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2019 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -69,5 +69,5 @@ public:
*/
virtual size_t num_pools() const = 0;
};
-} // arm_compute
+} // namespace arm_compute
#endif /*ARM_COMPUTE_IPOOLMANAGER_H */
diff --git a/arm_compute/runtime/IRuntimeContext.h b/arm_compute/runtime/IRuntimeContext.h
index 44a77f15a0..8e61cfb8a8 100644
--- a/arm_compute/runtime/IRuntimeContext.h
+++ b/arm_compute/runtime/IRuntimeContext.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019 ARM Limited.
+ * Copyright (c) 2019, 2021 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -29,7 +29,6 @@ namespace arm_compute
// Forward declarations
class IScheduler;
class IAssetManager;
-struct DeviceProperties;
/** Context interface */
class IRuntimeContext
@@ -51,11 +50,6 @@ public:
* @return The asset manager registered to the context
*/
virtual IAssetManager *asset_manager() = 0;
- /** Device propertied accessor
- *
- * @return Device properties
- */
- virtual const DeviceProperties &properties() = 0;
};
} // namespace arm_compute
#endif /*ARM_COMPUTE_IRUNTIME_CONTEXT_H */
diff --git a/arm_compute/runtime/IScheduler.h b/arm_compute/runtime/IScheduler.h
index a5e20ee627..ae204c8560 100644
--- a/arm_compute/runtime/IScheduler.h
+++ b/arm_compute/runtime/IScheduler.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2017-2021, 2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -25,6 +25,8 @@
#define ARM_COMPUTE_ISCHEDULER_H
#include "arm_compute/core/CPP/CPPTypes.h"
+#include "arm_compute/core/experimental/Types.h"
+#include "arm_compute/core/Types.h"
#include <functional>
#include <limits>
@@ -32,6 +34,8 @@
namespace arm_compute
{
class ICPPKernel;
+class ITensor;
+class Window;
/** Scheduler interface to run kernels */
class IScheduler
@@ -44,6 +48,13 @@ public:
DYNAMIC, /**< Split the workload dynamically using a bucket system */
};
+ /** Function used to map a given thread id to a logical core id
+ *
+ * The mapping function expects the thread index and the total number of cores as input,
+ * and returns the logical core index to bind against.
+ */
+ using BindFunc = std::function<int(int, int)>;
+
 /** When arm_compute::IScheduler::Hints::_split_dimension is initialized with this value
 * then the scheduler is free to break down the problem space over as many dimensions
* as it wishes
@@ -116,9 +127,9 @@ public:
}
private:
- unsigned int _split_dimension;
- StrategyHint _strategy;
- int _threshold;
+ unsigned int _split_dimension{};
+ StrategyHint _strategy{};
+ int _threshold{};
};
/** Signature for the workloads to execute */
using Workload = std::function<void(const ThreadInfo &)>;
@@ -134,7 +145,14 @@ public:
*/
virtual void set_num_threads(unsigned int num_threads) = 0;
- /** Returns the number of threads that the SingleThreadScheduler has in his pool.
+ /** Sets the number of threads the scheduler will use to run the kernels, using a binding function to pin the threads to the given logical cores.
+ *
+ * @param[in] num_threads If set to 0, then one thread per CPU core available on the system will be used, otherwise the number of threads specified.
+ * @param[in] func Binding function to use.
+ */
+ virtual void set_num_threads_with_affinity(unsigned int num_threads, BindFunc func);
+
+ /** Returns the number of threads that the SingleThreadScheduler has in its pool.
*
* @return Number of threads available in SingleThreadScheduler.
*/
@@ -147,11 +165,20 @@ public:
*/
virtual void schedule(ICPPKernel *kernel, const Hints &hints) = 0;
+ /** Runs the kernel in the same thread as the caller synchronously.
+ *
+ * @param[in] kernel Kernel to execute.
+ * @param[in] hints Hints for the scheduler.
+ * @param[in] window Window to use for kernel execution.
+ * @param[in] tensors Vector containing the tensors to operate on.
+ */
+ virtual void schedule_op(ICPPKernel *kernel, const Hints &hints, const Window &window, ITensorPack &tensors) = 0;
+
/** Execute all the passed workloads
*
- * @note there is no guarantee regarding the order in which the workloads will be executed or whether or not they will be executed in parallel.
+ * @note There is no guarantee regarding the order in which the workloads will be executed or whether or not they will be executed in parallel.
*
- * @param[in] workloads Array of workloads to run
+ * @param[in] workloads List of workloads to run
* @param[in] tag String that can be used by profiling tools to identify the workloads run by the scheduler (Can be null).
*/
virtual void run_tagged_workloads(std::vector<Workload> &workloads, const char *tag);
@@ -178,7 +205,32 @@ protected:
* @param[in] workloads Array of workloads to run
*/
virtual void run_workloads(std::vector<Workload> &workloads) = 0;
- CPUInfo _cpu_info;
+
+ /** Common scheduler logic to execute the given kernel
+ *
+ * @param[in] kernel Kernel to execute.
+ * @param[in] hints Hints for the scheduler.
+ * @param[in] window Window to use for kernel execution.
+ * @param[in] tensors Vector containing the tensors to operate on.
+ */
+ void schedule_common(ICPPKernel *kernel, const Hints &hints, const Window &window, ITensorPack &tensors);
+
+ /** Adjust the number of windows to optimize performance
+ * (used for small workloads, where a smaller number of threads might improve performance)
+ *
+ * @param[in] window Window to use for kernel execution
+ * @param[in] split_dimension Axis of dimension to split
+ * @param[in] init_num_windows Initial number of sub-windows to split
+ * @param[in] kernel Kernel to execute
+ * @param[in] cpu_info The CPU platform used to create the context.
+ *
+ * @return Adjusted number of windows
+ */
+ std::size_t adjust_num_of_windows(const Window &window,
+ std::size_t split_dimension,
+ std::size_t init_num_windows,
+ const ICPPKernel &kernel,
+ const CPUInfo &cpu_info);
private:
unsigned int _num_threads_hint = {};
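
A minimal sketch of the new affinity API: an identity binding that maps scheduler thread i onto logical core i modulo the core count. The thread count of 4 is arbitrary:

    #include "arm_compute/runtime/Scheduler.h"

    void pin_threads()
    {
        arm_compute::IScheduler &sched = arm_compute::Scheduler::get();
        // BindFunc: (thread_index, num_cores) -> logical core index to bind against
        sched.set_num_threads_with_affinity(4, [](int thread_index, int num_cores) {
            return thread_index % num_cores;
        });
    }
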
diff --git a/arm_compute/runtime/ISimpleLifetimeManager.h b/arm_compute/runtime/ISimpleLifetimeManager.h
index d71d787c88..9e481bb563 100644
--- a/arm_compute/runtime/ISimpleLifetimeManager.h
+++ b/arm_compute/runtime/ISimpleLifetimeManager.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2019 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -25,7 +25,6 @@
#define ARM_COMPUTE_ISIMPLELIFETIMEMANAGER_H
#include "arm_compute/runtime/ILifetimeManager.h"
-
#include "arm_compute/runtime/IMemoryPool.h"
#include "arm_compute/runtime/Types.h"
@@ -70,7 +69,11 @@ protected:
/** Element struct */
struct Element
{
- Element(void *id_ = nullptr, IMemory *handle_ = nullptr, size_t size_ = 0, size_t alignment_ = 0, bool status_ = false)
+ Element(void *id_ = nullptr,
+ IMemory *handle_ = nullptr,
+ size_t size_ = 0,
+ size_t alignment_ = 0,
+ bool status_ = false)
: id(id_), handle(handle_), size(size_), alignment(alignment_), status(status_)
{
}
@@ -90,11 +93,12 @@ protected:
std::set<void *> bound_elements;
};
- IMemoryGroup *_active_group; /**< Active group */
- std::map<void *, Element> _active_elements; /**< A map that contains the active elements */
- std::list<Blob> _free_blobs; /**< Free blobs */
- std::list<Blob> _occupied_blobs; /**< Occupied blobs */
- std::map<IMemoryGroup *, std::map<void *, Element>> _finalized_groups; /**< A map that contains the finalized groups */
+ IMemoryGroup *_active_group; /**< Active group */
+ std::map<void *, Element> _active_elements; /**< A map that contains the active elements */
+ std::list<Blob> _free_blobs; /**< Free blobs */
+ std::list<Blob> _occupied_blobs; /**< Occupied blobs */
+ std::map<IMemoryGroup *, std::map<void *, Element>>
+ _finalized_groups; /**< A map that contains the finalized groups */
};
} // namespace arm_compute
#endif /* ARM_COMPUTE_ISIMPLELIFETIMEMANAGER_H */
diff --git a/arm_compute/runtime/ITensorAllocator.h b/arm_compute/runtime/ITensorAllocator.h
index 74100f49fe..e2d3536169 100644
--- a/arm_compute/runtime/ITensorAllocator.h
+++ b/arm_compute/runtime/ITensorAllocator.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2016-2019 ARM Limited.
+ * Copyright (c) 2016-2021 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -36,7 +36,7 @@ class ITensorAllocator
{
public:
/** Default constructor. */
- ITensorAllocator();
+ ITensorAllocator() = default;
/** Allow instances of this class to be copy constructed */
ITensorAllocator(const ITensorAllocator &) = default;
/** Allow instances of this class to be copied */
@@ -54,6 +54,14 @@ public:
* @param[in] alignment Alignment in bytes that the underlying base pointer should comply with.
*/
void init(const TensorInfo &input, size_t alignment = 0);
+ /** Initialize a tensor with a reference TensorInfo
+ *
+ * @note ITensorAllocator won't own the TensorInfo, so the referenced object must out-live the allocator
+ *
+ * @param[in] input TensorInfo object containing the description of the tensor to initialize.
+ * @param[in] alignment Alignment in bytes that the underlying base pointer should comply with.
+ */
+ void soft_init(TensorInfo &input, size_t alignment = 0);
/** Return a reference to the tensor's metadata
*
* @return Reference to the tensor's metadata.
@@ -93,8 +101,9 @@ protected:
virtual void unlock() = 0;
private:
- TensorInfo _info; /**< Tensor's metadata. */
- size_t _alignment; /**< Tensor's alignment in bytes */
+ TensorInfo _info_owned{}; /**< Tensor's metadata. */
+ TensorInfo *_info_external{nullptr}; /**< External Tensor's metadata */
+ size_t _alignment{}; /**< Tensor's alignment in bytes */
};
} // namespace arm_compute
#endif /*ARM_COMPUTE_ITENSORALLOCATOR_H */
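
A short sketch of the ownership difference between init() and the new soft_init(): the former copies the metadata into the allocator, the latter only references it, so the TensorInfo must out-live the tensor:

    #include "arm_compute/core/TensorInfo.h"
    #include "arm_compute/runtime/Tensor.h"

    using namespace arm_compute;

    TensorInfo info(TensorShape(16U, 16U), 1, DataType::F32);

    Tensor owned;
    owned.allocator()->init(info);            // allocator keeps its own copy of `info`

    Tensor referencing;
    referencing.allocator()->soft_init(info); // allocator stores a pointer only:
                                              // `info` must stay alive, and external
                                              // changes to it are observed
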
diff --git a/arm_compute/runtime/ITransformWeights.h b/arm_compute/runtime/ITransformWeights.h
index 4981c5fdec..08671bbe3c 100644
--- a/arm_compute/runtime/ITransformWeights.h
+++ b/arm_compute/runtime/ITransformWeights.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019 ARM Limited.
+ * Copyright (c) 2019-2021 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -25,6 +25,7 @@
#define ARM_COMPUTE_ITRANSFORMWEIGHTS_H
#include <atomic>
+#include <utility>
namespace arm_compute
{
@@ -36,9 +37,8 @@ class ITensor;
* to generate a unique id. We use the following conversion using an unsigned 32bit value:
*
* Lower two bits store the target:
- * 00 -> NEON
+ * 00 -> Neon
* 01 -> CL
- * 10 -> GLES
* 11 -> Unused
*
* Five bits store the id of the reshape function:
@@ -72,7 +72,7 @@ public:
/** Allow instances of this class to be moved */
ITransformWeights &operator=(ITransformWeights &&other)
{
- if(this != &other)
+ if (this != &other)
{
_num_refcount = other._num_refcount.load();
_reshape_run = other._reshape_run;
@@ -119,9 +119,9 @@ public:
}
protected:
- std::atomic<int32_t> _num_refcount{ 0 };
- bool _reshape_run{ false };
+ std::atomic<int32_t> _num_refcount{0};
+ bool _reshape_run{false};
};
-} // arm_compute
+} // namespace arm_compute
-#endif /*ARM_COMPUTE_ITRANSFORMWEIGHTS_H */ \ No newline at end of file
+#endif /*ARM_COMPUTE_ITRANSFORMWEIGHTS_H */
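
A worked example of the 32-bit uid packing described in the comment above. Only the target bits and the reshape-function id are specified there, so treat the helper as an illustrative sketch rather than library code:

    #include <cstdint>

    // Lower two bits: target (00 -> Neon, 01 -> CL); next five bits: reshape function id.
    constexpr uint32_t make_uid(uint32_t target, uint32_t reshape_id)
    {
        return (target & 0x3u) | ((reshape_id & 0x1Fu) << 2);
    }

    static_assert(make_uid(0x1u /* CL */, 3u) == 0b0001101u, "00011 | 01 packs to 13");
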
diff --git a/arm_compute/runtime/IWeightsManager.h b/arm_compute/runtime/IWeightsManager.h
index 8df571befa..de8a92faa3 100644
--- a/arm_compute/runtime/IWeightsManager.h
+++ b/arm_compute/runtime/IWeightsManager.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019 ARM Limited.
+ * Copyright (c) 2019, 2021 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -76,10 +76,28 @@ public:
* @return True if the weights tensor is managed else false
*/
bool are_weights_managed(const ITensor *weights);
+ /** Release the weights' refcount and mark them as unused if it reaches 0
+ *
+ * @param[in] weights Weights to release
+ */
+ void release(const ITensor *weights);
+ /** Pre-mark the weights as unused. The weights tensor will get marked as unused only when the counter goes to 0
+ *
+ * @param[in] weights Weights to mark unused
+ */
+ void pre_mark_as_unused(const ITensor *weights);
+
+private:
+ struct CounterElement
+ {
+ bool is_unused{false};
+ std::atomic<int> counter{1};
+ };
private:
std::map<const ITensor *, std::vector<ITransformWeights *>> _managed_weights;
+ std::map<const ITensor *, CounterElement> _managed_counter;
std::map<const ITensor *, ITransformWeights *> _managed_weights_parents;
};
-} // arm_compute
-#endif /*ARM_COMPUTE_IWEIGHTSMANAGER_H */ \ No newline at end of file
+} // namespace arm_compute
+#endif /*ARM_COMPUTE_IWEIGHTSMANAGER_H */
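
A hedged sketch of the intended release protocol: each user of a managed weights tensor calls release(), while pre_mark_as_unused() defers the unused flag until the shared counter drops to zero. The helper function is illustrative only:

    void done_with(arm_compute::IWeightsManager &wm, const arm_compute::ITensor *weights)
    {
        if (wm.are_weights_managed(weights))
        {
            wm.pre_mark_as_unused(weights); // request: flag once no user holds them
            wm.release(weights);            // drop this user's refcount; at zero the
                                            // tensor is actually marked unused
        }
    }
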
diff --git a/arm_compute/runtime/Lut.h b/arm_compute/runtime/Lut.h
deleted file mode 100644
index d3e74f3ee9..0000000000
--- a/arm_compute/runtime/Lut.h
+++ /dev/null
@@ -1,68 +0,0 @@
-/*
- * Copyright (c) 2016-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_LUT_H
-#define ARM_COMPUTE_LUT_H
-
-#include "arm_compute/core/ILut.h"
-#include "arm_compute/core/Types.h"
-#include "arm_compute/runtime/LutAllocator.h"
-
-#include <cstddef>
-#include <cstdint>
-
-namespace arm_compute
-{
-class ILutAllocator;
-
-/** Basic implementation of the LUT interface */
-class Lut : public ILut
-{
-public:
- /** Constructor */
- Lut();
- /** Constructor: initializes a LUT which can contain num_values values of data_type type.
- *
- * @param[in] num_elements Number of elements of the LUT.
- * @param[in] data_type Data type of each element.
- */
- Lut(size_t num_elements, DataType data_type);
- /** Return a pointer to the lut's allocator
- *
- * @return A pointer to the lut's allocator
- */
- ILutAllocator *allocator();
-
- // Inherited methods overridden:
- size_t num_elements() const override;
- uint32_t index_offset() const override;
- size_t size_in_bytes() const override;
- DataType type() const override;
- uint8_t *buffer() const override;
- void clear() override;
-
-private:
- LutAllocator _allocator; /**< Instance of the basic CPU allocator.*/
-};
-}
-#endif /* ARM_COMPUTE_LUT_H */
diff --git a/arm_compute/runtime/LutAllocator.h b/arm_compute/runtime/LutAllocator.h
deleted file mode 100644
index f84109cbf2..0000000000
--- a/arm_compute/runtime/LutAllocator.h
+++ /dev/null
@@ -1,61 +0,0 @@
-/*
- * Copyright (c) 2016-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_LUTALLOCATOR_H
-#define ARM_COMPUTE_LUTALLOCATOR_H
-
-#include "arm_compute/runtime/ILutAllocator.h"
-
-#include <cstdint>
-#include <memory>
-
-namespace arm_compute
-{
-/** Basic implementation of a CPU memory LUT allocator. */
-class LutAllocator : public ILutAllocator
-{
-public:
- /** Default constructor. */
- LutAllocator();
- /** Interface to be implemented by the child class to return the pointer to the allocate data.
- *
- * @return a pointer to the data.
- */
- uint8_t *data() const;
-
-protected:
- /** Allocate num_elements() * sizeof(type()) of CPU memory. */
- void allocate() override;
- /** No-op for CPU memory
- *
- * @return A pointer to the beginning of the look up table's allocation.
- */
- uint8_t *lock() override;
- /** No-op for CPU memory. */
- void unlock() override;
-
-private:
- mutable std::vector<uint8_t> _buffer; /**< CPU memory allocation. */
-};
-}
-#endif /* ARM_COMPUTE_LUTALLOCATOR_H */
diff --git a/arm_compute/runtime/Memory.h b/arm_compute/runtime/Memory.h
index d895e06528..63514c409b 100644
--- a/arm_compute/runtime/Memory.h
+++ b/arm_compute/runtime/Memory.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2019 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -25,7 +25,6 @@
#define ARM_COMPUTE_MEMORY_H
#include "arm_compute/runtime/IMemory.h"
-
#include "arm_compute/runtime/IMemoryRegion.h"
#include <cstddef>
@@ -64,8 +63,8 @@ public:
// Inherited methods overridden:
IMemoryRegion *region() final;
IMemoryRegion *region() const final;
- void set_region(IMemoryRegion *region) final;
- void set_owned_region(std::unique_ptr<IMemoryRegion> region) final;
+ void set_region(IMemoryRegion *region) final;
+ void set_owned_region(std::unique_ptr<IMemoryRegion> region) final;
private:
IMemoryRegion *_region;
diff --git a/arm_compute/runtime/MemoryGroup.h b/arm_compute/runtime/MemoryGroup.h
index 4d484954a1..93ea3d2c72 100644
--- a/arm_compute/runtime/MemoryGroup.h
+++ b/arm_compute/runtime/MemoryGroup.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -24,10 +24,9 @@
#ifndef ARM_COMPUTE_MEMORYGROUP_H
#define ARM_COMPUTE_MEMORYGROUP_H
-#include "arm_compute/runtime/IMemoryGroup.h"
-
#include "arm_compute/core/Error.h"
#include "arm_compute/core/utils/misc/Macros.h"
+#include "arm_compute/runtime/IMemoryGroup.h"
#include "arm_compute/runtime/IMemoryManager.h"
#include "arm_compute/runtime/IMemoryPool.h"
@@ -59,8 +58,8 @@ public:
// Inherited methods overridden:
void manage(IMemoryManageable *obj) override;
void finalize_memory(IMemoryManageable *obj, IMemory &obj_memory, size_t size, size_t alignment) override;
- void acquire() override;
- void release() override;
+ void acquire() override;
+ void release() override;
MemoryMappings &mappings() override;
private:
@@ -70,15 +69,13 @@ private:
};
inline MemoryGroup::MemoryGroup(std::shared_ptr<IMemoryManager> memory_manager) noexcept
- : _memory_manager(memory_manager),
- _pool(nullptr),
- _mappings()
+ : _memory_manager(memory_manager), _pool(nullptr), _mappings()
{
}
inline void MemoryGroup::manage(IMemoryManageable *obj)
{
- if(_memory_manager && (obj != nullptr))
+ if (_memory_manager && (obj != nullptr))
{
ARM_COMPUTE_ERROR_ON(!_memory_manager->lifetime_manager());
@@ -95,7 +92,7 @@ inline void MemoryGroup::manage(IMemoryManageable *obj)
inline void MemoryGroup::finalize_memory(IMemoryManageable *obj, IMemory &obj_memory, size_t size, size_t alignment)
{
- if(_memory_manager)
+ if (_memory_manager)
{
ARM_COMPUTE_ERROR_ON(!_memory_manager->lifetime_manager());
_memory_manager->lifetime_manager()->end_lifetime(obj, obj_memory, size, alignment);
@@ -104,7 +101,7 @@ inline void MemoryGroup::finalize_memory(IMemoryManageable *obj, IMemory &obj_me
inline void MemoryGroup::acquire()
{
- if(!_mappings.empty())
+ if (!_mappings.empty())
{
ARM_COMPUTE_ERROR_ON(!_memory_manager->pool_manager());
_pool = _memory_manager->pool_manager()->lock_pool();
@@ -114,7 +111,7 @@ inline void MemoryGroup::acquire()
inline void MemoryGroup::release()
{
- if(_pool != nullptr)
+ if (_pool != nullptr)
{
ARM_COMPUTE_ERROR_ON(!_memory_manager->pool_manager());
ARM_COMPUTE_ERROR_ON(_mappings.empty());
@@ -128,5 +125,5 @@ inline MemoryMappings &MemoryGroup::mappings()
{
return _mappings;
}
-} // arm_compute
+} // namespace arm_compute
#endif /*ARM_COMPUTE_MEMORYGROUP_H */
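
The manage/finalize/acquire/release calls above implement the usual memory-group lifecycle; a minimal usage sketch, assuming Tensor implements IMemoryManageable as in this version of the library:

    using namespace arm_compute;

    void lifecycle(std::shared_ptr<IMemoryManager> mm)
    {
        MemoryGroup group(mm);
        Tensor t;
        t.allocator()->init(TensorInfo(TensorShape(32U), 1, DataType::F32));
        group.manage(&t);          // registers the tensor, starts its lifetime
        t.allocator()->allocate(); // ends the lifetime, records size/alignment
        group.acquire();           // backs the mappings with pooled memory
        // ... run the functions that use `t` ...
        group.release();           // hands the memory back to the pool
    }
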
diff --git a/arm_compute/runtime/MemoryManagerOnDemand.h b/arm_compute/runtime/MemoryManagerOnDemand.h
index c56c8c0e2b..7c31fe7f5a 100644
--- a/arm_compute/runtime/MemoryManagerOnDemand.h
+++ b/arm_compute/runtime/MemoryManagerOnDemand.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2019 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -24,10 +24,9 @@
#ifndef ARM_COMPUTE_MEMORY_MANAGER_ON_DEMAND_H
#define ARM_COMPUTE_MEMORY_MANAGER_ON_DEMAND_H
-#include "arm_compute/runtime/IMemoryManager.h"
-
#include "arm_compute/runtime/ILifetimeManager.h"
#include "arm_compute/runtime/IMemoryGroup.h"
+#include "arm_compute/runtime/IMemoryManager.h"
#include "arm_compute/runtime/IPoolManager.h"
#include <memory>
@@ -39,7 +38,8 @@ class MemoryManagerOnDemand : public IMemoryManager
{
public:
/** Default Constructor */
- MemoryManagerOnDemand(std::shared_ptr<ILifetimeManager> lifetime_manager, std::shared_ptr<IPoolManager> pool_manager);
+ MemoryManagerOnDemand(std::shared_ptr<ILifetimeManager> lifetime_manager,
+ std::shared_ptr<IPoolManager> pool_manager);
/** Prevent instances of this class to be copy constructed */
MemoryManagerOnDemand(const MemoryManagerOnDemand &) = delete;
/** Prevent instances of this class to be copied */
@@ -52,12 +52,12 @@ public:
// Inherited methods overridden:
ILifetimeManager *lifetime_manager() override;
IPoolManager *pool_manager() override;
- void populate(IAllocator &allocator, size_t num_pools) override;
- void clear() override;
+ void populate(IAllocator &allocator, size_t num_pools) override;
+ void clear() override;
private:
std::shared_ptr<ILifetimeManager> _lifetime_mgr; /**< Lifetime manager */
std::shared_ptr<IPoolManager> _pool_mgr; /**< Memory pool manager */
};
-} // arm_compute
+} // namespace arm_compute
#endif /*ARM_COMPUTE_MEMORY_MANAGER_ON_DEMAND_H */
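
A sketch of wiring the two services together with the stock implementations (BlobLifetimeManager, PoolManager) and populating the pools with the basic CPU Allocator once all groups are finalized:

    #include "arm_compute/runtime/Allocator.h"
    #include "arm_compute/runtime/BlobLifetimeManager.h"
    #include "arm_compute/runtime/MemoryManagerOnDemand.h"
    #include "arm_compute/runtime/PoolManager.h"

    using namespace arm_compute;

    auto lifetime_mgr = std::make_shared<BlobLifetimeManager>();
    auto pool_mgr     = std::make_shared<PoolManager>();
    auto memory_mgr   = std::make_shared<MemoryManagerOnDemand>(lifetime_mgr, pool_mgr);

    // After all memory groups have finalized their tensors:
    Allocator allocator;
    memory_mgr->populate(allocator, 1 /* num_pools */);
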
diff --git a/arm_compute/runtime/MemoryRegion.h b/arm_compute/runtime/MemoryRegion.h
index 1b645be4a2..f8a4898281 100644
--- a/arm_compute/runtime/MemoryRegion.h
+++ b/arm_compute/runtime/MemoryRegion.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018-2020 ARM Limited.
+ * Copyright (c) 2018-2020 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -24,10 +24,8 @@
#ifndef ARM_COMPUTE_RUNTIME_MEMORY_REGION_H
#define ARM_COMPUTE_RUNTIME_MEMORY_REGION_H
-#include "arm_compute/runtime/IMemoryRegion.h"
-
#include "arm_compute/core/Error.h"
-#include "support/MemorySupport.h"
+#include "arm_compute/runtime/IMemoryRegion.h"
#include <cstddef>
@@ -42,32 +40,27 @@ public:
* @param[in] size Region size
* @param[in] alignment Alignment in bytes of the base pointer. Defaults to 0
*/
- MemoryRegion(size_t size, size_t alignment = 0)
- : IMemoryRegion(size), _mem(nullptr), _ptr(nullptr)
+ MemoryRegion(size_t size, size_t alignment = 0) : IMemoryRegion(size), _mem(nullptr), _ptr(nullptr)
{
- if(size != 0)
+ if (size != 0)
{
// Allocate backing memory
size_t space = size + alignment;
- _mem = std::shared_ptr<uint8_t>(new uint8_t[space](), [](uint8_t *ptr)
- {
- delete[] ptr;
- });
- _ptr = _mem.get();
+ _mem = std::shared_ptr<uint8_t>(new uint8_t[space](), [](uint8_t *ptr) { delete[] ptr; });
+ _ptr = _mem.get();
// Calculate alignment offset
- if(alignment != 0)
+ if (alignment != 0)
{
void *aligned_ptr = _mem.get();
- support::cpp11::align(alignment, size, aligned_ptr, space);
+ std::align(alignment, size, aligned_ptr, space);
_ptr = aligned_ptr;
}
}
}
- MemoryRegion(void *ptr, size_t size)
- : IMemoryRegion(size), _mem(nullptr), _ptr(nullptr)
+ MemoryRegion(void *ptr, size_t size) : IMemoryRegion(size), _mem(nullptr), _ptr(nullptr)
{
- if(size != 0)
+ if (size != 0)
{
_ptr = ptr;
}
@@ -92,9 +85,9 @@ public:
}
std::unique_ptr<IMemoryRegion> extract_subregion(size_t offset, size_t size) final
{
- if(_ptr != nullptr && (offset < _size) && (_size - offset >= size))
+ if (_ptr != nullptr && (offset < _size) && (_size - offset >= size))
{
- return support::cpp14::make_unique<MemoryRegion>(static_cast<uint8_t *>(_ptr) + offset, size);
+ return std::make_unique<MemoryRegion>(static_cast<uint8_t *>(_ptr) + offset, size);
}
else
{
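
For reference, the std::align call that replaces support::cpp11::align behaves as in this sketch: on success it advances the pointer to the first suitably aligned byte and shrinks the remaining space by the bytes skipped:

    #include <cstddef>
    #include <memory>

    void *first_aligned(void *base, std::size_t alignment, std::size_t size, std::size_t space)
    {
        void *p = base;
        // Returns nullptr if `size` bytes at `alignment` do not fit within `space`.
        return std::align(alignment, size, p, space) != nullptr ? p : nullptr;
    }
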
diff --git a/arm_compute/runtime/MultiImage.h b/arm_compute/runtime/MultiImage.h
deleted file mode 100644
index 7505ad9e75..0000000000
--- a/arm_compute/runtime/MultiImage.h
+++ /dev/null
@@ -1,96 +0,0 @@
-/*
- * Copyright (c) 2016-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_MULTIIMAGE_H
-#define ARM_COMPUTE_MULTIIMAGE_H
-
-#include "arm_compute/core/IMultiImage.h"
-#include "arm_compute/core/MultiImageInfo.h"
-#include "arm_compute/core/Types.h"
-#include "arm_compute/runtime/Tensor.h"
-
-#include <array>
-
-namespace arm_compute
-{
-class Coordinates;
-class ITensor;
-using IImage = ITensor;
-
-/** Basic implementation of the multi-planar image interface */
-class MultiImage : public IMultiImage
-{
-public:
- /** Constructor */
- MultiImage();
- /** Allocate the multi-planar image
- *
- * @param[in] width Width of the whole image
- * @param[in] height Height of the whole image
- * @param[in] format Format of the whole image
- */
- void init(unsigned int width, unsigned int height, Format format);
- /** Allocate the multi-planar image
- *
- * @note Uses conservative padding strategy which fits all kernels.
- *
- * @param[in] width Width of the whole image
- * @param[in] height Height of the whole image
- * @param[in] format Format of the whole image
- */
- void init_auto_padding(unsigned int width, unsigned int height, Format format);
- /** Allocated a previously initialised multi image
- *
- * @note The multi image must not already be allocated when calling this function.
- *
- **/
- void allocate();
- /** Create a subimage from an existing MultiImage.
- *
- * @param[in] image Image to use backing memory from
- * @param[in] coords Starting coordinates of the new image. Should be within the parent image sizes
- * @param[in] width The width of the subimage
- * @param[in] height The height of the subimage
- */
- void create_subimage(MultiImage *image, const Coordinates &coords, unsigned int width, unsigned int height);
-
- // Inherited methods overridden:
- const MultiImageInfo *info() const override;
- Image *plane(unsigned int index) override;
- const Image *plane(unsigned int index) const override;
-
-private:
- /** Init the multi-planar image
- *
- * @param[in] width Width of the whole image
- * @param[in] height Height of the whole image
- * @param[in] format Format of the whole image
- * @param[in] auto_padding Specifies whether the image uses auto padding
- */
- void internal_init(unsigned int width, unsigned int height, Format format, bool auto_padding);
-
- MultiImageInfo _info; /** Instance of the multi-planar image's meta data */
- std::array<Image, 3> _plane; /* Instance Image to hold the planar's information */
-};
-}
-#endif /*ARM_COMPUTE_MULTIIMAGE_H */
diff --git a/arm_compute/core/NEON/kernels/NEMemsetKernel.h b/arm_compute/runtime/NEON/INEOperator.h
index b4bcd11b82..7971168d24 100644
--- a/arm_compute/core/NEON/kernels/NEMemsetKernel.h
+++ b/arm_compute/runtime/NEON/INEOperator.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018-2019 ARM Limited.
+ * Copyright (c) 2020-2021 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -21,51 +21,56 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
-#ifndef ARM_COMPUTE_NEMEMSETKERNEL_H
-#define ARM_COMPUTE_NEMEMSETKERNEL_H
+#ifndef ARM_COMPUTE_INEOPERATOR_H
+#define ARM_COMPUTE_INEOPERATOR_H
-#include "arm_compute/core/NEON/INEKernel.h"
-#include "arm_compute/core/PixelValue.h"
-#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/IOperator.h"
+#include "arm_compute/runtime/IRuntimeContext.h"
+#include "arm_compute/runtime/Types.h"
+
+#include "../../core/ITensor.h"
+#include <memory>
namespace arm_compute
{
-// Forward declarations
-class ITensor;
+class ICPPKernel;
+class Window;
-/** Interface for filling the planes of a tensor */
-class NEMemsetKernel : public INEKernel
+using INEKernel = ICPPKernel;
+namespace experimental
+{
+/** Basic interface for functions which have a single async CPU kernel */
+class INEOperator : public IOperator
{
public:
- const char *name() const override
- {
- return "NEMemsetKernel";
- }
- /** Default constructor */
- NEMemsetKernel();
- /** Default destructor */
- ~NEMemsetKernel() = default;
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- NEMemsetKernel(const NEMemsetKernel &) = delete;
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- NEMemsetKernel &operator=(const NEMemsetKernel &) = delete;
- /** Allow instances of this class to be moved */
- NEMemsetKernel(NEMemsetKernel &&) = default;
- /** Allow instances of this class to be moved */
- NEMemsetKernel &operator=(NEMemsetKernel &&) = default;
- /** Initialise the kernel's tensor and filling value
+ /** Constructor
*
- * @param[in,out] tensor Input tensor to fill. Supported data types: All
- * @param[in] constant_value The value used to fill the planes of the tensor
+ * @param[in] ctx Runtime context to be used by the function
*/
- void configure(ITensor *tensor, const PixelValue &constant_value);
+ INEOperator(IRuntimeContext *ctx = nullptr);
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ INEOperator(const INEOperator &) = delete;
+ /** Default move constructor */
+ INEOperator(INEOperator &&) = default;
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ INEOperator &operator=(const INEOperator &) = delete;
+ /** Default move assignment operator */
+ INEOperator &operator=(INEOperator &&) = default;
+ /** Default destructor */
+ ~INEOperator();
// Inherited methods overridden:
- void run(const Window &window, const ThreadInfo &info) override;
+ void run(ITensorPack &tensors) override;
+ void prepare(ITensorPack &constants) override;
+ MemoryRequirements workspace() const override;
+
+protected:
+ void run(ITensorPack &tensors, const Window &window);
-private:
- ITensor *_tensor;
- PixelValue _constant_value;
+ std::unique_ptr<INEKernel> _kernel;
+ IRuntimeContext *_ctx;
+ MemoryRequirements _workspace;
};
+} // namespace experimental
} // namespace arm_compute
-#endif /*ARM_COMPUTE_NEMEMSETKERNEL_H */
+#endif /*ARM_COMPUTE_INEOPERATOR_H */
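
A hypothetical sketch of a concrete INEOperator: MyOperator, MyKernel, and the configure() signature are placeholders, the point being that derived operators populate the protected _kernel member and inherit the run()/prepare() plumbing from the base class:

    using namespace arm_compute;

    class MyOperator : public experimental::INEOperator
    {
    public:
        void configure(const ITensorInfo *src, ITensorInfo *dst)
        {
            auto k = std::make_unique<MyKernel>(); // hypothetical ICPPKernel subclass
            k->configure(src, dst);                // hypothetical kernel signature
            _kernel = std::move(k);                // base class schedules it in run()
        }
    };
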
diff --git a/arm_compute/runtime/NEON/INESimpleFunction.h b/arm_compute/runtime/NEON/INESimpleFunction.h
index 8506797fe3..f783a836ee 100644
--- a/arm_compute/runtime/NEON/INESimpleFunction.h
+++ b/arm_compute/runtime/NEON/INESimpleFunction.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2016-2019 ARM Limited.
+ * Copyright (c) 2016-2021 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -24,27 +24,38 @@
#ifndef ARM_COMPUTE_INESIMPLEFUNCTION_H
#define ARM_COMPUTE_INESIMPLEFUNCTION_H
-#include "arm_compute/core/NEON/INEKernel.h"
-#include "arm_compute/core/NEON/kernels/NEFillBorderKernel.h"
#include "arm_compute/runtime/IFunction.h"
#include <memory>
namespace arm_compute
{
-/** Basic interface for functions which have a single NEON kernel */
+class ICPPKernel;
+class NEFillBorderKernel;
+using INEKernel = ICPPKernel;
+/** Basic interface for functions which have a single CPU kernel */
class INESimpleFunction : public IFunction
{
public:
/** Constructor */
INESimpleFunction();
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ INESimpleFunction(const INESimpleFunction &) = delete;
+ /** Default move constructor */
+ INESimpleFunction(INESimpleFunction &&) = default;
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ INESimpleFunction &operator=(const INESimpleFunction &) = delete;
+ /** Default move assignment operator */
+ INESimpleFunction &operator=(INESimpleFunction &&) = default;
+ /** Default destructor */
+ ~INESimpleFunction();
// Inherited methods overridden:
void run() override final;
protected:
- std::unique_ptr<INEKernel> _kernel; /**< Kernel to run */
- NEFillBorderKernel _border_handler; /**< Kernel to handle image borders */
+ std::unique_ptr<INEKernel> _kernel; /**< Kernel to run */
+ std::unique_ptr<NEFillBorderKernel> _border_handler; /**< Kernel to handle image borders */
};
-}
+} // namespace arm_compute
#endif /*ARM_COMPUTE_INESIMPLEFUNCTION_H */
diff --git a/arm_compute/runtime/NEON/INESimpleFunctionNoBorder.h b/arm_compute/runtime/NEON/INESimpleFunctionNoBorder.h
index 223048f40f..dc4bac17e4 100644
--- a/arm_compute/runtime/NEON/INESimpleFunctionNoBorder.h
+++ b/arm_compute/runtime/NEON/INESimpleFunctionNoBorder.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018-2019 ARM Limited.
+ * Copyright (c) 2018-2021 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -24,7 +24,6 @@
#ifndef ARM_COMPUTE_INESIMPLEFUNCTIONNOBORDER_H
#define ARM_COMPUTE_INESIMPLEFUNCTIONNOBORDER_H
-#include "arm_compute/core/NEON/INEKernel.h"
#include "arm_compute/runtime/IFunction.h"
#include "arm_compute/runtime/IRuntimeContext.h"
@@ -32,7 +31,9 @@
namespace arm_compute
{
-/** Basic interface for functions which have a single NEON kernel and no border */
+class ICPPKernel;
+using INEKernel = ICPPKernel;
+/** Basic interface for functions which have a single CPU kernel and no border */
class INESimpleFunctionNoBorder : public IFunction
{
public:
@@ -49,6 +50,8 @@ public:
INESimpleFunctionNoBorder &operator=(const INESimpleFunctionNoBorder &) = delete;
/** Default move assignment operator */
INESimpleFunctionNoBorder &operator=(INESimpleFunctionNoBorder &&) = default;
+ /** Default destructor */
+ ~INESimpleFunctionNoBorder();
// Inherited methods overridden:
void run() override final;
diff --git a/arm_compute/runtime/NEON/NEFunctions.h b/arm_compute/runtime/NEON/NEFunctions.h
index de364fa9af..cc4d303202 100644
--- a/arm_compute/runtime/NEON/NEFunctions.h
+++ b/arm_compute/runtime/NEON/NEFunctions.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2016-2020 ARM Limited.
+ * Copyright (c) 2016-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -21,13 +21,11 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
-#ifndef ARM_COMPUTE_NEFUNCTIONS_H
-#define ARM_COMPUTE_NEFUNCTIONS_H
+#ifndef ACL_ARM_COMPUTE_RUNTIME_NEON_NEFUNCTIONS_H
+#define ACL_ARM_COMPUTE_RUNTIME_NEON_NEFUNCTIONS_H
-/* Header regrouping all the NEON functions */
-#include "arm_compute/runtime/NEON/functions/NEAbsoluteDifference.h"
-#include "arm_compute/runtime/NEON/functions/NEAccumulate.h"
#include "arm_compute/runtime/NEON/functions/NEActivationLayer.h"
+#include "arm_compute/runtime/NEON/functions/NEAddMulAdd.h"
#include "arm_compute/runtime/NEON/functions/NEArgMinMaxLayer.h"
#include "arm_compute/runtime/NEON/functions/NEArithmeticAddition.h"
#include "arm_compute/runtime/NEON/functions/NEArithmeticSubtraction.h"
@@ -38,18 +36,11 @@
#include "arm_compute/runtime/NEON/functions/NEBitwiseOr.h"
#include "arm_compute/runtime/NEON/functions/NEBitwiseXor.h"
#include "arm_compute/runtime/NEON/functions/NEBoundingBoxTransform.h"
-#include "arm_compute/runtime/NEON/functions/NEBox3x3.h"
-#include "arm_compute/runtime/NEON/functions/NECannyEdge.h"
#include "arm_compute/runtime/NEON/functions/NECast.h"
-#include "arm_compute/runtime/NEON/functions/NEChannelCombine.h"
-#include "arm_compute/runtime/NEON/functions/NEChannelExtract.h"
#include "arm_compute/runtime/NEON/functions/NEChannelShuffleLayer.h"
-#include "arm_compute/runtime/NEON/functions/NECol2Im.h"
-#include "arm_compute/runtime/NEON/functions/NEColorConvert.h"
-#include "arm_compute/runtime/NEON/functions/NEComputeAllAnchors.h"
#include "arm_compute/runtime/NEON/functions/NEConcatenateLayer.h"
+#include "arm_compute/runtime/NEON/functions/NEConv3D.h"
#include "arm_compute/runtime/NEON/functions/NEConvertFullyConnectedWeights.h"
-#include "arm_compute/runtime/NEON/functions/NEConvolution.h"
#include "arm_compute/runtime/NEON/functions/NEConvolutionLayer.h"
#include "arm_compute/runtime/NEON/functions/NECopy.h"
#include "arm_compute/runtime/NEON/functions/NECropResize.h"
@@ -58,103 +49,66 @@
#include "arm_compute/runtime/NEON/functions/NEDepthToSpaceLayer.h"
#include "arm_compute/runtime/NEON/functions/NEDepthwiseConvolutionLayer.h"
#include "arm_compute/runtime/NEON/functions/NEDequantizationLayer.h"
-#include "arm_compute/runtime/NEON/functions/NEDerivative.h"
#include "arm_compute/runtime/NEON/functions/NEDetectionPostProcessLayer.h"
-#include "arm_compute/runtime/NEON/functions/NEDilate.h"
#include "arm_compute/runtime/NEON/functions/NEDirectConvolutionLayer.h"
#include "arm_compute/runtime/NEON/functions/NEElementwiseOperations.h"
#include "arm_compute/runtime/NEON/functions/NEElementwiseUnaryLayer.h"
-#include "arm_compute/runtime/NEON/functions/NEEqualizeHistogram.h"
-#include "arm_compute/runtime/NEON/functions/NEErode.h"
#include "arm_compute/runtime/NEON/functions/NEFFT1D.h"
#include "arm_compute/runtime/NEON/functions/NEFFT2D.h"
#include "arm_compute/runtime/NEON/functions/NEFFTConvolutionLayer.h"
-#include "arm_compute/runtime/NEON/functions/NEFastCorners.h"
#include "arm_compute/runtime/NEON/functions/NEFill.h"
#include "arm_compute/runtime/NEON/functions/NEFillBorder.h"
#include "arm_compute/runtime/NEON/functions/NEFlattenLayer.h"
#include "arm_compute/runtime/NEON/functions/NEFloor.h"
#include "arm_compute/runtime/NEON/functions/NEFullyConnectedLayer.h"
#include "arm_compute/runtime/NEON/functions/NEFuseBatchNormalization.h"
+#include "arm_compute/runtime/NEON/functions/NEGather.h"
#include "arm_compute/runtime/NEON/functions/NEGEMM.h"
-#include "arm_compute/runtime/NEON/functions/NEGEMMAssemblyDispatch.h"
+#include "arm_compute/runtime/NEON/functions/NEGEMMConv2d.h"
#include "arm_compute/runtime/NEON/functions/NEGEMMConvolutionLayer.h"
-#include "arm_compute/runtime/NEON/functions/NEGEMMInterleave4x4.h"
-#include "arm_compute/runtime/NEON/functions/NEGEMMLowpAssemblyMatrixMultiplyCore.h"
#include "arm_compute/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.h"
#include "arm_compute/runtime/NEON/functions/NEGEMMLowpOutputStage.h"
-#include "arm_compute/runtime/NEON/functions/NEGEMMTranspose1xW.h"
-#include "arm_compute/runtime/NEON/functions/NEGather.h"
-#include "arm_compute/runtime/NEON/functions/NEGaussian3x3.h"
-#include "arm_compute/runtime/NEON/functions/NEGaussian5x5.h"
-#include "arm_compute/runtime/NEON/functions/NEGaussianPyramid.h"
#include "arm_compute/runtime/NEON/functions/NEGenerateProposalsLayer.h"
-#include "arm_compute/runtime/NEON/functions/NEHOGDescriptor.h"
-#include "arm_compute/runtime/NEON/functions/NEHOGDetector.h"
-#include "arm_compute/runtime/NEON/functions/NEHOGGradient.h"
-#include "arm_compute/runtime/NEON/functions/NEHOGMultiDetection.h"
-#include "arm_compute/runtime/NEON/functions/NEHarrisCorners.h"
-#include "arm_compute/runtime/NEON/functions/NEHistogram.h"
-#include "arm_compute/runtime/NEON/functions/NEIm2Col.h"
#include "arm_compute/runtime/NEON/functions/NEInstanceNormalizationLayer.h"
-#include "arm_compute/runtime/NEON/functions/NEIntegralImage.h"
#include "arm_compute/runtime/NEON/functions/NEL2NormalizeLayer.h"
+#include "arm_compute/runtime/NEON/functions/NELogical.h"
#include "arm_compute/runtime/NEON/functions/NELSTMLayer.h"
#include "arm_compute/runtime/NEON/functions/NELSTMLayerQuantized.h"
-#include "arm_compute/runtime/NEON/functions/NELaplacianPyramid.h"
-#include "arm_compute/runtime/NEON/functions/NELaplacianReconstruct.h"
-#include "arm_compute/runtime/NEON/functions/NELocallyConnectedLayer.h"
-#include "arm_compute/runtime/NEON/functions/NEMagnitude.h"
-#include "arm_compute/runtime/NEON/functions/NEMeanStdDev.h"
+#include "arm_compute/runtime/NEON/functions/NEMatMul.h"
+#include "arm_compute/runtime/NEON/functions/NEMaxUnpoolingLayer.h"
#include "arm_compute/runtime/NEON/functions/NEMeanStdDevNormalizationLayer.h"
-#include "arm_compute/runtime/NEON/functions/NEMedian3x3.h"
-#include "arm_compute/runtime/NEON/functions/NEMinMaxLocation.h"
-#include "arm_compute/runtime/NEON/functions/NENonLinearFilter.h"
-#include "arm_compute/runtime/NEON/functions/NENonMaximaSuppression3x3.h"
#include "arm_compute/runtime/NEON/functions/NENormalizationLayer.h"
-#include "arm_compute/runtime/NEON/functions/NEOpticalFlow.h"
-#include "arm_compute/runtime/NEON/functions/NEPReluLayer.h"
#include "arm_compute/runtime/NEON/functions/NEPadLayer.h"
#include "arm_compute/runtime/NEON/functions/NEPermute.h"
-#include "arm_compute/runtime/NEON/functions/NEPhase.h"
#include "arm_compute/runtime/NEON/functions/NEPixelWiseMultiplication.h"
+#include "arm_compute/runtime/NEON/functions/NEPooling3dLayer.h"
#include "arm_compute/runtime/NEON/functions/NEPoolingLayer.h"
+#include "arm_compute/runtime/NEON/functions/NEPReluLayer.h"
#include "arm_compute/runtime/NEON/functions/NEPriorBoxLayer.h"
#include "arm_compute/runtime/NEON/functions/NEQLSTMLayer.h"
#include "arm_compute/runtime/NEON/functions/NEQuantizationLayer.h"
-#include "arm_compute/runtime/NEON/functions/NERNNLayer.h"
-#include "arm_compute/runtime/NEON/functions/NEROIAlignLayer.h"
-#include "arm_compute/runtime/NEON/functions/NEROIPoolingLayer.h"
#include "arm_compute/runtime/NEON/functions/NERange.h"
#include "arm_compute/runtime/NEON/functions/NEReduceMean.h"
#include "arm_compute/runtime/NEON/functions/NEReductionOperation.h"
-#include "arm_compute/runtime/NEON/functions/NERemap.h"
+#include "arm_compute/runtime/NEON/functions/NEReorderLayer.h"
#include "arm_compute/runtime/NEON/functions/NEReorgLayer.h"
#include "arm_compute/runtime/NEON/functions/NEReshapeLayer.h"
#include "arm_compute/runtime/NEON/functions/NEReverse.h"
+#include "arm_compute/runtime/NEON/functions/NERNNLayer.h"
+#include "arm_compute/runtime/NEON/functions/NEROIAlignLayer.h"
+#include "arm_compute/runtime/NEON/functions/NEROIPoolingLayer.h"
#include "arm_compute/runtime/NEON/functions/NEScale.h"
-#include "arm_compute/runtime/NEON/functions/NEScharr3x3.h"
#include "arm_compute/runtime/NEON/functions/NESelect.h"
-#include "arm_compute/runtime/NEON/functions/NESimpleAssemblyFunction.h"
#include "arm_compute/runtime/NEON/functions/NESlice.h"
-#include "arm_compute/runtime/NEON/functions/NESobel3x3.h"
-#include "arm_compute/runtime/NEON/functions/NESobel5x5.h"
-#include "arm_compute/runtime/NEON/functions/NESobel7x7.h"
#include "arm_compute/runtime/NEON/functions/NESoftmaxLayer.h"
#include "arm_compute/runtime/NEON/functions/NESpaceToBatchLayer.h"
#include "arm_compute/runtime/NEON/functions/NESpaceToDepthLayer.h"
#include "arm_compute/runtime/NEON/functions/NESplit.h"
#include "arm_compute/runtime/NEON/functions/NEStackLayer.h"
#include "arm_compute/runtime/NEON/functions/NEStridedSlice.h"
-#include "arm_compute/runtime/NEON/functions/NETableLookup.h"
-#include "arm_compute/runtime/NEON/functions/NEThreshold.h"
#include "arm_compute/runtime/NEON/functions/NETile.h"
#include "arm_compute/runtime/NEON/functions/NETranspose.h"
#include "arm_compute/runtime/NEON/functions/NEUnstack.h"
-#include "arm_compute/runtime/NEON/functions/NEUpsampleLayer.h"
-#include "arm_compute/runtime/NEON/functions/NEWarpAffine.h"
-#include "arm_compute/runtime/NEON/functions/NEWarpPerspective.h"
#include "arm_compute/runtime/NEON/functions/NEWinogradConvolutionLayer.h"
-#include "arm_compute/runtime/NEON/functions/NEYOLOLayer.h"
-#endif /* ARM_COMPUTE_NEFUNCTIONS_H */
+#endif // ACL_ARM_COMPUTE_RUNTIME_NEON_NEFUNCTIONS_H
diff --git a/arm_compute/runtime/NEON/NEScheduler.h b/arm_compute/runtime/NEON/NEScheduler.h
index 54a92bbb41..613f44cc52 100644
--- a/arm_compute/runtime/NEON/NEScheduler.h
+++ b/arm_compute/runtime/NEON/NEScheduler.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2021 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -28,7 +28,7 @@
namespace arm_compute
{
-/** NEON Scheduler */
+/** CPU Scheduler */
using NEScheduler = Scheduler;
-}
+} // namespace arm_compute
#endif /*ARM_COMPUTE_NESCHEDULER_H */
diff --git a/arm_compute/runtime/NEON/functions/NEAbsoluteDifference.h b/arm_compute/runtime/NEON/functions/NEAbsoluteDifference.h
deleted file mode 100644
index 2d6f94cde0..0000000000
--- a/arm_compute/runtime/NEON/functions/NEAbsoluteDifference.h
+++ /dev/null
@@ -1,50 +0,0 @@
-/*
- * Copyright (c) 2016-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_NEABSOLUTEDIFFERENCE_H
-#define ARM_COMPUTE_NEABSOLUTEDIFFERENCE_H
-
-#include "arm_compute/runtime/NEON/INESimpleFunction.h"
-
-namespace arm_compute
-{
-class ITensor;
-
-/** Basic function to run @ref NEAbsoluteDifferenceKernel
- *
- * @note The image data type for the inputs must be U8 or S16
- * @note The function calculates the absolute difference also when the 2 inputs have different image data types
- */
-class NEAbsoluteDifference : public INESimpleFunction
-{
-public:
- /** Set the inputs and output images
- *
- * @param[in] input1 Source tensor. Data types supported: U8/S16.
- * @param[in] input2 Source tensor. Data types supported: U8/S16.
- * @param[out] output Destination tensor. Data types supported: U8/S16.
- */
- void configure(const ITensor *input1, const ITensor *input2, ITensor *output);
-};
-}
-#endif /* ARM_COMPUTE_NEABSOLUTEDIFFERENCE_H */
diff --git a/arm_compute/runtime/NEON/functions/NEAccumulate.h b/arm_compute/runtime/NEON/functions/NEAccumulate.h
deleted file mode 100644
index 0426bf94d7..0000000000
--- a/arm_compute/runtime/NEON/functions/NEAccumulate.h
+++ /dev/null
@@ -1,74 +0,0 @@
-/*
- * Copyright (c) 2016-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_NEACCUMULATE_H
-#define ARM_COMPUTE_NEACCUMULATE_H
-
-#include "arm_compute/runtime/NEON/INESimpleFunctionNoBorder.h"
-
-#include <cstdint>
-
-namespace arm_compute
-{
-class ITensor;
-
-/** Basic function to run @ref NEAccumulateKernel */
-class NEAccumulate : public INESimpleFunctionNoBorder
-{
-public:
- /** Set the input and accumulation tensors
- *
- * @param[in] input Source tensor. Data type supported: U8.
- * @param[out] output Destination tensor. Data type supported: S16.
- */
- void configure(const ITensor *input, ITensor *output);
-};
-
-/** Basic function to run @ref NEAccumulateWeightedKernel */
-class NEAccumulateWeighted : public INESimpleFunctionNoBorder
-{
-public:
- /** Set the input and accumulation tensors, and the scale value
- *
- * @param[in] input Source tensor. Data type supported: U8.
- * @param[in] alpha The input scalar value with a value input the range of [0, 1.0]
- * @param[in,out] output Accumulated tensor. Data type supported: U8.
- * @param[in] use_fp16 (Optional) If true the FP16 kernels will be used. If false F32 kernels are used.
- */
- void configure(const ITensor *input, float alpha, ITensor *output, bool use_fp16 = false);
-};
-
-/** Basic function to run @ref NEAccumulateSquaredKernel */
-class NEAccumulateSquared : public INESimpleFunctionNoBorder
-{
-public:
- /** Set the input and accumulation tensors and the shift value.
- *
- * @param[in] input Source tensor. Data type supported: U8.
- * @param[in] shift The input with a value input the range of [0, 15]
- * @param[in,out] output Accumulated tensor. Data type supported: S16.
- */
- void configure(const ITensor *input, uint32_t shift, ITensor *output);
-};
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_NEACCUMULATE_H */
diff --git a/arm_compute/runtime/NEON/functions/NEActivationLayer.h b/arm_compute/runtime/NEON/functions/NEActivationLayer.h
index 95901dc2d8..5584fdc783 100644
--- a/arm_compute/runtime/NEON/functions/NEActivationLayer.h
+++ b/arm_compute/runtime/NEON/functions/NEActivationLayer.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2021, 2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -24,20 +24,24 @@
#ifndef ARM_COMPUTE_NEACTIVATIONLAYER_H
#define ARM_COMPUTE_NEACTIVATIONLAYER_H
-#include "arm_compute/runtime/NEON/INESimpleFunctionNoBorder.h"
-
#include "arm_compute/core/Types.h"
+#include "arm_compute/function_info/ActivationLayerInfo.h"
+#include "arm_compute/runtime/IFunction.h"
+#include "arm_compute/runtime/IRuntimeContext.h"
+
+#include <memory>
namespace arm_compute
{
// Forward declarations
class ITensor;
+class ITensorInfo;
-/** Basic function to run @ref NEActivationLayerKernel
+/** Basic function to run @ref cpu::kernels::CpuActivationKernel
*
* @note The function simulates an activation layer with the specified activation function.
*/
-class NEActivationLayer : public INESimpleFunctionNoBorder
+class NEActivationLayer : public IFunction
{
public:
/** Constructor
@@ -48,14 +52,28 @@ public:
/** Prevent instances of this class from being copied (As this class contains pointers) */
NEActivationLayer(const NEActivationLayer &) = delete;
/** Default move constructor */
- NEActivationLayer(NEActivationLayer &&) = default;
+ NEActivationLayer(NEActivationLayer &&);
/** Prevent instances of this class from being copied (As this class contains pointers) */
NEActivationLayer &operator=(const NEActivationLayer &) = delete;
/** Default move assignment operator */
- NEActivationLayer &operator=(NEActivationLayer &&) = default;
+ NEActivationLayer &operator=(NEActivationLayer &&);
+ /** Destructor */
+ ~NEActivationLayer();
/** [NEActivationLayer snippet] **/
/** Set the input and output tensor.
*
+ * Valid data layouts:
+ * - All
+ *
+ * Valid data type configurations:
+ * |src |dst |
+ * |:--------------|:--------------|
+ * |QASYMM8 |QASYMM8 |
+ * |QASYMM8_SIGNED |QASYMM8_SIGNED |
+ * |QSYMM16 |QSYMM16 |
+ * |F16 |F16 |
+ * |F32 |F32 |
+ *
* @note If the output tensor is a nullptr or is equal to the input, the activation function will be performed in-place
*
* @param[in, out] input Source tensor. In case of @p output tensor = nullptr, this tensor will store the result
@@ -75,6 +93,13 @@ public:
* @return a status
*/
static Status validate(const ITensorInfo *input, const ITensorInfo *output, const ActivationLayerInfo &act_info);
+
+ // Inherited methods overridden
+ void run() override;
+
+private:
+ struct Impl;
+ std::unique_ptr<Impl> _impl;
};
} // namespace arm_compute
#endif /* ARM_COMPUTE_NEACTIVATIONLAYER_H */
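
A usage sketch matching the data-type table above: F32 in, F32 out, run in-place by passing nullptr as the output (per the note). Tensor shape and allocation details are illustrative:

    using namespace arm_compute;

    Tensor src;
    src.allocator()->init(TensorInfo(TensorShape(8U, 8U), 1, DataType::F32));
    src.allocator()->allocate();

    NEActivationLayer act;
    act.configure(&src, nullptr, // nullptr output -> in-place activation
                  ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU));
    act.run();
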
diff --git a/arm_compute/runtime/NEON/functions/NEAddMulAdd.h b/arm_compute/runtime/NEON/functions/NEAddMulAdd.h
new file mode 100644
index 0000000000..6c65c055dd
--- /dev/null
+++ b/arm_compute/runtime/NEON/functions/NEAddMulAdd.h
@@ -0,0 +1,115 @@
+/*
+ * Copyright (c) 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_RUNTIME_NEON_FUNCTIONS_NEADDMULADD
+#define ARM_COMPUTE_RUNTIME_NEON_FUNCTIONS_NEADDMULADD
+
+#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/IFunction.h"
+#include "arm_compute/runtime/IMemoryManager.h"
+
+#include <memory>
+
+namespace arm_compute
+{
+class ITensor;
+class ITensorInfo;
+class ActivationLayerInfo;
+
+/** Function to compute Add+Mul+Add fused operation */
+class NEAddMulAdd : public IFunction
+{
+public:
+ /** Constructor */
+ NEAddMulAdd(std::shared_ptr<IMemoryManager> memory_manager = nullptr);
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ NEAddMulAdd(const NEAddMulAdd &) = delete;
+ /** Prevent instances of this class from being moved (As this class contains non movable objects) */
+ NEAddMulAdd(NEAddMulAdd &&) = delete;
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ NEAddMulAdd &operator=(const NEAddMulAdd &) = delete;
+ /** Prevent instances of this class from being moved (As this class contains non movable objects) */
+ NEAddMulAdd &operator=(NEAddMulAdd &&) = delete;
+ /** Destructor */
+ ~NEAddMulAdd();
+ /** Initialize the function's inputs and outputs.
+ *
+ * Valid data layouts:
+ * - Any
+ *
+ * Valid data type configurations:
+ * |input1 |input2 |bn_mul |bn_add |add_output |final_output |
+ * |:--------------|:--------------|:--------------|:--------------|:--------------|:--------------|
+ * |QASYMM8 |QASYMM8 |QASYMM8 |QASYMM8 |QASYMM8 |QASYMM8 |
+ * |QASYMM8_SIGNED |QASYMM8_SIGNED |QASYMM8_SIGNED |QASYMM8_SIGNED |QASYMM8_SIGNED |QASYMM8_SIGNED |
+ * |F16 |F16 |F16 |F16 |F16 |F16 |
+ * |F32 |F32 |F32 |F32 |F32 |F32 |
+ *
+ * This composite function (tailored for an add followed by a batch norm operation) computes:
+ * add_output <- input1 + input2 (add)
+ * final_output <- add_output * bn_mul + bn_add (batch norm = mul+add)
+ *
+ * @param[in] input1 First tensor input. Data type supported: QASYMM8/QASYMM8_SIGNED/F16/F32.
+ * @param[in] input2 Second tensor input. Data types supported: Same as @p input1.
+ * @param[in] bn_mul The multiplication coefficient on the feature dimension. Data types supported: Same as @p input1.
+ * It's a one-dimensional tensor with size equal to the number of feature maps [FM]
+ * @param[in] bn_add The addition coefficient on the feature dimension. Data types supported: Same as @p input1.
+ * It's a one-dimensional tensor with size equal to the number of feature maps [FM]
+ * @param[out] add_output Output of the first add. Data type supported: Same as @p input1.
+ * @param[out] final_output Output of the add+mul+add+act composite operation. Data type supported: Same as @p input1.
+ * @param[in] policy Policy to handle overflow
+ * @param[in] act_info (Optional) Activation layer information in case of a fused activation.
+ *
+ */
+ void configure(ITensor *input1,
+ ITensor *input2,
+ ITensor *bn_mul,
+ ITensor *bn_add,
+ ITensor *add_output,
+ ITensor *final_output,
+ ConvertPolicy policy,
+ const ActivationLayerInfo &act_info);
+ /** Static function to check if given info will lead to a valid configuration of @ref NEAddMulAdd
+ *
+ * Similar to @ref NEAddMulAdd::configure() except the arguments are @ref ITensorInfo * instead of @ref ITensor *
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *input1,
+ const ITensorInfo *input2,
+ const ITensorInfo *bn_mul,
+ const ITensorInfo *bn_add,
+ const ITensorInfo *add_output,
+ const ITensorInfo *final_output,
+ ConvertPolicy policy,
+ const ActivationLayerInfo &act_info);
+
+ // Inherited methods overridden:
+ void run() override;
+
+private:
+ struct Impl;
+ std::unique_ptr<Impl> _impl;
+};
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_RUNTIME_NEON_FUNCTIONS_NEADDMULADD */
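For orientation, a minimal usage sketch of the new NEAddMulAdd interface declared above. This is illustrative rather than part of the patch: the helper name is hypothetical, the tensors are assumed to be initialised and allocated elsewhere, and the SATURATE policy with an empty activation is an arbitrary choice.

    #include "arm_compute/function_info/ActivationLayerInfo.h"
    #include "arm_compute/runtime/NEON/functions/NEAddMulAdd.h"
    #include "arm_compute/runtime/Tensor.h"

    void example_add_mul_add(arm_compute::Tensor &input1, arm_compute::Tensor &input2,
                             arm_compute::Tensor &bn_mul, arm_compute::Tensor &bn_add,
                             arm_compute::Tensor &add_output, arm_compute::Tensor &final_output)
    {
        using namespace arm_compute;
        // add_output <- input1 + input2; final_output <- add_output * bn_mul + bn_add
        NEAddMulAdd fused;
        fused.configure(&input1, &input2, &bn_mul, &bn_add, &add_output, &final_output,
                        ConvertPolicy::SATURATE, ActivationLayerInfo()); // no fused activation
        fused.run(); // all six tensors must be allocated and filled before run()
    }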
diff --git a/arm_compute/runtime/NEON/functions/NEArgMinMaxLayer.h b/arm_compute/runtime/NEON/functions/NEArgMinMaxLayer.h
index c50f358d1f..3bb50a0f90 100644
--- a/arm_compute/runtime/NEON/functions/NEArgMinMaxLayer.h
+++ b/arm_compute/runtime/NEON/functions/NEArgMinMaxLayer.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018-2020 ARM Limited.
+ * Copyright (c) 2018-2021, 2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -24,8 +24,6 @@
#ifndef ARM_COMPUTE_NEARGMINMAXLAYER_H
#define ARM_COMPUTE_NEARGMINMAXLAYER_H
-#include "arm_compute/runtime/NEON/functions/NEReductionOperation.h"
-
#include "arm_compute/core/Types.h"
#include "arm_compute/runtime/MemoryGroup.h"
#include "arm_compute/runtime/NEON/INESimpleFunction.h"
@@ -33,11 +31,10 @@
namespace arm_compute
{
class ITensor;
-
/** Function to calculate the index of the minimum or maximum values in a
* tensor based on an axis.
*
- * This function calls the following NEON kernels:
+ * This function calls the following kernels:
*
* -# @ref NEReductionOperationKernel
* -# @ref NEFillBorderKernel
@@ -52,8 +49,30 @@ class NEArgMinMaxLayer : public IFunction
public:
/** Constructor */
NEArgMinMaxLayer(std::shared_ptr<IMemoryManager> memory_manager = nullptr);
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ NEArgMinMaxLayer(const NEArgMinMaxLayer &) = delete;
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ NEArgMinMaxLayer &operator=(const NEArgMinMaxLayer &) = delete;
+ /** Prevent instances of this class from being moved (As this class contains non movable objects) */
+ NEArgMinMaxLayer(NEArgMinMaxLayer &&) = delete;
+ /** Prevent instances of this class from being moved (As this class contains non movable objects) */
+ NEArgMinMaxLayer &operator=(NEArgMinMaxLayer &&) = delete;
+ /** Default destructor */
+ ~NEArgMinMaxLayer();
/** Set the input and output tensors.
*
+ * Valid data layouts:
+ * - All
+ *
+ * Valid data type configurations:
+ * |src |dst |
+ * |:--------------|:-------------|
+ * |QASYMM8 |U32, S32 |
+ * |QASYMM8_SIGNED |U32, S32 |
+ * |S32 |U32, S32, S64 |
+ * |F16 |U32, S32 |
+ * |F32 |U32, S32 |
+ *
* @param[in] input Input source tensor. Data types supported: QASYMM8_SIGNED/QASYMM8/S32/F16/F32.
* @param[in] axis Axis to find max/min index.
     * @param[out] output Output tensor. Data types supported: U32/S32/S64.
@@ -64,7 +83,7 @@ public:
*
* @param[in] input Input source tensor info. Data types supported: QASYMM8_SIGNED/QASYMM8/S32/F16/F32.
* @param[in] axis Axis to find max/min index.
- * @param[in] output Output source tensor info. Data types supported: U32/S32.
+     * @param[in]  output Output tensor info. Data types supported: U32/S32/S64.
* @param[in] op Operation to perform: min or max
*
* @return a status
@@ -75,7 +94,8 @@ public:
void run() override;
private:
- std::unique_ptr<NEReductionOperation> _reduction_function;
+ struct Impl;
+ std::unique_ptr<Impl> _impl;
};
} // namespace arm_compute
#endif /* ARM_COMPUTE_NEARGMINMAXLAYER_H */
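A correspondingly small sketch for the reworked NEArgMinMaxLayer (illustrative, not from the patch; the helper name and the choice of axis 0 are assumptions):

    #include "arm_compute/runtime/NEON/functions/NEArgMinMaxLayer.h"
    #include "arm_compute/runtime/Tensor.h"

    void example_argmax(arm_compute::Tensor &src, arm_compute::Tensor &dst)
    {
        using namespace arm_compute;
        NEArgMinMaxLayer argmax;
        // Index of the maximum along axis 0; dst holds U32/S32 indices (S64 only for S32 input).
        argmax.configure(&src, 0, &dst, ReductionOperation::ARG_IDX_MAX);
        argmax.run();
    }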
diff --git a/arm_compute/runtime/NEON/functions/NEArithmeticAddition.h b/arm_compute/runtime/NEON/functions/NEArithmeticAddition.h
index 6cab5b3547..73a43dbc44 100644
--- a/arm_compute/runtime/NEON/functions/NEArithmeticAddition.h
+++ b/arm_compute/runtime/NEON/functions/NEArithmeticAddition.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2016-2020 ARM Limited.
+ * Copyright (c) 2016-2021, 2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -25,36 +25,83 @@
#define ARM_COMPUTE_NEARITHMETICADDITION_H
#include "arm_compute/core/Types.h"
-#include "arm_compute/runtime/NEON/INESimpleFunction.h"
+#include "arm_compute/function_info/ActivationLayerInfo.h"
+#include "arm_compute/runtime/IFunction.h"
+
+#include <memory>
namespace arm_compute
{
class ITensor;
+class ITensorInfo;
-/** Basic function to run @ref NEArithmeticAdditionKernel */
-class NEArithmeticAddition : public INESimpleFunction
+/** Basic function to run @ref cpu::kernels::CpuAddKernel */
+class NEArithmeticAddition : public IFunction
{
public:
+ /** Default Constructor */
+ NEArithmeticAddition();
+ /** Default Destructor */
+ ~NEArithmeticAddition();
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ NEArithmeticAddition(const NEArithmeticAddition &) = delete;
+ /** Default move constructor */
+ NEArithmeticAddition(NEArithmeticAddition &&);
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ NEArithmeticAddition &operator=(const NEArithmeticAddition &) = delete;
+ /** Default move assignment operator */
+ NEArithmeticAddition &operator=(NEArithmeticAddition &&);
/** Initialise the kernel's inputs, output and conversion policy.
*
- * @param[in] input1 First tensor input. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/F16/F32
- * @param[in] input2 Second tensor input. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/F16/F32
- * @param[out] output Output tensor. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/F16/F32
+ * Valid data layouts:
+ * - All
+ *
+ * Valid data type configurations:
+ * |src0 |src1 |dst |
+ * |:--------------|:--------------|:--------------|
+ * |QASYMM8 |QASYMM8 |QASYMM8 |
+ * |QASYMM8_SIGNED |QASYMM8_SIGNED |QASYMM8_SIGNED |
+     * |QSYMM16        |QSYMM16        |QSYMM16        |
+ * |QSYMM16 |QSYMM16 |S32 |
+ * |U8 |U8 |U8 |
+ * |S16 |S16 |S16 |
+ * |S32 |S32 |S32 |
+ * |F16 |F16 |F16 |
+ * |F32 |F32 |F32 |
+ *
+ * @param[in] input1 First tensor input. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/F16/S32/F32
+ * @param[in] input2 Second tensor input. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/F16/S32/F32
+ * @param[out] output Output tensor. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/F16/S32/F32
* @param[in] policy Policy to use to handle overflow.
* @param[in] act_info (Optional) Activation layer information in case of a fused activation. Currently not supported.
*/
- void configure(ITensor *input1, ITensor *input2, ITensor *output, ConvertPolicy policy, const ActivationLayerInfo &act_info = ActivationLayerInfo());
+ void configure(const ITensor *input1,
+ const ITensor *input2,
+ ITensor *output,
+ ConvertPolicy policy,
+ const ActivationLayerInfo &act_info = ActivationLayerInfo());
/** Static function to check if given info will lead to a valid configuration of @ref NEArithmeticAddition
*
- * @param[in] input1 First tensor input. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/F16/F32
- * @param[in] input2 Second tensor input. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/F16/F32
- * @param[in] output Output tensor. Data types supported: U8/SQASYMM8/QASYMM8_SIGNED/S16/QSYMM16/F16/F32
+ * @param[in] input1 First tensor input info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/F16/S32/F32
+ * @param[in] input2 Second tensor input info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/F16/S32/F32
+     * @param[in]  output   Output tensor info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/F16/S32/F32
* @param[in] policy Policy to use to handle overflow
* @param[in] act_info (Optional) Activation layer information in case of a fused activation. Currently not supported.
*
* @return a status
*/
- static Status validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, ConvertPolicy policy, const ActivationLayerInfo &act_info = ActivationLayerInfo());
+ static Status validate(const ITensorInfo *input1,
+ const ITensorInfo *input2,
+ const ITensorInfo *output,
+ ConvertPolicy policy,
+ const ActivationLayerInfo &act_info = ActivationLayerInfo());
+
+ // Inherited methods overridden:
+ void run() override;
+
+private:
+ struct Impl;
+ std::unique_ptr<Impl> _impl;
};
} // namespace arm_compute
-#endif /*ARM_COMPUTE_NEARITHMETICADDITION_H */
+#endif /* ARM_COMPUTE_NEARITHMETICADDITION_H */
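The validate()/configure() pairing typical for these functions, sketched here for NEArithmeticAddition (illustrative; the helper name and SATURATE policy are assumptions):

    #include "arm_compute/runtime/NEON/functions/NEArithmeticAddition.h"
    #include "arm_compute/runtime/Tensor.h"

    void example_add(arm_compute::Tensor &src0, arm_compute::Tensor &src1, arm_compute::Tensor &dst)
    {
        using namespace arm_compute;
        // validate() mirrors configure() but takes ITensorInfo* arguments.
        const Status st = NEArithmeticAddition::validate(src0.info(), src1.info(), dst.info(),
                                                         ConvertPolicy::SATURATE);
        if(st.error_code() == ErrorCode::OK)
        {
            NEArithmeticAddition add;
            add.configure(&src0, &src1, &dst, ConvertPolicy::SATURATE);
            add.run();
        }
    }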
diff --git a/arm_compute/runtime/NEON/functions/NEArithmeticSubtraction.h b/arm_compute/runtime/NEON/functions/NEArithmeticSubtraction.h
index 4774fb6adb..3e4f6356c5 100644
--- a/arm_compute/runtime/NEON/functions/NEArithmeticSubtraction.h
+++ b/arm_compute/runtime/NEON/functions/NEArithmeticSubtraction.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2016-2020 ARM Limited.
+ * Copyright (c) 2016-2021, 2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -25,43 +25,88 @@
#define ARM_COMPUTE_NEARITHMETICSUBTRACTION_H
#include "arm_compute/core/Types.h"
-#include "arm_compute/runtime/NEON/INESimpleFunction.h"
+#include "arm_compute/function_info/ActivationLayerInfo.h"
+#include "arm_compute/runtime/IFunction.h"
+#include "arm_compute/runtime/NEON/INEOperator.h"
namespace arm_compute
{
class ITensor;
-/** Basic function to run @ref NEArithmeticSubtractionKernel
+/** Basic function to run @ref cpu::kernels::CpuSubKernel
*
- * @note The tensor data type for the inputs must be U8/QASYMM8/S16/F16/F32.
+ * @note The tensor data type for the inputs must be U8/QASYMM8/QASYMM8_SIGNED/QSYMM16/S16/S32/F16/F32.
* @note The function performs an arithmetic subtraction between two tensors.
*
* This function calls the following kernels:
- * -# @ref NEArithmeticSubtractionKernel
+ * -# @ref cpu::kernels::CpuSubKernel
*/
-class NEArithmeticSubtraction : public INESimpleFunction
+class NEArithmeticSubtraction : public IFunction
{
public:
+ /** Default Constructor */
+ NEArithmeticSubtraction();
+ /** Default Destructor */
+ ~NEArithmeticSubtraction();
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ NEArithmeticSubtraction(const NEArithmeticSubtraction &) = delete;
+ /** Default move constructor */
+ NEArithmeticSubtraction(NEArithmeticSubtraction &&);
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ NEArithmeticSubtraction &operator=(const NEArithmeticSubtraction &) = delete;
+ /** Default move assignment operator */
+ NEArithmeticSubtraction &operator=(NEArithmeticSubtraction &&);
/** Initialise the kernel's inputs, output and conversion policy.
*
- * @param[in] input1 First tensor input. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/QSYMM16/S16/F16/F32
- * @param[in] input2 Second tensor input. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/QSYMM16/S16/F16/F32
- * @param[out] output Output tensor. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/QSYMM16/S16/F16/F32
+ * Valid data layouts:
+ * - All
+ *
+ * Valid data type configurations:
+ * |src0 |src1 |dst |
+ * |:--------------|:--------------|:--------------|
+ * |QASYMM8 |QASYMM8 |QASYMM8 |
+ * |QASYMM8_SIGNED |QASYMM8_SIGNED |QASYMM8_SIGNED |
+     * |QSYMM16        |QSYMM16        |QSYMM16        |
+ * |QSYMM16 |QSYMM16 |S32 |
+ * |U8 |U8 |U8 |
+ * |S16 |S16 |S16 |
+ * |S32 |S32 |S32 |
+ * |F16 |F16 |F16 |
+ * |F32 |F32 |F32 |
+ *
+ * @param[in] input1 First tensor input. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/QSYMM16/S16/S32/F16/F32
+ * @param[in] input2 Second tensor input. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/QSYMM16/S16/S32/F16/F32
+ * @param[out] output Output tensor. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/QSYMM16/S16/S32/F16/F32
* @param[in] policy Policy to use to handle overflow. Convert policy cannot be WRAP if datatype is quantized.
* @param[in] act_info (Optional) Activation layer information in case of a fused activation. Currently not supported.
*/
- void configure(ITensor *input1, ITensor *input2, ITensor *output, ConvertPolicy policy, const ActivationLayerInfo &act_info = ActivationLayerInfo());
+ void configure(const ITensor *input1,
+ const ITensor *input2,
+ ITensor *output,
+ ConvertPolicy policy,
+ const ActivationLayerInfo &act_info = ActivationLayerInfo());
/** Static function to check if given info will lead to a valid configuration of @ref NEArithmeticSubtraction
*
- * @param[in] input1 First tensor input. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/F16/F32
- * @param[in] input2 Second tensor input. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/F16/F32
- * @param[in] output Output tensor. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/F16/F32
+ * @param[in] input1 First tensor input. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/S32/F16/F32
+ * @param[in] input2 Second tensor input. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/S32/F16/F32
+ * @param[in] output Output tensor. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/S32/F16/F32
* @param[in] policy Policy to use to handle overflow. Convert policy cannot be WRAP if datatype is quantized.
* @param[in] act_info (Optional) Activation layer information in case of a fused activation. Currently not supported.
*
* @return a status
*/
- static Status validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, ConvertPolicy policy, const ActivationLayerInfo &act_info = ActivationLayerInfo());
+ static Status validate(const ITensorInfo *input1,
+ const ITensorInfo *input2,
+ const ITensorInfo *output,
+ ConvertPolicy policy,
+ const ActivationLayerInfo &act_info = ActivationLayerInfo());
+
+ // Inherited methods overridden:
+ void run() override;
+
+private:
+ struct Impl;
+ std::unique_ptr<Impl> _impl;
};
-}
+} // namespace arm_compute
#endif /* ARM_COMPUTE_NEARITHMETICSUBTRACTION_H */
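NEArithmeticSubtraction follows the same pattern; the point worth showing is the overflow policy (illustrative sketch, helper name assumed):

    #include "arm_compute/runtime/NEON/functions/NEArithmeticSubtraction.h"
    #include "arm_compute/runtime/Tensor.h"

    void example_sub(arm_compute::Tensor &src0, arm_compute::Tensor &src1, arm_compute::Tensor &dst)
    {
        using namespace arm_compute;
        NEArithmeticSubtraction sub;
        // WRAP is not allowed for quantized data types, so SATURATE is the safe choice here.
        sub.configure(&src0, &src1, &dst, ConvertPolicy::SATURATE);
        sub.run();
    }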
diff --git a/arm_compute/runtime/NEON/functions/NEBatchNormalizationLayer.h b/arm_compute/runtime/NEON/functions/NEBatchNormalizationLayer.h
index 14416e7323..99e2dcadbb 100644
--- a/arm_compute/runtime/NEON/functions/NEBatchNormalizationLayer.h
+++ b/arm_compute/runtime/NEON/functions/NEBatchNormalizationLayer.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2021 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -24,14 +24,16 @@
#ifndef ARM_COMPUTE_NEBATCHNORMALIZATIONLAYER_H
#define ARM_COMPUTE_NEBATCHNORMALIZATIONLAYER_H
-#include "arm_compute/core/NEON/kernels/NEBatchNormalizationLayerKernel.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/runtime/IFunction.h"
#include "arm_compute/runtime/NEON/functions/NEActivationLayer.h"
+#include <memory>
+
namespace arm_compute
{
class ITensor;
+class NEBatchNormalizationLayerKernel;
 /** Basic function to run @ref NEBatchNormalizationLayerKernel and simulate a batch normalization layer.
*
@@ -42,10 +44,30 @@ class ITensor;
class NEBatchNormalizationLayer : public IFunction
{
public:
- /** Default constructor */
+ /** Constructor */
NEBatchNormalizationLayer();
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ NEBatchNormalizationLayer(const NEBatchNormalizationLayer &) = delete;
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ NEBatchNormalizationLayer &operator=(const NEBatchNormalizationLayer &) = delete;
+ /** Prevent instances of this class from being moved (As this class contains non movable objects) */
+ NEBatchNormalizationLayer(NEBatchNormalizationLayer &&) = delete;
+ /** Prevent instances of this class from being moved (As this class contains non movable objects) */
+ NEBatchNormalizationLayer &operator=(NEBatchNormalizationLayer &&) = delete;
+ /** Default destructor */
+ ~NEBatchNormalizationLayer();
/** Set the input and output tensors.
*
+ * Valid data layouts:
+ * - NHWC
+ * - NCHW
+ *
+ * Valid data type configurations:
+ * |src |dst |
+ * |:--------------|:--------------|
+ * |F32 |F32 |
+ * |F16 |F16 |
+ *
* @note If the output tensor is a nullptr or is equal to the input, the batch normalization function will be performed in-place
*
* @param[in, out] input Source tensor. In case of @p output tensor = nullptr, this tensor will store the result.
@@ -59,7 +81,13 @@ public:
* @param[in] epsilon (Optional) Small value to avoid division with zero. Default value is 0.001f.
* @param[in] act_info (Optional) Activation layer information in case of a fused activation. Only RELU, BOUNDED_RELU and LU_BOUNDED_RELU supported.
*/
- void configure(ITensor *input, ITensor *output, const ITensor *mean, const ITensor *var, const ITensor *beta = nullptr, const ITensor *gamma = nullptr, float epsilon = 0.001f,
+ void configure(ITensor *input,
+ ITensor *output,
+ const ITensor *mean,
+ const ITensor *var,
+ const ITensor *beta = nullptr,
+ const ITensor *gamma = nullptr,
+ float epsilon = 0.001f,
ActivationLayerInfo act_info = ActivationLayerInfo());
/** Static function to check if given info will lead to a valid configuration of @ref NEBatchNormalizationLayer
*
@@ -76,16 +104,20 @@ public:
*
* @return a status
*/
- static Status validate(const ITensorInfo *input, const ITensorInfo *output,
- const ITensorInfo *mean, const ITensorInfo *var,
- const ITensorInfo *beta = nullptr, const ITensorInfo *gamma = nullptr,
- float epsilon = 0.001f, ActivationLayerInfo act_info = ActivationLayerInfo());
+ static Status validate(const ITensorInfo *input,
+ const ITensorInfo *output,
+ const ITensorInfo *mean,
+ const ITensorInfo *var,
+ const ITensorInfo *beta = nullptr,
+ const ITensorInfo *gamma = nullptr,
+ float epsilon = 0.001f,
+ ActivationLayerInfo act_info = ActivationLayerInfo());
// Inherited methods overridden:
void run() override;
private:
- NEBatchNormalizationLayerKernel _norm_kernel; /**< Batch normalization layer kernel */
+ std::unique_ptr<NEBatchNormalizationLayerKernel> _norm_kernel; /**< Batch normalization layer kernel */
};
-}
+} // namespace arm_compute
#endif /* ARM_COMPUTE_NEBATCHNORMALIZATIONLAYER_H */
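An in-place sketch of NEBatchNormalizationLayer (illustrative; the RELU activation and the default epsilon are arbitrary, and the in-place form relies on the nullptr-output note above):

    #include "arm_compute/runtime/NEON/functions/NEBatchNormalizationLayer.h"
    #include "arm_compute/runtime/Tensor.h"

    void example_batch_norm(arm_compute::Tensor &input, arm_compute::Tensor &mean,
                            arm_compute::Tensor &var, arm_compute::Tensor &beta,
                            arm_compute::Tensor &gamma)
    {
        using namespace arm_compute;
        NEBatchNormalizationLayer bn;
        // output == nullptr -> in-place: input <- gamma * (input - mean) / sqrt(var + epsilon) + beta
        bn.configure(&input, nullptr, &mean, &var, &beta, &gamma, 0.001f,
                     ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU));
        bn.run();
    }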
diff --git a/arm_compute/runtime/NEON/functions/NEBatchToSpaceLayer.h b/arm_compute/runtime/NEON/functions/NEBatchToSpaceLayer.h
index 2a62530246..ebed0bea29 100644
--- a/arm_compute/runtime/NEON/functions/NEBatchToSpaceLayer.h
+++ b/arm_compute/runtime/NEON/functions/NEBatchToSpaceLayer.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019 ARM Limited.
+ * Copyright (c) 2019-2021, 2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -24,26 +24,49 @@
#ifndef ARM_COMPUTE_NEBATCHTOSPACELAYER_H
#define ARM_COMPUTE_NEBATCHTOSPACELAYER_H
-#include "arm_compute/runtime/IFunction.h"
-
-#include "arm_compute/core/NEON/kernels/NEBatchToSpaceLayerKernel.h"
#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/IFunction.h"
#include "arm_compute/runtime/NEON/INESimpleFunctionNoBorder.h"
namespace arm_compute
{
class ITensor;
+class ITensorInfo;
/** Basic function to run @ref NEBatchToSpaceLayerKernel. */
class NEBatchToSpaceLayer : public INESimpleFunctionNoBorder
{
public:
+ /** Constructor */
+ NEBatchToSpaceLayer() = default;
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ NEBatchToSpaceLayer(const NEBatchToSpaceLayer &) = delete;
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ NEBatchToSpaceLayer &operator=(const NEBatchToSpaceLayer &) = delete;
+ /** Prevent instances of this class from being moved (As this class contains non movable objects) */
+ NEBatchToSpaceLayer(NEBatchToSpaceLayer &&) = delete;
+ /** Prevent instances of this class from being moved (As this class contains non movable objects) */
+ NEBatchToSpaceLayer &operator=(NEBatchToSpaceLayer &&) = delete;
+ /** Default destructor */
+ ~NEBatchToSpaceLayer() = default;
/** Set the input and output tensors.
*
+ * Valid data layouts:
+ * - NHWC
+ * - NCHW
+ *
+ * Valid data type configurations:
+ * |src0 |src1 |dst |
+ * |:---------|:---------|:----------|
+     * |All       |S32       |All        |
+ *
* @param[in] input Tensor input. Supported tensor rank: 4. Data types supported: All.
* @param[in] block_shape 1-D tensor with shape [M]. Data types supported: S32
* @param[out] output Tensor output. Data types supported: same as @p input
+ *
+     * @deprecated This method for dynamic block shape is not fully mature and will be removed in the 23.08 release
*/
+ ARM_COMPUTE_DEPRECATED_REL(23.05)
void configure(const ITensor *input, const ITensor *block_shape, ITensor *output);
/** Set the input and output tensors. (Static block shape).
*
@@ -51,8 +74,13 @@ public:
* @param[in] block_shape_x Block shape x value.
* @param[in] block_shape_y Block shape y value.
* @param[out] output Tensor output. Data types supported: same as @p input
+ * @param[in] crop_info Specifies how the output shape is cropped after batch to space is performed
*/
- void configure(const ITensor *input, int32_t block_shape_x, int32_t block_shape_y, ITensor *output);
+ void configure(const ITensor *input,
+ int32_t block_shape_x,
+ int32_t block_shape_y,
+ ITensor *output,
+ const CropInfo &crop_info = CropInfo{});
     /** Static function to check if given info will lead to a valid configuration of @ref NEBatchToSpaceLayer
*
* @param[in] input Tensor input info. Supported tensor rank: 4. Data types supported: All.
@@ -60,7 +88,9 @@ public:
* @param[out] output Tensor output info. Data types supported: same as @p input
*
* @return a status
+     * @deprecated This method for dynamic block shape is not fully mature and will be removed in the 23.08 release
*/
+ ARM_COMPUTE_DEPRECATED_REL(23.05)
static Status validate(const ITensorInfo *input, const ITensorInfo *block_shape, const ITensorInfo *output);
     /** Static function to check if given info will lead to a valid configuration of @ref NEBatchToSpaceLayer (Static block shape).
*
@@ -68,10 +98,15 @@ public:
* @param[in] block_shape_x Block shape x value.
* @param[in] block_shape_y Block shape y value.
* @param[out] output Tensor output info. Data types supported: same as @p input
+ * @param[in] crop_info Specifies how the output shape is cropped after batch to space is performed
*
* @return a status
*/
- static Status validate(const ITensorInfo *input, int32_t block_shape_x, int32_t block_shape_y, const ITensorInfo *output);
+ static Status validate(const ITensorInfo *input,
+ int32_t block_shape_x,
+ int32_t block_shape_y,
+ const ITensorInfo *output,
+ const CropInfo &crop_info = CropInfo{});
};
} // namespace arm_compute
#endif /* ARM_COMPUTE_NEBATCHTOSPACELAYER_H */
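A sketch of the non-deprecated static-block-shape path of NEBatchToSpaceLayer (illustrative; the block sizes and the default CropInfo are assumptions):

    #include "arm_compute/runtime/NEON/functions/NEBatchToSpaceLayer.h"
    #include "arm_compute/runtime/Tensor.h"

    void example_batch_to_space(arm_compute::Tensor &src, arm_compute::Tensor &dst)
    {
        using namespace arm_compute;
        NEBatchToSpaceLayer b2s;
        // Static block shape; the dynamic block-shape overload is deprecated as of 23.05.
        b2s.configure(&src, /* block_shape_x */ 2, /* block_shape_y */ 2, &dst, CropInfo{});
        b2s.run();
    }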
diff --git a/arm_compute/runtime/NEON/functions/NEBitwiseAnd.h b/arm_compute/runtime/NEON/functions/NEBitwiseAnd.h
index c254c30ce5..1f95f193d3 100644
--- a/arm_compute/runtime/NEON/functions/NEBitwiseAnd.h
+++ b/arm_compute/runtime/NEON/functions/NEBitwiseAnd.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2016-2019 ARM Limited.
+ * Copyright (c) 2016-2021 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -34,8 +34,28 @@ class ITensor;
class NEBitwiseAnd : public INESimpleFunctionNoBorder
{
public:
+ /** Constructor */
+ NEBitwiseAnd() = default;
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ NEBitwiseAnd(const NEBitwiseAnd &) = delete;
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ NEBitwiseAnd &operator=(const NEBitwiseAnd &) = delete;
+ /** Prevent instances of this class from being moved (As this class contains non movable objects) */
+ NEBitwiseAnd(NEBitwiseAnd &&) = delete;
+ /** Prevent instances of this class from being moved (As this class contains non movable objects) */
+ NEBitwiseAnd &operator=(NEBitwiseAnd &&) = delete;
+ /** Default destructor */
+ ~NEBitwiseAnd() = default;
/** Initialise the kernel's inputs and output
*
+ * Valid data layouts:
+ * - All
+ *
+ * Valid data type configurations:
+ * |src |dst |
+ * |:--------------|:--------------|
+ * |U8 |U8 |
+ *
* @param[in] input1 First tensor input. Data type supported: U8.
* @param[in] input2 Second tensor input. Data type supported: U8.
* @param[out] output Output tensor. Data type supported: U8.
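The bitwise functions share one trivial calling pattern, sketched once here for NEBitwiseAnd and equally applicable to NEBitwiseNot/Or/Xor below (illustrative; all tensors are U8):

    #include "arm_compute/runtime/NEON/functions/NEBitwiseAnd.h"
    #include "arm_compute/runtime/Tensor.h"

    void example_bitwise_and(arm_compute::Tensor &src0, arm_compute::Tensor &src1,
                             arm_compute::Tensor &dst)
    {
        arm_compute::NEBitwiseAnd band;
        band.configure(&src0, &src1, &dst); // element-wise dst = src0 & src1
        band.run();
    }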
diff --git a/arm_compute/runtime/NEON/functions/NEBitwiseNot.h b/arm_compute/runtime/NEON/functions/NEBitwiseNot.h
index 15e12509c8..c66bebf7cc 100644
--- a/arm_compute/runtime/NEON/functions/NEBitwiseNot.h
+++ b/arm_compute/runtime/NEON/functions/NEBitwiseNot.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2021 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -36,6 +36,14 @@ class NEBitwiseNot : public INESimpleFunctionNoBorder
public:
/** Initialise the kernel's input and output
*
+ * Valid data layouts:
+ * - All
+ *
+ * Valid data type configurations:
+ * |src |dst |
+ * |:--------------|:--------------|
+ * |U8 |U8 |
+ *
* @param[in] input Input tensor. Data type supported: U8.
* @param[out] output Output tensor. Data type supported: U8.
*/
diff --git a/arm_compute/runtime/NEON/functions/NEBitwiseOr.h b/arm_compute/runtime/NEON/functions/NEBitwiseOr.h
index 0e62620fc0..183df212e4 100644
--- a/arm_compute/runtime/NEON/functions/NEBitwiseOr.h
+++ b/arm_compute/runtime/NEON/functions/NEBitwiseOr.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2021 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -36,6 +36,14 @@ class NEBitwiseOr : public INESimpleFunctionNoBorder
public:
/** Initialise the kernel's inputs and output
*
+ * Valid data layouts:
+ * - All
+ *
+ * Valid data type configurations:
+ * |src |dst |
+ * |:--------------|:--------------|
+ * |U8 |U8 |
+ *
* @param[in] input1 First tensor input. Data type supported: U8.
* @param[in] input2 Second tensor input. Data type supported: U8.
* @param[out] output Output tensor. Data type supported: U8.
diff --git a/arm_compute/runtime/NEON/functions/NEBitwiseXor.h b/arm_compute/runtime/NEON/functions/NEBitwiseXor.h
index 1dcc6e2216..126aaa6ddd 100644
--- a/arm_compute/runtime/NEON/functions/NEBitwiseXor.h
+++ b/arm_compute/runtime/NEON/functions/NEBitwiseXor.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2021 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -36,6 +36,14 @@ class NEBitwiseXor : public INESimpleFunctionNoBorder
public:
/** Initialise the kernel's inputs and output
*
+ * Valid data layouts:
+ * - All
+ *
+ * Valid data type configurations:
+ * |src |dst |
+ * |:--------------|:--------------|
+ * |U8 |U8 |
+ *
* @param[in] input1 First tensor input. Data type supported: U8.
* @param[in] input2 Second tensor input. Data type supported: U8.
* @param[out] output Output tensor. Data type supported: U8.
diff --git a/arm_compute/runtime/NEON/functions/NEBoundingBoxTransform.h b/arm_compute/runtime/NEON/functions/NEBoundingBoxTransform.h
index 27c1c5198b..aa41fc0df2 100644
--- a/arm_compute/runtime/NEON/functions/NEBoundingBoxTransform.h
+++ b/arm_compute/runtime/NEON/functions/NEBoundingBoxTransform.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019-2020 ARM Limited.
+ * Copyright (c) 2019-2021 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -24,23 +24,31 @@
#ifndef ARM_COMPUTE_NEBOUNDINGBOXTRANSOFORM_H
#define ARM_COMPUTE_NEBOUNDINGBOXTRANSOFORM_H
-#include "arm_compute/core/NEON/kernels/NEBoundingBoxTransformKernel.h"
-#include "arm_compute/runtime/NEON/INESimpleFunction.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/NEON/INESimpleFunctionNoBorder.h"
namespace arm_compute
{
class ITensor;
+class ITensorInfo;
-/** Basic function to run @ref NEBoundingBoxTransformKernel.
- *
- * This function calls the following Neon kernels:
- * -# @ref NEBoundingBoxTransformKernel
- */
-class NEBoundingBoxTransform : public INESimpleFunction
+/** Basic function to run @ref NEBoundingBoxTransformKernel. */
+class NEBoundingBoxTransform : public INESimpleFunctionNoBorder
{
public:
/** Set the input and output tensors.
*
+ * Valid data layouts:
+ * - NHWC
+ * - NCHW
+ *
+ * Valid data type configurations:
+ * |src0 |src1 |dst |
+ * |:--------------|:--------------|:--------------|
+ * |QASYMM16 |QASYMM8 |QASYMM16 |
+ * |F16 |F16 |F16 |
+ * |F32 |F32 |F32 |
+ *
* @param[in] boxes Source tensor. Bounding box proposals in pixel coordinates. Size(M, 4), format [x1, y1, x2, y2]. Data types supported: QASYMM16/F16/F32.
     * @param[out] pred_boxes Destination tensor. Pixel coordinates of the transformed bounding boxes. Size (M, 4*K), format [x1, y1, x2, y2]. Data types supported: Same as @p boxes
* @param[in] deltas Bounding box translations and scales. Size (M, 4*K), format [dx, dy, dw, dh], K is the number of classes.
@@ -49,7 +57,8 @@ public:
*
* @note Only single image prediction is supported. Height and Width (and scale) of the image will be contained in the BoundingBoxTransformInfo struct.
*/
- void configure(const ITensor *boxes, ITensor *pred_boxes, const ITensor *deltas, const BoundingBoxTransformInfo &info);
+ void
+ configure(const ITensor *boxes, ITensor *pred_boxes, const ITensor *deltas, const BoundingBoxTransformInfo &info);
/** Static function to check if given info will lead to a valid configuration of @ref NEBoundingBoxTransform
*
@@ -63,7 +72,10 @@ public:
*
* @return a Status
*/
- static Status validate(const ITensorInfo *boxes, const ITensorInfo *pred_boxes, const ITensorInfo *deltas, const BoundingBoxTransformInfo &info);
+ static Status validate(const ITensorInfo *boxes,
+ const ITensorInfo *pred_boxes,
+ const ITensorInfo *deltas,
+ const BoundingBoxTransformInfo &info);
};
} // namespace arm_compute
#endif /* ARM_COMPUTE_NEBOUNDINGBOXTRANSFORM_H */
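A sketch of NEBoundingBoxTransform (illustrative; the image size and scale passed to BoundingBoxTransformInfo are placeholder values):

    #include "arm_compute/runtime/NEON/functions/NEBoundingBoxTransform.h"
    #include "arm_compute/runtime/Tensor.h"

    void example_bbox_transform(arm_compute::Tensor &boxes, arm_compute::Tensor &pred_boxes,
                                arm_compute::Tensor &deltas)
    {
        using namespace arm_compute;
        NEBoundingBoxTransform bbt;
        // Single-image prediction: width, height and scale travel in BoundingBoxTransformInfo.
        bbt.configure(&boxes, &pred_boxes, &deltas, BoundingBoxTransformInfo(800.f, 600.f, 1.f));
        bbt.run();
    }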
diff --git a/arm_compute/runtime/NEON/functions/NEBox3x3.h b/arm_compute/runtime/NEON/functions/NEBox3x3.h
deleted file mode 100644
index c382ea9114..0000000000
--- a/arm_compute/runtime/NEON/functions/NEBox3x3.h
+++ /dev/null
@@ -1,58 +0,0 @@
-/*
- * Copyright (c) 2017-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_NEBOX3x3_H
-#define ARM_COMPUTE_NEBOX3x3_H
-
-#include "arm_compute/core/Types.h"
-#include "arm_compute/runtime/NEON/INESimpleFunction.h"
-
-#include <cstdint>
-
-namespace arm_compute
-{
-class ITensor;
-
-/** Basic function to execute box filter 3x3. This function calls the following NEON kernels:
- *
- * -# @ref NEFillBorderKernel (executed if border_mode == CONSTANT or border_mode == REPLICATE)
- * -# @ref NEBox3x3Kernel
- *
- */
-class NEBox3x3 : public INESimpleFunction
-{
-public:
- /** Initialise the function's input, output and border mode.
- *
- * @note The border handler is run on the input tensor.
- *
- * @param[in, out] input Source tensor. Data type supported: U8. (Written to only for @p border_mode != UNDEFINED)
- * @param[out] output Destination tensor, Data type supported: U8.
- * @param[in] border_mode Strategy to use for borders.
- * @param[in] constant_border_value (Optional) Constant value to use for borders if border_mode is set to CONSTANT.
- * @param[in] use_fp16 (Optional) If true the FP16 kernels will be used. If false F32 kernels are used.
- */
- void configure(ITensor *input, ITensor *output, BorderMode border_mode, uint8_t constant_border_value = 0, bool use_fp16 = false);
-};
-}
-#endif /*ARM_COMPUTE_NEBOX3x3_H */
diff --git a/arm_compute/runtime/NEON/functions/NECannyEdge.h b/arm_compute/runtime/NEON/functions/NECannyEdge.h
deleted file mode 100644
index 84cc2de6d6..0000000000
--- a/arm_compute/runtime/NEON/functions/NECannyEdge.h
+++ /dev/null
@@ -1,99 +0,0 @@
-/*
- * Copyright (c) 2017-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_NECANNYEDGE_H
-#define ARM_COMPUTE_NECANNYEDGE_H
-
-#include "arm_compute/core/NEON/kernels/NECannyEdgeKernel.h"
-#include "arm_compute/core/NEON/kernels/NEFillBorderKernel.h"
-#include "arm_compute/core/Types.h"
-#include "arm_compute/runtime/IFunction.h"
-#include "arm_compute/runtime/IMemoryManager.h"
-#include "arm_compute/runtime/MemoryGroup.h"
-#include "arm_compute/runtime/Tensor.h"
-
-#include <cstdint>
-#include <memory>
-
-namespace arm_compute
-{
-class ITensor;
-
-/** Basic function to execute canny edge on NEON. This function calls the following NEON kernels and functions:
- *
- * -# @ref NEFillBorderKernel (if border_mode == REPLICATE or border_mode == CONSTANT)
- * -# @ref NESobel3x3 (if gradient_size == 3) or
- * @ref NESobel5x5 (if gradient_size == 5) or
- * @ref NESobel7x7 (if gradient_size == 7)
- * -# @ref NEGradientKernel
- * -# @ref NEEdgeNonMaxSuppressionKernel
- * -# @ref NEEdgeTraceKernel
- *
- */
-class NECannyEdge : public IFunction
-{
-public:
- /** Constructor
- *
- * Initialize Sobel kernel to nullptr.
- *
- * @param[in] memory_manager (Optional) Memory manager.
- */
- NECannyEdge(std::shared_ptr<IMemoryManager> memory_manager = nullptr);
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- NECannyEdge(const NECannyEdge &) = delete;
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- NECannyEdge &operator=(const NECannyEdge &) = delete;
- /** Initialise the function's source, destination, thresholds, gradient size, normalization type and border mode.
- *
- * @param[in, out] input Source tensor. Data type supported: U8. (Written to only for @p border_mode != UNDEFINED)
- * @param[out] output Destination tensor. Data type supported: U8.
-     * @param[in]      upper_thr             Upper threshold used for the hysteresis.
- * @param[in] lower_thr Lower threshold used for the hysteresis.
- * @param[in] gradient_size Gradient size (3, 5 or 7)
- * @param[in] norm_type Normalization type. If 1, L1-Norm otherwise L2-Norm
- * @param[in] border_mode Border mode to use for the convolution.
- * @param[in] constant_border_value (Optional) Constant value to use for borders if border_mode is set to CONSTANT.
- */
- void configure(ITensor *input, ITensor *output, int32_t upper_thr, int32_t lower_thr, int32_t gradient_size, int32_t norm_type, BorderMode border_mode, uint8_t constant_border_value = 0);
-
- // Inherited methods overridden:
- void run() override;
-
-private:
- MemoryGroup _memory_group; /**< Function's memory group */
- std::unique_ptr<IFunction> _sobel; /**< Pointer to Sobel kernel */
- std::unique_ptr<INEKernel> _gradient; /**< Gradient kernel */
- NEEdgeNonMaxSuppressionKernel _non_max_suppr; /**< Non-Maxima suppression kernel */
- NEEdgeTraceKernel _edge_trace; /**< Edge tracing kernel */
- NEFillBorderKernel _border_mag_gradient; /**< Fill border on magnitude tensor kernel */
- NEFillBorderKernel _border_edge_trace; /**< Fill border before edge trace */
- Tensor _gx; /**< Source tensor - Gx component */
- Tensor _gy; /**< Source tensor - Gy component */
- Tensor _magnitude; /**< Source tensor - Magnitude */
- Tensor _phase; /**< Source tensor - Phase */
- Tensor _nonmax; /**< Source tensor - Non-Maxima suppressed */
- ITensor *_output; /**< Output tensor provided by the user. */
-};
-}
-#endif /* ARM_COMPUTE_NECANNYEDGE_H */
diff --git a/arm_compute/runtime/NEON/functions/NECast.h b/arm_compute/runtime/NEON/functions/NECast.h
index 55c21a01ec..43cae777f6 100644
--- a/arm_compute/runtime/NEON/functions/NECast.h
+++ b/arm_compute/runtime/NEON/functions/NECast.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019-2020 ARM Limited.
+ * Copyright (c) 2019-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -25,47 +25,73 @@
#define ARM_COMPUTE_NECAST_H
#include "arm_compute/core/Types.h"
-#include "arm_compute/runtime/NEON/INESimpleFunction.h"
+#include "arm_compute/runtime/IFunction.h"
+
+#include <memory>
namespace arm_compute
{
class ITensor;
+class ITensorInfo;
-/** Basic function to run @ref NEDepthConvertLayerKernel.
+/** Basic function to run @ref cpu::kernels::CpuCastKernel.
 * This function ignores the scale and zeroPoint of quantized tensors, so QASYMM8 input is treated as uint8 values.
*/
-class NECast : public INESimpleFunction
+class NECast : public IFunction
{
public:
+ /** Constructor */
+ NECast();
+ /** Destructor */
+ ~NECast();
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ NECast(const NECast &) = delete;
+ /** Default move constructor */
+ NECast(NECast &&);
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ NECast &operator=(const NECast &) = delete;
+ /** Default move assignment operator */
+ NECast &operator=(NECast &&);
/** Initialize the function's source, destination
*
- * Input data type must be different than output data type.
+ * Valid data layouts:
+ * - All
*
- * Valid conversions Input -> Output :
+ * Valid data type configurations:
+ * |src |dst |
+ * |:--------------|:-----------------------------------------------|
+ * |QASYMM8_SIGNED | S16, S32, F32, F16 |
+ * |QASYMM8 | U16, S16, S32, F32, F16 |
+ * |U8 | U16, S16, S32, F32, F16 |
+ * |U16 | U8, U32 |
+ * |S16 | QASYMM8_SIGNED, U8, S32 |
+ * |F16 | QASYMM8_SIGNED, QASYMM8, F32, S32, U8 |
+ * |S32 | QASYMM8_SIGNED, QASYMM8, F16, F32, U8 |
+ * |F32 | QASYMM8_SIGNED, QASYMM8, BFLOAT16, F16, S32, U8|
*
- * - QASYMM8_SIGNED -> S16, S32, F32, F16
- * - QASYMM8 -> U16, S16, S32, F32, F16
- * - U8 -> U16, S16, S32, F32, F16
- * - U16 -> U8, U32
- * - S16 -> QASYMM8_SIGNED, U8, S32
- * - F16 -> QASYMM8_SIGNED, QASYMM8, F32, S32, U8
- * - S32 -> QASYMM8_SIGNED, QASYMM8, F16, F32, U8
- * - F32 -> QASYMM8_SIGNED, QASYMM8, F16, S32, U8
+     * The input data type must be different from the output data type.
*
* @param[in] input The input tensor to convert. Data types supported: QASYMM8_SIGNED/QASYMM8/U8/U16/S16/F16/S32/F32.
- * @param[out] output The output tensor. Data types supported: QASYMM8_SIGNED/QASYMM8/U8/S8/U16/S16/U32/S32/F16/F32.
+ * @param[out] output The output tensor. Data types supported: QASYMM8_SIGNED/QASYMM8/U8/S8/U16/S16/U32/S32/BFLOAT16/F16/F32.
* @param[in] policy Conversion policy.
*/
void configure(ITensor *input, ITensor *output, ConvertPolicy policy);
/** Static function to check if given info will lead to a valid configuration of @ref NECast
*
* @param[in] input Source tensor info. Data types supported: QASYMM8_SIGNED/QASYMM8/U8/U16/S16/F16/S32/F32.
- * @param[in] output Destination tensor info. Data type supported: QASYMM8_SIGNED/QASYMM8/U8/S8/U16/S16/U32/S32/F16/F32.
+ * @param[in] output Destination tensor info. Data type supported: QASYMM8_SIGNED/QASYMM8/U8/S8/U16/S16/U32/S32/BFLOAT16/F16/F32.
* @param[in] policy Conversion policy.
*
* @return a status
*/
- static Status validate(ITensorInfo *input, ITensorInfo *output, ConvertPolicy policy);
+ static Status validate(const ITensorInfo *input, const ITensorInfo *output, ConvertPolicy policy);
+
+ // Inherited methods overridden
+ void run() override;
+
+private:
+ struct Impl;
+ std::unique_ptr<Impl> _impl;
};
} // namespace arm_compute
#endif /*ARM_COMPUTE_NECAST_H*/
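A sketch of the reworked NECast, using one of the conversions listed in the table above (illustrative; helper name assumed):

    #include "arm_compute/runtime/NEON/functions/NECast.h"
    #include "arm_compute/runtime/Tensor.h"

    void example_cast(arm_compute::Tensor &src_f32, arm_compute::Tensor &dst_s32)
    {
        using namespace arm_compute;
        NECast cast;
        // F32 -> S32; the scale and zero-point of quantized tensors would be ignored.
        cast.configure(&src_f32, &dst_s32, ConvertPolicy::SATURATE);
        cast.run();
    }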
diff --git a/arm_compute/runtime/NEON/functions/NEChannelCombine.h b/arm_compute/runtime/NEON/functions/NEChannelCombine.h
deleted file mode 100644
index ba159160e0..0000000000
--- a/arm_compute/runtime/NEON/functions/NEChannelCombine.h
+++ /dev/null
@@ -1,58 +0,0 @@
-/*
- * Copyright (c) 2017-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_NECHANNELCOMBINE_H
-#define ARM_COMPUTE_NECHANNELCOMBINE_H
-
-#include "arm_compute/runtime/NEON/INESimpleFunctionNoBorder.h"
-
-namespace arm_compute
-{
-class IMultiImage;
-class ITensor;
-using IImage = ITensor;
-
-/**Basic function to run @ref NEChannelCombineKernel to perform channel combination. */
-class NEChannelCombine : public INESimpleFunctionNoBorder
-{
-public:
- /** Initialize function's inputs and outputs.
- *
- * @param[in] plane0 The 2D plane that forms channel 0. Data type supported: U8
- * @param[in] plane1 The 2D plane that forms channel 1. Data type supported: U8
- * @param[in] plane2 The 2D plane that forms channel 2. Data type supported: U8
- * @param[in] plane3 The 2D plane that forms channel 3. Data type supported: U8
- * @param[out] output The single planar output tensor. Formats supported: RGB888/RGBA8888/UYVY422/YUYV422
- */
- void configure(const ITensor *plane0, const ITensor *plane1, const ITensor *plane2, const ITensor *plane3, ITensor *output);
- /** Initialize function's inputs and outputs.
- *
- * @param[in] plane0 The 2D plane that forms channel 0. Data type supported: U8
- * @param[in] plane1 The 2D plane that forms channel 1. Data type supported: U8
- * @param[in] plane2 The 2D plane that forms channel 2. Data type supported: U8
- * @param[out] output The multi planar output image. Formats supported: NV12/NV21/IYUV/YUV444
- */
- void configure(const IImage *plane0, const IImage *plane1, const IImage *plane2, IMultiImage *output);
-};
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_NECHANNELCOMBINE_H*/
diff --git a/arm_compute/runtime/NEON/functions/NEChannelExtract.h b/arm_compute/runtime/NEON/functions/NEChannelExtract.h
deleted file mode 100644
index 96ba1c8ecc..0000000000
--- a/arm_compute/runtime/NEON/functions/NEChannelExtract.h
+++ /dev/null
@@ -1,56 +0,0 @@
-/*
- * Copyright (c) 2016-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_NECHANNELEXTRACT_H
-#define ARM_COMPUTE_NECHANNELEXTRACT_H
-
-#include "arm_compute/core/Types.h"
-#include "arm_compute/runtime/NEON/INESimpleFunctionNoBorder.h"
-
-namespace arm_compute
-{
-class IMultiImage;
-class ITensor;
-using IImage = ITensor;
-
-/**Basic function to run @ref NEChannelExtractKernel to perform channel extraction. */
-class NEChannelExtract : public INESimpleFunctionNoBorder
-{
-public:
- /** Initialize the function's source, destination
- *
-     * @param[in]  input   The input tensor to extract the channel from. Formats supported: RGB888/RGBA8888/YUYV422/UYVY422
- * @param[in] channel The channel to extract.
- * @param[out] output The extracted channel. Format supported: U8
- */
- void configure(const ITensor *input, Channel channel, ITensor *output);
- /** Initialize the function's source, destination
- *
- * @param[in] input The multi-planar input image to extract channel from. Formats supported: NV12/NV21/IYUV/YUV444
- * @param[in] channel The channel to extract.
- * @param[out] output The extracted channel. Format supported: U8
- */
- void configure(const IMultiImage *input, Channel channel, IImage *output);
-};
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_NECHANNELEXTRACT_H*/
diff --git a/arm_compute/runtime/NEON/functions/NEChannelShuffleLayer.h b/arm_compute/runtime/NEON/functions/NEChannelShuffleLayer.h
index 716518a8da..bc19e1a4af 100644
--- a/arm_compute/runtime/NEON/functions/NEChannelShuffleLayer.h
+++ b/arm_compute/runtime/NEON/functions/NEChannelShuffleLayer.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018-2019 ARM Limited.
+ * Copyright (c) 2018-2021 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -24,12 +24,14 @@
#ifndef ARM_COMPUTE_NECHANNELSHUFFLELAYER_H
#define ARM_COMPUTE_NECHANNELSHUFFLELAYER_H
+#include "arm_compute/core/Types.h"
#include "arm_compute/runtime/NEON/INESimpleFunctionNoBorder.h"
namespace arm_compute
{
// Forward declarations
class ITensor;
+class ITensorInfo;
/** Basic function to run @ref NEChannelShuffleLayerKernel
*
@@ -42,6 +44,15 @@ class NEChannelShuffleLayer : public INESimpleFunctionNoBorder
public:
/** Initialize the function
*
+ * Valid data layouts:
+ * - NCHW
+ * - NHWC
+ *
+ * Valid data type configurations:
+ * |src |dst |
+ * |:--------------|:--------------|
+ * |All |All |
+ *
* @param[in] input Input tensor. Data types supported: All
* @param[out] output Output tensor. Data type supported: Same as @p input
* @param[in] num_groups Number of groups. Must be greater than 1 and the number of channels of the tensors must be a multiple of the number of groups.
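A sketch of NEChannelShuffleLayer (illustrative; the group count is an assumption and must divide the channel count):

    #include "arm_compute/runtime/NEON/functions/NEChannelShuffleLayer.h"
    #include "arm_compute/runtime/Tensor.h"

    void example_channel_shuffle(arm_compute::Tensor &src, arm_compute::Tensor &dst)
    {
        arm_compute::NEChannelShuffleLayer shuffle;
        // num_groups must be greater than 1 and divide the channel count (e.g. 64 channels, 4 groups).
        shuffle.configure(&src, &dst, /* num_groups */ 4);
        shuffle.run();
    }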
diff --git a/arm_compute/runtime/NEON/functions/NECol2Im.h b/arm_compute/runtime/NEON/functions/NECol2Im.h
deleted file mode 100644
index 5da0b91766..0000000000
--- a/arm_compute/runtime/NEON/functions/NECol2Im.h
+++ /dev/null
@@ -1,60 +0,0 @@
-/*
- * Copyright (c) 2017-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_NECOL2IM_H
-#define ARM_COMPUTE_NECOL2IM_H
-
-#include "arm_compute/runtime/NEON/INESimpleFunctionNoBorder.h"
-
-#include "arm_compute/core/Size2D.h"
-#include "arm_compute/core/Types.h"
-
-namespace arm_compute
-{
-class ITensor;
-
-/** Basic function to run @ref NECol2Im */
-class NECol2Im : public INESimpleFunctionNoBorder
-{
-public:
- /** Configure the col2im NEON kernel
- *
- * @param[in] input The input tensor to convert. Data types supported: All
- * @param[out] output The output tensor. 3 lower dimensions represent a single output [width, height, OFM],
- * while the rest represent batch of outputs. Data types supported: Same as @p input
- * @param[in] convolved_dims Output convolved dimensions.
- */
- void configure(const ITensor *input, ITensor *output, const Size2D &convolved_dims);
- /** Static function to check if given info will lead to a valid configuration of @ref NECol2Im
- *
- * @param[in] input The input tensor to convert. Data types supported: All
- * @param[in] output The output tensor. 3 lower dimensions represent a single output [width, height, OFM],
- * while the rest represent batch of outputs. Data types supported: Same as @p input
- * @param[in] convolved_dims Output convolved dimensions.
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *input, const ITensorInfo *output, const Size2D &convolved_dims);
-};
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_NECOL2IM_H */
diff --git a/arm_compute/runtime/NEON/functions/NEColorConvert.h b/arm_compute/runtime/NEON/functions/NEColorConvert.h
deleted file mode 100644
index ee76db2787..0000000000
--- a/arm_compute/runtime/NEON/functions/NEColorConvert.h
+++ /dev/null
@@ -1,67 +0,0 @@
-/*
- * Copyright (c) 2016-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_NECOLORCONVERT_H
-#define ARM_COMPUTE_NECOLORCONVERT_H
-
-#include "arm_compute/runtime/NEON/INESimpleFunctionNoBorder.h"
-
-namespace arm_compute
-{
-class ITensor;
-class IMultiImage;
-using IImage = ITensor;
-
-/**Basic function to run @ref NEColorConvertKernel to perform color conversion */
-class NEColorConvert : public INESimpleFunctionNoBorder
-{
-public:
- /** Initialize the function's source, destination
- *
- * @param[in] input Source tensor. Formats supported: RGBA8888/UYVY422/YUYV422/RGB888
- * @param[out] output Destination tensor. Formats supported: RGB888 (if the formats of @p input are RGBA8888/UYVY422/YUYV422),
- * RGBA8888 (if the formats of @p input are UYVY422/YUYV422/RGB888/),
- * U8 (if the formats of @p input is RGB888)
- */
- void configure(const ITensor *input, ITensor *output);
- /** Initialize the function's source, destination
- *
- * @param[in] input Multi-planar source image. Formats supported: NV12/NV21/IYUV
- * @param[out] output Single-planar destination image. Formats supported: RGB888/RGBA8888
- */
- void configure(const IMultiImage *input, IImage *output);
- /** Initialize the function's source, destination
- *
- * @param[in] input Single-planar source image. Formats supported: RGB888/RGBA8888/UYVY422/YUYV422
- * @param[out] output Multi-planar destination image. Formats supported: NV12/IYUV/YUV444 (if the formats of @p input are RGB888/RGB8888)
- */
- void configure(const IImage *input, IMultiImage *output);
- /** Initialize the function's source, destination
- *
- * @param[in] input Multi-planar source image. Formats supported: NV12/NV21/IYUV
- * @param[out] output Multi-planar destination image. Formats supported: YUV444/IYUV (if the formats of @p input are NV12/NV21)/NV12 (if the format of @p input is IYUV)
- */
- void configure(const IMultiImage *input, IMultiImage *output);
-};
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_NECOLORCONVERT_H*/
diff --git a/arm_compute/runtime/NEON/functions/NEComputeAllAnchors.h b/arm_compute/runtime/NEON/functions/NEComputeAllAnchors.h
deleted file mode 100644
index 09c0c9d985..0000000000
--- a/arm_compute/runtime/NEON/functions/NEComputeAllAnchors.h
+++ /dev/null
@@ -1,62 +0,0 @@
-/*
- * Copyright (c) 2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_NECOMPUTEALLANCHORS_H
-#define ARM_COMPUTE_NECOMPUTEALLANCHORS_H
-
-#include "arm_compute/core/NEON/kernels/NEGenerateProposalsLayerKernel.h"
-#include "arm_compute/runtime/NEON/INESimpleFunction.h"
-
-namespace arm_compute
-{
-class ITensor;
-
-/** Basic function to run @ref NEComputeAllAnchorsKernel.
- *
- * This function calls the following NEON kernels:
- * -# @ref NEComputeAllAnchorsKernel
- */
-class NEComputeAllAnchors : public INESimpleFunction
-{
-public:
- /** Set the input and output tensors.
- *
- * @param[in] anchors Source tensor. Original set of anchors of size (4, A) where A is the number of anchors. Data types supported: F16/F32
- * @param[out] all_anchors Destination tensor. Destination anchors of size (4, H*W*A) where H and W are the height and width of the feature map and A is the number of anchors. Data types supported: Same as @p input
- * @param[in] info Contains Compute Anchors operation information described in @ref ComputeAnchorsInfo
- *
- */
- void configure(const ITensor *anchors, ITensor *all_anchors, const ComputeAnchorsInfo &info);
-
- /** Static function to check if given info will lead to a valid configuration of @ref NEComputeAllAnchorsKernel
- *
- * @param[in] anchors Source tensor info. Original set of anchors of size (4, A) where A is the number of anchors. Data types supported: F16/F32
- * @param[in] all_anchors Destination tensor info. Destination anchors of size (4, H*W*A) where H and W are the height and width of the feature map and A is the number of anchors. Data types supported: Same as @p input
- * @param[in] info Contains Compute Anchors operation information described in @ref ComputeAnchorsInfo
- *
- * @return a Status
- */
- static Status validate(const ITensorInfo *anchors, const ITensorInfo *all_anchors, const ComputeAnchorsInfo &info);
-};
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_NECOMPUTEALLANCHORS_H */
diff --git a/arm_compute/runtime/NEON/functions/NEConcatenateLayer.h b/arm_compute/runtime/NEON/functions/NEConcatenateLayer.h
index 8207589680..1600f85488 100644
--- a/arm_compute/runtime/NEON/functions/NEConcatenateLayer.h
+++ b/arm_compute/runtime/NEON/functions/NEConcatenateLayer.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018-2019 ARM Limited.
+ * Copyright (c) 2018-2021 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -24,14 +24,10 @@
#ifndef ARM_COMPUTE_NECONCATENATELAYER_H
#define ARM_COMPUTE_NECONCATENATELAYER_H
-#include "arm_compute/runtime/IFunction.h"
-
-#include "arm_compute/core/NEON/INEKernel.h"
#include "arm_compute/core/Types.h"
-#include "arm_compute/core/utils/misc/Requires.h"
+#include "arm_compute/runtime/IFunction.h"
#include <memory>
-#include <vector>
namespace arm_compute
{
@@ -40,33 +36,49 @@ class ITensor;
class ITensorInfo;
class Status;
-/** Basic function to execute concatenate tensors along a given axis. This function calls the following kernels:
- *
- * -# @ref NEWidthConcatenateLayerKernel (if underlying concatenation axis is 0).
- * -# @ref NEHeightConcatenateLayerKernel (if underlying concatenation axis is 1).
- * -# @ref NEDepthConcatenateLayerKernel (if underlying concatenation axis is 2).
- * -# @ref NEBatchConcatenateLayerKernel (if underlying concatenation axis is 3).
- */
+/** Basic function to concatenate tensors along a given axis */
class NEConcatenateLayer : public IFunction
{
public:
/** Default constructor */
NEConcatenateLayer();
+ /** Destructor */
+ ~NEConcatenateLayer();
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ NEConcatenateLayer(const NEConcatenateLayer &) = delete;
+ /** Default move constructor */
+ NEConcatenateLayer(NEConcatenateLayer &&);
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ NEConcatenateLayer &operator=(const NEConcatenateLayer &) = delete;
+ /** Default move assignment operator */
+ NEConcatenateLayer &operator=(NEConcatenateLayer &&);
/** Initialise the kernel's inputs vector and output.
*
+ * Valid data layouts:
+ * - All
+ *
+ * Valid data type configurations:
+ * |src |dst |
+ * |:--------------|:--------------|
+ * |QASYMM8 |QASYMM8 |
+ * |QASYMM8_SIGNED |QASYMM8_SIGNED |
+ * |F16 |F16 |
+ * |F32 |F32 |
+ *
     * @note Preconditions on the input and output tensor dimensions differ depending on the concatenation axis.
- * @note Preconditions can be found respectively at @ref NEWidthConcatenateLayerKernel, @ref NEHeightConcatenateLayerKernel and @ref NEDepthConcatenateLayerKernel.
+ * @note Preconditions can be found respectively at @ref cpu::kernels::CpuConcatenateWidthKernel, @ref cpu::kernels::CpuConcatenateHeightKernel,
+ * @ref cpu::kernels::CpuConcatenateDepthKernel and @ref cpu::kernels::CpuConcatenateBatchKernel.
*
     * @param[in,out] inputs_vector The vector containing all the tensors to concatenate. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32.
* @param[out] output Output tensor. Data types supported: Same as @p input.
* @param[in] axis Concatenation axis. Supported underlying concatenation axis are 0, 1, 2 and 3.
*/
- void configure(std::vector<ITensor *> inputs_vector, ITensor *output, size_t axis);
void configure(std::vector<const ITensor *> inputs_vector, ITensor *output, size_t axis);
/** Static function to check if given info will lead to a valid configuration of @ref NEConcatenateLayer
*
     * @note Preconditions on the input and output tensor dimensions differ depending on the concatenation axis.
- * @note Preconditions can be found respectively at @ref NEWidthConcatenateLayerKernel, @ref NEHeightConcatenateLayerKernel and @ref NEDepthConcatenateLayerKernel.
+ * @note Preconditions can be found respectively at @ref cpu::kernels::CpuConcatenateWidthKernel, @ref cpu::kernels::CpuConcatenateHeightKernel,
+ * @ref cpu::kernels::CpuConcatenateDepthKernel and @ref cpu::kernels::CpuConcatenateBatchKernel.
*
     * @param[in] inputs_vector The vector containing the info of all the tensors to concatenate. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32.
* @param[in] output Output tensor info. Data types supported: Same as @p input.
@@ -74,23 +86,15 @@ public:
*
* @return a status
*/
- static Status validate(const std::vector<ITensorInfo *> &inputs_vector, const ITensorInfo *output, size_t axis);
- static Status validate(const std::vector<const ITensorInfo *> &inputs_vector, const ITensorInfo *output, size_t axis);
+ static Status
+ validate(const std::vector<const ITensorInfo *> &inputs_vector, const ITensorInfo *output, size_t axis);
// Inherited methods overridden:
void run() override;
private:
- template <typename TensorType, REQUIRES_TA(std::is_same<typename std::remove_cv<TensorType>::type, ITensor>::value)>
- void configure_internal(std::vector<TensorType *> &&inputs_vector, ITensor *output, size_t axis);
-
- template <typename TensorInfoType, REQUIRES_TA(std::is_same<typename std::remove_cv<TensorInfoType>::type, ITensorInfo>::value)>
- static Status validate_internal(const std::vector<TensorInfoType *> &inputs_vector, const ITensorInfo *output, size_t axis);
-
-private:
- std::vector<std::unique_ptr<INEKernel>> _concat_kernels;
- unsigned int _num_inputs;
- unsigned int _axis;
+ struct Impl;
+ std::unique_ptr<Impl> _impl;
};
} // namespace arm_compute
#endif /* ARM_COMPUTE_NECONCATENATELAYER_H */
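
For reference, a minimal usage sketch of the const-qualified configure() overload retained by this change. This is not part of the patch; the helper name, tensor shapes and axis are illustrative only.

#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/runtime/Tensor.h"
#include "arm_compute/runtime/NEON/functions/NEConcatenateLayer.h"

#include <vector>

void concat_two_tensors() // hypothetical helper
{
    using namespace arm_compute;

    Tensor a, b, dst;
    a.allocator()->init(TensorInfo(TensorShape(8U, 4U), 1, DataType::F32));
    b.allocator()->init(TensorInfo(TensorShape(8U, 4U), 1, DataType::F32));

    // configure() derives the output info; axis 0 (width) yields a 16x4 result.
    NEConcatenateLayer concat;
    std::vector<const ITensor *> srcs{&a, &b};
    concat.configure(srcs, &dst, 0);

    a.allocator()->allocate();
    b.allocator()->allocate();
    dst.allocator()->allocate();

    concat.run();
}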
diff --git a/arm_compute/runtime/NEON/functions/NEConv3D.h b/arm_compute/runtime/NEON/functions/NEConv3D.h
new file mode 100644
index 0000000000..525f37f3e7
--- /dev/null
+++ b/arm_compute/runtime/NEON/functions/NEConv3D.h
@@ -0,0 +1,100 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_NECONV3D_H
+#define ARM_COMPUTE_NECONV3D_H
+
+#include "arm_compute/core/ITensorInfo.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/FunctionDescriptors.h"
+#include "arm_compute/runtime/IFunction.h"
+
+#include <memory>
+
+namespace arm_compute
+{
+// Forward declarations
+class ITensor;
+
+/** Basic function to simulate a 3d convolution. This function calls one of the following functions:
+ * -# @ref cpu::CpuDirectConv3d
+ *
+ */
+class NEConv3D : public IFunction
+{
+public:
+ /** Constructor */
+ NEConv3D();
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ NEConv3D(const NEConv3D &) = delete;
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ NEConv3D &operator=(const NEConv3D &) = delete;
+ /** Default move constructor */
+ NEConv3D(NEConv3D &&) = default;
+ /** Prevent instances of this class from being moved (As this class contains non movable objects) */
+ NEConv3D &operator=(NEConv3D &&) = default;
+ /** Default destructor */
+ ~NEConv3D();
+ /** Set the input and output tensors.
+ *
+ * Valid data layouts:
+ * - NDHWC
+ *
+ * Valid data type configurations:
+ * |src0 |src1 |src2 |dst |
+ * |:--------------|:------------------|:------|:--------------|
+ * |F16 |F16 |F16 |F16 |
+ * |F32 |F32 |F32 |F32 |
+ * |QASYMM8 |QASYMM8 |S32 |QASYMM8 |
+ * |QASYMM8_SIGNED |QASYMM8_SIGNED |S32 |QASYMM8_SIGNED |
+ *
+ * @param[in] input Source tensor. 4 lower dimensions represent a single input [IFM, width, height, depth],
+ * while every optional dimension from 5 and above represent a batch of inputs.
+ * @param[in] weights Weights tensor. Weights are 5D tensor with dimensions [OFM, IFM, kernel_x, kernel_y, kernel_z].
+ * @param[in] biases Biases tensor. Shared biases supported. Biases are 1D tensor with dimensions [OFM].
+ * @param[out] output Destination tensor. 4 lower dimensions represent a single output [OFM, width, height, depth], while the rest represent batch of outputs.
+ * @param[in]  conv_info Contains padding, stride and activation information described in @ref Conv3dInfo.
+ */
+ void configure(
+ ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, const Conv3dInfo &conv_info);
+ /** Static function to check if given info will lead to a valid configuration
+ *
+ * Similar to NEConv3D::configure()
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *input,
+ const ITensorInfo *weights,
+ const ITensorInfo *biases,
+ const ITensorInfo *output,
+ const Conv3dInfo &conv_info);
+
+ // Inherited methods overridden:
+ void run() override;
+
+private:
+ struct Impl;
+ std::unique_ptr<Impl> _impl;
+};
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_NECONV3D_H */
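
A hedged sketch of how the new function might be driven, not taken from the patch. Shapes follow the NDHWC conventions documented above; the helper name is hypothetical and Conv3dInfo is assumed to default-construct to unit strides, unit dilation and no padding.

#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/runtime/FunctionDescriptors.h"
#include "arm_compute/runtime/Tensor.h"
#include "arm_compute/runtime/NEON/functions/NEConv3D.h"

void conv3d_f32() // hypothetical helper
{
    using namespace arm_compute;

    // NDHWC shapes: src [C, W, H, D, N], weights [OFM, IFM, kernel_x, kernel_y, kernel_z], biases [OFM].
    TensorInfo src_info(TensorShape(4U, 8U, 8U, 8U, 1U), 1, DataType::F32);
    TensorInfo wei_info(TensorShape(16U, 4U, 3U, 3U, 3U), 1, DataType::F32);
    src_info.set_data_layout(DataLayout::NDHWC);
    wei_info.set_data_layout(DataLayout::NDHWC);

    Tensor src, weights, biases, dst;
    src.allocator()->init(src_info);
    weights.allocator()->init(wei_info);
    biases.allocator()->init(TensorInfo(TensorShape(16U), 1, DataType::F32));

    Conv3dInfo conv_info{}; // assumed defaults: unit strides/dilation, no padding

    NEConv3D conv;
    conv.configure(&src, &weights, &biases, &dst, conv_info);

    src.allocator()->allocate();
    weights.allocator()->allocate();
    biases.allocator()->allocate();
    dst.allocator()->allocate();
    conv.run();
}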
diff --git a/arm_compute/runtime/NEON/functions/NEConvertFullyConnectedWeights.h b/arm_compute/runtime/NEON/functions/NEConvertFullyConnectedWeights.h
index 42f787090e..dc6b22d717 100644
--- a/arm_compute/runtime/NEON/functions/NEConvertFullyConnectedWeights.h
+++ b/arm_compute/runtime/NEON/functions/NEConvertFullyConnectedWeights.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018-2019 ARM Limited.
+ * Copyright (c) 2018-2021 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -24,31 +24,49 @@
#ifndef ARM_COMPUTE_NECONVERTFULLYCONNECTEDWEIGHTS_H
#define ARM_COMPUTE_NECONVERTFULLYCONNECTEDWEIGHTS_H
-#include "arm_compute/core/NEON/kernels/NEConvertFullyConnectedWeightsKernel.h"
+#include "arm_compute/core/Types.h"
#include "arm_compute/runtime/IFunction.h"
-#include "arm_compute/runtime/ITransformWeights.h"
-#include "arm_compute/runtime/NEON/NEScheduler.h"
-#include "arm_compute/runtime/Tensor.h"
namespace arm_compute
{
// Forward declarations
class ITensor;
+class ITensorInfo;
-/** Basic function to run @ref NEConvertFullyConnectedWeightsKernel. */
+/** Basic function to run @ref cpu::kernels::CpuConvertFullyConnectedWeightsKernel. */
class NEConvertFullyConnectedWeights : public IFunction
{
public:
/** Default constructor */
NEConvertFullyConnectedWeights();
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ NEConvertFullyConnectedWeights(const NEConvertFullyConnectedWeights &) = delete;
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ NEConvertFullyConnectedWeights &operator=(const NEConvertFullyConnectedWeights &) = delete;
+ /** Prevent instances of this class from being moved (As this class contains non movable objects) */
+ NEConvertFullyConnectedWeights(NEConvertFullyConnectedWeights &&) = delete;
+ /** Prevent instances of this class from being moved (As this class contains non movable objects) */
+ NEConvertFullyConnectedWeights &operator=(NEConvertFullyConnectedWeights &&) = delete;
+ /** Default destructor */
+ ~NEConvertFullyConnectedWeights();
/** Initialize the function.
*
+ * Valid data layouts:
+ * - NHWC
+ * - NCHW
+ *
+ * Valid data type configurations:
+ * |src |dst |
+ * |:--------------|:--------------|
+ * |All |All |
+ *
* @param[in] input Source weights tensor to convert. Must be 2 dimensional. Data types supported: All.
* @param[out] output The converted weights tensor. Shape and Data Type: Same as @p input.
* @param[in] original_input_shape Shape of the original input tensor (the one entering fully connected layer).
* @param[in] data_layout The data layout the weights have been trained in.
*/
- void configure(const ITensor *input, ITensor *output, const TensorShape &original_input_shape, DataLayout data_layout);
+ void
+ configure(const ITensor *input, ITensor *output, const TensorShape &original_input_shape, DataLayout data_layout);
/** Static function to check if given info will lead to a valid configuration of @ref NEConvertFullyConnectedWeights
*
* @param[in] input Source weights tensor info to convert. Must be 2 dimensional. Data types supported: All.
@@ -58,53 +76,17 @@ public:
*
* @return A Status
*/
- static Status validate(const ITensorInfo *input, const ITensorInfo *output, const TensorShape &original_input_shape, DataLayout data_layout);
+ static Status validate(const ITensorInfo *input,
+ const ITensorInfo *output,
+ const TensorShape &original_input_shape,
+ DataLayout data_layout);
    // Inherited methods overridden:
void run() override;
private:
- NEConvertFullyConnectedWeightsKernel _kernel;
-};
-
-namespace weights_transformations
-{
-/** Basic function to run @ref NEConvertFullyConnectedWeightsKernel. */
-class NEConvertFullyConnectedWeightsManaged : public ITransformWeights
-{
-public:
- void run() override
- {
- _output.allocator()->allocate();
- _func.run();
- _reshape_run = true;
- }
-
- void release() override
- {
- _output.allocator()->free();
- }
-
- ITensor *get_weights() override
- {
- return &_output;
- }
-
- uint32_t uid() override
- {
- return _uid;
- }
-
- void configure(const ITensor *input, const TensorShape &original_input_shape, DataLayout data_layout)
- {
- _func.configure(input, &_output, original_input_shape, data_layout);
- }
-
-private:
- static constexpr uint32_t _uid = 0x4;
- Tensor _output{};
- NEConvertFullyConnectedWeights _func{};
+ struct Impl;
+ std::unique_ptr<Impl> _impl;
};
-} // namespace weights_transformations
} // namespace arm_compute
#endif /* ARM_COMPUTE_NECONVERTFULLYCONNECTEDWEIGHTS_H */
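
A minimal usage sketch of the interface kept by this change (not part of the patch; the 2D weight shape and original input shape are illustrative). The original_input_shape is the shape of the tensor that entered the fully connected layer, and data_layout is the layout the weights were trained in.

#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/runtime/Tensor.h"
#include "arm_compute/runtime/NEON/functions/NEConvertFullyConnectedWeights.h"

void convert_fc_weights() // hypothetical helper
{
    using namespace arm_compute;

    // 2D FC weights [num_inputs, num_outputs] = [96, 10]; 96 flattens an
    // original 4x4x6 input that was trained in NCHW.
    Tensor weights, converted;
    weights.allocator()->init(TensorInfo(TensorShape(96U, 10U), 1, DataType::F32));

    NEConvertFullyConnectedWeights convert;
    convert.configure(&weights, &converted, TensorShape(4U, 4U, 6U), DataLayout::NCHW);

    weights.allocator()->allocate();
    converted.allocator()->allocate();
    convert.run();
}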
diff --git a/arm_compute/runtime/NEON/functions/NEConvolution.h b/arm_compute/runtime/NEON/functions/NEConvolution.h
deleted file mode 100644
index c297589013..0000000000
--- a/arm_compute/runtime/NEON/functions/NEConvolution.h
+++ /dev/null
@@ -1,132 +0,0 @@
-/*
- * Copyright (c) 2016-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_NECONVOLUTION_H
-#define ARM_COMPUTE_NECONVOLUTION_H
-
-#include "arm_compute/core/NEON/kernels/NEConvolutionKernel.h"
-#include "arm_compute/core/NEON/kernels/NEFillBorderKernel.h"
-#include "arm_compute/core/Types.h"
-#include "arm_compute/runtime/IFunction.h"
-#include "arm_compute/runtime/IMemoryManager.h"
-#include "arm_compute/runtime/MemoryGroup.h"
-#include "arm_compute/runtime/NEON/INESimpleFunction.h"
-#include "arm_compute/runtime/Tensor.h"
-
-#include <cstdint>
-#include <memory>
-
-namespace arm_compute
-{
-class ITensor;
-
-/** Basic function to execute convolution of size 3x3. This function calls the following NEON kernels:
- *
- * -# @ref NEFillBorderKernel (executed if border_mode == CONSTANT or border_mode == REPLICATE)
- * -# @ref NEConvolution3x3Kernel
- *
- */
-class NEConvolution3x3 : public INESimpleFunction
-{
-public:
- /** Initialize the function's source, destination, conv and border_mode.
- *
- * @param[in,out] input Source tensor. Data type supported: U8. (Written to only for @p border_mode != UNDEFINED)
- * @param[out] output Destination tensor, Data types supported: U8/S16.
- * @param[in] conv Matrix_size x matrix_size S16 coefficients structured as a row-major 2D array in a linear buffer.
- * @param[in] scale Scale of the convolution matrix. If 0 is passed, it will be set to the sum of the coefficients of the convolution or 1 if they add up to 0.
- * @param[in] border_mode Strategy to use for borders.
- * @param[in] constant_border_value (Optional) Constant value to use for borders if border_mode is set to CONSTANT.
- */
- void configure(ITensor *input, ITensor *output, const int16_t *conv, uint32_t scale, BorderMode border_mode, uint8_t constant_border_value = 0);
-};
-
-/** Basic function to execute convolution of size 5x5, 7x7, 9x9. This function calls the following NEON kernels:
- *
- * -# @ref NEFillBorderKernel (executed if border_mode == CONSTANT or border_mode == REPLICATE)
- * -# @ref NEConvolutionKernel or<br/>
- * @ref NESeparableConvolutionHorKernel and @ref NESeparableConvolutionVertKernel (if convolution matrix is separable)
- *
- */
-template <unsigned int matrix_size>
-class NEConvolutionSquare : public IFunction
-{
-public:
- /** Default constructor */
- NEConvolutionSquare(std::shared_ptr<IMemoryManager> memory_manager = nullptr);
- /** Initialize the function's source, destination, conv and border_mode.
- *
- * @param[in,out] input Source tensor. Data type supported: U8. (Written to only for @p border_mode != UNDEFINED)
- * @param[out] output Destination tensor, Data types supported: U8 or S16.
- * @param[in] conv matrix_size x matrix_size S16 coefficients structured as a row-major 2D array in a linear buffer.
- * @param[in] scale Scale of the convolution matrix. If 0 is passed, it will be set to the sum of the coefficients of the convolution or 1 if they add up to 0.
- * @param[in] border_mode Strategy to use for borders.
- * @param[in] constant_border_value (Optional) Constant value to use for borders if border_mode is set to CONSTANT.
- */
- void configure(ITensor *input, ITensor *output, const int16_t *conv, uint32_t scale, BorderMode border_mode, uint8_t constant_border_value = 0);
-
- // Inherited methods overridden:
- void run() override;
-
-private:
- MemoryGroup _memory_group; /**< Function memory group */
- Tensor _tmp; /**< temporary buffer for output of horizontal pass */
- bool _is_separable; /**< true if the convolution can be separated */
- NESeparableConvolutionHorKernel<matrix_size> _kernel_hor; /**< kernel for horizontal pass of separated convolution */
- NESeparableConvolutionVertKernel<matrix_size> _kernel_vert; /**< kernel for vertical pass of separated convolution */
- NEConvolutionKernel<matrix_size> _kernel; /**< kernel for non-separated convolution **/
- NEFillBorderKernel _border_handler; /**< kernel for border handling */
-};
-
-/** Basic function to run 5x5 convolution. */
-using NEConvolution5x5 = NEConvolutionSquare<5>;
-/** Basic function to run 7x7 convolution. */
-using NEConvolution7x7 = NEConvolutionSquare<7>;
-/** Basic function to run 9x9 convolution. */
-using NEConvolution9x9 = NEConvolutionSquare<9>;
-
-/** Basic function to execute non-square convolution. This function calls the following NEON kernels:
- *
- * -# @ref NEFillBorderKernel (executed if border_mode == CONSTANT or border_mode == REPLICATE)
- * -# @ref NEConvolutionRectangleKernel or<br/>
- *
- * @note Convolution rectangle should have dimensions of 3, 5, 7, 9
- */
-class NEConvolutionRectangle : public INESimpleFunction
-{
-public:
- /** Initialize the function's source, destination, conv and border_mode.
- *
- * @param[in,out] input Source tensor. Data type supported: U8. (Written to only for @p border_mode != UNDEFINED)
- * @param[out] output Destination tensor, Data types supported: U8 or S16.
- * @param[in] conv Matrix_size x matrix_size S16 coefficients structured as a row-major 2D array in a linear buffer.
- * @param[in] rows Rows of convolution kernel.
- * @param[in] cols Columns of convolution kernel.
- * @param[in] scale Scale of the convolution matrix. If 0 is passed, it will be set to the sum of the coefficients of the convolution or 1 if they add up to 0.
- * @param[in] border_mode Strategy to use for borders.
- * @param[in] constant_border_value (Optional) Constant value to use for borders if border_mode is set to CONSTANT.
- */
- void configure(ITensor *input, ITensor *output, const int16_t *conv, uint32_t rows, uint32_t cols, uint32_t scale, BorderMode border_mode, uint8_t constant_border_value = 0);
-};
-}
-#endif /*ARM_COMPUTE_NECONVOLUTION_H */
diff --git a/arm_compute/runtime/NEON/functions/NEConvolutionLayer.h b/arm_compute/runtime/NEON/functions/NEConvolutionLayer.h
index b76695b80c..2d07980ade 100644
--- a/arm_compute/runtime/NEON/functions/NEConvolutionLayer.h
+++ b/arm_compute/runtime/NEON/functions/NEConvolutionLayer.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018-2019 ARM Limited.
+ * Copyright (c) 2018-2021, 2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -21,27 +21,26 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
-#ifndef ARM_COMPUTE_NECONVOLUTIONLAYER_H
-#define ARM_COMPUTE_NECONVOLUTIONLAYER_H
-
-#include "arm_compute/runtime/IFunction.h"
+#ifndef ACL_ARM_COMPUTE_RUNTIME_NEON_FUNCTIONS_NECONVOLUTIONLAYER_H
+#define ACL_ARM_COMPUTE_RUNTIME_NEON_FUNCTIONS_NECONVOLUTIONLAYER_H
+#include "arm_compute/core/ITensorInfo.h"
#include "arm_compute/core/Types.h"
+#include "arm_compute/function_info/ActivationLayerInfo.h"
+#include "arm_compute/runtime/IFunction.h"
#include "arm_compute/runtime/MemoryGroup.h"
-#include "arm_compute/runtime/NEON/functions/NEDirectConvolutionLayer.h"
-#include "arm_compute/runtime/NEON/functions/NEFFTConvolutionLayer.h"
-#include "arm_compute/runtime/NEON/functions/NEGEMMConvolutionLayer.h"
-#include "arm_compute/runtime/NEON/functions/NEWinogradConvolutionLayer.h"
+
#include <memory>
namespace arm_compute
{
+// Forward declarations
class ITensor;
-/** Basic function to simulate a convolution layer. This function calls one of the following NEON functions:
- * -# @ref NEGEMMConvolutionLayer (executed only in case GEMM is required for the operation)
- * -# @ref NEWinogradConvolutionLayer (executed only in case Winograd is required for the operation)
- * -# @ref NEDirectConvolutionLayer (executed only in case Direct Convolution is required for the operation)
+/** Basic function to simulate a convolution layer. This function calls one of the following functions:
+ * -# @ref cpu::CpuGemmConv2d (executed only in case GEMM is required for the operation)
+ * -# @ref cpu::CpuWinogradConv2d (executed only in case Winograd is required for the operation)
+ * -# @ref cpu::CpuDirectConv2d (executed only in case Direct Convolution is required for the operation)
* -# @ref NEFFTConvolutionLayer (executed only in case FFT is required for the operation)
*
*
@@ -75,41 +74,74 @@ class NEConvolutionLayer : public IFunction
public:
/** Constructor */
NEConvolutionLayer(std::shared_ptr<IMemoryManager> memory_manager = nullptr);
-
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ NEConvolutionLayer(const NEConvolutionLayer &) = delete;
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ NEConvolutionLayer &operator=(const NEConvolutionLayer &) = delete;
+ /** Default move constructor */
+ NEConvolutionLayer(NEConvolutionLayer &&) = default;
+ /** Prevent instances of this class from being moved (As this class contains non movable objects) */
+ NEConvolutionLayer &operator=(NEConvolutionLayer &&) = default;
+ /** Default destructor */
+ ~NEConvolutionLayer();
/** Set the input and output tensors.
*
+ * Valid data layouts:
+ * - NHWC
+ * - NCHW
+ *
+ * Valid data type configurations:
+ * |src0 |src1 |src2 |dst |
+ * |:--------------|:------------------|:------|:--------------|
+ * |F16 |F16 |F16 |F16 |
+ * |F32 |F32 |F32 |F32 |
+ * |QASYMM8 |QASYMM8 |S32 |QASYMM8 |
+ * |QASYMM8 |QSYMM8_PER_CHANNEL |S32 |QASYMM8 |
+ * |QASYMM8_SIGNED |QASYMM8_SIGNED |S32 |QASYMM8_SIGNED |
+ * |QASYMM8_SIGNED |QSYMM8_PER_CHANNEL |S32 |QASYMM8_SIGNED |
+ *
* @param[in] input Source tensor. 3 lower dimensions represent a single input [width, height, IFM],
* while every optional dimension from 4 and above represent a batch of inputs.
* Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32.
- * @param[in] weights Weights tensor. Weights are 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM]. Data type supported: Same as @p input.
+ * @param[in] weights Weights tensor. Weights are 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM].
+ * Data type supported: Same as @p input, also could be QSYMM8_PER_CHANNEL if input is QASYMM8/QASYMM8_SIGNED.
* @param[in] biases Biases tensor. Shared biases supported. Biases are 1D tensor with dimensions [OFM].
- * Data type supported: Should match @p input data type, except for input of QASYMM8/QASYMM8_SIGNED type where biases should be of S32 type.
+ * Data type supported: Same as @p input, except for input of QASYMM8/QASYMM8_SIGNED type where biases should be of S32 type.
* @param[out] output Destination tensor. 3 lower dimensions represent a single output [width, height, OFM], while the rest represent batch of outputs.
* Data types supported: Same as @p input.
* @param[in] conv_info Contains padding and stride information described in @ref PadStrideInfo.
* @param[in] weights_info Specifies if the weights tensor has been reshaped with NEWeightsReshapeKernel. If this is not part of the fully connected layer the weights
- * tensor has also been transposed with NEGEMMTranspose1xWKernel. Data type supported: Same as @p input.
+ * tensor has also been transposed with cpu::kernels::CpuGemmTranspose1xWKernel. Data type supported: Same as @p input.
* @param[in] dilation (Optional) Dilation, in elements, across x and y. Defaults to (1, 1).
* @param[in] act_info (Optional) Activation layer information in case of a fused activation. Only RELU, BOUNDED_RELU and LU_BOUNDED_RELU supported.
* @param[in] enable_fast_math (Optional) Enable fast math computation. In case this flag were set, the function could dispatch the fastest implementation
* available which may introduce a drop of accuracy as well. Default is false
* @param[in] num_groups (Optional) Number of groups when performing a grouped convolution. num_groups != 1 is not supported
*/
- void configure(ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, const PadStrideInfo &conv_info, const WeightsInfo &weights_info = WeightsInfo(),
- const Size2D &dilation = Size2D(1U, 1U), const ActivationLayerInfo &act_info = ActivationLayerInfo(), bool enable_fast_math = false, unsigned int num_groups = 1);
+ void configure(ITensor *input,
+ const ITensor *weights,
+ const ITensor *biases,
+ ITensor *output,
+ const PadStrideInfo &conv_info,
+ const WeightsInfo &weights_info = WeightsInfo(),
+ const Size2D &dilation = Size2D(1U, 1U),
+ const ActivationLayerInfo &act_info = ActivationLayerInfo(),
+ bool enable_fast_math = false,
+ unsigned int num_groups = 1);
/** Static function to check if given info will lead to a valid configuration of @ref NEConvolutionLayer
*
* @param[in] input Source tensor. 3 lower dimensions represent a single input [width, height, IFM],
* while every optional dimension from 4 and above represent a batch of inputs.
* Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32.
- * @param[in] weights Weights tensor. Weights are 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM]. Data type supported:Same as @p input.
+ * @param[in] weights Weights tensor. Weights are 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM].
+ *                             Data type supported: Same as @p input, also could be QSYMM8_PER_CHANNEL if input is QASYMM8/QASYMM8_SIGNED.
* @param[in] biases Biases tensor. Shared biases supported. Biases are 1D tensor with dimensions [OFM].
- * Data type supported: Should match @p input data type, except for input of QASYMM8/QASYMM8_SIGNED type where biases should be of S32 type.
+ * Data type supported: Same as @p input, except for input of QASYMM8/QASYMM8_SIGNED type where biases should be of S32 type.
* @param[in] output Destination tensor. 3 lower dimensions represent a single output [width, height, OFM], while the rest represent batch of outputs.
* Data types supported: Same as @p input.
* @param[in] conv_info Contains padding and stride information described in @ref PadStrideInfo.
* @param[in] weights_info Specifies if the weights tensor has been reshaped with NEWeightsReshapeKernel. If this is not part of the fully connected layer the weights
- * tensor has also been transposed with NEGEMMTranspose1xWKernel. Data type supported: Same as @p input.
+ * tensor has also been transposed with cpu::kernels::CpuGemmTranspose1xWKernel. Data type supported: Same as @p input.
* @param[in] dilation (Optional) Dilation, in elements, across x and y. Defaults to (1, 1).
* @param[in] act_info (Optional) Activation layer information in case of a fused activation.
* @param[in] enable_fast_math (Optional) Enable fast math computation. In case this flag were set, the function could dispatch the fastest implementation
@@ -118,20 +150,28 @@ public:
*
* @return a status
*/
- static Status validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info,
- const WeightsInfo &weights_info = WeightsInfo(), const Size2D &dilation = Size2D(1U, 1U), const ActivationLayerInfo &act_info = ActivationLayerInfo(), bool enable_fast_math = false,
- unsigned int num_groups = 1);
+ static Status validate(const ITensorInfo *input,
+ const ITensorInfo *weights,
+ const ITensorInfo *biases,
+ const ITensorInfo *output,
+ const PadStrideInfo &conv_info,
+ const WeightsInfo &weights_info = WeightsInfo(),
+ const Size2D &dilation = Size2D(1U, 1U),
+ const ActivationLayerInfo &act_info = ActivationLayerInfo(),
+ bool enable_fast_math = false,
+ unsigned int num_groups = 1);
/** Static function to check if given info will return the convolution called by @ref NEConvolutionLayer
*
* @param[in] input Source tensor. 3 lower dimensions represent a single input [width, height, IFM],
* while every optional dimension from 4 and above represent a batch of inputs.
* Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32.
- * @param[in] weights Weights tensor. Weights are 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM]. Data type supported:Same as @p input.
+ * @param[in] weights Weights tensor. Weights are 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM].
+ *                             Data type supported: Same as @p input, also could be QSYMM8_PER_CHANNEL if input is QASYMM8/QASYMM8_SIGNED.
* @param[in] output Destination tensor. 3 lower dimensions represent a single output [width, height, OFM], while the rest represent batch of outputs.
* Data types supported: Same as @p input.
* @param[in] conv_info Contains padding and stride information described in @ref PadStrideInfo.
* @param[in] weights_info Specifies if the weights tensor has been reshaped with NEWeightsReshapeKernel. If this is not part of the fully connected layer the weights
- * tensor has also been transposed with NEGEMMTranspose1xWKernel. Data type supported: Same as @p input.
+ * tensor has also been transposed with cpu::kernels::CpuGemmTranspose1xWKernel. Data type supported: Same as @p input.
* @param[in] dilation (Optional) Dilation, in elements, across x and y. Defaults to (1, 1).
* @param[in] act_info (Optional) Activation layer information in case of a fused activation.
* @param[in] enable_fast_math (Optional) Enable fast math computation. In case this flag were set, the function could dispatch the fastest implementation
@@ -139,15 +179,21 @@ public:
*
* @return the Convolution Method Hint
*/
- static ConvolutionMethod get_convolution_method(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *output, const PadStrideInfo &conv_info,
- const WeightsInfo &weights_info = WeightsInfo(), const Size2D &dilation = Size2D(1U, 1U), const ActivationLayerInfo &act_info = ActivationLayerInfo(), bool enable_fast_math = false);
+ static ConvolutionMethod get_convolution_method(const ITensorInfo *input,
+ const ITensorInfo *weights,
+ const ITensorInfo *output,
+ const PadStrideInfo &conv_info,
+ const WeightsInfo &weights_info = WeightsInfo(),
+ const Size2D &dilation = Size2D(1U, 1U),
+ const ActivationLayerInfo &act_info = ActivationLayerInfo(),
+ bool enable_fast_math = false);
// Inherited methods overridden:
void run() override;
void prepare() override;
private:
- std::shared_ptr<IMemoryManager> _memory_manager;
- std::unique_ptr<IFunction> _function; /**< Function to run */
+ struct Impl;
+ std::unique_ptr<Impl> _impl;
};
-}
-#endif /* ARM_COMPUTE_NECONVOLUTIONLAYER_H */
\ No newline at end of file
+} // namespace arm_compute
+#endif // ACL_ARM_COMPUTE_RUNTIME_NEON_FUNCTIONS_NECONVOLUTIONLAYER_H
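
A hedged sketch showing the reflowed configure() and get_convolution_method() signatures in use (not part of the patch; the helper name and shapes are illustrative).

#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/runtime/Tensor.h"
#include "arm_compute/runtime/NEON/functions/NEConvolutionLayer.h"

void conv2d_f32() // hypothetical helper
{
    using namespace arm_compute;

    // NCHW-style shapes: src [W, H, IFM], weights [kernel_x, kernel_y, IFM, OFM], biases [OFM].
    Tensor src, weights, biases, dst;
    src.allocator()->init(TensorInfo(TensorShape(32U, 32U, 3U), 1, DataType::F32));
    weights.allocator()->init(TensorInfo(TensorShape(3U, 3U, 3U, 8U), 1, DataType::F32));
    biases.allocator()->init(TensorInfo(TensorShape(8U), 1, DataType::F32));

    const PadStrideInfo conv_info(1, 1, 1, 1); // stride 1x1, pad 1x1: same spatial size

    NEConvolutionLayer conv;
    conv.configure(&src, &weights, &biases, &dst, conv_info);

    // After configuration the output info is set, so the method hint can be queried.
    const ConvolutionMethod hint =
        NEConvolutionLayer::get_convolution_method(src.info(), weights.info(), dst.info(), conv_info);
    (void)hint;

    src.allocator()->allocate();
    weights.allocator()->allocate();
    biases.allocator()->allocate();
    dst.allocator()->allocate();
    conv.run(); // run() triggers prepare() (weight transformation) on first use
}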
diff --git a/arm_compute/runtime/NEON/functions/NECopy.h b/arm_compute/runtime/NEON/functions/NECopy.h
index b03f408ad8..840c03e968 100644
--- a/arm_compute/runtime/NEON/functions/NECopy.h
+++ b/arm_compute/runtime/NEON/functions/NECopy.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018-2019 ARM Limited.
+ * Copyright (c) 2018-2021 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -25,18 +25,41 @@
#define ARM_COMPUTE_NECOPY_H
#include "arm_compute/core/Types.h"
-#include "arm_compute/runtime/NEON/INESimpleFunctionNoBorder.h"
+#include "arm_compute/runtime/IFunction.h"
+
+#include <memory>
namespace arm_compute
{
class ITensor;
+class ITensorInfo;
-/** Basic function to run @ref NECopyKernel */
-class NECopy : public INESimpleFunctionNoBorder
+/** Basic function to run @ref cpu::kernels::CpuCopyKernel */
+class NECopy : public IFunction
{
public:
+ /** Default Constructor */
+ NECopy();
+ /** Default Destructor */
+ ~NECopy();
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ NECopy(const NECopy &) = delete;
+ /** Default move constructor */
+ NECopy(NECopy &&);
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ NECopy &operator=(const NECopy &) = delete;
+ /** Default move assignment operator */
+ NECopy &operator=(NECopy &&);
/** Initialise the function's source and destination.
*
+ * Valid data layouts:
+ * - All
+ *
+ * Valid data type configurations:
+ * |src |dst |
+ * |:--------------|:--------------|
+ * |All |All |
+ *
* @param[in] input Source tensor. Data types supported: All
* @param[out] output Output tensor. Data types supported: Same as @p input.
*
@@ -50,6 +73,13 @@ public:
* @return a status
*/
static Status validate(const ITensorInfo *input, const ITensorInfo *output);
+
+ // Inherited methods overridden
+ void run() override;
+
+private:
+ struct Impl;
+ std::unique_ptr<Impl> _impl;
};
} // namespace arm_compute
#endif /*ARM_COMPUTE_NECOPY_H */
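
A minimal sketch of the unchanged public interface (not part of the patch; shapes and the helper name are illustrative), including the validate-before-configure pattern.

#include "arm_compute/core/Error.h"
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/runtime/Tensor.h"
#include "arm_compute/runtime/NEON/functions/NECopy.h"

void copy_tensor() // hypothetical helper
{
    using namespace arm_compute;

    Tensor src, dst;
    src.allocator()->init(TensorInfo(TensorShape(16U, 16U), 1, DataType::U8));
    dst.allocator()->init(TensorInfo(TensorShape(16U, 16U), 1, DataType::U8));

    // validate() checks the configuration up front from the tensor infos.
    ARM_COMPUTE_ERROR_THROW_ON(NECopy::validate(src.info(), dst.info()));

    NECopy copy;
    copy.configure(&src, &dst);

    src.allocator()->allocate();
    dst.allocator()->allocate();
    copy.run();
}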
diff --git a/arm_compute/runtime/NEON/functions/NECropResize.h b/arm_compute/runtime/NEON/functions/NECropResize.h
index 1c15beded4..f806762158 100644
--- a/arm_compute/runtime/NEON/functions/NECropResize.h
+++ b/arm_compute/runtime/NEON/functions/NECropResize.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019-2020 ARM Limited.
+ * Copyright (c) 2019-2021 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -24,7 +24,6 @@
#ifndef ARM_COMPUTE_NEON_CROP_RESIZE_H
#define ARM_COMPUTE_NEON_CROP_RESIZE_H
-#include "arm_compute/core/NEON/kernels/NECropKernel.h"
#include "arm_compute/runtime/NEON/functions/NEScale.h"
#include <memory>
@@ -32,7 +31,9 @@
namespace arm_compute
{
// Forward Declarations
+class Tensor;
class ITensor;
+class NECropKernel;
/** Function to perform cropping and resizing */
class NECropResize : public IFunction
@@ -49,10 +50,18 @@ public:
/** Allow instances of this class to be moved */
NECropResize &operator=(NECropResize &&) = default;
/** Default destructor */
- virtual ~NECropResize() = default;
+ ~NECropResize();
/** Configure kernel
*
+ * Valid data layouts:
+ * - NHWC
+ *
+ * Valid data type configurations:
+ * |src0 |src1 |src2 |dst |
+ * |:--------|:--------|:------|:--------|
+ * |All |F32 |F32 |F32 |
+ *
* @note Supported tensor rank: up to 4
* @note Box indices may be outside of the bounds, in which case @p extrapolation_value is used.
* @note Start and end indices of boxes are inclusive.
@@ -66,8 +75,13 @@ public:
* @param[in] method The policy to be used when resizing image. Default is bilinear.
* @param[in] extrapolation_value Value to be used for values outside of the image for cropping and resizing. Default is 0.
*/
- void configure(const ITensor *input, const ITensor *boxes, const ITensor *box_ind, ITensor *output, Coordinates2D crop_size,
- InterpolationPolicy method = InterpolationPolicy::BILINEAR, float extrapolation_value = 0);
+ void configure(const ITensor *input,
+ const ITensor *boxes,
+ const ITensor *box_ind,
+ ITensor *output,
+ Coordinates2D crop_size,
+ InterpolationPolicy method = InterpolationPolicy::BILINEAR,
+ float extrapolation_value = 0);
/** Static function to check if given info will lead to a valid configuration of @ref NESlice
*
@@ -87,8 +101,13 @@ public:
*
* @return A status
*/
- static Status validate(const ITensorInfo *input, const ITensorInfo *boxes, const ITensorInfo *box_ind, const ITensorInfo *output,
- Coordinates2D crop_size, InterpolationPolicy method, float extrapolation_value);
+ static Status validate(const ITensorInfo *input,
+ const ITensorInfo *boxes,
+ const ITensorInfo *box_ind,
+ const ITensorInfo *output,
+ Coordinates2D crop_size,
+ InterpolationPolicy method,
+ float extrapolation_value);
void run() override;
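
A hedged usage sketch of the reflowed configure() signature (not part of the patch). Shapes are illustrative; the normalised [y0, x0, y1, x1] box layout is an assumption, as is the NHWC shape ordering [C, W, H, N].

#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/runtime/Tensor.h"
#include "arm_compute/runtime/NEON/functions/NECropResize.h"

void crop_and_resize() // hypothetical helper
{
    using namespace arm_compute;

    TensorInfo img_info(TensorShape(3U, 64U, 64U, 2U), 1, DataType::F32);
    img_info.set_data_layout(DataLayout::NHWC);

    Tensor image, boxes, box_ind, dst;
    image.allocator()->init(img_info);
    boxes.allocator()->init(TensorInfo(TensorShape(4U, 2U), 1, DataType::F32)); // two boxes
    box_ind.allocator()->init(TensorInfo(TensorShape(2U), 1, DataType::F32));   // batch index per box

    NECropResize crop;
    crop.configure(&image, &boxes, &box_ind, &dst, Coordinates2D{14, 14},
                   InterpolationPolicy::BILINEAR, 0.f);

    image.allocator()->allocate();
    boxes.allocator()->allocate();
    box_ind.allocator()->allocate();
    dst.allocator()->allocate();
    // Fill boxes/box_ind with box coordinates and batch indices before running.
    crop.run();
}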
diff --git a/arm_compute/runtime/NEON/functions/NEDeconvolutionLayer.h b/arm_compute/runtime/NEON/functions/NEDeconvolutionLayer.h
index e2ed0e0abc..aabe42f928 100644
--- a/arm_compute/runtime/NEON/functions/NEDeconvolutionLayer.h
+++ b/arm_compute/runtime/NEON/functions/NEDeconvolutionLayer.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2017-2021, 2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -24,15 +24,14 @@
#ifndef ARM_COMPUTE_NEDECONVOLUTIONLAYER_H
#define ARM_COMPUTE_NEDECONVOLUTIONLAYER_H
-#include "arm_compute/runtime/CPP/functions/CPPUpsample.h"
-#include "arm_compute/runtime/NEON/functions/NEConvolutionLayer.h"
-#include "arm_compute/runtime/NEON/functions/NEDirectConvolutionLayer.h"
-#include "arm_compute/runtime/NEON/functions/NEReverse.h"
-
#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/CPP/functions/CPPUpsample.h"
#include "arm_compute/runtime/IFunction.h"
#include "arm_compute/runtime/IMemoryManager.h"
#include "arm_compute/runtime/MemoryGroup.h"
+#include "arm_compute/runtime/NEON/functions/NEConvolutionLayer.h"
+#include "arm_compute/runtime/NEON/functions/NEDirectConvolutionLayer.h"
+#include "arm_compute/runtime/NEON/functions/NEReverse.h"
#include "arm_compute/runtime/Tensor.h"
#include <memory>
@@ -64,11 +63,10 @@ namespace arm_compute
* The weights used by Deconvolution are supposed to be the same as the ones used for Convolution. Therefore, it will be necessary to use the weights in the
* reverse order to perform an actual convolution. This is achieved by using @ref NEReverse.
*
- * This function calls the following NEON kernels/functions:
+ * This function calls the following kernels/functions:
*
* -# @ref CPPUpsample
* -# @ref NEConvolutionLayer
- * -# @ref NEPermute
* -# @ref NEReverse
*
*/
@@ -77,39 +75,77 @@ class NEDeconvolutionLayer : public IFunction
public:
/** Constructor */
NEDeconvolutionLayer(std::shared_ptr<IMemoryManager> memory_manager = nullptr);
-
/** Prevent instances of this class from being copied (As this class contains pointers) */
NEDeconvolutionLayer(const NEDeconvolutionLayer &) = delete;
+ /** Default move constructor */
+ NEDeconvolutionLayer(NEDeconvolutionLayer &&) = default;
/** Prevent instances of this class from being copied (As this class contains pointers) */
NEDeconvolutionLayer &operator=(const NEDeconvolutionLayer &) = delete;
- /** Allow instances of this class to be moved */
- NEDeconvolutionLayer(NEDeconvolutionLayer &&) = default;
- /** Allow instances of this class to be moved */
+ /** Default move assignment operator */
NEDeconvolutionLayer &operator=(NEDeconvolutionLayer &&) = default;
/** Default destructor */
- virtual ~NEDeconvolutionLayer() = default;
+ ~NEDeconvolutionLayer() = default;
/** Set the input, weights, biases and output tensors.
*
- * @param[in,out] input Input tensor. 3 lower dimensions represent a single input, and an optional 4th dimension for batch of inputs. Data types supported: F32/F16/QASYMM8/QASYMM8_SIGNED.
- * @param[in] weights The 4d weights with dimensions [width, height, IFM, OFM]. Data type supported: Same as @p input.
- * @param[in] bias Optional, ignored if NULL. The biases have one dimension. Data type supported: Data types supported: S32 for QASYMM8 and QASYMM8_SIGNED input, F32 for F32 input, F16 for F16 input.
- * @param[out] output Output tensor. The output has the same number of dimensions as the @p input.
- * @param[in] info Contains padding and policies to be used in the deconvolution, this is decribed in @ref PadStrideInfo.
+ * Valid data layouts:
+ * - NHWC
+ * - NCHW
+ *
+ * Valid data type configurations:
+ * |src0 |src1 |src2 |dst |
+ * |:--------------|:------------------|:------|:--------------|
+ * |F16 |F16 |F16 |F16 |
+ * |F32 |F32 |F32 |F32 |
+ * |QASYMM8 |QASYMM8 |S32 |QASYMM8 |
+ * |QASYMM8 |QSYMM8_PER_CHANNEL |S32 |QASYMM8 |
+ * |QASYMM8_SIGNED |QASYMM8_SIGNED |S32 |QASYMM8_SIGNED |
+ * |QASYMM8_SIGNED |QSYMM8_PER_CHANNEL |S32 |QASYMM8_SIGNED |
+ *
+ * @param[in,out] input Input tensor. 3 lower dimensions represent a single input, and an optional 4th dimension for batch of inputs.
+ * Data types supported: F32/F16/QASYMM8/QASYMM8_SIGNED.
+ * @param[in] weights The 4d weights with dimensions [width, height, IFM, OFM].
+ * Data type supported: Same as @p input, also could be QSYMM8_PER_CHANNEL if input is QASYMM8/QASYMM8_SIGNED.
+ * @param[in] bias Optional, ignored if NULL. The biases have one dimension.
+ *                                Data types supported: S32 for QASYMM8/QASYMM8_SIGNED input, F32 for F32 input, F16 for F16 input.
+ * @param[out] output Output tensor. The output has the same number of dimensions as the @p input.
+ * @param[in] info Contains padding and policies to be used in the deconvolution, this is described in @ref PadStrideInfo.
+ * @param[in] enable_fast_math (Optional) Enable fast math computation. In case this flag were set, the function could dispatch the fastest implementation
+ * available which may introduce a drop of accuracy as well. Default is false
+ * @param[in] weights_info (Optional) Specifies the weight format. Default is unspecified. This parameter can be used to specify the weight format that is optimal for
+ * the GEMM convolution.
*
*/
- void configure(ITensor *input, const ITensor *weights, const ITensor *bias, ITensor *output, const PadStrideInfo &info);
+ void configure(ITensor *input,
+ const ITensor *weights,
+ const ITensor *bias,
+ ITensor *output,
+ const PadStrideInfo &info,
+ bool enable_fast_math = false,
+ const WeightsInfo &weights_info = WeightsInfo());
/** Static function to check if given info will lead to a valid configuration of @ref NEDeconvolutionLayer
*
- * @param[in] input Input tensor info. 3 lower dimensions represent a single input, and an optional 4th dimension for batch of inputs. Data types supported: F32/F16/QASYMM8/QASYMM8_SIGNED.
- * @param[in] weights The 4d weights info with dimensions [width, height, IFM, OFM]. Data type supported: Same as @p input.
- * @param[in] bias (Optional) The biases have one dimension. Data type supported: Data types supported: S32 for QASYMM8 and QASYMM8_SIGNED input, F32 for F32 input, F16 for F16 input.
- * @param[in] output Output tensor info. The output has the same number of dimensions as the @p input.
- * @param[in] info Contains padding and policies to be used in the deconvolution, this is decribed in @ref PadStrideInfo.
+ * @param[in] input Input tensor info. 3 lower dimensions represent a single input, and an optional 4th dimension for batch of inputs.
+ * Data types supported: F32/F16/QASYMM8/QASYMM8_SIGNED.
+ * @param[in] weights The 4d weights info with dimensions [width, height, IFM, OFM].
+ * Data type supported: Same as @p input, also could be QSYMM8_PER_CHANNEL if input is QASYMM8/QASYMM8_SIGNED.
+ * @param[in] bias             (Optional) The biases have one dimension. Data types supported: S32 for QASYMM8/QASYMM8_SIGNED input, F32 for F32 input, F16 for F16 input.
+ * @param[in] output Output tensor info. The output has the same number of dimensions as the @p input.
+ * @param[in] info Contains padding and policies to be used in the deconvolution, this is described in @ref PadStrideInfo.
+ * @param[in] enable_fast_math (Optional) Enable fast math computation. In case this flag were set, the function could dispatch the fastest implementation
+ * available which may introduce a drop of accuracy as well. Default is false
+ * @param[in] weights_info (Optional) Specifies the weight format. Default is unspecified. This parameter can be used to specify the weight format that is optimal for
+ * the GEMM convolution.
*
* @return a status
*/
- static Status validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *bias, const ITensorInfo *output, const PadStrideInfo &info);
+ static Status validate(const ITensorInfo *input,
+ const ITensorInfo *weights,
+ const ITensorInfo *bias,
+ const ITensorInfo *output,
+ const PadStrideInfo &info,
+ bool enable_fast_math = false,
+ const WeightsInfo &weights_info = WeightsInfo());
// Inherited methods overridden:
void run() override;
@@ -127,6 +163,7 @@ private:
ITensor *_input;
PadStrideInfo _info;
bool _is_prepared;
+ bool _do_upsampling;
};
-} // arm_compute
+} // namespace arm_compute
#endif /* ARM_COMPUTE_NEDECONVOLUTIONLAYER_H */
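
A hedged sketch of the extended configure() with the new enable_fast_math flag (not part of the patch; the helper name and shapes are illustrative).

#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/runtime/Tensor.h"
#include "arm_compute/runtime/NEON/functions/NEDeconvolutionLayer.h"

void deconv_f32() // hypothetical helper
{
    using namespace arm_compute;

    Tensor src, weights, bias, dst;
    src.allocator()->init(TensorInfo(TensorShape(16U, 16U, 8U), 1, DataType::F32));
    weights.allocator()->init(TensorInfo(TensorShape(3U, 3U, 8U, 4U), 1, DataType::F32)); // [width, height, IFM, OFM]
    bias.allocator()->init(TensorInfo(TensorShape(4U), 1, DataType::F32));

    // Stride 2 with pad 1 roughly doubles the spatial dimensions.
    const PadStrideInfo info(2, 2, 1, 1);

    NEDeconvolutionLayer deconv;
    deconv.configure(&src, &weights, &bias, &dst, info, /* enable_fast_math */ false);

    src.allocator()->allocate();
    weights.allocator()->allocate();
    bias.allocator()->allocate();
    dst.allocator()->allocate();
    deconv.run();
}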
diff --git a/arm_compute/runtime/NEON/functions/NEDepthConvertLayer.h b/arm_compute/runtime/NEON/functions/NEDepthConvertLayer.h
index b784480887..7bfdfbd13d 100644
--- a/arm_compute/runtime/NEON/functions/NEDepthConvertLayer.h
+++ b/arm_compute/runtime/NEON/functions/NEDepthConvertLayer.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2016-2020 ARM Limited.
+ * Copyright (c) 2016-2021 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -25,35 +25,48 @@
#define ARM_COMPUTE_NEDEPTHCONVERT_H
#include "arm_compute/core/Types.h"
-#include "arm_compute/runtime/NEON/INESimpleFunctionNoBorder.h"
+#include "arm_compute/runtime/IFunction.h"
-#include <cstdint>
+#include <memory>
namespace arm_compute
{
class ITensor;
+class ITensorInfo;
-/**Basic function to run @ref NEDepthConvertLayerKernel */
-class NEDepthConvertLayer : public INESimpleFunctionNoBorder
+/**Basic function to run @ref cpu::kernels::CpuCastKernel */
+class NEDepthConvertLayer : public IFunction
{
public:
- /* Contructor */
- NEDepthConvertLayer() = default;
- /** Prevent instances of this class from being copied (As this class contains pointers)*/
+ /** Constructor */
+ NEDepthConvertLayer();
+ /** Destructor */
+ ~NEDepthConvertLayer();
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
NEDepthConvertLayer(const NEDepthConvertLayer &) = delete;
- /** Prevent instances of this class from being copied (As this class contains pointers)*/
- const NEDepthConvertLayer &operator=(const NEDepthConvertLayer &) = delete;
+ /** Default move constructor */
+ NEDepthConvertLayer(NEDepthConvertLayer &&);
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ NEDepthConvertLayer &operator=(const NEDepthConvertLayer &) = delete;
+ /** Default move assignment operator */
+ NEDepthConvertLayer &operator=(NEDepthConvertLayer &&);
/** Initialize the function's source, destination
*
- * Valid conversions Input -> Output :
+ * Valid data layouts:
+ * - All
*
- * - QASYMM8 -> F16, F32
- * - U8 -> U16, S16, S32
- * - U16 -> U8, U32
- * - S16 -> U8, S32
- * - BFLOAT16 -> F32
- * - F16 -> QASYMM8, F32
- * - F32 -> QASYMM8, F16, BFLOAT16
+ * Valid data type configurations:
+ * |src |dst |
+ * |:--------------|:--------------------------|
+ * |QASYMM8 | F16, F32 |
+ * |U8 | U16, S16, S32 |
+ * |U16 | U8, U32 |
+ * |S16 | U8, S32 |
+ * |BFLOAT16 | F32 |
+ * |F16 | QASYMM8, F32 |
+ * |F32 | QASYMM8, F16, BFLOAT16 |
+ *
+     * The input data type must be different from the output data type.
*
* @param[in] input The input tensor to convert. Data types supported: QASYMM8/U8/U16/S16/BFLOAT16/F16/F32.
* @param[out] output The output tensor. Data types supported: QASYMM8/U8/U16/S16/U32/S32/BFLOAT16/F16/F32.
@@ -70,7 +83,15 @@ public:
*
* @return a status
*/
- static Status validate(const ITensorInfo *input, const ITensorInfo *output, ConvertPolicy policy, uint32_t shift = 0);
+ static Status
+ validate(const ITensorInfo *input, const ITensorInfo *output, ConvertPolicy policy, uint32_t shift = 0);
+
+ // Inherited methods overridden
+ void run() override;
+
+private:
+ struct Impl;
+ std::unique_ptr<Impl> _impl;
};
} // namespace arm_compute
#endif /*ARM_COMPUTE_NEDEPTHCONVERT_H*/
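
A minimal sketch of one conversion from the table above, U8 to S16 (not part of the patch; the helper name and shape are illustrative). The destination info must be initialised so the target type is known.

#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/runtime/Tensor.h"
#include "arm_compute/runtime/NEON/functions/NEDepthConvertLayer.h"

void cast_u8_to_s16() // hypothetical helper
{
    using namespace arm_compute;

    Tensor src, dst;
    src.allocator()->init(TensorInfo(TensorShape(32U), 1, DataType::U8));
    dst.allocator()->init(TensorInfo(TensorShape(32U), 1, DataType::S16));

    NEDepthConvertLayer cast;
    cast.configure(&src, &dst, ConvertPolicy::SATURATE, 0 /* shift */);

    src.allocator()->allocate();
    dst.allocator()->allocate();
    cast.run();
}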
diff --git a/arm_compute/runtime/NEON/functions/NEDepthToSpaceLayer.h b/arm_compute/runtime/NEON/functions/NEDepthToSpaceLayer.h
index 3c21d1a33a..d27369670e 100644
--- a/arm_compute/runtime/NEON/functions/NEDepthToSpaceLayer.h
+++ b/arm_compute/runtime/NEON/functions/NEDepthToSpaceLayer.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019 ARM Limited.
+ * Copyright (c) 2019-2021, 2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -21,26 +21,48 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
-#ifndef ARM_COMPUTE_NEDEPTHTOSPACELAYER_H
-#define ARM_COMPUTE_NEDEPTHTOSPACELAYER_H
+#ifndef ACL_ARM_COMPUTE_RUNTIME_NEON_FUNCTIONS_NEDEPTHTOSPACELAYER_H
+#define ACL_ARM_COMPUTE_RUNTIME_NEON_FUNCTIONS_NEDEPTHTOSPACELAYER_H
+#include "arm_compute/core/Types.h"
#include "arm_compute/runtime/IFunction.h"
-#include "arm_compute/core/NEON/kernels/NEDepthToSpaceLayerKernel.h"
-#include "arm_compute/core/Types.h"
-#include "arm_compute/runtime/NEON/INESimpleFunctionNoBorder.h"
+#include <memory>
namespace arm_compute
{
// Forward declarations
class ITensor;
+class ITensorInfo;
+class NEDepthToSpaceLayerKernel;
/** Basic function to run @ref NEDepthToSpaceLayerKernel. */
-class NEDepthToSpaceLayer : public INESimpleFunctionNoBorder
+class NEDepthToSpaceLayer : public IFunction
{
public:
+ /** Constructor */
+ NEDepthToSpaceLayer();
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ NEDepthToSpaceLayer(const NEDepthToSpaceLayer &) = delete;
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ NEDepthToSpaceLayer &operator=(const NEDepthToSpaceLayer &) = delete;
+ /** Prevent instances of this class from being moved (As this class contains non movable objects) */
+ NEDepthToSpaceLayer(NEDepthToSpaceLayer &&) = delete;
+ /** Prevent instances of this class from being moved (As this class contains non movable objects) */
+ NEDepthToSpaceLayer &operator=(NEDepthToSpaceLayer &&) = delete;
+ /** Default destructor */
+ ~NEDepthToSpaceLayer();
/** Set the input and output tensors.
*
+ * Valid data layouts:
+ * - NHWC
+ * - NCHW
+ *
+ * Valid data type configurations:
+ * |src |dst |
+ * |:--------------|:--------------|
+ * |All |All |
+ *
* @param[in] input Tensor input. Supported tensor rank: 4. Data types supported: All
* @param[out] output Tensor output. Data types supported: same as @p input
* @param[in] block_shape Block shape value.
@@ -55,6 +77,11 @@ public:
* @return a status
*/
static Status validate(const ITensorInfo *input, const ITensorInfo *output, int32_t block_shape);
+
+ void run() override;
+
+private:
+ std::unique_ptr<NEDepthToSpaceLayerKernel> _kernel;
};
} // namespace arm_compute
-#endif /* ARM_COMPUTE_NEDEPTHTOSPACELAYER_H */
+#endif // ACL_ARM_COMPUTE_RUNTIME_NEON_FUNCTIONS_NEDEPTHTOSPACELAYER_H
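
A hedged usage sketch of the unchanged public interface (not part of the patch; the helper name and shapes are illustrative).

#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/runtime/Tensor.h"
#include "arm_compute/runtime/NEON/functions/NEDepthToSpaceLayer.h"

void depth_to_space() // hypothetical helper
{
    using namespace arm_compute;

    // NCHW shape [W, H, C, N]: 16 channels rearranged with block shape 2
    // into a 4-channel output with doubled width and height.
    Tensor src, dst;
    src.allocator()->init(TensorInfo(TensorShape(8U, 8U, 16U, 1U), 1, DataType::F32));

    NEDepthToSpaceLayer d2s;
    d2s.configure(&src, &dst, 2);

    src.allocator()->allocate();
    dst.allocator()->allocate();
    d2s.run();
}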
diff --git a/arm_compute/runtime/NEON/functions/NEDepthwiseConvolutionLayer.h b/arm_compute/runtime/NEON/functions/NEDepthwiseConvolutionLayer.h
index 811dc82843..6ad5aa7bfa 100644
--- a/arm_compute/runtime/NEON/functions/NEDepthwiseConvolutionLayer.h
+++ b/arm_compute/runtime/NEON/functions/NEDepthwiseConvolutionLayer.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2017-2021 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -24,18 +24,18 @@
#ifndef ARM_COMPUTE_NEDEPTHWISECONVOLUTION_H
#define ARM_COMPUTE_NEDEPTHWISECONVOLUTION_H
-#include "arm_compute/core/NEON/kernels/NEDepthwiseConvolutionLayer3x3Kernel.h"
-#include "arm_compute/core/NEON/kernels/NEDepthwiseConvolutionLayerNativeKernel.h"
-#include "arm_compute/core/NEON/kernels/NEDirectConvolutionLayerOutputStageKernel.h"
-#include "arm_compute/core/NEON/kernels/NEFillBorderKernel.h"
+#include "arm_compute/runtime/IMemoryManager.h"
+#include "arm_compute/runtime/MemoryGroup.h"
#include "arm_compute/runtime/NEON/functions/NEActivationLayer.h"
#include "arm_compute/runtime/NEON/functions/NEPermute.h"
-#include "arm_compute/runtime/NEON/functions/assembly/NEDepthwiseConvolutionAssemblyDispatch.h"
+
+#include <memory>
namespace arm_compute
{
// Forward declarations
class ITensor;
+class NEDepthwiseConvolutionLayerNativeKernel;
/** Function to execute a depthwise convolution.
*/
@@ -52,8 +52,24 @@ public:
NEDepthwiseConvolutionLayer &operator=(const NEDepthwiseConvolutionLayer &) = delete;
/** Default move assignment operator */
NEDepthwiseConvolutionLayer &operator=(NEDepthwiseConvolutionLayer &&) = default;
+ /** Default destructor */
+ ~NEDepthwiseConvolutionLayer();
/** Initialize the function's source, destination, weights and convolution information.
*
+ * Valid data layouts:
+ * - NHWC
+ * - NCHW
+ *
+ * Valid data type configurations:
+ * |src0 |src1 |src2 |dst |
+ * |:--------------|:------------------|:------|:--------------|
+ * |F16 |F16 |F16 |F16 |
+ * |F32 |F32 |F32 |F32 |
+ * |QASYMM8 |QASYMM8 |S32 |QASYMM8 |
+ * |QASYMM8 |QSYMM8_PER_CHANNEL |S32 |QASYMM8 |
+ * |QASYMM8_SIGNED |QASYMM8_SIGNED |S32 |QASYMM8_SIGNED |
+ * |QASYMM8_SIGNED |QSYMM8_PER_CHANNEL |S32 |QASYMM8_SIGNED |
+ *
* @param[in, out] input Source tensor. Data type supported: QASYMM8/QASYMM8_SIGNED/F16/F32
* @param[out] output Destination tensor. Data type supported: same as @p input.
* @param[in] weights Weights tensor. These are 3D tensors with shape [kernel_x, kernel_y, IFM].
@@ -65,8 +81,14 @@ public:
* @param[in] act_info (Optional) Activation layer information in case of a fused activation.
* @param[in] dilation (Optional) Dilation, in elements, across x and y. Defaults to (1, 1).
*/
- void configure(ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, const PadStrideInfo &conv_info,
- unsigned int depth_multiplier = 1, const ActivationLayerInfo &act_info = ActivationLayerInfo(), const Size2D &dilation = Size2D(1U, 1U));
+ void configure(ITensor *input,
+ const ITensor *weights,
+ const ITensor *biases,
+ ITensor *output,
+ const PadStrideInfo &conv_info,
+ unsigned int depth_multiplier = 1,
+ const ActivationLayerInfo &act_info = ActivationLayerInfo(),
+ const Size2D &dilation = Size2D(1U, 1U));
/** Static function to check if given info will lead to a valid configuration of @ref NEDepthwiseConvolutionLayer
*
@@ -83,40 +105,27 @@ public:
*
* @return a status
*/
- static Status validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info,
- unsigned int depth_multiplier = 1, const ActivationLayerInfo &act_info = ActivationLayerInfo(), const Size2D &dilation = Size2D(1U, 1U));
+ static Status validate(const ITensorInfo *input,
+ const ITensorInfo *weights,
+ const ITensorInfo *biases,
+ const ITensorInfo *output,
+ const PadStrideInfo &conv_info,
+ unsigned int depth_multiplier = 1,
+ const ActivationLayerInfo &act_info = ActivationLayerInfo(),
+ const Size2D &dilation = Size2D(1U, 1U));
// Inherited methods overridden:
void run() override;
void prepare() override;
private:
- /** Static function to choose the best depthwise convolution function for @ref NEDepthwiseConvolutionLayer
- *
- * @param[in] input Source tensor info. Data type supported: QASYMM8/QASYMM8_SIGNED/F16/F32
- * @param[in] weights Weights tensor info. These are 3D tensors with shape [kernel_x, kernel_y, IFM].
- * Data type supported: Same as @p input or QASYMM8/QASYMM8_SIGNED/QSYMM8_PER_CHANNEL when @p input is QASYMM8/QASYMM8_SIGNED.
- * @param[in] biases Biases tensor info. A 1D tensor with shape [IFM]. Must be nullptr if not needed.
- * Data type supported: Same as @p input, S32 when input is QASYMM8/QASYMM8_SIGNED.
- * @param[in] output Destination tensor. Data type supported: same as @p input.
- * @param[in] conv_info Padding and stride information to use for the convolution.
- * @param[in] depth_multiplier (Optional) Multiplier to apply to the input's depth in order to retrieve the output's depth. Defaults to 1.
- * @param[in] act_info (Optional) Activation layer information in case of a fused activation. Only RELU, BOUNDED_RELU and LU_BOUNDED_RELU for 3x3 quantized are supported.
- * @param[in] dilation (Optional) Dilation, in elements, across x and y. Defaults to (1, 1).
- *
- * @return a Depthwise Convolution Function
- */
- static DepthwiseConvolutionFunction get_depthwiseconvolution_function(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output,
- const PadStrideInfo &conv_info, unsigned int depth_multiplier = 1,
- ActivationLayerInfo act_info = ActivationLayerInfo(), const Size2D &dilation = Size2D(1U, 1U));
-
- /** Basic function to execute optimized depthwise convolution routines. This function calls the following NEON kernels:
+ /** Basic function to execute optimized depthwise convolution routines. This function calls the following kernels:
*
* @note At the moment only 3x3 and 5x5 convolutions with stride 1 or 2 are supported
*
* -# @ref NEFillBorderKernel (if pad_x or pad_y > 0) and no assembly kernel implementation is present
* -# @ref NEDepthwiseConvolutionLayer3x3Kernel if 3x3 and no assembly kernel implementation is present
- * -# @ref NEDepthwiseConvolutionAssemblyDispatch if assembly kernel implementation is present
+ * -# @ref cpu::CpuDepthwiseConvolutionAssemblyDispatch if assembly kernel implementation is present
* -# @ref NEDirectConvolutionLayerOutputStageKernel if re-quantization of output is required
* -# @ref NEActivationLayer if fused activation is required
*
@@ -131,9 +140,13 @@ private:
/** Default move constructor */
NEDepthwiseConvolutionLayerOptimizedInternal(NEDepthwiseConvolutionLayerOptimizedInternal &&) = default;
/** Prevent instances of this class from being copied (As this class contains pointers) */
- NEDepthwiseConvolutionLayerOptimizedInternal &operator=(const NEDepthwiseConvolutionLayerOptimizedInternal &) = delete;
+ NEDepthwiseConvolutionLayerOptimizedInternal &
+ operator=(const NEDepthwiseConvolutionLayerOptimizedInternal &) = delete;
/** Default move assignment operator */
- NEDepthwiseConvolutionLayerOptimizedInternal &operator=(NEDepthwiseConvolutionLayerOptimizedInternal &&) = default;
+ NEDepthwiseConvolutionLayerOptimizedInternal &
+ operator=(NEDepthwiseConvolutionLayerOptimizedInternal &&) = default;
+ /** Default destructor */
+ ~NEDepthwiseConvolutionLayerOptimizedInternal() = default;
/** Initialize the function's source, destination, kernels and border_size.
*
* @param[in, out] input Source tensor. Data type supported: QASYMM8/QASYMM8_SIGNED/F16/F32. (Written to only for border filling).
@@ -146,8 +159,14 @@ private:
* @param[in] act_info (Optional) Activation layer information in case of a fused activation.
* @param[in] dilation (Optional) Dilation, in elements, across x and y. Defaults to (1, 1).
*/
- void configure(ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, const PadStrideInfo &conv_info,
- unsigned int depth_multiplier = 1, const ActivationLayerInfo &act_info = ActivationLayerInfo(), const Size2D &dilation = Size2D(1U, 1U));
+ void configure(ITensor *input,
+ const ITensor *weights,
+ const ITensor *biases,
+ ITensor *output,
+ const PadStrideInfo &conv_info,
+ unsigned int depth_multiplier = 1,
+ const ActivationLayerInfo &act_info = ActivationLayerInfo(),
+ const Size2D &dilation = Size2D(1U, 1U));
/** Static function to check if given info will lead to a valid configuration of @ref NEDepthwiseConvolutionLayer3x3
*
@@ -163,71 +182,26 @@ private:
*
* @return a status
*/
- static Status validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info,
- unsigned int depth_multiplier = 1, const ActivationLayerInfo &act_info = ActivationLayerInfo(), const Size2D &dilation = Size2D(1U, 1U));
+ static Status validate(const ITensorInfo *input,
+ const ITensorInfo *weights,
+ const ITensorInfo *biases,
+ const ITensorInfo *output,
+ const PadStrideInfo &conv_info,
+ unsigned int depth_multiplier = 1,
+ const ActivationLayerInfo &act_info = ActivationLayerInfo(),
+ const Size2D &dilation = Size2D(1U, 1U));
// Inherited methods overridden:
void run() override;
void prepare() override;
private:
- /** Configure the kernels/functions for the generic pipeline.
- *
- * @param[in, out] input Source tensor. Data type supported: QASYMM8/QASYMM8_SIGNED/F16/F32. (Written to only for border filling).
- * @param[in] weights Weights tensor. These are 3D tensors with shape [kernel_x, kernel_y, IFM]. Data type supported: Same as @p input.
- * @param[in] biases Biases tensor. A 1D tensor with shape [IFM]. Must be nullptr if not needed.
- * Data type supported: Same as @p input, S32 when input is QASYMM8/QASYMM8_SIGNED.
- * @param[out] output Destination tensor. Data type supported: same as @p input.
- * @param[in] conv_info Padding and stride information to use for the convolution.
- * @param[in] depth_multiplier Multiplier to apply to the input's depth in order to retrieve the output's depth. Defaults to 1.
- * @param[in] act_info Activation layer information in case of a fused activation.
- * @param[in] dilation (Optional) Dilation, in elements, across x and y. Defaults to (1, 1).
- *
- */
- void configure_generic(ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, const PadStrideInfo &conv_info,
- unsigned int depth_multiplier, const ActivationLayerInfo &act_info, const Size2D &dilation = Size2D(1U, 1U));
- /** Configure the kernels/functions for the optimized pipeline.
- *
- * @param[in] input Source tensor. Data type supported: QASYMM8/QASYMM8_SIGNED/F16/F32. (Written to only for border filling).
- * @param[in] weights Weights tensor. These are 3D tensors with shape [kernel_x, kernel_y, IFM]. Data type supported: Same as @p input.
- * @param[in] biases Biases tensor. A 1D tensor with shape [IFM]. Must be nullptr if not needed.
- * Data type supported: Same as @p input, S32 when input is QASYMM8/QASYMM8_SIGNED.
- * @param[out] output Destination tensor. Data type supported: same as @p input.
- * @param[in] conv_info Padding and stride information to use for the convolution.
- * @param[in] depth_multiplier Multiplier to apply to the input's depth in order to retrieve the output's depth. Defaults to 1.
- * @param[in] act_info Activation layer information in case of a fused activation.
- */
- void configure_optimized(const ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, const PadStrideInfo &conv_info,
- unsigned int depth_multiplier, const ActivationLayerInfo &act_info, const Size2D &dilation = Size2D(1U, 1U));
- /** Run generic kernel */
- void run_generic();
- /** Run optimized function */
- void run_optimized();
-
- MemoryGroup _memory_group;
- NEDepthwiseConvolutionLayer3x3Kernel _dwc_kernel;
- NEDepthwiseConvolutionAssemblyDispatch _dwc_optimized_func;
- NEDirectConvolutionLayerOutputStageKernel _output_stage_kernel;
- NEFillBorderKernel _border_handler;
- NEPermute _permute_input;
- NEPermute _permute_weights;
- NEPermute _permute_output;
- NEActivationLayer _activationlayer_function;
- Tensor _accumulator;
- Tensor _permuted_input;
- Tensor _permuted_weights;
- Tensor _permuted_output;
- const ITensor *_original_weights;
- bool _has_bias;
- bool _is_quantized;
- bool _is_optimized;
- bool _is_nchw;
- bool _permute;
- bool _is_activationlayer_enabled;
- bool _is_prepared;
+ MemoryGroup _memory_group;
+ struct Impl;
+ std::unique_ptr<Impl> _impl;
};
- /** Basic function to execute a generic depthwise convolution. This function calls the following NEON kernel:
+ /** Basic function to execute a generic depthwise convolution. This function calls the following kernel:
*
* -# @ref NEDepthwiseConvolutionLayerNativeKernel
*
@@ -245,6 +219,8 @@ private:
NEDepthwiseConvolutionLayerGeneric &operator=(const NEDepthwiseConvolutionLayerGeneric &) = delete;
/** Default move assignment operator */
NEDepthwiseConvolutionLayerGeneric &operator=(NEDepthwiseConvolutionLayerGeneric &&) = default;
+ /** Default destructor */
+ ~NEDepthwiseConvolutionLayerGeneric() = default;
/** Initialize the function's source, destination, weights and convolution information.
*
* @param[in, out] input Source tensor. Data type supported: QASYMM8/QASYMM8_SIGNED/F16/F32. (Written to only for border filling).
@@ -258,8 +234,14 @@ private:
* @param[in] act_info (Optional) Activation layer information in case of a fused activation.
* @param[in] dilation (Optional) Dilation, in elements, across x and y. Defaults to (1, 1).
*/
- void configure(ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, const PadStrideInfo &conv_info,
- unsigned int depth_multiplier = 1, const ActivationLayerInfo &act_info = ActivationLayerInfo(), const Size2D &dilation = Size2D(1U, 1U));
+ void configure(ITensor *input,
+ const ITensor *weights,
+ const ITensor *biases,
+ ITensor *output,
+ const PadStrideInfo &conv_info,
+ unsigned int depth_multiplier = 1,
+ const ActivationLayerInfo &act_info = ActivationLayerInfo(),
+ const Size2D &dilation = Size2D(1U, 1U));
/** Static function to check if given info will lead to a valid configuration of @ref NEDepthwiseConvolutionLayerGeneric
*
@@ -276,32 +258,25 @@ private:
*
* @return a status
*/
- static Status validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info,
- unsigned int depth_multiplier = 1, const ActivationLayerInfo &act_info = ActivationLayerInfo(), const Size2D &dilation = Size2D(1U, 1U));
+ static Status validate(const ITensorInfo *input,
+ const ITensorInfo *weights,
+ const ITensorInfo *biases,
+ const ITensorInfo *output,
+ const PadStrideInfo &conv_info,
+ unsigned int depth_multiplier = 1,
+ const ActivationLayerInfo &act_info = ActivationLayerInfo(),
+ const Size2D &dilation = Size2D(1U, 1U));
// Inherited methods overridden:
void run() override;
- void prepare() override;
private:
- NEDepthwiseConvolutionLayerNativeKernel _depthwise_conv_kernel;
- NEFillBorderKernel _fill_border;
- NEPermute _permute_input;
- NEPermute _permute_weights;
- NEPermute _permute_output;
- NEActivationLayer _activationlayer_function;
- Tensor _permuted_input;
- Tensor _permuted_weights;
- Tensor _permuted_output;
- bool _is_prepared;
- bool _is_nchw;
- bool _is_activationlayer_enabled;
- const ITensor *_original_weights;
+ struct Impl;
+ std::unique_ptr<Impl> _impl;
};
-
- DepthwiseConvolutionFunction _depth_conv_func;
- NEDepthwiseConvolutionLayerOptimizedInternal _func_optimized;
- NEDepthwiseConvolutionLayerGeneric _func_generic;
+ MemoryGroup _memory_group;
+ struct Impl;
+ std::unique_ptr<Impl> _impl;
};
} // namespace arm_compute
-#endif /* ARM_COMPUTE_NEDEPTHWISECONVOLUTION_H */
\ No newline at end of file
+#endif /* ARM_COMPUTE_NEDEPTHWISECONVOLUTION_H */
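For orientation, here is a minimal sketch of driving the reworked NEDepthwiseConvolutionLayer interface above; the shapes, padding and F32 data type are illustrative assumptions, not taken from this patch.

#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/runtime/NEON/functions/NEDepthwiseConvolutionLayer.h"
#include "arm_compute/runtime/Tensor.h"

using namespace arm_compute;

int main()
{
    // Hypothetical F32 case: 3x3 depthwise kernel over a 16-channel 32x32 input (NCHW).
    Tensor src, weights, biases, dst;
    src.allocator()->init(TensorInfo(TensorShape(32U, 32U, 16U), 1, DataType::F32));
    weights.allocator()->init(TensorInfo(TensorShape(3U, 3U, 16U), 1, DataType::F32)); // [kernel_x, kernel_y, IFM]
    biases.allocator()->init(TensorInfo(TensorShape(16U), 1, DataType::F32));
    dst.allocator()->init(TensorInfo(TensorShape(32U, 32U, 16U), 1, DataType::F32));

    const PadStrideInfo conv_info(1, 1, 1, 1); // stride 1, pad 1: spatial size is preserved

    // The static validate() mirrors configure() and can be queried before allocating anything.
    const Status status =
        NEDepthwiseConvolutionLayer::validate(src.info(), weights.info(), biases.info(), dst.info(), conv_info);
    if(status.error_code() != ErrorCode::OK)
    {
        return 1;
    }

    NEDepthwiseConvolutionLayer dwc;
    dwc.configure(&src, &weights, &biases, &dst, conv_info);

    src.allocator()->allocate();
    weights.allocator()->allocate();
    biases.allocator()->allocate();
    dst.allocator()->allocate();

    dwc.run(); // prepare() may also be called explicitly beforehand
    return 0;
}

Note the design choice visible in the private section: the class now keeps only a MemoryGroup plus an opaque Impl behind std::unique_ptr, which is what lets this header drop its kernel includes. The same pimpl restructuring repeats across the rest of the diff.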
diff --git a/arm_compute/runtime/NEON/functions/NEDequantizationLayer.h b/arm_compute/runtime/NEON/functions/NEDequantizationLayer.h
index f8d0ce8b2d..8b49930ef5 100644
--- a/arm_compute/runtime/NEON/functions/NEDequantizationLayer.h
+++ b/arm_compute/runtime/NEON/functions/NEDequantizationLayer.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2021 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -24,33 +24,66 @@
#ifndef ARM_COMPUTE_NEDEQUANTIZATIONLAYER_H
#define ARM_COMPUTE_NEDEQUANTIZATIONLAYER_H
-#include "arm_compute/runtime/NEON/INESimpleFunctionNoBorder.h"
-
#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/IFunction.h"
+
+#include <memory>
namespace arm_compute
{
// Forward declarations
class ITensor;
+class ITensorInfo;
-/** Basic function to run @ref NEDequantizationLayerKernel that dequantizes an input tensor */
-class NEDequantizationLayer : public INESimpleFunctionNoBorder
+/** Basic function to run @ref cpu::CpuDequantize that dequantizes an input tensor */
+class NEDequantizationLayer : public IFunction
{
public:
+ /** Default Constructor */
+ NEDequantizationLayer();
+ /** Default Destructor */
+ ~NEDequantizationLayer();
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ NEDequantizationLayer(const NEDequantizationLayer &) = delete;
+ /** Default move constructor */
+ NEDequantizationLayer(NEDequantizationLayer &&) = default;
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ NEDequantizationLayer &operator=(const NEDequantizationLayer &) = delete;
+ /** Default move assignment operator */
+ NEDequantizationLayer &operator=(NEDequantizationLayer &&) = default;
/** Configure the kernel.
*
- * @param[in] input Source tensor. Data types supported: QASYMM8/QSYMM8_PER_CHANNEL/QSYMM8/QSYMM16.
+ * Valid data layouts:
+ * - All
+ *
+ * Valid data type configurations:
+ * |src |dst |
+ * |:------------------|:-----------|
+ * |QASYMM8 |F16, F32 |
+ * |QASYMM8_SIGNED |F16, F32 |
+ * |QSYMM8_PER_CHANNEL |F16, F32 |
+ * |QSYMM8 |F16, F32 |
+ * |QSYMM16 |F16, F32 |
+ *
+ * @param[in] input Source tensor. Data types supported: QASYMM8/QASYMM8_SIGNED/QSYMM8_PER_CHANNEL/QSYMM8/QSYMM16.
* @param[out] output Destination tensor with the same dimensions of input. Data type supported: F16/F32.
*/
void configure(const ITensor *input, ITensor *output);
/** Static function to check if given info will lead to a valid configuration of @ref NEDequantizationLayer
*
- * @param[in] input Input tensor info. Data types supported: QASYMM8/QSYMM8_PER_CHANNEL/QSYMM8/QSYMM16.
+ * @param[in] input Input tensor info. Data types supported: QASYMM8/QASYMM8_SIGNED/QSYMM8_PER_CHANNEL/QSYMM8/QSYMM16.
* @param[in] output Output tensor info. Data type supported: F16/F32.
*
* @return a status
*/
static Status validate(const ITensorInfo *input, const ITensorInfo *output);
+
+ // Inherited methods overridden:
+ void run() override;
+
+private:
+ struct Impl;
+ std::unique_ptr<Impl> _impl;
};
} // namespace arm_compute
#endif /* ARM_COMPUTE_NEDEQUANTIZATIONLAYER_H */
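A minimal sketch of the new run-based flow for NEDequantizationLayer, assuming a QASYMM8 input with hypothetical quantization parameters (scale 0.1, offset 128):

#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/runtime/NEON/functions/NEDequantizationLayer.h"
#include "arm_compute/runtime/Tensor.h"

using namespace arm_compute;

int main()
{
    Tensor src, dst;
    src.allocator()->init(
        TensorInfo(TensorShape(32U, 32U), 1, DataType::QASYMM8, QuantizationInfo(0.1f, 128)));
    dst.allocator()->init(TensorInfo(TensorShape(32U, 32U), 1, DataType::F32));

    NEDequantizationLayer dequant;
    dequant.configure(&src, &dst);

    src.allocator()->allocate();
    dst.allocator()->allocate();

    dequant.run(); // dst[i] = scale * (src[i] - offset), computed in F32 here
    return 0;
}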
diff --git a/arm_compute/runtime/NEON/functions/NEDerivative.h b/arm_compute/runtime/NEON/functions/NEDerivative.h
deleted file mode 100644
index 65d0654612..0000000000
--- a/arm_compute/runtime/NEON/functions/NEDerivative.h
+++ /dev/null
@@ -1,70 +0,0 @@
-/*
- * Copyright (c) 2016-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_NEDERIVATIVE_H
-#define ARM_COMPUTE_NEDERIVATIVE_H
-
-#include "arm_compute/core/NEON/kernels/NEDerivativeKernel.h"
-#include "arm_compute/core/NEON/kernels/NEFillBorderKernel.h"
-#include "arm_compute/core/Types.h"
-#include "arm_compute/runtime/IFunction.h"
-
-#include <cstdint>
-
-namespace arm_compute
-{
-class ITensor;
-
-/** Basic function to execute first order derivative operator. This function calls the following NEON kernels:
- *
- * -# @ref NEFillBorderKernel (executed if border_mode == CONSTANT or border_mode == REPLICATE)
- * -# @ref NEDerivativeKernel
- *
- */
-class NEDerivative : public IFunction
-{
-public:
- /** Default constructor */
- NEDerivative();
- /** Initialise the function's source, destinations and border mode.
- *
- * @note At least one of output_x or output_y must be not NULL.
- *
- * @param[in, out] input Source tensor. Data type supported: U8. (Written to only for @p border_mode != UNDEFINED)
- * @param[out] output_x (optional) Destination tensor. Derivative along the X direction. Data type supported: S16.
- * @param[out] output_y (optional) Destination tensor. Derivative along the Y direction. Data type supported: S16.
- * @param[in] border_mode Border mode to use
- * @param[in] constant_border_value (Optional) Constant value to use for borders if border_mode is set to CONSTANT.
- *
- */
- void configure(ITensor *input, ITensor *output_x, ITensor *output_y, BorderMode border_mode, uint8_t constant_border_value = 0);
-
- // Inherited methods overridden:
- void run() override;
-
-private:
- NEDerivativeKernel _kernel; /**< Derivative kernel */
- NEFillBorderKernel _border_handler; /**< Kernel to handle tensor borders */
-};
-}
-#endif /* ARM_COMPUTE_NEDERIVATIVE_H */
diff --git a/arm_compute/runtime/NEON/functions/NEDetectionPostProcessLayer.h b/arm_compute/runtime/NEON/functions/NEDetectionPostProcessLayer.h
index d616762a5a..7a94833d10 100644
--- a/arm_compute/runtime/NEON/functions/NEDetectionPostProcessLayer.h
+++ b/arm_compute/runtime/NEON/functions/NEDetectionPostProcessLayer.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019 ARM Limited.
+ * Copyright (c) 2019-2021 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -24,13 +24,12 @@
#ifndef ARM_COMPUTE_NE_DETECTION_POSTPROCESS_H
#define ARM_COMPUTE_NE_DETECTION_POSTPROCESS_H
-#include "arm_compute/runtime/NEON/INESimpleFunction.h"
-
#include "arm_compute/core/Types.h"
#include "arm_compute/runtime/CPP/functions/CPPDetectionPostProcessLayer.h"
#include "arm_compute/runtime/IMemoryManager.h"
#include "arm_compute/runtime/MemoryGroup.h"
#include "arm_compute/runtime/NEON/functions/NEDequantizationLayer.h"
+#include "arm_compute/runtime/NEON/INESimpleFunction.h"
#include "arm_compute/runtime/Tensor.h"
#include <map>
@@ -53,11 +52,23 @@ public:
NEDetectionPostProcessLayer(const NEDetectionPostProcessLayer &) = delete;
/** Prevent instances of this class from being copied (As this class contains pointers) */
NEDetectionPostProcessLayer &operator=(const NEDetectionPostProcessLayer &) = delete;
+ /** Default destructor */
+ ~NEDetectionPostProcessLayer() = default;
/** Configure the detection output layer NE function
*
- * @param[in] input_box_encoding The bounding box input tensor. Data types supported: F32, QASYMM8.
- * @param[in] input_score The class prediction input tensor. Data types supported: Same as @p input_box_encoding.
- * @param[in] input_anchors The anchors input tensor. Data types supported: Same as @p input_box_encoding.
+ * Valid data layouts:
+ * - All
+ *
+ * Valid data type configurations:
+ * |src0 - src2 |dst0 - dst3 |
+ * |:--------------|:--------------|
+ * |QASYMM8 |F32 |
+ * |QASYMM8_SIGNED |F32 |
+ * |F32 |F32 |
+ *
+ * @param[in] input_box_encoding The bounding box input tensor. Data types supported: QASYMM8/QASYMM8_SIGNED/F32.
+ * @param[in] input_score The class prediction input tensor. Data types supported: same as @p input_box_encoding.
+ * @param[in] input_anchors The anchors input tensor. Data types supported: same as @p input_box_encoding.
* @param[out] output_boxes The boxes output tensor. Data types supported: F32.
* @param[out] output_classes The classes output tensor. Data types supported: Same as @p output_boxes.
* @param[out] output_scores The scores output tensor. Data types supported: Same as @p output_boxes.
@@ -66,23 +77,34 @@ public:
*
* @note Output contains all the detections. Of those, only the ones selected by the valid region are valid.
*/
- void configure(const ITensor *input_box_encoding, const ITensor *input_score, const ITensor *input_anchors,
- ITensor *output_boxes, ITensor *output_classes, ITensor *output_scores, ITensor *num_detection, DetectionPostProcessLayerInfo info = DetectionPostProcessLayerInfo());
+ void configure(const ITensor *input_box_encoding,
+ const ITensor *input_score,
+ const ITensor *input_anchors,
+ ITensor *output_boxes,
+ ITensor *output_classes,
+ ITensor *output_scores,
+ ITensor *num_detection,
+ DetectionPostProcessLayerInfo info = DetectionPostProcessLayerInfo());
/** Static function to check if given info will lead to a valid configuration of @ref NEDetectionPostProcessLayer
*
- * @param[in] input_box_encoding The bounding box input tensor info. Data types supported: F32, QASYMM8.
- * @param[in] input_class_score The class prediction input tensor info. Data types supported: F32, QASYMM8.
- * @param[in] input_anchors The anchors input tensor. Data types supported: F32, QASYMM8.
- * @param[out] output_boxes The output tensor. Data types supported: F32.
- * @param[out] output_classes The output tensor. Data types supported: Same as @p output_boxes.
- * @param[out] output_scores The output tensor. Data types supported: Same as @p output_boxes.
- * @param[out] num_detection The number of output detection. Data types supported: Same as @p output_boxes.
- * @param[in] info (Optional) DetectionPostProcessLayerInfo information.
+ * @param[in] input_box_encoding The bounding box input tensor info. Data types supported: QASYMM8/QASYMM8_SIGNED/F32.
+ * @param[in] input_class_score The class prediction input tensor info. Data types supported: same as @p input_box_encoding.
+ * @param[in] input_anchors The anchors input tensor info. Data types supported: same as @p input_box_encoding.
+ * @param[in] output_boxes The output tensor info. Data types supported: F32.
+ * @param[in] output_classes The output tensor info. Data types supported: Same as @p output_boxes.
+ * @param[in] output_scores The output tensor info. Data types supported: Same as @p output_boxes.
+ * @param[in] num_detection The info of the number-of-detections output tensor. Data types supported: Same as @p output_boxes.
+ * @param[in] info (Optional) DetectionPostProcessLayerInfo information.
*
* @return a status
*/
- static Status validate(const ITensorInfo *input_box_encoding, const ITensorInfo *input_class_score, const ITensorInfo *input_anchors,
- ITensorInfo *output_boxes, ITensorInfo *output_classes, ITensorInfo *output_scores, ITensorInfo *num_detection,
+ static Status validate(const ITensorInfo *input_box_encoding,
+ const ITensorInfo *input_class_score,
+ const ITensorInfo *input_anchors,
+ ITensorInfo *output_boxes,
+ ITensorInfo *output_classes,
+ ITensorInfo *output_scores,
+ ITensorInfo *num_detection,
DetectionPostProcessLayerInfo info = DetectionPostProcessLayerInfo());
// Inherited methods overridden:
void run() override;
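The widened configure()/validate() signatures above take seven tensors. A minimal sketch follows, with hypothetical SSD-style dimensions (1917 anchors, 90 classes plus background) and the default DetectionPostProcessLayerInfo; that configure() auto-initialises the output tensor infos is an assumption of this sketch.

#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/runtime/NEON/functions/NEDetectionPostProcessLayer.h"
#include "arm_compute/runtime/Tensor.h"

#include <initializer_list>

using namespace arm_compute;

int main()
{
    Tensor box_encoding, class_scores, anchors;
    box_encoding.allocator()->init(TensorInfo(TensorShape(4U, 1917U, 1U), 1, DataType::F32));
    class_scores.allocator()->init(TensorInfo(TensorShape(91U, 1917U, 1U), 1, DataType::F32));
    anchors.allocator()->init(TensorInfo(TensorShape(4U, 1917U), 1, DataType::F32));

    Tensor out_boxes, out_classes, out_scores, num_detection; // shapes filled in by configure()

    NEDetectionPostProcessLayer postprocess;
    postprocess.configure(&box_encoding, &class_scores, &anchors, &out_boxes, &out_classes,
                          &out_scores, &num_detection, DetectionPostProcessLayerInfo());

    for(Tensor *t : {&box_encoding, &class_scores, &anchors, &out_boxes, &out_classes, &out_scores, &num_detection})
    {
        t->allocator()->allocate();
    }
    postprocess.run();
    return 0;
}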
diff --git a/arm_compute/runtime/NEON/functions/NEDirectConvolutionLayer.h b/arm_compute/runtime/NEON/functions/NEDirectConvolutionLayer.h
index 68454be88f..3ae3b2a15c 100644
--- a/arm_compute/runtime/NEON/functions/NEDirectConvolutionLayer.h
+++ b/arm_compute/runtime/NEON/functions/NEDirectConvolutionLayer.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2021, 2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -24,35 +24,51 @@
#ifndef ARM_COMPUTE_NEDIRECTCONVOLUTIONLAYER_H
#define ARM_COMPUTE_NEDIRECTCONVOLUTIONLAYER_H
-#include "arm_compute/core/NEON/kernels/NEDirectConvolutionLayerKernel.h"
-#include "arm_compute/core/NEON/kernels/NEDirectConvolutionLayerOutputStageKernel.h"
-#include "arm_compute/core/NEON/kernels/NEFillBorderKernel.h"
#include "arm_compute/core/Types.h"
+#include "arm_compute/function_info/ActivationLayerInfo.h"
#include "arm_compute/runtime/IFunction.h"
#include "arm_compute/runtime/IMemoryManager.h"
#include "arm_compute/runtime/MemoryGroup.h"
-#include "arm_compute/runtime/NEON/functions/NEActivationLayer.h"
-#include "arm_compute/runtime/Tensor.h"
#include <memory>
namespace arm_compute
{
+class ITensor;
+class ITensorInfo;
/** Function to run the direct convolution.
*
- * This function calls the following NEON kernels:
+ * This function calls the following:
*
- * -# @ref NEFillBorderKernel for the input
- * -# @ref NEDirectConvolutionLayerOutputStageKernel
- * -# @ref NEDirectConvolutionLayerKernel
+ * -# @ref cpu::CpuDirectConv2d
*/
class NEDirectConvolutionLayer : public IFunction
{
public:
/** Constructor */
NEDirectConvolutionLayer(std::shared_ptr<IMemoryManager> memory_manager = nullptr);
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ NEDirectConvolutionLayer(const NEDirectConvolutionLayer &) = delete;
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ NEDirectConvolutionLayer &operator=(const NEDirectConvolutionLayer &) = delete;
+ /** Prevent instances of this class from being moved (As this class contains non movable objects) */
+ NEDirectConvolutionLayer(NEDirectConvolutionLayer &&) = delete;
+ /** Prevent instances of this class from being moved (As this class contains non movable objects) */
+ NEDirectConvolutionLayer &operator=(NEDirectConvolutionLayer &&) = delete;
+ /** Default destructor */
+ ~NEDirectConvolutionLayer();
/** Set the input, weights, biases and output tensors.
*
+ * Valid data layouts:
+ * - NHWC
+ * - NCHW
+ *
+ * Valid data type configurations:
+ * |src0 |src1 |src2 |dst |
+ * |:------|:------|:------|:------|
+ * |F16 |F16 |F16 |F16 |
+ * |F32 |F32 |F32 |F32 |
+ *
* @note: DirectConvolution only works in the following configurations:
* 1x1 convolution with stride_x = 1/2/3, stride_y = 1/2/3 data type = F16/F32
* 3x3 convolution with stride_x = 1/2/3, stride_y = 1/2/3 data type = F16/F32
@@ -69,7 +85,12 @@ public:
* @param[in] conv_info Contains padding and stride information described in @ref PadStrideInfo.
* @param[in] act_info (Optional) Activation layer information in case of a fused activation.
*/
- void configure(ITensor *input, const ITensor *weights, const ITensor *bias, ITensor *output, const PadStrideInfo &conv_info, const ActivationLayerInfo &act_info = ActivationLayerInfo());
+ void configure(ITensor *input,
+ const ITensor *weights,
+ const ITensor *bias,
+ ITensor *output,
+ const PadStrideInfo &conv_info,
+ const ActivationLayerInfo &act_info = ActivationLayerInfo());
/** Static function to check if given info will lead to a valid configuration of @ref NEDirectConvolutionLayer
*
* @note: DirectConvolution only works in the following configurations:
@@ -90,22 +111,20 @@ public:
*
* @return a status
*/
- static Status validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *bias, const ITensorInfo *output, const PadStrideInfo &conv_info,
+ static Status validate(const ITensorInfo *input,
+ const ITensorInfo *weights,
+ const ITensorInfo *bias,
+ const ITensorInfo *output,
+ const PadStrideInfo &conv_info,
const ActivationLayerInfo &act_info = ActivationLayerInfo());
// Inherited methods overridden:
void run() override;
private:
- MemoryGroup _memory_group;
- NEDirectConvolutionLayerOutputStageKernel _output_stage_kernel;
- NEDirectConvolutionLayerKernel _conv_kernel;
- NEFillBorderKernel _input_border_handler;
- NEActivationLayer _activationlayer_function;
- Tensor _accumulator;
- bool _has_bias;
- bool _is_activationlayer_enabled;
- unsigned int _dim_split;
+ struct Impl;
+ std::shared_ptr<IMemoryManager> _memory_manager;
+ std::unique_ptr<Impl> _impl;
};
-}
+} // namespace arm_compute
#endif /* ARM_COMPUTE_NEDIRECTCONVOLUTIONLAYER_H */
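A minimal sketch of the direct convolution function in one of the supported configurations listed above (3x3 kernel, stride 1, F32, fused ReLU); the shapes are illustrative:

#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/function_info/ActivationLayerInfo.h"
#include "arm_compute/runtime/NEON/functions/NEDirectConvolutionLayer.h"
#include "arm_compute/runtime/Tensor.h"

using namespace arm_compute;

int main()
{
    Tensor src, weights, bias, dst;
    src.allocator()->init(TensorInfo(TensorShape(32U, 32U, 8U), 1, DataType::F32));
    weights.allocator()->init(TensorInfo(TensorShape(3U, 3U, 8U, 16U), 1, DataType::F32));
    bias.allocator()->init(TensorInfo(TensorShape(16U), 1, DataType::F32));
    dst.allocator()->init(TensorInfo(TensorShape(30U, 30U, 16U), 1, DataType::F32)); // 32 - 3 + 1 = 30, no padding

    NEDirectConvolutionLayer conv; // the memory manager argument defaults to nullptr
    conv.configure(&src, &weights, &bias, &dst, PadStrideInfo(1, 1, 0, 0),
                   ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU));

    src.allocator()->allocate();
    weights.allocator()->allocate();
    bias.allocator()->allocate();
    dst.allocator()->allocate();

    conv.run();
    return 0;
}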
diff --git a/arm_compute/runtime/NEON/functions/NEElementwiseOperations.h b/arm_compute/runtime/NEON/functions/NEElementwiseOperations.h
index cac105cdb9..ebf2277d1f 100644
--- a/arm_compute/runtime/NEON/functions/NEElementwiseOperations.h
+++ b/arm_compute/runtime/NEON/functions/NEElementwiseOperations.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018-2020 ARM Limited.
+ * Copyright (c) 2018-2021, 2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -25,29 +25,59 @@
#define ARM_COMPUTE_NEELEMENTWISEOPERATIONS_H
#include "arm_compute/core/Types.h"
-#include "arm_compute/runtime/NEON/INESimpleFunction.h"
+#include "arm_compute/function_info/ActivationLayerInfo.h"
+#include "arm_compute/runtime/IFunction.h"
+#include "arm_compute/runtime/NEON/INEOperator.h"
namespace arm_compute
{
class ITensor;
-/** Basic function to run @ref NEArithmeticOperationKernel for max
+/** Basic function to run @ref cpu::kernels::CpuArithmeticKernel for max
*
* @note The tensor data type for the inputs must be QASYMM8/QASYMM8_SIGNED/S16/F16/S32/F32.
* @note The function performs a max operation between two tensors.
*/
-class NEElementwiseMax : public INESimpleFunction
+class NEElementwiseMax : public IFunction
{
public:
+ /** Default Constructor */
+ NEElementwiseMax();
+ /** Default Destructor */
+ ~NEElementwiseMax();
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ NEElementwiseMax(const NEElementwiseMax &) = delete;
+ /** Default move constructor */
+ NEElementwiseMax(NEElementwiseMax &&);
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ NEElementwiseMax &operator=(const NEElementwiseMax &) = delete;
+ /** Default move assignment operator */
+ NEElementwiseMax &operator=(NEElementwiseMax &&);
/** Initialise the kernel's inputs, output and conversion policy.
*
+ * Valid data layouts:
+ * - All
+ *
+ * Valid data type configurations:
+ * |src0 |src1 |dst |
+ * |:--------------|:--------------|:--------------|
+ * |QASYMM8 |QASYMM8 |QASYMM8 |
+ * |QASYMM8_SIGNED |QASYMM8_SIGNED |QASYMM8_SIGNED |
+ * |S32 |S32 |S32 |
+ * |S16 |S16 |S16 |
+ * |F16 |F16 |F16 |
+ * |F32 |F32 |F32 |
+ *
* @param[in, out] input1 First tensor input. Data types supported: QASYMM8/QASYMM8_SIGNED/S16/F16/S32/F32.
* @param[in, out] input2 Second tensor input. Data types supported: Same as @p input1.
* @param[out] output Output tensor. Data types supported: Same as @p input1.
* @param[in] act_info (Optional) Activation layer information in case of a fused activation. Currently not supported.
*/
- void configure(ITensor *input1, ITensor *input2, ITensor *output, const ActivationLayerInfo &act_info = ActivationLayerInfo());
- /** Static function to check if given info will lead to a valid configuration of @ref NEArithmeticOperationKernel for max
+ void configure(ITensor *input1,
+ ITensor *input2,
+ ITensor *output,
+ const ActivationLayerInfo &act_info = ActivationLayerInfo());
+ /** Static function to check if given info will lead to a valid configuration of @ref cpu::kernels::CpuArithmeticKernel for max
*
* @param[in] input1 First tensor input info. Data types supported: QASYMM8/QASYMM8_SIGNED/S16/F16/S32/F32.
* @param[in] input2 Second tensor input info. Data types supported: Same as @p input1.
@@ -56,26 +86,64 @@ public:
*
* @return a status
*/
- static Status validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, const ActivationLayerInfo &act_info = ActivationLayerInfo());
+ static Status validate(const ITensorInfo *input1,
+ const ITensorInfo *input2,
+ const ITensorInfo *output,
+ const ActivationLayerInfo &act_info = ActivationLayerInfo());
+
+ // Inherited methods overridden:
+ void run() override;
+
+private:
+ struct Impl;
+ std::unique_ptr<Impl> _impl;
};
-/** Basic function to run @ref NEArithmeticOperationKernel for min
+/** Basic function to run @ref cpu::kernels::CpuArithmeticKernel for min
*
* @note The tensor data type for the inputs must be QASYMM8/QASYMM8_SIGNED/S16/F16/S32/F32.
* @note The function performs a min operation between two tensors.
*/
-class NEElementwiseMin : public INESimpleFunction
+class NEElementwiseMin : public IFunction
{
public:
+ /** Default Constructor */
+ NEElementwiseMin();
+ /** Default Destructor */
+ ~NEElementwiseMin();
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ NEElementwiseMin(const NEElementwiseMin &) = delete;
+ /** Default move constructor */
+ NEElementwiseMin(NEElementwiseMin &&);
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ NEElementwiseMin &operator=(const NEElementwiseMin &) = delete;
+ /** Default move assignment operator */
+ NEElementwiseMin &operator=(NEElementwiseMin &&);
/** Initialise the kernel's inputs, output and conversion policy.
*
+ * Valid data layouts:
+ * - All
+ *
+ * Valid data type configurations:
+ * |src0 |src1 |dst |
+ * |:--------------|:--------------|:--------------|
+ * |QASYMM8 |QASYMM8 |QASYMM8 |
+ * |QASYMM8_SIGNED |QASYMM8_SIGNED |QASYMM8_SIGNED |
+ * |S32 |S32 |S32 |
+ * |S16 |S16 |S16 |
+ * |F16 |F16 |F16 |
+ * |F32 |F32 |F32 |
+ *
* @param[in, out] input1 First tensor input. Data types supported: QASYMM8/QASYMM8_SIGNED/S16/F16/S32/F32.
* @param[in, out] input2 Second tensor input. Data types supported: Same as @p input1.
* @param[out] output Output tensor. Data types supported: Same as @p input1.
* @param[in] act_info (Optional) Activation layer information in case of a fused activation. Currently not supported.
*/
- void configure(ITensor *input1, ITensor *input2, ITensor *output, const ActivationLayerInfo &act_info = ActivationLayerInfo());
- /** Static function to check if given info will lead to a valid configuration of @ref NEArithmeticOperationKernel for min
+ void configure(ITensor *input1,
+ ITensor *input2,
+ ITensor *output,
+ const ActivationLayerInfo &act_info = ActivationLayerInfo());
+ /** Static function to check if given info will lead to a valid configuration of @ref cpu::kernels::CpuArithmeticKernel for min
*
* @param[in] input1 First tensor input info. Data types supported: QASYMM8/QASYMM8_SIGNED/S16/F16/S32/F32.
* @param[in] input2 Second tensor input info. Data types supported: Same as @p input1.
@@ -84,26 +152,64 @@ public:
*
* @return a status
*/
- static Status validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, const ActivationLayerInfo &act_info = ActivationLayerInfo());
+ static Status validate(const ITensorInfo *input1,
+ const ITensorInfo *input2,
+ const ITensorInfo *output,
+ const ActivationLayerInfo &act_info = ActivationLayerInfo());
+
+ // Inherited methods overridden:
+ void run() override;
+
+private:
+ struct Impl;
+ std::unique_ptr<Impl> _impl;
};
-/** Basic function to run @ref NEArithmeticOperationKernel for squared difference
+/** Basic function to run @ref cpu::kernels::CpuArithmeticKernel for squared difference
*
* @note The tensor data type for the inputs must be QASYMM8/QASYMM8_SIGNED/S16/F16/S32/F32.
* @note The function performs a squared difference operation between two tensors (i.e., out[i] = (in1[i] - in2[i])^2)
*/
-class NEElementwiseSquaredDiff : public INESimpleFunction
+class NEElementwiseSquaredDiff : public IFunction
{
public:
+ /** Default Constructor */
+ NEElementwiseSquaredDiff();
+ /** Default Destructor */
+ ~NEElementwiseSquaredDiff();
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ NEElementwiseSquaredDiff(const NEElementwiseSquaredDiff &) = delete;
+ /** Default move constructor */
+ NEElementwiseSquaredDiff(NEElementwiseSquaredDiff &&);
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ NEElementwiseSquaredDiff &operator=(const NEElementwiseSquaredDiff &) = delete;
+ /** Default move assignment operator */
+ NEElementwiseSquaredDiff &operator=(NEElementwiseSquaredDiff &&);
/** Initialise the kernel's inputs, output and conversion policy.
*
+ * Valid data layouts:
+ * - All
+ *
+ * Valid data type configurations:
+ * |src0 |src1 |dst |
+ * |:--------------|:--------------|:--------------|
+ * |QASYMM8 |QASYMM8 |QASYMM8 |
+ * |QASYMM8_SIGNED |QASYMM8_SIGNED |QASYMM8_SIGNED |
+ * |S32 |S32 |S32 |
+ * |S16 |S16 |S16 |
+ * |F16 |F16 |F16 |
+ * |F32 |F32 |F32 |
+ *
* @param[in, out] input1 First tensor input. Data types supported: QASYMM8/QASYMM8_SIGNED/S16/F16/S32/F32.
* @param[in, out] input2 Second tensor input. Data types supported: Same as @p input1.
* @param[out] output Output tensor. Data types supported: Same as @p input1.
* @param[in] act_info (Optional) Activation layer information in case of a fused activation. Currently not supported.
*/
- void configure(ITensor *input1, ITensor *input2, ITensor *output, const ActivationLayerInfo &act_info = ActivationLayerInfo());
- /** Static function to check if given info will lead to a valid configuration of @ref NEArithmeticOperationKernel for squared difference
+ void configure(ITensor *input1,
+ ITensor *input2,
+ ITensor *output,
+ const ActivationLayerInfo &act_info = ActivationLayerInfo());
+ /** Static function to check if given info will lead to a valid configuration of @ref cpu::kernels::CpuArithmeticKernel for squared difference
*
* @param[in] input1 First tensor input info. Data types supported: QASYMM8/QASYMM8_SIGNED/S16/F16/S32/F32.
* @param[in] input2 Second tensor input info. Data types supported: Same as @p input1.
@@ -112,26 +218,60 @@ public:
*
* @return a status
*/
- static Status validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, const ActivationLayerInfo &act_info = ActivationLayerInfo());
+ static Status validate(const ITensorInfo *input1,
+ const ITensorInfo *input2,
+ const ITensorInfo *output,
+ const ActivationLayerInfo &act_info = ActivationLayerInfo());
+
+ // Inherited methods overridden:
+ void run() override;
+
+private:
+ struct Impl;
+ std::unique_ptr<Impl> _impl;
};
-/** Basic function to run @ref NEArithmeticOperationKernel for division
+/** Basic function to run @ref cpu::kernels::CpuArithmeticKernel for division
*
* @note The tensor data type for the inputs must be F16/F32.
* @note The function performs an elementwise division of two tensors (i.e., out[i] = in1[i] / in2[i])
*/
-class NEElementwiseDivision : public INESimpleFunction
+class NEElementwiseDivision : public IFunction
{
public:
+ /** Default Constructor */
+ NEElementwiseDivision();
+ /** Default Destructor */
+ ~NEElementwiseDivision();
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ NEElementwiseDivision(const NEElementwiseDivision &) = delete;
+ /** Default move constructor */
+ NEElementwiseDivision(NEElementwiseDivision &&);
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ NEElementwiseDivision &operator=(const NEElementwiseDivision &) = delete;
+ /** Default move assignment operator */
+ NEElementwiseDivision &operator=(NEElementwiseDivision &&);
/** Initialise the kernel's inputs, output and conversion policy.
*
+ * Valid data layouts:
+ * - All
+ *
+ * Valid data type configurations:
+ * |src0 |src1 |dst |
+ * |:--------------|:--------------|:--------------|
+ * |F16 |F16 |F16 |
+ * |F32 |F32 |F32 |
+ *
* @param[in, out] input1 First tensor input. Data types supported: F16/F32.
* @param[in, out] input2 Second tensor input. Data types supported: Same as @p input1.
* @param[out] output Output tensor. Data types supported: Same as @p input1.
* @param[in] act_info (Optional) Activation layer information in case of a fused activation. Currently not supported.
*/
- void configure(ITensor *input1, ITensor *input2, ITensor *output, const ActivationLayerInfo &act_info = ActivationLayerInfo());
- /** Static function to check if given info will lead to a valid configuration of @ref NEArithmeticOperationKernel for division
+ void configure(ITensor *input1,
+ ITensor *input2,
+ ITensor *output,
+ const ActivationLayerInfo &act_info = ActivationLayerInfo());
+ /** Static function to check if given info will lead to a valid configuration of @ref cpu::kernels::CpuArithmeticKernel for division
*
* @param[in] input1 First tensor input info. Data types supported: F16/F32.
* @param[in] input2 Second tensor input info. Data types supported: Same as @p input1.
@@ -140,27 +280,61 @@ public:
*
* @return a status
*/
- static Status validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, const ActivationLayerInfo &act_info = ActivationLayerInfo());
+ static Status validate(const ITensorInfo *input1,
+ const ITensorInfo *input2,
+ const ITensorInfo *output,
+ const ActivationLayerInfo &act_info = ActivationLayerInfo());
+
+ // Inherited methods overridden:
+ void run() override;
+
+private:
+ struct Impl;
+ std::unique_ptr<Impl> _impl;
};
-/** Basic function to run @ref NEArithmeticOperationKernel for power
+/** Basic function to run @ref cpu::kernels::CpuArithmeticKernel for power
*
* @note The tensor data type for the inputs must be F16/F32.
* @note The function performs an elementwise power of in1 to in2 (i.e., out[i] = in1[i] ^ in2[i])
* @note For an exponent that is a float, this function will only work with a positive base.
*/
-class NEElementwisePower : public INESimpleFunction
+class NEElementwisePower : public IFunction
{
public:
+ /** Default Constructor */
+ NEElementwisePower();
+ /** Default Destructor */
+ ~NEElementwisePower();
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ NEElementwisePower(const NEElementwisePower &) = delete;
+ /** Default move constructor */
+ NEElementwisePower(NEElementwisePower &&);
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ NEElementwisePower &operator=(const NEElementwisePower &) = delete;
+ /** Default move assignment operator */
+ NEElementwisePower &operator=(NEElementwisePower &&);
/** Initialise the kernel's inputs, output and conversion policy.
*
+ * Valid data layouts:
+ * - All
+ *
+ * Valid data type configurations:
+ * |src0 |src1 |dst |
+ * |:--------------|:--------------|:--------------|
+ * |F16 |F16 |F16 |
+ * |F32 |F32 |F32 |
+ *
* @param[in, out] input1 First tensor input. Data types supported: F16/F32.
* @param[in, out] input2 Second tensor input. Data types supported: Same as @p input1.
* @param[out] output Output tensor. Data types supported: Same as @p input1.
* @param[in] act_info (Optional) Activation layer information in case of a fused activation. Currently not supported.
*/
- void configure(ITensor *input1, ITensor *input2, ITensor *output, const ActivationLayerInfo &act_info = ActivationLayerInfo());
- /** Static function to check if given info will lead to a valid configuration of @ref NEArithmeticOperationKernel for power
+ void configure(ITensor *input1,
+ ITensor *input2,
+ ITensor *output,
+ const ActivationLayerInfo &act_info = ActivationLayerInfo());
+ /** Static function to check if given info will lead to a valid configuration of @ref cpu::kernels::CpuArithmeticKernel for power
*
* @param[in] input1 First tensor input info. Data types supported: F16/F32.
* @param[in] input2 Second tensor input info. Data types supported: Same as @p input1.
@@ -169,62 +343,125 @@ public:
*
* @return a status
*/
- static Status validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, const ActivationLayerInfo &act_info = ActivationLayerInfo());
+ static Status validate(const ITensorInfo *input1,
+ const ITensorInfo *input2,
+ const ITensorInfo *output,
+ const ActivationLayerInfo &act_info = ActivationLayerInfo());
+
+ // Inherited methods overridden:
+ void run() override;
+
+private:
+ struct Impl;
+ std::unique_ptr<Impl> _impl;
};
-/** Basic function to run @ref NEComparisonOperationKernel.
+/** Basic function to run @ref cpu::kernels::CpuComparisonKernel.
*
- * @note The tensor data type for the inputs must be QASYMM8/QASYMM8_SIGNED/S16/F16/S32/F32.
+ * @note The tensor data type for the inputs must be U8/QASYMM8/QASYMM8_SIGNED/S16/F16/S32/F32.
* @note The function performs a comparison operation between two tensors.
*/
-class NEElementwiseComparison : public INESimpleFunction
+class NEElementwiseComparison : public IFunction
{
public:
+ /** Default Constructor */
+ NEElementwiseComparison();
+ /** Default Destructor */
+ ~NEElementwiseComparison();
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ NEElementwiseComparison(const NEElementwiseComparison &) = delete;
+ /** Default move constructor */
+ NEElementwiseComparison(NEElementwiseComparison &&);
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ NEElementwiseComparison &operator=(const NEElementwiseComparison &) = delete;
+ /** Default move assignment operator */
+ NEElementwiseComparison &operator=(NEElementwiseComparison &&);
/** Initialise the kernel's inputs, output and conversion policy.
*
- * @param[in, out] input1 First tensor input. Data types supported: QASYMM8/QASYMM8_SIGNED/S16/F16/S32/F32.
+ * Valid data layouts:
+ * - All
+ *
+ * Valid data type configurations:
+ * |src0 |src1 |dst |
+ * |:--------------|:--------------|:-----|
+ * |QASYMM8 |QASYMM8 |U8 |
+ * |QASYMM8_SIGNED |QASYMM8_SIGNED |U8 |
+ * |S32 |S32 |U8 |
+ * |U8 |U8 |U8 |
+ * |S16 |S16 |U8 |
+ * |F16 |F16 |U8 |
+ * |F32 |F32 |U8 |
+ *
+ * @param[in, out] input1 First tensor input. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/F16/S32/F32.
* @param[in, out] input2 Second tensor input. Data types supported: Same as @p input1.
- * @param[out] output Output tensor. Data types supported: U16/U32.
+ * @param[out] output Output tensor. Data types supported: U8.
* @param[in] op Comparison Operation to be performed.
*/
void configure(ITensor *input1, ITensor *input2, ITensor *output, ComparisonOperation op);
- /** Static function to check if given info will lead to a valid configuration of @ref NEComparisonOperationKernel
+ /** Static function to check if given info will lead to a valid configuration of @ref cpu::kernels::CpuComparisonKernel
*
- * @param[in] input1 First tensor input info. Data types supported: QASYMM8/QASYMM8_SIGNED/S16/F16/S32/F32.
+ * @param[in] input1 First tensor input info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/F16/S32/F32.
* @param[in] input2 Second tensor input info. Data types supported: Same as @p input1.
- * @param[in] output Output tensor info. Data types supported: U16/U32.
+ * @param[in] output Output tensor info. Data types supported: U8.
* @param[in] op Comparison Operation to be performed.
*
* @return a status
*/
- static Status validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, ComparisonOperation op);
+ static Status
+ validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, ComparisonOperation op);
+
+ // Inherited methods overridden:
+ void run() override;
+
+private:
+ struct Impl;
+ std::unique_ptr<Impl> _impl;
};
-/** Basic function to run @ref NEComparisonOperationKernel
+/** Basic function to run @ref cpu::kernels::CpuComparisonKernel
*
- * @note The tensor data type for the inputs must be QASYMM8/QASYMM8_SIGNED/S16/F16/S32/F32.
+ * @note The tensor data type for the inputs must be U8/QASYMM8/QASYMM8_SIGNED/S16/F16/S32/F32.
* @note The function performs a comparison operation between two tensors.
*/
template <ComparisonOperation op>
-class NEElementwiseComparisonStatic : public INESimpleFunction
+class NEElementwiseComparisonStatic : public IFunction
{
public:
+ /** Default Constructor */
+ NEElementwiseComparisonStatic();
+ /** Default Destructor */
+ ~NEElementwiseComparisonStatic();
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ NEElementwiseComparisonStatic(const NEElementwiseComparisonStatic &) = delete;
+ /** Default move constructor */
+ NEElementwiseComparisonStatic(NEElementwiseComparisonStatic &&);
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ NEElementwiseComparisonStatic &operator=(const NEElementwiseComparisonStatic &) = delete;
+ /** Default move assignment operator */
+ NEElementwiseComparisonStatic &operator=(NEElementwiseComparisonStatic &&);
/** Initialise the kernel's inputs, output and conversion policy.
*
- * @param[in, out] input1 First tensor input. Data types supported: QASYMM8/QASYMM8_SIGNED/S16/F16/S32/F32.
+ * @param[in, out] input1 First tensor input. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/F16/S32/F32.
* @param[in, out] input2 Second tensor input. Data types supported: Same as @p input1.
* @param[out] output Output tensor. Data types supported: U8.
*/
void configure(ITensor *input1, ITensor *input2, ITensor *output);
- /** Static function to check if given info will lead to a valid configuration of @ref NEComparisonOperationKernel
+ /** Static function to check if given info will lead to a valid configuration of @ref cpu::kernels::CpuComparisonKernel
*
- * @param[in] input1 First tensor input info. Data types supported: QASYMM8/QASYMM8_SIGNED/S16/F16/S32/F32.
+ * @param[in] input1 First tensor input info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/F16/S32/F32.
* @param[in] input2 Second tensor input info. Data types supported: Same as @p input1.
* @param[in] output Output tensor info. Data types supported: U8.
*
* @return a status
*/
static Status validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output);
+
+ // Inherited methods overridden:
+ void run() override;
+
+private:
+ struct Impl;
+ std::unique_ptr<Impl> _impl;
};
/** Basic function to run equal comparison. */
@@ -239,5 +476,6 @@ using NEGreaterEqual = NEElementwiseComparisonStatic<ComparisonOperation::Greate
using NELess = NEElementwiseComparisonStatic<ComparisonOperation::Less>;
/** Basic function to run less-equal comparison. */
using NELessEqual = NEElementwiseComparisonStatic<ComparisonOperation::LessEqual>;
+
} // namespace arm_compute
#endif /* ARM_COMPUTE_NEELEMENTWISEOPERATIONS_H */
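All the binary functions in this header now share the same IFunction/pimpl shape, so they compose identically. A minimal sketch pairing NEElementwiseMax with one of the static comparison aliases, on hypothetical F32 inputs:

#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/runtime/NEON/functions/NEElementwiseOperations.h"
#include "arm_compute/runtime/Tensor.h"

using namespace arm_compute;

int main()
{
    const TensorShape shape(64U, 64U);
    Tensor a, b, max_out, cmp_out;
    a.allocator()->init(TensorInfo(shape, 1, DataType::F32));
    b.allocator()->init(TensorInfo(shape, 1, DataType::F32));
    max_out.allocator()->init(TensorInfo(shape, 1, DataType::F32));
    cmp_out.allocator()->init(TensorInfo(shape, 1, DataType::U8)); // comparison results are U8

    NEElementwiseMax emax;
    emax.configure(&a, &b, &max_out); // out[i] = max(a[i], b[i])

    NEGreaterEqual cmp; // alias of NEElementwiseComparisonStatic<ComparisonOperation::GreaterEqual>
    cmp.configure(&a, &b, &cmp_out);

    a.allocator()->allocate();
    b.allocator()->allocate();
    max_out.allocator()->allocate();
    cmp_out.allocator()->allocate();

    emax.run();
    cmp.run();
    return 0;
}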
diff --git a/arm_compute/runtime/NEON/functions/NEElementwiseUnaryLayer.h b/arm_compute/runtime/NEON/functions/NEElementwiseUnaryLayer.h
index 094f875b35..63e47b8377 100644
--- a/arm_compute/runtime/NEON/functions/NEElementwiseUnaryLayer.h
+++ b/arm_compute/runtime/NEON/functions/NEElementwiseUnaryLayer.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018-2019 ARM Limited.
+ * Copyright (c) 2018-2021 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -24,150 +24,73 @@
#ifndef ARM_COMPUTE_NEELEMENTWISEUNARYLAYER_H
#define ARM_COMPUTE_NEELEMENTWISEUNARYLAYER_H
-#include "arm_compute/runtime/NEON/INESimpleFunction.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/IFunction.h"
+
+#include <memory>
namespace arm_compute
{
class ITensor;
-
-/** Basic function to perform inverse square root on an input tensor. */
-class NERsqrtLayer : public INESimpleFunction
-{
-public:
- /** Initialize the function
- *
- * @param[in] input Input tensor. Data types supported: F16/F32.
- * @param[out] output Output tensor. Data types supported: same as @p input.
- */
- void configure(const ITensor *input, ITensor *output);
- /** Static function to check if given info will lead to a valid configuration of @ref NERsqrtLayer
- *
- * @param[in] input First tensor input info. Data types supported: F16/F32.
- * @param[in] output Output tensor info. Data types supported: Same as @p input.
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *input, const ITensorInfo *output);
-};
-
-/** Basic function to perform exponential on an input tensor. */
-class NEExpLayer : public INESimpleFunction
+class ITensorInfo;
+/** Basic function to perform unary elementwise operations */
+template <ElementWiseUnary op>
+class NEElementwiseUnaryLayer : public IFunction
{
public:
- /** Initialize the function
- *
- * @param[in] input Input tensor. Data types supported: F16/F32.
- * @param[out] output Output tensor. Data types supported: same as @p input.
- */
- void configure(const ITensor *input, ITensor *output);
- /** Static function to check if given info will lead to a valid configuration of @ref NEExpLayer
- *
- * @param[in] input First tensor input info. Data types supported: F16/F32.
- * @param[in] output Output tensor info. Data types supported: Same as @p input.
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *input, const ITensorInfo *output);
-};
+ /** Default Constructor */
+ NEElementwiseUnaryLayer();
+ /** Default Destructor */
+ ~NEElementwiseUnaryLayer();
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ NEElementwiseUnaryLayer(const NEElementwiseUnaryLayer &) = delete;
+ /** Default move constructor */
+ NEElementwiseUnaryLayer(NEElementwiseUnaryLayer &&);
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ NEElementwiseUnaryLayer &operator=(const NEElementwiseUnaryLayer &) = delete;
+ /** Default move assignment operator */
+ NEElementwiseUnaryLayer &operator=(NEElementwiseUnaryLayer &&);
-/** Basic function to negate an input tensor. */
-class NENegLayer : public INESimpleFunction
-{
-public:
/** Initialize the function
*
- * @param[in] input Input tensor. Data types supported: F16/F32/S32.
- * @param[out] output Output tensor. Data types supported: same as @p input.
- */
- void configure(const ITensor *input, ITensor *output);
- /** Static function to check if given info will lead to a valid configuration of @ref NENegLayer
- *
- * @param[in] input First tensor input info. Data types supported: F16/F32/S32.
- * @param[in] output Output tensor info. Data types supported: Same as @p input.
+ * Valid data layouts:
+ * - All
*
- * @return a status
- */
- static Status validate(const ITensorInfo *input, const ITensorInfo *output);
-};
-
-/** Basic function to compute the natural logarithm of an input tensor. */
-class NELogLayer : public INESimpleFunction
-{
-public:
- /** Initialize the function
+ * Valid data type configurations:
+ * |src |dst |
+ * |:--------------|:--------------|
+ * |F16 |F16 |
+ * |F32 |F32 |
+ * |S32 |S32 |
*
- * @param[in] input Input tensor. Data types supported: F16/F32/S32.
- * @param[out] output Output tensor. Data types supported: same as @p input.
+ * @param[in] input Input tensor. Data types supported: F16/F32 (F16/F32/S32 for the NEG/ABS operations).
+ * @param[out] output Output tensor. Data types supported: Same as @p input.
*/
void configure(const ITensor *input, ITensor *output);
- /** Static function to check if given info will lead to a valid configuration of @ref NELogLayer
+ /** Static function to check if given info will lead to a valid configuration
*
- * @param[in] input First tensor input info. Data types supported: F16/F32/S32.
+ * @param[in] input Input tensor info. Data types supported: F16/F32 (F16/F32/S32 for the NEG/ABS operations).
* @param[in] output Output tensor info. Data types supported: Same as @p input.
*
* @return a status
*/
static Status validate(const ITensorInfo *input, const ITensorInfo *output);
-};
+ // Inherited methods overridden:
+ void run() override;
-/** Basic function to compute the absolute value of an input tensor. */
-class NEAbsLayer : public INESimpleFunction
-{
-public:
- /** Initialize the function
- *
- * @param[in] input Input tensor. Data types supported: F16/F32/S32.
- * @param[out] output Output tensor. Data types supported: same as @p input.
- */
- void configure(const ITensor *input, ITensor *output);
- /** Static function to check if given info will lead to a valid configuration of @ref NEAbsLayer
- *
- * @param[in] input First tensor input info. Data types supported: F16/F32/S32.
- * @param[in] output Output tensor info. Data types supported: Same as @p input.
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *input, const ITensorInfo *output);
+private:
+ struct Impl;
+ std::unique_ptr<Impl> _impl;
};
-/** Basic function to compute the round value elementwise of an input tensor. */
-class NERoundLayer : public INESimpleFunction
-{
-public:
- /** Initialize the function
- *
- * @param[in] input Input tensor. Data types supported: F16/F32.
- * @param[out] output Output tensor. Data types supported: same as @p input.
- */
- void configure(const ITensor *input, ITensor *output);
- /** Static function to check if given info will lead to a valid configuration of @ref NERoundLayer
- *
- * @param[in] input First tensor input info. Data types supported: F16/F32.
- * @param[in] output Output tensor info. Data types supported: Same as @p input.
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *input, const ITensorInfo *output);
-};
+using NERsqrtLayer = NEElementwiseUnaryLayer<ElementWiseUnary::RSQRT>;
+using NEExpLayer = NEElementwiseUnaryLayer<ElementWiseUnary::EXP>;
+using NENegLayer = NEElementwiseUnaryLayer<ElementWiseUnary::NEG>;
+using NELogLayer = NEElementwiseUnaryLayer<ElementWiseUnary::LOG>;
+using NEAbsLayer = NEElementwiseUnaryLayer<ElementWiseUnary::ABS>;
+using NERoundLayer = NEElementwiseUnaryLayer<ElementWiseUnary::ROUND>;
+using NESinLayer = NEElementwiseUnaryLayer<ElementWiseUnary::SIN>;
-/** Basic function to compute the sine of an input tensor. */
-class NESinLayer : public INESimpleFunction
-{
-public:
- /** Initialize the function
- *
- * @param[in] input Input tensor. Data types supported: F16/F32.
- * @param[out] output Output tensor. Data types supported: same as @p input.
- */
- void configure(const ITensor *input, ITensor *output);
- /** Static function to check if given info will lead to a valid configuration of @ref NESinLayer
- *
- * @param[in] input First tensor input info. Data types supported: F16/F32.
- * @param[in] output Output tensor info. Data types supported: Same as @p input.
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *input, const ITensorInfo *output);
-};
} // namespace arm_compute
#endif /* ARM_COMPUTE_NEELEMENTWISEUNARYLAYER_H */
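
For context (not part of the patch): a minimal usage sketch of the collapsed template interface through one of its aliases. The shapes are illustrative; the configure/validate/run sequence follows the signatures declared above.

    #include "arm_compute/runtime/NEON/NEFunctions.h"
    #include "arm_compute/runtime/Tensor.h"

    using namespace arm_compute;

    int main()
    {
        Tensor src, dst;
        src.allocator()->init(TensorInfo(TensorShape(8U, 8U), 1, DataType::F32));
        dst.allocator()->init(TensorInfo(TensorShape(8U, 8U), 1, DataType::F32));

        NEAbsLayer abs_layer; // alias for NEElementwiseUnaryLayer<ElementWiseUnary::ABS>
        if(NEAbsLayer::validate(src.info(), dst.info()).error_code() == ErrorCode::OK)
        {
            abs_layer.configure(&src, &dst);
            src.allocator()->allocate();
            dst.allocator()->allocate();
            abs_layer.run(); // dst[i] = |src[i]|
        }
        return 0;
    }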
diff --git a/arm_compute/runtime/NEON/functions/NEEqualizeHistogram.h b/arm_compute/runtime/NEON/functions/NEEqualizeHistogram.h
deleted file mode 100644
index e9d58f3e0c..0000000000
--- a/arm_compute/runtime/NEON/functions/NEEqualizeHistogram.h
+++ /dev/null
@@ -1,77 +0,0 @@
-/*
- * Copyright (c) 2016-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_NEEQUALIZEHISTOGRAM_H
-#define ARM_COMPUTE_NEEQUALIZEHISTOGRAM_H
-
-#include "arm_compute/core/NEON/kernels/NECumulativeDistributionKernel.h"
-#include "arm_compute/core/NEON/kernels/NEHistogramKernel.h"
-#include "arm_compute/core/NEON/kernels/NETableLookupKernel.h"
-#include "arm_compute/runtime/Distribution1D.h"
-#include "arm_compute/runtime/IFunction.h"
-#include "arm_compute/runtime/Lut.h"
-
-#include <cstdint>
-
-namespace arm_compute
-{
-class ITensor;
-using IImage = ITensor;
-
-/** Basic function to execute histogram equalization. This function calls the following NEON kernels:
- *
- * -# @ref NEHistogramKernel
- * -# @ref NECumulativeDistributionKernel
- * -# @ref NETableLookupKernel
- *
- */
-class NEEqualizeHistogram : public IFunction
-{
-public:
- /** Default Constructor. */
- NEEqualizeHistogram();
- /** Initialise the kernel's inputs.
- *
- * @note Currently the width of the input image must be a multiple of 16.
- *
- * @param[in] input Input image. Data type supported: U8.
- * @param[out] output Output image. Data type supported: same as @p input
- */
- void configure(const IImage *input, IImage *output);
-
- // Inherited methods overridden:
- void run() override;
-
-private:
- NEHistogramKernel _histogram_kernel; /**< Kernel that calculates the histogram of input. */
- NECumulativeDistributionKernel _cd_histogram_kernel; /**< Kernel that calculates the cumulative distribution
- and creates the relevant LookupTable. */
- NETableLookupKernel _map_histogram_kernel; /**< Kernel that maps the input to output using the lut. */
- Distribution1D _hist; /**< Distribution that holds the histogram of the input image. */
- Distribution1D _cum_dist; /**< Distribution that holds the cumulative distribution of the input histogram. */
- Lut _cd_lut; /**< Holds the equalization lookup table. */
- static constexpr uint32_t nr_bins{ 256 }; /**< Histogram bins of the internal histograms. */
- static constexpr uint32_t max_range{ nr_bins - 1 }; /**< Histogram range of the internal histograms. */
-};
-}
-#endif /*ARM_COMPUTE_NEEQUALIZEHISTOGRAM_H */
diff --git a/arm_compute/runtime/NEON/functions/NEErode.h b/arm_compute/runtime/NEON/functions/NEErode.h
deleted file mode 100644
index 1d6ea42e16..0000000000
--- a/arm_compute/runtime/NEON/functions/NEErode.h
+++ /dev/null
@@ -1,55 +0,0 @@
-/*
- * Copyright (c) 2016-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_NEERODE_H
-#define ARM_COMPUTE_NEERODE_H
-
-#include "arm_compute/core/Types.h"
-#include "arm_compute/runtime/NEON/INESimpleFunction.h"
-
-#include <cstdint>
-
-namespace arm_compute
-{
-class ITensor;
-
-/** Basic function to execute erode. This function calls the following NEON kernels:
- *
- * -# @ref NEFillBorderKernel (executed if border_mode == CONSTANT or border_mode == REPLICATE)
- * -# @ref NEErodeKernel
- *
- */
-class NEErode : public INESimpleFunction
-{
-public:
- /** Initialise the kernel's inputs, output and border mode
- *
- * @param[in, out] input First tensor input. Data type supported: U8. (Written to only for @p border_mode != UNDEFINED)
- * @param[out] output Output tensor. Data type supported: U8.
- * @param[in] border_mode Border mode to use for the convolution.
- * @param[in] constant_border_value (Optional) Constant value to use for borders if border_mode is set to CONSTANT.
- */
- void configure(ITensor *input, ITensor *output, BorderMode border_mode, uint8_t constant_border_value = 0);
-};
-}
-#endif /*ARM_COMPUTE_NEERODE_H */
diff --git a/arm_compute/runtime/NEON/functions/NEFFT1D.h b/arm_compute/runtime/NEON/functions/NEFFT1D.h
index c710b937b4..99c6fd4eb4 100644
--- a/arm_compute/runtime/NEON/functions/NEFFT1D.h
+++ b/arm_compute/runtime/NEON/functions/NEFFT1D.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019 ARM Limited.
+ * Copyright (c) 2019-2021 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -24,21 +24,22 @@
#ifndef ARM_COMPUTE_NEFFT1D_H
#define ARM_COMPUTE_NEFFT1D_H
-#include "arm_compute/core/NEON/kernels/NEFFTDigitReverseKernel.h"
-#include "arm_compute/core/NEON/kernels/NEFFTRadixStageKernel.h"
-#include "arm_compute/core/NEON/kernels/NEFFTScaleKernel.h"
-#include "arm_compute/runtime/IFunction.h"
-
#include "arm_compute/runtime/FunctionDescriptors.h"
+#include "arm_compute/runtime/IFunction.h"
#include "arm_compute/runtime/MemoryGroup.h"
#include "arm_compute/runtime/Tensor.h"
+#include <memory>
+
namespace arm_compute
{
// Forward declaration
class ITensor;
+class NEFFTDigitReverseKernel;
+class NEFFTRadixStageKernel;
+class NEFFTScaleKernel;
-/** Basic function to execute one dimensional FFT. This function calls the following NEON kernels:
+/** Basic function to execute one dimensional FFT. This function calls the following kernels:
*
* -# @ref NEFFTDigitReverseKernel Performs digit reverse
* -# @ref NEFFTRadixStageKernel A list of FFT kernels depending on the radix decomposition
@@ -49,8 +50,26 @@ class NEFFT1D : public IFunction
public:
/** Default Constructor */
NEFFT1D(std::shared_ptr<IMemoryManager> memory_manager = nullptr);
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ NEFFT1D(const NEFFT1D &) = delete;
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ NEFFT1D &operator=(const NEFFT1D &) = delete;
+ /** Prevent instances of this class from being moved (As this class contains non movable objects) */
+ NEFFT1D(NEFFT1D &&) = delete;
+ /** Prevent instances of this class from being moved (As this class contains non movable objects) */
+ NEFFT1D &operator=(NEFFT1D &&) = delete;
+ /** Default destructor */
+ ~NEFFT1D();
/** Initialise the function's source and destinations.
*
+ * Valid data layouts:
+ * - All
+ *
+ * Valid data type configurations:
+ * |src |dst |
+ * |:------|:------|
+ * |F32 |F32 |
+ *
* @param[in] input Source tensor. Data types supported: F32. Number of channels supported: 1 (real tensor) or 2 (complex tensor).
* @param[out] output Destination tensor. Data types and data layouts supported: Same as @p input.
 * Number of channels supported: 1 (real tensor) or 2 (complex tensor). If @p input is real, @p output must be complex.
@@ -71,15 +90,15 @@ public:
void run() override;
protected:
- MemoryGroup _memory_group;
- NEFFTDigitReverseKernel _digit_reverse_kernel;
- std::vector<NEFFTRadixStageKernel> _fft_kernels;
- NEFFTScaleKernel _scale_kernel;
- Tensor _digit_reversed_input;
- Tensor _digit_reverse_indices;
- unsigned int _num_ffts;
- unsigned int _axis;
- bool _run_scale;
+ MemoryGroup _memory_group;
+ std::unique_ptr<NEFFTDigitReverseKernel> _digit_reverse_kernel;
+ std::vector<std::unique_ptr<NEFFTRadixStageKernel>> _fft_kernels;
+ std::unique_ptr<NEFFTScaleKernel> _scale_kernel;
+ Tensor _digit_reversed_input;
+ Tensor _digit_reverse_indices;
+ unsigned int _num_ffts;
+ unsigned int _axis;
+ bool _run_scale;
};
} // namespace arm_compute
#endif /*ARM_COMPUTE_NEFFT1D_H */
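
For context (not part of the patch): a sketch of driving the reworked NEFFT1D. The FFT1DInfo fields (axis, direction) are assumed from FunctionDescriptors.h and the shapes are illustrative; a 1-channel (real) input pairs with a 2-channel (complex) output, as the parameter notes require.

    #include "arm_compute/runtime/NEON/NEFunctions.h"
    #include "arm_compute/runtime/Tensor.h"

    using namespace arm_compute;

    int main()
    {
        Tensor src, dst;
        src.allocator()->init(TensorInfo(TensorShape(128U), 1, DataType::F32)); // real input: 1 channel
        dst.allocator()->init(TensorInfo(TensorShape(128U), 2, DataType::F32)); // complex output: 2 channels

        FFT1DInfo fft_info;
        fft_info.axis      = 0;
        fft_info.direction = FFTDirection::Forward;

        NEFFT1D fft; // default memory manager
        fft.configure(&src, &dst, fft_info);

        src.allocator()->allocate();
        dst.allocator()->allocate();
        fft.run();
        return 0;
    }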
diff --git a/arm_compute/runtime/NEON/functions/NEFFT2D.h b/arm_compute/runtime/NEON/functions/NEFFT2D.h
index e25ebb9e80..cefd3df17a 100644
--- a/arm_compute/runtime/NEON/functions/NEFFT2D.h
+++ b/arm_compute/runtime/NEON/functions/NEFFT2D.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019 ARM Limited.
+ * Copyright (c) 2019-2021 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -24,9 +24,8 @@
#ifndef ARM_COMPUTE_NEFFT2D_H
#define ARM_COMPUTE_NEFFT2D_H
-#include "arm_compute/runtime/IFunction.h"
-
#include "arm_compute/runtime/FunctionDescriptors.h"
+#include "arm_compute/runtime/IFunction.h"
#include "arm_compute/runtime/MemoryGroup.h"
#include "arm_compute/runtime/NEON/functions/NEFFT1D.h"
#include "arm_compute/runtime/Tensor.h"
@@ -36,7 +35,7 @@ namespace arm_compute
// Forward declaration
class ITensor;
-/** Basic function to execute two dimensional FFT. This function calls the following NEON kernels:
+/** Basic function to execute two dimensional FFT. This function calls the following kernels:
*
* -# @ref NEFFT1D 1D FFT is performed on the first given axis
* -# @ref NEFFT1D 1D FFT is performed on the second given axis
@@ -46,8 +45,26 @@ class NEFFT2D : public IFunction
public:
/** Default Constructor */
NEFFT2D(std::shared_ptr<IMemoryManager> memory_manager = nullptr);
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ NEFFT2D(const NEFFT2D &) = delete;
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ NEFFT2D &operator=(const NEFFT2D &) = delete;
+ /** Prevent instances of this class from being moved (As this class contains non movable objects) */
+ NEFFT2D(NEFFT2D &&) = delete;
+ /** Prevent instances of this class from being moved (As this class contains non movable objects) */
+ NEFFT2D &operator=(NEFFT2D &&) = delete;
+ /** Default destructor */
+ ~NEFFT2D();
/** Initialise the function's source and destinations
*
+ * Valid data layouts:
+ * - All
+ *
+ * Valid data type configurations:
+ * |src |dst |
+ * |:------|:------|
+ * |F32 |F32 |
+ *
* @param[in] input Source tensor. Data types supported: F32.
* @param[out] output Destination tensor. Data types and data layouts supported: Same as @p input.
* @param[in] config FFT related configuration
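
The 2D variant is driven the same way; a fragment assuming an FFT2DInfo descriptor with axis0/axis1/direction fields in FunctionDescriptors.h, with src/dst F32 tensors declared as in the NEFFT1D sketch above.

    FFT2DInfo fft2d_info;
    fft2d_info.axis0     = 0; // transformed by the first internal NEFFT1D pass
    fft2d_info.axis1     = 1; // transformed by the second pass
    fft2d_info.direction = FFTDirection::Forward;

    NEFFT2D fft2d;
    fft2d.configure(&src, &dst, fft2d_info);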
diff --git a/arm_compute/runtime/NEON/functions/NEFFTConvolutionLayer.h b/arm_compute/runtime/NEON/functions/NEFFTConvolutionLayer.h
index 23df459e04..84bfe6b02f 100644
--- a/arm_compute/runtime/NEON/functions/NEFFTConvolutionLayer.h
+++ b/arm_compute/runtime/NEON/functions/NEFFTConvolutionLayer.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019 ARM Limited.
+ * Copyright (c) 2019-2021 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -24,9 +24,8 @@
#ifndef ARM_COMPUTE_NEFFTCONVOLUTIONLAYER_H
#define ARM_COMPUTE_NEFFTCONVOLUTIONLAYER_H
-#include "arm_compute/runtime/IFunction.h"
-
#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/IFunction.h"
#include "arm_compute/runtime/NEON/functions/NEActivationLayer.h"
#include "arm_compute/runtime/NEON/functions/NEArithmeticAddition.h"
#include "arm_compute/runtime/NEON/functions/NEFFT2D.h"
@@ -43,7 +42,7 @@ namespace arm_compute
// Forward declarations
class ITensor;
-/** Basic function to execute FFT-based convolution on NEON. This function calls the following NEON functions/kernels:
+/** Basic function to execute FFT-based convolution on CPU. This function calls the following functions/kernels:
*
 * -# @ref NEPermute Permute input if NHWC (only NCHW is supported).
* -# @ref NEPadLayer Pad input.
@@ -63,46 +62,68 @@ public:
NEFFTConvolutionLayer(std::shared_ptr<IMemoryManager> memory_manager = nullptr);
/** Prevent instances of this class from being copied (As this class contains pointers) */
NEFFTConvolutionLayer(const NEFFTConvolutionLayer &) = delete;
- /** Default move constructor */
- NEFFTConvolutionLayer(NEFFTConvolutionLayer &&) = default;
+ /** Prevent instances of this class from being moved (As this class contains non movable objects) */
+ NEFFTConvolutionLayer(NEFFTConvolutionLayer &&) = delete;
/** Prevent instances of this class from being copied (As this class contains pointers) */
NEFFTConvolutionLayer &operator=(const NEFFTConvolutionLayer &) = delete;
- /** Default move assignment operator */
- NEFFTConvolutionLayer &operator=(NEFFTConvolutionLayer &&) = default;
+ /** Prevent instances of this class from being moved (As this class contains non movable objects) */
+ NEFFTConvolutionLayer &operator=(NEFFTConvolutionLayer &&) = delete;
+ /** Default destructor */
+ ~NEFFTConvolutionLayer();
/** Set the input and output tensors.
*
+ * Valid data layouts:
+ * - All
+ *
+ * Valid data type configurations:
+ * |src |dst |
+ * |:------|:------|
+ * |F32 |F32 |
+ *
 * @note: This function only works with square kernel sizes and unit strides for both NCHW and NHWC data layouts
*
- * @param[in] input Source tensor. 3 lower dimensions represent a single input [width, height, IFM],
- * while every optional dimension from 4 and above represent a batch of inputs.
- * Data types supported: F32.
- * @param[in] weights Weights tensor. Weights are 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM]. Data type supported:Same as @p input.
- * @param[in] biases Biases tensor. Shared biases supported. Biases are 1D tensor with dimensions [OFM].Data type supported: Same as @p input
- * @param[out] output Destination tensor. 3 lower dimensions represent a single output [width, height, OFM], while the rest represent batch of outputs.
- * Data types supported: Same as @p input.
- * @param[in] conv_info Contains padding and stride information described in @ref PadStrideInfo.
- * @param[in] act_info (Optional) Activation layer information in case of a fused activation.
+ * @param[in] input Source tensor. 3 lower dimensions represent a single input [width, height, IFM],
+ * while every optional dimension from 4 and above represent a batch of inputs.
+ * Data types supported: F32.
+     * @param[in]  weights          Weights tensor. Weights are 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM]. Data type supported: Same as @p input.
+     * @param[in]  biases           Biases tensor. Shared biases supported. Biases are 1D tensor with dimensions [OFM]. Data type supported: Same as @p input.
+ * @param[out] output Destination tensor. 3 lower dimensions represent a single output [width, height, OFM], while the rest represent batch of outputs.
+ * Data types supported: Same as @p input.
+ * @param[in] conv_info Contains padding and stride information described in @ref PadStrideInfo.
+ * @param[in] act_info (Optional) Activation layer information in case of a fused activation.
+ * @param[in] enable_fast_math (Optional) Enable fast math computation. Unused for CPU backend.
*/
- void configure(ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, const PadStrideInfo &conv_info,
- const ActivationLayerInfo &act_info = ActivationLayerInfo());
+ void configure(ITensor *input,
+ const ITensor *weights,
+ const ITensor *biases,
+ ITensor *output,
+ const PadStrideInfo &conv_info,
+ const ActivationLayerInfo &act_info = ActivationLayerInfo(),
+ bool enable_fast_math = false);
/** Static function to check if given info will lead to a valid configuration of @ref NEFFTConvolutionLayer
*
 * @note: This function only works with square kernel sizes and unit strides for both NCHW and NHWC data layouts
*
- * @param[in] input Source tensor. 3 lower dimensions represent a single input [width, height, IFM],
- * while every optional dimension from 4 and above represent a batch of inputs.
- * Data types supported: F32.
- * @param[in] weights Weights tensor. Weights are 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM]. Data type supported:Same as @p input.
- * @param[in] biases Biases tensor. Shared biases supported. Biases are 1D tensor with dimensions [OFM].Data type supported: Same as @p input
- * @param[in] output Destination tensor. 3 lower dimensions represent a single output [width, height, OFM], while the rest represent batch of outputs.
- * Data types supported: Same as @p input.
- * @param[in] conv_info Contains padding and stride information described in @ref PadStrideInfo.
- * @param[in] act_info (Optional) Activation layer information in case of a fused activation.
+ * @param[in] input Source tensor. 3 lower dimensions represent a single input [width, height, IFM],
+ * while every optional dimension from 4 and above represent a batch of inputs.
+ * Data types supported: F32.
+     * @param[in]  weights          Weights tensor. Weights are 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM]. Data type supported: Same as @p input.
+     * @param[in]  biases           Biases tensor. Shared biases supported. Biases are 1D tensor with dimensions [OFM]. Data type supported: Same as @p input.
+ * @param[in] output Destination tensor. 3 lower dimensions represent a single output [width, height, OFM], while the rest represent batch of outputs.
+ * Data types supported: Same as @p input.
+ * @param[in] conv_info Contains padding and stride information described in @ref PadStrideInfo.
+ * @param[in] act_info (Optional) Activation layer information in case of a fused activation.
+ * @param[in] enable_fast_math (Optional) Enable fast math computation. Unused for CPU backend.
*
* @return a status
*/
- static Status validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info,
- const ActivationLayerInfo &act_info = ActivationLayerInfo());
+ static Status validate(const ITensorInfo *input,
+ const ITensorInfo *weights,
+ const ITensorInfo *biases,
+ const ITensorInfo *output,
+ const PadStrideInfo &conv_info,
+ const ActivationLayerInfo &act_info = ActivationLayerInfo(),
+ bool enable_fast_math = false);
// Inherited methods overridden:
void run() override;
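
For context (not part of the patch): a sketch of the extended configure() call, including the new enable_fast_math flag (a no-op on CPU, per the parameter note). Shapes are illustrative; a 5x5 kernel with stride 1 and padding 2 satisfies the square-kernel/unit-stride restriction.

    #include "arm_compute/runtime/NEON/NEFunctions.h"
    #include "arm_compute/runtime/Tensor.h"

    using namespace arm_compute;

    int main()
    {
        Tensor src, weights, biases, dst;
        src.allocator()->init(TensorInfo(TensorShape(64U, 64U, 3U), 1, DataType::F32));        // [W, H, IFM]
        weights.allocator()->init(TensorInfo(TensorShape(5U, 5U, 3U, 16U), 1, DataType::F32)); // [kx, ky, IFM, OFM]
        biases.allocator()->init(TensorInfo(TensorShape(16U), 1, DataType::F32));              // [OFM]
        dst.allocator()->init(TensorInfo(TensorShape(64U, 64U, 16U), 1, DataType::F32));       // [W, H, OFM]

        NEFFTConvolutionLayer conv;
        // stride 1, pad 2: output spatial size matches the input for a 5x5 kernel
        conv.configure(&src, &weights, &biases, &dst, PadStrideInfo(1, 1, 2, 2),
                       ActivationLayerInfo(), /* enable_fast_math */ false);

        src.allocator()->allocate();
        weights.allocator()->allocate();
        biases.allocator()->allocate();
        dst.allocator()->allocate();
        conv.run();
        return 0;
    }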
diff --git a/arm_compute/runtime/NEON/functions/NEFastCorners.h b/arm_compute/runtime/NEON/functions/NEFastCorners.h
deleted file mode 100644
index e2decb177b..0000000000
--- a/arm_compute/runtime/NEON/functions/NEFastCorners.h
+++ /dev/null
@@ -1,84 +0,0 @@
-/*
- * Copyright (c) 2016-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_NEFASTCORNERS_H
-#define ARM_COMPUTE_NEFASTCORNERS_H
-
-#include "arm_compute/core/NEON/kernels/NEFastCornersKernel.h"
-#include "arm_compute/core/NEON/kernels/NEFillArrayKernel.h"
-#include "arm_compute/core/NEON/kernels/NEFillBorderKernel.h"
-#include "arm_compute/core/NEON/kernels/NENonMaximaSuppression3x3Kernel.h"
-#include "arm_compute/core/Types.h"
-#include "arm_compute/runtime/Array.h"
-#include "arm_compute/runtime/IFunction.h"
-#include "arm_compute/runtime/IMemoryManager.h"
-#include "arm_compute/runtime/MemoryGroup.h"
-#include "arm_compute/runtime/Tensor.h"
-
-#include <cstdint>
-#include <memory>
-
-namespace arm_compute
-{
-class ITensor;
-using IImage = ITensor;
-
-/** Basic function to execute fast corners. This function calls the following NEON kernels:
- *
- * -# @ref NEFastCornersKernel
- * -# @ref NENonMaximaSuppression3x3Kernel (executed if nonmax_suppression == true)
- * -# @ref NEFillArrayKernel
- *
- */
-class NEFastCorners : public IFunction
-{
-public:
- /** Constructor */
- NEFastCorners(std::shared_ptr<IMemoryManager> memory_manager = nullptr);
- /** Initialize the function's source, destination, conv and border_mode.
- *
- * @param[in, out] input Source image. Data type supported: U8. (Written to only for @p border_mode != UNDEFINED)
- * @param[in] threshold Threshold on difference between intensity of the central pixel and pixels on Bresenham's circle of radius 3.
- * @param[in] nonmax_suppression If true, non-maximum suppression is applied to detected corners before being placed in the array.
- * @param[out] corners Array of keypoints to store the results.
- * @param[in] border_mode Strategy to use for borders.
- * @param[in] constant_border_value (Optional) Constant value to use for borders if border_mode is set to CONSTANT.
- */
- void configure(IImage *input, float threshold, bool nonmax_suppression, KeyPointArray *corners,
- BorderMode border_mode, uint8_t constant_border_value = 0);
-
- // Inherited methods overridden:
- void run() override;
-
-private:
- MemoryGroup _memory_group;
- NEFastCornersKernel _fast_corners_kernel;
- NEFillBorderKernel _border_handler;
- NENonMaximaSuppression3x3Kernel _nonmax_kernel;
- NEFillArrayKernel _fill_kernel;
- Image _output;
- Image _suppressed;
- bool _non_max;
-};
-}
-#endif /*ARM_COMPUTE_NEFASTCORNERS_H */
diff --git a/arm_compute/runtime/NEON/functions/NEFill.h b/arm_compute/runtime/NEON/functions/NEFill.h
index f8a15078c3..1829c71fef 100644
--- a/arm_compute/runtime/NEON/functions/NEFill.h
+++ b/arm_compute/runtime/NEON/functions/NEFill.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019 ARM Limited.
+ * Copyright (c) 2019-2021 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -24,25 +24,53 @@
#ifndef ARM_COMPUTE_NEFILL_H
#define ARM_COMPUTE_NEFILL_H
-#include "arm_compute/core/NEON/kernels/NEMemsetKernel.h"
#include "arm_compute/core/PixelValue.h"
#include "arm_compute/core/Types.h"
-#include "arm_compute/runtime/NEON/INESimpleFunctionNoBorder.h"
+#include "arm_compute/runtime/IFunction.h"
+
+#include <memory>
namespace arm_compute
{
class ITensor;
-/** Basic function to run @ref NEMemsetKernel */
-class NEFill : public INESimpleFunctionNoBorder
+/** Basic function to run @ref cpu::kernels::CpuFillKernel */
+class NEFill : public IFunction
{
public:
+ /** Default Constructor */
+ NEFill();
+ /** Default Destructor */
+ ~NEFill();
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ NEFill(const NEFill &) = delete;
+ /** Default move constructor */
+ NEFill(NEFill &&);
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ NEFill &operator=(const NEFill &) = delete;
+ /** Default move assignment operator */
+ NEFill &operator=(NEFill &&);
/** Initialize the function
*
+ * Valid data layouts:
+ * - All
+ *
+ * Valid data type configurations:
+ * |src |dst |
+ * |:------|:------|
+ * |All |All |
+ *
* @param[in,out] tensor Source tensor. Data types supported: All
* @param[in] constant_value Constant value to use to fill tensor.
*/
void configure(ITensor *tensor, PixelValue constant_value);
+
+ // Inherited methods overridden
+ void run() override;
+
+private:
+ struct Impl;
+ std::unique_ptr<Impl> _impl;
};
} // namespace arm_compute
#endif /*ARM_COMPUTE_NEFILL_H */
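
For context (not part of the patch): a minimal usage sketch of the reworked NEFill, following the configure() signature above; the shape and fill value are illustrative, and PixelValue's float overload is assumed.

    #include "arm_compute/runtime/NEON/NEFunctions.h"
    #include "arm_compute/runtime/Tensor.h"

    using namespace arm_compute;

    int main()
    {
        Tensor t;
        t.allocator()->init(TensorInfo(TensorShape(16U, 16U), 1, DataType::F32));

        NEFill fill;
        fill.configure(&t, PixelValue(1.0f)); // every element becomes 1.0f

        t.allocator()->allocate();
        fill.run();
        return 0;
    }

The private Impl struct introduced here (and in the sibling functions below) is the pImpl idiom: it moves the kernel type behind the source file, which is what lets the kernel include be dropped from this public header.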
diff --git a/arm_compute/runtime/NEON/functions/NEFillBorder.h b/arm_compute/runtime/NEON/functions/NEFillBorder.h
index 0ae04cbf00..44b1d4a62b 100644
--- a/arm_compute/runtime/NEON/functions/NEFillBorder.h
+++ b/arm_compute/runtime/NEON/functions/NEFillBorder.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2016-2019 ARM Limited.
+ * Copyright (c) 2016-2021 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -24,22 +24,33 @@
#ifndef ARM_COMPUTE_NEFILLBORDER_H
#define ARM_COMPUTE_NEFILLBORDER_H
-#include "arm_compute/core/NEON/kernels/NEFillBorderKernel.h"
#include "arm_compute/core/PixelValue.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/runtime/IFunction.h"
+#include <memory>
+
namespace arm_compute
{
// Forward declaration
class ITensor;
+class NEFillBorderKernel;
/** Basic function to run @ref NEFillBorderKernel */
class NEFillBorder : public IFunction
{
public:
+ NEFillBorder();
/** Initialize the function's source, destination and border_mode.
*
+ * Valid data layouts:
+ * - All
+ *
+ * Valid data type configurations:
+ * |src |dst |
+ * |:--------------|:--------------|
+ * |All |All |
+ *
* @note This function fills the borders within the XY-planes.
*
* @param[in, out] input Source tensor. Data type supported: All
@@ -47,13 +58,16 @@ public:
* @param[in] border_mode Strategy to use for borders.
* @param[in] constant_border_value (Optional) Constant value to use for borders if border_mode is set to CONSTANT.
*/
- void configure(ITensor *input, unsigned int border_width, BorderMode border_mode, const PixelValue &constant_border_value = PixelValue());
+ void configure(ITensor *input,
+ unsigned int border_width,
+ BorderMode border_mode,
+ const PixelValue &constant_border_value = PixelValue());
// Inherited methods overridden:
void run() override;
private:
- NEFillBorderKernel _border_handler; /**< Kernel to handle image borders */
+ std::unique_ptr<NEFillBorderKernel> _border_handler; /**< Kernel to handle image borders */
};
} // namespace arm_compute
#endif /*ARM_COMPUTE_NEFILLBORDER_H */
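
For context (not part of the patch): a sketch of the border fill. extend_padding() is assumed here as one way to reserve room for the border before allocation; the two-pixel constant border is illustrative.

    #include "arm_compute/runtime/NEON/NEFunctions.h"
    #include "arm_compute/runtime/Tensor.h"

    using namespace arm_compute;

    int main()
    {
        Tensor src;
        src.allocator()->init(TensorInfo(TensorShape(16U, 16U), 1, DataType::U8));
        src.info()->extend_padding(PaddingSize(2)); // reserve room for the border pixels

        NEFillBorder fill_border;
        fill_border.configure(&src, 2U, BorderMode::CONSTANT, PixelValue(static_cast<uint8_t>(0)));

        src.allocator()->allocate();
        fill_border.run(); // writes the constant value into the XY-plane borders
        return 0;
    }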
diff --git a/arm_compute/runtime/NEON/functions/NEFlattenLayer.h b/arm_compute/runtime/NEON/functions/NEFlattenLayer.h
index 7b4801cd1c..3e92143824 100644
--- a/arm_compute/runtime/NEON/functions/NEFlattenLayer.h
+++ b/arm_compute/runtime/NEON/functions/NEFlattenLayer.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2021 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -25,18 +25,40 @@
#define ARM_COMPUTE_NEFLATTENLAYER_H
#include "arm_compute/core/Types.h"
-#include "arm_compute/runtime/NEON/INESimpleFunctionNoBorder.h"
+#include "arm_compute/runtime/IFunction.h"
+
+#include <memory>
namespace arm_compute
{
class ITensor;
+class ITensorInfo;
/** Basic function to execute flatten layer kernel. */
-class NEFlattenLayer : public INESimpleFunctionNoBorder
+class NEFlattenLayer : public IFunction
{
public:
+ NEFlattenLayer();
+ /** Destructor */
+ ~NEFlattenLayer();
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ NEFlattenLayer(const NEFlattenLayer &) = delete;
+ /** Default move constructor */
+ NEFlattenLayer(NEFlattenLayer &&);
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ NEFlattenLayer &operator=(const NEFlattenLayer &) = delete;
+ /** Default move assignment operator */
+ NEFlattenLayer &operator=(NEFlattenLayer &&);
/** Initialise the kernel's input and output.
*
+ * Valid data layouts:
+ * - All
+ *
+ * Valid data type configurations:
+ * |src |dst |
+ * |:--------------|:--------------|
+ * |All |All |
+ *
* @param[in] input First input tensor to flatten with at least 3 dimensions. The dimensions over the third will be interpreted as batches. Data types supported: All
* @param[out] output Output tensor with shape [w*h*d, input_batches] where:
 * w = width of the input tensor, h = height of the input tensor and d = depth of the input tensor. Data type supported: same as @p input
@@ -53,6 +75,13 @@ public:
* @return a status
*/
static Status validate(const ITensorInfo *input, const ITensorInfo *output);
+
+ // Inherited methods overridden:
+ void run() override;
+
+private:
+ struct Impl;
+ std::unique_ptr<Impl> _impl;
};
} // namespace arm_compute
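
For context (not part of the patch): a sketch of the flatten call with the documented shape relationship made concrete; with w=4, h=4, d=3 and 2 batches the output is [w*h*d, batches] = [48, 2]. Shapes are illustrative.

    #include "arm_compute/runtime/NEON/NEFunctions.h"
    #include "arm_compute/runtime/Tensor.h"

    using namespace arm_compute;

    int main()
    {
        Tensor src, dst;
        src.allocator()->init(TensorInfo(TensorShape(4U, 4U, 3U, 2U), 1, DataType::F32)); // [w, h, d, batches]
        dst.allocator()->init(TensorInfo(TensorShape(48U, 2U), 1, DataType::F32));        // [w*h*d, batches]

        NEFlattenLayer flatten;
        flatten.configure(&src, &dst);

        src.allocator()->allocate();
        dst.allocator()->allocate();
        flatten.run();
        return 0;
    }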
diff --git a/arm_compute/runtime/NEON/functions/NEFloor.h b/arm_compute/runtime/NEON/functions/NEFloor.h
index a11907ba6c..77ac484bab 100644
--- a/arm_compute/runtime/NEON/functions/NEFloor.h
+++ b/arm_compute/runtime/NEON/functions/NEFloor.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2021 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -24,20 +24,44 @@
#ifndef ARM_COMPUTE_NEFLOOR_H
#define ARM_COMPUTE_NEFLOOR_H
-#include "arm_compute/runtime/NEON/INESimpleFunctionNoBorder.h"
-
#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/IFunction.h"
+
+#include <memory>
namespace arm_compute
{
+// Forward declarations
class ITensor;
+class ITensorInfo;
-/** Basic function to run @ref NEFloorKernel */
-class NEFloor : public INESimpleFunctionNoBorder
+/** Basic function to run @ref cpu::kernels::CpuFloorKernel */
+class NEFloor : public IFunction
{
public:
+ /** Constructor */
+ NEFloor();
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ NEFloor(const NEFloor &) = delete;
+ /** Default move constructor */
+ NEFloor(NEFloor &&);
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ NEFloor &operator=(const NEFloor &) = delete;
+ /** Default move assignment operator */
+ NEFloor &operator=(NEFloor &&);
+ /** Destructor */
+ ~NEFloor();
/** Set the source, destination of the kernel
*
+ * Valid data layouts:
+ * - All
+ *
+ * Valid data type configurations:
+ * |src |dst |
+ * |:------|:------|
+ * |F32 |F32 |
+ * |F16 |F16 |
+ *
* @param[in] input Source tensor. Data type supported: F16/F32.
* @param[out] output Destination tensor. Same as @p input
*/
@@ -50,6 +74,13 @@ public:
* @return a status
*/
static Status validate(const ITensorInfo *input, const ITensorInfo *output);
+
+ // Inherited methods overridden
+ void run() override;
+
+private:
+ struct Impl;
+ std::unique_ptr<Impl> _impl;
};
} // namespace arm_compute
#endif /* ARM_COMPUTE_NEFLOOR_H */
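
For context (not part of the patch): a sketch of the validate-then-configure pattern on the reworked NEFloor, using the F32 configuration from the table above.

    #include "arm_compute/runtime/NEON/NEFunctions.h"
    #include "arm_compute/runtime/Tensor.h"

    using namespace arm_compute;

    int main()
    {
        Tensor src, dst;
        src.allocator()->init(TensorInfo(TensorShape(32U), 1, DataType::F32));
        dst.allocator()->init(TensorInfo(TensorShape(32U), 1, DataType::F32));

        NEFloor floor_fn;
        if(NEFloor::validate(src.info(), dst.info()).error_code() == ErrorCode::OK)
        {
            floor_fn.configure(&src, &dst);
            src.allocator()->allocate();
            dst.allocator()->allocate();
            floor_fn.run(); // dst[i] = floor(src[i])
        }
        return 0;
    }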
diff --git a/arm_compute/runtime/NEON/functions/NEFullyConnectedLayer.h b/arm_compute/runtime/NEON/functions/NEFullyConnectedLayer.h
index b14650c0e9..885f8430cf 100644
--- a/arm_compute/runtime/NEON/functions/NEFullyConnectedLayer.h
+++ b/arm_compute/runtime/NEON/functions/NEFullyConnectedLayer.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2017-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -24,46 +24,20 @@
#ifndef ARM_COMPUTE_NEFULLYCONNECTEDLAYER_H
#define ARM_COMPUTE_NEFULLYCONNECTEDLAYER_H
+#include "arm_compute/function_info/FullyConnectedLayerInfo.h"
#include "arm_compute/runtime/IFunction.h"
-
-#include "arm_compute/core/NEON/kernels/NEFlattenLayerKernel.h"
-#include "arm_compute/core/NEON/kernels/NETransposeKernel.h"
-#include "arm_compute/runtime/MemoryGroup.h"
-#include "arm_compute/runtime/NEON/functions/NEConvertFullyConnectedWeights.h"
-#include "arm_compute/runtime/NEON/functions/NEGEMM.h"
-#include "arm_compute/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.h"
+#include "arm_compute/runtime/IMemoryManager.h"
+#include "arm_compute/runtime/IWeightsManager.h"
+#include "arm_compute/runtime/NEON/functions/NETranspose.h"
#include "arm_compute/runtime/Tensor.h"
+#include <memory>
+
namespace arm_compute
{
-/** Basic function to reshape the weights of Fully Connected layer with NEON. This function calls the following kernels:
- *
- * -# @ref NETransposeKernel
- *
- * @note The fully connected layer accepts "weights" tensors only with 2 dimensions.
- */
-class NEFullyConnectedLayerReshapeWeights : public INESimpleFunctionNoBorder
-{
-public:
- /** Set the input and output tensors.
- *
- * @param[in] input Weights tensor. The weights must be 2 dimensional. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32.
- * @param[out] output Destination tensor. Data type supported: Same as @p input.
- */
- void configure(const ITensor *input, ITensor *output);
- /** Static function to check if given info will lead to a valid configuration of @ref NEFullyConnectedLayerReshapeWeights
- *
- * @param[in] input Weights tensor info. The weights must be 2 dimensional. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32.
- * @param[in] output Destination tensor info. Data type supported: Same as @p input.
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *input, const ITensorInfo *output);
-};
-
namespace weights_transformations
{
-/** Basic function to manage the reshape weights generated from @ref NEFullyConnectedLayerReshapeWeights */
+/** Basic function to manage the reshape weights generated from @ref NETranspose */
class NEFullyConnectedLayerReshapeWeightsManaged : public ITransformWeights
{
public:
@@ -95,17 +69,17 @@ public:
}
private:
- static constexpr uint32_t _uid = 0x0;
- Tensor _output{};
- NEFullyConnectedLayerReshapeWeights _func{};
+ static constexpr uint32_t _uid = 0x0;
+ Tensor _output{};
+ NETranspose _func{};
};
} // namespace weights_transformations
-/** Basic function to compute a Fully Connected layer on NEON. This function calls the following NEON kernels:
- * -# @ref NEIm2ColKernel (called when the input comes from a convolutional layer)
- * -# @ref NEFullyConnectedLayerReshapeWeights (if @p are_weights_reshaped is set to false and transpose_weights is set to true ) (called once)
- * -# @ref NEGEMMMatrixMultiplyKernel or @ref NEGEMMLowpMatrixMultiplyCore (if quantized asymmetric)
- * -# @ref NEGEMMMatrixAdditionKernel or @ref NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPoint (if quantized asymmetric) (if @p biases is not equal to nullptr)
+/** Basic function to compute a Fully Connected layer. This function calls the following kernels:
+ * -# @ref cpu::kernels::CpuIm2ColKernel (called when the input comes from a convolutional layer)
+ * -# @ref NETranspose (if @p are_weights_reshaped is set to false and transpose_weights is set to true) (called once)
+ * -# @ref NEGEMM or @ref NEGEMMLowpMatrixMultiplyCore (if quantized asymmetric)
+ * -# @ref cpu::kernels::CpuGemmMatrixAdditionKernel or @ref NEGEMMLowpOutputStage (if quantized asymmetric) (if @p biases is not equal to nullptr)
*
* @note The fully connected layer accepts "weights" tensors only with 2 dimensions.
*/
@@ -113,77 +87,91 @@ class NEFullyConnectedLayer : public IFunction
{
public:
/** Constructor */
- NEFullyConnectedLayer(std::shared_ptr<IMemoryManager> memory_manager = nullptr, IWeightsManager *weights_manager = nullptr);
+ NEFullyConnectedLayer(std::shared_ptr<IMemoryManager> memory_manager = nullptr,
+ IWeightsManager *weights_manager = nullptr);
/** Prevent instances of this class from being copied (As this class contains pointers) */
NEFullyConnectedLayer(const NEFullyConnectedLayer &) = delete;
- /** Default move constructor */
- NEFullyConnectedLayer(NEFullyConnectedLayer &&) = default;
+ /** Prevent instances of this class from being moved (As this class contains pointers) */
+ NEFullyConnectedLayer(NEFullyConnectedLayer &&) = delete;
/** Prevent instances of this class from being copied (As this class contains pointers) */
NEFullyConnectedLayer &operator=(const NEFullyConnectedLayer &) = delete;
- /** Default move assignment operator */
- NEFullyConnectedLayer &operator=(NEFullyConnectedLayer &&) = default;
+ /** Prevent instances of this class from being moved (As this class contains pointers) */
+ NEFullyConnectedLayer &operator=(NEFullyConnectedLayer &&) = delete;
+ /** Default destructor */
+ ~NEFullyConnectedLayer();
/** Set the input and output tensors.
*
- * @param[in] input Source tensor. Data type supported: QASYMM8/QASYMM8_SIGNED/F16/F32.
- * @param[in] weights Weights tensor. The weights must be 2 dimensional.
- * If this function is called after a Convolution Layer, the (transposed) weights will have as many rows as the product of the first 3 input's dimensions.
- * If it is called after another FullyConnected Layer, the (transposed) weights will have as many rows as the input's first dimension.
- * Data type supported: Same as @p input.
- * @param[in] biases Bias tensor. Can be nullptr. Data type supported: Same as @p weights, S32 if @p weights is QASYMM8/QASYMM8_SIGNED.
- * @param[out] output Destination tensor. Its shape should be equal to the output of a matrix multiplication between:
- * - The output of im2col on the input and the (transposed) 2D weights, if the function is called after a Convolution Layer
- * - The input tensor and the (transposed) 2D weights, if the function is called after another FullyConnected Layer.
- * Data type supported: Same as @p input.
- * @param[in] fc_info (Optional) Fully connected layer additional info
+ * Valid data layouts:
+ * - NHWC
+ * - NCHW
+ *
+ * Valid data type configurations:
+ * |src0 |src1 |src2 |dst |
+ * |:--------------|:------------------|:------|:--------------|
+ * |F16 |F16 |F16 |F16 |
+ * |F32 |F32 |F32 |F32 |
+ * |QASYMM8 |QASYMM8 |S32 |QASYMM8 |
+ * |QASYMM8_SIGNED |QASYMM8_SIGNED |S32 |QASYMM8_SIGNED |
+ *
+ * @param[in] input Source tensor. Data type supported: QASYMM8/QASYMM8_SIGNED/F16/F32.
+ * @param[in] weights Weights tensor. The weights must be 2 dimensional.
+     *                     If this function is called after a Convolution Layer, the (transposed) weights will have as many rows as the product of the input's first 3 dimensions.
+ * If it is called after another FullyConnected Layer, the (transposed) weights will have as many rows as the input's first dimension.
+ * Data type supported: Same as @p input.
+ * @param[in] biases Bias tensor. Can be nullptr. Data type supported: Same as @p weights, S32 if @p weights is QASYMM8/QASYMM8_SIGNED.
+ * @param[out] output Destination tensor. Its shape should be equal to the output of a matrix multiplication between:
+ * - The output of im2col on the input and the (transposed) 2D weights, if the function is called after a Convolution Layer
+ * - The input tensor and the (transposed) 2D weights, if the function is called after another FullyConnected Layer.
+ * Data type supported: Same as @p input.
+ * @param[in] fc_info (Optional) Fully connected layer additional info
+     * @param[in]  weights_info (Optional) Stores necessary compute information when weights are already reshaped
*/
- void configure(const ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output,
- FullyConnectedLayerInfo fc_info = FullyConnectedLayerInfo());
+ void configure(const ITensor *input,
+ const ITensor *weights,
+ const ITensor *biases,
+ ITensor *output,
+ FullyConnectedLayerInfo fc_info = FullyConnectedLayerInfo(),
+ const WeightsInfo &weights_info = WeightsInfo());
/** Static function to check if given info will lead to a valid configuration of @ref NEFullyConnectedLayer
*
- * @param[in] input Source tensor info. Data type supported: QASYMM8/QASYMM8_SIGNED/F16/F32.
- * @param[in] weights Weights tensor info. The weights must be 2 dimensional.
- * If this function is called after a Convolution Layer, the (transposed) weights will have as many rows as the product of the first 3 input's dimensions.
- * If it is called after another FullyConnected Layer, the (transposed) weights will have as many rows as the input's first dimension.
- * Data type supported: Same as @p input.
- * @param[in] biases Bias tensor. Can be nullptr. Data type supported: Same as @p weights, S32 if @p weights is QASYMM8/QASYMM8_SIGNED.
- * @param[in] output Destination tensor info. Its shape should be equal to the output of a matrix multiplication between:
- * - The output of im2col on the input and the (transposed) 2D weights, if the function is called after a Convolution Layer
- * - The input tensor and the (transposed) 2D weights, if the function is called after another FullyConnected Layer.
- * Data type supported: Same as @p input.
- * @param[in] fc_info (Optional) Fully connected layer additional info
+ * Similar to @ref NEFullyConnectedLayer::configure()
*
* @return a status
*/
- static Status validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output,
- FullyConnectedLayerInfo fc_info = FullyConnectedLayerInfo());
+ static Status validate(const ITensorInfo *input,
+ const ITensorInfo *weights,
+ const ITensorInfo *biases,
+ const ITensorInfo *output,
+ FullyConnectedLayerInfo fc_info = FullyConnectedLayerInfo(),
+ const WeightsInfo &weights_info = WeightsInfo());
+
+ /** Static function that queries whether fixed-format kernel exists for a given problem description
+ *
+     * @param[out] expected_weight_format Format in which the weights should be stored for the fixed-format kernel that was found
+ * @param[in] input Source tensor
+ * @param[in] weights Weights tensor.
+ * @param[in] biases Bias tensor. Can be nullptr. Data type supported: Same as @p weights, S32 if @p weights is QASYMM8/QASYMM8_SIGNED.
+ * @param[in] output Destination tensor
+ * @param[in] fc_info Fully connected layer additional info
+ * @param[in] weights_info Describes weights shape
+ *
+ * @return a status
+ */
+ static Status has_opt_impl(arm_compute::WeightFormat &expected_weight_format,
+ const ITensorInfo *input,
+ const ITensorInfo *weights,
+ const ITensorInfo *biases,
+ const ITensorInfo *output,
+ const FullyConnectedLayerInfo &fc_info,
+ const WeightsInfo &weights_info);
 // Inherited methods overridden:
void run() override;
void prepare() override;
private:
- void configure_fc_fc(const ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, const ActivationLayerInfo &act);
- void configure_conv_fc(const ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, const ActivationLayerInfo &act);
- void configure_mm(const ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, const ActivationLayerInfo &act);
-
- MemoryGroup _memory_group;
- IWeightsManager *_weights_manager;
- NEFlattenLayerKernel _flatten_kernel;
- NEConvertFullyConnectedWeights _convert_weights;
- weights_transformations::NEConvertFullyConnectedWeightsManaged _convert_weights_managed;
- NEFullyConnectedLayerReshapeWeights _reshape_weights_function;
- weights_transformations::NEFullyConnectedLayerReshapeWeightsManaged _reshape_weights_managed_function;
- NEGEMM _mm_gemm;
- NEGEMMLowpMatrixMultiplyCore _mm_gemmlowp;
- Tensor _flatten_output;
- Tensor _converted_weights_output;
- Tensor _reshape_weights_output;
- const ITensor *_original_weights;
- bool _are_weights_converted;
- bool _are_weights_reshaped;
- bool _is_fc_after_conv;
- bool _is_quantized_asymmetric;
- bool _is_prepared;
+ struct Impl;
+ std::unique_ptr<Impl> _impl;
};
} // namespace arm_compute
#endif /* ARM_COMPUTE_NEFULLYCONNECTEDLAYER_H */
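
For context (not part of the patch): a sketch of the updated configure() call with fc_info and weights_info left at their defaults. A square weight matrix is used deliberately so the sketch stays agnostic to the transpose_weights convention; real shapes must follow the parameter notes above.

    #include "arm_compute/runtime/NEON/NEFunctions.h"
    #include "arm_compute/runtime/Tensor.h"

    using namespace arm_compute;

    int main()
    {
        Tensor src, weights, bias, dst;
        src.allocator()->init(TensorInfo(TensorShape(128U), 1, DataType::F32));
        weights.allocator()->init(TensorInfo(TensorShape(128U, 128U), 1, DataType::F32)); // 2D, square on purpose
        bias.allocator()->init(TensorInfo(TensorShape(128U), 1, DataType::F32));
        dst.allocator()->init(TensorInfo(TensorShape(128U), 1, DataType::F32));

        NEFullyConnectedLayer fc; // default memory and weights managers
        fc.configure(&src, &weights, &bias, &dst); // fc_info and weights_info defaulted

        src.allocator()->allocate();
        weights.allocator()->allocate();
        bias.allocator()->allocate();
        dst.allocator()->allocate();

        fc.prepare(); // optional: run() triggers the one-off weight reshaping itself
        fc.run();
        return 0;
    }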
diff --git a/arm_compute/runtime/NEON/functions/NEFuseBatchNormalization.h b/arm_compute/runtime/NEON/functions/NEFuseBatchNormalization.h
index b3b41c5445..f53b3de7f6 100644
--- a/arm_compute/runtime/NEON/functions/NEFuseBatchNormalization.h
+++ b/arm_compute/runtime/NEON/functions/NEFuseBatchNormalization.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018-2019 ARM Limited.
+ * Copyright (c) 2018-2021 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -25,7 +25,6 @@
#define ARM_COMPUTE_NEFUSEBATCHNORMALIZATION_H
#include "arm_compute/core/ITensor.h"
-#include "arm_compute/core/NEON/kernels/NEFuseBatchNormalizationKernel.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/runtime/IFunction.h"
@@ -33,6 +32,7 @@ namespace arm_compute
{
// Forward declarations
class ITensor;
+class NEFuseBatchNormalizationKernel;
/** Basic function to fuse the batch normalization node to a preceding convolution node */
class NEFuseBatchNormalization : public IFunction
@@ -49,9 +49,19 @@ public:
/** Allow instances of this class to be moved */
NEFuseBatchNormalization &operator=(NEFuseBatchNormalization &&) = default;
/** Default destructor */
- ~NEFuseBatchNormalization() = default;
+ ~NEFuseBatchNormalization();
/** Set the input and output tensors.
*
+ * Valid data layouts:
+ * - NHWC
+ * - NCHW
+ *
+ * Valid data type configurations:
+ * |src |dst |
+ * |:--------------|:--------------|
+ * |F32 |F32 |
+ * |F16 |F16 |
+ *
* @param[in] input_weights Input weights tensor for convolution or depthwise convolution layer. Data type supported: F16/F32. Data layout supported: NCHW, NHWC
* @param[in] bn_mean Batch normalization layer mean tensor. Same as @p input_weights
* @param[in] bn_var Batch normalization layer variance tensor. Same as @p input_weights
@@ -65,9 +75,16 @@ public:
* @param[in] epsilon (Optional) Batch normalization layer epsilon parameter. Defaults to 0.001f.
* @param[in] fbn_type (Optional) Fused batch normalization type. Defaults to Convolution.
*/
- void configure(const ITensor *input_weights, const ITensor *bn_mean, const ITensor *bn_var, ITensor *fused_weights, ITensor *fused_bias,
- const ITensor *input_bias = nullptr, const ITensor *bn_beta = nullptr, const ITensor *bn_gamma = nullptr,
- float epsilon = 0.001f, FuseBatchNormalizationType fbn_type = FuseBatchNormalizationType::CONVOLUTION);
+ void configure(const ITensor *input_weights,
+ const ITensor *bn_mean,
+ const ITensor *bn_var,
+ ITensor *fused_weights,
+ ITensor *fused_bias,
+ const ITensor *input_bias = nullptr,
+ const ITensor *bn_beta = nullptr,
+ const ITensor *bn_gamma = nullptr,
+ float epsilon = 0.001f,
+ FuseBatchNormalizationType fbn_type = FuseBatchNormalizationType::CONVOLUTION);
/** Static function to check if given info will lead to a valid configuration of @ref NEFuseBatchNormalization
*
* @param[in] input_weights Input weights tensor info for convolution or depthwise convolution layer. Data type supported: F16/F32. Data layout supported: NCHW, NHWC
@@ -85,16 +102,22 @@ public:
*
* @return a status
*/
- static Status validate(const ITensorInfo *input_weights, const ITensorInfo *bn_mean, const ITensorInfo *bn_var,
- const ITensorInfo *fused_weights, const ITensorInfo *fused_bias,
- const ITensorInfo *input_bias = nullptr, const ITensorInfo *bn_beta = nullptr, const ITensorInfo *bn_gamma = nullptr,
- float epsilon = 0.001f, FuseBatchNormalizationType fbn_type = FuseBatchNormalizationType::CONVOLUTION);
+ static Status validate(const ITensorInfo *input_weights,
+ const ITensorInfo *bn_mean,
+ const ITensorInfo *bn_var,
+ const ITensorInfo *fused_weights,
+ const ITensorInfo *fused_bias,
+ const ITensorInfo *input_bias = nullptr,
+ const ITensorInfo *bn_beta = nullptr,
+ const ITensorInfo *bn_gamma = nullptr,
+ float epsilon = 0.001f,
+ FuseBatchNormalizationType fbn_type = FuseBatchNormalizationType::CONVOLUTION);
// Inherited methods overridden:
void run() override;
private:
- NEFuseBatchNormalizationKernel _fuse_bn_kernel;
+ std::unique_ptr<NEFuseBatchNormalizationKernel> _fuse_bn_kernel;
};
} // namespace arm_compute
#endif /*ARM_COMPUTE_NEFUSEBATCHNORMALIZATION_H */
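
For context (not part of the patch): a sketch of the fusion call using the reflowed configure() above; beta, gamma, epsilon and fbn_type keep their defaults, and the shapes (a 3x3 convolution with 32 input and 64 output feature maps) are illustrative.

    #include "arm_compute/runtime/NEON/NEFunctions.h"
    #include "arm_compute/runtime/Tensor.h"

    using namespace arm_compute;

    int main()
    {
        Tensor conv_w, bn_mean, bn_var, fused_w, fused_b;
        conv_w.allocator()->init(TensorInfo(TensorShape(3U, 3U, 32U, 64U), 1, DataType::F32));  // [kx, ky, IFM, OFM]
        bn_mean.allocator()->init(TensorInfo(TensorShape(64U), 1, DataType::F32));              // per-OFM statistics
        bn_var.allocator()->init(TensorInfo(TensorShape(64U), 1, DataType::F32));
        fused_w.allocator()->init(TensorInfo(TensorShape(3U, 3U, 32U, 64U), 1, DataType::F32)); // same shape as conv_w
        fused_b.allocator()->init(TensorInfo(TensorShape(64U), 1, DataType::F32));

        NEFuseBatchNormalization fuse;
        fuse.configure(&conv_w, &bn_mean, &bn_var, &fused_w, &fused_b);

        conv_w.allocator()->allocate();
        bn_mean.allocator()->allocate();
        bn_var.allocator()->allocate();
        fused_w.allocator()->allocate();
        fused_b.allocator()->allocate();
        fuse.run();
        return 0;
    }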
diff --git a/arm_compute/runtime/NEON/functions/NEGEMM.h b/arm_compute/runtime/NEON/functions/NEGEMM.h
index 8dc6b88bb0..29650a5eca 100644
--- a/arm_compute/runtime/NEON/functions/NEGEMM.h
+++ b/arm_compute/runtime/NEON/functions/NEGEMM.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2017-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -24,37 +24,18 @@
#ifndef ARM_COMPUTE_NEGEMM_H
#define ARM_COMPUTE_NEGEMM_H
-#include "arm_compute/core/NEON/kernels/NEArithmeticAdditionKernel.h"
-#include "arm_compute/core/NEON/kernels/NEFillBorderKernel.h"
-#include "arm_compute/core/NEON/kernels/NEGEMMInterleave4x4Kernel.h"
-#include "arm_compute/core/NEON/kernels/NEGEMMMatrixAdditionKernel.h"
-#include "arm_compute/core/NEON/kernels/NEGEMMMatrixMultiplyKernel.h"
-#include "arm_compute/core/NEON/kernels/NEGEMMTranspose1xWKernel.h"
+#include "arm_compute/function_info/GEMMInfo.h"
#include "arm_compute/runtime/IFunction.h"
#include "arm_compute/runtime/IMemoryManager.h"
#include "arm_compute/runtime/IWeightsManager.h"
-#include "arm_compute/runtime/MemoryGroup.h"
-#include "arm_compute/runtime/NEON/functions/NEActivationLayer.h"
-#include "arm_compute/runtime/NEON/functions/NEGEMMAssemblyDispatch.h"
-#include "arm_compute/runtime/Tensor.h"
+
+#include <memory>
namespace arm_compute
{
-/** Basic function to execute GEMM on NEON. This function calls the following NEON kernels:
- *
- * If optimized assembly is available:
- * -# @ref NEGEMMAssemblyDispatch
- * -# @ref NEActivationLayer (if alpha != 1.0)
- * Else:
- * -# @ref NEGEMMInterleave4x4Kernel (if the output tensor is a matrix)
- * -# @ref NEGEMMTranspose1xWKernel (if the output tensor is a matrix)
- * -# @ref NEGEMMMatrixMultiplyKernel
- * In both cases:
- * -# @ref NEGEMMMatrixAdditionKernel (if c != nullptr and beta != 0.0 and is not reshaped once)
- * Else:
- * -# @ref NEArithmeticAdditionKernel (if c != nullptr and is reshaped once and not optimized assembly in place)
+/** Basic function to execute GEMM. This function calls the following kernels:
*
- * -# @ref NEActivationLayer (if activation is specified in GEMMInfo)
+ * -# @ref cpu::CpuGemm
*/
class NEGEMM : public IFunction
{
@@ -69,11 +50,25 @@ public:
NEGEMM &operator=(const NEGEMM &) = delete;
/** Default move assignment operator */
NEGEMM &operator=(NEGEMM &&) = default;
+ /** Default destructor */
+ ~NEGEMM();
/** Initialise the kernel's inputs, output
*
+ * Valid data layouts:
+ * - All
+ *
+ * Valid data type configurations:
+ * |src0 |src1 |src2 |dst |
+ * |:------------|:-----------|:---------|:--------------|
+ * |F32 |F32 |F32 |F32 |
+ * |F16 |F16 |F16 |F16 |
+ * |BFLOAT16 |BFLOAT16 |BFLOAT16 |BFLOAT16 |
+ *
* @note GEMM: General Matrix Multiply - [alpha * A * B + beta * C].
* @note GEMM: The tensors a, b, c, d must have the same data type. You should not mix data types when calling this function.
*
+ * @note Batched GEMM only supports broadcasting cases where RHS rank < LHS rank but not the other way around
+ *
* @param[in] a First input tensor (Matrix A or Vector A). Data type supported: BFLOAT16/F16/F32
* @param[in] b Second input tensor (Matrix B). Data type supported: same as @p a
* @param[in] c Third input tensor (Matrix C). It can be a nullptr if just the multiplication between @p a and @p b is needed. Data type supported: same as @p a
@@ -83,49 +78,49 @@ public:
* @param[in] gemm_info (Optional) Specifies if the matrix A and/or matrix B have been reshaped and
* if the reshape of matrix B should happen only for the first run
*/
- void configure(const ITensor *a, const ITensor *b, const ITensor *c, ITensor *d, float alpha, float beta, const GEMMInfo &gemm_info = GEMMInfo());
+ void configure(const ITensor *a,
+ const ITensor *b,
+ const ITensor *c,
+ ITensor *d,
+ float alpha,
+ float beta,
+ const GEMMInfo &gemm_info = GEMMInfo());
/** Static function to check if given info will lead to a valid configuration of @ref NEGEMM.
*
- * @param[in] a First input tensor info (Matrix or Vector A). Data types supported: BFLOAT16/F16/F32
- * @param[in] b Second input tensor info (Matrix B). Data type supported: same as @p a.
- * @param[in] c Third input tensor info (Matrix C). It can be a nullptr if just the multiplication between @p a and @p b is needed. Data type supported: same as @p a.
- * @param[out] output Output tensor info. Data type supported: same as @p a
- * @param[in] alpha Weight of the matrix product
- * @param[in] beta Weight of matrix C
- * @param[in] gemm_info (Optional) Specifies if the matrix A and/or matrix B have been reshaped and
- * if the reshape of matrix B should happen only for the first run
+ * Similar to @ref NEGEMM::configure()
*
* @return a status
*/
- static Status validate(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, const ITensorInfo *output, float alpha, float beta, const GEMMInfo &gemm_info = GEMMInfo());
+ static Status validate(const ITensorInfo *a,
+ const ITensorInfo *b,
+ const ITensorInfo *c,
+ const ITensorInfo *output,
+ float alpha,
+ float beta,
+ const GEMMInfo &gemm_info = GEMMInfo());
+
+    /** Static function that queries whether a fixed-format kernel exists. If one exists, the first argument is set to the format in which
+     * the weights are expected to be reshaped, as defined by the WeightFormat class. Apart from the first argument, the rest of the arguments
+     * are the same as in @ref NEGEMM::validate(), except that all arguments are required.
+ *
+ * @return a status
+ */
+ static Status has_opt_impl(arm_compute::WeightFormat &expected_weight_format,
+ const ITensorInfo *a,
+ const ITensorInfo *b,
+ const ITensorInfo *c,
+ const ITensorInfo *output,
+ float alpha,
+ float beta,
+ const GEMMInfo &gemm_info = GEMMInfo());
// Inherited methods overridden:
void run() override;
void prepare() override;
private:
- MemoryGroup _memory_group;
- IWeightsManager *_weights_manager;
- NEGEMMInterleave4x4Kernel _interleave_kernel;
- NEGEMMTranspose1xWKernel _transpose_kernel;
- NEGEMMMatrixMultiplyKernel _mm_kernel;
- NEGEMMAssemblyDispatch _asm_glue;
- NEGEMMMatrixAdditionKernel _ma_kernel;
- NEActivationLayer _alpha_scale_func;
- NEArithmeticAdditionKernel _add_bias_kernel;
- NEActivationLayer _activation_func;
-
- Tensor _tmp_a;
- Tensor _tmp_b;
- Tensor _tmp_d;
- const ITensor *_original_b;
- bool _run_vector_matrix_multiplication;
- bool _run_alpha_scale;
- bool _run_addition;
- bool _run_bias_addition;
- bool _run_activation;
- bool _reshape_b_only_on_first_run;
- bool _is_prepared;
+ struct Impl;
+ std::unique_ptr<Impl> _impl;
};
} // namespace arm_compute
#endif /*ARM_COMPUTE_NEGEMM_H */
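
For reference, the public NEGEMM interface is unchanged by the move to a pimpl, so existing callers keep working. A minimal usage sketch follows; the shapes, alpha/beta values and the helper function name are illustrative only, not part of this patch:

    #include "arm_compute/core/TensorInfo.h"
    #include "arm_compute/runtime/NEON/functions/NEGEMM.h"
    #include "arm_compute/runtime/Tensor.h"

    using namespace arm_compute;

    void run_gemm_example()
    {
        // d = alpha * a * b + beta * c; here c is nullptr, so only alpha * a * b is computed.
        // TensorShape is (x, y) = (columns, rows): A is 16x32, B is 32x8, D is 16x8.
        Tensor a, b, d;
        a.allocator()->init(TensorInfo(TensorShape(32U, 16U), 1, DataType::F32));
        b.allocator()->init(TensorInfo(TensorShape(8U, 32U), 1, DataType::F32));
        d.allocator()->init(TensorInfo(TensorShape(8U, 16U), 1, DataType::F32));

        // validate() mirrors configure(), so the setup can be checked up front.
        const Status status = NEGEMM::validate(a.info(), b.info(), nullptr, d.info(), 1.0f, 0.0f);
        if (!bool(status))
        {
            return; // configuration not supported
        }

        NEGEMM gemm;
        gemm.configure(&a, &b, nullptr, &d, 1.0f /* alpha */, 0.0f /* beta */);

        a.allocator()->allocate();
        b.allocator()->allocate();
        d.allocator()->allocate();
        // ... fill a and b ...
        gemm.run();
    }
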
diff --git a/arm_compute/runtime/NEON/functions/NEGEMMAssemblyDispatch.h b/arm_compute/runtime/NEON/functions/NEGEMMAssemblyDispatch.h
deleted file mode 100644
index ae0ae440d8..0000000000
--- a/arm_compute/runtime/NEON/functions/NEGEMMAssemblyDispatch.h
+++ /dev/null
@@ -1,106 +0,0 @@
-/*
- * Copyright (c) 2018-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_NEGEMMASSEMBLYDISPATCH_H
-#define ARM_COMPUTE_NEGEMMASSEMBLYDISPATCH_H
-
-#include "arm_compute/core/NEON/kernels/assembly/NEGEMMAssemblyWrapperKernel.h"
-#include "arm_compute/runtime/IFunction.h"
-#include "arm_compute/runtime/IMemoryManager.h"
-#include "arm_compute/runtime/IWeightsManager.h"
-#include "arm_compute/runtime/MemoryGroup.h"
-#include "arm_compute/runtime/Tensor.h"
-
-#include "arm_compute/core/NEON/kernels/assembly/arm_gemm.hpp"
-
-namespace arm_compute
-{
-/** Assembly kernel glue */
-class NEGEMMAssemblyDispatch : public IFunction
-{
-public:
- /** Constructor */
- NEGEMMAssemblyDispatch(std::shared_ptr<IMemoryManager> memory_manager = nullptr, IWeightsManager *weights_manager = nullptr);
- /** Prevent instances of this class from being copy constructed */
- NEGEMMAssemblyDispatch(const NEGEMMAssemblyDispatch &) = delete;
- /** Prevent instances of this class from being copied */
- NEGEMMAssemblyDispatch &operator=(const NEGEMMAssemblyDispatch &) = delete;
- NEGEMMAssemblyDispatch(NEGEMMAssemblyDispatch &&) = default;
- NEGEMMAssemblyDispatch &operator=(NEGEMMAssemblyDispatch &&) = default;
- ~NEGEMMAssemblyDispatch() = default;
-
- class IFallback
- {
- public:
- virtual void run() = 0;
- virtual void prepare() = 0;
- virtual bool is_configured() const = 0;
- virtual ~IFallback() = default;
- };
-
-private:
- /** Interface for the arm_gemm fallback */
- std::unique_ptr<IFallback> _arm_gemm;
- MemoryGroup _memory_group; /**< Function memory group */
- IWeightsManager *_weights_manager; /**< Pointer to the weights manager */
-public:
- /** If supported create an ACL function else fallback to the arm_gemm function.
- *
- * @param[in] a Input tensor (Matrix A)
- * @param[in] b Input tensor (Matrix B)
- * @param[in] c Input tensor (Matrix C) used to pass the bias for quantized calculations
- * @param[out] d Output tensor to store the result of matrix multiplication. Data type supported: same as @p input0.
- * @param[in] gemm_info GEMM meta-data
- */
- void configure(const ITensor *a, const ITensor *b, const ITensor *c, ITensor *d, const GEMMInfo &gemm_info);
-
- /** Indicates whether or not this function can be used to process the given parameters.
- *
- * @param[in] a Input tensor info (Matrix A)
- * @param[in] b Input tensor info (Matrix B)
- * @param[in] c Input tensor info (Matrix C) used to pass the bias for quantized calculations
- * @param[in] d Output tensor to store the result of matrix multiplication. Data type supported: same as @p input0.
- * @param[in] gemm_info GEMM meta-data
- *
- * @return a status.
- */
- static Status validate(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, const ITensorInfo *d, const GEMMInfo &gemm_info);
- /** Checks if activation is supported by the gemm assembly dispatcher
- *
- * @param[in] activation Activation to check
- *
- * @return True if activation is supported else false
- */
- static bool is_activation_supported(const ActivationLayerInfo &activation);
- /** Was the function successfully configured ?
- *
- * @return True if the function is configured and ready to run
- */
- bool is_configured() const;
- // Inherited methods overridden:
- /** Runs a preparation step, usually for pre-transposing matrix b */
- void prepare() override;
- void run() override;
-};
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_NEGEMMASSEMBLYDISPATCH_H */
diff --git a/arm_compute/runtime/NEON/functions/NEGEMMConv2d.h b/arm_compute/runtime/NEON/functions/NEGEMMConv2d.h
new file mode 100644
index 0000000000..d1c5a1c9b3
--- /dev/null
+++ b/arm_compute/runtime/NEON/functions/NEGEMMConv2d.h
@@ -0,0 +1,121 @@
+/*
+ * Copyright (c) 2020-2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_NEGEMMCONV2D_H
+#define ARM_COMPUTE_NEGEMMCONV2D_H
+
+#include "arm_compute/runtime/FunctionDescriptors.h"
+#include "arm_compute/runtime/IFunction.h"
+#include "arm_compute/runtime/IMemoryManager.h"
+
+#include <memory>
+
+namespace arm_compute
+{
+// Forward declarations
+class ITensor;
+class ITensorInfo;
+
+/** Basic function to compute the convolution layer. This function calls the following kernels/functions:
+ *
+ * -# @ref cpu::CpuGemmAssemblyDispatch
+ * -# @ref NEActivationLayer, in case activation cannot be fused in the assembly dispatch
+ *
+ * Weights are transformed from OHWI to HWIO format using the following kernels:
+ * -# @ref NEPermute
+ *
+ * @note Supports only the NHWC data layout.
+ */
+class NEGEMMConv2d : public IFunction
+{
+public:
+ /** Constructor */
+ NEGEMMConv2d(const std::shared_ptr<IMemoryManager> &memory_manager = nullptr);
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ NEGEMMConv2d(const NEGEMMConv2d &) = delete;
+ /** Default move constructor */
+ NEGEMMConv2d(NEGEMMConv2d &&) = default;
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ NEGEMMConv2d &operator=(const NEGEMMConv2d &) = delete;
+ /** Default move assignment operator */
+ NEGEMMConv2d &operator=(NEGEMMConv2d &&) = default;
+ /** Destructor */
+ ~NEGEMMConv2d();
+ /** Set the input and output tensors.
+ *
+ * Valid data layouts:
+ * - All
+ *
+ * Valid data type configurations:
+ * |src0 |src1 |src2 |dst |
+ * |:--------------|:--------------|:--------------|:--------------|
+ * |QASYMM8 |QASYMM8 |S32 |QASYMM8 |
+ * |QASYMM8_SIGNED |QASYMM8_SIGNED |S32 |QASYMM8_SIGNED |
+ * |F16 |F16 |F16 |F16 |
+ * |F32 |F32 |F32 |F32 |
+ * |BFLOAT16 |BFLOAT16 |BFLOAT16 |BFLOAT16 |
+ *
+ * @param[in] input Source tensor. 3 lower dimensions represent a single input [width, height, IFM],
+ * while every optional dimension from 4 and above represents a batch of inputs.
+ * Data types supported: QASYMM8/QASYMM8_SIGNED/BFLOAT16/F16/F32.
+ * @param[in] weights Weights tensor. Weights are 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM].
+ * Data type supported: QASYMM8/QASYMM8_SIGNED/QSYMM8_PER_CHANNEL/BFLOAT16/F16/F32.
+ * @param[in] biases Biases tensor. Shared biases supported. Biases are 1D tensor with dimensions [OFM].
+ * Data type supported: Should match @p input data type, except for input of QASYMM8/QASYMM8_SIGNED type where biases should be of S32 type.
+ * @param[out] output Destination tensor. 3 lower dimensions represent a single output [width, height, OFM], while the rest represent a batch of outputs.
+ * Data types supported: Same as @p input.
+ * @param[in] info Convolution layer descriptor
+ */
+ void
+ configure(ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, const Conv2dInfo &info);
+ /** Static function to check if given info will lead to a valid configuration of @ref NEGEMMConv2d
+ *
+ * @param[in] input Source tensor info. 3 lower dimensions represent a single input [width, height, IFM],
+ * while every optional dimension from 4 and above represents a batch of inputs.
+ * Data types supported: QASYMM8/QASYMM8_SIGNED/BFLOAT16/F16/F32.
+ * @param[in] weights Weights tensor info. Weights are 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM].
+ * Data type supported: QASYMM8/QASYMM8_SIGNED/QSYMM8_PER_CHANNEL/BFLOAT16/F16/F32.
+ * @param[in] biases Biases tensor info. Shared biases supported. Biases are 1D tensor with dimensions [OFM].
+ * Data type supported: Should match @p input data type, except for input of QASYMM8/QASYMM8_SIGNED type where biases should be of S32 type.
+ * @param[in] output Destination tensor info. 3 lower dimensions represent a single output [width, height, OFM], while the rest represent a batch of outputs.
+ * Data types supported: Same as @p input.
+ * @param[in] info Contains padding and stride information described in @ref PadStrideInfo.
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *input,
+ const ITensorInfo *weights,
+ const ITensorInfo *biases,
+ const ITensorInfo *output,
+ const Conv2dInfo &info);
+
+ // Inherited methods overridden:
+ void run() override;
+ void prepare() override;
+
+private:
+ struct Impl;
+ std::unique_ptr<Impl> _impl;
+};
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_NEGEMMCONV2D_H */
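
A minimal sketch of driving the new NEGEMMConv2d interface. It assumes NHWC F32 tensors, a 1x1 stride-1 convolution, and the five-argument Conv2dInfo constructor from FunctionDescriptors.h; all shapes and names are illustrative:

    #include "arm_compute/core/TensorInfo.h"
    #include "arm_compute/runtime/NEON/functions/NEGEMMConv2d.h"
    #include "arm_compute/runtime/Tensor.h"

    using namespace arm_compute;

    void run_gemm_conv2d_example()
    {
        // NHWC shapes are (C, W, H, N); OHWI weights land as (I, kx, ky, O).
        TensorInfo src_info(TensorShape(64U, 28U, 28U, 1U), 1, DataType::F32);
        TensorInfo wei_info(TensorShape(64U, 1U, 1U, 128U), 1, DataType::F32);
        TensorInfo dst_info(TensorShape(128U, 28U, 28U, 1U), 1, DataType::F32);
        src_info.set_data_layout(DataLayout::NHWC);
        wei_info.set_data_layout(DataLayout::NHWC);
        dst_info.set_data_layout(DataLayout::NHWC);

        // Conv2dInfo(conv_info, dilation, act_info, enable_fast_math, num_groups)
        const Conv2dInfo info(PadStrideInfo(1, 1, 0, 0), Size2D(1U, 1U), ActivationLayerInfo(), false, 1U);

        if (!bool(NEGEMMConv2d::validate(&src_info, &wei_info, nullptr, &dst_info, info)))
        {
            return; // no suitable assembly kernel for this configuration
        }

        Tensor src, weights, dst;
        src.allocator()->init(src_info);
        weights.allocator()->init(wei_info);
        dst.allocator()->init(dst_info);

        NEGEMMConv2d conv;
        conv.configure(&src, &weights, nullptr, &dst, info);

        src.allocator()->allocate();
        weights.allocator()->allocate();
        dst.allocator()->allocate();
        // ... fill src and weights ...
        conv.run();
    }
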
diff --git a/arm_compute/runtime/NEON/functions/NEGEMMConvolutionLayer.h b/arm_compute/runtime/NEON/functions/NEGEMMConvolutionLayer.h
index e7da1006e0..3e84c3e2cf 100644
--- a/arm_compute/runtime/NEON/functions/NEGEMMConvolutionLayer.h
+++ b/arm_compute/runtime/NEON/functions/NEGEMMConvolutionLayer.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2017-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -24,254 +24,191 @@
#ifndef ARM_COMPUTE_NEGEMMCONVOLUTIONLAYER_H
#define ARM_COMPUTE_NEGEMMCONVOLUTIONLAYER_H
-#include "arm_compute/runtime/IFunction.h"
-
-#include "arm_compute/core/NEON/kernels/NECol2ImKernel.h"
-#include "arm_compute/core/NEON/kernels/NEGEMMTranspose1xWKernel.h"
-#include "arm_compute/core/NEON/kernels/NEIm2ColKernel.h"
-#include "arm_compute/core/NEON/kernels/NEWeightsReshapeKernel.h"
#include "arm_compute/core/Types.h"
+#include "arm_compute/function_info/ActivationLayerInfo.h"
+#include "arm_compute/runtime/IFunction.h"
+#include "arm_compute/runtime/IMemoryManager.h"
#include "arm_compute/runtime/IWeightsManager.h"
#include "arm_compute/runtime/MemoryGroup.h"
-#include "arm_compute/runtime/NEON/functions/NEGEMM.h"
-#include "arm_compute/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.h"
-#include "arm_compute/runtime/NEON/functions/NEGEMMLowpOutputStage.h"
-#include "arm_compute/runtime/NEON/functions/NEReshapeLayer.h"
-#include "arm_compute/runtime/Tensor.h"
#include <memory>
namespace arm_compute
{
class ITensor;
+class ITensorInfo;
-/** Function to reshape the weights. This function calls the following kernel:
- * -# @ref NEWeightsReshapeKernel
- */
-class NEConvolutionLayerReshapeWeights : public IFunction
-{
-public:
- /** Constructor */
- NEConvolutionLayerReshapeWeights();
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- NEConvolutionLayerReshapeWeights(const NEConvolutionLayerReshapeWeights &) = delete;
- /** Default move constructor */
- NEConvolutionLayerReshapeWeights(NEConvolutionLayerReshapeWeights &&) = default;
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- NEConvolutionLayerReshapeWeights &operator=(const NEConvolutionLayerReshapeWeights &) = delete;
- /** Default move assignment operator */
- NEConvolutionLayerReshapeWeights &operator=(NEConvolutionLayerReshapeWeights &&) = default;
- /** Set the input and output tensors.
- *
- * @param[in] weights Weights tensor. Weights are 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM].
- * Data type supported: QASYMM8/QASYMM8_SIGNED/QSYMM8_PER_CHANNEL/BFLOAT16/F16/F32.
- * @param[in] biases Biases tensor. Shared biases supported. Biases are 1D tensor with dimensions [OFM].
- * Data type supported: Same as @p weights, S32 if @p weights is QASYMM8/QASYMM8_SIGNED, FP32 if @p weights is BFLOAT16
- * @param[out] output Destination tensor.
- * Data types supported: Same as @p weights, FP32 if @p weights is BFLOAT16
- */
- void configure(const ITensor *weights, const ITensor *biases, ITensor *output);
- /** Static function to check if given info will lead to a valid configuration of @ref NEConvolutionLayerReshapeWeights
- *
- * @param[in] weights Weights tensor info. Weights are 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM].
- * Data type supported: QASYMM8/QASYMM8_SIGNED/QSYMM8_PER_CHANNEL/BFLOAT16/F16/F32.
- * @param[in] biases Biases tensor. Shared biases supported. Biases are 1D tensor with dimensions [OFM].
- * Data type supported: Same as @p weights, S32 if @p weights is QASYMM8/QASYMM8_SIGNED, FP32 if @p weights is BFLOAT16
- * @param[in] output Destination tensor.
- * Data types supported: Same as @p weights FP32 if @p weights is BFLOAT16
- *
- * @return an error status
- */
- static Status validate(const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output);
-
- // Inherited methods overridden:
- void run() override;
-
-private:
- NEWeightsReshapeKernel _weights_reshape_kernel;
-};
-
-namespace weights_transformations
-{
-/** Basic function to manage the reshape weights generated from @ref NEConvolutionLayerReshapeWeights */
-class NEConvolutionLayerReshapeWeightsTransform : public ITransformWeights
-{
-public:
- void configure(const ITensor *input, const ITensor *biases)
- {
- _bias_bit = (biases != nullptr) ? 1 : 0;
- _func.configure(input, biases, &_output);
- }
-
- void run() override
- {
- _output.allocator()->allocate();
- _func.run();
- _reshape_run = true;
- }
-
- ITensor *get_weights() override
- {
- return &_output;
- }
-
- void release() override
- {
- _output.allocator()->free();
- }
-
- uint32_t uid() override
- {
- return ((0x8) | (_bias_bit << 7));
- }
-
- bool is_reshape_run()
- {
- return _reshape_run;
- }
-
-private:
- Tensor _output{};
- NEConvolutionLayerReshapeWeights _func{};
- int32_t _bias_bit{ 0 };
-};
-} // namespace weights_transformations
-
-/** Basic function to compute the convolution layer. This function calls the following NEON kernels/functions:
+/** Basic function to compute the convolution layer. This function calls the following kernels/functions:
*
- * -# @ref NEIm2ColKernel
- * -# @ref NEGEMM (if the data type is BFLOAT16/FP16/FP32)
- * -# @ref NEGEMMLowpMatrixMultiplyCore (if the data type is QASYMM8/QASYMM8_SIGNED)
- * -# @ref NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPoint (if the data type is QASYMM8/QASYMM8_SIGNED)
- * -# @ref NEArithmeticAdditionKernel (if biases != nullptr and we have a 1x1 convolution with the NHWC data layout)
- * -# @ref NECol2ImKernel (if NCHW data layout)
+ * -# @ref cpu::CpuGemmConv2d
*
*/
class NEGEMMConvolutionLayer : public IFunction
{
public:
/** Constructor */
- NEGEMMConvolutionLayer(const std::shared_ptr<IMemoryManager> &memory_manager = nullptr, IWeightsManager *weights_manager = nullptr);
+ NEGEMMConvolutionLayer(const std::shared_ptr<IMemoryManager> &memory_manager = nullptr,
+ IWeightsManager *weights_manager = nullptr);
/** Prevent instances of this class from being copied (As this class contains pointers) */
NEGEMMConvolutionLayer(const NEGEMMConvolutionLayer &) = delete;
- /** Default move constructor */
- NEGEMMConvolutionLayer(NEGEMMConvolutionLayer &&) = default;
+ /** Prevent instances of this class from being moved (As this class contains non movable objects) */
+ NEGEMMConvolutionLayer(NEGEMMConvolutionLayer &&) = delete;
/** Prevent instances of this class from being copied (As this class contains pointers) */
NEGEMMConvolutionLayer &operator=(const NEGEMMConvolutionLayer &) = delete;
- /** Default move assignment operator */
- NEGEMMConvolutionLayer &operator=(NEGEMMConvolutionLayer &&) = default;
+ /** Prevent instances of this class from being moved (As this class contains non movable objects) */
+ NEGEMMConvolutionLayer &operator=(NEGEMMConvolutionLayer &&) = delete;
+ /** Default destructor */
+ ~NEGEMMConvolutionLayer();
/** Set the input and output tensors.
*
- * @param[in] input Source tensor. 3 lower dimensions represent a single input [width, height, IFM],
- * while every optional dimension from 4 and above represent a batch of inputs.
- * Data types supported: QASYMM8/QASYMM8_SIGNED/BFLOAT16/F16/F32.
- * @param[in] weights Weights tensor. Weights are 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM].
- * Data type supported: QASYMM8/QASYMM8_SIGNED/QSYMM8_PER_CHANNEL/BFLOAT16/F16/F32.
- * @param[in] biases Biases tensor. Shared biases supported. Biases are 1D tensor with dimensions [OFM].
- * Data type supported: Should match @p input data type, except for input of QASYMM8/QASYMM8_SIGNED type where biases should be of S32 type.
- * @param[out] output Destination tensor. 3 lower dimensions represent a single output [width, height, OFM], while the rest represent batch of outputs.
- * Data types supported: Same as @p input.
- * @param[in] conv_info Contains padding and stride information described in @ref PadStrideInfo.
- * @param[in] weights_info Specifies if the weights tensor has been reshaped with NEWeightsReshapeKernel. If this is not part of the fully connected layer the weights
- * tensor has also been transposed with NEGEMMTranspose1xWKernel. Data type supported: Same as @p input.
- * @param[in] dilation (Optional) Dilation, in elements, across x and y. Defaults to (1, 1).
- * @param[in] act_info (Optional) Activation layer information in case of a fused activation. Only RELU, BOUNDED_RELU and LU_BOUNDED_RELU supported.
- * @param[in] num_groups (Optional) Number of groups when performing a grouped convolution. num_groups != 1 is not supported
+ * Valid data layouts:
+ * - NHWC
+ * - NCHW
+ *
+ * Valid data type configurations:
+ * |src0 |src1 |src2 |dst |
+ * |:--------------|:------------------|:--------|:--------------|
+ * |F16 |F16 |F16 |F16 |
+ * |F32 |F32 |F32 |F32 |
+ * |BFLOAT16 |BFLOAT16 |BFLOAT16 |BFLOAT16 |
+ * |QASYMM8 |QASYMM8 |S32 |QASYMM8 |
+ * |QASYMM8 |QSYMM8_PER_CHANNEL |S32 |QASYMM8 |
+ * |QASYMM8_SIGNED |QASYMM8_SIGNED |S32 |QASYMM8_SIGNED |
+ * |QASYMM8_SIGNED |QSYMM8_PER_CHANNEL |S32 |QASYMM8_SIGNED |
+ *
+ * @param[in] input Source tensor. 3 lower dimensions represent a single input [width, height, IFM],
+ * while every optional dimension from 4 and above represents a batch of inputs.
+ * Data types supported: QASYMM8/QASYMM8_SIGNED/BFLOAT16/F16/F32.
+ * @param[in] weights Weights tensor. Weights are 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM].
+ * Data type supported: QASYMM8/QASYMM8_SIGNED/QSYMM8_PER_CHANNEL/BFLOAT16/F16/F32.
+ * @param[in] biases Biases tensor. Shared biases supported. Biases are 1D tensor with dimensions [OFM].
+ * Data type supported: Should match @p input data type, except for input of QASYMM8/QASYMM8_SIGNED type where biases should be of S32 type.
+ * @param[out] output Destination tensor. 3 lower dimensions represent a single output [width, height, OFM], while the rest represent a batch of outputs.
+ * Data types supported: Same as @p input.
+ * @param[in] conv_info Contains padding and stride information described in @ref PadStrideInfo.
+ * @param[in] weights_info Specifies if the weights tensor has been reshaped with NEWeightsReshapeKernel. If this is not part of the fully connected layer the weights
+ * tensor has also been transposed with cpu::kernels::CpuGemmTranspose1xWKernel. Data type supported: Same as @p input.
+ * @param[in] dilation (Optional) Dilation, in elements, across x and y. Defaults to (1, 1).
+ * @param[in] act_info (Optional) Activation layer information in case of a fused activation. Only RELU, BOUNDED_RELU and LU_BOUNDED_RELU supported.
+ * @param[in] enable_fast_math (Optional) Enable fast math computation. If this flag is set, the function may dispatch the fastest implementation
+ * available, which can introduce a drop in accuracy. Default is false
+ * @param[in] num_groups (Optional) Number of groups when performing a grouped convolution. num_groups != 1 is not supported
*/
- void configure(const ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, const PadStrideInfo &conv_info, const WeightsInfo &weights_info = WeightsInfo(),
- const Size2D &dilation = Size2D(1U, 1U), const ActivationLayerInfo &act_info = ActivationLayerInfo(), unsigned int num_groups = 1);
+ void configure(const ITensor *input,
+ const ITensor *weights,
+ const ITensor *biases,
+ ITensor *output,
+ const PadStrideInfo &conv_info,
+ const WeightsInfo &weights_info = WeightsInfo(),
+ const Size2D &dilation = Size2D(1U, 1U),
+ const ActivationLayerInfo &act_info = ActivationLayerInfo(),
+ bool enable_fast_math = false,
+ unsigned int num_groups = 1);
/** Static function to check if given info will lead to a valid configuration of @ref NEGEMMConvolutionLayer
*
- * @param[in] input Source tensor info. 3 lower dimensions represent a single input [width, height, IFM],
- * while every optional dimension from 4 and above represent a batch of inputs.
- * Data types supported: QASYMM8/QASYMM8_SIGNED/BFLOAT16/F16/F32.
- * @param[in] weights Weights tensor info. Weights are 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM].
- * Data type supported: QASYMM8/QASYMM8_SIGNED/QSYMM8_PER_CHANNEL/BFLOAT16/F16/F32.
- * @param[in] biases Biases tensor info. Shared biases supported. Biases are 1D tensor with dimensions [OFM].
- * Data type supported: Should match @p input data type, except for input of QASYMM8/QASYMM8_SIGNED type where biases should be of S32 type.
- * @param[in] output Destination tensor info. 3 lower dimensions represent a single output [width, height, OFM], while the rest represent batch of outputs.
- * Data types supported: Same as @p input.
- * @param[in] conv_info Contains padding and stride information described in @ref PadStrideInfo.
- * @param[in] weights_info Specifies if the weights tensor has been reshaped with NEWeightsReshapeKernel. If this is not part of the fully connected layer the weights
- * tensor has also been transposed with NEGEMMTranspose1xWKernel. Data type supported: Same as @p input.
- * @param[in] dilation (Optional) Dilation, in elements, across x and y. Defaults to (1, 1).
- * @param[in] act_info (Optional) Activation layer information in case of a fused activation. Only RELU, BOUNDED_RELU and LU_BOUNDED_RELU supported.
- * @param[in] num_groups (Optional) Number of groups when performing a grouped convolution. num_groups != 1 is not supported
+ * @param[in] input Source tensor info. 3 lower dimensions represent a single input [width, height, IFM],
+ * while every optional dimension from 4 and above represents a batch of inputs.
+ * Data types supported: QASYMM8/QASYMM8_SIGNED/BFLOAT16/F16/F32.
+ * @param[in] weights Weights tensor info. Weights are 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM].
+ * Data type supported: QASYMM8/QASYMM8_SIGNED/QSYMM8_PER_CHANNEL/BFLOAT16/F16/F32.
+ * @param[in] biases Biases tensor info. Shared biases supported. Biases are 1D tensor with dimensions [OFM].
+ * Data type supported: Should match @p input data type, except for input of QASYMM8/QASYMM8_SIGNED type where biases should be of S32 type.
+ * @param[in] output Destination tensor info. 3 lower dimensions represent a single output [width, height, OFM], while the rest represent a batch of outputs.
+ * Data types supported: Same as @p input.
+ * @param[in] conv_info Contains padding and stride information described in @ref PadStrideInfo.
+ * @param[in] weights_info Specifies if the weights tensor has been reshaped with NEWeightsReshapeKernel. If this is not part of the fully connected layer the weights
+ * tensor has also been transposed with cpu::kernels::CpuGemmTranspose1xWKernel. Data type supported: Same as @p input.
+ * @param[in] dilation (Optional) Dilation, in elements, across x and y. Defaults to (1, 1).
+ * @param[in] act_info (Optional) Activation layer information in case of a fused activation. Only RELU, BOUNDED_RELU and LU_BOUNDED_RELU supported.
+ * @param[in] enable_fast_math (Optional) Enable fast math computation. If this flag is set, the function may dispatch the fastest implementation
+ * available, which can introduce a drop in accuracy. Default is false
+ * @param[in] num_groups (Optional) Number of groups when performing a grouped convolution. num_groups != 1 is not supported
*
* @return a status
*/
- static Status validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info,
- const WeightsInfo &weights_info = WeightsInfo(), const Size2D &dilation = Size2D(1U, 1U), const ActivationLayerInfo &act_info = ActivationLayerInfo(), unsigned int num_groups = 1);
-
- // Inherited methods overridden:
- void run() override;
- void prepare() override;
-
-private:
- /** Configures the appropriate matrix multiply routine
+ static Status validate(const ITensorInfo *input,
+ const ITensorInfo *weights,
+ const ITensorInfo *biases,
+ const ITensorInfo *output,
+ const PadStrideInfo &conv_info,
+ const WeightsInfo &weights_info = WeightsInfo(),
+ const Size2D &dilation = Size2D(1U, 1U),
+ const ActivationLayerInfo &act_info = ActivationLayerInfo(),
+ bool enable_fast_math = false,
+ unsigned int num_groups = 1);
+
+ /** Static function to check if there is an optimized version of
+ * GEMM available for the input parameters.
*
- * @param[in] input Input tensor. Data types supported: QASYMM8/QASYMM8_SIGNED/BFLOAT16/F16/F32.
- * @param[in] weights Weights tensor. Data type supported: QASYMM8/QASYMM8_SIGNED/QSYMM8_PER_CHANNEL/BFLOAT16/F16/F32.
- * @param[in] biases Biases tensor. Shared biases supported. Biases are 1D tensor with dimensions [OFM].
- * Data type supported: Should match @p input data type, except for input of QASYMM8/QASYMM8_SIGNED type where biases should be of S32 type.
- * @param[out] output Output tensor. Data types supported: Same as @p input,
- * except for input of QASYMM8/QASYMM8_SIGNED type where output should be of S32 type.
- * @param[in] act_info (Optional) Activation layer information in case of a fused activation. Only RELU, BOUNDED_RELU and LU_BOUNDED_RELU supported.
- * @param[in] gemm_3d_depth (Optional) Depth of GEMM 3D (Defaults to 1)
- */
- void configure_mm(const ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, const ActivationLayerInfo &act_info = ActivationLayerInfo(), int gemm_3d_depth = 1);
- /** Static function to check if given info will lead to a valid configuration of @ref NEGEMMConvolutionLayer matrix multiply routines
+ * The method is intended to find the optimal memory layout for
+ * the weights tensor when running variable weights execution.
*
- * @param[in] input Input tensor info. Data types supported: QASYMM8/QASYMM8_SIGNED/BFLOAT16/F16/F32.
- * @param[in] weights Weights tensor info. Data type supported: QASYMM8/QASYMM8_SIGNED/QSYMM8_PER_CHANNEL/BFLOAT16/F16/F32.
- * @param[in] biases Biases tensor info. Shared biases supported. Biases are 1D tensor with dimensions [OFM].
- * Data type supported: Should match @p input data type, except for input of QASYMM8/QASYMM8_SIGNED type where biases should be of S32 type.
- * @param[in] output Output tensor info. Data types supported: Same as @p input,
- * except for input of QASYMM8/QASYMM8_SIGNED type where output should be of S32 type.
- * @param[in] act_info (Optional) Activation layer information in case of a fused activation. Only RELU, BOUNDED_RELU and LU_BOUNDED_RELU supported.
- * @param[in] gemm_3d_depth (Optional) Depth of GEMM 3D (Defaults to 1)
- * @param[in] skip_im2col (Optional) Flag which specifies if im2col has to be skipped. i.e. 1x1 convolution with NHWC data layout. (Default to false)
+ * The user can query the database of optimised kernels in
+ * arm_gemm by specifying one of the enumerations of
+ * arm_compute::WeightFormat in the weight_format field of the input
+ * parameter weights_info. In case of success, the method
+ * writes the expected format in the output parameter
+ * expected_weight_format. The expected_weight_format can then be
+ * used in the configure method of the class for retrieving the
+ * best kernel.
*
- * @return a status
- */
- static Status validate_mm(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const ActivationLayerInfo &act_info = ActivationLayerInfo(),
- int gemm_3d_depth = 1, bool skip_im2col = false);
- /** Static function to check if GEMM3D is supported in @ref NEGEMM or in @ref NEGEMMLowpMatrixMultiplyCore
+ * Use case one - query for a specific format:
*
- * @param[in] input_info Input tensor info. Data types supported: QASYMM8/QASYMM8_SIGNED/BFLOAT16/F16/F32.
- * @param[in] weights_info Weights tensor info. Data types supported: QASYMM8/QASYMM8_SIGNED/BFLOAT16/F16/F32.
- * @param[in] act_info Activation layer information in case of a fused activation. Only RELU, BOUNDED_RELU and LU_BOUNDED_RELU supported.
- * @param[in] gemm_3d_depth Depth of GEMM 3D
- * @param[in] skip_im2col Flag which specifies if im2col has to be skipped. i.e. 1x1 convolution with NHWC data layout
+ * WeightsInfo weights_info(..., arm_compute::WeightFormat::OHWIo4, ...); // Set the value of the input query.
+ * if (NEGEMMConvolutionLayer::has_opt_impl(WeightFormat(), ...., weights_info, ...))
+ * {
+ * auto conv = std::make_unique<NEGEMMConvolutionLayer>();
+ * conv->configure(..., weights_info, ...); // uses the same WeightFormat the user wanted originally, OHWIo4.
+ * conv->run(...);
+ * }
*
- * @return a status
+ * Use case two - query for any format that would be optimal for the GEMM to execute:
+ *
+ * WeightsInfo weights_info(..., arm_compute::WeightFormat::ANY, ...); // Set the value of the input query.
+ * arm_compute::WeightFormat expected_wf;
+ * if (NEGEMMConvolutionLayer::has_opt_impl(expected_wf, ...., weights_info, ...))
+ * {
+ * auto conv = std::make_unique<NEGEMMConvolutionLayer>();
+ * // ... code to convert the layout of the weights tensor to the layout returned by has_opt_impl
+ * WeightsInfo new_weights_info(..., expected_wf, ...); // Set the value of the WeightFormat returned by has_opt_impl.
+ * conv->configure(..., new_weights_info, ...);
+ * conv->run(...);
+ * }
+ *
+ * Note that a GEMM configured with a WeightFormat other than
+ * UNSPECIFIED will run in variable weights mode.
+ *
+ * @param[out] expected_weight_format The arm_compute::WeightFormat expected by the kernel.
+ * @param[in] src Source tensor info.
+ * @param[in] weights Weights tensor info.
+ * @param[in] biases Biases tensor info. Shared biases supported.
+ * @param[in] dst Destination tensor info.
+ * @param[in] conv_info Contains padding and stride information described in @ref PadStrideInfo.
+ * @param[in] weights_info (Optional) Specifies additional configuration parameters for the weights of the GEMM computation.
+ * @param[in] dilation (Optional) Dilation, in elements, across x and y. Defaults to (1, 1).
+ * @param[in] act_info (Optional) Activation layer information in case of a fused activation. Only RELU, BOUNDED_RELU and LU_BOUNDED_RELU are supported, as well as no activation (i.e. Linear), which is the default value.
+ * @param[in] enable_fast_math (Optional) Enable fast math computation. If this flag is set, the function may dispatch the fastest implementation
+ * available, which can introduce a drop in accuracy. Default is false
+ *
+ * @return a Status
*/
- static Status validate_gemm3d(const ITensorInfo *input_info, const ITensorInfo *weights_info, const ActivationLayerInfo &act_info, int gemm_3d_depth, bool skip_im2col);
+ static Status has_opt_impl(arm_compute::WeightFormat &expected_weight_format,
+ const ITensorInfo *src,
+ const ITensorInfo *weights,
+ const ITensorInfo *biases,
+ const ITensorInfo *dst,
+ const PadStrideInfo &conv_info,
+ const WeightsInfo &weights_info = WeightsInfo(),
+ const Size2D &dilation = Size2D(1U, 1U),
+ const ActivationLayerInfo &act_info = ActivationLayerInfo(),
+ bool enable_fast_math = false);
+ // Inherited methods overridden:
+ void run() override;
+ void prepare() override;
private:
- MemoryGroup _memory_group;
- IWeightsManager *_weights_manager;
- NEConvolutionLayerReshapeWeights _reshape_weights;
- weights_transformations::NEConvolutionLayerReshapeWeightsTransform _reshape_weights_managed;
- NEIm2ColKernel _im2col_kernel;
- NEGEMM _mm_gemm;
- NEGEMMLowpMatrixMultiplyCore _mm_gemmlowp;
- NECol2ImKernel _col2im_kernel;
- NEReshapeLayer _reshape_layer;
-
- const ITensor *_original_weights;
-
- Tensor _im2col_output;
- Tensor _weights_reshaped;
- Tensor _gemm_output;
- Tensor _tmp_output;
-
- DataLayout _data_layout;
-
- bool _skip_im2col;
- bool _skip_col2im;
- bool _is_quantized;
- bool _is_prepared;
+ struct Impl;
+ std::unique_ptr<Impl> _impl;
};
} // namespace arm_compute
-#endif /* ARM_COMPUTE_NECONVOLUTIONGEMMLAYER_H */
+#endif /* ARM_COMPUTE_NEGEMMCONVOLUTIONLAYER_H */
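
The has_opt_impl() flow documented above ("use case two") could be exercised roughly as follows. The WeightsInfo constructor arguments, kernel geometry and the pre-built tensor infos are assumptions for illustration; signatures may vary slightly between releases:

    #include "arm_compute/core/TensorInfo.h"
    #include "arm_compute/runtime/NEON/functions/NEGEMMConvolutionLayer.h"
    #include "arm_compute/runtime/Tensor.h"

    using namespace arm_compute;

    void query_fixed_format_example(const TensorInfo &src_info, const TensorInfo &wei_info, const TensorInfo &dst_info,
                                    Tensor &src, Tensor &weights, Tensor &dst)
    {
        const PadStrideInfo conv_info(1, 1, 0, 0);

        // "Use case two": ask for whatever fixed format the assembly backend prefers.
        arm_compute::WeightFormat expected_wf = arm_compute::WeightFormat::UNSPECIFIED;
        // WeightsInfo(are_reshaped, kernel_w, kernel_h, num_kernels, retain_internal_weights, weight_format)
        const WeightsInfo query(false, 1U, 1U, 128U, false, arm_compute::WeightFormat::ANY);

        const Status status = NEGEMMConvolutionLayer::has_opt_impl(expected_wf, &src_info, &wei_info, nullptr,
                                                                   &dst_info, conv_info, query);
        if (bool(status))
        {
            // ... re-order the weights tensor into expected_wf here ...
            const WeightsInfo chosen(false, 1U, 1U, 128U, false, expected_wf);
            NEGEMMConvolutionLayer conv;
            conv.configure(&src, &weights, nullptr, &dst, conv_info, chosen); // variable weights mode
            conv.run();
        }
    }
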
diff --git a/arm_compute/runtime/NEON/functions/NEGEMMLowpAssemblyMatrixMultiplyCore.h b/arm_compute/runtime/NEON/functions/NEGEMMLowpAssemblyMatrixMultiplyCore.h
deleted file mode 100644
index a8ce1e511b..0000000000
--- a/arm_compute/runtime/NEON/functions/NEGEMMLowpAssemblyMatrixMultiplyCore.h
+++ /dev/null
@@ -1,69 +0,0 @@
-/*
- * Copyright (c) 2017-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_NEGEMMLOWPASSEMBLYMATRIXMULTIPLYCORE_H
-#define ARM_COMPUTE_NEGEMMLOWPASSEMBLYMATRIXMULTIPLYCORE_H
-
-#include "arm_compute/core/NEON/INEKernel.h"
-#include "arm_compute/runtime/IFunction.h"
-#include "arm_compute/runtime/IMemoryManager.h"
-#include "arm_compute/runtime/MemoryGroup.h"
-#include "arm_compute/runtime/NEON/functions/NEGEMMAssemblyDispatch.h"
-#include "arm_compute/runtime/Tensor.h"
-
-#include <memory>
-
-namespace arm_compute
-{
-// Forward declarations
-class ITensor;
-
-/** Basic function to execute matrix multiply assembly kernels. */
-class NEGEMMLowpAssemblyMatrixMultiplyCore : public IFunction
-{
-public:
- /** Constructor */
- NEGEMMLowpAssemblyMatrixMultiplyCore(std::shared_ptr<IMemoryManager> memory_manager = nullptr);
- /** Initialise the kernel's inputs, output
- *
- * @param[in] a First input tensor (Matrix A). Data type supported: U8, S8.
- * @param[in] b Second input tensor (Matrix B). Data type supported: same as @p a
- * @param[in] c Third input tensor (Matrix C). Data type supported: same as @p a
- * @param[out] output Output tensor. Data type supported: Data type supported: U32, S32
- */
- void configure(const ITensor *a, const ITensor *b, const ITensor *c, ITensor *output);
-
- // Inherited methods overridden:
- void run() override;
-
-private:
- MemoryGroup _memory_group;
- NEGEMMAssemblyDispatch _asm_glue;
- std::unique_ptr<INEKernel> _mm_kernel;
- std::unique_ptr<INEKernel> _mtx_a_reshape_kernel;
- std::unique_ptr<INEKernel> _mtx_b_reshape_kernel;
- Tensor _tmp_a;
- Tensor _tmp_b;
-};
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_NEGEMMLOWPASSEMBLYMATRIXMULTIPLYCORE_H */
diff --git a/arm_compute/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.h b/arm_compute/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.h
index 11683c5b95..6d07675d3d 100644
--- a/arm_compute/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.h
+++ b/arm_compute/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2017-2021, 2023-2024 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -21,49 +21,34 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
-#ifndef ARM_COMPUTE_NEGEMMLOWPMATRIXMULTIPLYCORE_H
-#define ARM_COMPUTE_NEGEMMLOWPMATRIXMULTIPLYCORE_H
+#ifndef ACL_ARM_COMPUTE_RUNTIME_NEON_FUNCTIONS_NEGEMMLOWPMATRIXMULTIPLYCORE_H
+#define ACL_ARM_COMPUTE_RUNTIME_NEON_FUNCTIONS_NEGEMMLOWPMATRIXMULTIPLYCORE_H
-#include "NEActivationLayer.h"
-#include "arm_compute/core/NEON/INEKernel.h"
-#include "arm_compute/core/NEON/kernels/NEConvertQuantizedSignednessKernel.h"
-#include "arm_compute/core/NEON/kernels/NEConvertQuantizedSignednessKernel.h"
-#include "arm_compute/core/NEON/kernels/NEGEMMInterleave4x4Kernel.h"
-#include "arm_compute/core/NEON/kernels/NEGEMMLowpMatrixMultiplyKernel.h"
-#include "arm_compute/core/NEON/kernels/NEGEMMLowpOffsetContributionKernel.h"
-#include "arm_compute/core/NEON/kernels/NEGEMMLowpOffsetContributionOutputStageKernel.h"
-#include "arm_compute/core/NEON/kernels/NEGEMMLowpReductionKernel.h"
-#include "arm_compute/core/NEON/kernels/NEGEMMTranspose1xWKernel.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/function_info/GEMMInfo.h"
#include "arm_compute/runtime/IFunction.h"
#include "arm_compute/runtime/IMemoryManager.h"
-#include "arm_compute/runtime/MemoryGroup.h"
-#include "arm_compute/runtime/NEON/functions/NEGEMMAssemblyDispatch.h"
-#include "arm_compute/runtime/Tensor.h"
+#include "arm_compute/runtime/IWeightsManager.h"
#include <memory>
namespace arm_compute
{
class ITensor;
+class ITensorInfo;
-/** Basic function to execute GEMMLowpMatrixMultiplyCore on NEON. This function calls the following NEON kernels if the DOT product instruction is not available:
+/** Function to run GEMM on quantized types.
*
- * -# @ref NEGEMMInterleave4x4Kernel
- * -# @ref NEGEMMTranspose1xWKernel
- * -# @ref NEGEMMLowpMatrixMultiplyKernel
- * -# @ref NEGEMMLowpOffsetContributionKernel
- * -# @ref NEActivationLayer
+ * This function calls the following:
*
- * otherwise if the DOT product instruction is available:
- *
- * -# @ref NEGEMMLowpOffsetContributionKernel
- *
-*/
+ * -# @ref cpu::CpuGemmLowpMatrixMultiplyCore
+ */
class NEGEMMLowpMatrixMultiplyCore : public IFunction
{
public:
/** Constructor */
- NEGEMMLowpMatrixMultiplyCore(std::shared_ptr<IMemoryManager> memory_manager = nullptr, IWeightsManager *weights_manager = nullptr);
+ NEGEMMLowpMatrixMultiplyCore(std::shared_ptr<IMemoryManager> memory_manager = nullptr,
+ IWeightsManager *weights_manager = nullptr);
/** Prevent instances of this class from being copied (As this class contains pointers) */
NEGEMMLowpMatrixMultiplyCore(const NEGEMMLowpMatrixMultiplyCore &) = delete;
/** Default move constructor */
@@ -72,8 +57,31 @@ public:
NEGEMMLowpMatrixMultiplyCore &operator=(const NEGEMMLowpMatrixMultiplyCore &) = delete;
/** Default move assignment operator */
NEGEMMLowpMatrixMultiplyCore &operator=(NEGEMMLowpMatrixMultiplyCore &&) = default;
+ /** Default destructor */
+ ~NEGEMMLowpMatrixMultiplyCore();
/** Initialise the kernel's inputs, output
*
+ * Valid data layouts:
+ * - NHWC
+ * - NCHW
+ *
+ * Valid data type configurations:
+ * |src0 |src1 |src2 |dst |
+ * |:--------------|:------------------|:--------|:--------------|
+ * |QASYMM8 |QASYMM8 |S32 |QASYMM8 |
+ * |QASYMM8 |QSYMM8_PER_CHANNEL |S32 |QASYMM8 |
+ * |QASYMM8 |QSYMM8 |S32 |QASYMM8 |
+ * |QASYMM8 |QASYMM8 |S32 |S32 |
+ * |QASYMM8 |QSYMM8_PER_CHANNEL |S32 |S32 |
+ * |QASYMM8 |QSYMM8 |S32 |S32 |
+ * |QASYMM8_SIGNED |QASYMM8_SIGNED |S32 |QASYMM8_SIGNED |
+ * |QASYMM8_SIGNED |QSYMM8_PER_CHANNEL |S32 |QASYMM8_SIGNED |
+ * |QASYMM8_SIGNED |QSYMM8 |S32 |QASYMM8_SIGNED |
+ * |QASYMM8_SIGNED |QASYMM8_SIGNED |S32 |S32 |
+ * |QASYMM8_SIGNED |QSYMM8_PER_CHANNEL |S32 |S32 |
+ * |QASYMM8_SIGNED |QSYMM8 |S32 |S32 |
+ * |QASYMM8_SIGNED |QASYMM8_SIGNED |F32 |F32 |
+ *
* @note GEMM_LOWP: low precision GEMM kernel
* This kernel performs the following computations:
*
@@ -81,69 +89,36 @@ public:
* -# Convert b values from QASYMM8 to int32 add b_offset to each of them.
* -# Compute the matrix product of the resulting a * b in int32.
*
- * @note The @p output type is S32 if @p gemm_info.type == GEMMLowpOutputStageType::NONE. It is QASYMM8/QASYMM8_SIGNED otherwise
+ * @note The @p output type is S32 if @p gemm_info.type == GEMMLowpOutputStageType::NONE. It is QASYMM8/QASYMM8_SIGNED/F32 otherwise
*
* @param[in] a First input tensor (Matrix A). Data type supported: QASYMM8/QASYMM8_SIGNED.
* @param[in] b Second input tensor (Matrix B). Data type supported: QASYMM8/QASYMM8_SIGNED/QSYMM8/QSYMM8_PER_CHANNEL.
- * @param[in] c Third input tensor (Matrix C). It can be a nullptr. Data type supported: S32
- * @param[out] output Output tensor. Data type supported: Data type supported: S32/QASYMM8/QASYMM8_SIGNED
+ * @param[in] c Third input tensor (Matrix C). It can be a nullptr. Data type supported: S32/F32
+ * @param[out] output Output tensor. Data type supported: S32/QASYMM8/QASYMM8_SIGNED/F32
* @param[in] gemm_info (Optional) Specifies if the matrix A and/or matrix B have been reshaped and
* if the reshape of matrix B should be executed only for the first run
*/
- void configure(const ITensor *a, const ITensor *b, const ITensor *c, ITensor *output, const GEMMInfo &gemm_info = GEMMInfo());
+ void configure(
+ const ITensor *a, const ITensor *b, const ITensor *c, ITensor *output, const GEMMInfo &gemm_info = GEMMInfo());
/** Static function to check if given info will lead to a valid configuration of @ref NEGEMMLowpMatrixMultiplyCore
*
- * @note The @p output type is S32 if @p gemm_info.type == GEMMLowpOutputStageType::NONE. It is QASYMM8/QASYMM8_SIGNED otherwise
- *
- * @param[in] a First input tensor info (Matrix A). Data type supported: QASYMM8/QASYMM8_SIGNED.
- * @param[in] b Second input tensor info (Matrix B). Data type supported: QASYMM8/QASYMM8_SIGNED/QSYMM8/QSYMM8_PER_CHANNEL.
- * @param[in] c Third input tensor info (Matrix C). It can be a nullptr. Data type supported: S32
- * @param[in] output Output tensor info. Data type supported: Data type supported: S32/QASYMM8/QASYMM8_SIGNED
- * @param[in] gemm_info (Optional) Specifies if the matrix A and/or matrix B have been reshaped and
- * if the reshape of matrix B should be executed only for the first run
+ * Similar to @ref NEGEMMLowpMatrixMultiplyCore::configure()
*
* @return a status
*/
- static Status validate(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, const ITensorInfo *output, const GEMMInfo &gemm_info = GEMMInfo());
+ static Status validate(const ITensorInfo *a,
+ const ITensorInfo *b,
+ const ITensorInfo *c,
+ const ITensorInfo *output,
+ const GEMMInfo &gemm_info = GEMMInfo());
// Inherited methods overridden
void run() override;
void prepare() override;
private:
- MemoryGroup _memory_group;
- IWeightsManager *_weights_manager;
- NEGEMMAssemblyDispatch _asm_glue;
- NEGEMMLowpMatrixMultiplyKernel _mm_kernel;
- NEGEMMInterleave4x4Kernel _mtx_a_reshape_kernel;
- NEGEMMTranspose1xWKernel _mtx_b_reshape_kernel;
- NEGEMMLowpMatrixAReductionKernel _mtx_a_reduction_kernel;
- NEGEMMLowpMatrixBReductionKernel _mtx_b_reduction_kernel;
- NEGEMMLowpOffsetContributionKernel _offset_contribution_kernel;
- NEGEMMLowpOffsetContributionOutputStageKernel _offset_contribution_output_stage_kernel;
- NEActivationLayer _activation_func;
- NEConvertQuantizedSignednessKernel _convert_to_signed_asymm;
- NEConvertQuantizedSignednessKernel _convert_from_signed_asymm;
-
- Tensor _vector_sum_col;
- Tensor _vector_sum_row;
- Tensor _tmp_a;
- Tensor _tmp_b;
- Tensor _mm_result_s32;
- Tensor _signed_a;
- Tensor _signed_output;
- const ITensor *_original_b;
- int32_t _a_offset;
- int32_t _b_offset;
-
- bool _run_vector_matrix_multiplication;
- bool _assembly_path;
- bool _fused_assembly_path;
- bool _reshape_b_only_on_first_run;
- bool _is_prepared;
- bool _fuse_output_stage;
- bool _run_activation;
- bool _flip_signedness;
+ struct Impl;
+ std::unique_ptr<Impl> _impl;
};
} // namespace arm_compute
-#endif /*ARM_COMPUTE_NEGEMMLOWPMATRIXMULTIPLYCORE_H */
+#endif // ACL_ARM_COMPUTE_RUNTIME_NEON_FUNCTIONS_NEGEMMLOWPMATRIXMULTIPLYCORE_H
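
A minimal sketch of the quantized path with a default GEMMInfo, i.e. GEMMLowpOutputStageType::NONE, so the destination stays S32 as the @note above states. Quantization parameters and shapes are illustrative:

    #include "arm_compute/core/TensorInfo.h"
    #include "arm_compute/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.h"
    #include "arm_compute/runtime/Tensor.h"

    using namespace arm_compute;

    void run_gemmlowp_example()
    {
        // A (16x32) * B (32x8) -> dst (16x8); with no output stage the accumulators stay S32.
        Tensor a, b, dst;
        a.allocator()->init(TensorInfo(TensorShape(32U, 16U), 1, DataType::QASYMM8, QuantizationInfo(0.25f, 10)));
        b.allocator()->init(TensorInfo(TensorShape(8U, 32U), 1, DataType::QASYMM8, QuantizationInfo(0.5f, 3)));
        dst.allocator()->init(TensorInfo(TensorShape(8U, 16U), 1, DataType::S32));

        NEGEMMLowpMatrixMultiplyCore gemmlowp;
        gemmlowp.configure(&a, &b, nullptr, &dst); // default GEMMInfo: GEMMLowpOutputStageType::NONE

        a.allocator()->allocate();
        b.allocator()->allocate();
        dst.allocator()->allocate();
        // ... fill a and b with quantized data ...
        gemmlowp.run();
    }
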
diff --git a/arm_compute/runtime/NEON/functions/NEGEMMLowpOutputStage.h b/arm_compute/runtime/NEON/functions/NEGEMMLowpOutputStage.h
index cbdc788c0a..0d932bb4af 100644
--- a/arm_compute/runtime/NEON/functions/NEGEMMLowpOutputStage.h
+++ b/arm_compute/runtime/NEON/functions/NEGEMMLowpOutputStage.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2017-2021 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -24,9 +24,10 @@
#ifndef ARM_COMPUTE_NEGEMMLOWPOUTPUTSTAGE_H
#define ARM_COMPUTE_NEGEMMLOWPOUTPUTSTAGE_H
-#include "arm_compute/runtime/NEON/INESimpleFunctionNoBorder.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/IFunction.h"
-/** This file contains all available output stages for GEMMLowp on NEON.
+/** This file contains all available output stages for GEMMLowp.
*
* In gemmlowp, the "output stage" is the process that takes a final int32 accumulator value (the output of @ref NEGEMMLowpMatrixMultiplyCore),
* and processes it to obtain the final ASYMM8 value.
@@ -37,253 +38,39 @@
namespace arm_compute
{
class ITensor;
-
-/** Basic function to execute NEGEMMLowpQuantizeDownInt32ToUint8Scale on NEON.
- *
- * NEGEMMLowpQuantizeDownInt32ToUint8Scale depends on 3 parameters: result_offset, result_mult_int, result_shift
- * The final result is:
- *
- * ((input[i][k] + result_offset) * result_mult_int) >> result_shift
- *
- * In case the bias tensor is provided, the final result is:
- *
- * ((input[i][k] + bias[k] + result_offset) * result_mult_int) >> result_shift
- *
- * This function calls the following NEON kernels:
- *
- * -# @ref NEGEMMLowpQuantizeDownInt32ScaleKernel
- *
- * @note The function accepts also 2 optional input arguments (min and max) which can be used to implement "rectified linear unit" activation functions
- * after the result is shifted right by result_shift
-*/
-class NEGEMMLowpQuantizeDownInt32ToUint8Scale : public INESimpleFunctionNoBorder
-{
-public:
- /** Initialise the kernel's inputs, output
- *
- * @param[in] input Input tensor. It is the output of @ref NEGEMMLowpMatrixMultiplyCore function. Data type supported: S32
- * @param[in] bias Biases tensor. Only shared biases supported and it can be a nullptr if the addition of biases is not required.
- * Biases are 1D tensor with dimensions [OFM]. Data type supported: Same as @p input.
- * @param[out] output Output tensor. Data type supported: Data type supported: QASYMM8
- * @param[in] result_offset Offset to be added to each element of the input matrix
- * @param[in] result_mult_int Value to be multiplied to each element of the input matrix when once the result_offset has been add
- * @param[in] result_shift Number of bits to shift right the result before converting back to QASYMM8
- * @param[in] min (Optional) Min value used to saturate down the output result before converting back to QASYMM8. Defaults to the minimum possible 32-bit signed integer.
- * @param[in] max (Optional) Max value used to saturate up the output result before converting back to QASYMM8,
- * Along with @p min, this value can be used to implement "rectified linear unit" activation functions. Defaults to the maximum possible 32-bit signed integer.
- */
- ARM_COMPUTE_DEPRECATED_REL(20.05)
- void configure(const ITensor *input, const ITensor *bias, ITensor *output, int result_offset, int result_mult_int, int result_shift, int min = std::numeric_limits<int32_t>::lowest(),
- int max = std::numeric_limits<int32_t>::max());
- /** Static function to check if given info will lead to a valid configuration of @ref NEGEMMLowpQuantizeDownInt32ToUint8Scale
- *
- * @param[in] input Input tensor. It is the output of @ref NEGEMMLowpMatrixMultiplyCore function. Data type supported: S32
- * @param[in] bias Biases tensor. Only shared biases supported and it can be a nullptr if the addition of biases is not required.
- * Biases are 1D tensor with dimensions [OFM]. Data type supported: Same as @p input.
- * @param[in] output Output tensor. Data type supported: Data type supported: QASYMM8
- * @param[in] min (Optional) Min value used to saturate down the output result before converting back to QASYMM8. Defaults to the minimum possible 32-bit signed integer.
- * @param[in] max (Optional) Max value used to saturate up the output result before converting back to QASYMM8,
- * Along with @p min, this value can be used to implement "rectified linear unit" activation functions. Defaults to the maximum possible 32-bit signed integer.
- *
- * @return a status
- */
- ARM_COMPUTE_DEPRECATED_REL(20.05)
- static Status validate(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output, int min = std::numeric_limits<int32_t>::lowest(), int max = std::numeric_limits<int32_t>::max());
-};
-
-/** Basic function to execute NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPoint on NEON.
- *
- * NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPoint depends on 3 parameters:
- *
- * result_fixedpoint_multiplier, result_shift, result_offset_after_shift
- *
- * The final result is:
- *
- * (FixedPointMul(input[i][k], result_fixedpoint_multiplier) >> result_shift) + result_offset_after_shift
- *
- * where FixedPointMul(x, y) is the nearest integer to the following
- * mathematical expression, evaluated without overflow or intermediate rounding:
- *
- * (x * y) / 2^31
- *
- * For more information: https://github.com/google/gemmlowp/blob/master/public/output_stages.h#L68
- *
- * In case the bias tensor is provided, the final result is:
- *
- * ((FixedPointMul(input[i][k] + bias[k], result_fixedpoint_multiplier)) >> result_shift) + result_offset_after_shift
- *
- * This function calls the following NEON kernels:
- *
- * -# @ref NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel
- *
- * @note The function accepts also 2 optional input arguments (min and max) which can be used to implement "rectified linear unit" activation functions
- * after the result is shifted right by result_shift
-*/
-class NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPoint : public INESimpleFunctionNoBorder
-{
-public:
- /** Initialise the kernel's inputs, output
- *
- * @param[in] input Input tensor. Data type supported: S32
- * @param[in] bias Biases tensor. Only shared biases supported and it can be a nullptr if the biases addition is not required.
- * Biases are 1D tensor with dimensions [OFM]. Data type supported: Same as @p input.
- * @param[out] output Output tensor. Data type supported: Data type supported: QASYMM8
- * @param[in] result_fixedpoint_multiplier Fixed point value to be multiplied to each element of the input matrix when once the result_offset has been add
- * @param[in] result_shift Number of bits to shift right the result after the fixed point multiplication
- * @param[in] result_offset_after_shift Offset to be applied to result before converting it back to QASYMM8
- * @param[in] min (Optional) Min value used to saturate down the output result before converting back to QASYMM8. Defaults to the minimum possible 32-bit signed integer.
- * @param[in] max (Optional) Max value used to saturate up the output result before converting back to QASYMM8,
- * Along with @p min, this value can be used to implement "rectified linear unit" activation functions. Defaults to the maximum possible 32-bit signed integer.
- */
- void configure(const ITensor *input, const ITensor *bias, ITensor *output, int result_fixedpoint_multiplier, int result_shift, int result_offset_after_shift,
- int min = std::numeric_limits<int32_t>::lowest(), int max = std::numeric_limits<int32_t>::max());
- /** Static function to check if given info will lead to a valid configuration of @ref NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPoint
- *
- * @param[in] input Input tensor. It is the output of @ref NEGEMMLowpMatrixMultiplyCore function. Data type supported: S32
- * @param[in] bias Biases tensor. Only shared biases supported and it can be a nullptr if the addition of biases is not required.
- * Biases are 1D tensor with dimensions [OFM]. Data type supported: Same as @p input.
- * @param[in] output Output tensor. Data type supported: Data type supported: QASYMM8
- * @param[in] min (Optional) Min value used to saturate down the output result before converting back to QASYMM8. Defaults to the minimum possible 32-bit signed integer.
- * @param[in] max (Optional) Max value used to saturate up the output result before converting back to QASYMM8,
- * Along with @p min, this value can be used to implement "rectified linear unit" activation functions. Defaults to the maximum possible 32-bit signed integer.
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output, int min = std::numeric_limits<int32_t>::lowest(), int max = std::numeric_limits<int32_t>::max());
-};
-/** Basic function to execute NEGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPoint on NEON.
- *
- * NEGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPoint depends on 3 parameters:
- *
- * result_fixedpoint_multiplier, result_shift, result_offset_after_shift
- *
- * The final result is:
+class ITensorInfo;
+/** Basic function to execute GEMMLowpQuantizeDown kernels.
*
- * (FixedPointMul(input[i][k], result_fixedpoint_multiplier) >> result_shift) + result_offset_after_shift
+ * This function calls the following operators:
*
- * where FixedPointMul(x, y) is the nearest integer to the following
- * mathematical expression, evaluated without overflow or intermediate rounding:
- *
- * (x * y) / 2^31
- *
- * For more information: https://github.com/google/gemmlowp/blob/master/public/output_stages.h#L68
- *
- * In case the bias tensor is provided, the final result is:
- *
- * ((FixedPointMul(input[i][k] + bias[k], result_fixedpoint_multiplier)) >> result_shift) + result_offset_after_shift
- *
- * This function calls the following NEON kernels:
- *
- * -# @ref NEGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel
- *
- * @note The function accepts also 2 optional input arguments (min and max) which can be used to implement "rectified linear unit" activation functions
- * after the result is shifted right by result_shift
-*/
-class NEGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPoint : public INESimpleFunctionNoBorder
-{
-public:
- /** Initialise the kernel's inputs, output
- *
- * @param[in] input Input tensor. Data type supported: S32
- * @param[in] bias Biases tensor. Only shared biases supported and it can be a nullptr if the biases addition is not required.
- * Biases are 1D tensor with dimensions [OFM]. Data type supported: Same as @p input.
- * @param[out] output Output tensor. Data type supported: QASYMM8_SIGNED
- * @param[in] result_fixedpoint_multiplier Fixed-point value to be multiplied to each element of the input matrix once the result_offset has been added
- * @param[in] result_shift Number of bits to shift right the result after the fixed point multiplication
- * @param[in] result_offset_after_shift Offset to be applied to result before converting it back to QASYMM8_SIGNED
- * @param[in] min (Optional) Min value used to saturate down the output result before converting back to QASYMM8_SIGNED. Defaults to the minimum possible 32-bit signed integer.
- * @param[in] max (Optional) Max value used to saturate up the output result before converting back to QASYMM8_SIGNED,
- * Along with @p min, this value can be used to implement "rectified linear unit" activation functions. Defaults to the maximum possible 32-bit signed integer.
- */
- void configure(const ITensor *input, const ITensor *bias, ITensor *output, int result_fixedpoint_multiplier, int result_shift, int result_offset_after_shift,
- int min = std::numeric_limits<int32_t>::lowest(), int max = std::numeric_limits<int32_t>::max());
- /** Static function to check if given info will lead to a valid configuration of @ref NEGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPoint
- *
- * @param[in] input Input tensor. It is the output of @ref NEGEMMLowpMatrixMultiplyCore function. Data type supported: S32
- * @param[in] bias Biases tensor. Only shared biases supported and it can be a nullptr if the addition of biases is not required.
- * Biases are 1D tensor with dimensions [OFM]. Data type supported: Same as @p input.
- * @param[in] output Output tensor. Data type supported: QASYMM8_SIGNED
- * @param[in] min (Optional) Min value used to saturate down the output result before converting back to QASYMM8_SIGNED. Defaults to the minimum possible 32-bit signed integer.
- * @param[in] max (Optional) Max value used to saturate up the output result before converting back to QASYMM8_SIGNED,
- * Along with @p min, this value can be used to implement "rectified linear unit" activation functions. Defaults to the maximum possible 32-bit signed integer.
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output, int min = std::numeric_limits<int32_t>::lowest(), int max = std::numeric_limits<int32_t>::max());
-};
-/** Basic function to execute NEGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPoint on NEON.
- *
- * NEGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPoint depends on 2 parameters:
- *
- * result_fixedpoint_multiplier, result_shift
- *
- * The final result is:
- *
- * (FixedPointMul(input[i][k], result_fixedpoint_multiplier) >> result_shift)
- *
- * where FixedPointMul(x, y) is the nearest integer to the following
- * mathematical expression, evaluated without overflow or intermediate rounding:
- *
- * (x * y) / 2^31
- *
- * For more information: https://github.com/google/gemmlowp/blob/master/public/output_stages.h#L68
- *
- * In case the bias tensor is provided, the final result is:
- *
- * (FixedPointMul(input[i][k] + bias[k], result_fixedpoint_multiplier) >> result_shift)
- *
- * This function calls the following NEON kernels:
- *
- * -# @ref NEGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel
- *
- * @note The function also accepts 2 optional input arguments (min and max) which can be used to implement "rectified linear unit" activation functions
- * after the result is shifted right by result_shift
+ * -# @ref cpu::CpuGemmLowpOutputStage
*/
-class NEGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPoint : public INESimpleFunctionNoBorder
+class NEGEMMLowpOutputStage : public IFunction
{
public:
+ /** Constructor */
+ NEGEMMLowpOutputStage();
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ NEGEMMLowpOutputStage(const NEGEMMLowpOutputStage &) = delete;
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ NEGEMMLowpOutputStage &operator=(const NEGEMMLowpOutputStage &) = delete;
+ /** Prevent instances of this class from being moved (As this class contains non movable objects) */
+ NEGEMMLowpOutputStage(NEGEMMLowpOutputStage &&) = delete;
+ /** Prevent instances of this class from being moved (As this class contains non movable objects) */
+ NEGEMMLowpOutputStage &operator=(NEGEMMLowpOutputStage &&) = delete;
+ /** Default destructor */
+ ~NEGEMMLowpOutputStage();
/** Initialise the kernel's inputs, output
*
- * @param[in] input Input tensor. Data type supported: S32
- * @param[in] bias Biases tensor. Only shared biases supported and it can be a nullptr if the biases addition is not required.
- * Biases are 1D tensor with dimensions [OFM]. Data type supported: Same as @p input.
- * @param[out] output Output tensor. Data type supported: QSYMM16
- * @param[in] result_fixedpoint_multiplier Fixed-point value to be multiplied to each element of the input matrix
- * @param[in] result_shift Number of bits to shift right the result after the fixed point multiplication
- * @param[in] min (Optional) Min value used to saturate down the output result before converting back to QSYMM16. Defaults to the minimum possible 32-bit signed integer.
- * @param[in] max (Optional) Max value used to saturate up the output result before converting back to QSYMM16.
- * Along with @p min, this value can be used to implement "rectified linear unit" activation functions. Defaults to the maximum possible 32-bit signed integer.
- */
- void configure(const ITensor *input, const ITensor *bias, ITensor *output, int result_fixedpoint_multiplier, int result_shift, int min = std::numeric_limits<int32_t>::lowest(),
- int max = std::numeric_limits<int32_t>::max());
- /** Static function to check if given info will lead to a valid configuration of @ref NEGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPoint
- *
- * @param[in] input Input tensor info. It is the output of @ref NEGEMMLowpMatrixMultiplyCore function. Data type supported: S32
- * @param[in] bias Biases tensor info. Only shared biases supported and it can be a nullptr if the addition of biases is not required.
- * Biases are 1D tensor with dimensions [OFM]. Data type supported: Same as @p input.
- * @param[in] output Output tensor info. Data type supported: QSYMM16
- * @param[in] min (Optional) Min value used to saturate down the output result before converting back to QSYMM16. Defaults to the minimum possible 32-bit signed integer.
- * @param[in] max (Optional) Max value used to saturate up the output result before converting back to QSYMM16,
- * Along with @p min, this value can be used to implement "rectified linear unit" activation functions. Defaults to the maximum possible 32-bit signed integer.
+ * Valid data layouts:
+ * - All
*
- * @return a status
- */
- static Status validate(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output, int min = std::numeric_limits<int32_t>::lowest(), int max = std::numeric_limits<int32_t>::max());
-};
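
For clarity, a minimal reference sketch of the FixedPointMul(x, y) operation documented in the comments above, following the gemmlowp definition those comments link to; the function name and the saturation of the single overflowing case are illustrative, not part of the library API:

#include <cstdint>
#include <limits>

// Nearest integer to (x * y) / 2^31, computed in 64-bit so the product
// cannot overflow. The lone overflowing case x == y == INT32_MIN is
// saturated, as in gemmlowp's SaturatingRoundingDoublingHighMul.
int32_t fixed_point_mul(int32_t x, int32_t y)
{
    if(x == std::numeric_limits<int32_t>::min() && y == std::numeric_limits<int32_t>::min())
    {
        return std::numeric_limits<int32_t>::max();
    }
    const int64_t product = static_cast<int64_t>(x) * static_cast<int64_t>(y);
    // Nudge toward the nearest integer, then divide (not shift) by 2^31 so
    // negative values truncate toward zero after the nudge.
    const int64_t nudge = product >= 0 ? (int64_t(1) << 30) : 1 - (int64_t(1) << 30);
    return static_cast<int32_t>((product + nudge) / (int64_t(1) << 31));
}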
-
-/** Basic function to execute GEMMLowpQuantizeDown kernels on NEON.
- *
- * This function calls the following NEON kernels:
- *
- * -# @ref NEGEMMLowpQuantizeDownInt32ScaleKernel
- * -# @ref NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel
- * -# @ref NEGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel
- * -# @ref NEGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel
-*/
-class NEGEMMLowpOutputStage : public INESimpleFunctionNoBorder
-{
-public:
- /** Initialise the kernel's inputs, output
+ * Valid data type configurations:
+ * |src0 |src1 |dst |
+ * |:--------------|:-------------|:-------------|
+ * |S32 |S32 |QASYMM8 |
+ * |S32 |S32 |QASYMM8_SIGNED|
+ * |S32 |S32 |QSYMM16 |
*
* @param[in] input Input tensor. Data type supported: S32
* @param[in] bias Biases tensor. Only shared biases supported and it can be a nullptr if the biases addition is not required.
@@ -302,7 +89,17 @@ public:
*
* @return a status
*/
- static Status validate(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output, const GEMMLowpOutputStageInfo &info);
+ static Status validate(const ITensorInfo *input,
+ const ITensorInfo *bias,
+ const ITensorInfo *output,
+ const GEMMLowpOutputStageInfo &info);
+
+ // Inherited methods overridden:
+ void run() override;
+
+private:
+ struct Impl;
+ std::unique_ptr<Impl> _impl;
};
} // namespace arm_compute
#endif /*ARM_COMPUTE_NEGEMMLOWPOUTPUTSTAGE_H */
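
For orientation, a minimal usage sketch of the consolidated NEGEMMLowpOutputStage interface, assuming a per-tensor fixed-point requantization from S32 to QASYMM8; the tensor shapes and the GEMMLowpOutputStageInfo field values below are illustrative assumptions, not values prescribed by the header:

#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/runtime/NEON/functions/NEGEMMLowpOutputStage.h"
#include "arm_compute/runtime/Tensor.h"

using namespace arm_compute;

Tensor acc; // S32 accumulators, e.g. produced by NEGEMMLowpMatrixMultiplyCore
Tensor dst; // requantized QASYMM8 result
acc.allocator()->init(TensorInfo(TensorShape(32U, 16U), 1, DataType::S32));
dst.allocator()->init(TensorInfo(TensorShape(32U, 16U), 1, DataType::QASYMM8));

GEMMLowpOutputStageInfo info{};
info.type                = GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT;
info.gemmlowp_multiplier = 1073741824; // example Q31 fixed-point multiplier (~0.5)
info.gemmlowp_shift      = 8;          // example right shift
info.gemmlowp_offset     = 128;        // example offset applied after the shift
info.gemmlowp_min_bound  = 0;          // clamp to the QASYMM8 range
info.gemmlowp_max_bound  = 255;
info.output_data_type    = DataType::QASYMM8;

NEGEMMLowpOutputStage output_stage;
output_stage.configure(&acc, nullptr /* no bias */, &dst, info);

acc.allocator()->allocate();
dst.allocator()->allocate();
// ... fill acc with accumulator values, then:
output_stage.run();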
diff --git a/arm_compute/runtime/NEON/functions/NEGEMMTranspose1xW.h b/arm_compute/runtime/NEON/functions/NEGEMMTranspose1xW.h
deleted file mode 100644
index 6a38490ed4..0000000000
--- a/arm_compute/runtime/NEON/functions/NEGEMMTranspose1xW.h
+++ /dev/null
@@ -1,58 +0,0 @@
-/*
- * Copyright (c) 2017-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_NEGEMMTRANSPOSE1XW_H
-#define ARM_COMPUTE_NEGEMMTRANSPOSE1XW_H
-
-#include "arm_compute/runtime/NEON/INESimpleFunctionNoBorder.h"
-
-namespace arm_compute
-{
-// Forward declarations
-class ITensor;
-
-/** Basic function to execute NEGEMMTranspose1xWKernel. This function calls the following NEON kernels:
- *
- * -# @ref NEGEMMTranspose1xWKernel
- *
- */
-class NEGEMMTranspose1xW : public INESimpleFunctionNoBorder
-{
-public:
- /** Initialise the kernel's inputs, output
- *
- * @param[in] input First input tensor. Data type supported: All
- * @param[out] output Output tensor. Data type supported: same as @p input
- */
- void configure(const ITensor *input, ITensor *output);
- /** Static function to check if given info will lead to a valid configuration of @ref NEGEMMTranspose1xW
- *
- * @param[in] input First input tensor. Data type supported: All
- * @param[in] output Output tensor. Data type supported: same as @p input
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *input, const ITensorInfo *output);
-};
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_NEGEMMTRANSPOSE1XW_H */
diff --git a/arm_compute/runtime/NEON/functions/NEGather.h b/arm_compute/runtime/NEON/functions/NEGather.h
index 7ed45c0f15..9c7ae0134d 100644
--- a/arm_compute/runtime/NEON/functions/NEGather.h
+++ b/arm_compute/runtime/NEON/functions/NEGather.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019 ARM Limited.
+ * Copyright (c) 2019-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -32,6 +32,7 @@ namespace arm_compute
{
// Forward declarations
class ITensor;
+class ITensorInfo;
/** Basic function to run @ref NEGatherKernel */
class NEGather : public INESimpleFunctionNoBorder
@@ -39,19 +40,26 @@ class NEGather : public INESimpleFunctionNoBorder
public:
/** Initialise the kernel's inputs and outputs
*
+ * Valid data layouts:
+ * - All
+ *
+ * Valid data type configurations:
+ * |src |dst |
+ * |:--------------|:--------------|
+ * |All |All |
+ *
* @param[in] input Source tensor. Supported tensor rank: up to 4. Data type supported: All
- * @param[in] indices Indices tensor. Supported tensor rank: up to 1. Must be one of the following type: U32/S32. Each value Must be in range [0, input.shape[@p axis])
+ * @param[in] indices Indices tensor. Supported tensor rank: up to 3. Must be one of the following type: U32/S32. Each value must be in range [0, input.shape[@p axis]), otherwise the result will become unpredictable.
+ * @note The "axis" must be in the range [0, input.rank -1] when indices is a vector, and must be 1 when indices is a 2D or 3D tensor.
* @param[out] output Destination tensor. Data type supported: Same as @p input
* @param[in] axis (Optional) The axis in @p input to gather @p indices from. Defaults to 0
+ *
*/
void configure(const ITensor *input, const ITensor *indices, ITensor *output, int axis = 0);
- /** Static function to check if given info will lead to a valid configuration of @ref NEGatherKernel
+ /** Static function to check if given info will lead to a valid configuration
*
- * @param[in] input Source tensor info. Supported tensor rank: up to 4. Data type supported: All
- * @param[in] indices Indices tensor info. Supported tensor rank: up to 1. Must be one of the following types: U32/S32. Each value Must be in range [0, input.shape[@p axis])
- * @param[in] output Destination tensor info. Data type supported: Same as @p input
- * @param[in] axis (Optional) The axis in @p input to gather @p indices from. Defaults to 0
+ * Similar to @ref NEGather::configure()
*
* @return a status
*/
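
A minimal usage sketch of the gather interface with a 1-D index vector; the shapes, data types and axis value are illustrative assumptions:

#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/runtime/NEON/functions/NEGather.h"
#include "arm_compute/runtime/Tensor.h"

using namespace arm_compute;

Tensor src, indices, dst;
src.allocator()->init(TensorInfo(TensorShape(10U, 4U), 1, DataType::F32)); // rank-2 source
indices.allocator()->init(TensorInfo(TensorShape(3U), 1, DataType::U32)); // 3 indices into axis 0
dst.allocator()->init(TensorInfo(TensorShape(3U, 4U), 1, DataType::F32)); // gathered output

NEGather gather;
gather.configure(&src, &indices, &dst, 0 /* axis */);

src.allocator()->allocate();
indices.allocator()->allocate();
dst.allocator()->allocate();
// ... fill src, and indices with values in [0, 10), then:
gather.run();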
diff --git a/arm_compute/runtime/NEON/functions/NEGaussian3x3.h b/arm_compute/runtime/NEON/functions/NEGaussian3x3.h
deleted file mode 100644
index 45736664ea..0000000000
--- a/arm_compute/runtime/NEON/functions/NEGaussian3x3.h
+++ /dev/null
@@ -1,55 +0,0 @@
-/*
- * Copyright (c) 2016-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_NEGAUSSIAN3x3_H
-#define ARM_COMPUTE_NEGAUSSIAN3x3_H
-
-#include "arm_compute/core/Types.h"
-#include "arm_compute/runtime/NEON/INESimpleFunction.h"
-
-#include <cstdint>
-
-namespace arm_compute
-{
-class ITensor;
-
-/** Basic function to execute gaussian filter 3x3. This function calls the following NEON kernels:
- *
- * -# @ref NEFillBorderKernel (executed if border_mode == CONSTANT or border_mode == REPLICATE)
- * -# @ref NEGaussian3x3Kernel
- *
- */
-class NEGaussian3x3 : public INESimpleFunction
-{
-public:
- /** Initialise the function's input, output and border mode.
- *
- * @param[in, out] input Source tensor. Data type supported: U8. (Written to only for @p border_mode != UNDEFINED)
- * @param[out] output Destination tensor, Data type supported: U8.
- * @param[in] border_mode Strategy to use for borders.
- * @param[in] constant_border_value (Optional) Constant value to use for borders if border_mode is set to CONSTANT.
- */
- void configure(ITensor *input, ITensor *output, BorderMode border_mode, uint8_t constant_border_value = 0);
-};
-}
-#endif /*ARM_COMPUTE_NEGAUSSIAN3x3_H */
diff --git a/arm_compute/runtime/NEON/functions/NEGaussian5x5.h b/arm_compute/runtime/NEON/functions/NEGaussian5x5.h
deleted file mode 100644
index 847530169a..0000000000
--- a/arm_compute/runtime/NEON/functions/NEGaussian5x5.h
+++ /dev/null
@@ -1,75 +0,0 @@
-/*
- * Copyright (c) 2016-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_NEGAUSSIAN5x5_H
-#define ARM_COMPUTE_NEGAUSSIAN5x5_H
-
-#include "arm_compute/core/NEON/kernels/NEFillBorderKernel.h"
-#include "arm_compute/core/NEON/kernels/NEGaussian5x5Kernel.h"
-#include "arm_compute/core/Types.h"
-#include "arm_compute/runtime/IFunction.h"
-#include "arm_compute/runtime/IMemoryManager.h"
-#include "arm_compute/runtime/MemoryGroup.h"
-#include "arm_compute/runtime/Tensor.h"
-
-#include <cstdint>
-#include <memory>
-
-namespace arm_compute
-{
-class ITensor;
-
-/** Basic function to execute gaussian filter 5x5. This function calls the following NEON kernels:
- *
- * -# @ref NEFillBorderKernel (executed if border_mode == CONSTANT or border_mode == REPLICATE)
- * -# @ref NEGaussian5x5HorKernel
- * -# @ref NEGaussian5x5VertKernel
- *
- */
-class NEGaussian5x5 : public IFunction
-{
-public:
- /** Default constructor
- */
- NEGaussian5x5(std::shared_ptr<IMemoryManager> memory_manager = nullptr);
- /** Initialise the function's input, output and border mode.
- *
- * @param[in, out] input Source tensor. Data type supported: U8. (Written to only for @p border_mode != UNDEFINED)
- * @param[out] output Destination tensor, Data type supported: U8.
- * @param[in] border_mode Strategy to use for borders.
- * @param[in] constant_border_value (Optional) Constant value to use for borders if border_mode is set to CONSTANT.
- */
- void configure(ITensor *input, ITensor *output, BorderMode border_mode, uint8_t constant_border_value = 0);
-
- // Inherited methods overridden:
- void run() override;
-
-protected:
- MemoryGroup _memory_group; /**< Function memory group */
- NEGaussian5x5HorKernel _kernel_hor; /**< kernel for horizontal pass */
- NEGaussian5x5VertKernel _kernel_vert; /**< kernel for vertical pass */
- Tensor _tmp; /**< temporary buffer for output of horizontal pass */
- NEFillBorderKernel _border_handler; /**< kernel to handle tensor borders */
-};
-}
-#endif /*ARM_COMPUTE_NEGAUSSIAN5x5_H */
diff --git a/arm_compute/runtime/NEON/functions/NEGaussianPyramid.h b/arm_compute/runtime/NEON/functions/NEGaussianPyramid.h
deleted file mode 100644
index a6b21278ba..0000000000
--- a/arm_compute/runtime/NEON/functions/NEGaussianPyramid.h
+++ /dev/null
@@ -1,122 +0,0 @@
-/*
- * Copyright (c) 2016-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_NEGAUSSIANPYRAMID_H
-#define ARM_COMPUTE_NEGAUSSIANPYRAMID_H
-
-#include "arm_compute/core/IPyramid.h"
-#include "arm_compute/core/NEON/kernels/NEGaussianPyramidKernel.h"
-#include "arm_compute/core/Types.h"
-#include "arm_compute/runtime/IFunction.h"
-#include "arm_compute/runtime/NEON/functions/NEGaussian5x5.h"
-#include "arm_compute/runtime/NEON/functions/NEScale.h"
-#include "arm_compute/runtime/Pyramid.h"
-#include "arm_compute/runtime/Tensor.h"
-
-#include <cstdint>
-#include <memory>
-
-namespace arm_compute
-{
-class ITensor;
-
-/** Common interface for all Gaussian pyramid functions */
-class NEGaussianPyramid : public IFunction
-{
-public:
- /** Default constructor */
- NEGaussianPyramid();
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- NEGaussianPyramid(const NEGaussianPyramid &) = delete;
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- NEGaussianPyramid &operator=(const NEGaussianPyramid &) = delete;
- /** Allow instances of this class to be moved */
- NEGaussianPyramid(NEGaussianPyramid &&) = default;
- /** Allow instances of this class to be moved */
- NEGaussianPyramid &operator=(NEGaussianPyramid &&) = default;
- /** Default destructor */
- virtual ~NEGaussianPyramid() = default;
-
- /** Initialise the function's source, destinations and border mode.
- *
- * @param[in] input Source tensor. Data type supported: U8.
- * @param[out] pyramid Destination pyramid tensors, Data type supported at each level: U8.
- * @param[in] border_mode Border mode to use.
- * @param[in] constant_border_value (Optional) Constant value to use for borders if border_mode is set to CONSTANT.
- *
- */
- virtual void configure(const ITensor *input, IPyramid *pyramid, BorderMode border_mode, uint8_t constant_border_value) = 0;
-
-protected:
- const ITensor *_input;
- IPyramid *_pyramid;
- Pyramid _tmp;
-};
-
-/** Basic function to execute gaussian pyramid with HALF scale factor. This function calls the following NEON kernels:
- *
- * -# @ref NEFillBorderKernel (executed if border_mode == CONSTANT or border_mode == REPLICATE)
- * -# @ref NEGaussianPyramidHorKernel
- * -# @ref NEGaussianPyramidVertKernel
- *
- */
-class NEGaussianPyramidHalf : public NEGaussianPyramid
-{
-public:
- /** Constructor */
- NEGaussianPyramidHalf();
-
- // Inherited methods overridden:
- void configure(const ITensor *input, IPyramid *pyramid, BorderMode border_mode, uint8_t constant_border_value) override;
- void run() override;
-
-private:
- std::vector<NEFillBorderKernel> _horizontal_border_handler;
- std::vector<NEFillBorderKernel> _vertical_border_handler;
- std::vector<NEGaussianPyramidHorKernel> _horizontal_reduction;
- std::vector<NEGaussianPyramidVertKernel> _vertical_reduction;
-};
-
-/** Basic function to execute gaussian pyramid with ORB scale factor. This function calls the following NEON kernels and functions:
- *
- * -# @ref NEFillBorderKernel (executed if border_mode == CONSTANT or border_mode == REPLICATE)
- * -# @ref NEGaussian5x5
- * -# @ref NEScaleKernel
- *
- */
-class NEGaussianPyramidOrb : public NEGaussianPyramid
-{
-public:
- /** Constructor */
- NEGaussianPyramidOrb();
-
- // Inherited methods overridden:
- void configure(const ITensor *input, IPyramid *pyramid, BorderMode border_mode, uint8_t constant_border_value) override;
- void run() override;
-
-private:
- std::vector<NEGaussian5x5> _gaus5x5;
- std::vector<NEScale> _scale_nearest;
-};
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_NEGAUSSIANPYRAMID_H */
diff --git a/arm_compute/runtime/NEON/functions/NEGenerateProposalsLayer.h b/arm_compute/runtime/NEON/functions/NEGenerateProposalsLayer.h
index 7c470fbaf0..0f294fde22 100644
--- a/arm_compute/runtime/NEON/functions/NEGenerateProposalsLayer.h
+++ b/arm_compute/runtime/NEON/functions/NEGenerateProposalsLayer.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019-2020 ARM Limited.
+ * Copyright (c) 2019-2021 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -23,34 +23,35 @@
*/
#ifndef ARM_COMPUTE_NEGENERATEPROPOSALSLAYER_H
#define ARM_COMPUTE_NEGENERATEPROPOSALSLAYER_H
-#include "arm_compute/core/NEON/kernels/NEBoundingBoxTransformKernel.h"
-#include "arm_compute/core/NEON/kernels/NEDequantizationLayerKernel.h"
-#include "arm_compute/core/NEON/kernels/NEGenerateProposalsLayerKernel.h"
-#include "arm_compute/core/NEON/kernels/NEPadLayerKernel.h"
-#include "arm_compute/core/NEON/kernels/NEPermuteKernel.h"
-#include "arm_compute/core/NEON/kernels/NEQuantizationLayerKernel.h"
-#include "arm_compute/core/NEON/kernels/NEReshapeLayerKernel.h"
+
#include "arm_compute/core/Types.h"
#include "arm_compute/runtime/CPP/CPPScheduler.h"
#include "arm_compute/runtime/CPP/functions/CPPBoxWithNonMaximaSuppressionLimit.h"
#include "arm_compute/runtime/IFunction.h"
#include "arm_compute/runtime/MemoryGroup.h"
+#include "arm_compute/runtime/NEON/functions/NEBoundingBoxTransform.h"
+#include "arm_compute/runtime/NEON/functions/NEDequantizationLayer.h"
+#include "arm_compute/runtime/NEON/functions/NEPadLayer.h"
+#include "arm_compute/runtime/NEON/functions/NEPermute.h"
+#include "arm_compute/runtime/NEON/functions/NEQuantizationLayer.h"
+#include "arm_compute/runtime/NEON/functions/NEReshapeLayer.h"
#include "arm_compute/runtime/Tensor.h"
namespace arm_compute
{
class ITensor;
+class NEComputeAllAnchorsKernel;
/** Basic function to generate proposals for an RPN (Region Proposal Network)
*
- * This function calls the following Neon kernels:
- * -# @ref NEComputeAllAnchors
+ * This function calls the following Arm(R) Neon(TM) layers/kernels:
+ * -# @ref NEComputeAllAnchorsKernel
* -# @ref NEPermute x 2
* -# @ref NEReshapeLayer x 2
* -# @ref NEBoundingBoxTransform
* -# @ref NEPadLayerKernel
- * -# @ref NEDequantizationLayerKernel x 2
- * -# @ref NEQuantizationLayerKernel
+ * -# @ref NEDequantizationLayer x 2
+ * -# @ref NEQuantizationLayer
* And the following CPP kernels:
* -# @ref CPPBoxWithNonMaximaSuppressionLimit
*/
@@ -66,9 +67,21 @@ public:
NEGenerateProposalsLayer(const NEGenerateProposalsLayer &) = delete;
/** Prevent instances of this class from being copied (As this class contains pointers) */
NEGenerateProposalsLayer &operator=(const NEGenerateProposalsLayer &) = delete;
+ /** Default destructor */
+ ~NEGenerateProposalsLayer();
/** Set the input and output tensors.
*
+ * Valid data layouts:
+ * - All
+ *
+ * Valid data type configurations:
+ * |src0 |src1 |src2 |dst |
+ * |:--------------|:------------------|:--------|:--------------|
+ * |F16 |F16 |F16 |F16 |
+ * |F32 |F32 |F32 |F32 |
+ * |QASYMM8 |QSYMM8 |QSYMM16 |QASYMM8 |
+ *
* @param[in] scores Scores from convolution layer of size (W, H, A), where H and W are the height and width of the feature map, and A is the number of anchors.
* Data types supported: QASYMM8/F16/F32
* @param[in] deltas Bounding box deltas from convolution layer of size (W, H, 4*A). Data types supported: Same as @p scores
@@ -82,7 +95,12 @@ public:
* @note Only single image prediction is supported. Height and Width (and scale) of the image will be contained in the @ref GenerateProposalsInfo struct.
* @note Proposals contains all the proposals. Of those, only the first num_valid_proposals are valid.
*/
- void configure(const ITensor *scores, const ITensor *deltas, const ITensor *anchors, ITensor *proposals, ITensor *scores_out, ITensor *num_valid_proposals,
+ void configure(const ITensor *scores,
+ const ITensor *deltas,
+ const ITensor *anchors,
+ ITensor *proposals,
+ ITensor *scores_out,
+ ITensor *num_valid_proposals,
const GenerateProposalsInfo &info);
/** Static function to check if given info will lead to a valid configuration of @ref NEGenerateProposalsLayer
@@ -99,7 +117,11 @@ public:
*
* @return a Status
*/
- static Status validate(const ITensorInfo *scores, const ITensorInfo *deltas, const ITensorInfo *anchors, const ITensorInfo *proposals, const ITensorInfo *scores_out,
+ static Status validate(const ITensorInfo *scores,
+ const ITensorInfo *deltas,
+ const ITensorInfo *anchors,
+ const ITensorInfo *proposals,
+ const ITensorInfo *scores_out,
const ITensorInfo *num_valid_proposals,
const GenerateProposalsInfo &info);
@@ -110,17 +132,17 @@ private:
// Memory group manager
MemoryGroup _memory_group;
- // Neon kernels
- NEPermuteKernel _permute_deltas_kernel;
- NEReshapeLayerKernel _flatten_deltas_kernel;
- NEPermuteKernel _permute_scores_kernel;
- NEReshapeLayerKernel _flatten_scores_kernel;
- NEComputeAllAnchorsKernel _compute_anchors_kernel;
- NEBoundingBoxTransformKernel _bounding_box_kernel;
- NEPadLayerKernel _pad_kernel;
- NEDequantizationLayerKernel _dequantize_anchors;
- NEDequantizationLayerKernel _dequantize_deltas;
- NEQuantizationLayerKernel _quantize_all_proposals;
+ // kernels/layers
+ NEPermute _permute_deltas;
+ NEReshapeLayer _flatten_deltas;
+ NEPermute _permute_scores;
+ NEReshapeLayer _flatten_scores;
+ std::unique_ptr<NEComputeAllAnchorsKernel> _compute_anchors;
+ NEBoundingBoxTransform _bounding_box;
+ NEPadLayer _pad;
+ NEDequantizationLayer _dequantize_anchors;
+ NEDequantizationLayer _dequantize_deltas;
+ NEQuantizationLayer _quantize_all_proposals;
// CPP functions
CPPBoxWithNonMaximaSuppressionLimit _cpp_nms;
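
A condensed configuration sketch of this layer; the image dimensions passed through GenerateProposalsInfo and the tensor setup (elided here) are illustrative assumptions:

#include "arm_compute/runtime/NEON/functions/NEGenerateProposalsLayer.h"
#include "arm_compute/runtime/Tensor.h"

using namespace arm_compute;

// scores: (W, H, A), deltas: (W, H, 4*A), anchors: (4, A); tensor
// initialisation and allocation are elided for brevity.
Tensor scores, deltas, anchors, proposals, scores_out, num_valid_proposals;

// Original image width/height and scale; the remaining NMS parameters are
// left at their defaults (example values, for illustration only).
const GenerateProposalsInfo info(800.f, 600.f, 1.f);

NEGenerateProposalsLayer generate_proposals; // optionally pass a memory manager
generate_proposals.configure(&scores, &deltas, &anchors, &proposals,
                             &scores_out, &num_valid_proposals, info);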
diff --git a/arm_compute/runtime/NEON/functions/NEHOGDescriptor.h b/arm_compute/runtime/NEON/functions/NEHOGDescriptor.h
deleted file mode 100644
index f0f46ce3ee..0000000000
--- a/arm_compute/runtime/NEON/functions/NEHOGDescriptor.h
+++ /dev/null
@@ -1,76 +0,0 @@
-/*
- * Copyright (c) 2016-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_NEHOGDESCRIPTOR_H
-#define ARM_COMPUTE_NEHOGDESCRIPTOR_H
-
-#include "arm_compute/core/NEON/kernels/NEHOGDescriptorKernel.h"
-#include "arm_compute/runtime/IFunction.h"
-#include "arm_compute/runtime/IMemoryManager.h"
-#include "arm_compute/runtime/MemoryGroup.h"
-#include "arm_compute/runtime/NEON/functions/NEHOGGradient.h"
-#include "arm_compute/runtime/Tensor.h"
-
-#include <memory>
-
-namespace arm_compute
-{
-class IHOG;
-/** Basic function to calculate HOG descriptor. This function calls the following NEON kernels:
- *
- * -# @ref NEHOGGradient
- * -# @ref NEHOGOrientationBinningKernel
- * -# @ref NEHOGBlockNormalizationKernel
- *
- */
-class NEHOGDescriptor : public IFunction
-{
-public:
- /** Default constructor */
- NEHOGDescriptor(std::shared_ptr<IMemoryManager> memory_manager = nullptr);
- /** Initialise the function's source, destination, HOG data-object and border mode
- *
- * @param[in, out] input Input tensor. Data type supported: U8
- * (Written to only for @p border_mode != UNDEFINED)
- * @param[out] output Output tensor which stores the HOG descriptor. DataType supported: F32. The number of channels is equal to the number of histogram bins per block
- * @param[in] hog HOG data object which describes the HOG descriptor
- * @param[in] border_mode Border mode to use.
- * @param[in] constant_border_value (Optional) Constant value to use for borders if border_mode is set to CONSTANT.
- */
- void configure(ITensor *input, ITensor *output, const IHOG *hog, BorderMode border_mode, uint8_t constant_border_value = 0);
-
- // Inherited method overridden:
- void run() override;
-
-private:
- MemoryGroup _memory_group;
- NEHOGGradient _gradient;
- NEHOGOrientationBinningKernel _orient_bin;
- NEHOGBlockNormalizationKernel _block_norm;
- Tensor _mag;
- Tensor _phase;
- Tensor _hog_space;
-};
-}
-
-#endif /* ARM_COMPUTE_NEHOGDESCRIPTOR_H */
diff --git a/arm_compute/runtime/NEON/functions/NEHOGDetector.h b/arm_compute/runtime/NEON/functions/NEHOGDetector.h
deleted file mode 100644
index c0bd3da468..0000000000
--- a/arm_compute/runtime/NEON/functions/NEHOGDetector.h
+++ /dev/null
@@ -1,57 +0,0 @@
-/*
- * Copyright (c) 2016-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_NEHOGDETECTOR_H
-#define ARM_COMPUTE_NEHOGDETECTOR_H
-
-#include "arm_compute/core/IHOG.h"
-#include "arm_compute/core/NEON/kernels/NEHOGDetectorKernel.h"
-#include "arm_compute/runtime/NEON/INESimpleFunctionNoBorder.h"
-
-namespace arm_compute
-{
-/** Basic function to execute HOG detector based on linear SVM. This function calls the following NEON kernel:
- *
- * -# @ref NEHOGDetectorKernel
- *
- */
-class NEHOGDetector : public INESimpleFunctionNoBorder
-{
-public:
- /** Initialise the kernel's input, output, HOG data object, detection window stride, threshold and index class
- *
- * @attention The function does not reset the number of values in @ref IDetectionWindowArray so it is the caller's responsibility to clear it.
- *
- * @param[in] input Input tensor. It is the output of @ref NEHOGDescriptor. Data type supported: F32
- * @param[in] hog HOG data-object that describes the HOG descriptor
- * @param[out] detection_windows Array of @ref DetectionWindow used to store the detected objects
- * @param[in] detection_window_stride Distance in pixels between 2 consecutive detection windows in x and y directions.
- * It must be multiple of the block stride stored in hog
- * @param[in] threshold (Optional) Threshold for the distance between features and SVM classifying plane
- * @param[in] idx_class (Optional) Index of the class used for evaluating which class the detection window belongs to
- */
- void configure(const ITensor *input, const IHOG *hog, IDetectionWindowArray *detection_windows, const Size2D &detection_window_stride, float threshold = 0.0f, size_t idx_class = 0);
-};
-} // namespace arm_compute
-
-#endif /* ARM_COMPUTE_NEHOGDETECTOR_H */
diff --git a/arm_compute/runtime/NEON/functions/NEHOGGradient.h b/arm_compute/runtime/NEON/functions/NEHOGGradient.h
deleted file mode 100644
index f8c3827049..0000000000
--- a/arm_compute/runtime/NEON/functions/NEHOGGradient.h
+++ /dev/null
@@ -1,75 +0,0 @@
-/*
- * Copyright (c) 2016-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_NEHOGGRADIENT_H
-#define ARM_COMPUTE_NEHOGGRADIENT_H
-
-#include "arm_compute/core/NEON/INEKernel.h"
-#include "arm_compute/core/Types.h"
-#include "arm_compute/runtime/IFunction.h"
-#include "arm_compute/runtime/IMemoryManager.h"
-#include "arm_compute/runtime/MemoryGroup.h"
-#include "arm_compute/runtime/NEON/functions/NEDerivative.h"
-#include "arm_compute/runtime/Tensor.h"
-
-#include <cstdint>
-#include <memory>
-
-namespace arm_compute
-{
-class ITensor;
-/** Basic function to calculate the gradient for HOG. This function calls the following NEON kernels:
- *
- * -# @ref NEDerivative
- * -# NEMagnitudePhaseKernel
- *
- */
-class NEHOGGradient : public IFunction
-{
-public:
- /** Default constructor */
- NEHOGGradient(std::shared_ptr<IMemoryManager> memory_manager = nullptr);
- /** Initialise the function's source, destinations, phase type and border mode
- *
- * @param[in, out] input Input tensor. Data type supported: U8.
- * (Written to only for @p border_mode != UNDEFINED)
- * @param[out] output_magnitude Output tensor (magnitude). Data type supported: U16.
- * @param[out] output_phase Output tensor (phase). Format supported: U8
- * @param[in] phase_type Type of @ref PhaseType
- * @param[in] border_mode Border mode to use
- * @param[in] constant_border_value (Optional) Constant value to use for borders if border_mode is set to CONSTANT.
- */
- void configure(ITensor *input, ITensor *output_magnitude, ITensor *output_phase, PhaseType phase_type, BorderMode border_mode, uint8_t constant_border_value = 0);
-
- // Inherited method overridden:
- void run() override;
-
-private:
- MemoryGroup _memory_group;
- NEDerivative _derivative;
- std::unique_ptr<INEKernel> _mag_phase;
- Tensor _gx;
- Tensor _gy;
-};
-}
-#endif /*ARM_COMPUTE_NEHOGGRADIENT_H */
diff --git a/arm_compute/runtime/NEON/functions/NEHOGMultiDetection.h b/arm_compute/runtime/NEON/functions/NEHOGMultiDetection.h
deleted file mode 100644
index 3840b9c0ad..0000000000
--- a/arm_compute/runtime/NEON/functions/NEHOGMultiDetection.h
+++ /dev/null
@@ -1,109 +0,0 @@
-/*
- * Copyright (c) 2016-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_NEHOGMULTIDETECTION_H
-#define ARM_COMPUTE_NEHOGMULTIDETECTION_H
-
-#include "arm_compute/core/CPP/kernels/CPPDetectionWindowNonMaximaSuppressionKernel.h"
-#include "arm_compute/core/IArray.h"
-#include "arm_compute/core/IMultiHOG.h"
-#include "arm_compute/core/NEON/kernels/NEHOGDescriptorKernel.h"
-#include "arm_compute/runtime/IFunction.h"
-#include "arm_compute/runtime/IMemoryManager.h"
-#include "arm_compute/runtime/MemoryGroup.h"
-#include "arm_compute/runtime/NEON/functions/NEHOGDetector.h"
-#include "arm_compute/runtime/NEON/functions/NEHOGGradient.h"
-#include "arm_compute/runtime/Tensor.h"
-
-#include <memory>
-
-namespace arm_compute
-{
-/** Basic function to detect multiple objects (or the same object at different scales) on the same input image using HOG. This function calls the following NEON kernels:
- *
- * -# @ref NEHOGGradient
- * -# @ref NEHOGOrientationBinningKernel
- * -# @ref NEHOGBlockNormalizationKernel
- * -# @ref NEHOGDetector
- * -# @ref CPPDetectionWindowNonMaximaSuppressionKernel (executed if non_maxima_suppression == true)
- *
- * @note This implementation works if all the HOG data-objects within the IMultiHOG container have the same:
- * -# Phase type
- -# Normalization type
- -# L2 hysteresis threshold if the normalization type is L2HYS_NORM
- *
- */
-class NEHOGMultiDetection : public IFunction
-{
-public:
- /** Default constructor */
- NEHOGMultiDetection(std::shared_ptr<IMemoryManager> memory_manager = nullptr);
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- NEHOGMultiDetection(const NEHOGMultiDetection &) = delete;
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- NEHOGMultiDetection &operator=(const NEHOGMultiDetection &) = delete;
- /** Initialise the function's source, destination, detection window strides, border mode, threshold and non-maxima suppression
- *
- * @param[in, out] input Input tensor. Data type supported: U8
- * (Written to only for @p border_mode != UNDEFINED)
- * @param[in] multi_hog Container of multiple HOG data object. Each HOG data object describes one HOG model to detect.
- * This container should store the HOG data-objects in descending or ascending cell_size width order.
- * This will help to understand if the HOG descriptor computation can be skipped for some HOG data-objects
- * @param[out] detection_windows Array of @ref DetectionWindow used for locating the detected objects
- * @param[in] detection_window_strides Array of @ref Size2D used to specify the distance in pixels between 2 consecutive detection windows in x and y directions for each HOG data-object
- * The dimension of this array must be the same of multi_hog->num_models()
- * The i-th detection_window_stride of this array must be multiple of the block_stride stored in the i-th multi_hog array
- * @param[in] border_mode Border mode to use.
- * @param[in] constant_border_value (Optional) Constant value to use for borders if border_mode is set to CONSTANT.
- * @param[in] threshold (Optional) Threshold for the distance between features and SVM classifying plane
- * @param[in] non_maxima_suppression (Optional) Flag to specify whether the non-maxima suppression is required or not.
- * True if the non-maxima suppression stage has to be computed
- * @param[in] min_distance (Optional) Radial Euclidean distance to use for the non-maxima suppression stage
- *
- */
- void configure(ITensor *input, const IMultiHOG *multi_hog, IDetectionWindowArray *detection_windows, const ISize2DArray *detection_window_strides, BorderMode border_mode,
- uint8_t constant_border_value = 0,
- float threshold = 0.0f, bool non_maxima_suppression = false, float min_distance = 1.0f);
-
- // Inherited method overridden:
- void run() override;
-
-private:
- MemoryGroup _memory_group;
- NEHOGGradient _gradient_kernel;
- std::vector<NEHOGOrientationBinningKernel> _orient_bin_kernel;
- std::vector<NEHOGBlockNormalizationKernel> _block_norm_kernel;
- std::vector<NEHOGDetector> _hog_detect_kernel;
- CPPDetectionWindowNonMaximaSuppressionKernel _non_maxima_kernel;
- std::vector<Tensor> _hog_space;
- std::vector<Tensor> _hog_norm_space;
- IDetectionWindowArray *_detection_windows;
- Tensor _mag;
- Tensor _phase;
- bool _non_maxima_suppression;
- size_t _num_orient_bin_kernel;
- size_t _num_block_norm_kernel;
- size_t _num_hog_detect_kernel;
-};
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_NEHOGMULTIDETECTION_H */
diff --git a/arm_compute/runtime/NEON/functions/NEHarrisCorners.h b/arm_compute/runtime/NEON/functions/NEHarrisCorners.h
deleted file mode 100644
index caf887d492..0000000000
--- a/arm_compute/runtime/NEON/functions/NEHarrisCorners.h
+++ /dev/null
@@ -1,107 +0,0 @@
-/*
- * Copyright (c) 2016-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_NEHARRISCORNERS_H
-#define ARM_COMPUTE_NEHARRISCORNERS_H
-
-#include "arm_compute/core/CPP/kernels/CPPCornerCandidatesKernel.h"
-#include "arm_compute/core/CPP/kernels/CPPSortEuclideanDistanceKernel.h"
-#include "arm_compute/core/NEON/kernels/NEFillBorderKernel.h"
-#include "arm_compute/core/NEON/kernels/NEHarrisCornersKernel.h"
-#include "arm_compute/core/Types.h"
-#include "arm_compute/runtime/Array.h"
-#include "arm_compute/runtime/IFunction.h"
-#include "arm_compute/runtime/IMemoryManager.h"
-#include "arm_compute/runtime/MemoryGroup.h"
-#include "arm_compute/runtime/NEON/functions/NENonMaximaSuppression3x3.h"
-#include "arm_compute/runtime/Tensor.h"
-
-#include <cstdint>
-#include <memory>
-
-namespace arm_compute
-{
-class ITensor;
-using IImage = ITensor;
-
-/** Basic function to execute harris corners detection. This function calls the following NEON kernels and functions:
- *
- * -# @ref NESobel3x3 (if gradient_size == 3) or<br/>
- * @ref NESobel5x5 (if gradient_size == 5) or<br/>
- * @ref NESobel7x7 (if gradient_size == 7)
- * -# @ref NEFillBorderKernel
- * -# NEHarrisScoreKernel<3> (if block_size == 3) or<br/>
- * NEHarrisScoreKernel<5> (if block_size == 5) or<br/>
- * NEHarrisScoreKernel<7> (if block_size == 7)
- * -# @ref NENonMaximaSuppression3x3
- * -# @ref CPPCornerCandidatesKernel
- * -# @ref CPPSortEuclideanDistanceKernel
- *
- */
-class NEHarrisCorners : public IFunction
-{
-public:
- /** Constructor
- *
- * Initialize _sobel, _harris_score and _corner_list to nullptr.
- *
- * @param[in] memory_manager (Optional) Memory manager.
- */
- NEHarrisCorners(std::shared_ptr<IMemoryManager> memory_manager = nullptr);
- /** Initialize the function's source, destination, conv and border_mode.
- *
- * @param[in, out] input Source image. Data type supported: U8. (Written to only for @p border_mode != UNDEFINED)
- * @param[in] threshold Minimum threshold with which to eliminate Harris Corner scores (computed using the normalized Sobel kernel).
- * @param[in] min_dist Radial Euclidean distance for the Euclidean distance stage
- * @param[in] sensitivity Sensitivity threshold k from the Harris-Stephens equation
- * @param[in] gradient_size The gradient window size to use on the input. The implementation supports 3, 5, and 7
- * @param[in] block_size The block window size used to compute the Harris Corner score. The implementation supports 3, 5, and 7.
- * @param[out] corners Array of keypoints to store the results.
- * @param[in] border_mode Border mode to use
- * @param[in] constant_border_value (Optional) Constant value to use for borders if border_mode is set to CONSTANT.
- */
- void configure(IImage *input, float threshold, float min_dist, float sensitivity,
- int32_t gradient_size, int32_t block_size, KeyPointArray *corners,
- BorderMode border_mode, uint8_t constant_border_value = 0);
-
- // Inherited methods overridden:
- void run() override;
-
-private:
- MemoryGroup _memory_group; /**< Function's memory group */
- std::unique_ptr<IFunction> _sobel; /**< Sobel function */
- std::unique_ptr<INEHarrisScoreKernel> _harris_score; /**< Harris score kernel */
- NENonMaximaSuppression3x3 _non_max_suppr; /**< Non-maxima suppression function */
- CPPCornerCandidatesKernel _candidates; /**< Corner candidates kernel */
- CPPSortEuclideanDistanceKernel _sort_euclidean; /**< Euclidean distance kernel */
- NEFillBorderKernel _border_gx; /**< Border handler before running harris score */
- NEFillBorderKernel _border_gy; /**< Border handler before running harris score */
- Image _gx; /**< Source image - Gx component */
- Image _gy; /**< Source image - Gy component */
- Image _score; /**< Source image - Harris score */
- Image _nonmax; /**< Source image - Non-Maxima suppressed image */
- std::vector<InternalKeypoint> _corners_list; /**< Array of InternalKeypoint. It stores the potential corner candidates */
- int32_t _num_corner_candidates; /**< Number of potential corner candidates */
-};
-}
-#endif /*ARM_COMPUTE_NEHARRISCORNERS_H */
diff --git a/arm_compute/runtime/NEON/functions/NEHistogram.h b/arm_compute/runtime/NEON/functions/NEHistogram.h
deleted file mode 100644
index e1a5e42de9..0000000000
--- a/arm_compute/runtime/NEON/functions/NEHistogram.h
+++ /dev/null
@@ -1,63 +0,0 @@
-/*
- * Copyright (c) 2016-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_NEHISTOGRAM_H
-#define ARM_COMPUTE_NEHISTOGRAM_H
-
-#include "arm_compute/core/NEON/kernels/NEHistogramKernel.h"
-#include "arm_compute/runtime/IFunction.h"
-
-#include <cstddef>
-#include <cstdint>
-#include <memory>
-
-namespace arm_compute
-{
-class IDistribution1D;
-
-/** Basic function to run @ref NEHistogramKernel. */
-class NEHistogram : public IFunction
-{
-public:
- /** Default Constructor. */
- NEHistogram();
- /** Initialise the kernel's inputs.
- *
- * @param[in] input Input image. Data type supported: U8.
- * @param[out] output Output distribution.
- */
- void configure(const IImage *input, IDistribution1D *output);
-
- // Inherited methods overridden:
- void run() override;
-
-private:
- NEHistogramKernel _histogram_kernel;
- std::vector<uint32_t> _local_hist;
- std::vector<uint32_t> _window_lut;
- size_t _local_hist_size;
- /** 256 possible pixel values as we handle only U8 images */
- static constexpr unsigned int window_lut_default_size = 256;
-};
-}
-#endif /*ARM_COMPUTE_NEHISTOGRAM_H */
diff --git a/arm_compute/runtime/NEON/functions/NEIm2Col.h b/arm_compute/runtime/NEON/functions/NEIm2Col.h
deleted file mode 100644
index cb905a3652..0000000000
--- a/arm_compute/runtime/NEON/functions/NEIm2Col.h
+++ /dev/null
@@ -1,85 +0,0 @@
-/*
- * Copyright (c) 2017-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_NEIM2COL_H
-#define ARM_COMPUTE_NEIM2COL_H
-
-#include "arm_compute/runtime/NEON/INESimpleFunctionNoBorder.h"
-
-#include "arm_compute/core/NEON/kernels/NEIm2ColKernel.h"
-#include "arm_compute/core/Size2D.h"
-#include "arm_compute/core/Types.h"
-
-namespace arm_compute
-{
-// Forward declarations
-class ITensor;
-
-/** Basic function to run @ref NEIm2ColKernel */
-class NEIm2Col : public IFunction
-{
-public:
- /** Default constructor */
- NEIm2Col();
- /** Configure the im2col NEON kernel
- *
- * @param[in] input The input tensor to convert. 3 lower dimensions represent a single input [width, height, IFM],
- * while every optional dimension from 4 and above represent a batch of inputs.
- * Data types supported: QASYMM8/QASYMM8_SIGNED/BFLOAT16/F16/F32
- * Note: QASYMM8 works only for has_bias = false
- * @param[out] output The output tensor. Data types supported: Same as @p input
- * @param[in] kernel_dims The kernel dimensions (width and height).
- * @param[in] conv_info Contains padding and stride information described in @ref PadStrideInfo.
- * @param[in] has_bias In case biases are provided expands the matrix with 1.
- * @param[in] dilation (Optional) Dilation, in elements, across x and y. Defaults to (1, 1).
- * @param[in] num_groups (Optional) Number of groups when performing a grouped convolution
- */
- void configure(const ITensor *input, ITensor *output, const Size2D &kernel_dims, const PadStrideInfo &conv_info, bool has_bias, const Size2D &dilation = Size2D(1U, 1U),
- unsigned int num_groups = 1);
- /** Static function to check if given info will lead to a valid configuration of @ref NEIm2Col
- *
- * @param[in] input The input tensor to convert. 3 lower dimensions represent a single input [width, height, IFM],
- * while every optional dimension from 4 and above represents a batch of inputs.
- * Data types supported: QASYMM8/QASYMM8_SIGNED/BFLOAT16/F16/F32
- * Note: QASYMM8 works only for has_bias = false
- * @param[in] output The output tensor. Data types supported: Same as @p input
- * @param[in] kernel_dims The kernel dimensions (width and height).
- * @param[in] conv_info Contains padding and stride information described in @ref PadStrideInfo.
- * @param[in] has_bias If biases are provided, expands each matrix row with a trailing 1 so the bias folds into the matrix multiply.
- * @param[in] dilation (Optional) Dilation, in elements, across x and y. Defaults to (1, 1).
- * @param[in] num_groups (Optional) Number of groups when performing a grouped convolution
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *input, const ITensorInfo *output, const Size2D &kernel_dims, const PadStrideInfo &conv_info, bool has_bias, const Size2D &dilation = Size2D(1U, 1U),
- unsigned int num_groups = 1);
-
- // Inherited methods overridden:
- void run() override;
-
-private:
- NEIm2ColKernel _kernel;
- unsigned int _y_dim;
-};
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_NEIM2COL_H */
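The doc comments above describe the im2col layout; the transform itself is easier to see in code. A hedged standalone sketch, assuming a single channel, unit stride and no padding or dilation (the helper name im2col_single_channel is made up here): each output row is one flattened kernel-sized patch, and has_bias appends a trailing 1 exactly as documented.

    #include <cstddef>
    #include <vector>

    std::vector<float> im2col_single_channel(const std::vector<float> &in, std::size_t w, std::size_t h,
                                             std::size_t kw, std::size_t kh, bool has_bias)
    {
        const std::size_t out_w = w - kw + 1, out_h = h - kh + 1;
        std::vector<float> out;
        out.reserve(out_w * out_h * (kw * kh + (has_bias ? 1 : 0)));
        for (std::size_t y = 0; y < out_h; ++y)
            for (std::size_t x = 0; x < out_w; ++x)
            {
                for (std::size_t ky = 0; ky < kh; ++ky) // flatten one patch into a row
                    for (std::size_t kx = 0; kx < kw; ++kx)
                        out.push_back(in[(y + ky) * w + (x + kx)]);
                if (has_bias)
                    out.push_back(1.0f); // trailing 1 so the bias folds into the GEMM
            }
        return out;
    }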
diff --git a/arm_compute/runtime/NEON/functions/NEInstanceNormalizationLayer.h b/arm_compute/runtime/NEON/functions/NEInstanceNormalizationLayer.h
index e128ec5835..0bc57be09e 100644
--- a/arm_compute/runtime/NEON/functions/NEInstanceNormalizationLayer.h
+++ b/arm_compute/runtime/NEON/functions/NEInstanceNormalizationLayer.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019 ARM Limited.
+ * Copyright (c) 2019-2021 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -24,7 +24,6 @@
#ifndef ARM_COMPUTE_NEINSTANCENORMALIZATIONLAYER_H
#define ARM_COMPUTE_NEINSTANCENORMALIZATIONLAYER_H
-#include "arm_compute/core/NEON/kernels/NEInstanceNormalizationLayerKernel.h"
#include "arm_compute/runtime/IFunction.h"
#include "arm_compute/runtime/IMemoryManager.h"
#include "arm_compute/runtime/MemoryGroup.h"
@@ -37,6 +36,7 @@
namespace arm_compute
{
class ITensor;
+class NEInstanceNormalizationLayerKernel;
/** Basic function to perform an instance normalization.
*
@@ -48,8 +48,28 @@ class NEInstanceNormalizationLayer : public IFunction
public:
/** Constructor */
NEInstanceNormalizationLayer(std::shared_ptr<IMemoryManager> memory_manager = nullptr);
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ NEInstanceNormalizationLayer(const NEInstanceNormalizationLayer &) = delete;
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ NEInstanceNormalizationLayer &operator=(const NEInstanceNormalizationLayer &) = delete;
+ /** Prevent instances of this class from being moved (As this class contains non movable objects) */
+ NEInstanceNormalizationLayer(NEInstanceNormalizationLayer &&) = delete;
+ /** Prevent instances of this class from being moved (As this class contains non movable objects) */
+ NEInstanceNormalizationLayer &operator=(NEInstanceNormalizationLayer &&) = delete;
+ /** Default destructor */
+ ~NEInstanceNormalizationLayer();
/** Set the input and output tensors.
*
+ * Valid data layouts:
+ * - NHWC
+ * - NCHW
+ *
+ * Valid data type configurations:
+ * |src |dst |
+ * |:--------|:---------|
+ * |F16 |F16 |
+ * |F32 |F32 |
+ *
 * @param[in, out] input Source tensor. If @p output is nullptr, this tensor will store the result of the normalization.
* Data types supported: F16/F32. Data layout supported: NHWC, NCHW
* @param[out] output Destination tensor. Data types and data layouts supported: same as @p input.
@@ -69,19 +89,23 @@ public:
*
* @return a status
*/
- static Status validate(const ITensorInfo *input, const ITensorInfo *output, float gamma = 1.0f, float beta = 0.0f, float epsilon = 1e-12f);
+ static Status validate(const ITensorInfo *input,
+ const ITensorInfo *output,
+ float gamma = 1.0f,
+ float beta = 0.0f,
+ float epsilon = 1e-12f);
// Inherited methods overridden:
void run() override;
private:
- MemoryGroup _memory_group;
- NEInstanceNormalizationLayerKernel _normalization_kernel;
- bool _is_nchw;
- NEPermute _permute_input;
- NEPermute _permute_output;
- Tensor _permuted_input;
- Tensor _permuted_output;
+ MemoryGroup _memory_group;
+ std::unique_ptr<NEInstanceNormalizationLayerKernel> _normalization_kernel;
+ bool _is_nchw;
+ NEPermute _permute_input;
+ NEPermute _permute_output;
+ Tensor _permuted_input;
+ Tensor _permuted_output;
};
-}
+} // namespace arm_compute
#endif /* ARM_COMPUTE_NEINSTANCENORMALIZATIONLAYER_H */
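A minimal usage sketch of NEInstanceNormalizationLayer as it stands after this patch. The shape, layout and gamma/beta/epsilon values are illustrative assumptions; the configure-then-allocate flow is the usual Arm Compute Library runtime pattern.

    #include "arm_compute/runtime/NEON/functions/NEInstanceNormalizationLayer.h"
    #include "arm_compute/runtime/Tensor.h"

    using namespace arm_compute;

    int main()
    {
        Tensor src, dst;
        TensorInfo info(TensorShape(16U, 32U, 32U, 1U), 1, DataType::F32); // NHWC shapes are [C, W, H, N]
        info.set_data_layout(DataLayout::NHWC);
        src.allocator()->init(info);
        dst.allocator()->init(info);

        NEInstanceNormalizationLayer norm;
        norm.configure(&src, &dst, /*gamma=*/1.0f, /*beta=*/0.0f, /*epsilon=*/1e-12f);

        src.allocator()->allocate();
        dst.allocator()->allocate();
        // ... fill src ...
        norm.run();
        return 0;
    }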
diff --git a/arm_compute/runtime/NEON/functions/NEIntegralImage.h b/arm_compute/runtime/NEON/functions/NEIntegralImage.h
deleted file mode 100644
index 2d7669d3ef..0000000000
--- a/arm_compute/runtime/NEON/functions/NEIntegralImage.h
+++ /dev/null
@@ -1,45 +0,0 @@
-/*
- * Copyright (c) 2016-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_NEINTEGRALIMAGE_H
-#define ARM_COMPUTE_NEINTEGRALIMAGE_H
-
-#include "arm_compute/runtime/NEON/INESimpleFunction.h"
-
-namespace arm_compute
-{
-class ITensor;
-
-/** Basic function to run a @ref NEIntegralImageKernel */
-class NEIntegralImage : public INESimpleFunction
-{
-public:
- /** Initialise the function's source, destinations and border mode.
- *
- * @param[in] input Source tensor. Data type supported: U8.
- * @param[out] output Destination tensor. Data type supported: U32.
- */
- void configure(const ITensor *input, ITensor *output);
-};
-}
-#endif /*ARM_COMPUTE_NEINTEGRALIMAGE_H */
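For reference, the removed function computed a summed-area table: each U32 output element holds the sum of all U8 input pixels above and to the left of it, inclusive. A standalone sketch (row-major storage and the helper name are assumptions of this sketch):

    #include <cstddef>
    #include <cstdint>
    #include <vector>

    std::vector<uint32_t> integral_image(const std::vector<uint8_t> &in, std::size_t w, std::size_t h)
    {
        std::vector<uint32_t> out(w * h, 0);
        for (std::size_t y = 0; y < h; ++y)
            for (std::size_t x = 0; x < w; ++x)
            {
                uint32_t sum = in[y * w + x]; // standard inclusion-exclusion recurrence
                if (x > 0)          sum += out[y * w + x - 1];
                if (y > 0)          sum += out[(y - 1) * w + x];
                if (x > 0 && y > 0) sum -= out[(y - 1) * w + x - 1];
                out[y * w + x] = sum;
            }
        return out;
    }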
diff --git a/arm_compute/runtime/NEON/functions/NEL2NormalizeLayer.h b/arm_compute/runtime/NEON/functions/NEL2NormalizeLayer.h
index a581600dbb..8502cee5d2 100644
--- a/arm_compute/runtime/NEON/functions/NEL2NormalizeLayer.h
+++ b/arm_compute/runtime/NEON/functions/NEL2NormalizeLayer.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2021 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -24,7 +24,6 @@
#ifndef ARM_COMPUTE_NEL2NORMALIZELAYER_H
#define ARM_COMPUTE_NEL2NORMALIZELAYER_H
-#include "arm_compute/core/NEON/kernels/NEL2NormalizeLayerKernel.h"
#include "arm_compute/runtime/IFunction.h"
#include "arm_compute/runtime/IMemoryManager.h"
#include "arm_compute/runtime/MemoryGroup.h"
@@ -36,6 +35,7 @@
namespace arm_compute
{
class ITensor;
+class NEL2NormalizeLayerKernel;
/** Basic function to perform an L2 normalization on a given axis.
*
@@ -48,14 +48,34 @@ class NEL2NormalizeLayer : public IFunction
public:
/** Constructor */
NEL2NormalizeLayer(std::shared_ptr<IMemoryManager> memory_manager = nullptr);
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ NEL2NormalizeLayer(const NEL2NormalizeLayer &) = delete;
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ NEL2NormalizeLayer &operator=(const NEL2NormalizeLayer &) = delete;
+ /** Prevent instances of this class from being moved (As this class contains non movable objects) */
+ NEL2NormalizeLayer(NEL2NormalizeLayer &&) = delete;
+ /** Prevent instances of this class from being moved (As this class contains non movable objects) */
+ NEL2NormalizeLayer &operator=(NEL2NormalizeLayer &&) = delete;
+ /** Default destructor */
+ ~NEL2NormalizeLayer();
/** Set the input and output tensors.
*
+ * Valid data layouts:
+ * - NHWC
+ * - NCHW
+ *
+ * Valid data type configurations:
+ * |src |dst |
+ * |:--------|:---------|
+ * |F16 |F16 |
+ * |F32 |F32 |
+ *
* @param[in, out] input Source tensor. Data types supported: F16/F32. (Written to only for border_size != 0)
* @param[out] output Destination tensor. Data types and data layouts supported: same as @p input.
 * @param[in] axis Axis along which to reduce. Negative values wrap around. Maximum supported actual reduction axis: 2.
* @param[in] epsilon (Optional) Lower bound value for the normalization.
*/
- void configure(ITensor *input, ITensor *output, int axis, float epsilon = 1e-12f);
+ void configure(ITensor *input, ITensor *output, int axis, float epsilon = 1e-6f);
/** Static function to check if given info will lead to a valid configuration of @ref NEL2NormalizeLayer.
*
@@ -66,16 +86,16 @@ public:
*
* @return a status
*/
- static Status validate(const ITensorInfo *input, const ITensorInfo *output, int axis, float epsilon = 1e-12f);
+ static Status validate(const ITensorInfo *input, const ITensorInfo *output, int axis, float epsilon = 1e-6f);
// Inherited methods overridden:
void run() override;
private:
- MemoryGroup _memory_group;
- NEReductionOperation _reduce_func;
- NEL2NormalizeLayerKernel _normalize_kernel;
- Tensor _sumsq;
+ MemoryGroup _memory_group;
+ NEReductionOperation _reduce_func;
+ std::unique_ptr<NEL2NormalizeLayerKernel> _normalize_kernel;
+ Tensor _sumsq;
};
-}
+} // namespace arm_compute
#endif /* ARM_COMPUTE_NEL2NORMALIZELAYER_H */
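A minimal usage sketch of NEL2NormalizeLayer after this patch; omitting the last argument now picks up the new 1e-6f default epsilon. The shape and axis are illustrative assumptions.

    #include "arm_compute/runtime/NEON/functions/NEL2NormalizeLayer.h"
    #include "arm_compute/runtime/Tensor.h"

    using namespace arm_compute;

    int main()
    {
        Tensor src, dst;
        const TensorInfo info(TensorShape(128U, 64U), 1, DataType::F32);
        src.allocator()->init(info);
        dst.allocator()->init(info);

        NEL2NormalizeLayer l2norm;
        l2norm.configure(&src, &dst, /*axis=*/0); // uses the new 1e-6f default epsilon

        src.allocator()->allocate();
        dst.allocator()->allocate();
        // ... fill src ...
        l2norm.run();
        return 0;
    }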
diff --git a/arm_compute/runtime/NEON/functions/NELSTMLayer.h b/arm_compute/runtime/NEON/functions/NELSTMLayer.h
index e85e87b88e..629c5d10a0 100644
--- a/arm_compute/runtime/NEON/functions/NELSTMLayer.h
+++ b/arm_compute/runtime/NEON/functions/NELSTMLayer.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018-2020 ARM Limited.
+ * Copyright (c) 2018-2021 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -24,19 +24,18 @@
#ifndef ARM_COMPUTE_NELSTMLAYER_H
#define ARM_COMPUTE_NELSTMLAYER_H
-#include "arm_compute/core/NEON/kernels/NEActivationLayerKernel.h"
-#include "arm_compute/core/NEON/kernels/NEArithmeticAdditionKernel.h"
-#include "arm_compute/core/NEON/kernels/NEArithmeticSubtractionKernel.h"
-#include "arm_compute/core/NEON/kernels/NECopyKernel.h"
-#include "arm_compute/core/NEON/kernels/NEPixelWiseMultiplicationKernel.h"
-
#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/common/LSTMParams.h"
+#include "arm_compute/runtime/NEON/functions/NEActivationLayer.h"
#include "arm_compute/runtime/NEON/functions/NEArithmeticAddition.h"
+#include "arm_compute/runtime/NEON/functions/NEArithmeticSubtraction.h"
#include "arm_compute/runtime/NEON/functions/NEConcatenateLayer.h"
+#include "arm_compute/runtime/NEON/functions/NECopy.h"
#include "arm_compute/runtime/NEON/functions/NEFullyConnectedLayer.h"
#include "arm_compute/runtime/NEON/functions/NEGEMM.h"
#include "arm_compute/runtime/NEON/functions/NEMeanStdDevNormalizationLayer.h"
-#include "arm_compute/runtime/common/LSTMParams.h"
+#include "arm_compute/runtime/NEON/functions/NEPixelWiseMultiplication.h"
+#include "arm_compute/runtime/NEON/functions/NETranspose.h"
namespace arm_compute
{
@@ -49,8 +48,27 @@ class NELSTMLayer : public IFunction
public:
/** Default constructor */
NELSTMLayer(std::shared_ptr<IMemoryManager> memory_manager = nullptr);
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ NELSTMLayer(const NELSTMLayer &) = delete;
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ NELSTMLayer &operator=(const NELSTMLayer &) = delete;
+ /** Prevent instances of this class from being moved (As this class contains non movable objects) */
+ NELSTMLayer(NELSTMLayer &&) = delete;
+ /** Prevent instances of this class from being moved (As this class contains non movable objects) */
+ NELSTMLayer &operator=(NELSTMLayer &&) = delete;
+ /** Default destructor */
+ ~NELSTMLayer();
/** Initialize function's tensors.
*
+ * Valid data layouts:
+ * - All
+ *
+ * Valid data type configurations:
+ * |src0 - src13 | dst0 - dst3 |
+ * |:------------|:------------|
+ * |F16 |F16 |
+ * |F32 |F32 |
+ *
* @param[in] input Source tensor. Input is a 2D tensor with dimensions [input_size, batch_size]. Data types supported: F16/F32.
* @param[in] input_to_forget_weights 2D weights tensor with dimensions [input_size, num_units]. Data type supported: Same as @p input.
* @param[in] input_to_cell_weights 2D weights tensor with dimensions [input_size, num_units]. Data type supported: Same as @p input.
@@ -86,13 +104,26 @@ public:
* @param[in] projection_threshold The clipping threshold for the output from the projection layer, such that values are bound within [-proj_clip, proj_clip].
* If set to 0.0 then clipping is disabled.
*/
- void configure(const ITensor *input,
- const ITensor *input_to_forget_weights, const ITensor *input_to_cell_weights, const ITensor *input_to_output_weights,
- const ITensor *recurrent_to_forget_weights, const ITensor *recurrent_to_cell_weights, const ITensor *recurrent_to_output_weights,
- const ITensor *forget_gate_bias, const ITensor *cell_bias, const ITensor *output_gate_bias,
- const ITensor *output_state_in, const ITensor *cell_state_in,
- ITensor *scratch_buffer, ITensor *output_state_out, ITensor *cell_state_out, ITensor *output,
- const LSTMParams<ITensor> &lstm_params, const ActivationLayerInfo &activation_info, float cell_threshold = 0.f, float projection_threshold = 0.f);
+ void configure(const ITensor *input,
+ const ITensor *input_to_forget_weights,
+ const ITensor *input_to_cell_weights,
+ const ITensor *input_to_output_weights,
+ const ITensor *recurrent_to_forget_weights,
+ const ITensor *recurrent_to_cell_weights,
+ const ITensor *recurrent_to_output_weights,
+ const ITensor *forget_gate_bias,
+ const ITensor *cell_bias,
+ const ITensor *output_gate_bias,
+ const ITensor *output_state_in,
+ const ITensor *cell_state_in,
+ ITensor *scratch_buffer,
+ ITensor *output_state_out,
+ ITensor *cell_state_out,
+ ITensor *output,
+ const LSTMParams<ITensor> &lstm_params,
+ const ActivationLayerInfo &activation_info,
+ float cell_threshold = 0.f,
+ float projection_threshold = 0.f);
/** Static function to check if given info will lead to a valid configuration of @ref NELSTMLayer
*
@@ -133,102 +164,115 @@ public:
*
* @return a status
*/
- static Status validate(const ITensorInfo *input,
- const ITensorInfo *input_to_forget_weights, const ITensorInfo *input_to_cell_weights, const ITensorInfo *input_to_output_weights,
- const ITensorInfo *recurrent_to_forget_weights, const ITensorInfo *recurrent_to_cell_weights, const ITensorInfo *recurrent_to_output_weights,
- const ITensorInfo *forget_gate_bias, const ITensorInfo *cell_bias, const ITensorInfo *output_gate_bias,
- const ITensorInfo *output_state_in, const ITensorInfo *cell_state_in,
- const ITensorInfo *scratch_buffer, const ITensorInfo *output_state_out, const ITensorInfo *cell_state_out, const ITensorInfo *output,
- const LSTMParams<ITensorInfo> &lstm_params, const ActivationLayerInfo &activation_info, float cell_threshold = 0.f, float projection_threshold = 0.f);
+ static Status validate(const ITensorInfo *input,
+ const ITensorInfo *input_to_forget_weights,
+ const ITensorInfo *input_to_cell_weights,
+ const ITensorInfo *input_to_output_weights,
+ const ITensorInfo *recurrent_to_forget_weights,
+ const ITensorInfo *recurrent_to_cell_weights,
+ const ITensorInfo *recurrent_to_output_weights,
+ const ITensorInfo *forget_gate_bias,
+ const ITensorInfo *cell_bias,
+ const ITensorInfo *output_gate_bias,
+ const ITensorInfo *output_state_in,
+ const ITensorInfo *cell_state_in,
+ const ITensorInfo *scratch_buffer,
+ const ITensorInfo *output_state_out,
+ const ITensorInfo *cell_state_out,
+ const ITensorInfo *output,
+ const LSTMParams<ITensorInfo> &lstm_params,
+ const ActivationLayerInfo &activation_info,
+ float cell_threshold = 0.f,
+ float projection_threshold = 0.f);
// Inherited methods overridden:
void run() override;
void prepare() override;
private:
- MemoryGroup _memory_group;
- NEFullyConnectedLayer _fully_connected_input_gate;
- NEArithmeticAddition _accum_input_gate1;
- NEArithmeticSubtractionKernel _subtract_input_gate;
- NEPixelWiseMultiplicationKernel _pixelwise_mul_input_gate;
- NEActivationLayerKernel _activation_input_gate;
- NEFullyConnectedLayer _fully_connected_forget_gate;
- NEArithmeticAddition _accum_forget_gate1;
- NEPixelWiseMultiplicationKernel _pixelwise_mul_forget_gate;
- NEActivationLayerKernel _activation_forget_gate;
- NEFullyConnectedLayer _fully_connected_cell_state;
- NEGEMM _gemm_cell_state1;
- NETransposeKernel _transpose_cell_state;
- NEArithmeticAdditionKernel _accum_cell_state1;
- NEArithmeticAdditionKernel _accum_cell_state2;
- NEPixelWiseMultiplicationKernel _pixelwise_mul_cell_state1;
- NEActivationLayerKernel _activation_cell_state;
- NEActivationLayerKernel _cell_clip;
- NEPixelWiseMultiplicationKernel _pixelwise_mul_cell_state2;
- NEFullyConnectedLayer _fully_connected_output;
- NEPixelWiseMultiplicationKernel _pixelwise_mul_output_state1;
- NEArithmeticAddition _accum_output1;
- NEActivationLayerKernel _activation_output;
- NEActivationLayerKernel _activation_output_state;
- NEPixelWiseMultiplicationKernel _pixelwise_mul_output_state2;
- NEFullyConnectedLayer _fully_connected_output_state;
- NEActivationLayerKernel _projection_clip;
- NECopyKernel _copy_cell_state;
- NECopyKernel _copy_output;
- NEConcatenateLayer _concat_scratch_buffer;
- NEConcatenateLayer _concat_inputs_forget_gate;
- NEConcatenateLayer _concat_weights_forget_gate;
- NEConcatenateLayer _concat_weights_input_gate;
- NEConcatenateLayer _concat_weights_output;
- NEMeanStdDevNormalizationLayer _mean_std_norm_input_gate;
- NEPixelWiseMultiplicationKernel _pixelwise_mul_input_gate_coeff;
- NEArithmeticAdditionKernel _accum_input_gate_bias;
- NEMeanStdDevNormalizationLayer _mean_std_norm_forget_gate;
- NEPixelWiseMultiplicationKernel _pixelwise_mul_forget_gate_coeff;
- NEArithmeticAdditionKernel _accum_forget_gate_bias;
- NEMeanStdDevNormalizationLayer _mean_std_norm_cell_gate;
- NEPixelWiseMultiplicationKernel _pixelwise_mul_cell_gate_coeff;
- NEArithmeticAdditionKernel _accum_cell_gate_bias;
- NEMeanStdDevNormalizationLayer _mean_std_norm_output_gate;
- NEPixelWiseMultiplicationKernel _pixelwise_mul_output_gate_coeff;
- NEArithmeticAdditionKernel _accum_output_gate_bias;
- Tensor _input_gate_out1;
- Tensor _input_gate_out2;
- Tensor _input_gate_out3;
- Tensor _input_gate_out4;
- Tensor _forget_gate_out1;
- Tensor _forget_gate_out2;
- Tensor _forget_gate_out3;
- Tensor _forget_gate_out4;
- Tensor _forget_gate_out5;
- Tensor _forget_gate_out6;
- Tensor _cell_state_out1;
- Tensor _cell_state_out2;
- Tensor _cell_state_out3;
- Tensor _cell_state_out4;
- Tensor _cell_state_out5;
- Tensor _output1;
- Tensor _output2;
- Tensor _output3;
- Tensor _output4;
- Tensor _cell_state_activation;
- Tensor _output_state1;
- Tensor _ones;
- Tensor _input_layer_norm_out1;
- Tensor _input_layer_norm_out2;
- Tensor _forget_layer_norm_out1;
- Tensor _forget_layer_norm_out2;
- Tensor _cell_layer_norm_out1;
- Tensor _cell_layer_norm_out2;
- Tensor _output_layer_norm_out1;
- Tensor _output_layer_norm_out2;
- bool _run_peephole_opt;
- bool _run_cifg_opt;
- bool _perform_cell_clipping;
- bool _has_projection_weights;
- bool _perform_projection_clipping;
- bool _is_prepared;
- bool _is_layer_norm_lstm;
+ MemoryGroup _memory_group;
+ NEFullyConnectedLayer _fully_connected_input_gate;
+ NEArithmeticAddition _accum_input_gate1;
+ NEArithmeticSubtraction _subtract_input_gate;
+ NEPixelWiseMultiplication _pixelwise_mul_input_gate;
+ NEActivationLayer _activation_input_gate;
+ NEFullyConnectedLayer _fully_connected_forget_gate;
+ NEArithmeticAddition _accum_forget_gate1;
+ NEPixelWiseMultiplication _pixelwise_mul_forget_gate;
+ NEActivationLayer _activation_forget_gate;
+ NEFullyConnectedLayer _fully_connected_cell_state;
+ NEGEMM _gemm_cell_state1;
+ NETranspose _transpose_cell_state;
+ NEArithmeticAddition _accum_cell_state1;
+ NEArithmeticAddition _accum_cell_state2;
+ NEPixelWiseMultiplication _pixelwise_mul_cell_state1;
+ NEActivationLayer _activation_cell_state;
+ NEActivationLayer _cell_clip;
+ NEPixelWiseMultiplication _pixelwise_mul_cell_state2;
+ NEFullyConnectedLayer _fully_connected_output;
+ NEPixelWiseMultiplication _pixelwise_mul_output_state1;
+ NEArithmeticAddition _accum_output1;
+ NEActivationLayer _activation_output;
+ NEActivationLayer _activation_output_state;
+ NEPixelWiseMultiplication _pixelwise_mul_output_state2;
+ NEFullyConnectedLayer _fully_connected_output_state;
+ NEActivationLayer _projection_clip;
+ NECopy _copy_cell_state;
+ NECopy _copy_output;
+ NEConcatenateLayer _concat_scratch_buffer;
+ NEConcatenateLayer _concat_inputs_forget_gate;
+ NEConcatenateLayer _concat_weights_forget_gate;
+ NEConcatenateLayer _concat_weights_input_gate;
+ NEConcatenateLayer _concat_weights_output;
+ NEMeanStdDevNormalizationLayer _mean_std_norm_input_gate;
+ NEPixelWiseMultiplication _pixelwise_mul_input_gate_coeff;
+ NEArithmeticAddition _accum_input_gate_bias;
+ NEMeanStdDevNormalizationLayer _mean_std_norm_forget_gate;
+ NEPixelWiseMultiplication _pixelwise_mul_forget_gate_coeff;
+ NEArithmeticAddition _accum_forget_gate_bias;
+ NEMeanStdDevNormalizationLayer _mean_std_norm_cell_gate;
+ NEPixelWiseMultiplication _pixelwise_mul_cell_gate_coeff;
+ NEArithmeticAddition _accum_cell_gate_bias;
+ NEMeanStdDevNormalizationLayer _mean_std_norm_output_gate;
+ NEPixelWiseMultiplication _pixelwise_mul_output_gate_coeff;
+ NEArithmeticAddition _accum_output_gate_bias;
+ Tensor _input_gate_out1;
+ Tensor _input_gate_out2;
+ Tensor _input_gate_out3;
+ Tensor _input_gate_out4;
+ Tensor _forget_gate_out1;
+ Tensor _forget_gate_out2;
+ Tensor _forget_gate_out3;
+ Tensor _forget_gate_out4;
+ Tensor _forget_gate_out5;
+ Tensor _forget_gate_out6;
+ Tensor _cell_state_out1;
+ Tensor _cell_state_out2;
+ Tensor _cell_state_out3;
+ Tensor _cell_state_out4;
+ Tensor _cell_state_out5;
+ Tensor _output1;
+ Tensor _output2;
+ Tensor _output3;
+ Tensor _output4;
+ Tensor _cell_state_activation;
+ Tensor _output_state1;
+ Tensor _ones;
+ Tensor _input_layer_norm_out1;
+ Tensor _input_layer_norm_out2;
+ Tensor _forget_layer_norm_out1;
+ Tensor _forget_layer_norm_out2;
+ Tensor _cell_layer_norm_out1;
+ Tensor _cell_layer_norm_out2;
+ Tensor _output_layer_norm_out1;
+ Tensor _output_layer_norm_out2;
+ bool _run_peephole_opt;
+ bool _run_cifg_opt;
+ bool _perform_cell_clipping;
+ bool _has_projection_weights;
+ bool _perform_projection_clipping;
+ bool _is_prepared;
+ bool _is_layer_norm_lstm;
};
} // namespace arm_compute
#endif /* ARM_COMPUTE_NELSTMLAYER_H */
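One consequence of the copy and move operations deleted above: NELSTMLayer can no longer be stored by value in containers that relocate their elements. A hedged pattern for caller code (an assumption about usage, not something this patch mandates) is to hold each function behind std::unique_ptr:

    #include "arm_compute/runtime/NEON/functions/NELSTMLayer.h"

    #include <memory>
    #include <vector>

    using namespace arm_compute;

    int main()
    {
        // unique_ptr gives the stack of cells stable addresses even as the vector grows.
        std::vector<std::unique_ptr<NELSTMLayer>> cells;
        cells.push_back(std::make_unique<NELSTMLayer>());
        // cells[0]->configure(...); // the full 20-argument signature documented above
        return 0;
    }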
diff --git a/arm_compute/runtime/NEON/functions/NELSTMLayerQuantized.h b/arm_compute/runtime/NEON/functions/NELSTMLayerQuantized.h
index 2f3b8fd336..ae951669b3 100644
--- a/arm_compute/runtime/NEON/functions/NELSTMLayerQuantized.h
+++ b/arm_compute/runtime/NEON/functions/NELSTMLayerQuantized.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019 ARM Limited.
+ * Copyright (c) 2019-2021 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -25,6 +25,7 @@
#define ARM_COMPUTE_NELSTMLAYERQUANTIZED_H
#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/common/LSTMParams.h"
#include "arm_compute/runtime/NEON/functions/NEActivationLayer.h"
#include "arm_compute/runtime/NEON/functions/NEArithmeticAddition.h"
#include "arm_compute/runtime/NEON/functions/NEConcatenateLayer.h"
@@ -38,8 +39,6 @@
#include "arm_compute/runtime/NEON/functions/NESlice.h"
#include "arm_compute/runtime/NEON/functions/NETranspose.h"
-#include "arm_compute/runtime/common/LSTMParams.h"
-
namespace arm_compute
{
// Forward declarations
@@ -47,10 +46,10 @@ class ITensor;
/** Basic function to run @ref NELSTMLayerQuantized
*
- * This function calls the following NEON functions/kernels:
+ * This function calls the following functions/kernels:
*
* -# @ref NEGEMMLowpMatrixMultiplyCore Quantized matrix multiplication core. Accumulators are 32-bit integers
- * -# @ref NEGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPoint Convert 32-bit integers into QSYMM16
+ * -# @ref NEGEMMLowpOutputStage Convert 32-bit integers into QSYMM16
* -# @ref NETranspose Matrix transpose
* -# @ref NEConcatenateLayer Tensor concatenation
* -# @ref NEActivationLayer Activation functions (tanh and logistic)
@@ -67,14 +66,24 @@ public:
NELSTMLayerQuantized(std::shared_ptr<IMemoryManager> memory_manager = nullptr);
/** Prevent instances of this class from being copied (As this class contains pointers) */
NELSTMLayerQuantized(const NELSTMLayerQuantized &) = delete;
- /** Default move constructor */
- NELSTMLayerQuantized(NELSTMLayerQuantized &&) = default;
+ /** Prevent instances of this class from being moved (As this class contains pointers) */
+ NELSTMLayerQuantized(NELSTMLayerQuantized &&) = delete;
/** Prevent instances of this class from being copied (As this class contains pointers) */
NELSTMLayerQuantized &operator=(const NELSTMLayerQuantized &) = delete;
- /** Default move assignment operator */
- NELSTMLayerQuantized &operator=(NELSTMLayerQuantized &&) = default;
+ /** Prevent instances of this class from being moved (As this class contains pointers) */
+ NELSTMLayerQuantized &operator=(NELSTMLayerQuantized &&) = delete;
+ /** Default destructor */
+ ~NELSTMLayerQuantized();
/** Initialize function's tensors.
*
+ * Valid data layouts:
+ * - All
+ *
+ * Valid data type configurations:
+ * |src0 - src8 |src9 - src12 |src13 |src14 |dst0 |dst1 |
+ * |:-----------|:------------|:-------|:------|:------|:------|
+ * |QASYMM8 |S32 |QSYMM16 |QASYMM8|QSYMM16|QASYMM8|
+ *
* @param[in] input Source tensor. Input is a 2D tensor with dimensions [input_size, batch_size]. Data types supported: QASYMM8.
* @param[in] input_to_input_weights 2D weights tensor with dimensions [input_size, output_size]. Data type supported: Same as @p input.
* @param[in] input_to_forget_weights 2D weights tensor with dimensions [input_size, output_size]. Data type supported: Same as @p input.
@@ -94,11 +103,22 @@ public:
 * @param[out] output_state_out Destination tensor. Output is a 2D tensor with dimensions [output_size, batch_size]. Data types supported: Same as @p input.
*/
void configure(const ITensor *input,
- const ITensor *input_to_input_weights, const ITensor *input_to_forget_weights, const ITensor *input_to_cell_weights, const ITensor *input_to_output_weights,
- const ITensor *recurrent_to_input_weights, const ITensor *recurrent_to_forget_weights, const ITensor *recurrent_to_cell_weights, const ITensor *recurrent_to_output_weights,
- const ITensor *input_gate_bias, const ITensor *forget_gate_bias, const ITensor *cell_bias, const ITensor *output_gate_bias,
- ITensor *cell_state_in, const ITensor *output_state_in,
- ITensor *cell_state_out, ITensor *output_state_out);
+ const ITensor *input_to_input_weights,
+ const ITensor *input_to_forget_weights,
+ const ITensor *input_to_cell_weights,
+ const ITensor *input_to_output_weights,
+ const ITensor *recurrent_to_input_weights,
+ const ITensor *recurrent_to_forget_weights,
+ const ITensor *recurrent_to_cell_weights,
+ const ITensor *recurrent_to_output_weights,
+ const ITensor *input_gate_bias,
+ const ITensor *forget_gate_bias,
+ const ITensor *cell_bias,
+ const ITensor *output_gate_bias,
+ ITensor *cell_state_in,
+ const ITensor *output_state_in,
+ ITensor *cell_state_out,
+ ITensor *output_state_out);
/** Static function to check if given info will lead to a valid configuration of @ref NELSTMLayer
*
@@ -123,11 +143,22 @@ public:
* @return a status
*/
static Status validate(const ITensorInfo *input,
- const ITensorInfo *input_to_input_weights, const ITensorInfo *input_to_forget_weights, const ITensorInfo *input_to_cell_weights, const ITensorInfo *input_to_output_weights,
- const ITensorInfo *recurrent_to_input_weights, const ITensorInfo *recurrent_to_forget_weights, const ITensorInfo *recurrent_to_cell_weights, const ITensorInfo *recurrent_to_output_weights,
- const ITensorInfo *input_gate_bias, const ITensorInfo *forget_gate_bias, const ITensorInfo *cell_bias, const ITensorInfo *output_gate_bias,
- const ITensorInfo *cell_state_in, const ITensorInfo *output_state_in,
- const ITensorInfo *cell_state_out, const ITensorInfo *output_state_out);
+ const ITensorInfo *input_to_input_weights,
+ const ITensorInfo *input_to_forget_weights,
+ const ITensorInfo *input_to_cell_weights,
+ const ITensorInfo *input_to_output_weights,
+ const ITensorInfo *recurrent_to_input_weights,
+ const ITensorInfo *recurrent_to_forget_weights,
+ const ITensorInfo *recurrent_to_cell_weights,
+ const ITensorInfo *recurrent_to_output_weights,
+ const ITensorInfo *input_gate_bias,
+ const ITensorInfo *forget_gate_bias,
+ const ITensorInfo *cell_bias,
+ const ITensorInfo *output_gate_bias,
+ const ITensorInfo *cell_state_in,
+ const ITensorInfo *output_state_in,
+ const ITensorInfo *cell_state_out,
+ const ITensorInfo *output_state_out);
// Inherited methods overridden:
void run() override;
@@ -137,30 +168,30 @@ private:
MemoryGroup _memory_group;
// Functions used
- NEGEMMLowpMatrixMultiplyCore _gemmlowp;
- NEGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPoint _output_stage;
- NETranspose _transpose_weights;
- NEConcatenateLayer _concat_input_weights;
- NEConcatenateLayer _concat_recurrent_weights;
- NEConcatenateLayer _concat_weights;
- NEConcatenateLayer _concat_inputs;
- NEConcatenateLayer _concat_bias;
- NEActivationLayer _sigmoid_forget_gate;
- NEActivationLayer _sigmoid_input_gate;
- NEActivationLayer _sigmoid_output_gate;
- NEActivationLayer _tanh_modulation_gate;
- NEActivationLayer _tanh_output_state;
- NEArithmeticAddition _add1;
- NEArithmeticAddition _add2;
- NEPixelWiseMultiplication _mul1;
- NEPixelWiseMultiplication _mul2;
- NEPixelWiseMultiplication _mul3;
- NESlice _slice_input_tensor;
- NESlice _slice_forget_tensor;
- NESlice _slice_cell_tensor;
- NESlice _slice_output_tensor;
- NEDequantizationLayer _dequantize;
- NEQuantizationLayer _quantize;
+ NEGEMMLowpMatrixMultiplyCore _gemmlowp;
+ NEGEMMLowpOutputStage _output_stage;
+ NETranspose _transpose_weights;
+ NEConcatenateLayer _concat_input_weights;
+ NEConcatenateLayer _concat_recurrent_weights;
+ NEConcatenateLayer _concat_weights;
+ NEConcatenateLayer _concat_inputs;
+ NEConcatenateLayer _concat_bias;
+ NEActivationLayer _sigmoid_forget_gate;
+ NEActivationLayer _sigmoid_input_gate;
+ NEActivationLayer _sigmoid_output_gate;
+ NEActivationLayer _tanh_modulation_gate;
+ NEActivationLayer _tanh_output_state;
+ NEArithmeticAddition _add1;
+ NEArithmeticAddition _add2;
+ NEPixelWiseMultiplication _mul1;
+ NEPixelWiseMultiplication _mul2;
+ NEPixelWiseMultiplication _mul3;
+ NESlice _slice_input_tensor;
+ NESlice _slice_forget_tensor;
+ NESlice _slice_cell_tensor;
+ NESlice _slice_output_tensor;
+ NEDequantizationLayer _dequantize;
+ NEQuantizationLayer _quantize;
// Tensor pointers
const ITensor *_input_to_input_weights;
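The data-type table added above implies concrete tensor metadata: QASYMM8 inputs and weights, S32 biases, and a QSYMM16 cell state. A hedged sketch of how a caller might declare that metadata; input_size, output_size, batch_size and every scale/offset below are illustrative placeholders, not values mandated by the API.

    #include "arm_compute/core/TensorInfo.h"
    #include "arm_compute/core/Types.h"

    using namespace arm_compute;

    int main()
    {
        const unsigned int input_size = 32, output_size = 16, batch_size = 1;
        const TensorInfo input(TensorShape(input_size, batch_size), 1, DataType::QASYMM8,
                               QuantizationInfo(1.f / 128.f, 128));
        const TensorInfo weights(TensorShape(input_size, output_size), 1, DataType::QASYMM8,
                                 QuantizationInfo(2.f / 128.f, 128));
        const TensorInfo bias(TensorShape(output_size), 1, DataType::S32);
        const TensorInfo cell_state(TensorShape(output_size, batch_size), 1, DataType::QSYMM16,
                                    QuantizationInfo(1.f / 2048.f, 0));
        (void)input; (void)weights; (void)bias; (void)cell_state;
        return 0;
    }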
diff --git a/arm_compute/runtime/NEON/functions/NELaplacianPyramid.h b/arm_compute/runtime/NEON/functions/NELaplacianPyramid.h
deleted file mode 100644
index 5389f67bad..0000000000
--- a/arm_compute/runtime/NEON/functions/NELaplacianPyramid.h
+++ /dev/null
@@ -1,85 +0,0 @@
-/*
- * Copyright (c) 2016-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_NELAPLACIANPYRAMID_H
-#define ARM_COMPUTE_NELAPLACIANPYRAMID_H
-
-#include "arm_compute/core/Types.h"
-#include "arm_compute/runtime/IFunction.h"
-#include "arm_compute/runtime/NEON/functions/NEArithmeticSubtraction.h"
-#include "arm_compute/runtime/NEON/functions/NEDepthConvertLayer.h"
-#include "arm_compute/runtime/NEON/functions/NEGaussian5x5.h"
-#include "arm_compute/runtime/NEON/functions/NEGaussianPyramid.h"
-#include "arm_compute/runtime/Pyramid.h"
-
-#include <cstddef>
-#include <cstdint>
-#include <memory>
-
-namespace arm_compute
-{
-class ITensor;
-
-/** Basic function to execute a Laplacian pyramid. This function calls the following NEON kernels and functions:
- *
- * -# @ref NEGaussianPyramidHalf
- * -# @ref NEGaussian5x5
- * -# @ref NEArithmeticSubtraction
- *
- * First a Gaussian pyramid is created. Then, for each level i, the corresponding tensor I(i) is blurred with the Gaussian 5x5 filter, and the
- * difference between the two tensors is the corresponding level L(i) of the Laplacian pyramid:
- * L(i) = I(i) - Gaussian5x5(I(i))
- * Level 0 always has the same first two dimensions as the input tensor.
-*/
-class NELaplacianPyramid : public IFunction
-{
-public:
- /** Constructor */
- NELaplacianPyramid();
- /** Initialise the function's source, destinations and border mode.
- *
- * @param[in] input Source tensor. Data type supported: U8.
- * @param[out] pyramid Destination pyramid tensors, Data type supported at each level: S16.
- * @param[out] output The lowest resolution tensor necessary to reconstruct the input tensor from the pyramid. Data type supported: S16.
- * The first two dimensions of this tensor must match the first two dimensions of the tensor in the last level of the pyramid, that is:
- * out.width = in.width() / pow(2, pyramid_levels - 1) and out.height = in.height() / pow(2, pyramid_levels - 1)
- * @param[in] border_mode Border mode to use.
- * @param[in] constant_border_value (Optional) Constant value to use for borders if border_mode is set to CONSTANT.
- *
- */
- void configure(const ITensor *input, IPyramid *pyramid, ITensor *output, BorderMode border_mode, uint8_t constant_border_value);
-
- // Inherited methods overridden:
- void run() override;
-
-private:
- size_t _num_levels;
- NEGaussianPyramidHalf _gaussian_pyr_function;
- std::vector<NEGaussian5x5> _convf;
- std::vector<NEArithmeticSubtraction> _subf;
- Pyramid _gauss_pyr;
- Pyramid _conv_pyr;
- NEDepthConvertLayer _depth_function;
-};
-}
-#endif /*ARM_COMPUTE_NELAPLACIANPYRAMID_H */
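The pyramid recurrence documented above, restated as standalone C++. The blur and downsample callables are stand-ins for the removed NEGaussian5x5 and NEGaussianPyramidHalf stages, and the flat Image alias is an assumption of this sketch.

    #include <cstddef>
    #include <functional>
    #include <vector>

    using Image = std::vector<float>; // flattened single-channel image, illustrative

    void laplacian_pyramid(const Image &input, std::size_t levels,
                           const std::function<Image(const Image &)> &blur,       // Gaussian 5x5 stand-in
                           const std::function<Image(const Image &)> &downsample, // pyramid-half stand-in
                           std::vector<Image> &laplacian, Image &residual)
    {
        Image current = input;
        for (std::size_t i = 0; i < levels; ++i)
        {
            const Image blurred = blur(current);
            Image level(current.size());
            for (std::size_t k = 0; k < current.size(); ++k)
                level[k] = current[k] - blurred[k]; // L(i) = I(i) - Gaussian5x5(I(i))
            laplacian.push_back(level);
            current = downsample(blurred); // next pyramid level I(i+1)
        }
        residual = current; // lowest-resolution image, kept for reconstruction
    }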
diff --git a/arm_compute/runtime/NEON/functions/NELaplacianReconstruct.h b/arm_compute/runtime/NEON/functions/NELaplacianReconstruct.h
deleted file mode 100644
index f939725d51..0000000000
--- a/arm_compute/runtime/NEON/functions/NELaplacianReconstruct.h
+++ /dev/null
@@ -1,91 +0,0 @@
-/*
- * Copyright (c) 2016-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_NELAPLACIANRECONSTRUCT_H
-#define ARM_COMPUTE_NELAPLACIANRECONSTRUCT_H
-
-#include "arm_compute/core/Types.h"
-#include "arm_compute/runtime/IFunction.h"
-#include "arm_compute/runtime/NEON/functions/NEArithmeticAddition.h"
-#include "arm_compute/runtime/NEON/functions/NEDepthConvertLayer.h"
-#include "arm_compute/runtime/NEON/functions/NEScale.h"
-#include "arm_compute/runtime/Pyramid.h"
-
-#include <cstdint>
-#include <memory>
-
-namespace arm_compute
-{
-class ITensor;
-using IImage = ITensor;
-
-/** Basic function to execute Laplacian reconstruction. This function calls the following NEON kernels and functions:
- *
- * -# @ref NEArithmeticAddition
- * -# @ref NEScale
- * -# @ref NEDepthConvertLayer
- *
- * This function reconstructs the original image from a Laplacian Image Pyramid.
- *
- * The input image is added to the last level of the Laplacian pyramid L(n-1); the resulting image is upsampled to the
- * resolution of the next pyramid level.
- *
- * I(n-2) = upsample(input + L(n-1))
- *
- * For each pyramid level i, except i=0 and i=n-1:
- * I(i-1) = upsample(I(i) + L(i))
- *
- * output = I(0) + L(0)
-*/
-class NELaplacianReconstruct : public IFunction
-{
-public:
- /** Constructor */
- NELaplacianReconstruct();
- /** Initialise the function's source, destinations and border mode.
- *
- * The output image must have the same size as the first level of the pyramid.
- * The input image must have the same size as the last level of the pyramid.
- *
- * The idea is to reconstruct the original hi-res image from a low-res representation of it and the Laplacian pyramid.
- *
- * @param[in] pyramid Laplacian pyramid tensors, Data type supported at each level: S16.
- * @param[in] input Source tensor. Data type supported: S16.
- * @param[out] output Output tensor. Data type supported: U8.
- * @param[in] border_mode Border mode to use for the convolution.
- * @param[in] constant_border_value (Optional) Constant value to use for borders if border_mode is set to CONSTANT.
- *
- */
- void configure(const IPyramid *pyramid, ITensor *input, ITensor *output, BorderMode border_mode, uint8_t constant_border_value);
-
- // Inherited methods overridden:
- void run() override;
-
-private:
- Pyramid _tmp_pyr;
- std::vector<NEArithmeticAddition> _addf;
- std::vector<NEScale> _scalef;
- NEDepthConvertLayer _depthf;
-};
-}
-#endif /*ARM_COMPUTE_NELAPLACIANRECONSTRUCT_H */
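The matching reconstruction recurrence as a standalone sketch. The upsample callable stands in for the removed NEScale stage, laplacian[0] is assumed to be the finest level L(0), and input must match the size of the coarsest level L(n-1).

    #include <cstddef>
    #include <functional>
    #include <vector>

    using Image = std::vector<float>; // flattened single-channel image, illustrative

    Image laplacian_reconstruct(const std::vector<Image> &laplacian, const Image &input,
                                const std::function<Image(const Image &)> &upsample)
    {
        Image current = input;
        for (std::size_t i = laplacian.size(); i-- > 0;)
        {
            for (std::size_t k = 0; k < current.size(); ++k)
                current[k] += laplacian[i][k]; // I(i) + L(i)
            if (i > 0)
                current = upsample(current); // I(i-1) = upsample(I(i) + L(i))
        }
        return current; // output = I(0) + L(0)
    }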
diff --git a/arm_compute/runtime/NEON/functions/NELocallyConnectedLayer.h b/arm_compute/runtime/NEON/functions/NELocallyConnectedLayer.h
deleted file mode 100644
index b2f2b88fce..0000000000
--- a/arm_compute/runtime/NEON/functions/NELocallyConnectedLayer.h
+++ /dev/null
@@ -1,108 +0,0 @@
-/*
- * Copyright (c) 2017-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_NELOCALLYCONNECTEDLAYER_H
-#define ARM_COMPUTE_NELOCALLYCONNECTEDLAYER_H
-
-#include "arm_compute/runtime/IFunction.h"
-
-#include "arm_compute/core/NEON/kernels/NECol2ImKernel.h"
-#include "arm_compute/core/NEON/kernels/NEIm2ColKernel.h"
-#include "arm_compute/core/NEON/kernels/NELocallyConnectedMatrixMultiplyKernel.h"
-#include "arm_compute/core/NEON/kernels/NEWeightsReshapeKernel.h"
-#include "arm_compute/core/Types.h"
-#include "arm_compute/runtime/IMemoryManager.h"
-#include "arm_compute/runtime/MemoryGroup.h"
-#include "arm_compute/runtime/Tensor.h"
-
-#include <memory>
-
-namespace arm_compute
-{
-class INETensor;
-
-/** Basic function to compute the locally connected layer. This function calls the following NEON kernels:
- *
- * -# @ref NEWeightsReshapeKernel (executed only once for each configuration)
- * -# @ref NEIm2ColKernel
- * -# @ref NELocallyConnectedMatrixMultiplyKernel
- * -# @ref NECol2ImKernel
- */
-class NELocallyConnectedLayer : public IFunction
-{
-public:
- /** Default constructor */
- NELocallyConnectedLayer(std::shared_ptr<IMemoryManager> memory_manager = nullptr);
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- NELocallyConnectedLayer(const NELocallyConnectedLayer &) = delete;
- /** Default move constructor */
- NELocallyConnectedLayer(NELocallyConnectedLayer &&) = default;
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- NELocallyConnectedLayer &operator=(const NELocallyConnectedLayer &) = delete;
- /** Default move assignment operator */
- NELocallyConnectedLayer &operator=(NELocallyConnectedLayer &&) = default;
- /** Set the input and output tensors.
- *
- * @param[in] input Source tensor. 3 lower dimensions represent a single input [width, height, IFM],
- * while every optional dimension from 4 and above represent a batch of inputs.
- * Data types supported: F16, F32.
- * @param[in] weights Weights tensor. Weights are a 5D tensor with dimensions [kernel_x, kernel_y, IFM, OFM, num_patches]. Data type supported: Same as @p input.
- * @param[in] biases Biases tensor. Shared biases supported. Biases are a 2D tensor with dimensions [OFM, num_patches]. Data type supported: Same as @p input.
- * @param[out] output Destination tensor. 3 lower dimensions represent a single output [width, height, OFM], while the rest represent batch of outputs.
- * Data types supported: Same as @p input.
- * @param[in] conv_info Contains padding and stride information described in @ref PadStrideInfo.
- */
- void configure(const ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, const PadStrideInfo &conv_info);
- /** Static function to check if given info will lead to a valid configuration of @ref NELocallyConnectedLayer
- *
- * @param[in] input Input tensor info. 3 lower dimensions represent a single input [width, height, IFM],
- * while every optional dimension from 4 and above represent a batch of inputs.
- * Data types supported: F16, F32.
- * @param[in] weights Weights tensor info. Weights are a 5D tensor with dimensions [kernel_x, kernel_y, IFM, OFM, num_patches]. Data type supported: Same as @p input.
- * @param[in] biases Biases tensor info. Shared biases supported. Biases are a 2D tensor with dimensions [OFM, num_patches]. Data type supported: Same as @p input.
- * @param[in] output Output tensor info. 3 lower dimensions represent a single output [width, height, OFM], while the rest represent batch of outputs.
- * Data types supported: Same as @p input.
- * @param[in] conv_info Contains padding and stride information described in @ref PadStrideInfo.
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info);
-
- // Inherited methods overridden:
- void run() override;
- void prepare() override;
-
-private:
- MemoryGroup _memory_group;
- NEIm2ColKernel _input_im2col_kernel;
- NEWeightsReshapeKernel _weights_reshape_kernel;
- NELocallyConnectedMatrixMultiplyKernel _mm_kernel;
- NECol2ImKernel _output_col2im_kernel;
- Tensor _input_im2col_reshaped;
- Tensor _weights_reshaped;
- Tensor _gemm_output;
- bool _is_prepared;
- const ITensor *_original_weights;
-};
-}
-#endif /* ARM_COMPUTE_NELOCALLYCONNECTEDLAYER_H */
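The 5D weights shape documented above turns on num_patches: in a locally connected layer every output spatial position owns its own kernel, so num_patches is output width times output height. A sketch of that arithmetic, assuming the usual padded/strided output-size formula (the struct and helper below are illustrative, not part of the library):

    #include <cstddef>

    struct ConvGeom
    {
        std::size_t in_w, in_h, k_w, k_h, pad_x, pad_y, stride_x, stride_y;
    };

    std::size_t num_patches(const ConvGeom &g)
    {
        const std::size_t out_w = (g.in_w + 2 * g.pad_x - g.k_w) / g.stride_x + 1;
        const std::size_t out_h = (g.in_h + 2 * g.pad_y - g.k_h) / g.stride_y + 1;
        return out_w * out_h; // one independent kernel per output position
    }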
diff --git a/arm_compute/runtime/NEON/functions/NELogical.h b/arm_compute/runtime/NEON/functions/NELogical.h
new file mode 100644
index 0000000000..0ad23200c6
--- /dev/null
+++ b/arm_compute/runtime/NEON/functions/NELogical.h
@@ -0,0 +1,186 @@
+/*
+ * Copyright (c) 2020-2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_NELOGICAL_H
+#define ARM_COMPUTE_NELOGICAL_H
+
+#include "arm_compute/core/Error.h"
+#include "arm_compute/runtime/IFunction.h"
+
+#include <memory>
+
+namespace arm_compute
+{
+// Forward declarations
+class ITensor;
+class ITensorInfo;
+
+/** Basic function to perform logical AND */
+class NELogicalAnd : public IFunction
+{
+public:
+ /** Constructor */
+ NELogicalAnd();
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ NELogicalAnd(const NELogicalAnd &) = delete;
+ /** Prevent instances of this class from being moved (As this class contains non movable objects) */
+ NELogicalAnd(NELogicalAnd &&) = delete;
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ NELogicalAnd &operator=(const NELogicalAnd &) = delete;
+ /** Prevent instances of this class from being moved (As this class contains non movable objects) */
+ NELogicalAnd &operator=(NELogicalAnd &&) = delete;
+ /** Destructor */
+ ~NELogicalAnd();
+
+ /** Initialise the kernel's inputs and output
+ *
+ * Valid data layouts:
+ * - All
+ *
+ * Valid data type configurations:
+ * |src0 |src1 |dst |
+ * |:--------------|:-------------|:------------|
+ * |U8 |U8 |U8 |
+ *
+ * @param[in] input1 First tensor input. Data type supported: U8.
+ * @param[in] input2 Second tensor input. Data type supported: U8.
+ * @param[out] output Output tensor. Data type supported: U8.
+ */
+ void configure(const ITensor *input1, const ITensor *input2, ITensor *output);
+ /** Static function to check if given info will lead to a valid configuration of @ref NELogicalAnd
+ *
+ * @param[in] input1 First input tensor info. Data types supported: U8.
+ * @param[in] input2 Second input tensor info. Data types supported: U8.
+ * @param[in] output Output tensor info. Data type supported: U8
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output);
+
+ // Inherited methods overridden
+ void run() override;
+
+private:
+ struct Impl;
+ std::unique_ptr<Impl> _impl;
+};
+
+/** Basic function to perform logical OR */
+class NELogicalOr : public IFunction
+{
+public:
+ /** Constructor */
+ NELogicalOr();
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ NELogicalOr(const NELogicalOr &) = delete;
+ /** Prevent instances of this class from being moved (As this class contains non movable objects) */
+ NELogicalOr(NELogicalOr &&) = delete;
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ NELogicalOr &operator=(const NELogicalOr &) = delete;
+ /** Prevent instances of this class from being moved (As this class contains non movable objects) */
+ NELogicalOr &operator=(NELogicalOr &&) = delete;
+ /** Destructor */
+ ~NELogicalOr();
+
+ /** Initialise the kernel's inputs and output
+ *
+ * Valid data layouts:
+ * - All
+ *
+ * Valid data type configurations:
+ * |src0 |src1 |dst |
+ * |:--------------|:-------------|:------------|
+ * |U8 |U8 |U8 |
+ *
+ * @param[in] input1 First tensor input. Data type supported: U8.
+ * @param[in] input2 Second tensor input. Data type supported: U8.
+ * @param[out] output Output tensor. Data type supported: U8.
+ */
+ void configure(const ITensor *input1, const ITensor *input2, ITensor *output);
+ /** Static function to check if given info will lead to a valid configuration of @ref NELogicalOr
+ *
+ * @param[in] input1 First input tensor info. Data types supported: U8.
+ * @param[in] input2 Second input tensor info. Data types supported: U8.
+ * @param[in] output Output tensor info. Data type supported: U8
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output);
+
+ // Inherited methods overridden
+ void run() override;
+
+private:
+ struct Impl;
+ std::unique_ptr<Impl> _impl;
+};
+
+/** Basic function to perform logical NOT */
+class NELogicalNot : public IFunction
+{
+public:
+ /** Constructor */
+ NELogicalNot();
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ NELogicalNot(const NELogicalNot &) = delete;
+ /** Prevent instances of this class from being moved (As this class contains non movable objects) */
+ NELogicalNot(NELogicalNot &&) = delete;
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ NELogicalNot &operator=(const NELogicalNot &) = delete;
+ /** Prevent instances of this class from being moved (As this class contains non movable objects) */
+ NELogicalNot &operator=(NELogicalNot &&) = delete;
+ /** Destructor */
+ ~NELogicalNot();
+
+ /** Initialise the kernel's inputs and output
+ *
+ * Valid data layouts:
+ * - All
+ *
+ * Valid data type configurations:
+ * |src |dst |
+ * |:--------------|:-------------|
+ * |U8 |U8 |
+ *
+ * @param[in] input Input tensor. Data type supported: U8.
+ * @param[out] output Output tensor. Data type supported: U8.
+ */
+ void configure(const ITensor *input, ITensor *output);
+ /** Static function to check if given info will lead to a valid configuration of @ref NELogicalNot
+ *
+ * @param[in] input Input tensor info. Data types supported: U8.
+ * @param[in] output Output tensor info. Data type supported: U8
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *input, const ITensorInfo *output);
+
+ // Inherited methods overridden
+ void run() override;
+
+private:
+ struct Impl;
+ std::unique_ptr<Impl> _impl;
+};
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_NELOGICAL_H */
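A minimal usage sketch of the new logical functions, chaining AND into NOT; the 16x16 U8 shape is an illustrative assumption, and the U8 operands are treated as boolean-like (zero/non-zero).

    #include "arm_compute/runtime/NEON/functions/NELogical.h"
    #include "arm_compute/runtime/Tensor.h"

    using namespace arm_compute;

    int main()
    {
        Tensor a, b, anded, nand;
        const TensorInfo info(TensorShape(16U, 16U), 1, DataType::U8);
        a.allocator()->init(info);
        b.allocator()->init(info);
        anded.allocator()->init(info);
        nand.allocator()->init(info);

        NELogicalAnd land;
        land.configure(&a, &b, &anded);
        NELogicalNot lnot;
        lnot.configure(&anded, &nand); // NOT(a AND b)

        a.allocator()->allocate();
        b.allocator()->allocate();
        anded.allocator()->allocate();
        nand.allocator()->allocate();
        // ... fill a and b ...
        land.run();
        lnot.run();
        return 0;
    }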
diff --git a/arm_compute/runtime/NEON/functions/NEMagnitude.h b/arm_compute/runtime/NEON/functions/NEMagnitude.h
deleted file mode 100644
index 168500050e..0000000000
--- a/arm_compute/runtime/NEON/functions/NEMagnitude.h
+++ /dev/null
@@ -1,47 +0,0 @@
-/*
- * Copyright (c) 2016-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_NEMAGNITUDE_H
-#define ARM_COMPUTE_NEMAGNITUDE_H
-
-#include "arm_compute/runtime/NEON/INESimpleFunctionNoBorder.h"
-
-namespace arm_compute
-{
-class ITensor;
-
-/** Basic function to run @ref NEMagnitudePhaseKernel */
-class NEMagnitude : public INESimpleFunctionNoBorder
-{
-public:
- /** Initialise the kernel's inputs.
- *
- * @param[in] input1 First tensor input. Data type supported: S16.
- * @param[in] input2 Second tensor input. Data type supported: S16.
- * @param[out] output Output tensor. Data type supported: S16.
- * @param[in] mag_type (Optional) Magnitude calculation type. Default: L2NORM.
- */
- void configure(const ITensor *input1, const ITensor *input2, ITensor *output, MagnitudeType mag_type = MagnitudeType::L2NORM);
-};
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_NEMAGNITUDE_H */
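For reference, the removed function computed, per element, the L2 magnitude of two S16 gradient planes (typically Sobel outputs), saturated back to S16. A standalone sketch (the helper name is made up here):

    #include <cmath>
    #include <cstddef>
    #include <cstdint>
    #include <vector>

    std::vector<int16_t> magnitude_l2(const std::vector<int16_t> &gx, const std::vector<int16_t> &gy)
    {
        std::vector<int16_t> out(gx.size());
        for (std::size_t k = 0; k < gx.size(); ++k)
        {
            const float m = std::sqrt(float(gx[k]) * gx[k] + float(gy[k]) * gy[k]); // sqrt(gx^2 + gy^2)
            out[k] = static_cast<int16_t>(m > 32767.f ? 32767.f : m);               // saturate to S16
        }
        return out;
    }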
diff --git a/arm_compute/runtime/NEON/functions/NEMatMul.h b/arm_compute/runtime/NEON/functions/NEMatMul.h
new file mode 100644
index 0000000000..58dd7a6f6b
--- /dev/null
+++ b/arm_compute/runtime/NEON/functions/NEMatMul.h
@@ -0,0 +1,145 @@
+/*
+ * Copyright (c) 2023-2024 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ACL_ARM_COMPUTE_RUNTIME_NEON_FUNCTIONS_NEMATMUL_H
+#define ACL_ARM_COMPUTE_RUNTIME_NEON_FUNCTIONS_NEMATMUL_H
+
+#include "arm_compute/core/Types.h"
+#include "arm_compute/function_info/ActivationLayerInfo.h"
+#include "arm_compute/runtime/IFunction.h"
+
+#include <memory>
+
+namespace arm_compute
+{
+/** Settings for MatMul CPU implementation */
+class CpuMatMulSettings
+{
+public:
+ // Get fast math flag
+ bool fast_math() const
+ {
+ return _fast_math;
+ }
+ // Get fixed format flag
+ bool fixed_format() const
+ {
+ return _fixed_format;
+ }
+ // Set fast math flag
+ CpuMatMulSettings &fast_math(bool fmath)
+ {
+ _fast_math = fmath;
+ return *this;
+ }
+ // Set fixed format flag
+ CpuMatMulSettings &fixed_format(bool fixed_format)
+ {
+ _fixed_format = fixed_format;
+ return *this;
+ }
+
+private:
+ bool _fast_math{false};
+ bool _fixed_format{false};
+};
+
+// Forward declarations
+class ITensor;
+class ITensorInfo;
+class MatMulInfo;
+class Status;
+
+/** Basic function to run the following operators:
+ *
+ * -# @ref cpu::CpuMatMul
+ */
+class NEMatMul : public IFunction
+{
+public:
+ /** Constructor */
+ NEMatMul();
+ /** Destructor */
+ ~NEMatMul();
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ NEMatMul(const NEMatMul &) = delete;
+ /** Default move constructor */
+ NEMatMul(NEMatMul &&) = default;
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ NEMatMul &operator=(const NEMatMul &) = delete;
+ /** Default move assignment operator */
+ NEMatMul &operator=(NEMatMul &&) = default;
+ /** Initialize the function's inputs and output.
+ *
+ * Valid data layouts:
+ * - Any
+ *
+ * Valid data type configurations:
+ * |lhs |rhs |dst |
+ * |:--------------|:------------------|:--------------|
+ * |F32 |F32 |F32 |
+ * |F16 |F16 |F16 |
+ * |BFLOAT16 |BFLOAT16 |BFLOAT16 |
+ * |QASYMM8_SIGNED |QASYMM8_SIGNED |QASYMM8_SIGNED |
+ * |QASYMM8 |QASYMM8 |QASYMM8 |
+ *
+ * @param[in] lhs Left-hand side tensor. Data types supported: F16/F32/BFLOAT16/QASYMM8_SIGNED/QASYMM8.
+ * @param[in] rhs Right-hand side tensor. Data types supported: same as @p lhs.
+ * @param[out] dst Output tensor to store the result of the batched matrix multiplication. Data types supported: same as @p lhs / @p rhs.
+ * @param[in] info Contains MatMul operation information described in @ref MatMulInfo.
+ * @param[in] settings Contains flags for function-level settings, e.g. fast math
+ * @param[in] act_info (Optional) Contains activation function and lower and upper bound values for bounded activation functions.
+ */
+ void configure(ITensor *lhs,
+ ITensor *rhs,
+ ITensor *dst,
+ const MatMulInfo &info,
+ const CpuMatMulSettings &settings,
+ const ActivationLayerInfo &act_info = ActivationLayerInfo());
+ /** Static function to check if given info will lead to a valid configuration of @ref NEMatMul
+ *
+ * @param[in] lhs Left-hand side tensor info. Data types supported: F16/F32/BFLOAT16/QASYMM8_SIGNED/QASYMM8.
+ * @param[in] rhs Right-hand side tensor info. Data types supported: same as @p lhs.
+ * @param[in] dst Output tensor info to store the result of the batched matrix multiplication. Data types supported: same as @p lhs / @p rhs.
+ * @param[in] info Contains MatMul operation information described in @ref MatMulInfo.
+ * @param[in] settings Contains flags for function-level settings, e.g. fast math
+ * @param[in] act_info (Optional) Contains activation function and lower and upper bound values for bounded activation functions.
+ *
+ * @return Status
+ */
+ static Status validate(const ITensorInfo *lhs,
+ const ITensorInfo *rhs,
+ const ITensorInfo *dst,
+ const MatMulInfo &info,
+ const CpuMatMulSettings &settings,
+ const ActivationLayerInfo &act_info = ActivationLayerInfo());
+
+ // Inherited methods overridden
+ void run() override;
+
+private:
+ struct Impl;
+ std::unique_ptr<Impl> _impl;
+};
+} // namespace arm_compute
+#endif // ACL_ARM_COMPUTE_RUNTIME_NEON_FUNCTIONS_NEMATMUL_H
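A minimal usage sketch for the new function (not part of the patch): the tensor shapes, the default-constructed MatMulInfo and its include path are assumptions; configure/validate/run and the CpuMatMulSettings setters come from the header above.

#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/function_info/MatMulInfo.h" // assumed location of MatMulInfo
#include "arm_compute/runtime/NEON/functions/NEMatMul.h"
#include "arm_compute/runtime/Tensor.h"

using namespace arm_compute;

void matmul_sketch()
{
    // Illustrative F32 shapes only; TensorShape lists the innermost dimension first.
    Tensor lhs, rhs, dst;
    lhs.allocator()->init(TensorInfo(TensorShape(3U, 2U), 1, DataType::F32));
    rhs.allocator()->init(TensorInfo(TensorShape(4U, 3U), 1, DataType::F32));
    dst.allocator()->init(TensorInfo(TensorShape(4U, 2U), 1, DataType::F32));

    const MatMulInfo  mm_info{}; // assumed default-constructible: no lhs/rhs transposition
    CpuMatMulSettings settings;
    settings.fast_math(true);    // fluent setter from the class above

    if (bool(NEMatMul::validate(lhs.info(), rhs.info(), dst.info(), mm_info, settings)))
    {
        NEMatMul matmul;
        matmul.configure(&lhs, &rhs, &dst, mm_info, settings);
        lhs.allocator()->allocate();
        rhs.allocator()->allocate();
        dst.allocator()->allocate();
        // ... fill lhs and rhs ...
        matmul.run();
    }
}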
diff --git a/arm_compute/runtime/NEON/functions/NEMaxUnpoolingLayer.h b/arm_compute/runtime/NEON/functions/NEMaxUnpoolingLayer.h
new file mode 100644
index 0000000000..e00fc4544f
--- /dev/null
+++ b/arm_compute/runtime/NEON/functions/NEMaxUnpoolingLayer.h
@@ -0,0 +1,104 @@
+/*
+ * Copyright (c) 2020-2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_NEMAXUNPOOLINGLAYER_H
+#define ARM_COMPUTE_NEMAXUNPOOLINGLAYER_H
+
+#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/IFunction.h"
+
+#include <memory>
+
+namespace arm_compute
+{
+class ITensor;
+class ITensorInfo;
+class NEFill;
+
+/** Function to perform MaxUnpooling. This function calls the following functions/kernels:
+ *
+ * -# @ref NEFill
+ */
+class NEMaxUnpoolingLayer : public IFunction
+{
+public:
+ /** Constructor */
+ NEMaxUnpoolingLayer();
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ NEMaxUnpoolingLayer(const NEMaxUnpoolingLayer &) = delete;
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ NEMaxUnpoolingLayer &operator=(const NEMaxUnpoolingLayer &) = delete;
+ /** Prevent instances of this class from being moved (As this class contains non movable objects) */
+ NEMaxUnpoolingLayer(NEMaxUnpoolingLayer &&) = delete;
+ /** Prevent instances of this class from being moved (As this class contains non movable objects) */
+ NEMaxUnpoolingLayer &operator=(NEMaxUnpoolingLayer &&) = delete;
+ /** Default destructor */
+ ~NEMaxUnpoolingLayer();
+ /** Set the input and output tensors.
+ *
+ * Valid data layouts:
+ * - NHWC
+ * - NCHW
+ *
+ * Valid data type configurations:
+ * |src |dst |
+ * |:--------------|:--------------|
+ * |QASYMM8 |QASYMM8 |
+ * |QASYMM8_SIGNED |QASYMM8_SIGNED |
+ * |F16 |F16 |
+ * |F32 |F32 |
+ *
+ * @note Only pool size 2 is supported
+ *
+ * @param[in, out] input Source tensor. (Written to only when padding != 0) Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32.
+ * @param[in] indices The indices of the maximal values. Data type supported: U32.
+ * @param[out] output Destination tensor. Data types supported: Same as @p input.
+ * @param[in] pool_info Contains pooling operation information described in @ref PoolingLayerInfo.
+ */
+ void configure(ITensor *input, ITensor *indices, ITensor *output, const PoolingLayerInfo &pool_info);
+ /** Static function to check if given info will lead to a valid configuration of @ref NEMaxUnpoolingLayer
+ *
+ * @note Only pool size 2 is supported
+ *
+ * @param[in] input Source tensor. (Written to only when padding != 0) Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32.
+ * @param[in] indices The indices of the maximal values. Data type supported: U32.
+ * @param[in] output Destination tensor. Data types supported: Same as @p input.
+ * @param[in] pool_info Contains pooling operation information described in @ref PoolingLayerInfo.
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *input,
+ const ITensorInfo *indices,
+ const ITensorInfo *output,
+ const PoolingLayerInfo &pool_info);
+
+ // Inherited methods overridden:
+ void run() override;
+
+private:
+ std::unique_ptr<NEFill> _fill_func;
+ struct Impl;
+ std::unique_ptr<Impl> _impl;
+};
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_NEMAXUNPOOLINGLAYER_H */
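A minimal sketch of calling the new function, assuming the caller already ran the matching pool-size-2 max pooling that produced the indices tensor and still holds the same PoolingLayerInfo:

#include "arm_compute/runtime/NEON/functions/NEMaxUnpoolingLayer.h"

using namespace arm_compute;

// indices must come from a preceding pool-size-2 max pooling configured with the
// same pool_info; only that pool size is supported (see the note above).
void unpool_sketch(ITensor *input, ITensor *indices, ITensor *output, const PoolingLayerInfo &pool_info)
{
    NEMaxUnpoolingLayer unpool;
    unpool.configure(input, indices, output, pool_info); // argument order matches the signature above
    unpool.run();
}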
diff --git a/arm_compute/runtime/NEON/functions/NEMeanStdDev.h b/arm_compute/runtime/NEON/functions/NEMeanStdDev.h
deleted file mode 100644
index 954b2228dd..0000000000
--- a/arm_compute/runtime/NEON/functions/NEMeanStdDev.h
+++ /dev/null
@@ -1,64 +0,0 @@
-/*
- * Copyright (c) 2016-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_NEMEANSTDDEV_H
-#define ARM_COMPUTE_NEMEANSTDDEV_H
-
-#include "arm_compute/core/NEON/kernels/NEFillBorderKernel.h"
-#include "arm_compute/core/NEON/kernels/NEMeanStdDevKernel.h"
-#include "arm_compute/core/Types.h"
-#include "arm_compute/runtime/IFunction.h"
-
-#include <cstdint>
-
-namespace arm_compute
-{
-/** Basic function to execute mean and std deviation. This function calls the following NEON kernels:
- *
- * @ref NEMeanStdDevKernel
- *
- */
-class NEMeanStdDev : public IFunction
-{
-public:
- /** Default Constructor. */
- NEMeanStdDev();
- /** Initialise the kernel's inputs and outputs.
- *
- * @param[in, out] input Input image. Data types supported: U8. (Written to only for border filling)
- * @param[out] mean Output average pixel value.
- * @param[out] stddev (Optional) Output standard deviation of pixel values.
- */
- void configure(IImage *input, float *mean, float *stddev = nullptr);
-
- // Inherited methods overridden:
- void run() override;
-
-private:
- NEMeanStdDevKernel _mean_stddev_kernel; /**< Kernel that performs the mean and standard deviation calculation. */
- NEFillBorderKernel _fill_border_kernel; /**< Kernel that fills tensor's borders with zeroes. */
- uint64_t _global_sum; /**< Variable that holds the global sum among calls in order to ease reduction */
- uint64_t _global_sum_squared; /**< Variable that holds the global sum of squared values among calls in order to ease reduction */
-};
-}
-#endif /*ARM_COMPUTE_NEMEANSTDDEV_H */
diff --git a/arm_compute/runtime/NEON/functions/NEMeanStdDevNormalizationLayer.h b/arm_compute/runtime/NEON/functions/NEMeanStdDevNormalizationLayer.h
index 3ce2b2792b..41aa81946b 100644
--- a/arm_compute/runtime/NEON/functions/NEMeanStdDevNormalizationLayer.h
+++ b/arm_compute/runtime/NEON/functions/NEMeanStdDevNormalizationLayer.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019 ARM Limited.
+ * Copyright (c) 2019-2021 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -30,13 +30,36 @@
namespace arm_compute
{
class ITensor;
+class ITensorInfo;
/** Basic function to execute mean and standard deviation normalization by calling @ref NEMeanStdDevNormalizationKernel */
class NEMeanStdDevNormalizationLayer : public INESimpleFunctionNoBorder
{
public:
+ /** Constructor */
+ NEMeanStdDevNormalizationLayer() = default;
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ NEMeanStdDevNormalizationLayer(const NEMeanStdDevNormalizationLayer &) = delete;
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ NEMeanStdDevNormalizationLayer &operator=(const NEMeanStdDevNormalizationLayer &) = delete;
+ /** Prevent instances of this class from being moved (As this class contains non movable objects) */
+ NEMeanStdDevNormalizationLayer(NEMeanStdDevNormalizationLayer &&) = delete;
+ /** Prevent instances of this class from being moved (As this class contains non movable objects) */
+ NEMeanStdDevNormalizationLayer &operator=(NEMeanStdDevNormalizationLayer &&) = delete;
+ /** Default destructor */
+ ~NEMeanStdDevNormalizationLayer();
/** Initialise the function's input and outputs.
*
+ * Valid data layouts:
+ * - NHWC
+ * - NCHW
+ *
+ * Valid data type configurations:
+ * |src |dst |
+ * |:--------|:---------|
+ * |F32 |F32 |
+ * |F16 |F16 |
+ *
* @note If the output tensor is a nullptr, the normalization will be performed in-place.
*
* @param[in, out] input Input tensor with 2 dimensions. Data types supported: F16/F32.
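A sketch of the in-place path described by the note above; the exact configure overload (a defaulted output pointer, plus any epsilon parameter) is an assumption, since the full signature lies outside this hunk:

#include "arm_compute/runtime/NEON/functions/NEMeanStdDevNormalizationLayer.h"

using namespace arm_compute;

// 2D tensor: each row is normalized to zero mean and unit variance, in place.
void normalize_rows_in_place(ITensor *input)
{
    NEMeanStdDevNormalizationLayer norm;
    norm.configure(input, nullptr); // assumed overload configure(input, output); null output => in-place
    norm.run();
}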
diff --git a/arm_compute/runtime/NEON/functions/NEMedian3x3.h b/arm_compute/runtime/NEON/functions/NEMedian3x3.h
deleted file mode 100644
index 55064f8a8c..0000000000
--- a/arm_compute/runtime/NEON/functions/NEMedian3x3.h
+++ /dev/null
@@ -1,56 +0,0 @@
-/*
- * Copyright (c) 2016-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_NEMEDIAN3x3_H
-#define ARM_COMPUTE_NEMEDIAN3x3_H
-
-#include "arm_compute/core/Types.h"
-#include "arm_compute/runtime/NEON/INESimpleFunction.h"
-
-#include <cstdint>
-
-namespace arm_compute
-{
-class ITensor;
-
-/** Basic function to execute median filter. This function calls the following NEON kernels:
- *
- * -# @ref NEFillBorderKernel (executed if border_mode == CONSTANT or border_mode == REPLICATE)
- * -# @ref NEMedian3x3Kernel
- *
- */
-class NEMedian3x3 : public INESimpleFunction
-{
-public:
- /** Initialise the function's source, destinations and border mode.
- *
- * @param[in, out] input Source tensor. Data type supported: U8. (Written to only for @p border_mode != UNDEFINED)
- * @param[out] output Destination tensor. Data type supported: U8.
- * @param[in] border_mode Border mode to use for the filter.
- * @param[in] constant_border_value (Optional) Constant value to use for borders if border_mode is set to CONSTANT.
- *
- */
- void configure(ITensor *input, ITensor *output, BorderMode border_mode, uint8_t constant_border_value = 0);
-};
-}
-#endif /*ARM_COMPUTE_NEMEDIAN3x3_H */
diff --git a/arm_compute/runtime/NEON/functions/NEMinMaxLocation.h b/arm_compute/runtime/NEON/functions/NEMinMaxLocation.h
deleted file mode 100644
index 89b6874320..0000000000
--- a/arm_compute/runtime/NEON/functions/NEMinMaxLocation.h
+++ /dev/null
@@ -1,71 +0,0 @@
-/*
- * Copyright (c) 2016-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_NEMINMAXLOCATION_H
-#define ARM_COMPUTE_NEMINMAXLOCATION_H
-
-#include "arm_compute/core/IArray.h"
-#include "arm_compute/core/NEON/kernels/NEMinMaxLocationKernel.h"
-#include "arm_compute/runtime/Array.h"
-#include "arm_compute/runtime/IFunction.h"
-
-#include <cstdint>
-
-namespace arm_compute
-{
-class ITensor;
-using IImage = ITensor;
-
-/** Basic function to execute min and max location. This function calls the following NEON kernels:
- *
- * -# @ref NEMinMaxKernel
- * -# @ref NEMinMaxLocationKernel
- */
-class NEMinMaxLocation : public IFunction
-{
-public:
- /** Constructor */
- NEMinMaxLocation();
- /** Initialise the kernel's inputs and outputs.
- *
- * @param[in] input Input image. Data types supported: U8/S16/F32.
- * @param[out] min Minimum value of image. Data types supported: S32 if input type is U8/S16, F32 if input type is F32.
- * @param[out] max Maximum value of image. Data types supported: S32 if input type is U8/S16, F32 if input type is F32.
- * @param[out] min_loc (Optional) Array of minimum value locations.
- * @param[out] max_loc (Optional) Array of maximum value locations.
- * @param[out] min_count (Optional) Number of minimum value encounters.
- * @param[out] max_count (Optional) Number of maximum value encounters.
- */
- void configure(const IImage *input, void *min, void *max,
- ICoordinates2DArray *min_loc = nullptr, ICoordinates2DArray *max_loc = nullptr,
- uint32_t *min_count = nullptr, uint32_t *max_count = nullptr);
-
- // Inherited methods overridden:
- void run() override;
-
-private:
- NEMinMaxKernel _min_max; /**< Kernel that performs min/max */
- NEMinMaxLocationKernel _min_max_loc; /**< Kernel that extracts min/max locations */
-};
-}
-#endif /*ARM_COMPUTE_NEMINMAXLOCATION_H */
diff --git a/arm_compute/runtime/NEON/functions/NENonLinearFilter.h b/arm_compute/runtime/NEON/functions/NENonLinearFilter.h
deleted file mode 100644
index a758e040c6..0000000000
--- a/arm_compute/runtime/NEON/functions/NENonLinearFilter.h
+++ /dev/null
@@ -1,61 +0,0 @@
-/*
- * Copyright (c) 2016-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_NENONLINEARFILTER_H
-#define ARM_COMPUTE_NENONLINEARFILTER_H
-
-#include "arm_compute/core/Types.h"
-#include "arm_compute/runtime/NEON/INESimpleFunction.h"
-
-#include <cstdint>
-
-namespace arm_compute
-{
-class ITensor;
-
-/** Basic function to execute a non-linear filter. This function calls the following NEON kernels:
- *
- * -# @ref NEFillBorderKernel (executed if border_mode == CONSTANT or border_mode == REPLICATE)
- * -# @ref NENonLinearFilterKernel
- *
- * @note Supported mask dimensions: squares of sizes 3 and 5
- */
-class NENonLinearFilter : public INESimpleFunction
-{
-public:
- /** Initialize the function's source, destination, conv and border_mode.
- *
- * @param[in, out] input Source tensor. Data type supported: U8. (Written to only for @p border_mode != UNDEFINED)
- * @param[out] output Destination tensor. Data type supported: U8
- * @param[in] function Non-linear function to perform
- * @param[in] mask_size Mask size. Supported sizes: 3, 5
- * @param[in] pattern Mask pattern
- * @param[in] mask The given mask. Used only if @p pattern is set to PATTERN_OTHER
- * @param[in] border_mode Strategy to use for borders.
- * @param[in] constant_border_value (Optional) Constant value to use for borders if border_mode is set to CONSTANT.
- */
- void configure(ITensor *input, ITensor *output, NonLinearFilterFunction function, unsigned int mask_size, MatrixPattern pattern, const uint8_t *mask, BorderMode border_mode,
- uint8_t constant_border_value = 0);
-};
-}
-#endif /*ARM_COMPUTE_NENONLINEARFILTER_H */
diff --git a/arm_compute/runtime/NEON/functions/NENonMaximaSuppression3x3.h b/arm_compute/runtime/NEON/functions/NENonMaximaSuppression3x3.h
deleted file mode 100644
index cb8b202811..0000000000
--- a/arm_compute/runtime/NEON/functions/NENonMaximaSuppression3x3.h
+++ /dev/null
@@ -1,56 +0,0 @@
-/*
- * Copyright (c) 2016-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_NENONMAXIMASUPPRESSION3X3_H
-#define ARM_COMPUTE_NENONMAXIMASUPPRESSION3X3_H
-
-#include "arm_compute/core/Types.h"
-#include "arm_compute/runtime/NEON/INESimpleFunction.h"
-
-namespace arm_compute
-{
-class ITensor;
-
-/** Basic function to execute non-maxima suppression over a 3x3 window. This function calls the following NEON kernels:
- *
- * -# @ref NEFillBorderKernel (executed if border_mode == CONSTANT or border_mode == REPLICATE)
- * -# @ref NENonMaximaSuppression3x3Kernel
- *
- */
-class NENonMaximaSuppression3x3 : public INESimpleFunction
-{
-public:
- /** Initialise the function's source, destinations and border mode.
- *
- * @note The implementation supports just 2 border modes: UNDEFINED and CONSTANT.
- * The constant value used with CONSTANT border mode is 0.
- *
- * @param[in, out] input Source tensor. Data type supported: U8/F32. (Written to only for @p border_mode != UNDEFINED)
- * @param[out] output Destination for the Non-Maxima suppressions 3x3. Data type supported: same as @p input
- * @param[in] border_mode Border mode to use for non-maxima suppression. The implementation supports just 2 border modes: UNDEFINED and CONSTANT
- *
- */
- void configure(ITensor *input, ITensor *output, BorderMode border_mode);
-};
-}
-#endif /* ARM_COMPUTE_NENONMAXIMASUPPRESSION3X3_H */
diff --git a/arm_compute/runtime/NEON/functions/NENormalizationLayer.h b/arm_compute/runtime/NEON/functions/NENormalizationLayer.h
index af34147bfe..27e3fa674e 100644
--- a/arm_compute/runtime/NEON/functions/NENormalizationLayer.h
+++ b/arm_compute/runtime/NEON/functions/NENormalizationLayer.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2021 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -24,14 +24,11 @@
#ifndef ARM_COMPUTE_NENORMALIZATIONLAYER_H
#define ARM_COMPUTE_NENORMALIZATIONLAYER_H
-#include "arm_compute/runtime/IFunction.h"
-
-#include "arm_compute/core/NEON/kernels/NEFillBorderKernel.h"
-#include "arm_compute/core/NEON/kernels/NENormalizationLayerKernel.h"
-#include "arm_compute/core/NEON/kernels/NEPixelWiseMultiplicationKernel.h"
#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/IFunction.h"
#include "arm_compute/runtime/IMemoryManager.h"
#include "arm_compute/runtime/MemoryGroup.h"
+#include "arm_compute/runtime/NEON/functions/NEPixelWiseMultiplication.h"
#include "arm_compute/runtime/Tensor.h"
#include <memory>
@@ -39,10 +36,11 @@
namespace arm_compute
{
class ITensor;
+class NENormalizationLayerKernel;
-/** Basic function to compute a normalization layer. This function calls the following NEON kernels:
+/** Basic function to compute a normalization layer. This function calls the following kernels:
*
- * -# @ref NEPixelWiseMultiplicationKernel
+ * -# @ref NEPixelWiseMultiplication
* -# @ref NEFillBorderKernel
* -# @ref NENormalizationLayerKernel
*
@@ -52,8 +50,28 @@ class NENormalizationLayer : public IFunction
public:
/** Default constructor */
NENormalizationLayer(std::shared_ptr<IMemoryManager> memory_manager = nullptr);
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ NENormalizationLayer(const NENormalizationLayer &) = delete;
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ NENormalizationLayer &operator=(const NENormalizationLayer &) = delete;
+ /** Prevent instances of this class from being moved (As this class contains non movable objects) */
+ NENormalizationLayer(NENormalizationLayer &&) = delete;
+ /** Prevent instances of this class from being moved (As this class contains non movable objects) */
+ NENormalizationLayer &operator=(NENormalizationLayer &&) = delete;
+ /** Default destructor */
+ ~NENormalizationLayer();
/** Set the input and output tensors.
*
+ * Valid data layouts:
+ * - NHWC
+ * - NCHW
+ *
+ * Valid data type configurations:
+ * |src |dst |
+ * |:--------|:---------|
+ * |F32 |F32 |
+ * |F16 |F16 |
+ *
* @param[in] input Source tensor. 3 lower dims represent a single input with dimensions [width, height, IFM],
* and an optional 4th dimension for batch of inputs. Data type supported: F16/F32. Data layouts supported: NCHW/NHWC.
* @param[out] output Destination with the same dimensions, data type, data layout and number of channels of @p input
@@ -69,17 +87,17 @@ public:
*
* @return a status
*/
- static Status validate(const ITensorInfo *input, const ITensorInfo *output, const NormalizationLayerInfo &norm_info);
+ static Status
+ validate(const ITensorInfo *input, const ITensorInfo *output, const NormalizationLayerInfo &norm_info);
// Inherited methods overridden:
void run() override;
private:
- MemoryGroup _memory_group; /**< Function memory group */
- NENormalizationLayerKernel _norm_kernel; /**< Normalization layer kernel */
- NEPixelWiseMultiplicationKernel _multiply_kernel; /**< Pixel multiplication kernel */
- NEFillBorderKernel _border_handler; /**< Kernel to handle borders */
- Tensor _input_squared; /**< The intermediate buffer which stores results of squaring input */
+ MemoryGroup _memory_group; /**< Function memory group */
+ std::unique_ptr<NENormalizationLayerKernel> _norm_kernel; /**< Normalization layer kernel */
+ NEPixelWiseMultiplication _multiply_f; /**< Pixel multiplication function */
+ Tensor _input_squared; /**< The intermediate buffer which stores results of squaring input */
};
-}
+} // namespace arm_compute
#endif /* ARM_COMPUTE_NENORMALIZATIONLAYER_H */
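A cross-map LRN sketch using the updated function; the NormalizationLayerInfo constructor arguments (norm type and window size, with library defaults for alpha/beta) are assumptions:

#include "arm_compute/runtime/NEON/functions/NENormalizationLayer.h"

using namespace arm_compute;

void lrn_sketch(const ITensor *input, ITensor *output)
{
    NENormalizationLayer norm; // a shared IMemoryManager may be passed to the constructor
    // Classic local response normalization across 5 neighbouring feature maps.
    norm.configure(input, output, NormalizationLayerInfo(NormType::CROSS_MAP, 5));
    norm.run();
}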
diff --git a/arm_compute/runtime/NEON/functions/NEOpticalFlow.h b/arm_compute/runtime/NEON/functions/NEOpticalFlow.h
deleted file mode 100644
index 95068aaee0..0000000000
--- a/arm_compute/runtime/NEON/functions/NEOpticalFlow.h
+++ /dev/null
@@ -1,102 +0,0 @@
-/*
- * Copyright (c) 2016-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_NEOPTICALFLOW_H
-#define ARM_COMPUTE_NEOPTICALFLOW_H
-
-#include "arm_compute/core/IArray.h"
-#include "arm_compute/core/NEON/kernels/NELKTrackerKernel.h"
-#include "arm_compute/core/Types.h"
-#include "arm_compute/runtime/Array.h"
-#include "arm_compute/runtime/IFunction.h"
-#include "arm_compute/runtime/IMemoryManager.h"
-#include "arm_compute/runtime/MemoryGroup.h"
-#include "arm_compute/runtime/NEON/functions/NEScharr3x3.h"
-#include "arm_compute/runtime/Tensor.h"
-
-#include <cstddef>
-#include <cstdint>
-#include <memory>
-
-namespace arm_compute
-{
-class Pyramid;
-
-/** Array of LK Internal Keypoints */
-using LKInternalKeypointArray = Array<NELKInternalKeypoint>;
-/** Basic function to execute optical flow. This function calls the following NEON kernels and functions:
- *
- * -# @ref NEScharr3x3
- * -# @ref NELKTrackerKernel
- *
- */
-class NEOpticalFlow : public IFunction
-{
-public:
- /** Constructor
- *
- * @param[in] memory_manager (Optional) Memory manager.
- */
- NEOpticalFlow(std::shared_ptr<IMemoryManager> memory_manager = nullptr);
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- NEOpticalFlow(const NEOpticalFlow &) = delete;
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- NEOpticalFlow &operator=(const NEOpticalFlow &) = delete;
- /** Initialise the function input and output
- *
- * @param[in] old_pyramid Pointer to the pyramid for the old tensor. Data type supported: U8
- * @param[in] new_pyramid Pointer to the pyramid for the new tensor. Data type supported: U8
- * @param[in] old_points Pointer to the IKeyPointArray storing old key points
- * @param[in] new_points_estimates Pointer to the IKeyPointArray storing the estimated new key points
- * @param[out] new_points Pointer to the IKeyPointArray storing new key points
- * @param[in] termination The criteria to terminate the search of each keypoint.
- * @param[in] epsilon The error for terminating the algorithm
- * @param[in] num_iterations The maximum number of iterations before terminating the algorithm
- * @param[in] window_dimension The size of the window on which to perform the algorithm
- * @param[in] use_initial_estimate The flag to indicate whether the initial estimated position should be used
- * @param[in] border_mode The border mode applied at the Scharr kernel stage
- * @param[in] constant_border_value (Optional) Constant value to use for borders if border_mode is set to CONSTANT
- *
- */
- void configure(const Pyramid *old_pyramid, const Pyramid *new_pyramid, const IKeyPointArray *old_points, const IKeyPointArray *new_points_estimates,
- IKeyPointArray *new_points, Termination termination, float epsilon, unsigned int num_iterations, size_t window_dimension,
- bool use_initial_estimate, BorderMode border_mode, uint8_t constant_border_value = 0);
-
- // Inherited methods overridden:
- void run() override;
-
-private:
- MemoryGroup _memory_group;
- std::vector<NEScharr3x3> _func_scharr;
- std::vector<NELKTrackerKernel> _kernel_tracker;
- std::vector<Tensor> _scharr_gx;
- std::vector<Tensor> _scharr_gy;
- IKeyPointArray *_new_points;
- const IKeyPointArray *_new_points_estimates;
- const IKeyPointArray *_old_points;
- LKInternalKeypointArray _new_points_internal;
- LKInternalKeypointArray _old_points_internal;
- unsigned int _num_levels;
-};
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_NEOPTICALFLOW_H */
diff --git a/arm_compute/runtime/NEON/functions/NEPReluLayer.h b/arm_compute/runtime/NEON/functions/NEPReluLayer.h
index 102a165383..81d5fd162c 100644
--- a/arm_compute/runtime/NEON/functions/NEPReluLayer.h
+++ b/arm_compute/runtime/NEON/functions/NEPReluLayer.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019 ARM Limited.
+ * Copyright (c) 2019-2021 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -25,21 +25,47 @@
#define ARM_COMPUTE_NEPRELULAYER_H
#include "arm_compute/core/Types.h"
-#include "arm_compute/runtime/NEON/INESimpleFunction.h"
+#include "arm_compute/runtime/IFunction.h"
+
+#include <memory>
namespace arm_compute
{
class ITensor;
+class ITensorInfo;
-/** Basic function to run @ref NEArithmeticOperationKernel for PRELU
+/** Basic function to run @ref cpu::kernels::CpuArithmeticKernel for PRELU
*
* @note The function implements an activation layer with the PRELU activation function.
*/
-class NEPReluLayer : public INESimpleFunction
+class NEPReluLayer : public IFunction
{
public:
+ /** Default Constructor */
+ NEPReluLayer();
+ /** Default Destructor */
+ ~NEPReluLayer();
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ NEPReluLayer(const NEPReluLayer &) = delete;
+ /** Default move constructor */
+ NEPReluLayer(NEPReluLayer &&);
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ NEPReluLayer &operator=(const NEPReluLayer &) = delete;
+ /** Default move assignment operator */
+ NEPReluLayer &operator=(NEPReluLayer &&);
/** Set the input and output tensor.
*
+ * Valid data layouts:
+ * - All
+ *
+ * Valid data type configurations:
+ * |src |dst |
+ * |:--------------|:--------------|
+ * |QASYMM8 |QASYMM8 |
+ * |QASYMM8_SIGNED |QASYMM8_SIGNED |
+ * |F16 |F16 |
+ * |F32 |F32 |
+ *
* @param[in] input Source tensor. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32.
* @param[in] alpha Source alpha tensor. Data types supported: same as @p input.
* @param[out] output Destination tensor. Data type supported: same as @p input
@@ -54,6 +80,13 @@ public:
* @return a status
*/
static Status validate(const ITensorInfo *input, const ITensorInfo *alpha, const ITensorInfo *output);
+
+ // Inherited methods overridden:
+ void run() override;
+
+private:
+ struct Impl;
+ std::unique_ptr<Impl> _impl;
};
} // namespace arm_compute
#endif /* ARM_COMPUTE_NEPRELULAYER_H */
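A sketch of the validate-then-configure pattern with the reworked class; the configure parameter order follows the docs above, and the bool conversion of Status is assumed from the library's error API:

#include "arm_compute/core/ITensor.h"
#include "arm_compute/runtime/NEON/functions/NEPReluLayer.h"

using namespace arm_compute;

void prelu_sketch(const ITensor *input, const ITensor *alpha, ITensor *output)
{
    // Check the configuration first; Status is assumed to convert to bool (OK).
    if (bool(NEPReluLayer::validate(input->info(), alpha->info(), output->info())))
    {
        NEPReluLayer prelu;
        prelu.configure(input, alpha, output); // assumed parameter order per the docs above
        prelu.run();
    }
}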
diff --git a/arm_compute/runtime/NEON/functions/NEPadLayer.h b/arm_compute/runtime/NEON/functions/NEPadLayer.h
index d3074e70bc..494b1c0641 100644
--- a/arm_compute/runtime/NEON/functions/NEPadLayer.h
+++ b/arm_compute/runtime/NEON/functions/NEPadLayer.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018-2019 ARM Limited.
+ * Copyright (c) 2018-2021 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -24,24 +24,26 @@
#ifndef ARM_COMPUTE_NEPADLAYER_H
#define ARM_COMPUTE_NEPADLAYER_H
+#include "arm_compute/core/Types.h"
#include "arm_compute/runtime/IFunction.h"
#include "arm_compute/runtime/NEON/functions/NEConcatenateLayer.h"
+#include "arm_compute/runtime/NEON/functions/NECopy.h"
#include "arm_compute/runtime/NEON/functions/NEStridedSlice.h"
#include "arm_compute/runtime/SubTensor.h"
-
-#include "arm_compute/core/NEON/kernels/NECopyKernel.h"
-#include "arm_compute/core/NEON/kernels/NEPadLayerKernel.h"
-#include "arm_compute/core/Types.h"
#include "arm_compute/runtime/Tensor.h"
+#include <memory>
+
namespace arm_compute
{
-/** Basic function to pad a tensor. This function calls the following NEON functions/kernels:
+class NEPadLayerKernel;
+
+/** Basic function to pad a tensor. This function calls the following functions/kernels:
*
* - For padding mode = PaddingMode::CONSTANT:
* -# @ref NEPadLayerKernel
* - Otherwise:
- * -# @ref NECopyKernel
+ * -# @ref NECopy
* -# @ref NEStridedSlice
* -# @ref NEConcatenateLayer
*
@@ -49,10 +51,29 @@ namespace arm_compute
class NEPadLayer : public IFunction
{
public:
- /** Default constructor*/
+ /** Default Constructor */
NEPadLayer();
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ NEPadLayer(const NEPadLayer &) = delete;
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ NEPadLayer &operator=(const NEPadLayer &) = delete;
+ /** Prevent instances of this class from being moved (As this class contains non movable objects) */
+ NEPadLayer(NEPadLayer &&) = delete;
+ /** Prevent instances of this class from being moved (As this class contains non movable objects) */
+ NEPadLayer &operator=(NEPadLayer &&) = delete;
+ /** Default destructor */
+ ~NEPadLayer();
/** Initialize the function
*
+ * Valid data layouts:
+ * - NHWC
+ * - NCHW
+ *
+ * Valid data type configurations:
+ * |src |dst |
+ * |:--------|:---------|
+ * |All |All |
+ *
* @param[in] input Source tensor. Data types supported: All.
* @param[out] output Output tensor. Data type supported: same as @p input
* @param[in] padding The padding for each spatial dimension of the input tensor. The pair padding[i]
@@ -61,7 +82,11 @@ public:
* @param[in] mode (Optional) Controls whether the padding should be filled with @p constant_value using CONSTANT,
* or reflect the input, either including the border values (SYMMETRIC) or not (REFLECT).
*/
- void configure(ITensor *input, ITensor *output, const PaddingList &padding, const PixelValue constant_value = PixelValue(), const PaddingMode mode = PaddingMode::CONSTANT);
+ void configure(ITensor *input,
+ ITensor *output,
+ const PaddingList &padding,
+ const PixelValue constant_value = PixelValue(),
+ const PaddingMode mode = PaddingMode::CONSTANT);
/** Static function to check if given info will lead to a valid configuration of @ref NEPadLayer.
*
* @param[in] input Source tensor info. Data types supported: All.
@@ -74,7 +99,11 @@ public:
*
* @return a status
*/
- static Status validate(const ITensorInfo *input, const ITensorInfo *output, const PaddingList &padding, const PixelValue constant_value = PixelValue(), const PaddingMode mode = PaddingMode::CONSTANT);
+ static Status validate(const ITensorInfo *input,
+ const ITensorInfo *output,
+ const PaddingList &padding,
+ const PixelValue constant_value = PixelValue(),
+ const PaddingMode mode = PaddingMode::CONSTANT);
// Inherited methods overridden:
void run() override;
@@ -88,7 +117,10 @@ private:
* specifies the front and the end padding in the i-th dimension.
* @param[in] constant_value Constant value to be used for the padding
*/
- void configure_constant_mode(ITensor *input, ITensor *output, const PaddingList &padding, const PixelValue constant_value);
+ void configure_constant_mode(ITensor *input,
+ ITensor *output,
+ const PaddingList &padding,
+ const PixelValue constant_value);
/** Configure functions for when reflect or symmetric padding is used.
*
* @param[in] input Source tensor. Data types supported: All.
@@ -97,15 +129,15 @@ private:
void configure_reflect_symmetric_mode(ITensor *input, ITensor *output);
private:
- NECopyKernel _copy_kernel;
- NEPadLayerKernel _pad_kernel;
- PaddingMode _mode;
- PaddingList _padding;
- uint32_t _num_dimensions;
- std::vector<NEStridedSlice> _slice_functions;
- std::vector<NEConcatenateLayer> _concat_functions;
- std::vector<Tensor> _slice_results;
- std::vector<Tensor> _concat_results;
+ NECopy _copy_function;
+ std::unique_ptr<NEPadLayerKernel> _pad_kernel;
+ PaddingMode _mode;
+ PaddingList _padding;
+ uint32_t _num_dimensions;
+ std::vector<NEStridedSlice> _slice_functions;
+ std::vector<NEConcatenateLayer> _concat_functions;
+ std::vector<Tensor> _slice_results;
+ std::vector<Tensor> _concat_results;
};
} // namespace arm_compute
#endif /*ARM_COMPUTE_NEPADLAYER_H */
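A sketch of constant-mode padding with the reworked class, assuming PaddingList is the usual vector of (before, after) pairs referenced by the docs above:

#include "arm_compute/runtime/NEON/functions/NEPadLayer.h"

using namespace arm_compute;

void pad_sketch(ITensor *input, ITensor *output)
{
    // Pad dimension 0 by 1 element before/after and dimension 1 by 2 before/after,
    // filling with zeros (the default PixelValue) in the default CONSTANT mode.
    const PaddingList padding = {{1, 1}, {2, 2}};
    NEPadLayer pad;
    pad.configure(input, output, padding);
    pad.run();
}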
diff --git a/arm_compute/runtime/NEON/functions/NEPermute.h b/arm_compute/runtime/NEON/functions/NEPermute.h
index 4651b30e8e..2cef64764d 100644
--- a/arm_compute/runtime/NEON/functions/NEPermute.h
+++ b/arm_compute/runtime/NEON/functions/NEPermute.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018-2019 ARM Limited.
+ * Copyright (c) 2018-2021 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -24,20 +24,43 @@
#ifndef ARM_COMPUTE_NEPERMUTE_H
#define ARM_COMPUTE_NEPERMUTE_H
-#include "arm_compute/runtime/NEON/INESimpleFunctionNoBorder.h"
-
#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/IFunction.h"
+
+#include <memory>
namespace arm_compute
{
// Forward declarations
class ITensor;
+class ITensorInfo;
-/** Basic function to run @ref NEPermuteKernel */
-class NEPermute : public INESimpleFunctionNoBorder
+/** Basic function to run @ref cpu::kernels::CpuPermuteKernel */
+class NEPermute : public IFunction
{
public:
- /** Configure the permute NEON kernel
+ /** Default Constructor */
+ NEPermute();
+ /** Default Destructor */
+ ~NEPermute();
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ NEPermute(const NEPermute &) = delete;
+ /** Default move constructor */
+ NEPermute(NEPermute &&) = default;
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ NEPermute &operator=(const NEPermute &) = delete;
+ /** Default move assignment operator */
+ NEPermute &operator=(NEPermute &&) = default;
+ /** Configure the permute function
+ *
+ * Valid data layouts:
+ * - NHWC
+ * - NCHW
+ *
+ * Valid data type configurations:
+ * |src |dst |
+ * |:------|:------|
+ * |All |All |
*
* @note Arbitrary permutation vectors are supported with rank not greater than 4
*
@@ -57,6 +80,13 @@ public:
* @return a status
*/
static Status validate(const ITensorInfo *input, const ITensorInfo *output, const PermutationVector &perm);
+
+ // Inherited methods overridden
+ void run() override;
+
+private:
+ struct Impl;
+ std::unique_ptr<Impl> _impl;
};
} // namespace arm_compute
#endif /* ARM_COMPUTE_NEPERMUTE_H */
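A sketch of the reworked function; the configure overload mirrors the validate signature above, and the concrete permutation vector is illustrative only:

#include "arm_compute/runtime/NEON/functions/NEPermute.h"

using namespace arm_compute;

void permute_sketch(const ITensor *input, ITensor *output)
{
    // Illustrative rank-3 reordering; PermutationVector(1U, 2U, 0U) is the kind of
    // vector used for NCHW <-> NHWC conversions (direction depends on the source layout).
    NEPermute permute;
    permute.configure(input, output, PermutationVector(1U, 2U, 0U));
    permute.run();
}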
diff --git a/arm_compute/runtime/NEON/functions/NEPhase.h b/arm_compute/runtime/NEON/functions/NEPhase.h
deleted file mode 100644
index 220681e9f1..0000000000
--- a/arm_compute/runtime/NEON/functions/NEPhase.h
+++ /dev/null
@@ -1,47 +0,0 @@
-/*
- * Copyright (c) 2016-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_NEPHASE_H
-#define ARM_COMPUTE_NEPHASE_H
-
-#include "arm_compute/runtime/NEON/INESimpleFunctionNoBorder.h"
-
-namespace arm_compute
-{
-class ITensor;
-
-/** Basic function to run @ref NEMagnitudePhaseKernel */
-class NEPhase : public INESimpleFunctionNoBorder
-{
-public:
- /** Initialise the kernel's inputs, output.
- *
- * @param[in] input1 First tensor input. Data type supported: S16.
- * @param[in] input2 Second tensor input. Data type supported: S16.
- * @param[out] output Output tensor. Data type supported: U8.
- * @param[in] phase_type (Optional) Phase calculation type. Default: SIGNED.
- */
- void configure(const ITensor *input1, const ITensor *input2, ITensor *output, PhaseType phase_type = PhaseType::SIGNED);
-};
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_NEPHASE_H */
diff --git a/arm_compute/runtime/NEON/functions/NEPixelWiseMultiplication.h b/arm_compute/runtime/NEON/functions/NEPixelWiseMultiplication.h
index 2b31032931..3d81bf6087 100644
--- a/arm_compute/runtime/NEON/functions/NEPixelWiseMultiplication.h
+++ b/arm_compute/runtime/NEON/functions/NEPixelWiseMultiplication.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2016-2020 ARM Limited.
+ * Copyright (c) 2016-2021, 2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -24,25 +24,60 @@
#ifndef ARM_COMPUTE_NEPIXELWISEMULTIPLICATION_H
#define ARM_COMPUTE_NEPIXELWISEMULTIPLICATION_H
+#include "arm_compute/core/Rounding.h"
#include "arm_compute/core/Types.h"
-#include "arm_compute/runtime/NEON/INESimpleFunction.h"
+#include "arm_compute/function_info/ActivationLayerInfo.h"
+#include "arm_compute/runtime/IFunction.h"
+
+#include <memory>
namespace arm_compute
{
class ITensor;
+class ITensorInfo;
-/** Basic function to run @ref NEPixelWiseMultiplicationKernel */
-class NEPixelWiseMultiplication : public INESimpleFunction
+/** Basic function to run @ref cpu::CpuMul */
+class NEPixelWiseMultiplication : public IFunction
{
public:
+ /** Default Constructor */
+ NEPixelWiseMultiplication();
+ /** Default Destructor */
+ ~NEPixelWiseMultiplication();
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ NEPixelWiseMultiplication(const NEPixelWiseMultiplication &) = delete;
+ /** Default move constructor */
+ NEPixelWiseMultiplication(NEPixelWiseMultiplication &&) = default;
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ NEPixelWiseMultiplication &operator=(const NEPixelWiseMultiplication &) = delete;
+ /** Default move assignment operator */
+ NEPixelWiseMultiplication &operator=(NEPixelWiseMultiplication &&) = default;
/** Initialise the kernel's inputs, output and conversion policy.
*
+ * Valid data layouts:
+ * - All
+ *
+ * Valid data type configurations:
+ * |src0 |src1 |dst |
+ * |:--------------|:--------------|:--------------|
+ * |QASYMM8 |QASYMM8 |QASYMM8 |
+ * |QASYMM8_SIGNED |QASYMM8_SIGNED |QASYMM8_SIGNED |
+ * |QSYMM16 |QSYMM16 |QSYMM16 |
+ * |QSYMM16 |QSYMM16 |S32 |
+ * |U8 |U8 |U8 |
+ * |U8 |U8 |S16 |
+ * |U8 |S16 |S16 |
+ * |S16 |U8 |S16 |
+ * |S16 |S16 |S16 |
+ * |F16 |F16 |F16 |
+ * |F32 |S32 |F32 |
+ *
* @note For @p scale equal to 1/255 only round to nearest even (implemented as round half up) is supported.
* For all other scale values only round to zero (implemented as round towards minus infinity) is supported.
*
- * @param[in, out] input1 An input tensor. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/F16/F32
+ * @param[in, out] input1 An input tensor. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/S32/QSYMM16/F16/F32
* This input tensor is [in, out] because its TensorInfo might be modified inside the kernel in case of broadcasting of dimension 0.
- * @param[in, out] input2 An input tensor. Data types supported: U8, QASYMM8 (only if @p input1 is QASYMM8), QASYMM8_SIGNED (only if @p input1 is QASYMM8_SIGNED), S16, QSYMM16 (only if @p input1 is QSYMM16), F16 (only if @p input1 is F16), F32 (only if @p input1 is F32).
+ * @param[in, out] input2 An input tensor. Data types supported: U8, QASYMM8 (only if @p input1 is QASYMM8), QASYMM8_SIGNED (only if @p input1 is QASYMM8_SIGNED), S16, S32, QSYMM16 (only if @p input1 is QSYMM16), F16 (only if @p input1 is F16), F32 (only if @p input1 is F32).
* This input tensor is [in, out] because its TensorInfo might be modified inside the kernel in case of broadcasting of dimension 0.
* @param[out] output Output tensor. Data types supported:
* - U8, only if both inputs are U8.
@@ -50,49 +85,80 @@ public:
* - QASYMM8_SIGNED, only if @p input1 is QASYMM8_SIGNED.
* - S16.
* - QSYMM16, only if both inputs are QSYMM16.
- * - S32, only if both inputs are QSYMM16.
+ * - S32, only if both inputs are S32 or both are QSYMM16.
* - F16, only if @p input1 is F16.
* - F32, only if both inputs are F32.
* @param[in] scale Scale to apply after multiplication.
* Scale must be positive and its value must be either 1/255 or 1/2^n where n is between 0 and 15.
- * @param[in] overflow_policy Overflow policy. ConvertPolicy cannot be WRAP if datatype is QASYMM8, QASYMM8_SIGNED or QSYMM16.
+ * If @p input1, @p input2 and @p output are all of datatype S32, scale cannot be 1/255
+ * @param[in] overflow_policy Overflow policy. ConvertPolicy cannot be WRAP if any of the inputs is of quantized datatype
* @param[in] rounding_policy Rounding policy.
* @param[in] act_info (Optional) Activation layer information in case of a fused activation. Currently not supported.
*/
- void configure(ITensor *input1, ITensor *input2, ITensor *output, float scale, ConvertPolicy overflow_policy, RoundingPolicy rounding_policy,
+ void configure(const ITensor *input1,
+ const ITensor *input2,
+ ITensor *output,
+ float scale,
+ ConvertPolicy overflow_policy,
+ RoundingPolicy rounding_policy,
const ActivationLayerInfo &act_info = ActivationLayerInfo());
/** Static function to check if given info will lead to a valid configuration of @ref NEPixelWiseMultiplication
*
* @note For @p scale equal to 1/255 only round to nearest even (implemented as round half up) is supported.
* For all other scale values only round to zero (implemented as round towards minus infinity) is supported.
*
- * @param[in] input1 An input tensor info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/F16/F32
- * @param[in] input2 An input tensor info. Data types supported: U8, QASYMM8 (only if @p input1 is QASYMM8), QASYMM8_SIGNED (only if @p input1 is QASYMM8_SIGNED), S16, QSYMM16 (only if both inputs are QSYMM16), F16 (only if @p input1 is F16), F32 (only if @p input1 is F32).
+ * @param[in] input1 An input tensor info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/S32/QSYMM16/F16/F32
+ * @param[in] input2 An input tensor info. Data types supported: U8, QASYMM8 (only if @p input1 is QASYMM8), QASYMM8_SIGNED (only if @p input1 is QASYMM8_SIGNED), S16, S32, QSYMM16 (only if both inputs are QSYMM16), F16 (only if @p input1 is F16), F32 (only if @p input1 is F32).
* @param[in] output Output tensor info. Data types supported:
* - U8, only if both inputs are U8.
* - QASYMM8, only if both inputs are QASYMM8.
* - QASYMM8_SIGNED, only if @p input1 is QASYMM8_SIGNED.
* - S16.
* - QSYMM16, only if both inputs are QSYMM16.
- * - S32, only if both inputs are QSYMM16.
+ * - S32, only if both inputs are S32 or both are QSYMM16.
* - F16, only if @p input1 is F16.
* - F32, only if both inputs are F32.
* @param[in] scale Scale to apply after multiplication.
* Scale must be positive and its value must be either 1/255 or 1/2^n where n is between 0 and 15.
- * @param[in] overflow_policy Overflow policy. ConvertPolicy cannot be WRAP if datatype is QASYMM8, QASYMM8_SIGNED or QSYMM16.
+ * If @p input1, @p input2 and @p output are all of datatype S32, scale cannot be 1/255
+ * @param[in] overflow_policy Overflow policy. ConvertPolicy cannot be WRAP if any of the inputs is of quantized datatype
* @param[in] rounding_policy Rounding policy.
* @param[in] act_info (Optional) Activation layer information in case of a fused activation. Currently not supported.
*
* @return a status
*/
- static Status validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, float scale, ConvertPolicy overflow_policy, RoundingPolicy rounding_policy,
+ static Status validate(const ITensorInfo *input1,
+ const ITensorInfo *input2,
+ const ITensorInfo *output,
+ float scale,
+ ConvertPolicy overflow_policy,
+ RoundingPolicy rounding_policy,
const ActivationLayerInfo &act_info = ActivationLayerInfo());
+
+ // Inherited methods overridden:
+ void run() override;
+
+private:
+ struct Impl;
+ std::unique_ptr<Impl> _impl;
};
-/** Basic function to run @ref NEComplexPixelWiseMultiplicationKernel. */
-class NEComplexPixelWiseMultiplication : public INESimpleFunction
+/** Basic function to run @ref cpu::CpuComplexMul. */
+class NEComplexPixelWiseMultiplication : public IFunction
{
public:
+ /** Default Constructor */
+ NEComplexPixelWiseMultiplication();
+ /** Default Destructor */
+ ~NEComplexPixelWiseMultiplication();
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ NEComplexPixelWiseMultiplication(const NEComplexPixelWiseMultiplication &) = delete;
+ /** Default move constructor */
+ NEComplexPixelWiseMultiplication(NEComplexPixelWiseMultiplication &&) = default;
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ NEComplexPixelWiseMultiplication &operator=(const NEComplexPixelWiseMultiplication &) = delete;
+ /** Default move assignment operator */
+ NEComplexPixelWiseMultiplication &operator=(NEComplexPixelWiseMultiplication &&) = default;
/** Initialise the kernel's inputs, output.
*
* @param[in, out] input1 An input tensor. Data types supported: F32. Number of channels supported: 2 (complex tensor).
@@ -102,7 +168,10 @@ public:
* @param[out] output The output tensor. Data types supported: same as @p input1. Number of channels: same as @p input1.
* @param[in] act_info (Optional) Activation layer information in case of a fused activation. Currently not supported.
*/
- void configure(ITensor *input1, ITensor *input2, ITensor *output, const ActivationLayerInfo &act_info = ActivationLayerInfo());
+ void configure(ITensor *input1,
+ ITensor *input2,
+ ITensor *output,
+ const ActivationLayerInfo &act_info = ActivationLayerInfo());
/** Static function to check if given info will lead to a valid configuration of @ref NEComplexPixelWiseMultiplication
*
* @param[in] input1 An input tensor info. Data types supported: F32. Number of channels supported: 2 (complex tensor).
@@ -110,7 +179,17 @@ public:
* @param[in] output The output tensor info. Data types supported: same as @p input1. Number of channels supported: same as @p input1.
* @param[in] act_info (Optional) Activation layer information in case of a fused activation. Currently not supported.
*/
- static Status validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, const ActivationLayerInfo &act_info = ActivationLayerInfo());
+ static Status validate(const ITensorInfo *input1,
+ const ITensorInfo *input2,
+ const ITensorInfo *output,
+ const ActivationLayerInfo &act_info = ActivationLayerInfo());
+
+ // Inherited methods overridden:
+ void run() override;
+
+private:
+ struct Impl;
+ std::unique_ptr<Impl> _impl;
};
-}
+} // namespace arm_compute
#endif /*ARM_COMPUTE_NEPIXELWISEMULTIPLICATION_H */
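A sketch using the updated configure signature above; a scale of 1 satisfies the 1/2^n constraint, and TO_ZERO is the rounding policy the note requires for scales other than 1/255:

#include "arm_compute/runtime/NEON/functions/NEPixelWiseMultiplication.h"

using namespace arm_compute;

void mul_sketch(const ITensor *a, const ITensor *b, ITensor *out)
{
    NEPixelWiseMultiplication mul;
    // Element-wise a * b with unit scale; saturate on overflow, truncate when rounding.
    mul.configure(a, b, out, 1.f, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO);
    mul.run();
}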
diff --git a/arm_compute/runtime/NEON/functions/NEPooling3dLayer.h b/arm_compute/runtime/NEON/functions/NEPooling3dLayer.h
new file mode 100644
index 0000000000..09251f2a5f
--- /dev/null
+++ b/arm_compute/runtime/NEON/functions/NEPooling3dLayer.h
@@ -0,0 +1,96 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_NEPOOLING3DLAYER_H
+#define ARM_COMPUTE_NEPOOLING3DLAYER_H
+
+#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/IFunction.h"
+
+#include <memory>
+
+namespace arm_compute
+{
+// Forward declarations
+class ITensor;
+class ITensorInfo;
+class IMemoryManager;
+/** Basic function to simulate a pooling 3d layer with the specified pooling operation. This function calls the following kernels:
+ *
+ * -# @ref cpu::CpuPool3d
+ */
+class NEPooling3dLayer : public IFunction
+{
+public:
+ /** Constructor */
+ NEPooling3dLayer(std::shared_ptr<IMemoryManager> memory_manager = nullptr);
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ NEPooling3dLayer(const NEPooling3dLayer &) = delete;
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ NEPooling3dLayer &operator=(const NEPooling3dLayer &) = delete;
+ /** Prevent instances of this class from being moved (As this class contains non movable objects) */
+ NEPooling3dLayer(NEPooling3dLayer &&) = delete;
+ /** Prevent instances of this class from being moved (As this class contains non movable objects) */
+ NEPooling3dLayer &operator=(NEPooling3dLayer &&) = delete;
+ /** Default destructor */
+ ~NEPooling3dLayer();
+ /** Set the input and output tensors.
+ *
+ * Valid data layouts:
+ * - NDHWC
+ *
+ * Valid data type configurations:
+ * |src |dst |
+ * |:--------------|:--------------|
+ * |F16 |F16 |
+ * |F32 |F32 |
+ * |QASYMM8 |QASYMM8 |
+ * |QASYMM8_SIGNED |QASYMM8_SIGNED |
+ *
+ * @note Source tensor is padded with -inf for MAX pooling and 0 otherwise
+ *
+ * @param[in] input Source tensor. Data types supported: F16/F32/QASYMM8/QASYMM8_SIGNED.
+ * @param[out] output Destination tensor.
+ * @param[in] pool_info Contains pooling operation information described in @ref Pooling3dLayerInfo.
+ */
+ void configure(const ITensor *input, ITensor *output, const Pooling3dLayerInfo &pool_info);
+ /** Static function to check if given info will lead to a valid configuration of @ref NEPooling3dLayer
+ *
+     *
+ * @param[in] output Destination tensor info.
+ * @param[in] pool_info Contains pooling operation information described in @ref Pooling3dLayerInfo.
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *input, const ITensorInfo *output, const Pooling3dLayerInfo &pool_info);
+
+ // Inherited methods overridden:
+ void run() override;
+
+private:
+ struct Impl;
+ std::unique_ptr<Impl> _impl;
+};
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_NEPOOLING3DLAYER_H */
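
A hedged configure/run sketch for the new 3D pooling function (the shapes, the NDHWC shape ordering shown, and the Pooling3dLayerInfo constructor arguments are illustrative assumptions, not taken from the patch):

#include "arm_compute/runtime/NEON/functions/NEPooling3dLayer.h"
#include "arm_compute/runtime/Tensor.h"

using namespace arm_compute;

int main()
{
    // NDHWC source: TensorShape orders dimensions fastest-first, i.e. [C, W, H, D, N].
    TensorInfo src_info(TensorShape(8U, 16U, 16U, 4U, 1U), 1, DataType::F32);
    src_info.set_data_layout(DataLayout::NDHWC);

    Tensor src{}, dst{};
    src.allocator()->init(src_info);

    NEPooling3dLayer pool3d{};
    pool3d.configure(&src, &dst,
                     Pooling3dLayerInfo(PoolingType::MAX, Size3D(2, 2, 2) /* pool size */,
                                        Size3D(2, 2, 2) /* stride */, Padding3D() /* no padding */));

    src.allocator()->allocate();
    dst.allocator()->allocate(); // dst shape is inferred during configure()
    pool3d.run();
    return 0;
}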
diff --git a/arm_compute/runtime/NEON/functions/NEPoolingLayer.h b/arm_compute/runtime/NEON/functions/NEPoolingLayer.h
index e43741c95b..768ad0d818 100644
--- a/arm_compute/runtime/NEON/functions/NEPoolingLayer.h
+++ b/arm_compute/runtime/NEON/functions/NEPoolingLayer.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2017-2021 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -24,29 +24,55 @@
#ifndef ARM_COMPUTE_NEPOOLINGLAYER_H
#define ARM_COMPUTE_NEPOOLINGLAYER_H
+#include "arm_compute/core/Types.h"
#include "arm_compute/runtime/IFunction.h"
+#include "arm_compute/runtime/IMemoryManager.h"
+#include "arm_compute/runtime/MemoryGroup.h"
-#include "arm_compute/core/NEON/kernels/NEFillBorderKernel.h"
-#include "arm_compute/core/NEON/kernels/NEPoolingLayerKernel.h"
-#include "arm_compute/core/Types.h"
+#include <memory>
namespace arm_compute
{
+// Forward declarations
class ITensor;
+class ITensorInfo;
-/** Basic function to simulate a pooling layer with the specified pooling operation. This function calls the following NEON kernels:
+/** Basic function to simulate a pooling layer with the specified pooling operation. This function calls the following kernels:
*
- * -# @ref NEFillBorderKernel (executed if padding size is different from zero)
- * -# @ref NEPoolingLayerKernel
+ * -# @ref cpu::CpuPool2d
*/
class NEPoolingLayer : public IFunction
{
public:
/** Constructor */
- NEPoolingLayer();
+ NEPoolingLayer(std::shared_ptr<IMemoryManager> memory_manager = nullptr);
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ NEPoolingLayer(const NEPoolingLayer &) = delete;
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ NEPoolingLayer &operator=(const NEPoolingLayer &) = delete;
+ /** Prevent instances of this class from being moved (As this class contains non movable objects) */
+ NEPoolingLayer(NEPoolingLayer &&) = delete;
+ /** Prevent instances of this class from being moved (As this class contains non movable objects) */
+ NEPoolingLayer &operator=(NEPoolingLayer &&) = delete;
+ /** Default destructor */
+ ~NEPoolingLayer();
/** Set the input and output tensors.
*
+ * Valid data layouts:
+ * - NHWC
+ * - NCHW
+ *
+ * Valid data type configurations:
+ * |src |dst |
+ * |:--------------|:--------------|
+ * |QASYMM8 |QASYMM8 |
+ * |QASYMM8_SIGNED |QASYMM8_SIGNED |
+ * |F16 |F16 |
+ * |F32 |F32 |
+ *
* @note F16 is supported for pool sizes 2 and 3 only
+ * @note Source tensor is padded with -inf for MAX pooling and 0 otherwise
+     *       Cases where the pooling region is completely outside the input tensor are only supported for floating-point data types
*
* @param[in, out] input Source tensor. (Written to only when padding != 0) Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32.
* @param[out] output Destination tensor. Data types supported: Same as @p input.
@@ -58,23 +84,24 @@ public:
*
* @note F16 is supported for pool sizes 2 and 3 only
*
- * @param[in] input Source tensor. (Written to only when padding != 0) Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32.
- * @param[in] output Destination tensor. Data types supported: Same as @p input.
+ * @param[in] input Source tensor info. (Written to only when padding != 0) Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32.
+ * @param[in] output Destination tensor info. Data types supported: Same as @p input.
* @param[in] pool_info Contains pooling operation information described in @ref PoolingLayerInfo.
- * @param[in] indices (optional) The indices of the maximal values. Data type supported: U32.
+     * @param[in] indices   (Optional) Tensor info of the indices of the maximal values. Data type supported: U32.
*
* @return a status
*/
- static Status validate(const ITensorInfo *input, const ITensorInfo *output, const PoolingLayerInfo &pool_info, const ITensorInfo *indices = nullptr);
+ static Status validate(const ITensorInfo *input,
+ const ITensorInfo *output,
+ const PoolingLayerInfo &pool_info,
+ const ITensorInfo *indices = nullptr);
// Inherited methods overridden:
void run() override;
private:
- NEPoolingLayerKernel _pooling_layer_kernel;
- NEFillBorderKernel _border_handler;
- bool _is_global_pooling_layer;
- DataLayout _data_layout;
+ struct Impl;
+ std::unique_ptr<Impl> _impl;
};
-}
+} // namespace arm_compute
#endif /* ARM_COMPUTE_NEPOOLINGLAYER_H */
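
For contrast with the 3D variant, a sketch of the reworked 2D interface; the global-pooling PoolingLayerInfo(PoolingType, DataLayout) constructor used here is an assumption for illustration:

#include "arm_compute/runtime/NEON/functions/NEPoolingLayer.h"
#include "arm_compute/runtime/Tensor.h"

using namespace arm_compute;

int main()
{
    Tensor src{}, dst{};
    src.allocator()->init(TensorInfo(TensorShape(32U, 32U, 8U), 1, DataType::F32));

    NEPoolingLayer pool{};
    // Global average pooling over the whole 32x32 plane of each channel.
    pool.configure(&src, &dst, PoolingLayerInfo(PoolingType::AVG, DataLayout::NCHW));

    src.allocator()->allocate();
    dst.allocator()->allocate(); // dst shape is inferred during configure()
    pool.run();
    return 0;
}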
diff --git a/arm_compute/runtime/NEON/functions/NEPriorBoxLayer.h b/arm_compute/runtime/NEON/functions/NEPriorBoxLayer.h
index 242460d3a9..858e3299af 100644
--- a/arm_compute/runtime/NEON/functions/NEPriorBoxLayer.h
+++ b/arm_compute/runtime/NEON/functions/NEPriorBoxLayer.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018-2019 ARM Limited.
+ * Copyright (c) 2018-2021 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -24,13 +24,13 @@
#ifndef ARM_COMPUTE_NEPRIORBOXLAYER_H
#define ARM_COMPUTE_NEPRIORBOXLAYER_H
-#include "arm_compute/core/NEON/kernels/NEPriorBoxLayerKernel.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/runtime/NEON/INESimpleFunctionNoBorder.h"
namespace arm_compute
{
class ITensor;
+class ITensorInfo;
/** Basic function to run @ref NEPriorBoxLayerKernel. */
class NEPriorBoxLayer : public INESimpleFunctionNoBorder
@@ -38,6 +38,15 @@ class NEPriorBoxLayer : public INESimpleFunctionNoBorder
public:
/** Set the input and output tensors.
*
+ * Valid data layouts:
+ * - NHWC
+ * - NCHW
+ *
+ * Valid data type configurations:
+ * |src0 |src1 |dst |
+ * |:--------|:--------|:--------|
+ * |F32 |F32 |F32 |
+ *
* @param[in] input1 First source tensor. Data types supported: F32. Data layouts supported: NCHW/NHWC.
* @param[in] input2 Second source tensor. Data types and layouts supported: same as @p input1
* @param[out] output Destination tensor. Output dimensions are [W * H * num_priors * 4, 2]. Data type supported: same as @p input
@@ -53,7 +62,10 @@ public:
*
* @return a status
*/
- static Status validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, const PriorBoxLayerInfo &info);
+ static Status validate(const ITensorInfo *input1,
+ const ITensorInfo *input2,
+ const ITensorInfo *output,
+ const PriorBoxLayerInfo &info);
};
} // namespace arm_compute
#endif /* ARM_COMPUTE_NEPRIORBOXLAYER_H */
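
A configure sketch for the prior-box function; the feature-map and image shapes and the PriorBoxLayerInfo arguments (min sizes, variances, offset) are illustrative assumptions:

#include "arm_compute/runtime/NEON/functions/NEPriorBoxLayer.h"
#include "arm_compute/runtime/Tensor.h"

using namespace arm_compute;

int main()
{
    Tensor feat{}, img{}, out{};
    feat.allocator()->init(TensorInfo(TensorShape(4U, 4U, 256U), 1, DataType::F32));  // feature map
    img.allocator()->init(TensorInfo(TensorShape(300U, 300U, 3U), 1, DataType::F32)); // input image

    NEPriorBoxLayer prior{};
    // min_sizes, variances and offset; the remaining parameters keep their defaults.
    prior.configure(&feat, &img, &out, PriorBoxLayerInfo({30.f}, {0.1f, 0.1f, 0.2f, 0.2f}, 0.5f));

    feat.allocator()->allocate();
    img.allocator()->allocate();
    out.allocator()->allocate(); // output shape [W * H * num_priors * 4, 2] is inferred
    prior.run();
    return 0;
}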
diff --git a/arm_compute/runtime/NEON/functions/NEQLSTMLayer.h b/arm_compute/runtime/NEON/functions/NEQLSTMLayer.h
index d1cc962940..009a4e0911 100644
--- a/arm_compute/runtime/NEON/functions/NEQLSTMLayer.h
+++ b/arm_compute/runtime/NEON/functions/NEQLSTMLayer.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2020 ARM Limited.
+ * Copyright (c) 2020-2022 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -24,37 +24,46 @@
#ifndef ARM_COMPUTE_NEQLSTMLAYER_H
#define ARM_COMPUTE_NEQLSTMLAYER_H
-#include "arm_compute/core/NEON/kernels/NEArithmeticAdditionKernel.h"
-#include "arm_compute/core/NEON/kernels/NEArithmeticSubtractionKernel.h"
-#include "arm_compute/core/NEON/kernels/NECopyKernel.h"
-#include "arm_compute/core/NEON/kernels/NEGEMMLowpReductionKernel.h"
-#include "arm_compute/core/NEON/kernels/NEPixelWiseMultiplicationKernel.h"
-#include "arm_compute/core/NEON/kernels/NEQLSTMLayerNormalizationKernel.h"
#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/common/LSTMParams.h"
#include "arm_compute/runtime/NEON/functions/NEActivationLayer.h"
+#include "arm_compute/runtime/NEON/functions/NEArithmeticAddition.h"
+#include "arm_compute/runtime/NEON/functions/NEArithmeticSubtraction.h"
+#include "arm_compute/runtime/NEON/functions/NECopy.h"
+#include "arm_compute/runtime/NEON/functions/NEDequantizationLayer.h"
#include "arm_compute/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.h"
#include "arm_compute/runtime/NEON/functions/NEGEMMLowpOutputStage.h"
+#include "arm_compute/runtime/NEON/functions/NEPixelWiseMultiplication.h"
+#include "arm_compute/runtime/NEON/functions/NEQuantizationLayer.h"
#include "arm_compute/runtime/NEON/functions/NETranspose.h"
-#include "arm_compute/runtime/common/LSTMParams.h"
+#include <memory>
namespace arm_compute
{
// Forward declarations
class ITensor;
-
+class ITensorInfo;
+class NEQLSTMLayerNormalizationKernel;
+namespace cpu
+{
+namespace kernels
+{
+class CpuGemmLowpMatrixAReductionKernel;
+} // namespace kernels
+} // namespace cpu
/** Basic function to run @ref NEQLSTMLayer
*
- * This function calls the following NEON functions/kernels:
+ * This function calls the following kernels:
*
* -# @ref NEActivationLayer Activation functions (tanh and logistic)
- * -# @ref NEArithmeticAdditionKernel Elementwise addition
- * -# @ref NEArithmeticSubtractionKernel Elementwise subtraction
- * -# @ref NECopyKernel Copy kernel for copying output_state_out to output
+ * -# @ref NEArithmeticAddition Elementwise addition
+ * -# @ref NEArithmeticSubtraction Elementwise subtraction
+ * -# @ref NECopy Copy kernel for copying output_state_out to output
* -# @ref NEGEMMLowpMatrixMultiplyCore Quantized matrix multiplication core. Accumulators are 32-bit integers
- * -# @ref NEGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPoint Convert 32-bit integers into QSYMM16
- * -# @ref NEGEMMLowpMatrixAReductionKernel For precomputing effective biases to use
- * -# @ref NEPixelWiseMultiplicationKernel Elementwise multiplication
+ * -# @ref NEGEMMLowpOutputStage Convert 32-bit integers into QSYMM16
+ * -# @ref cpu::kernels::CpuGemmLowpMatrixAReductionKernel For precomputing effective biases to use
+ * -# @ref NEPixelWiseMultiplication Elementwise multiplication
* -# @ref NETranspose Transpose function for reshaping the weights
* */
class NEQLSTMLayer : public IFunction
@@ -64,14 +73,24 @@ public:
NEQLSTMLayer(std::shared_ptr<IMemoryManager> memory_manager = nullptr);
/** Prevent instances of this class from being copied (As this class contains pointers) */
NEQLSTMLayer(const NEQLSTMLayer &) = delete;
- /** Default move constructor */
- NEQLSTMLayer(NEQLSTMLayer &&) = default;
+ /** Prevent instances of this class from being moved (As this class contains pointers) */
+ NEQLSTMLayer(NEQLSTMLayer &&) = delete;
/** Prevent instances of this class from being copied (As this class contains pointers) */
NEQLSTMLayer &operator=(const NEQLSTMLayer &) = delete;
- /** Default move assignment operator */
- NEQLSTMLayer &operator=(NEQLSTMLayer &&) = default;
+ /** Prevent instances of this class from being moved (As this class contains pointers) */
+ NEQLSTMLayer &operator=(NEQLSTMLayer &&) = delete;
+ /** Default destructor */
+ ~NEQLSTMLayer();
/** Initialize function's tensors.
*
+ * Valid data layouts:
+ * - All
+ *
+ * Valid data type configurations:
+     * |src0           |src1 - src6 |src7 - src9 |src10  |src11          |dst0   |dst1 - dst2    |
+     * |:--------------|:-----------|:-----------|:------|:--------------|:------|:--------------|
+     * |QASYMM8_SIGNED |QASYMM8     |S32         |QSYMM16|QASYMM8_SIGNED |QSYMM16|QASYMM8_SIGNED |
+ *
* @param[in] input Source tensor. Input is a 2D tensor with dimensions [input_size, batch_size]. Data types supported: QASYMM8_SIGNED.
* @param[in] input_to_forget_weights 2D weights tensor with dimensions [input_size, num_units]. Data type supported: QSYMM8.
* @param[in] input_to_cell_weights 2D weights tensor with dimensions [input_size, num_units]. Data type supported: QSYMM8.
@@ -111,12 +130,21 @@ public:
* projection_threshold (Optional) The clipping threshold for the output from the projection layer, such that values are bound within
* [-proj_clip, proj_clip]. If set to 0.0 then clipping is disabled.
*/
- void configure(const ITensor *input,
- const ITensor *input_to_forget_weights, const ITensor *input_to_cell_weights, const ITensor *input_to_output_weights,
- const ITensor *recurrent_to_forget_weights, const ITensor *recurrent_to_cell_weights, const ITensor *recurrent_to_output_weights,
- const ITensor *forget_gate_bias, const ITensor *cell_bias, const ITensor *output_gate_bias,
- const ITensor *cell_state_in, const ITensor *output_state_in,
- ITensor *cell_state_out, ITensor *output_state_out, ITensor *output,
+ void configure(const ITensor *input,
+ const ITensor *input_to_forget_weights,
+ const ITensor *input_to_cell_weights,
+ const ITensor *input_to_output_weights,
+ const ITensor *recurrent_to_forget_weights,
+ const ITensor *recurrent_to_cell_weights,
+ const ITensor *recurrent_to_output_weights,
+ const ITensor *forget_gate_bias,
+ const ITensor *cell_bias,
+ const ITensor *output_gate_bias,
+ const ITensor *cell_state_in,
+ ITensor *output_state_in,
+ ITensor *cell_state_out,
+ ITensor *output_state_out,
+ ITensor *output,
const LSTMParams<ITensor> &lstm_params);
/** Static function to check if given info will lead to a valid configuration of @ref NEQLSTMLayer
@@ -161,12 +189,21 @@ public:
* [-proj_clip, proj_clip]. If set to 0.0 then clipping is disabled.
* @return a status
*/
- static Status validate(const ITensorInfo *input,
- const ITensorInfo *input_to_forget_weights, const ITensorInfo *input_to_cell_weights, const ITensorInfo *input_to_output_weights,
- const ITensorInfo *recurrent_to_forget_weights, const ITensorInfo *recurrent_to_cell_weights, const ITensorInfo *recurrent_to_output_weights,
- const ITensorInfo *forget_gate_bias, const ITensorInfo *cell_bias, const ITensorInfo *output_gate_bias,
- const ITensorInfo *cell_state_in, const ITensorInfo *output_state_in,
- const ITensorInfo *cell_state_out, const ITensorInfo *output_state_out, const ITensorInfo *output,
+ static Status validate(const ITensorInfo *input,
+ const ITensorInfo *input_to_forget_weights,
+ const ITensorInfo *input_to_cell_weights,
+ const ITensorInfo *input_to_output_weights,
+ const ITensorInfo *recurrent_to_forget_weights,
+ const ITensorInfo *recurrent_to_cell_weights,
+ const ITensorInfo *recurrent_to_output_weights,
+ const ITensorInfo *forget_gate_bias,
+ const ITensorInfo *cell_bias,
+ const ITensorInfo *output_gate_bias,
+ const ITensorInfo *cell_state_in,
+ const ITensorInfo *output_state_in,
+ const ITensorInfo *cell_state_out,
+ const ITensorInfo *output_state_out,
+ const ITensorInfo *output,
const LSTMParams<ITensorInfo> &lstm_params);
// Inherited methods overridden:
@@ -199,24 +236,33 @@ private:
* @param[in] mm_res_info Tensor info to be used to initialize output stage result tensor.
*
*/
- void configure_mm(NEGEMMLowpMatrixMultiplyCore &mm, NEGEMMLowpOutputStage &outstage, GEMMLowpOutputStageInfo &gemmlowp_info,
- const ITensor *mm_input, const ITensor *mm_weights, const ITensor *bias, Tensor *mm_res,
- Tensor *outstage_res, float gemmlowp_scale,
- const TensorInfo &mm_res_info, const TensorInfo &outstage_tensor_info);
+ void configure_mm(NEGEMMLowpMatrixMultiplyCore &mm,
+ NEGEMMLowpOutputStage &outstage,
+ GEMMLowpOutputStageInfo &gemmlowp_info,
+ const ITensor *mm_input,
+ const ITensor *mm_weights,
+ const ITensor *bias,
+ Tensor *mm_res,
+ Tensor *outstage_res,
+ float gemmlowp_scale,
+ const TensorInfo &mm_res_info,
+ const TensorInfo &outstage_tensor_info);
- MemoryGroup _memory_group{};
+ MemoryGroup _memory_group;
    /** A small internal kernel to do the copy between two tensors */
class TensorCopyKernel
{
static constexpr uint32_t max_dimension_supported = 2;
- ITensor *_src{ nullptr };
- ITensor *_dst{ nullptr };
+ ITensor *_src{nullptr};
+ ITensor *_dst{nullptr};
size_t _row_size{};
Window _window{};
public:
+ /** Destructor */
+ ~TensorCopyKernel();
/** Static function to check if given info will lead to a valid configuration of @ref NEQLSTMLayer::TensorCopyKernel
*
* @param[in] src Source tensor info.
@@ -236,93 +282,96 @@ private:
};
// Functions used
- NETranspose _transpose_input_to_forget_weights{};
- NETranspose _transpose_input_to_cell_weights{};
- NETranspose _transpose_input_to_output_weights{};
- NETranspose _transpose_input_to_input_weights{};
- NETranspose _transpose_recurrent_to_forget_weights{};
- NETranspose _transpose_recurrent_to_cell_weights{};
- NETranspose _transpose_recurrent_to_output_weights{};
- NETranspose _transpose_recurrent_to_input_weights{};
- NETranspose _transpose_projection_weights{};
- NEGEMMLowpMatrixAReductionKernel _input_to_input_reduction{};
- NEGEMMLowpMatrixAReductionKernel _recurrent_to_input_reduction{};
- NEGEMMLowpMatrixAReductionKernel _input_to_forget_reduction{};
- NEGEMMLowpMatrixAReductionKernel _recurrent_to_forget_reduction{};
- NEGEMMLowpMatrixAReductionKernel _input_to_cell_reduction{};
- NEGEMMLowpMatrixAReductionKernel _recurrent_to_cell_reduction{};
- NEGEMMLowpMatrixAReductionKernel _input_to_output_reduction{};
- NEGEMMLowpMatrixAReductionKernel _recurrent_to_output_reduction{};
- NEGEMMLowpMatrixAReductionKernel _projection_reduction{};
- NEArithmeticAdditionKernel _projection_bias_add{};
- NEGEMMLowpMatrixMultiplyCore _mm_input_to_forget{};
- NEGEMMLowpMatrixMultiplyCore _mm_recurrent_to_forget{};
- NEPixelWiseMultiplicationKernel _pixelwise_mul_cell_to_forget{};
- NEGEMMLowpOutputStage _input_to_forget_outstage{};
- NEGEMMLowpOutputStage _recurrent_to_forget_outstage{};
- NEGEMMLowpOutputStage _cell_to_forget_outstage{};
- NEArithmeticAdditionKernel _accumulate_input_recurrent_forget{};
- NEArithmeticAdditionKernel _accumulate_cell_forget{};
- NEActivationLayer _forget_gate_sigmoid{};
- NEGEMMLowpMatrixMultiplyCore _mm_input_to_cell{};
- NEGEMMLowpOutputStage _input_to_cell_outstage{};
- NEGEMMLowpMatrixMultiplyCore _mm_recurrent_to_cell{};
- NEGEMMLowpOutputStage _recurrent_to_cell_outstage{};
- NEArithmeticAdditionKernel _accumulate_input_recurrent_modulation{};
- NEActivationLayer _cell_gate_tanh{};
- NEArithmeticSubtractionKernel _input_gate_sub{};
- NEGEMMLowpMatrixMultiplyCore _mm_input_to_input{};
- NEGEMMLowpOutputStage _input_to_input_outstage{};
- NEGEMMLowpMatrixMultiplyCore _mm_recurrent_to_input{};
- NEGEMMLowpOutputStage _recurrent_to_input_outstage{};
- NEArithmeticAdditionKernel _accumulate_input_recurrent_input{};
- NEPixelWiseMultiplicationKernel _pixelwise_mul_cell_to_input{};
- NEGEMMLowpOutputStage _cell_to_input_outstage{};
- NEArithmeticAdditionKernel _accumulate_cell_input{};
- NEActivationLayer _input_gate_sigmoid{};
- NEPixelWiseMultiplicationKernel _pixelwise_mul_forget_cell{};
- NEPixelWiseMultiplicationKernel _pixelwise_mul_input_cell{};
- NEArithmeticAdditionKernel _add_forget_cell{};
- NEActivationLayer _cell_clip{};
- NEGEMMLowpMatrixMultiplyCore _mm_input_to_output{};
- NEGEMMLowpOutputStage _input_to_output_outstage{};
- NEGEMMLowpMatrixMultiplyCore _mm_recurrent_to_output{};
- NEGEMMLowpOutputStage _recurrent_to_output_outstage{};
- NEArithmeticAdditionKernel _accumulate_input_recurrent_output{};
- NEPixelWiseMultiplicationKernel _pixelwise_mul_cell_to_output{};
- NEGEMMLowpOutputStage _cell_to_output_outstage{};
- NEArithmeticAdditionKernel _accumulate_cell_to_output{};
- NEActivationLayer _output_gate_sigmoid{};
- NEActivationLayer _hidden_tanh{};
- NEPixelWiseMultiplicationKernel _pixelwise_mul_hidden{};
- NEGEMMLowpOutputStage _hidden_outstage{};
- NEGEMMLowpMatrixMultiplyCore _mm_projection{};
- NEGEMMLowpOutputStage _projection_outstage{};
- NEArithmeticAdditionKernel _accumulate_projection{};
- NEActivationLayer _projection_clip{};
- TensorCopyKernel _projection_bias_copy{};
- TensorCopyKernel _projection_output_to_accumulate_copy{};
- TensorCopyKernel _projection_accumulate_to_output_copy{};
- TensorCopyKernel _hidden_to_output_copy{};
+ NEDequantizationLayer _dequantize_input_to_forget_weights;
+ NEQuantizationLayer _quantize_input_to_forget_weights;
+ NETranspose _transpose_input_to_forget_weights;
+ NETranspose _transpose_input_to_cell_weights;
+ NETranspose _transpose_input_to_output_weights;
+ NETranspose _transpose_input_to_input_weights;
+ NETranspose _transpose_recurrent_to_forget_weights;
+ NETranspose _transpose_recurrent_to_cell_weights;
+ NETranspose _transpose_recurrent_to_output_weights;
+ NETranspose _transpose_recurrent_to_input_weights;
+ NETranspose _transpose_projection_weights;
+ std::unique_ptr<cpu::kernels::CpuGemmLowpMatrixAReductionKernel> _input_to_input_reduction;
+ std::unique_ptr<cpu::kernels::CpuGemmLowpMatrixAReductionKernel> _recurrent_to_input_reduction;
+ std::unique_ptr<cpu::kernels::CpuGemmLowpMatrixAReductionKernel> _input_to_forget_reduction;
+ std::unique_ptr<cpu::kernels::CpuGemmLowpMatrixAReductionKernel> _recurrent_to_forget_reduction;
+ std::unique_ptr<cpu::kernels::CpuGemmLowpMatrixAReductionKernel> _input_to_cell_reduction;
+ std::unique_ptr<cpu::kernels::CpuGemmLowpMatrixAReductionKernel> _recurrent_to_cell_reduction;
+ std::unique_ptr<cpu::kernels::CpuGemmLowpMatrixAReductionKernel> _input_to_output_reduction;
+ std::unique_ptr<cpu::kernels::CpuGemmLowpMatrixAReductionKernel> _recurrent_to_output_reduction;
+ std::unique_ptr<cpu::kernels::CpuGemmLowpMatrixAReductionKernel> _projection_reduction;
+ NEArithmeticAddition _projection_bias_add;
+ NEGEMMLowpMatrixMultiplyCore _mm_input_to_forget;
+ NEGEMMLowpMatrixMultiplyCore _mm_recurrent_to_forget;
+ NEPixelWiseMultiplication _pixelwise_mul_cell_to_forget;
+ NEGEMMLowpOutputStage _input_to_forget_outstage;
+ NEGEMMLowpOutputStage _recurrent_to_forget_outstage;
+ NEGEMMLowpOutputStage _cell_to_forget_outstage;
+ NEArithmeticAddition _accumulate_input_recurrent_forget;
+ NEArithmeticAddition _accumulate_cell_forget;
+ NEActivationLayer _forget_gate_sigmoid;
+ NEGEMMLowpMatrixMultiplyCore _mm_input_to_cell;
+ NEGEMMLowpOutputStage _input_to_cell_outstage;
+ NEGEMMLowpMatrixMultiplyCore _mm_recurrent_to_cell;
+ NEGEMMLowpOutputStage _recurrent_to_cell_outstage;
+ NEArithmeticAddition _accumulate_input_recurrent_modulation;
+ NEActivationLayer _cell_gate_tanh;
+ NEArithmeticSubtraction _input_gate_sub;
+ NEGEMMLowpMatrixMultiplyCore _mm_input_to_input;
+ NEGEMMLowpOutputStage _input_to_input_outstage;
+ NEGEMMLowpMatrixMultiplyCore _mm_recurrent_to_input;
+ NEGEMMLowpOutputStage _recurrent_to_input_outstage;
+ NEArithmeticAddition _accumulate_input_recurrent_input;
+ NEPixelWiseMultiplication _pixelwise_mul_cell_to_input;
+ NEGEMMLowpOutputStage _cell_to_input_outstage;
+ NEArithmeticAddition _accumulate_cell_input;
+ NEActivationLayer _input_gate_sigmoid;
+ NEPixelWiseMultiplication _pixelwise_mul_forget_cell;
+ NEPixelWiseMultiplication _pixelwise_mul_input_cell;
+ NEArithmeticAddition _add_forget_cell;
+ NEActivationLayer _cell_clip;
+ NEGEMMLowpMatrixMultiplyCore _mm_input_to_output;
+ NEGEMMLowpOutputStage _input_to_output_outstage;
+ NEGEMMLowpMatrixMultiplyCore _mm_recurrent_to_output;
+ NEGEMMLowpOutputStage _recurrent_to_output_outstage;
+ NEArithmeticAddition _accumulate_input_recurrent_output;
+ NEPixelWiseMultiplication _pixelwise_mul_cell_to_output;
+ NEGEMMLowpOutputStage _cell_to_output_outstage;
+ NEArithmeticAddition _accumulate_cell_to_output;
+ NEActivationLayer _output_gate_sigmoid;
+ NEActivationLayer _hidden_tanh;
+ NEPixelWiseMultiplication _pixelwise_mul_hidden;
+ NEGEMMLowpOutputStage _hidden_outstage;
+ NEGEMMLowpMatrixMultiplyCore _mm_projection;
+ NEGEMMLowpOutputStage _projection_outstage;
+ NEArithmeticAddition _accumulate_projection;
+ NEActivationLayer _projection_clip;
+
+ TensorCopyKernel _projection_bias_copy;
+ TensorCopyKernel _projection_output_to_accumulate_copy;
+ TensorCopyKernel _projection_accumulate_to_output_copy;
+ TensorCopyKernel _hidden_to_output_copy;
- std::array<NEQLSTMLayerNormalizationKernel, _layer_norm_count> _layer_norms{ {} };
+ std::array<std::unique_ptr<NEQLSTMLayerNormalizationKernel>, _layer_norm_count> _layer_norms;
- NECopyKernel _copy_output{};
+ NECopy _copy_output;
// Tensor pointers
- const ITensor *_input_to_input_weights{ nullptr };
- const ITensor *_recurrent_to_input_weights{ nullptr };
- const ITensor *_projection_bias{ nullptr };
- const ITensor *_input_to_forget_weights{ nullptr };
- const ITensor *_input_to_cell_weights{ nullptr };
- const ITensor *_input_to_output_weights{ nullptr };
- const ITensor *_recurrent_to_forget_weights{ nullptr };
- const ITensor *_recurrent_to_cell_weights{ nullptr };
- const ITensor *_recurrent_to_output_weights{ nullptr };
- const ITensor *_projection_weights{ nullptr };
- std::array<const ITensor *, _layer_norm_count> _layer_norm_weights{ {} };
- std::array<const ITensor *, _layer_norm_count> _layer_norm_bias{ {} };
+ const ITensor *_input_to_input_weights{nullptr};
+ const ITensor *_recurrent_to_input_weights{nullptr};
+ const ITensor *_projection_bias{nullptr};
+ const ITensor *_input_to_forget_weights{nullptr};
+ const ITensor *_input_to_cell_weights{nullptr};
+ const ITensor *_input_to_output_weights{nullptr};
+ const ITensor *_recurrent_to_forget_weights{nullptr};
+ const ITensor *_recurrent_to_cell_weights{nullptr};
+ const ITensor *_recurrent_to_output_weights{nullptr};
+ const ITensor *_projection_weights{nullptr};
+ std::array<const ITensor *, _layer_norm_count> _layer_norm_weights{};
+ std::array<const ITensor *, _layer_norm_count> _layer_norm_bias{};
using LayerNormIndexType = typename std::underlying_type<LayerNormGate>::type;
inline LayerNormIndexType getGateIndex(LayerNormGate g)
@@ -350,99 +399,87 @@ private:
return _layer_norm_bias[getGateIndex(g)];
}
- inline NEQLSTMLayerNormalizationKernel &get_layer_norm(LayerNormGate g)
+ inline std::unique_ptr<NEQLSTMLayerNormalizationKernel> &get_layer_norm(LayerNormGate g)
{
return _layer_norms[getGateIndex(g)];
}
- inline void configure_layer_norm(LayerNormGate g, const ITensor *in)
- {
- ARM_COMPUTE_ERROR_ON(!_has_layer_norm);
-
- Tensor &out = get_layer_norm_output(g);
- _memory_group.manage(&out);
- out.allocator()->init(*(in->info()));
-
- get_layer_norm(g).configure(in, &out, get_layer_norm_weight(g), get_layer_norm_bias(g));
- }
-
- inline static Status validate_layer_norm(const ITensorInfo &in, const ITensorInfo &weight, const ITensorInfo &bias)
- {
- // Output quantization scale will be different, but ignored here
- // since it will be configured at configure() stage.
- const TensorInfo out{ in };
- return NEQLSTMLayerNormalizationKernel::validate(&in, &out, &weight, &bias);
- }
+ void configure_layer_norm(LayerNormGate g, const ITensor *in);
+ static Status validate_layer_norm(const ITensorInfo &in, const ITensorInfo &weight, const ITensorInfo &bias);
// Temporary tensors
- Tensor _input_to_forget_weights_transposed{ nullptr };
- Tensor _input_to_cell_weights_transposed{ nullptr };
- Tensor _input_to_output_weights_transposed{ nullptr };
- Tensor _input_to_input_weights_transposed{ nullptr };
- Tensor _recurrent_to_forget_weights_transposed{ nullptr };
- Tensor _recurrent_to_cell_weights_transposed{ nullptr };
- Tensor _recurrent_to_output_weights_transposed{ nullptr };
- Tensor _recurrent_to_input_weights_transposed{ nullptr };
- Tensor _projection_weights_transposed{ nullptr };
- Tensor _input_to_input_eff_bias{ nullptr };
- Tensor _recurrent_to_input_eff_bias{ nullptr };
- Tensor _input_to_forget_eff_bias{ nullptr };
- Tensor _recurrent_to_forget_eff_bias{ nullptr };
- Tensor _input_to_cell_eff_bias{ nullptr };
- Tensor _recurrent_to_cell_eff_bias{ nullptr };
- Tensor _input_to_output_eff_bias{ nullptr };
- Tensor _recurrent_to_output_eff_bias{ nullptr };
- Tensor _projection_reduction_res{ nullptr };
- Tensor _projection_eff_bias{ nullptr };
- Tensor _mm_input_to_forget_res{ nullptr };
- Tensor _mm_recurrent_to_forget_res{ nullptr };
- Tensor _mul_cell_to_forget_res{ nullptr };
- Tensor _input_to_forget_outstage_res{ nullptr };
- Tensor _cell_to_forget_outstage_res{ nullptr };
- Tensor _recurrent_to_forget_outstage_res{ nullptr };
- Tensor _forget_gate{ nullptr };
- Tensor _mm_input_to_cell_res{ nullptr };
- Tensor _input_to_cell_outstage_res{ nullptr };
- Tensor _mm_recurrent_to_cell_res{ nullptr };
- Tensor _recurrent_to_cell_outstage_res{ nullptr };
- Tensor _cell_gate{ nullptr };
- Tensor _mul_input_cell_res{ nullptr };
- Tensor _mm_input_to_input_res{ nullptr };
- Tensor _input_to_input_outstage_res{ nullptr };
- Tensor _mm_recurrent_to_input_res{ nullptr };
- Tensor _mul_cell_to_input_res{ nullptr };
- Tensor _cell_to_input_outstage_res{ nullptr };
- Tensor _recurrent_to_input_outstage_res{ nullptr };
- Tensor _input_gate{ nullptr };
- Tensor _mm_input_to_output_res{ nullptr };
- Tensor _input_to_output_outstage_res{ nullptr };
- Tensor _mm_recurrent_to_output_res{ nullptr };
- Tensor _mul_cell_to_output_res{ nullptr };
- Tensor _cell_to_output_outstage_res{ nullptr };
- Tensor _recurrent_to_output_outstage_res{ nullptr };
- Tensor _output_gate{ nullptr };
- Tensor _hidden_mul_res{ nullptr };
- Tensor _hidden_gate{ nullptr };
- Tensor _mm_projection_res{ nullptr };
- Tensor _projection_outstage_res{ nullptr };
- Tensor _projection_out_res{ nullptr };
- Tensor _projection_accumulate_res{ nullptr };
- Tensor _ones{ nullptr };
- std::array<Tensor, _layer_norm_count> _layer_norm_output{ {} };
+ Tensor _input_to_forget_weights_f32{nullptr};
+ Tensor _input_to_forget_weights_symm8{nullptr};
+
+ Tensor _input_to_forget_weights_transposed{nullptr};
+ Tensor _input_to_cell_weights_transposed{nullptr};
+ Tensor _input_to_output_weights_transposed{nullptr};
+ Tensor _input_to_input_weights_transposed{nullptr};
+ Tensor _recurrent_to_forget_weights_transposed{nullptr};
+ Tensor _recurrent_to_cell_weights_transposed{nullptr};
+ Tensor _recurrent_to_output_weights_transposed{nullptr};
+ Tensor _recurrent_to_input_weights_transposed{nullptr};
+ Tensor _projection_weights_transposed{nullptr};
+ Tensor _input_to_input_eff_bias{nullptr};
+ Tensor _recurrent_to_input_eff_bias{nullptr};
+ Tensor _input_to_forget_eff_bias{nullptr};
+ Tensor _recurrent_to_forget_eff_bias{nullptr};
+ Tensor _input_to_cell_eff_bias{nullptr};
+ Tensor _recurrent_to_cell_eff_bias{nullptr};
+ Tensor _input_to_output_eff_bias{nullptr};
+ Tensor _recurrent_to_output_eff_bias{nullptr};
+ Tensor _projection_reduction_res{nullptr};
+ Tensor _projection_eff_bias{nullptr};
+ Tensor _mm_input_to_forget_res{nullptr};
+ Tensor _mm_recurrent_to_forget_res{nullptr};
+ Tensor _mul_cell_to_forget_res{nullptr};
+ Tensor _input_to_forget_outstage_res{nullptr};
+ Tensor _cell_to_forget_outstage_res{nullptr};
+ Tensor _recurrent_to_forget_outstage_res{nullptr};
+ Tensor _forget_gate{nullptr};
+ Tensor _mm_input_to_cell_res{nullptr};
+ Tensor _input_to_cell_outstage_res{nullptr};
+ Tensor _mm_recurrent_to_cell_res{nullptr};
+ Tensor _recurrent_to_cell_outstage_res{nullptr};
+ Tensor _cell_gate{nullptr};
+ Tensor _mul_input_cell_res{nullptr};
+ Tensor _mm_input_to_input_res{nullptr};
+ Tensor _input_to_input_outstage_res{nullptr};
+ Tensor _mm_recurrent_to_input_res{nullptr};
+ Tensor _mul_cell_to_input_res{nullptr};
+ Tensor _cell_to_input_outstage_res{nullptr};
+ Tensor _recurrent_to_input_outstage_res{nullptr};
+ Tensor _input_gate{nullptr};
+ Tensor _mm_input_to_output_res{nullptr};
+ Tensor _input_to_output_outstage_res{nullptr};
+ Tensor _mm_recurrent_to_output_res{nullptr};
+ Tensor _mul_cell_to_output_res{nullptr};
+ Tensor _cell_to_output_outstage_res{nullptr};
+ Tensor _recurrent_to_output_outstage_res{nullptr};
+ Tensor _output_gate{nullptr};
+ Tensor _hidden_mul_res{nullptr};
+ Tensor _hidden_gate{nullptr};
+ Tensor _mm_projection_res{nullptr};
+ Tensor _projection_outstage_res{nullptr};
+ Tensor _projection_out_res{nullptr};
+ Tensor _projection_accumulate_res{nullptr};
+ Tensor _ones{nullptr};
+ std::array<Tensor, _layer_norm_count> _layer_norm_output{};
inline Tensor &get_layer_norm_output(LayerNormGate g)
{
return _layer_norm_output[getGateIndex(g)];
}
- bool _is_prepared{ false };
- bool _has_cifg{ false };
- bool _has_cell_clipping{ false };
- bool _has_projection{ false };
- bool _has_projection_clipping{ false };
- bool _has_peephole{ false };
- bool _has_layer_norm{ false };
- bool _projection_tensor_copy_required{ false };
+ bool _is_prepared{false};
+ bool _has_cifg{false};
+ bool _has_cell_clipping{false};
+ bool _has_projection{false};
+ bool _has_projection_clipping{false};
+ bool _has_peephole{false};
+ bool _has_layer_norm{false};
+ bool _projection_tensor_copy_required{false};
+ bool _convert_input_to_forget_weights_to_qsymm8{false};
};
} // namespace arm_compute
#endif /* ARM_COMPUTE_NEQLSTMLAYER_H */
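
The NEQLSTMLayer changes above follow the pattern applied throughout this patch: kernel headers leave the public include tree, kernels are forward-declared and held through std::unique_ptr, and the destructor is declared in the header so it can be defined out of line where the kernel type is complete. A generic sketch of that idiom, with hypothetical names that are not part of the library:

#include <memory>

class HypotheticalKernel; // forward declaration only; the full type lives in the .cpp

class HypotheticalFunction
{
public:
    HypotheticalFunction();
    ~HypotheticalFunction(); // declared here, defaulted in the .cpp where
                             // HypotheticalKernel is a complete type
    HypotheticalFunction(const HypotheticalFunction &)            = delete;
    HypotheticalFunction &operator=(const HypotheticalFunction &) = delete;

private:
    std::unique_ptr<HypotheticalKernel> _kernel; // an incomplete type is fine here
};

This keeps internal kernel interfaces out of the installed headers, at the cost of having to declare (rather than default inline) the special member functions.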
diff --git a/arm_compute/runtime/NEON/functions/NEQuantizationLayer.h b/arm_compute/runtime/NEON/functions/NEQuantizationLayer.h
index fc317be81e..7bf97e28a5 100644
--- a/arm_compute/runtime/NEON/functions/NEQuantizationLayer.h
+++ b/arm_compute/runtime/NEON/functions/NEQuantizationLayer.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2017-2021 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -24,30 +24,45 @@
#ifndef ARM_COMPUTE_NEQUANTIZATIONLAYER_H
#define ARM_COMPUTE_NEQUANTIZATIONLAYER_H
+#include "arm_compute/core/Types.h"
#include "arm_compute/runtime/IFunction.h"
+#include "arm_compute/runtime/IRuntimeContext.h"
-#include "arm_compute/core/NEON/kernels/NEQuantizationLayerKernel.h"
-#include "arm_compute/runtime/NEON/INESimpleFunctionNoBorder.h"
-
-#include "arm_compute/core/Types.h"
+#include <memory>
namespace arm_compute
{
class ITensor;
+class ITensorInfo;
-/** Basic function to simulate a quantization layer. This function calls the following NEON kernels:
- *
- *
- * -# @ref NEQuantizationLayerKernel
- *
- */
-class NEQuantizationLayer : public INESimpleFunctionNoBorder
+/** Basic function to run a quantization layer using @ref cpu::CpuQuantize */
+class NEQuantizationLayer : public IFunction
{
public:
- /** Default constructor */
- NEQuantizationLayer() = default;
+    /** Default Constructor */
+    NEQuantizationLayer();
+ /** Default Destructor */
+ ~NEQuantizationLayer();
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ NEQuantizationLayer(const NEQuantizationLayer &) = delete;
+ /** Default move constructor */
+ NEQuantizationLayer(NEQuantizationLayer &&) = default;
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ NEQuantizationLayer &operator=(const NEQuantizationLayer &) = delete;
+ /** Default move assignment operator */
+ NEQuantizationLayer &operator=(NEQuantizationLayer &&) = default;
/** Set the input and output tensors.
*
+ * Valid data layouts:
+ * - All
+ *
+ * Valid data type configurations:
+ * |src |dst |
+ * |:------------------|:--------------------------------------|
+ * |QASYMM8 |QASYMM8, QASYMM8_SIGNED, QASYMM16 |
+ * |QASYMM8_SIGNED |QASYMM8, QASYMM8_SIGNED, QASYMM16 |
+ * |F16 |QASYMM8, QASYMM8_SIGNED, QASYMM16 |
+ * |F32 |QASYMM8, QASYMM8_SIGNED, QASYMM16 |
+ *
* @param[in] input Source tensor. The dimensions over the third will be interpreted as batches. Data types supported: QASYMM8/QASYMM8_SIGNED/F32/F16.
* @param[out] output Destination tensor with the same dimensions of input. Data types supported: QASYMM8/QASYMM8_SIGNED/QASYMM16
*/
@@ -60,6 +75,13 @@ public:
* @return a status
*/
static Status validate(const ITensorInfo *input, const ITensorInfo *output);
+
+ // Inherited methods overridden:
+ void run() override;
+
+private:
+ struct Impl;
+ std::unique_ptr<Impl> _impl;
};
} // namespace arm_compute
#endif /* ARM_COMPUTE_NEQUANTIZATIONLAYER_H */
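
A usage sketch for the reworked quantization interface; the shape, scale and offset values below are illustrative. The destination tensor info carries the target quantization parameters:

#include "arm_compute/runtime/NEON/functions/NEQuantizationLayer.h"
#include "arm_compute/runtime/Tensor.h"

using namespace arm_compute;

int main()
{
    Tensor src{}, dst{};
    src.allocator()->init(TensorInfo(TensorShape(16U, 16U), 1, DataType::F32));

    // Quantize F32 -> QASYMM8 with scale 0.5 and zero-point 10.
    TensorInfo dst_info(TensorShape(16U, 16U), 1, DataType::QASYMM8);
    dst_info.set_quantization_info(QuantizationInfo(0.5f, 10));
    dst.allocator()->init(dst_info);

    NEQuantizationLayer quant{};
    quant.configure(&src, &dst);

    src.allocator()->allocate();
    dst.allocator()->allocate();
    quant.run(); // dispatches to the underlying cpu::CpuQuantize operator
    return 0;
}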
diff --git a/arm_compute/runtime/NEON/functions/NERNNLayer.h b/arm_compute/runtime/NEON/functions/NERNNLayer.h
index 0bfb905e19..af7f464ac9 100644
--- a/arm_compute/runtime/NEON/functions/NERNNLayer.h
+++ b/arm_compute/runtime/NEON/functions/NERNNLayer.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018-2019 ARM Limited.
+ * Copyright (c) 2018-2021 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -24,11 +24,10 @@
#ifndef ARM_COMPUTE_NERNNLAYER_H
#define ARM_COMPUTE_NERNNLAYER_H
-#include "arm_compute/core/NEON/kernels/NEActivationLayerKernel.h"
-#include "arm_compute/core/NEON/kernels/NEArithmeticAdditionKernel.h"
-#include "arm_compute/core/NEON/kernels/NECopyKernel.h"
-
#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/NEON/functions/NEActivationLayer.h"
+#include "arm_compute/runtime/NEON/functions/NEArithmeticAddition.h"
+#include "arm_compute/runtime/NEON/functions/NECopy.h"
#include "arm_compute/runtime/NEON/functions/NEFullyConnectedLayer.h"
#include "arm_compute/runtime/NEON/functions/NEGEMM.h"
@@ -45,14 +44,26 @@ public:
NERNNLayer(std::shared_ptr<IMemoryManager> memory_manager = nullptr);
/** Prevent instances of this class from being copied (As this class contains pointers) */
NERNNLayer(const NERNNLayer &) = delete;
- /** Default move constructor */
- NERNNLayer(NERNNLayer &&) = default;
+ /** Prevent instances of this class from being moved (As this class contains pointers) */
+ NERNNLayer(NERNNLayer &&) = delete;
/** Prevent instances of this class from being copied (As this class contains pointers) */
NERNNLayer &operator=(const NERNNLayer &) = delete;
- /** Default move assignment operator */
- NERNNLayer &operator=(NERNNLayer &&) = default;
+ /** Prevent instances of this class from being moved (As this class contains pointers) */
+ NERNNLayer &operator=(NERNNLayer &&) = delete;
+ /** Default destructor */
+ ~NERNNLayer();
/** Initialize the function
*
+ * Valid data layouts:
+ * - NHWC
+ * - NCHW
+ *
+ * Valid data type configurations:
+ * |src0 |src1 |src2 |src3 |dst0 |dst1 |
+ * |:------|:------|:------|:------|:------|:------|
+ * |F16 |F16 |F16 |F16 |F16 |F16 |
+ * |F32 |F32 |F32 |F32 |F32 |F32 |
+ *
* @param[in] input Input is a 2-D tensor of shape [input_size, batch_size]. Data types supported: F16/F32
* @param[in] weights Weights tensor of shape [input_size, num_units] that multiplies the input. Data types supported: Same as @p input
* @param[in] recurrent_weights Weights tensor of shape [num_units, num_units] that multiplies the current 'state'. Data types supported: Same as @p input
@@ -61,7 +72,13 @@ public:
* @param[in,out] hidden_state Output tensor of shape [num_units, batch_size]. Data types supported: Same as @p input
* @param[in] info Activation layer parameter.
*/
- void configure(const ITensor *input, const ITensor *weights, const ITensor *recurrent_weights, const ITensor *bias, ITensor *hidden_state, ITensor *output, ActivationLayerInfo &info);
+ void configure(const ITensor *input,
+ const ITensor *weights,
+ const ITensor *recurrent_weights,
+ const ITensor *bias,
+ ITensor *hidden_state,
+ ITensor *output,
+ ActivationLayerInfo &info);
/** Initialize the function
*
* @param[in] input Input is a 2-D tensor of shape [input_size, batch_size]. Data types supported: F16/F32
@@ -74,7 +91,12 @@ public:
*
* @return a status
*/
- static Status validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *recurrent_weights, const ITensorInfo *bias, const ITensorInfo *hidden_state, const ITensorInfo *output,
+ static Status validate(const ITensorInfo *input,
+ const ITensorInfo *weights,
+ const ITensorInfo *recurrent_weights,
+ const ITensorInfo *bias,
+ const ITensorInfo *hidden_state,
+ const ITensorInfo *output,
const ActivationLayerInfo &info);
// Inherited methods overridden:
@@ -82,16 +104,16 @@ public:
void prepare() override;
private:
- MemoryGroup _memory_group;
- NEGEMM _gemm_state_f;
- NEArithmeticAdditionKernel _add_kernel;
- NEActivationLayerKernel _activation_kernel;
- NEFullyConnectedLayer _fully_connected;
- NECopyKernel _copy_kernel;
- Tensor _fully_connected_out;
- Tensor _gemm_output;
- Tensor _add_output;
- bool _is_prepared;
+ MemoryGroup _memory_group;
+ NEGEMM _gemm_state_f;
+ NEArithmeticAddition _add_f;
+ NEActivationLayer _activation;
+ NEFullyConnectedLayer _fully_connected;
+ NECopy _copy_f;
+ Tensor _fully_connected_out;
+ Tensor _gemm_output;
+ Tensor _add_output;
+ bool _is_prepared;
};
} // namespace arm_compute
#endif /* ARM_COMPUTE_NERNNLAYER_H */
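
A configure/run sketch matching the documented RNN shapes (the sizes are illustrative; note that configure() takes the activation info by non-const reference, so an lvalue is passed):

#include "arm_compute/runtime/NEON/functions/NERNNLayer.h"
#include "arm_compute/runtime/Tensor.h"

using namespace arm_compute;

int main()
{
    const unsigned int input_size = 8, num_units = 4, batch_size = 2;

    Tensor input{}, weights{}, recurrent_weights{}, bias{}, hidden_state{}, output{};
    input.allocator()->init(TensorInfo(TensorShape(input_size, batch_size), 1, DataType::F32));
    weights.allocator()->init(TensorInfo(TensorShape(input_size, num_units), 1, DataType::F32));
    recurrent_weights.allocator()->init(TensorInfo(TensorShape(num_units, num_units), 1, DataType::F32));
    bias.allocator()->init(TensorInfo(TensorShape(num_units), 1, DataType::F32));
    hidden_state.allocator()->init(TensorInfo(TensorShape(num_units, batch_size), 1, DataType::F32));
    output.allocator()->init(TensorInfo(TensorShape(num_units, batch_size), 1, DataType::F32));

    NERNNLayer         rnn{};
    ActivationLayerInfo act(ActivationLayerInfo::ActivationFunction::TANH);
    rnn.configure(&input, &weights, &recurrent_weights, &bias, &hidden_state, &output, act);

    for(Tensor *t : {&input, &weights, &recurrent_weights, &bias, &hidden_state, &output})
    {
        t->allocator()->allocate();
    }
    rnn.run();
    return 0;
}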
diff --git a/arm_compute/runtime/NEON/functions/NEROIAlignLayer.h b/arm_compute/runtime/NEON/functions/NEROIAlignLayer.h
index 04a24ac7ec..b06ebe899d 100644
--- a/arm_compute/runtime/NEON/functions/NEROIAlignLayer.h
+++ b/arm_compute/runtime/NEON/functions/NEROIAlignLayer.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019-2020 ARM Limited.
+ * Copyright (c) 2019-2021 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -24,27 +24,35 @@
#ifndef ARM_COMPUTE_NEROIALIGNLAYER_H
#define ARM_COMPUTE_NEROIALIGNLAYER_H
-#include "arm_compute/runtime/NEON/INESimpleFunction.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/NEON/INESimpleFunctionNoBorder.h"
namespace arm_compute
{
class ITensor;
+class ITensorInfo;
-/** Basic function to run @ref NEROIAlignLayerKernel.
- *
- * This function calls the following NEON kernels:
- * -# @ref NEROIAlignLayerKernel
- *
- */
-class NEROIAlignLayer : public INESimpleFunction
+/** Basic function to run @ref NEROIAlignLayerKernel. */
+class NEROIAlignLayer : public INESimpleFunctionNoBorder
{
public:
/** Set the input and output tensors.
*
- * @param[in] input Source tensor. Data types supported: QASYMM8/F16/F32.
+ * Valid data layouts:
+ * - All
+ *
+ * Valid data type configurations:
+ * |src0 |src1 |dst |
+ * |:--------------|:--------------|:--------------|
+ * |F16 |F16 |F16 |
+ * |F32 |F32 |F32 |
+ * |QASYMM8 |QASYMM16 |QASYMM8 |
+ * |QASYMM8_SIGNED |QASYMM16 |QASYMM8_SIGNED |
+ *
+ * @param[in] input Source tensor. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32.
     * @param[in]  rois      ROIs tensor, a 2D tensor of size [5, N] (where N is the number of ROIs) containing the top left and bottom right corners
     *                       as image coordinates plus the batch_id of each ROI [ batch_id, x1, y1, x2, y2 ].
- * Data types supported: QASYMM16 with scale of 0.125 and 0 offset if @p input is QASYMM8, otherwise same as @p input
+ * Data types supported: QASYMM16 with scale of 0.125 and 0 offset if @p input is QASYMM8/QASYMM8_SIGNED, otherwise same as @p input
* @param[out] output Destination tensor. Data types supported: Same as @p input.
* @param[in] pool_info Contains pooling operation information described in @ref ROIPoolingLayerInfo.
*
@@ -56,8 +64,8 @@ public:
void configure(const ITensor *input, const ITensor *rois, ITensor *output, const ROIPoolingLayerInfo &pool_info);
/** Static function to check if given info will lead to a valid configuration of @ref NEROIAlignLayerKernel
*
- * @param[in] input Source tensor info. Data types supported: QASYMM8/F16/F32.
- * @param[in] rois ROIs tensor info. Data types supported: QASYMM16 with scale of 0.125 and 0 offset if @p input is QASYMM8,
+ * @param[in] input Source tensor info. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32.
+ * @param[in] rois ROIs tensor info. Data types supported: QASYMM16 with scale of 0.125 and 0 offset if @p input is QASYMM8/QASYMM8_SIGNED,
* otherwise same as @p input
* @param[in] output Destination tensor info. Data types supported: Same as @p input.
* @param[in] pool_info Contains pooling operation information described in @ref ROIPoolingLayerInfo.
@@ -69,7 +77,10 @@ public:
*
* @return a Status
*/
- static Status validate(const ITensorInfo *input, const ITensorInfo *rois, ITensorInfo *output, const ROIPoolingLayerInfo &pool_info);
+ static Status validate(const ITensorInfo *input,
+ const ITensorInfo *rois,
+ ITensorInfo *output,
+ const ROIPoolingLayerInfo &pool_info);
};
} // namespace arm_compute
#endif /* ARM_COMPUTE_NEROIALIGNLAYER_H */
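
A sketch of the validate-before-configure idiom with the widened QASYMM8_SIGNED support; the shapes, the 7x7 pooled size and the 1/16 spatial scale are assumptions for illustration:

#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/runtime/NEON/functions/NEROIAlignLayer.h"

#include <iostream>

using namespace arm_compute;

int main()
{
    TensorInfo input_info(TensorShape(32U, 32U, 8U), 1, DataType::F32);
    TensorInfo rois_info(TensorShape(5U, 4U), 1, DataType::F32); // 4 ROIs, F32 to match the input
    TensorInfo output_info(TensorShape(7U, 7U, 8U, 4U), 1, DataType::F32);

    const ROIPoolingLayerInfo pool_info(7U, 7U, 1.f / 16.f /* spatial scale */);

    const Status status = NEROIAlignLayer::validate(&input_info, &rois_info, &output_info, pool_info);
    if(status.error_code() != ErrorCode::OK)
    {
        std::cout << status.error_description() << std::endl; // report why the configuration is invalid
    }
    return 0;
}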
diff --git a/arm_compute/runtime/NEON/functions/NEROIPoolingLayer.h b/arm_compute/runtime/NEON/functions/NEROIPoolingLayer.h
index 887b5712da..929111ad4b 100644
--- a/arm_compute/runtime/NEON/functions/NEROIPoolingLayer.h
+++ b/arm_compute/runtime/NEON/functions/NEROIPoolingLayer.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2021 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -24,29 +24,46 @@
#ifndef ARM_COMPUTE_NEROIPOOLINGLAYER_H
#define ARM_COMPUTE_NEROIPOOLINGLAYER_H
+#include "arm_compute/core/IArray.h"
#include "arm_compute/runtime/IFunction.h"
-#include "arm_compute/core/IArray.h"
-#include "arm_compute/core/NEON/kernels/NEROIPoolingLayerKernel.h"
+#include <memory>
namespace arm_compute
{
class ITensor;
+class ITensorInfo;
+class NEROIPoolingLayerKernel;
+class ROIPoolingLayerInfo;
-/** Basic function to run @ref NEROIPoolingLayerKernel.
- *
- * This function calls the following NEON kernels:
- * -# @ref NEROIPoolingLayerKernel
- *
- */
+/** Basic function to run @ref NEROIPoolingLayerKernel. */
class NEROIPoolingLayer : public IFunction
{
public:
/** Constructor */
NEROIPoolingLayer();
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ NEROIPoolingLayer(const NEROIPoolingLayer &) = delete;
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ NEROIPoolingLayer &operator=(const NEROIPoolingLayer &) = delete;
+ /** Prevent instances of this class from being moved (As this class contains non movable objects) */
+ NEROIPoolingLayer(NEROIPoolingLayer &&) = delete;
+ /** Prevent instances of this class from being moved (As this class contains non movable objects) */
+ NEROIPoolingLayer &operator=(NEROIPoolingLayer &&) = delete;
+ /** Default destructor */
+ ~NEROIPoolingLayer();
/** Set the input and output tensors.
*
- * @param[in] input Source tensor. Data types supported: F32.
+ * Valid data layouts:
+ * - All
+ *
+ * Valid data type configurations:
+ * |src0 |src1 |dst |
+ * |:--------------|:--------------|:--------------|
+ * |F32 |U16 |F32 |
+ * |QASYMM8 |U16 |QASYMM8 |
+ *
+ * @param[in] input Source tensor. Data types supported: QASYMM8/F32
     * @param[in]  rois      ROIs tensor, a 2D tensor of size [5, N] (where N is the number of ROIs) containing the top left and bottom right corners
     *                       as image coordinates plus the batch_id of each ROI [ batch_id, x1, y1, x2, y2 ]. Data types supported: U16
* @param[out] output Destination tensor. Data types supported: Same as @p input.
@@ -57,13 +74,32 @@ public:
* @note The z dimensions of @p output tensor and @p input tensor must be the same.
* @note The fourth dimension of @p output tensor must be the same as the number of elements in @p rois array.
*/
- void configure(const ITensor *input, const ITensor *rois, ITensor *output, const ROIPoolingLayerInfo &pool_info);
+ void
+ configure(const ITensor *input, const ITensor *rois, const ITensor *output, const ROIPoolingLayerInfo &pool_info);
// Inherited methods overridden:
void run() override;
+ /** Static function to check if given info will lead to a valid configuration of @ref NEROIPoolingLayerKernel
+ *
+ * @param[in] input Source tensor info. Data types supported: QASYMM8/F32.
+     * @param[in] rois      TensorInfo for the rois tensor, a 2D tensor of size [5, N] (where N is the number of ROIs). Data types supported: U16
+ * @param[in] output Destination tensor info. Data types supported: Same as @p input.
+ * @param[in] pool_info Contains pooling operation information described in @ref ROIPoolingLayerInfo.
+ *
+     * @note The x and y dimensions of @p output tensor must be the same as those specified by @p pool_info 's pooled
+ * width and pooled height.
+ * @note The z dimensions of @p output tensor and @p input tensor must be the same.
+ * @note The fourth dimension of @p output tensor must be the same as the number of elements in @p rois array.
+ * @return a Status
+ */
+ static Status validate(const ITensorInfo *input,
+ const ITensorInfo *rois,
+ const ITensorInfo *output,
+ const ROIPoolingLayerInfo &pool_info);
+
private:
- NEROIPoolingLayerKernel _roi_kernel;
+ std::unique_ptr<NEROIPoolingLayerKernel> _roi_kernel;
};
} // namespace arm_compute
#endif /* ARM_COMPUTE_NEROIPOOLINGLAYER_H */
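
Unlike most functions in this patch, the ROI pooling output is documented against pool_info, so this sketch shapes it explicitly as [pooled_w, pooled_h, input_z, num_rois]; all concrete values are illustrative assumptions:

#include "arm_compute/core/Types.h"
#include "arm_compute/runtime/NEON/functions/NEROIPoolingLayer.h"
#include "arm_compute/runtime/Tensor.h"

using namespace arm_compute;

int main()
{
    Tensor input{}, rois{}, output{};
    input.allocator()->init(TensorInfo(TensorShape(32U, 32U, 8U), 1, DataType::F32));
    rois.allocator()->init(TensorInfo(TensorShape(5U, 4U), 1, DataType::U16)); // 4 ROIs
    output.allocator()->init(TensorInfo(TensorShape(7U, 7U, 8U, 4U), 1, DataType::F32));

    NEROIPoolingLayer roi_pool{};
    roi_pool.configure(&input, &rois, &output, ROIPoolingLayerInfo(7U, 7U, 1.f / 16.f));

    input.allocator()->allocate();
    rois.allocator()->allocate();
    output.allocator()->allocate();
    roi_pool.run();
    return 0;
}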
diff --git a/arm_compute/runtime/NEON/functions/NERange.h b/arm_compute/runtime/NEON/functions/NERange.h
index 83ca625aa7..609456a4ef 100644
--- a/arm_compute/runtime/NEON/functions/NERange.h
+++ b/arm_compute/runtime/NEON/functions/NERange.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018-2019 ARM Limited.
+ * Copyright (c) 2018-2021 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -24,26 +24,55 @@
#ifndef ARM_COMPUTE_NERANGE_H
#define ARM_COMPUTE_NERANGE_H
-#include "arm_compute/core/NEON/kernels/NERangeKernel.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/runtime/IFunction.h"
+#include <memory>
+
namespace arm_compute
{
class ITensor;
+class ITensorInfo;
+class NERangeKernel;
/** Basic function to run @ref NERangeKernel
*
* @note The tensor data type for the output must be U8/S8/U16/S16/U32/S32/F16/F32.
 * @note The function generates a sequence with the given start, end and step.
+ *
*/
class NERange : public IFunction
{
public:
/** Default constructor */
NERange();
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ NERange(const NERange &) = delete;
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ NERange &operator=(const NERange &) = delete;
+ /** Prevent instances of this class from being moved (As this class contains non movable objects) */
+ NERange(NERange &&) = delete;
+ /** Prevent instances of this class from being moved (As this class contains non movable objects) */
+ NERange &operator=(NERange &&) = delete;
+ /** Default destructor */
+ ~NERange();
/** Initialize the kernel's start, end, step and output tensor.
*
+ * Valid data layouts:
+ * - All
+ *
+ * Valid data type configurations:
+ * |dst |
+ * |:---------|
+ * |U8 |
+ * |S8 |
+ * |U16 |
+ * |S16 |
+ * |U32 |
+ * |S32 |
+ * |F16 |
+ * |F32 |
+ *
* @param[out] output Output tensor. Data types supported: U8/S8/U16/S16/U32/S32/F16/F32.
* @param[in] start The starting value of the sequence.
* @param[in] end The ending (not including) value of the sequence.
@@ -65,7 +94,7 @@ public:
void run() override;
private:
- NERangeKernel _kernel;
+ std::unique_ptr<NERangeKernel> _kernel;
};
} // namespace arm_compute
#endif /* ARM_COMPUTE_NERANGE_H */
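
A sketch for the sequence generator; the output length must match the number of elements implied by (start, end, step), so (0, 10, 1) below requires a 10-element tensor:

#include "arm_compute/runtime/NEON/functions/NERange.h"
#include "arm_compute/runtime/Tensor.h"

using namespace arm_compute;

int main()
{
    Tensor out{};
    out.allocator()->init(TensorInfo(TensorShape(10U), 1, DataType::F32));

    NERange range{};
    range.configure(&out, 0.f /* start */, 10.f /* end, exclusive */, 1.f /* step */);

    out.allocator()->allocate();
    range.run(); // out now holds 0, 1, ..., 9
    return 0;
}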
diff --git a/arm_compute/runtime/NEON/functions/NEReduceMean.h b/arm_compute/runtime/NEON/functions/NEReduceMean.h
index 3c7cc21929..5b8d8cdf2b 100644
--- a/arm_compute/runtime/NEON/functions/NEReduceMean.h
+++ b/arm_compute/runtime/NEON/functions/NEReduceMean.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018-2020 ARM Limited.
+ * Copyright (c) 2018-2022 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -24,10 +24,8 @@
#ifndef ARM_COMPUTE_NEON_REDUCE_MEAN_H
#define ARM_COMPUTE_NEON_REDUCE_MEAN_H
-#include "arm_compute/runtime/IFunction.h"
-
-#include "arm_compute/core/NEON/kernels/NEFillBorderKernel.h"
#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/IFunction.h"
#include "arm_compute/runtime/MemoryGroup.h"
#include "arm_compute/runtime/NEON/functions/NEReductionOperation.h"
#include "arm_compute/runtime/NEON/functions/NEReshapeLayer.h"
@@ -41,8 +39,29 @@ class NEReduceMean : public IFunction
public:
/** Constructor */
NEReduceMean(std::shared_ptr<IMemoryManager> memory_manager = nullptr);
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ NEReduceMean(const NEReduceMean &) = delete;
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ NEReduceMean &operator=(const NEReduceMean &) = delete;
+ /** Prevent instances of this class from being moved (As this class contains non movable objects) */
+ NEReduceMean(NEReduceMean &&) = delete;
+ /** Prevent instances of this class from being moved (As this class contains non movable objects) */
+ NEReduceMean &operator=(NEReduceMean &&) = delete;
+ /** Default destructor */
+ ~NEReduceMean();
/** Configure kernel
*
+ * Valid data layouts:
+ * - All
+ *
+ * Valid data type configurations:
+ * |src |dst |
+ * |:--------------|:--------------|
+ * |QASYMM8 |QASYMM8 |
+ * |QASYMM8_SIGNED |QASYMM8_SIGNED |
+ * |F16 |F16 |
+ * |F32 |F32 |
+ *
* @note Supported tensor rank: up to 4
*
* @param[in] input Source tensor. Data type supported: QASYMM8_SIGNED/QASYMM8/F16/F32
@@ -61,7 +80,8 @@ public:
*
* @return A status
*/
- static Status validate(const ITensorInfo *input, const Coordinates &reduction_axis, bool keep_dims, const ITensorInfo *output);
+ static Status
+ validate(const ITensorInfo *input, const Coordinates &reduction_axis, bool keep_dims, const ITensorInfo *output);
// Inherited methods overridden:
void run() override;
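
A sketch of the reduce-mean call: Coordinates lists the axes to reduce (here width and height), and keep_dims retains them as size-1 dimensions. The shape is an illustrative assumption:

#include "arm_compute/runtime/NEON/functions/NEReduceMean.h"
#include "arm_compute/runtime/Tensor.h"

using namespace arm_compute;

int main()
{
    Tensor input{}, output{};
    input.allocator()->init(TensorInfo(TensorShape(8U, 8U, 4U), 1, DataType::F32));

    NEReduceMean reduce_mean{};
    reduce_mean.configure(&input, Coordinates(0, 1), true /* keep_dims */, &output);

    input.allocator()->allocate();
    output.allocator()->allocate(); // shape inferred as [1, 1, 4]
    reduce_mean.run();
    return 0;
}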
diff --git a/arm_compute/runtime/NEON/functions/NEReductionOperation.h b/arm_compute/runtime/NEON/functions/NEReductionOperation.h
index abda4159ba..f5391a6d0e 100644
--- a/arm_compute/runtime/NEON/functions/NEReductionOperation.h
+++ b/arm_compute/runtime/NEON/functions/NEReductionOperation.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2017-2021 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -25,20 +25,19 @@
#define ARM_COMPUTE_NEREDUCTIONOPERATION_H
#include "arm_compute/runtime/IFunction.h"
-
-#include "arm_compute/core/NEON/kernels/NEFillBorderKernel.h"
-#include "arm_compute/core/NEON/kernels/NEReductionOperationKernel.h"
-#include "arm_compute/core/NEON/kernels/NEReshapeLayerKernel.h"
-#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/NEON/functions/NEReshapeLayer.h"
#include "arm_compute/runtime/Tensor.h"
+#include <memory>
+
namespace arm_compute
{
class ITensor;
+class NEReductionOperationKernel;
-/** Basic function to simulate a reduction operation. This function calls the following NEON kernels:
+/** Basic function to simulate a reduction operation. This function calls the following kernels:
*
- * -# @ref NEFillBorderKernel
+ * -# @ref NEReshapeLayer
* -# @ref NEReductionOperationKernel
*
*/
@@ -47,19 +46,41 @@ class NEReductionOperation : public IFunction
public:
/** Default constructor */
NEReductionOperation(std::shared_ptr<IMemoryManager> memory_manager = nullptr);
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ NEReductionOperation(const NEReductionOperation &) = delete;
+ /** Default move constructor */
+ NEReductionOperation(NEReductionOperation &&) = default;
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ NEReductionOperation &operator=(const NEReductionOperation &) = delete;
+ /** Default move assignment operator */
+ NEReductionOperation &operator=(NEReductionOperation &&) = default;
+ /** Default destructor */
+ ~NEReductionOperation();
/** Set the input and output tensors.
*
- * @param[in] input Source tensor. Data type supported: QASYMM8_SIGNED/QASYMM8/F16/F32. Data layouts supported: NCHW. (Written to only for border_size != 0)
- * @param[out] output Destination tensor. Data types and data layouts supported: same as @p input.
- * @param[in] axis Dimension along which to reduce. Supported reduction axis : 0
- * @param[in] op Reduction operation to perform.
- * @param[in] keep_dims (Optional) Whether to keep the reduced dimension after the operation. Defaults to true.
+ * Valid data layouts:
+ * - All
+ *
+ * Valid data type configurations:
+ * |src |dst |
+ * |:--------------|:--------------|
+ * |QASYMM8 |QASYMM8 |
+ * |QASYMM8_SIGNED |QASYMM8_SIGNED |
+ * |F16 |F16 |
+ * |F32 |F32 |
+ * |S32 |S32 |
+ *
+ * @param[in, out] input Source tensor. Data type supported: QASYMM8_SIGNED/QASYMM8/F16/F32/S32. (Written to only for border_size != 0)
+ * @param[out] output Destination tensor. Data types and data layouts supported: same as @p input.
+ * @param[in] axis Dimension along which to reduce. Supported reduction axis : 0
+ * @param[in] op Reduction operation to perform.
+ * @param[in] keep_dims (Optional) Whether to keep the reduced dimension after the operation. Defaults to true.
*/
void configure(ITensor *input, ITensor *output, unsigned int axis, ReductionOperation op, bool keep_dims = true);
/** Static function to check if given info will lead to a valid configuration of @ref NEReductionOperation.
*
- * @param[in] input Source tensor info. Data type supported: QASYMM8_SIGNED/QASYMM8/F16/F32. Data layouts supported: NCHW. (Written to only for border_size != 0)
+ * @param[in] input Source tensor info. Data type supported: QASYMM8_SIGNED/QASYMM8/F16/F32/S32.
* @param[in] output Destination tensor info. Data types and data layouts supported: same as @p input.
* @param[in] axis Dimension along which to reduce. Supported reduction axis : 0
* @param[in] op Reduction operation to perform.
@@ -67,20 +88,23 @@ public:
*
* @return a status
*/
- static Status validate(const ITensorInfo *input, const ITensorInfo *output, unsigned int axis, ReductionOperation op, bool keep_dims = true);
+ static Status validate(const ITensorInfo *input,
+ const ITensorInfo *output,
+ unsigned int axis,
+ ReductionOperation op,
+ bool keep_dims = true);
// Inherited methods overridden:
void run() override;
private:
- MemoryGroup _memory_group;
- NEReductionOperationKernel _reduction_kernel;
- NEFillBorderKernel _fill_border_kernel;
- NEReshapeLayerKernel _reshape_kernel;
- Tensor _output_internal;
- size_t _window_split;
- int _reduction_axis;
- bool _is_reshape_required;
+ MemoryGroup _memory_group;
+ std::unique_ptr<NEReductionOperationKernel> _reduction_kernel;
+ NEReshapeLayer _reshape;
+ Tensor _output_internal;
+ size_t _window_split;
+ int _reduction_axis;
+ bool _is_reshape_required;
};
} // namespace arm_compute
#endif /* ARM_COMPUTE_NEREDUCTIONOPERATION_H */
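The reworked NEReductionOperation keeps the configure() entry point shown above; a minimal sketch, with shape, axis and operation chosen for illustration only:

    #include "arm_compute/runtime/NEON/functions/NEReductionOperation.h"
    #include "arm_compute/runtime/Tensor.h"

    using namespace arm_compute;

    void reduction_example()
    {
        Tensor src, dst;
        src.allocator()->init(TensorInfo(TensorShape(128U, 32U), 1, DataType::F32));

        NEReductionOperation reduction;
        // Axis 0 is the only documented reduction axis for this function
        reduction.configure(&src, &dst, /*axis=*/0, ReductionOperation::SUM, /*keep_dims=*/true);

        src.allocator()->allocate();
        dst.allocator()->allocate();
        reduction.run();
    }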
diff --git a/arm_compute/runtime/NEON/functions/NERemap.h b/arm_compute/runtime/NEON/functions/NERemap.h
deleted file mode 100644
index 05a7a8ffd6..0000000000
--- a/arm_compute/runtime/NEON/functions/NERemap.h
+++ /dev/null
@@ -1,60 +0,0 @@
-/*
- * Copyright (c) 2016-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_NEREMAP_H
-#define ARM_COMPUTE_NEREMAP_H
-
-#include "arm_compute/core/Types.h"
-#include "arm_compute/runtime/NEON/INESimpleFunction.h"
-#include "arm_compute/runtime/Tensor.h"
-
-#include <cstdint>
-
-namespace arm_compute
-{
-class ITensor;
-
-/** Basic function to execute remap. This function calls the following NEON kernels:
- *
- * -# @ref NEFillBorderKernel (executed if border_mode == CONSTANT or border_mode == REPLICATE)
- * -# @ref NERemapKernel
- */
-class NERemap : public INESimpleFunction
-{
-public:
- /** Initialise the function's sources, destination, interpolation policy and border mode.
- *
- * @param[in, out] input Source tensor. Data type supported: U8. (Written to only for @p border_mode != UNDEFINED)
- * @param[in] map_x Map for X coordinates. Data type supported: F32.
- * @param[in] map_y Map for Y coordinates. Data type supported: F32.
- * @param[out] output Output tensor. Data type supported: U8.
- * @param[in] policy Interpolation policy to use. Only NEAREST and BILINEAR are supported.
- * @param[in] border_mode Border mode to use on the input tensor.
- * @param[in] constant_border_value (Optional) Constant value to use for borders if border_mode is set to CONSTANT.
- *
- */
- void configure(ITensor *input, const ITensor *map_x, const ITensor *map_y, ITensor *output,
- InterpolationPolicy policy, BorderMode border_mode, uint8_t constant_border_value = 0);
-};
-}
-#endif /*ARM_COMPUTE_NEREMAP_H */
diff --git a/arm_compute/runtime/NEON/functions/NEReorderLayer.h b/arm_compute/runtime/NEON/functions/NEReorderLayer.h
new file mode 100644
index 0000000000..e3fa7b9c16
--- /dev/null
+++ b/arm_compute/runtime/NEON/functions/NEReorderLayer.h
@@ -0,0 +1,94 @@
+/*
+ * Copyright (c) 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#if defined(__aarch64__)
+
+#ifndef ACL_ARM_COMPUTE_RUNTIME_NEON_FUNCTIONS_NEREORDERLAYER
+#define ACL_ARM_COMPUTE_RUNTIME_NEON_FUNCTIONS_NEREORDERLAYER
+
+#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/IFunction.h"
+
+namespace arm_compute
+{
+class ITensor;
+class ITensorInfo;
+class NEReorderKernel;
+/** Function to compute blocked reorder. */
+class NEReorderLayer : public IFunction
+{
+public:
+ /** Default constructor */
+ NEReorderLayer();
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ NEReorderLayer(const NEReorderLayer &) = delete;
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ NEReorderLayer &operator=(const NEReorderLayer &) = delete;
+ /** Prevent instances of this class from being moved (As this class contains non movable objects) */
+ NEReorderLayer(NEReorderLayer &&) = delete;
+ /** Prevent instances of this class from being moved (As this class contains non movable objects) */
+ NEReorderLayer &operator=(NEReorderLayer &&) = delete;
+ /** Default destructor */
+ ~NEReorderLayer();
+ /** Set the input and output tensors.
+ *
+ * Valid data layouts:
+ * - NCHW
+ *
+ * Valid data type configurations:
+ * |src |dst |
+ * |:--------|:---------|
+ * |F32 |F32 |
+ *
+ * @param[in] input Source tensor. Data type supported: F32. Data layouts supported: NCHW.
+ * @param[out] output Destination tensor with the same dimensions, data type and data layout as @p input,
+ * except for the last dimension of the data layout, which must be a multiple of the blocking parameter ksize

+ * @param[in] input_wf WeightFormat of input.
+ * @param[in] output_wf WeightFormat of output.
+ */
+ void configure(const ITensor *input,
+ ITensor *output,
+ arm_compute::WeightFormat input_wf,
+ arm_compute::WeightFormat output_wf);
+
+ /** Static function to check if given info will lead to a valid configuration of @ref NEReorderLayer
+ *
+ * Similar to @ref NEReorderLayer::configure()
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *input,
+ const ITensorInfo *output,
+ arm_compute::WeightFormat input_wf,
+ arm_compute::WeightFormat output_wf);
+
+ // Inherited methods overridden:
+ void run() override;
+
+private:
+ std::unique_ptr<NEReorderKernel> _reorder_kernel; /**< Reorder layer kernel */
+};
+} // namespace arm_compute
+#endif /* ACL_ARM_COMPUTE_RUNTIME_NEON_FUNCTIONS_NEREORDERLAYER */
+
+#endif // defined(__aarch64__)
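A hedged usage sketch for the new NEReorderLayer; the weight formats, shapes and the assumed blocking factor of 4 are illustrative, not values mandated by the header:

    #if defined(__aarch64__)
    #include "arm_compute/runtime/NEON/functions/NEReorderLayer.h"
    #include "arm_compute/runtime/Tensor.h"

    using namespace arm_compute;

    void reorder_example()
    {
        Tensor src, dst;
        // Illustrative F32 weights; the blocked output dimension is padded up to the assumed block size of 4
        src.allocator()->init(TensorInfo(TensorShape(10U, 64U), 1, DataType::F32));
        dst.allocator()->init(TensorInfo(TensorShape(12U, 64U), 1, DataType::F32));

        NEReorderLayer reorder;
        if(NEReorderLayer::validate(src.info(), dst.info(), WeightFormat::OHWI, WeightFormat::OHWIo4).error_code() == ErrorCode::OK)
        {
            reorder.configure(&src, &dst, WeightFormat::OHWI, WeightFormat::OHWIo4);
            src.allocator()->allocate();
            dst.allocator()->allocate();
            reorder.run();
        }
    }
    #endif // defined(__aarch64__)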
diff --git a/arm_compute/runtime/NEON/functions/NEReorgLayer.h b/arm_compute/runtime/NEON/functions/NEReorgLayer.h
index 8ef7f8a1b2..0a7d824d10 100644
--- a/arm_compute/runtime/NEON/functions/NEReorgLayer.h
+++ b/arm_compute/runtime/NEON/functions/NEReorgLayer.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018-2019 ARM Limited.
+ * Copyright (c) 2018-2021 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -31,6 +31,7 @@ namespace arm_compute
{
// Forward declarations
class ITensor;
+class ITensorInfo;
/** Basic function to run @ref NEReorgLayerKernel */
class NEReorgLayer : public INESimpleFunctionNoBorder
@@ -38,6 +39,15 @@ class NEReorgLayer : public INESimpleFunctionNoBorder
public:
/** Initialise the kernel's inputs and outputs
*
+ * Valid data layouts:
+ * - NHWC
+ * - NCHW
+ *
+ * Valid data type configurations:
+ * |src |dst |
+ * |:--------------|:--------------|
+ * |All |All |
+ *
* @param[in] input First tensor input. Data type supported: All
* @param[out] output Output tensor. Data type supported: Same as @p input
* @param[in] stride Stride to be used during data re-organization
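Following the NEReorgLayer hunk, a minimal sketch of the documented inputs; the shape and stride are illustrative, and the int stride parameter type is an assumption:

    #include "arm_compute/runtime/NEON/functions/NEReorgLayer.h"
    #include "arm_compute/runtime/Tensor.h"

    using namespace arm_compute;

    void reorg_example()
    {
        Tensor src, dst;
        // Spatial dimensions must be divisible by the stride
        src.allocator()->init(TensorInfo(TensorShape(4U, 4U, 8U), 1, DataType::F32));

        NEReorgLayer reorg;
        reorg.configure(&src, &dst, /*stride=*/2); // moves 2x2 spatial blocks into channels

        src.allocator()->allocate();
        dst.allocator()->allocate();
        reorg.run();
    }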
diff --git a/arm_compute/runtime/NEON/functions/NEReshapeLayer.h b/arm_compute/runtime/NEON/functions/NEReshapeLayer.h
index d6643842d9..3e6e33f797 100644
--- a/arm_compute/runtime/NEON/functions/NEReshapeLayer.h
+++ b/arm_compute/runtime/NEON/functions/NEReshapeLayer.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2021 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -25,32 +25,61 @@
#define ARM_COMPUTE_NERESHAPELAYER_H
#include "arm_compute/core/Types.h"
-#include "arm_compute/runtime/NEON/INESimpleFunctionNoBorder.h"
+#include "arm_compute/runtime/IFunction.h"
+#include "arm_compute/runtime/NEON/INEOperator.h"
+#include "arm_compute/runtime/Types.h"
namespace arm_compute
{
// Forward declarations
class ITensor;
-/** Basic function to run @ref NEReshapeLayerKernel */
-class NEReshapeLayer : public INESimpleFunctionNoBorder
+/** Basic function to run @ref cpu::kernels::CpuReshapeKernel */
+class NEReshapeLayer : public IFunction
{
public:
+ /** Default Constructor */
+ NEReshapeLayer();
+ /** Default Destructor */
+ ~NEReshapeLayer();
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ NEReshapeLayer(const NEReshapeLayer &) = delete;
+ /** Default move constructor */
+ NEReshapeLayer(NEReshapeLayer &&);
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ NEReshapeLayer &operator=(const NEReshapeLayer &) = delete;
+ /** Default move assignment operator */
+ NEReshapeLayer &operator=(NEReshapeLayer &&);
/** Initialise the kernel's inputs and outputs
*
- * @param[in] input First tensor input. Data type supported: All
+ * Valid data layouts:
+ * - All
+ *
+ * Valid data type configurations:
+ * |src |dst |
+ * |:------|:------|
+ * |All |All |
+ *
+ * @param[in] input Input tensor. Data type supported: All
* @param[out] output Output tensor. Data type supported: Same as @p input
*/
void configure(const ITensor *input, ITensor *output);
/** Static function to check if given info will lead to a valid configuration of @ref NEReshapeLayer
*
- * @param[in] input First tensor info. Data type supported: All
+ * @param[in] input Input tensor info. Data type supported: All
* @param[in] output Output tensor info. Data type supported: Same as @p input
*
* @return a status
*/
static Status validate(const ITensorInfo *input, const ITensorInfo *output);
+
+ // Inherited methods overridden:
+ void run() override;
+
+private:
+ struct Impl;
+ std::unique_ptr<Impl> _impl;
};
} // namespace arm_compute
#endif /*ARM_COMPUTE_NERESHAPELAYER_H */
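NEReshapeLayer now routes through cpu::kernels::CpuReshapeKernel behind a pimpl, but the public usage is unchanged; a minimal sketch with illustrative shapes (element counts must match):

    #include "arm_compute/runtime/NEON/functions/NEReshapeLayer.h"
    #include "arm_compute/runtime/Tensor.h"

    using namespace arm_compute;

    void reshape_example()
    {
        Tensor src, dst;
        src.allocator()->init(TensorInfo(TensorShape(4U, 4U), 1, DataType::F32));
        dst.allocator()->init(TensorInfo(TensorShape(16U), 1, DataType::F32)); // same total element count

        NEReshapeLayer reshape;
        reshape.configure(&src, &dst);

        src.allocator()->allocate();
        dst.allocator()->allocate();
        reshape.run();
    }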
diff --git a/arm_compute/runtime/NEON/functions/NEReverse.h b/arm_compute/runtime/NEON/functions/NEReverse.h
index ab5a5d0869..e03e415068 100644
--- a/arm_compute/runtime/NEON/functions/NEReverse.h
+++ b/arm_compute/runtime/NEON/functions/NEReverse.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018-2019 ARM Limited.
+ * Copyright (c) 2018-2021, 2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -21,16 +21,16 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
-#ifndef ARM_COMPUTE_NEREVERSE_H
-#define ARM_COMPUTE_NEREVERSE_H
-
-#include "arm_compute/runtime/NEON/INESimpleFunctionNoBorder.h"
+#ifndef ACL_ARM_COMPUTE_RUNTIME_NEON_FUNCTIONS_NEREVERSE_H
+#define ACL_ARM_COMPUTE_RUNTIME_NEON_FUNCTIONS_NEREVERSE_H
#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/NEON/INESimpleFunctionNoBorder.h"
namespace arm_compute
{
class ITensor;
+class ITensorInfo;
/** Basic function to run @ref NEReverseKernel */
class NEReverse : public INESimpleFunctionNoBorder
@@ -38,20 +38,39 @@ class NEReverse : public INESimpleFunctionNoBorder
public:
/** Initialize the function
*
- * @param[in] input Input tensor. Data types supported: All
- * @param[out] output Output tensor. Data type supported: Same as @p input
- * @param[in] axis Axis tensor. Contains the indices of the dimensions to reverse. Data type supported: U32
+ * Valid data layouts:
+ * - All
+ *
+ * Valid data type configurations:
+ * |src0 |src1 |dst |
+ * |:--------------|:--------------|:--------------|
+ * |All |U32, S32 |All |
+ *
+ * @param[in] input Input tensor. Data types supported: All
+ * @param[out] output Output tensor. Data type supported: Same as @p input
+ * @param[in] axis Axis tensor. Contains the indices of the dimensions to reverse. Data type supported: U32/S32
+     * @param[in]  use_inverted_axis Reverse the ACL axis indices convention; if true, (inverted) axis = (tensor_rank - 1) - axis
+ *
+     * @note The value of each axis should be in the range [-rank, rank)
+     * @note If there are duplicate values in the tensor, the subsequent axis values are ignored, e.g. an axis array of [2, 2] has the same effect as [2].
+ *
+     * @deprecated Support for U32 in the axis tensor will be removed in the 24.02 release
+ *
*/
- void configure(const ITensor *input, ITensor *output, const ITensor *axis);
+ void configure(const ITensor *input, ITensor *output, const ITensor *axis, const bool use_inverted_axis = false);
/** Static function to check if given info will lead to a valid configuration of @ref NEReverseKernel
*
- * @param[in] input Input tensor info. Data types supported: All
- * @param[in] output Output tensor info. Data type supported: Same as @p input
- * @param[in] axis Axis tensor info. Contains the indices of the dimensions to reverse. Data type supported: U32
+ * @param[in] input Input tensor info. Data types supported: All
+ * @param[in] output Output tensor info. Data type supported: Same as @p input
+ * @param[in] axis Axis tensor info. Contains the indices of the dimensions to reverse. Data type supported: U32/S32
+     * @param[in] use_inverted_axis Reverse the ACL axis indices convention; if true, (inverted) axis = (tensor_rank - 1) - axis
*
* @return a status
*/
- static Status validate(const ITensorInfo *input, const ITensorInfo *output, const ITensorInfo *axis);
+ static Status validate(const ITensorInfo *input,
+ const ITensorInfo *output,
+ const ITensorInfo *axis,
+ const bool use_inverted_axis = false);
};
} // namespace arm_compute
-#endif /* ARM_COMPUTE_NEREVERSE_H */
+#endif // ACL_ARM_COMPUTE_RUNTIME_NEON_FUNCTIONS_NEREVERSE_H
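A sketch of the extended NEReverse interface above; an S32 axis tensor is used given the U32 deprecation note, and the shape and axis value are illustrative:

    #include "arm_compute/runtime/NEON/functions/NEReverse.h"
    #include "arm_compute/runtime/Tensor.h"

    using namespace arm_compute;

    void reverse_example()
    {
        Tensor src, dst, axis;
        src.allocator()->init(TensorInfo(TensorShape(8U, 4U), 1, DataType::F32));
        axis.allocator()->init(TensorInfo(TensorShape(1U), 1, DataType::S32)); // one axis index to reverse

        NEReverse reverse;
        reverse.configure(&src, &dst, &axis, /*use_inverted_axis=*/false);

        src.allocator()->allocate();
        dst.allocator()->allocate();
        axis.allocator()->allocate();
        // write the dimension index to reverse (e.g. 0) into the axis tensor before run()
        reverse.run();
    }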
diff --git a/arm_compute/runtime/NEON/functions/NEScale.h b/arm_compute/runtime/NEON/functions/NEScale.h
index 5350d0646c..72dfa3bda4 100644
--- a/arm_compute/runtime/NEON/functions/NEScale.h
+++ b/arm_compute/runtime/NEON/functions/NEScale.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2016-2020 ARM Limited.
+ * Copyright (c) 2016-2022 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -24,65 +24,60 @@
#ifndef ARM_COMPUTE_NESCALEIMAGE_H
#define ARM_COMPUTE_NESCALEIMAGE_H
-#include "arm_compute/core/NEON/kernels/NEFillBorderKernel.h"
-#include "arm_compute/core/NEON/kernels/NEScaleKernel.h"
+#include "arm_compute/core/KernelDescriptors.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/runtime/IFunction.h"
-#include "arm_compute/runtime/Tensor.h"
-#include <cstdint>
+#include <memory>
namespace arm_compute
{
class ITensor;
+class ITensorInfo;
-/** Basic function to run @ref NEScaleKernel */
+/** Basic function to compute Scale */
class NEScale : public IFunction
{
public:
- /** Constructor
- *
- * Initialize NEScale
- */
+ /** Constructor */
NEScale();
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ NEScale(const NEScale &) = delete;
+ /** Prevent instances of this class from being moved (As this class contains non movable objects) */
+ NEScale(NEScale &&) = delete;
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ NEScale &operator=(const NEScale &) = delete;
+ /** Prevent instances of this class from being moved (As this class contains non movable objects) */
+ NEScale &operator=(NEScale &&) = delete;
+ /** Destructor */
+ ~NEScale();
/** Initialize the function's source, destination, interpolation type and border_mode.
*
- * @param[in, out] input Source tensor. Data type supported: U8/S16/F16/F32. (Written to only for @p border_mode != UNDEFINED)
- * @param[out] output Destination tensor. Data type supported: Same as @p input. All but the lowest two dimensions must be the same size as in the input tensor, i.e. scaling is only performed within the XY-plane.
- * @param[in] policy The interpolation type.
- * @param[in] border_mode Strategy to use for borders.
- * @param[in] constant_border_value (Optional) Constant value to use for borders if border_mode is set to CONSTANT.
- * @param[in] sampling_policy (Optional) Sampling policy used by the interpolation. Defaults to @ref SamplingPolicy::CENTER
- * @param[in] use_padding (Optional) Is padding in use or not. Defaults to true.
- * @param[in] align_corners (Optional) Align corners of input and output, only affecting bilinear policy with TOP_LEFT sampling policy. Defaults to false.
- */
- void configure(ITensor *input, ITensor *output, InterpolationPolicy policy, BorderMode border_mode, PixelValue constant_border_value = PixelValue(),
- SamplingPolicy sampling_policy = SamplingPolicy::CENTER, bool use_padding = true, bool align_corners = false);
- /** Initialize the function's source, destination, interpolation type and border_mode.
+ * Valid data layouts:
+ * - NHWC
+ * - NCHW
+ *
+ * Valid data type configurations:
+ * |src |dst |
+ * |:--------------|:--------------|
+ * |QASYMM8 |QASYMM8 |
+ * |QASYMM8_SIGNED |QASYMM8_SIGNED |
+ * |F16 |F16 |
+ * |F32 |F32 |
+ * |U8 |U8 |
+ * |S8 |S8 |
+ * |S16 |S16 |
*
- * @param[in, out] input Source tensor. Data type supported: U8/S16/F16/F32. (Written to only for @p border_mode != UNDEFINED)
+ * @param[in, out] input Source tensor. Data type supported: QASYMM8/QASYMM8_SIGNED/U8/S8/S16/F16/F32. (Written to only for @p border_mode != UNDEFINED)
* @param[out] output Destination tensor. Data type supported: Same as @p input. All but the lowest two dimensions must be the same size as in the input tensor, i.e. scaling is only performed within the XY-plane.
* @param[in] info @ref ScaleKernelInfo to be used for configuration
- */
- void configure(ITensor *input, ITensor *output, const ScaleKernelInfo &info);
- /** Static function to check if given info will lead to a valid configuration of @ref NEScale
- *
- * @param[in] input Source tensor. Data type supported: U8/S16/F16/F32. (Written to only for @p border_mode != UNDEFINED)
- * @param[in] output Destination tensor. Data type supported: Same as @p input. All but the lowest two dimensions must be the same size as in the input tensor, i.e. scaling is only performed within the XY-plane.
- * @param[in] policy The interpolation type.
- * @param[in] border_mode Strategy to use for borders.
- * @param[in] constant_border_value (Optional) Constant value to use for borders if border_mode is set to CONSTANT.
- * @param[in] sampling_policy (Optional) Sampling policy used by the interpolation. Defaults to @ref SamplingPolicy::CENTER
- * @param[in] use_padding (Optional) Is padding in use or not. Defaults to true.
- * @param[in] align_corners (Optional) Align corners of input and output, only affecting bilinear policy with TOP_LEFT sampling policy. Defaults to false.
*
- * @return a status
+     * @note Using the S8 data type only supports NHWC layout, @p border_mode Replicate and @p policy Bilinear
*/
- static Status validate(const ITensorInfo *input, const ITensorInfo *output, InterpolationPolicy policy, BorderMode border_mode,
- PixelValue constant_border_value = PixelValue(), SamplingPolicy sampling_policy = SamplingPolicy::CENTER, bool use_padding = true, bool align_corners = false);
+ void configure(ITensor *input, ITensor *output, const ScaleKernelInfo &info);
/** Static function to check if given info will lead to a valid configuration of @ref NEScale
*
- * @param[in] input Source tensor. Data type supported: U8/S16/F16/F32. (Written to only for @p border_mode != UNDEFINED)
+ * @param[in] input Source tensor. Data type supported: QASYMM8/QASYMM8_SIGNED/U8/S8/S16/F16/F32. (Written to only for @p border_mode != UNDEFINED)
* @param[in] output Destination tensor. Data type supported: Same as @p input. All but the lowest two dimensions must be the same size as in the input tensor, i.e. scaling is only performed within the XY-plane.
* @param[in] info @ref ScaleKernelInfo to be used for validation
*
@@ -94,13 +89,8 @@ public:
void run() override;
private:
- Tensor _offsets; /**< Offset to access the element with NEAREST interpolation or the top-left element with BILINEAR interpolation in the input tensor */
- Tensor _dx; /**< Element's distance between the X real coordinate and the smallest X following integer */
- Tensor _dy; /**< Element's distance between the Y real coordinate and the smallest Y following integer */
- NEScaleKernel _scale_kernel; /**< Kernel to perform the scaling */
- NEFillBorderKernel _border_handler; /**< kernel to handle tensor borders */
- bool _use_padding; /**< Is padding used on the tensors */
- bool _align_corners; /**< Align corners of input and output */
+ struct Impl;
+ std::unique_ptr<Impl> _impl;
};
} // namespace arm_compute
#endif /*ARM_COMPUTE_NESCALEIMAGE_H */
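With the legacy overloads removed, NEScale is driven entirely through ScaleKernelInfo; a minimal sketch, where the policy, border mode and sizes are illustrative and the two-argument ScaleKernelInfo construction relies on its defaulted parameters:

    #include "arm_compute/core/KernelDescriptors.h"
    #include "arm_compute/runtime/NEON/functions/NEScale.h"
    #include "arm_compute/runtime/Tensor.h"

    using namespace arm_compute;

    void scale_example()
    {
        Tensor src, dst;
        src.allocator()->init(TensorInfo(TensorShape(32U, 32U, 3U), 1, DataType::F32));
        dst.allocator()->init(TensorInfo(TensorShape(64U, 64U, 3U), 1, DataType::F32)); // scaling happens in the XY-plane only

        NEScale scale;
        scale.configure(&src, &dst, ScaleKernelInfo{InterpolationPolicy::BILINEAR, BorderMode::REPLICATE});

        src.allocator()->allocate();
        dst.allocator()->allocate();
        scale.run();
    }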
diff --git a/arm_compute/runtime/NEON/functions/NEScharr3x3.h b/arm_compute/runtime/NEON/functions/NEScharr3x3.h
deleted file mode 100644
index 6091121e03..0000000000
--- a/arm_compute/runtime/NEON/functions/NEScharr3x3.h
+++ /dev/null
@@ -1,59 +0,0 @@
-/*
- * Copyright (c) 2016-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_NESCHARR3x3_H
-#define ARM_COMPUTE_NESCHARR3x3_H
-
-#include "arm_compute/core/Types.h"
-#include "arm_compute/runtime/NEON/INESimpleFunction.h"
-
-#include <cstdint>
-
-namespace arm_compute
-{
-class ITensor;
-
-/** Basic function to execute scharr 3x3 filter. This function calls the following NEON kernels:
- *
- * -# @ref NEFillBorderKernel (executed if border_mode == CONSTANT or border_mode == REPLICATE)
- * -# @ref NEScharr3x3Kernel
- *
- */
-class NEScharr3x3 : public INESimpleFunction
-{
-public:
- /** Initialise the function's source, destinations and border mode.
- *
- * @note At least one of output_x or output_y must be not NULL.
- *
- * @param[in, out] input Source tensor. Data type supported: U8. (Written to only for @p border_mode != UNDEFINED)
- * @param[out] output_x (optional) Destination for the Scharr 3x3 convolution along the X axis. Data type supported: S16.
- * @param[out] output_y (optional) Destination for the Scharr 3x3 convolution along the Y axis. Data type supported: S16.
- * @param[in] border_mode Border mode to use for the convolution.
- * @param[in] constant_border_value (Optional) Constant value to use for borders if border_mode is set to CONSTANT.
- *
- */
- void configure(ITensor *input, ITensor *output_x, ITensor *output_y, BorderMode border_mode, uint8_t constant_border_value = 0);
-};
-}
-#endif /*ARM_COMPUTE_NESCHARR3x3_H */
diff --git a/arm_compute/runtime/NEON/functions/NESelect.h b/arm_compute/runtime/NEON/functions/NESelect.h
index 6ac328080d..c8e5a204dd 100644
--- a/arm_compute/runtime/NEON/functions/NESelect.h
+++ b/arm_compute/runtime/NEON/functions/NESelect.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018-2019 ARM Limited.
+ * Copyright (c) 2018-2021 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -25,19 +25,28 @@
#define ARM_COMPUTE_NESELECT_H
#include "arm_compute/core/Types.h"
-#include "arm_compute/runtime/NEON/INESimpleFunction.h"
+#include "arm_compute/runtime/NEON/INESimpleFunctionNoBorder.h"
namespace arm_compute
{
// Forward declarations
class ITensor;
+class ITensorInfo;
/** Basic function to run @ref NESelect */
-class NESelect : public INESimpleFunction
+class NESelect : public INESimpleFunctionNoBorder
{
public:
/** Initialise the kernel's inputs and output.
*
+ * Valid data layouts:
+ * - All
+ *
+ * Valid data type configurations:
+ * |src0 |src1 |src2 |dst |
+ * |:--------------|:--------------|:------|:--------------|
+ * |U8 |All |All |All |
+ *
* @param[in] c Condition input tensor. Data types supported: U8.
* @param[in] x First input tensor. Data types supported: All.
* @param[in] y Second input tensor. Data types supported: Same as @p x
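A minimal sketch of the NESelect inputs documented above; the shapes are illustrative and the configure() parameter order is inferred from the parameter docs:

    #include "arm_compute/runtime/NEON/functions/NESelect.h"
    #include "arm_compute/runtime/Tensor.h"

    using namespace arm_compute;

    void select_example()
    {
        Tensor c, x, y, dst;
        c.allocator()->init(TensorInfo(TensorShape(16U), 1, DataType::U8)); // condition mask
        x.allocator()->init(TensorInfo(TensorShape(16U), 1, DataType::F32));
        y.allocator()->init(TensorInfo(TensorShape(16U), 1, DataType::F32));

        NESelect select;
        select.configure(&c, &x, &y, &dst); // dst[i] = c[i] ? x[i] : y[i]

        c.allocator()->allocate();
        x.allocator()->allocate();
        y.allocator()->allocate();
        dst.allocator()->allocate();
        select.run();
    }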
diff --git a/arm_compute/runtime/NEON/functions/NESimpleAssemblyFunction.h b/arm_compute/runtime/NEON/functions/NESimpleAssemblyFunction.h
deleted file mode 100644
index a162b6513b..0000000000
--- a/arm_compute/runtime/NEON/functions/NESimpleAssemblyFunction.h
+++ /dev/null
@@ -1,56 +0,0 @@
-/*
- * Copyright (c) 2018-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_NESIMPLEASSEMBLYFUNCTION_H
-#define ARM_COMPUTE_NESIMPLEASSEMBLYFUNCTION_H
-
-#include "arm_compute/core/NEON/kernels/assembly/INEGEMMWrapperKernel.h"
-#include "arm_compute/runtime/IFunction.h"
-
-#include <memory>
-
-namespace arm_compute
-{
-/** Basic interface for functions which have a single NEON GEMM wrapper kernel to run */
-class NESimpleAssemblyFunction : public IFunction
-{
-public:
- /** Constructor */
- NESimpleAssemblyFunction();
-
- /** Configure the function with the kernel to run
- *
- * @param[in] kernel GEMM Wrapper kernel configured and ready to run
- *
- * @note The kernel is expected to have a 1D window. The function will multi-thread this window across the X dimension.
- */
- void configure(std::unique_ptr<INEGEMMWrapperKernel> kernel);
-
- // Inherited methods overridden:
- void run() override final;
-
-protected:
- std::unique_ptr<INEGEMMWrapperKernel> _kernel; /**< Kernel to run */
-};
-} //namespace arm_compute
-#endif /*ARM_COMPUTE_NESIMPLEASSEMBLYFUNCTION_H */
diff --git a/arm_compute/runtime/NEON/functions/NESlice.h b/arm_compute/runtime/NEON/functions/NESlice.h
index 834ec27a33..70a688d3b0 100644
--- a/arm_compute/runtime/NEON/functions/NESlice.h
+++ b/arm_compute/runtime/NEON/functions/NESlice.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018-2019 ARM Limited.
+ * Copyright (c) 2018-2021 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -24,7 +24,8 @@
#ifndef ARM_COMPUTE_NE_SLICE_H
#define ARM_COMPUTE_NE_SLICE_H
-#include "arm_compute/runtime/NEON/INESimpleFunctionNoBorder.h"
+#include "arm_compute/runtime/IFunction.h"
+#include "arm_compute/runtime/NEON/INEOperator.h"
namespace arm_compute
{
@@ -32,11 +33,32 @@ namespace arm_compute
class ITensor;
/** Basic function to perform tensor slicing */
-class NESlice : public INESimpleFunctionNoBorder
+class NESlice : public IFunction
{
public:
+ /** Default Constructor */
+ NESlice();
+ /** Default Destructor */
+ ~NESlice();
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ NESlice(const NESlice &) = delete;
+ /** Default move constructor */
+ NESlice(NESlice &&);
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ NESlice &operator=(const NESlice &) = delete;
+ /** Default move assignment operator */
+ NESlice &operator=(NESlice &&);
+
/** Configure kernel
*
+ * Valid data layouts:
+ * - All
+ *
+ * Valid data type configurations:
+ * |src |dst |
+ * |:------|:------|
+ * |All |All |
+ *
* @note Supported tensor rank: up to 4
* @note Start indices must be non-negative. 0 <= starts[i]
* @note End coordinates can be negative, which represents the number of elements before the end of that dimension.
@@ -63,7 +85,54 @@ public:
*
* @return A status
*/
- static Status validate(const ITensorInfo *input, const ITensorInfo *output, const Coordinates &starts, const Coordinates &ends);
+ static Status
+ validate(const ITensorInfo *input, const ITensorInfo *output, const Coordinates &starts, const Coordinates &ends);
+
+ // Inherited methods overridden:
+ void run() override;
+
+private:
+ struct Impl;
+ std::unique_ptr<Impl> _impl;
+};
+
+namespace experimental
+{
+/** Basic function to perform tensor slicing */
+class NESlice : public INEOperator
+{
+public:
+ /** Configure kernel
+ *
+ * @note Supported tensor rank: up to 4
+ * @note Start indices must be non-negative. 0 <= starts[i]
+ * @note End coordinates can be negative, which represents the number of elements before the end of that dimension.
+ * @note End indices are not inclusive unless negative.
+ *
+ * @param[in] input Source tensor info. Data type supported: All
+ * @param[out] output Destination tensor info. Data type supported: Same as @p input
+ * @param[in] starts The starts of the dimensions of the input tensor to be sliced. The length must be of rank(input).
+ * @param[in] ends The ends of the dimensions of the input tensor to be sliced. The length must be of rank(input).
+ */
+ void configure(const ITensorInfo *input, ITensorInfo *output, const Coordinates &starts, const Coordinates &ends);
+
+ /** Static function to check if given info will lead to a valid configuration of @ref NESlice
+ *
+ * @note Supported tensor rank: up to 4
+ * @note Start indices must be non-negative. 0 <= starts[i]
+ * @note End coordinates can be negative, which represents the number of elements before the end of that dimension.
+ * @note End indices are not inclusive unless negative.
+ *
+ * @param[in] input Source tensor info. Data type supported: All
+ * @param[in] output Destination tensor info. Data type supported: Same as @p input
+ * @param[in] starts The starts of the dimensions of the input tensor to be sliced. The length must be of rank(input).
+ * @param[in] ends The ends of the dimensions of the input tensor to be sliced. The length must be of rank(input).
+ *
+ * @return A status
+ */
+ static Status
+ validate(const ITensorInfo *input, const ITensorInfo *output, const Coordinates &starts, const Coordinates &ends);
};
+} // namespace experimental
} // namespace arm_compute
#endif /* ARM_COMPUTE_NE_SLICE_H */
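Both the function-level NESlice and the new experimental operator share the start/end semantics spelled out above; a minimal sketch of the function-level API, with shape and coordinates chosen for illustration:

    #include "arm_compute/runtime/NEON/functions/NESlice.h"
    #include "arm_compute/runtime/Tensor.h"

    using namespace arm_compute;

    void slice_example()
    {
        Tensor src, dst;
        src.allocator()->init(TensorInfo(TensorShape(8U, 8U), 1, DataType::F32));

        NESlice slice;
        const Coordinates starts(1, 1);
        const Coordinates ends(5, 5); // end indices are exclusive unless negative
        slice.configure(&src, &dst, starts, ends);

        src.allocator()->allocate();
        dst.allocator()->allocate();
        slice.run();
    }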
diff --git a/arm_compute/runtime/NEON/functions/NESobel3x3.h b/arm_compute/runtime/NEON/functions/NESobel3x3.h
deleted file mode 100644
index 0cd633ec3a..0000000000
--- a/arm_compute/runtime/NEON/functions/NESobel3x3.h
+++ /dev/null
@@ -1,59 +0,0 @@
-/*
- * Copyright (c) 2016-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_NESOBEL3x3_H
-#define ARM_COMPUTE_NESOBEL3x3_H
-
-#include "arm_compute/core/Types.h"
-#include "arm_compute/runtime/NEON/INESimpleFunction.h"
-
-#include <cstdint>
-
-namespace arm_compute
-{
-class ITensor;
-
-/** Basic function to execute sobel 3x3 filter. This function calls the following NEON kernels:
- *
- * -# @ref NEFillBorderKernel (executed if border_mode == CONSTANT or border_mode == REPLICATE)
- * -# @ref NESobel3x3Kernel
- *
- */
-class NESobel3x3 : public INESimpleFunction
-{
-public:
- /** Initialise the function's source, destinations and border mode.
- *
- * @note At least one of output_x or output_y must be not NULL.
- *
- * @param[in, out] input Source tensor. Data type supported: U8. (Written to only for @p border_mode != UNDEFINED)
- * @param[out] output_x (optional) Destination for the Sobel 3x3 convolution along the X axis. Data type supported: S16.
- * @param[out] output_y (optional) Destination for the Sobel 3x3 convolution along the Y axis. Data type supported: S16.
- * @param[in] border_mode Border mode to use for the convolution.
- * @param[in] constant_border_value (Optional) Constant value to use for borders if border_mode is set to CONSTANT.
- *
- */
- void configure(ITensor *input, ITensor *output_x, ITensor *output_y, BorderMode border_mode, uint8_t constant_border_value = 0);
-};
-}
-#endif /*ARM_COMPUTE_NESOBEL3x3_H */
diff --git a/arm_compute/runtime/NEON/functions/NESobel5x5.h b/arm_compute/runtime/NEON/functions/NESobel5x5.h
deleted file mode 100644
index af52292359..0000000000
--- a/arm_compute/runtime/NEON/functions/NESobel5x5.h
+++ /dev/null
@@ -1,79 +0,0 @@
-/*
- * Copyright (c) 2016-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_NESOBEL5x5_H
-#define ARM_COMPUTE_NESOBEL5x5_H
-
-#include "arm_compute/core/NEON/kernels/NEFillBorderKernel.h"
-#include "arm_compute/core/NEON/kernels/NESobel5x5Kernel.h"
-#include "arm_compute/core/Types.h"
-#include "arm_compute/runtime/IFunction.h"
-#include "arm_compute/runtime/IMemoryManager.h"
-#include "arm_compute/runtime/MemoryGroup.h"
-#include "arm_compute/runtime/Tensor.h"
-
-#include <cstdint>
-#include <memory>
-
-namespace arm_compute
-{
-class ITensor;
-
-/** Basic function to execute sobel 5x5 filter. This function calls the following NEON kernels:
- *
- * -# @ref NEFillBorderKernel (executed if border_mode == CONSTANT or border_mode == REPLICATE)
- * -# @ref NESobel5x5HorKernel
- * -# @ref NESobel5x5VertKernel
- *
- */
-class NESobel5x5 : public IFunction
-{
-public:
- /** Default constructor */
- NESobel5x5(std::shared_ptr<IMemoryManager> memory_manager = nullptr);
- /** Initialise the function's source, destinations and border mode.
- *
- * @note At least one of output_x or output_y must be not NULL.
- *
- * @param[in, out] input Source tensor. Data type supported: U8. (Written to only for @p border_mode != UNDEFINED)
- * @param[out] output_x (optional) Destination for the Sobel 5x5 convolution along the X axis. Data type supported: S16.
- * @param[out] output_y (optional) Destination for the Sobel 5x5 convolution along the Y axis. Data type supported: S16.
- * @param[in] border_mode Border mode to use for the convolution.
- * @param[in] constant_border_value (Optional) Constant value to use for borders if border_mode is set to CONSTANT.
- *
- */
- void configure(ITensor *input, ITensor *output_x, ITensor *output_y, BorderMode border_mode, uint8_t constant_border_value = 0);
-
- // Inherited methods overridden:
- void run() override;
-
-protected:
- MemoryGroup _memory_group; /**< Function memory group */
- NESobel5x5HorKernel _sobel_hor; /**< Sobel Horizontal 5x5 kernel */
- NESobel5x5VertKernel _sobel_vert; /**< Sobel Vertical 5x5 kernel */
- Tensor _tmp_x; /**< Temporary buffer for Sobel X */
- Tensor _tmp_y; /**< Temporary buffer for Sobel Y */
- NEFillBorderKernel _border_handler; /**< Kernel to handle tensor borders */
-};
-}
-#endif /*ARM_COMPUTE_NESOBEL5x5_H */
diff --git a/arm_compute/runtime/NEON/functions/NESobel7x7.h b/arm_compute/runtime/NEON/functions/NESobel7x7.h
deleted file mode 100644
index e9098880f5..0000000000
--- a/arm_compute/runtime/NEON/functions/NESobel7x7.h
+++ /dev/null
@@ -1,79 +0,0 @@
-/*
- * Copyright (c) 2016-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_NESOBEL7x7_H
-#define ARM_COMPUTE_NESOBEL7x7_H
-
-#include "arm_compute/core/NEON/kernels/NEFillBorderKernel.h"
-#include "arm_compute/core/NEON/kernels/NESobel7x7Kernel.h"
-#include "arm_compute/core/Types.h"
-#include "arm_compute/runtime/IFunction.h"
-#include "arm_compute/runtime/IMemoryManager.h"
-#include "arm_compute/runtime/MemoryGroup.h"
-#include "arm_compute/runtime/Tensor.h"
-
-#include <cstdint>
-#include <memory>
-
-namespace arm_compute
-{
-class ITensor;
-
-/** Basic function to execute sobel 7x7 filter. This function calls the following NEON kernels:
- *
- * -# @ref NEFillBorderKernel (executed if border_mode == CONSTANT or border_mode == REPLICATE)
- * -# @ref NESobel7x7HorKernel
- * -# @ref NESobel7x7VertKernel
- *
- */
-class NESobel7x7 : public IFunction
-{
-public:
- /** Default constructor */
- NESobel7x7(std::shared_ptr<IMemoryManager> memory_manager = nullptr);
- /** Initialise the function's source, destinations and border mode.
- *
- * @note At least one of output_x or output_y must be not NULL.
- *
- * @param[in, out] input Source tensor. Data type supported: U8. (Written to only for @p border_mode != UNDEFINED)
- * @param[out] output_x (optional) Destination for the Sobel 7x7 convolution along the X axis. Data type supported: S32.
- * @param[out] output_y (optional) Destination for the Sobel 7x7 convolution along the Y axis. Data type supported: S32.
- * @param[in] border_mode Border mode to use for the convolution.
- * @param[in] constant_border_value (Optional) Constant value to use for borders if border_mode is set to CONSTANT.
- *
- */
- void configure(ITensor *input, ITensor *output_x, ITensor *output_y, BorderMode border_mode, uint8_t constant_border_value = 0);
-
- // Inherited methods overridden:
- void run() override;
-
-protected:
- MemoryGroup _memory_group; /**< Function memory group */
- NESobel7x7HorKernel _sobel_hor; /**< Sobel Horizontal 7x7 kernel */
- NESobel7x7VertKernel _sobel_vert; /**< Sobel Vertical 7x7 kernel */
- Tensor _tmp_x; /**< Temporary buffer for Sobel X */
- Tensor _tmp_y; /**< Temporary buffer for Sobel Y */
- NEFillBorderKernel _border_handler; /**< Kernel to handle tensor borders */
-};
-}
-#endif /*ARM_COMPUTE_NESOBEL7x7_H */
diff --git a/arm_compute/runtime/NEON/functions/NESoftmaxLayer.h b/arm_compute/runtime/NEON/functions/NESoftmaxLayer.h
index b80ceaf25c..1787de6237 100644
--- a/arm_compute/runtime/NEON/functions/NESoftmaxLayer.h
+++ b/arm_compute/runtime/NEON/functions/NESoftmaxLayer.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2017-2021 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -24,31 +24,18 @@
#ifndef ARM_COMPUTE_NESOFTMAXLAYER_H
#define ARM_COMPUTE_NESOFTMAXLAYER_H
-#include "arm_compute/core/NEON/kernels/NEFillBorderKernel.h"
-#include "arm_compute/core/NEON/kernels/NEFlattenLayerKernel.h"
-#include "arm_compute/core/NEON/kernels/NEReshapeLayerKernel.h"
-#include "arm_compute/core/NEON/kernels/NESoftmaxLayerKernel.h"
+#include "arm_compute/core/Error.h"
#include "arm_compute/runtime/IFunction.h"
-#include "arm_compute/runtime/MemoryGroup.h"
-#include "arm_compute/runtime/Tensor.h"
+#include "arm_compute/runtime/IMemoryManager.h"
+
+#include <memory>
namespace arm_compute
{
class ITensor;
+class ITensorInfo;
-/** Basic function to compute a SoftmaxLayer and a Log SoftmaxLayer.
- *
- * Softmax is calculated by :
- * @f[ out = \frac{e^{x - max(x)}}{\sum{e^{x - max(x)}}} @f]
- *
- * Log Softmax is calculated by :
- * @f[ out = (x - max(x)) - \sum{e^{x - max(x)}} @f]
- *
- * This function runs the following kernels:
- * -# @ref NEFillBorderKernel
- * -# @ref NELogits1DMaxKernel
- * -# @ref NELogits1DSoftmaxKernel
- */
+/** Basic function to compute a SoftmaxLayer and a Log SoftmaxLayer. */
template <bool IS_LOG = false>
class NESoftmaxLayerGeneric : public IFunction
{
@@ -58,69 +45,53 @@ public:
/** Prevent instances of this class from being copied (As this class contains pointers) */
NESoftmaxLayerGeneric(const NESoftmaxLayerGeneric &) = delete;
/** Default move constructor */
- NESoftmaxLayerGeneric(NESoftmaxLayerGeneric &&) = default;
+ NESoftmaxLayerGeneric(NESoftmaxLayerGeneric &&);
/** Prevent instances of this class from being copied (As this class contains pointers) */
NESoftmaxLayerGeneric &operator=(const NESoftmaxLayerGeneric &) = delete;
/** Default move assignment operator */
- NESoftmaxLayerGeneric &operator=(NESoftmaxLayerGeneric &&) = default;
+ NESoftmaxLayerGeneric &operator=(NESoftmaxLayerGeneric &&);
+ /** Default destructor */
+ ~NESoftmaxLayerGeneric();
/** Set the input and output tensors.
*
+ * Valid data layouts:
+ * - All
+ *
+ * Valid data type configurations:
+ * |src |dst |
+ * |:--------------|:--------------|
+ * |QASYMM8 |QASYMM8 |
+ * |QASYMM8_SIGNED |QASYMM8_SIGNED |
+ * |F16 |F16 |
+ * |F32 |F32 |
+ *
* @param[in,out] input Source tensor. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32. If the width is not a
- * multiple of the internal processing block size, @ref NEFillBorderKernel replicates the
+ * multiple of the internal processing block size, @ref NEFillBorder replicates the
* last value of each row to the nearest multiple.
* @param[out] output Destination tensor. Data types supported: same as @p input.
* @param[in] beta (Optional) A scaling factor for the exponent.
- * @param[in] axis (Optional) Reduction axis. Defaults to -1.
- * Negative index is used to specify axis from the end (e.g. -1 for the last axis).Must be in range [-input_num_dimensions, input_num_dimensions).
- * It has the purpose of squashing the first @p axis dimensions together. For instance, given a [4x4x4x4] image,
- * when @p axis is 2, the Softmax reduction will be applied on each of the [4x4] planes of the input image.
+ * @param[in] axis (Optional) The dimension in which to apply the function. E.g. for input of shape 4x5x6 and
+ * axis=1, softmax will be applied to 4x6=24 vectors of size 5. Defaults to 0
*/
- void configure(ITensor *input, ITensor *output, float beta = 1.0f, int32_t axis = -1);
+ void configure(ITensor *input, ITensor *output, float beta = 1.0f, int32_t axis = 0);
/** Static function to check if given info will lead to a valid configuration of @ref NESoftmaxLayer
*
* @param[in] input Source tensor info. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32.
* @param[in] output Destination tensor info. Data types supported: same as @p input
* @param[in] beta (Optional) A scaling factor for the exponent.
- * @param[in] axis (Optional) Reduction axis. Defaults to -1.
- * Negative index is used to specify axis from the end (e.g. -1 for the last axis).Must be in range [-input_num_dimensions, input_num_dimensions).
- * It has the purpose of squashing the first @p axis dimensions together. For instance, given a [4x4x4x4] image,
- * when @p axis is 2, the Softmax reduction will be applied on each of the [4x4] planes of the input image.
+ * @param[in] axis (Optional) The dimension in which to apply the function. E.g. for input of shape 4x5x6 and
+ * axis=1, softmax will be applied to 4x6=24 vectors of size 5. Defaults to 0
*
* @return a status
*/
- static Status validate(const ITensorInfo *input, const ITensorInfo *output, float beta = 1.0f, int32_t axis = -1);
+ static Status validate(const ITensorInfo *input, const ITensorInfo *output, float beta = 1.0f, int32_t axis = 0);
// Inherited methods overridden:
void run() override;
private:
- /** Utility method to configure the kernels needed to flatten the input
- * tensor.
- *
- * @note This function changes the internal state of this class. In particular,
- * it initializes the kernel @p _flatten_kernel and the tensors @p _input_flat and
- * @p _output_flat
- *
- * @param[in] input Original source tensor.
- * @param[in] output Original destination tensor.
- * @param[in] axis (Optional) Reduction axis. Defaults to -1.
- * Negative index is used to specify axis from the end (e.g. -1 for the last axis).Must be in range [-input_num_dimensions, input_num_dimensions).
- * It has the purpose of squashing the first @p axis dimensions together. For instance, given a [4x4x4x4] image,
- * when @p axis is 2, the Softmax reduction will be applied on each of the [4x4] planes of the input image.
- */
- void configure_reshape_input_kernel(const ITensor *input, const ITensor *output, int32_t axis);
-
- MemoryGroup _memory_group;
- NELogits1DMaxKernel _max_kernel;
- NELogits1DSoftmaxKernel<IS_LOG> _softmax_kernel;
- std::unique_ptr<INEKernel> _flat_or_reshape_kernel_ptr;
- NEFillBorderKernel _fill_border_kernel;
- NEReshapeLayerKernel _reshape_kernel;
- Tensor _max;
- Tensor _tmp;
- Tensor _input_flattened;
- Tensor _output_flattened;
- bool _needs_flattening;
+ struct Impl;
+ std::unique_ptr<Impl> _impl;
};
using NESoftmaxLayer = NESoftmaxLayerGeneric<false>;
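The axis semantics change here from a squashing reduction axis (default -1) to the single dimension the function is applied along (default 0); a minimal sketch of the new default, with an illustrative shape:

    #include "arm_compute/runtime/NEON/functions/NESoftmaxLayer.h"
    #include "arm_compute/runtime/Tensor.h"

    using namespace arm_compute;

    void softmax_example()
    {
        Tensor src, dst;
        src.allocator()->init(TensorInfo(TensorShape(5U, 4U, 6U), 1, DataType::F32));

        NESoftmaxLayer softmax;
        // axis = 0: softmax over each of the 4 * 6 = 24 vectors of length 5
        softmax.configure(&src, &dst, /*beta=*/1.0f, /*axis=*/0);

        src.allocator()->allocate();
        dst.allocator()->allocate();
        softmax.run();
    }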
diff --git a/arm_compute/runtime/NEON/functions/NESpaceToBatchLayer.h b/arm_compute/runtime/NEON/functions/NESpaceToBatchLayer.h
index 75fa50c1b0..5dee61a4a8 100644
--- a/arm_compute/runtime/NEON/functions/NESpaceToBatchLayer.h
+++ b/arm_compute/runtime/NEON/functions/NESpaceToBatchLayer.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019 ARM Limited.
+ * Copyright (c) 2019-2021 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -24,19 +24,21 @@
#ifndef ARM_COMPUTE_NESPACETOBATCHLAYER_H
#define ARM_COMPUTE_NESPACETOBATCHLAYER_H
+#include "arm_compute/core/Types.h"
#include "arm_compute/runtime/IFunction.h"
-#include "arm_compute/core/NEON/kernels/NEMemsetKernel.h"
-#include "arm_compute/core/NEON/kernels/NESpaceToBatchLayerKernel.h"
-#include "arm_compute/core/Types.h"
+#include <memory>
namespace arm_compute
{
class ITensor;
+class ITensorInfo;
+class NESpaceToBatchLayerKernel;
+class NEFill;
-/** Basic function to spatial divide a tensor. This function calls the following NEON kernels/functions:
+/** Basic function to spatially divide a tensor. This function calls the following kernels/functions:
*
- * -# @ref NEMemsetKernel
+ * -# @ref NEFill
* -# @ref NESpaceToBatchLayerKernel
*/
class NESpaceToBatchLayer : public IFunction
@@ -53,12 +55,21 @@ public:
/** Allow instances of this class to be moved */
NESpaceToBatchLayer &operator=(NESpaceToBatchLayer &&) = default;
/** Default destructor */
- virtual ~NESpaceToBatchLayer() = default;
+ ~NESpaceToBatchLayer();
/** Set the input and output tensors.
*
+ * Valid data layouts:
+ * - NHWC
+ * - NCHW
+ *
+ * Valid data type configurations:
+ * |src0 |src1 |src2 |dst |
+ * |:---------|:---------|:---------|:---------|
+ * |All |S32 |S32 |All |
+ *
* @param[in] input Tensor input. Supported tensor rank: 4. Data types supported: All.
- * @param[in] block_shape 1-D tensor with shape [M]. Data types supported: S32
- * @param[in] paddings 2-D tensor with shape [2, M]. Data types supported: S32
+ * @param[in] block_shape 1-D tensor with shape [M]. Supported M: 2. Data types supported: S32
+     * @param[in]  paddings    2-D tensor with shape [2, M] (first dimension is the fastest-changing dimension). Supported M: 2. Data types supported: S32
* @param[out] output Tensor output. Data types supported: same as @p input
*/
void configure(const ITensor *input, const ITensor *block_shape, const ITensor *paddings, ITensor *output);
@@ -67,41 +78,54 @@ public:
* @param[in] input Tensor input. Supported tensor rank: 4. Data types supported: All.
* @param[in] block_shape_x Block shape x value.
* @param[in] block_shape_y Block shape y value.
- * @param[in] padding_left The left padding of the output tensor.
- * @param[in] padding_right The right padding of the output tensor.
+ * @param[in] padding_left The padding at the beginning of every dimension of the output tensor.
+ * @param[in] padding_right The padding at the end of every dimension of the output tensor.
* @param[out] output Tensor output. Data types supported: same as @p input
*/
- void configure(const ITensor *input, const int block_shape_x, const int block_shape_y, const Size2D &padding_left, const Size2D &padding_right, ITensor *output);
+ void configure(const ITensor *input,
+ const int block_shape_x,
+ const int block_shape_y,
+ const Size2D &padding_left,
+ const Size2D &padding_right,
+ ITensor *output);
/** Static function to check if given info will lead to a valid configuration of @ref NESpaceToBatchLayer
*
* @param[in] input Tensor input info. Supported tensor rank: 4. Data types supported: All.
- * @param[in] block_shape block shape tensor info with shape [M]. Data types supported: S32
- * @param[in] paddings paddings tensor info with shape [2, M]. Data types supported: S32
+ * @param[in] block_shape 1-D tensor with shape [M]. Supported M: 2. Data types supported: S32
+     * @param[in] paddings    2-D tensor with shape [2, M] (first dimension is the fastest-changing dimension). Supported M: 2. Data types supported: S32
* @param[in] output Tensor output info. Data types supported: same as @p input
*
* @return a status
*/
- static Status validate(const ITensorInfo *input, const ITensorInfo *block_shape, const ITensorInfo *paddings, const ITensorInfo *output);
+ static Status validate(const ITensorInfo *input,
+ const ITensorInfo *block_shape,
+ const ITensorInfo *paddings,
+ const ITensorInfo *output);
/** Static function to check if given info will lead to a valid configuration of @ref NESpaceToBatchLayer (Static block shape and paddings)
*
* @param[in] input Tensor input info. Supported tensor rank: 4. Data types supported: All.
* @param[in] block_shape_x Block shape x value.
* @param[in] block_shape_y Block shape y value.
- * @param[in] padding_left The left padding of the output tensor.
- * @param[in] padding_right The right padding of the output tensor.
+ * @param[in] padding_left The padding at the beginning of every dimension of the output tensor.
+ * @param[in] padding_right The padding at the end of every dimension of the output tensor.
* @param[in] output Tensor output info. Data types supported: same as @p input
*
* @return a status
*/
- static Status validate(const ITensorInfo *input, const int block_shape_x, const int block_shape_y, const Size2D &padding_left, const Size2D &padding_right, const ITensorInfo *output);
+ static Status validate(const ITensorInfo *input,
+ const int block_shape_x,
+ const int block_shape_y,
+ const Size2D &padding_left,
+ const Size2D &padding_right,
+ const ITensorInfo *output);
// Inherited methods overridden:
void run() override;
private:
- NESpaceToBatchLayerKernel _space_to_batch_kernel; /**< SpaceToBatch kernel to run */
- NEMemsetKernel _memset_kernel; /**< Memset kernel to run */
- bool _has_padding; /**< Flag to check if the output has padding */
+ std::unique_ptr<NESpaceToBatchLayerKernel> _space_to_batch_kernel; /**< SpaceToBatch kernel to run */
+ std::unique_ptr<NEFill> _fill_f; /**< Fill function to run */
+ bool _has_padding; /**< Flag to check if the output has padding */
};
} // namespace arm_compute
#endif /* ARM_COMPUTE_NESPACETOBATCHLAYER_H */
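A minimal usage sketch of the static-shape configure() overload above (an editorial illustration, not part of this patch; shapes and names are assumptions, using the default NCHW layout and zero padding):

    #include "arm_compute/runtime/NEON/functions/NESpaceToBatchLayer.h"
    #include "arm_compute/runtime/Tensor.h"

    using namespace arm_compute;

    Tensor src, dst;
    // 4x4 spatial, 1 channel, batch 1; a 2x2 block with no padding gives 2x2 spatial, batch 4
    src.allocator()->init(TensorInfo(TensorShape(4U, 4U, 1U, 1U), 1, DataType::F32));
    dst.allocator()->init(TensorInfo(TensorShape(2U, 2U, 1U, 4U), 1, DataType::F32));

    NESpaceToBatchLayer s2b;
    s2b.configure(&src, /*block_shape_x=*/2, /*block_shape_y=*/2, Size2D(0, 0), Size2D(0, 0), &dst);

    src.allocator()->allocate();
    dst.allocator()->allocate();
    s2b.run();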
diff --git a/arm_compute/runtime/NEON/functions/NESpaceToDepthLayer.h b/arm_compute/runtime/NEON/functions/NESpaceToDepthLayer.h
index 6a7a9c83a1..1820cb8f6b 100644
--- a/arm_compute/runtime/NEON/functions/NESpaceToDepthLayer.h
+++ b/arm_compute/runtime/NEON/functions/NESpaceToDepthLayer.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019 ARM Limited.
+ * Copyright (c) 2019-2021 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -24,20 +24,18 @@
#ifndef ARM_COMPUTE_NESPACETODEPTHLAYER_H
#define ARM_COMPUTE_NESPACETODEPTHLAYER_H
+#include "arm_compute/core/Types.h"
#include "arm_compute/runtime/IFunction.h"
-#include "arm_compute/core/NEON/kernels/NEMemsetKernel.h"
-#include "arm_compute/core/NEON/kernels/NESpaceToDepthLayerKernel.h"
-#include "arm_compute/core/Types.h"
+#include <memory>
namespace arm_compute
{
class ITensor;
+class ITensorInfo;
+class NESpaceToDepthLayerKernel;
-/** This function calls the following NEON kernels/functions:
- *
- * -# @ref NESpaceToDepthLayerKernel
- */
+/** Basic function to run @ref NESpaceToDepthLayerKernel. */
class NESpaceToDepthLayer : public IFunction
{
public:
@@ -52,9 +50,18 @@ public:
/** Allow instances of this class to be moved */
NESpaceToDepthLayer &operator=(NESpaceToDepthLayer &&) = default;
/** Default destructor */
- virtual ~NESpaceToDepthLayer() = default;
+ ~NESpaceToDepthLayer();
/** Set the input and output tensors.
*
+ * Valid data layouts:
+ * - NHWC
+ * - NCHW
+ *
+ * Valid data type configurations:
+ * |src |dst |
+ * |:--------------|:--------------|
+ * |All |All |
+ *
* @param[in] input Tensor input. Supported tensor rank: 4. Data types supported: All.
* @param[out] output Tensor output. Data types supported: same as @p input
* @param[in] block_shape Block shape value
@@ -74,7 +81,7 @@ public:
void run() override;
private:
- NESpaceToDepthLayerKernel _space_to_depth_kernel; /**< SpaceToDepth kernel to run */
+ std::unique_ptr<NESpaceToDepthLayerKernel> _space_to_depth_kernel; /**< SpaceToDepth kernel to run */
};
} // namespace arm_compute
#endif /* ARM_COMPUTE_NESPACETODEPTHLAYER_H */
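Analogously, a hedged sketch for NESpaceToDepthLayer (editorial example with illustrative shapes; a block shape of 2 rearranges each 2x2 spatial tile into 4 channels):

    #include "arm_compute/runtime/NEON/functions/NESpaceToDepthLayer.h"
    #include "arm_compute/runtime/Tensor.h"

    using namespace arm_compute;

    Tensor src, dst;
    src.allocator()->init(TensorInfo(TensorShape(4U, 4U, 1U, 1U), 1, DataType::F32));
    dst.allocator()->init(TensorInfo(TensorShape(2U, 2U, 4U, 1U), 1, DataType::F32));

    NESpaceToDepthLayer s2d;
    s2d.configure(&src, &dst, /*block_shape=*/2);

    src.allocator()->allocate();
    dst.allocator()->allocate();
    s2d.run();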
diff --git a/arm_compute/runtime/NEON/functions/NESplit.h b/arm_compute/runtime/NEON/functions/NESplit.h
index 69aef793d5..36358a7094 100644
--- a/arm_compute/runtime/NEON/functions/NESplit.h
+++ b/arm_compute/runtime/NEON/functions/NESplit.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018-2020 ARM Limited.
+ * Copyright (c) 2018-2021 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -26,7 +26,6 @@
#include "arm_compute/core/ITensor.h"
#include "arm_compute/core/Types.h"
-
#include "arm_compute/runtime/CPP/functions/CPPSplit.h"
#include "arm_compute/runtime/IFunction.h"
#include "arm_compute/runtime/NEON/functions/NESlice.h"
@@ -40,6 +39,18 @@ namespace arm_compute
class NESplit : public CPPSplit<NESlice>
{
public:
+ /** NESplit
+ *
+ * Valid data layouts:
+ * - All
+ *
+ * Valid data type configurations:
+ * |src |dst |
+ * |:--------------|:--------------|
+ * |All |All |
+ *
+ */
+
// Inherited methods overridden:
void run() override;
};
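Since NESplit only layers run() over CPPSplit<NESlice>, a short usage sketch may help; this assumes the configure(input, outputs, axis) signature inherited from CPPSplit (editorial example, illustrative shapes):

    #include "arm_compute/runtime/NEON/functions/NESplit.h"
    #include "arm_compute/runtime/Tensor.h"
    #include <vector>

    using namespace arm_compute;

    Tensor src, out0, out1;
    src.allocator()->init(TensorInfo(TensorShape(4U, 4U), 1, DataType::F32));
    out0.allocator()->init(TensorInfo(TensorShape(2U, 4U), 1, DataType::F32));
    out1.allocator()->init(TensorInfo(TensorShape(2U, 4U), 1, DataType::F32));

    std::vector<ITensor *> outputs = {&out0, &out1};
    NESplit split;
    split.configure(&src, outputs, /*axis=*/0); // halve dimension 0 across the two outputs
    // allocate the tensors, then split.run();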
diff --git a/arm_compute/runtime/NEON/functions/NEStackLayer.h b/arm_compute/runtime/NEON/functions/NEStackLayer.h
index 9288035060..98dacde0c1 100644
--- a/arm_compute/runtime/NEON/functions/NEStackLayer.h
+++ b/arm_compute/runtime/NEON/functions/NEStackLayer.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018-2019 ARM Limited.
+ * Copyright (c) 2018-2021, 2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -21,20 +21,20 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
-#ifndef ARM_COMPUTE_NESTACKLAYER_H
-#define ARM_COMPUTE_NESTACKLAYER_H
+#ifndef ACL_ARM_COMPUTE_RUNTIME_NEON_FUNCTIONS_NESTACKLAYER_H
+#define ACL_ARM_COMPUTE_RUNTIME_NEON_FUNCTIONS_NESTACKLAYER_H
#include "arm_compute/core/Types.h"
#include "arm_compute/runtime/IFunction.h"
-#include "arm_compute/core/NEON/kernels/NEStackLayerKernel.h"
-
#include <memory>
#include <vector>
namespace arm_compute
{
class ITensor;
+class ITensorInfo;
+class NEStackLayerKernel;
/** Basic function to stack tensors along an axis. This function calls the following kernel:
*
@@ -46,8 +46,26 @@ class NEStackLayer : public IFunction
public:
/** Default constructor */
NEStackLayer();
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ NEStackLayer(const NEStackLayer &) = delete;
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ NEStackLayer &operator=(const NEStackLayer &) = delete;
+ /** Prevent instances of this class from being moved (As this class contains non movable objects) */
+ NEStackLayer(NEStackLayer &&) = delete;
+ /** Prevent instances of this class from being moved (As this class contains non movable objects) */
+ NEStackLayer &operator=(NEStackLayer &&) = delete;
+ /** Default destructor */
+ ~NEStackLayer();
/** Initialise the kernel's inputs vector and output.
*
+ * Valid data layouts:
+ * - All
+ *
+ * Valid data type configurations:
+ * |src |dst |
+ * |:--------------|:--------------|
+ * |All |All |
+ *
* @note Supported input tensor rank: up to 4
*
* @param[in] input The vectors containing all the tensors with the same shape to stack. Data types supported: All
@@ -73,9 +91,8 @@ public:
void run() override;
private:
- std::vector<ITensor *> _input;
- std::vector<NEStackLayerKernel> _stack_kernels;
- unsigned int _num_inputs;
+ std::unique_ptr<NEStackLayerKernel> _stack_kernel;
+ bool _is_prepared;
};
} // namespace arm_compute
-#endif /* ARM_COMPUTE_NESTACKLAYER_H */
+#endif // ACL_ARM_COMPUTE_RUNTIME_NEON_FUNCTIONS_NESTACKLAYER_H
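A sketch of stacking two rank-2 tensors into a rank-3 output along a new leading axis (editorial example; the output shape follows the usual stack rule of inserting the number of inputs at the chosen axis):

    #include "arm_compute/runtime/NEON/functions/NEStackLayer.h"
    #include "arm_compute/runtime/Tensor.h"
    #include <vector>

    using namespace arm_compute;

    Tensor a, b, dst;
    a.allocator()->init(TensorInfo(TensorShape(3U, 2U), 1, DataType::F32));
    b.allocator()->init(TensorInfo(TensorShape(3U, 2U), 1, DataType::F32));
    dst.allocator()->init(TensorInfo(TensorShape(2U, 3U, 2U), 1, DataType::F32)); // 2 inputs inserted at axis 0

    std::vector<ITensor *> inputs = {&a, &b};
    NEStackLayer stack;
    stack.configure(inputs, /*axis=*/0, &dst);
    // allocate all tensors, then stack.run();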
diff --git a/arm_compute/runtime/NEON/functions/NEStridedSlice.h b/arm_compute/runtime/NEON/functions/NEStridedSlice.h
index 6d5e6392f5..fa1113ffec 100644
--- a/arm_compute/runtime/NEON/functions/NEStridedSlice.h
+++ b/arm_compute/runtime/NEON/functions/NEStridedSlice.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018-2019 ARM Limited.
+ * Copyright (c) 2018-2021 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -24,7 +24,8 @@
#ifndef ARM_COMPUTE_NE_STRIDED_SLICE_H
#define ARM_COMPUTE_NE_STRIDED_SLICE_H
-#include "arm_compute/runtime/NEON/INESimpleFunction.h"
+#include "arm_compute/runtime/IFunction.h"
+#include "arm_compute/runtime/NEON/INEOperator.h"
namespace arm_compute
{
@@ -32,11 +33,32 @@ namespace arm_compute
class ITensor;
/** Basic function to run @ref NEStridedSliceKernel */
-class NEStridedSlice : public INESimpleFunction
+class NEStridedSlice : public IFunction
{
public:
+ /** Default Constructor */
+ NEStridedSlice();
+ /** Default Destructor */
+ ~NEStridedSlice();
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ NEStridedSlice(const NEStridedSlice &) = delete;
+ /** Default move constructor */
+ NEStridedSlice(NEStridedSlice &&);
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ NEStridedSlice &operator=(const NEStridedSlice &) = delete;
+ /** Default move assignment operator */
+ NEStridedSlice &operator=(NEStridedSlice &&);
+
/** Configure kernel
*
+ * Valid data layouts:
+ * - All
+ *
+ * Valid data type configurations:
+ * |src |dst |
+ * |:------|:------|
+ * |All |All |
+ *
* @note Supported tensor rank: up to 4
*
* @param[in] input Source tensor. Data type supported: All
@@ -49,9 +71,74 @@ public:
* @param[in] shrink_axis_mask (Optional) If the ith bit of shrink_axis_mask is set, it implies that the ith specification shrinks the dimensionality by 1.
* A slice of size 1 starting from starts[i] in the dimension must be preserved.
*/
- void configure(const ITensor *input, ITensor *output,
- const Coordinates &starts, const Coordinates &ends, const BiStrides &strides,
- int32_t begin_mask = 0, int32_t end_mask = 0, int32_t shrink_axis_mask = 0);
+ void configure(const ITensor *input,
+ ITensor *output,
+ const Coordinates &starts,
+ const Coordinates &ends,
+ const BiStrides &strides,
+ int32_t begin_mask = 0,
+ int32_t end_mask = 0,
+ int32_t shrink_axis_mask = 0);
+
+ /** Static function to check if given info will lead to a valid configuration of @ref NEStridedSlice
+ *
+ * @note Supported tensor rank: up to 4
+ *
+ * @param[in] input Source tensor info. Data type supported: All
+ * @param[in] output Destination tensor info. Data type supported: Same as @p input
+ * @param[in] starts The starts of the dimensions of the input tensor to be sliced. The length must be of rank(input).
+ * @param[in] ends The ends of the dimensions of the input tensor to be sliced. The length must be of rank(input).
+ * @param[in] strides The strides of the dimensions of the input tensor to be sliced. The length must be of rank(input).
+ * @param[in] begin_mask (Optional) If the ith bit of begin_mask is set, starts[i] is ignored and the fullest possible range in that dimension is used instead.
+ * @param[in] end_mask (Optional) If the ith bit of end_mask is set, ends[i] is ignored and the fullest possible range in that dimension is used instead.
+ * @param[in] shrink_axis_mask (Optional) If the ith bit of shrink_axis_mask is set, it implies that the ith specification shrinks the dimensionality by 1.
+ * A slice of size 1 starting from starts[i] in the dimension must be preserved.
+ */
+ static Status validate(const ITensorInfo *input,
+ const ITensorInfo *output,
+ const Coordinates &starts,
+ const Coordinates &ends,
+ const BiStrides &strides,
+ int32_t begin_mask = 0,
+ int32_t end_mask = 0,
+ int32_t shrink_axis_mask = 0);
+
+ // Inherited methods overridden:
+ void run() override;
+
+private:
+ struct Impl;
+ std::unique_ptr<Impl> _impl;
+};
+
+namespace experimental
+{
+/** Basic function to run @ref NEStridedSliceKernel */
+class NEStridedSlice : public INEOperator
+{
+public:
+ /** Configure kernel
+ *
+ * @note Supported tensor rank: up to 4
+ *
+ * @param[in] input Source tensor info. Data type supported: All
+ * @param[out] output Destination tensor info. Data type supported: Same as @p input
+ * @param[in] starts The starts of the dimensions of the input tensor to be sliced. The length must be of rank(input).
+ * @param[in] ends The ends of the dimensions of the input tensor to be sliced. The length must be of rank(input).
+ * @param[in] strides The strides of the dimensions of the input tensor to be sliced. The length must be of rank(input).
+ * @param[in] begin_mask (Optional) If the ith bit of begin_mask is set, starts[i] is ignored and the fullest possible range in that dimension is used instead.
+ * @param[in] end_mask (Optional) If the ith bit of end_mask is set, ends[i] is ignored and the fullest possible range in that dimension is used instead.
+ * @param[in] shrink_axis_mask (Optional) If the ith bit of shrink_axis_mask is set, it implies that the ith specification shrinks the dimensionality by 1.
+ * A slice of size 1 starting from starts[i] in the dimension must be preserved.
+ */
+ void configure(const ITensorInfo *input,
+ ITensorInfo *output,
+ const Coordinates &starts,
+ const Coordinates &ends,
+ const BiStrides &strides,
+ int32_t begin_mask = 0,
+ int32_t end_mask = 0,
+ int32_t shrink_axis_mask = 0);
/** Static function to check if given info will lead to a valid configuration of @ref NEStridedSlice
*
@@ -67,9 +154,15 @@ public:
* @param[in] shrink_axis_mask (Optional) If the ith bit of shrink_axis_mask is set, it implies that the ith specification shrinks the dimensionality by 1.
* A slice of size 1 starting from starts[i] in the dimension must be preserved.
*/
- static Status validate(const ITensorInfo *input, const ITensorInfo *output,
- const Coordinates &starts, const Coordinates &ends, const BiStrides &strides,
- int32_t begin_mask = 0, int32_t end_mask = 0, int32_t shrink_axis_mask = 0);
+ static Status validate(const ITensorInfo *input,
+ const ITensorInfo *output,
+ const Coordinates &starts,
+ const Coordinates &ends,
+ const BiStrides &strides,
+ int32_t begin_mask = 0,
+ int32_t end_mask = 0,
+ int32_t shrink_axis_mask = 0);
};
+} // namespace experimental
} // namespace arm_compute
#endif /* ARM_COMPUTE_NE_STRIDED_SLICE_H */
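To make the mask-free common case concrete, a hedged sketch slicing every other element of a 1-D tensor (editorial example; starts/ends/strides follow the semantics documented above):

    #include "arm_compute/runtime/NEON/functions/NEStridedSlice.h"
    #include "arm_compute/runtime/Tensor.h"

    using namespace arm_compute;

    Tensor src, dst;
    src.allocator()->init(TensorInfo(TensorShape(8U), 1, DataType::F32));
    dst.allocator()->init(TensorInfo(TensorShape(3U), 1, DataType::F32)); // picks elements 1, 3 and 5

    // Optional up-front check, mirroring the arguments passed to configure()
    Status s = NEStridedSlice::validate(src.info(), dst.info(), Coordinates(1), Coordinates(7), BiStrides(2));

    NEStridedSlice slice;
    slice.configure(&src, &dst, Coordinates(1), Coordinates(7), BiStrides(2));
    // allocate the tensors, then slice.run();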
diff --git a/arm_compute/runtime/NEON/functions/NETableLookup.h b/arm_compute/runtime/NEON/functions/NETableLookup.h
deleted file mode 100644
index b0685afd5b..0000000000
--- a/arm_compute/runtime/NEON/functions/NETableLookup.h
+++ /dev/null
@@ -1,47 +0,0 @@
-/*
- * Copyright (c) 2016-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_NETABLELOOKUP_H
-#define ARM_COMPUTE_NETABLELOOKUP_H
-
-#include "arm_compute/runtime/NEON/INESimpleFunctionNoBorder.h"
-
-namespace arm_compute
-{
-class ITensor;
-class ILut;
-
-/** Basic function to run @ref NETableLookupKernel */
-class NETableLookup : public INESimpleFunctionNoBorder
-{
-public:
- /** Initialise the kernel's inputs and output
- *
- * @param[in] input First tensor input. Data types supported: U8/S16
- * @param[in] lut Input lookup table.
- * @param[out] output Output tensor. Data types supported: same as @p input
- */
- void configure(const ITensor *input, const ILut *lut, ITensor *output);
-};
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_NETABLELOOKUP_H */
diff --git a/arm_compute/runtime/NEON/functions/NEThreshold.h b/arm_compute/runtime/NEON/functions/NEThreshold.h
deleted file mode 100644
index c955283e9e..0000000000
--- a/arm_compute/runtime/NEON/functions/NEThreshold.h
+++ /dev/null
@@ -1,54 +0,0 @@
-/*
- * Copyright (c) 2016-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_NETHRESHOLD_H
-#define ARM_COMPUTE_NETHRESHOLD_H
-
-#include "arm_compute/core/Types.h"
-#include "arm_compute/runtime/NEON/INESimpleFunctionNoBorder.h"
-
-#include <cstdint>
-
-namespace arm_compute
-{
-class ITensor;
-
-/** Basic function to run @ref NEThresholdKernel */
-class NEThreshold : public INESimpleFunctionNoBorder
-{
-public:
- /** Initialise the function's source, destination, thresholds and threshold type
- *
- * @param[in] input First tensor input. Data type supported: U8.
- * @param[out] output Output tensor. Data type supported: U8.
- * @param[in] threshold Threshold. If upper threshold is specified, this will be used as the lower threshold
- * @param[in] false_value Value to assign when the condition is false
- * @param[in] true_value value to assign when the condition is true
- * @param[in] type Thresholding type. Can either be BINARY or RANGE.
- * @param[in] upper Upper threshold. Only used with RANGE thresholding
- */
- void configure(const ITensor *input, ITensor *output, uint8_t threshold, uint8_t false_value = 0, uint8_t true_value = 0,
- ThresholdType type = ThresholdType::BINARY, uint8_t upper = 0);
-};
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_NETHRESHOLD_H */
diff --git a/arm_compute/runtime/NEON/functions/NETile.h b/arm_compute/runtime/NEON/functions/NETile.h
index 14d4f221f1..001a0a4128 100644
--- a/arm_compute/runtime/NEON/functions/NETile.h
+++ b/arm_compute/runtime/NEON/functions/NETile.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018-2019 ARM Limited.
+ * Copyright (c) 2018-2021 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -24,13 +24,13 @@
#ifndef ARM_COMPUTE_NETILE_H
#define ARM_COMPUTE_NETILE_H
-#include "arm_compute/runtime/NEON/INESimpleFunctionNoBorder.h"
-
#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/NEON/INESimpleFunctionNoBorder.h"
namespace arm_compute
{
class ITensor;
+class ITensorInfo;
/** Basic function to run @ref NETileKernel */
class NETile : public INESimpleFunctionNoBorder
@@ -38,6 +38,14 @@ class NETile : public INESimpleFunctionNoBorder
public:
/** Set the source, destination of the kernel
*
+ * Valid data layouts:
+ * - All
+ *
+ * Valid data type configurations:
+ * |src |dst |
+ * |:--------------|:--------------|
+ * |All |All |
+ *
* @param[in] input Source tensor. Data type supported: All.
* @param[out] output Destination tensor. Same as @p input
* @param[in] multiples Contains the number of times the input tensor should be replicated on the given dimension.
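A short sketch of the multiples parameter (editorial example): replicating a 2x2 tensor twice along dimension 0 yields a 4x2 output.

    #include "arm_compute/runtime/NEON/functions/NETile.h"
    #include "arm_compute/runtime/Tensor.h"

    using namespace arm_compute;

    Tensor src, dst;
    src.allocator()->init(TensorInfo(TensorShape(2U, 2U), 1, DataType::F32));
    dst.allocator()->init(TensorInfo(TensorShape(4U, 2U), 1, DataType::F32));

    NETile tile;
    tile.configure(&src, &dst, Multiples{2, 1}); // Multiples holds per-dimension repeat counts
    // allocate the tensors, then tile.run();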
diff --git a/arm_compute/runtime/NEON/functions/NETranspose.h b/arm_compute/runtime/NEON/functions/NETranspose.h
index 03c90e5b28..5d2d1f1b01 100644
--- a/arm_compute/runtime/NEON/functions/NETranspose.h
+++ b/arm_compute/runtime/NEON/functions/NETranspose.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2021 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -25,22 +25,42 @@
#define ARM_COMPUTE_NETRANSPOSE_H
#include "arm_compute/core/Types.h"
-#include "arm_compute/runtime/NEON/INESimpleFunctionNoBorder.h"
+#include "arm_compute/runtime/IFunction.h"
+
+#include <memory>
namespace arm_compute
{
+// Forward declarations
class ITensor;
+class ITensorInfo;
-/** Basic function to transpose a matrix on NEON. This function calls the following NEON kernel:
- *
- * -# @ref NETransposeKernel
- *
- */
-class NETranspose : public INESimpleFunctionNoBorder
+/** Basic function to run @ref cpu::kernels::CpuTransposeKernel */
+class NETranspose : public IFunction
{
public:
+ /** Default Constructor */
+ NETranspose();
+ /** Default Destructor */
+ ~NETranspose();
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ NETranspose(const NETranspose &) = delete;
+ /** Default move constructor */
+ NETranspose(NETranspose &&) = default;
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ NETranspose &operator=(const NETranspose &) = delete;
+ /** Default move assignment operator */
+ NETranspose &operator=(NETranspose &&) = default;
/** Initialise the kernel's inputs and output
*
+ * Valid data layouts:
+ * - All
+ *
+ * Valid data type configurations:
+ * |src |dst |
+ * |:------|:------|
+ * |All |All |
+ *
* @param[in] input Input tensor. Data types supported: All
* @param[out] output Output tensor. Data type supported: Same as @p input
*/
@@ -53,7 +73,13 @@ public:
* @return a status
*/
static Status validate(const ITensorInfo *input, const ITensorInfo *output);
+
+ // Inherited methods overridden
+ void run() override;
+
+private:
+ struct Impl;
+ std::unique_ptr<Impl> _impl;
};
} // namespace arm_compute
-
#endif /* ARM_COMPUTE_NETRANSPOSE_H */
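Although the function now routes to cpu::kernels::CpuTransposeKernel behind a pImpl, the public usage is unchanged; a minimal sketch (editorial example, illustrative shapes):

    #include "arm_compute/runtime/NEON/functions/NETranspose.h"
    #include "arm_compute/runtime/Tensor.h"

    using namespace arm_compute;

    Tensor src, dst;
    src.allocator()->init(TensorInfo(TensorShape(3U, 2U), 1, DataType::F32));
    dst.allocator()->init(TensorInfo(TensorShape(2U, 3U), 1, DataType::F32));

    Status s = NETranspose::validate(src.info(), dst.info()); // optional up-front check

    NETranspose tr;
    tr.configure(&src, &dst);
    // allocate the tensors, then tr.run();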
diff --git a/arm_compute/runtime/NEON/functions/NEUnstack.h b/arm_compute/runtime/NEON/functions/NEUnstack.h
index dbb04f08e5..e1af96d08d 100644
--- a/arm_compute/runtime/NEON/functions/NEUnstack.h
+++ b/arm_compute/runtime/NEON/functions/NEUnstack.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018-2019 ARM Limited.
+ * Copyright (c) 2018-2021 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -26,7 +26,6 @@
#include "arm_compute/core/Types.h"
#include "arm_compute/runtime/IFunction.h"
-
#include "arm_compute/runtime/NEON/functions/NEStridedSlice.h"
#include <memory>
@@ -45,10 +44,28 @@ class NEUnstack : public IFunction
public:
/** Default constructor */
NEUnstack();
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ NEUnstack(const NEUnstack &) = delete;
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ NEUnstack &operator=(const NEUnstack &) = delete;
+ /** Prevent instances of this class from being moved (As this class contains non movable objects) */
+ NEUnstack(NEUnstack &&) = delete;
+ /** Prevent instances of this class from being moved (As this class contains non movable objects) */
+ NEUnstack &operator=(NEUnstack &&) = delete;
+ /** Default destructor */
+ ~NEUnstack() = default;
/** Set the input, output and unstacking axis.
*
- * @param[in] input A tensor to be unstacked. Data type supported: U8/S8/QASYMM8/U16/S16/U32/S32/F16/F32.
- * @param[in,out] output_vector A vector of tensors. Data types supported: Same as @p input.
+ * Valid data layouts:
+ * - All
+ *
+ * Valid data type configurations:
+ * |src |dst |
+ * |:--------------|:--------------|
+ * |All |All |
+ *
+ * @param[in] input A tensor to be unstacked. Data type supported: All.
+ * @param[in,out] output_vector A vector of tensors. Data types supported: same as @p input.
* Note: The number of elements of the vector will be used as the number of slices to be taken from the axis.
* @param[in] axis The axis to unstack along. Valid values are [-R,R) where R is the input's rank. Negative values wrap around.
*
@@ -56,8 +73,8 @@ public:
void configure(const ITensor *input, const std::vector<ITensor *> &output_vector, int axis);
/** Static function to check if given info will lead to a valid configuration of @ref NEUnstack
*
- * @param[in] input Input tensor info. Data type supported: U8/S8/QASYMM8/U16/S16/U32/S32/F16/F32
- * @param[in] output_vector Vector of output tensors' info. Data types supported: Same as @p input.
+ * @param[in] input Input tensor info. Data type supported: All.
+ * @param[in] output_vector Vector of output tensors' info. Data types supported: same as @p input.
* @param[in] axis The axis to unstack along. Valid values are [-R,R) where R is the input's rank. Negative values wrap around.
*
* @return a status
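A sketch of unstacking a rank-2 tensor into its three rank-1 slices (editorial example; as noted above, the size of output_vector selects the number of slices to take):

    #include "arm_compute/runtime/NEON/functions/NEUnstack.h"
    #include "arm_compute/runtime/Tensor.h"
    #include <vector>

    using namespace arm_compute;

    Tensor src, s0, s1, s2;
    src.allocator()->init(TensorInfo(TensorShape(4U, 3U), 1, DataType::F32));
    for (Tensor *t : {&s0, &s1, &s2})
        t->allocator()->init(TensorInfo(TensorShape(4U), 1, DataType::F32));

    std::vector<ITensor *> slices = {&s0, &s1, &s2};
    NEUnstack unstack;
    unstack.configure(&src, slices, /*axis=*/1); // three slices taken along dimension 1
    // allocate the tensors, then unstack.run();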
diff --git a/arm_compute/runtime/NEON/functions/NEUpsampleLayer.h b/arm_compute/runtime/NEON/functions/NEUpsampleLayer.h
deleted file mode 100644
index ff465e54a0..0000000000
--- a/arm_compute/runtime/NEON/functions/NEUpsampleLayer.h
+++ /dev/null
@@ -1,73 +0,0 @@
-/*
- * Copyright (c) 2018-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_NEUPSAMPLELAYER_H
-#define ARM_COMPUTE_NEUPSAMPLELAYER_H
-
-#include "arm_compute/core/NEON/kernels/NEUpsampleLayerKernel.h"
-#include "arm_compute/core/Types.h"
-#include "arm_compute/runtime/IFunction.h"
-#include "arm_compute/runtime/NEON/NEScheduler.h"
-#include "arm_compute/runtime/Tensor.h"
-
-namespace arm_compute
-{
-class ITensor;
-
-/** Function to run upsample layer */
-class NEUpsampleLayer : public IFunction
-{
-public:
- /** Constructor */
- NEUpsampleLayer();
- /** Set the input output tensors.
- *
- * @param[in] input Source tensor. Data types supported: QASYMM8_SIGNED/QASYMM8/F16/F32.
- * @param[out] output Destination tensor. Data types supported: same as @p input.
- * @param[in] info Contains stride information described in @ref Size2D.
- * @param[in] policy Defines the policy to fill the intermediate pixels.
- *
- */
- void configure(const ITensor *input, ITensor *output, const Size2D &info,
- const InterpolationPolicy &policy);
- /** Static function to check if given info will lead to a valid configuration of @ref NEUpsampleLayer
- *
- * @param[in] input Source tensor info. Data types supported: QASYMM8_SIGNED/QASYMM8/F16/F32.
- * @param[out] output Destination tensor info. Data types supported: same as @p input.
- * @param[in] info Contains stride information described in @ref Size2D.
- * @param[in] policy Defines the policy to fill the intermediate pixels.
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *input, const ITensorInfo *output, const Size2D &info,
- const InterpolationPolicy &policy);
-
- // Inherited methods overridden:
- void run() override;
-
-private:
- NEUpsampleLayerKernel _kernel;
- DataLayout _data_layout;
-};
-} // arm_compute
-#endif /* ARM_COMPUTE_NEUPSAMPLELAYER_H */
diff --git a/arm_compute/runtime/NEON/functions/NEWarpAffine.h b/arm_compute/runtime/NEON/functions/NEWarpAffine.h
deleted file mode 100644
index 768ef0c6d3..0000000000
--- a/arm_compute/runtime/NEON/functions/NEWarpAffine.h
+++ /dev/null
@@ -1,53 +0,0 @@
-/*
- * Copyright (c) 2016-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_NEWARPAFFINE_H
-#define ARM_COMPUTE_NEWARPAFFINE_H
-
-#include "arm_compute/core/Types.h"
-#include "arm_compute/runtime/NEON/INESimpleFunction.h"
-
-#include <cstdint>
-
-namespace arm_compute
-{
-class ITensor;
-
-/** Basic function to run @ref NEWarpAffineKernel */
-class NEWarpAffine : public INESimpleFunction
-{
-public:
- /** Initialize the function's source, destination, interpolation policy and border_mode.
- *
- * @param[in, out] input Source tensor. Data type supported: U8. (Written to only for @p border_mode != UNDEFINED)
- * @param[out] output Destination tensor. Data type supported: U8
- * @param[in] matrix The perspective matrix. Must be 2x3 of type float.
- * The matrix argument requires 9 values, the last 3 values are ignored.
- * @param[in] policy The interpolation type.
- * @param[in] border_mode Strategy to use for borders.
- * @param[in] constant_border_value (Optional) Constant value to use for borders if border_mode is set to CONSTANT.
- */
- void configure(ITensor *input, ITensor *output, const std::array<float, 9> &matrix, InterpolationPolicy policy, BorderMode border_mode, uint8_t constant_border_value = 0);
-};
-}
-#endif /*ARM_COMPUTE_NEWARPAFFINE_H */
diff --git a/arm_compute/runtime/NEON/functions/NEWarpPerspective.h b/arm_compute/runtime/NEON/functions/NEWarpPerspective.h
deleted file mode 100644
index 66fb9acc3f..0000000000
--- a/arm_compute/runtime/NEON/functions/NEWarpPerspective.h
+++ /dev/null
@@ -1,52 +0,0 @@
-/*
- * Copyright (c) 2016-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_NEWARPPERSPECTIVE_H
-#define ARM_COMPUTE_NEWARPPERSPECTIVE_H
-
-#include "arm_compute/core/Types.h"
-#include "arm_compute/runtime/NEON/INESimpleFunction.h"
-
-#include <cstdint>
-
-namespace arm_compute
-{
-class ITensor;
-
-/** Basic function to run @ref NEWarpPerspectiveKernel */
-class NEWarpPerspective : public INESimpleFunction
-{
-public:
- /** Initialize the function's source, destination, interpolation policy and border_mode.
- *
- * @param[in, out] input Source tensor. Data type supported: U8. (Written to only for @p border_mode != UNDEFINED)
- * @param[out] output Destination tensor. Data type supported: U8
- * @param[in] matrix The perspective matrix. Must be 3x3 of type float.
- * @param[in] policy The interpolation type.
- * @param[in] border_mode Strategy to use for borders.
- * @param[in] constant_border_value (Optional) Constant value to use for borders if border_mode is set to CONSTANT.
- */
- void configure(ITensor *input, ITensor *output, const std::array<float, 9> &matrix, InterpolationPolicy policy, BorderMode border_mode, uint8_t constant_border_value = 0);
-};
-}
-#endif /*ARM_COMPUTE_NEWARPPERSPECTIVE_H */
diff --git a/arm_compute/runtime/NEON/functions/NEWinogradConvolutionLayer.h b/arm_compute/runtime/NEON/functions/NEWinogradConvolutionLayer.h
index 4a8fe61614..6caa2aeb59 100644
--- a/arm_compute/runtime/NEON/functions/NEWinogradConvolutionLayer.h
+++ b/arm_compute/runtime/NEON/functions/NEWinogradConvolutionLayer.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2017-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -21,18 +21,12 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
-#ifndef ARM_COMPUTE_NEWINOGRADCONVOLUTIONLAYER_H
-#define ARM_COMPUTE_NEWINOGRADCONVOLUTIONLAYER_H
+#ifndef ACL_ARM_COMPUTE_RUNTIME_NEON_FUNCTIONS_NEWINOGRADCONVOLUTIONLAYER_H
+#define ACL_ARM_COMPUTE_RUNTIME_NEON_FUNCTIONS_NEWINOGRADCONVOLUTIONLAYER_H
-#include "arm_compute/runtime/IFunction.h"
-
-#include "arm_compute/core/NEON/INEKernel.h"
#include "arm_compute/core/Types.h"
-#include "arm_compute/runtime/CPP/functions/CPPPermute.h"
-#include "arm_compute/runtime/MemoryGroup.h"
-#include "arm_compute/runtime/NEON/functions/NEActivationLayer.h"
-#include "arm_compute/runtime/NEON/functions/NEGEMM.h"
-
+#include "arm_compute/function_info/ActivationLayerInfo.h"
+#include "arm_compute/runtime/IFunction.h"
#include "arm_compute/runtime/Tensor.h"
#include <memory>
@@ -42,11 +36,11 @@ namespace arm_compute
// Forward declarations
class ITensor;
-/** Basic function to simulate a convolution layer. This function calls the following NEON kernels:
- * -# @ref NEWinogradLayerTransformWeightsKernel (executed only once in the first call to the run() method )
- * -# @ref NEWinogradLayerTransformInputKernel
- * -# @ref NEWinogradLayerTransformOutputKernel
- * -# @ref NEGEMMAssemblyDispatch
+/** Basic function to simulate a convolution layer. This function calls the following kernels:
+ *
+ * -# @ref cpu::CpuWinogradConv2dTransformInputKernel
+ * -# @ref cpu::CpuWinogradConv2dTransformOutputKernel
+ * -# @ref cpu::CpuGemmAssemblyDispatch
* -# @ref CPPPermute (three times: weights, input and output)
*
* @note Some Winograd configurations (i.e. F(2x2, 5x5), F(4x4, 5x5)) are supported only with enable_fast_math = true
@@ -56,14 +50,35 @@ class NEWinogradConvolutionLayer : public IFunction
public:
/** Constructor */
NEWinogradConvolutionLayer(const std::shared_ptr<IMemoryManager> &memory_manager = nullptr);
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ NEWinogradConvolutionLayer(const NEWinogradConvolutionLayer &) = delete;
+ /** Default move constructor */
+ NEWinogradConvolutionLayer(NEWinogradConvolutionLayer &&) = default;
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ NEWinogradConvolutionLayer &operator=(const NEWinogradConvolutionLayer &) = delete;
+ /** Default move assignment operator */
+ NEWinogradConvolutionLayer &operator=(NEWinogradConvolutionLayer &&) = default;
+ /** Destructor */
+ ~NEWinogradConvolutionLayer();
/** Set the input and output tensors.
*
+ * Valid data layouts:
+ * - NHWC
+ * - NCHW
+ *
+ * Valid data type configurations:
+ * |src0 |src1 |src2 |dst |
+ * |:--------------|:--------------|:------|:--------------|
+ * |F16 |F16 |F16 |F16 |
+ * |F32 |F32 |F32 |F32 |
+ *
* @param[in] input Source tensor. 3 lower dimensions represent a single input [width, height, IFM],
* while every optional dimension from 4 and above represent a batch of inputs.
* Data types supported: F16/F32.
* @param[in] weights Weights tensor. Weights are 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM]. Data type supported: Same as @p input.
- * Currently only 3x3 and 5x5 kernels are supported.
+ *                             Supported kernel sizes (height x width): 3x3, 1x3, 3x1, 5x5, 1x5 and 5x1 for F32; 3x3 for F16.
* @param[in] biases Biases tensor. Shared biases supported. Biases are 1D tensor with dimensions [OFM]. Data type supported: Same as @p weights.
* @param[out] output Destination tensor. 3 lower dimensions represent a single output [width, height, OFM], while the rest represent batch of outputs.
* Data types supported: Same as @p input.
@@ -72,62 +87,35 @@ public:
* @param[in] enable_fast_math (Optional) Enable fast math computation. In case this flag were set, the function could dispatch the fastest implementation
* available which may introduce a drop of accuracy as well. Default is false
*/
- void configure(const ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, const PadStrideInfo &conv_info, const ActivationLayerInfo &act_info = ActivationLayerInfo(),
- bool enable_fast_math = false);
+ void configure(const ITensor *input,
+ const ITensor *weights,
+ const ITensor *biases,
+ ITensor *output,
+ const PadStrideInfo &conv_info,
+ const ActivationLayerInfo &act_info = ActivationLayerInfo(),
+ bool enable_fast_math = false);
// Inherited methods overridden:
void run() override;
void prepare() override;
- /** Static function to check if given info will lead to a valid configuration of @ref NEGEMMConvolutionLayer
+ /** Static function to check if given info will lead to a valid configuration of @ref NEWinogradConvolutionLayer
*
- * @param[in] input Source tensor. 3 lower dimensions represent a single input [width, height, IFM],
- * while every optional dimension from 4 and above represent a batch of inputs.
- * Data types supported: F16/F32.
- * @param[in] weights Weights tensor. Weights are 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM]. Data type supported:Same as @p input.
- * Currently only 3x3 and 5x5 kernels are supported.
- * @param[in] biases Biases tensor. Shared biases supported. Biases are 1D tensor with dimensions [OFM]. Data type supported: Same as @p weights.
- * @param[in] output Destination tensor. 3 lower dimensions represent a single output [width, height, OFM], while the rest represent batch of outputs.
- * Data types supported: Same as @p input.
- * @param[in] conv_info Contains padding and stride information described in @ref PadStrideInfo. Currently only unit strides are supported.
- * @param[in] act_info (Optional) Activation layer information in case of a fused activation.
- * @param[in] enable_fast_math (Optional) Enable fast math computation. In case this flag were set, the function could dispatch the fastest implementation
- * available which may introduce a drop of accuracy as well. Default is false
+ * Similar to @ref NEWinogradConvolutionLayer::configure()
*
* @return a status
*/
- static Status validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info,
- const ActivationLayerInfo &act_info = ActivationLayerInfo(), bool enable_fast_math = false);
-
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- NEWinogradConvolutionLayer(const NEWinogradConvolutionLayer &) = delete;
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- NEWinogradConvolutionLayer &operator=(const NEWinogradConvolutionLayer &) = delete;
+ static Status validate(const ITensorInfo *input,
+ const ITensorInfo *weights,
+ const ITensorInfo *biases,
+ const ITensorInfo *output,
+ const PadStrideInfo &conv_info,
+ const ActivationLayerInfo &act_info = ActivationLayerInfo(),
+ bool enable_fast_math = false);
private:
- MemoryGroup _memory_group;
- NEGEMM _gemm_function;
- std::unique_ptr<INEKernel> _transform_input_kernel;
- std::unique_ptr<INEKernel> _transform_output_kernel;
- std::unique_ptr<INEKernel> _transform_weights_kernel;
- NEActivationLayer _activationlayer_function;
-
- CPPPermute _permute_input;
- CPPPermute _permute_weights;
- CPPPermute _permute_output;
- Tensor _input_transformed;
- Tensor _output_transformed;
- Tensor _input_workspace;
- Tensor _output_workspace;
- Tensor _kernel_storage;
- Tensor _input_nhwc;
- Tensor _output_nhwc;
- Tensor _weights_hwio;
- const ITensor *_input;
- const ITensor *_weights;
- ITensor *_output;
- bool _is_prepared;
- bool _is_activationlayer_enabled;
+ struct Impl;
+ std::unique_ptr<Impl> _impl;
};
} // namespace arm_compute
-#endif /* ARM_COMPUTE_NEWINOGRADCONVOLUTIONLAYER_H */
+#endif // ACL_ARM_COMPUTE_RUNTIME_NEON_FUNCTIONS_NEWINOGRADCONVOLUTIONLAYER_H
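A hedged configuration sketch matching the F32 row of the table above (editorial example; unit strides as required, illustrative NCHW shapes with "same" padding):

    #include "arm_compute/runtime/NEON/functions/NEWinogradConvolutionLayer.h"
    #include "arm_compute/runtime/Tensor.h"

    using namespace arm_compute;

    Tensor src, weights, biases, dst;
    src.allocator()->init(TensorInfo(TensorShape(8U, 8U, 2U, 1U), 1, DataType::F32));     // [W, H, IFM, N]
    weights.allocator()->init(TensorInfo(TensorShape(3U, 3U, 2U, 4U), 1, DataType::F32)); // [kx, ky, IFM, OFM]
    biases.allocator()->init(TensorInfo(TensorShape(4U), 1, DataType::F32));              // [OFM]
    dst.allocator()->init(TensorInfo(TensorShape(8U, 8U, 4U, 1U), 1, DataType::F32));     // [W, H, OFM, N]

    NEWinogradConvolutionLayer conv; // default memory manager
    conv.configure(&src, &weights, &biases, &dst, PadStrideInfo(1, 1, 1, 1)); // stride 1, pad 1
    // allocate all tensors, then conv.run(); weight transforms happen on first run via prepare()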
diff --git a/arm_compute/runtime/NEON/functions/NEYOLOLayer.h b/arm_compute/runtime/NEON/functions/NEYOLOLayer.h
deleted file mode 100644
index 5e0c34b9b1..0000000000
--- a/arm_compute/runtime/NEON/functions/NEYOLOLayer.h
+++ /dev/null
@@ -1,64 +0,0 @@
-/*
- * Copyright (c) 2018-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_NEYOLOLAYER_H
-#define ARM_COMPUTE_NEYOLOLAYER_H
-
-#include "arm_compute/runtime/NEON/INESimpleFunctionNoBorder.h"
-
-#include "arm_compute/core/NEON/kernels/NEYOLOLayerKernel.h"
-#include "arm_compute/core/Types.h"
-
-namespace arm_compute
-{
-class ITensor;
-
-/** Basic function to run @ref NEYOLOLayerKernel */
-class NEYOLOLayer : public INESimpleFunctionNoBorder
-{
-public:
- /** Set the input and output tensor.
- *
- * @note If the output tensor is a nullptr or is equal to the input, the activation function will be performed in-place
- *
- * @param[in, out] input Source tensor. In case of @p output tensor = nullptr, this tensor will store the result
- * of the activation function. Data types supported: F16/F32.
- * @param[out] output Destination tensor. Data type supported: same as @p input
- * @param[in] act_info Activation layer parameters.
- * @param[in] num_classes Number of classes to activate (must be submultiple of @p input channels)
- */
- void configure(ITensor *input, ITensor *output, const ActivationLayerInfo &act_info, int32_t num_classes);
- /** Static function to check if given info will lead to a valid configuration of @ref NEYOLOLayer
- *
- * @param[in] input Source tensor info. In case of @p output tensor info = nullptr, this tensor will store the result
- * of the activation function. Data types supported: F16/F32.
- * @param[in] output Destination tensor info. Data type supported: same as @p input
- * @param[in] act_info Activation layer information.
- * @param[in] num_classes Number of classes to activate (must be submultiple of @p input channels)
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *input, const ITensorInfo *output, const ActivationLayerInfo &act_info, int32_t num_classes);
-};
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_NEYOLOLAYER_H */
diff --git a/arm_compute/runtime/NEON/functions/assembly/NEDepthwiseConvolutionAssemblyDispatch.h b/arm_compute/runtime/NEON/functions/assembly/NEDepthwiseConvolutionAssemblyDispatch.h
deleted file mode 100644
index f16bb46d35..0000000000
--- a/arm_compute/runtime/NEON/functions/assembly/NEDepthwiseConvolutionAssemblyDispatch.h
+++ /dev/null
@@ -1,123 +0,0 @@
-/*
- * Copyright (c) 2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_NEDEPTHWISECONVOLUTIONASSEMBLYDISPATCH_H
-#define ARM_COMPUTE_NEDEPTHWISECONVOLUTIONASSEMBLYDISPATCH_H
-
-#include "arm_compute/runtime/IFunction.h"
-
-#include "arm_compute/runtime/IMemoryManager.h"
-#include "arm_compute/runtime/MemoryGroup.h"
-#include "arm_compute/runtime/Tensor.h"
-
-namespace arm_compute
-{
-/** Depthwise convolution assembly kernel glue */
-class NEDepthwiseConvolutionAssemblyDispatch : public IFunction
-{
-public:
- /** Default constructor
- *
- * @param[in,out] memory_manager Memory manager to use
- */
- NEDepthwiseConvolutionAssemblyDispatch(std::shared_ptr<IMemoryManager> memory_manager = nullptr);
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- NEDepthwiseConvolutionAssemblyDispatch(const NEDepthwiseConvolutionAssemblyDispatch &) = delete;
- /** Default move constructor */
- NEDepthwiseConvolutionAssemblyDispatch(NEDepthwiseConvolutionAssemblyDispatch &&) = default;
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- NEDepthwiseConvolutionAssemblyDispatch &operator=(const NEDepthwiseConvolutionAssemblyDispatch &) = delete;
- /** Default move assignment operator */
- NEDepthwiseConvolutionAssemblyDispatch &operator=(NEDepthwiseConvolutionAssemblyDispatch &&) = default;
- /** Default destructor */
- ~NEDepthwiseConvolutionAssemblyDispatch();
- /** Initialize the function's source, destination, kernels and border_size.
- *
- * @note Supports only NHWC format
- *
- * @param[in] input Source tensor. Data type supported: QASYMM8/F16/F32. (Written to only for border filling).
- * @param[in] weights Weights tensor. These are 3D tensors with shape [W, H, IFM]. Data type supported: Same as @p input.
- * @param[in] bias (Optional) Biases tensor. A 1D tensor with shape [IFM]. Must be nullptr if not needed.
- * Data type supported: Same as @p input.
- * @param[out] output Destination tensor. Data type supported: same as @p input.
- * @param[in] conv_info Padding and stride information to use for the convolution.
- * @param[in] depth_multiplier (Optional) Multiplier to apply to the input's depth in order to retrieve the output's depth. Defaults to 1.
- * @param[in] act_info (Optional) Activation layer information in case of a fused activation.
- * @param[in] dilation (Optional) Dilation, in elements, across x and y. Defaults to (1, 1).
- */
- void configure(const ITensor *input, const ITensor *weights, const ITensor *bias, ITensor *output,
- const PadStrideInfo &conv_info, unsigned int depth_multiplier = 1, const ActivationLayerInfo &act_info = ActivationLayerInfo(),
- const Size2D &dilation = Size2D(1, 1));
- /** Static function to check if given info will lead to a valid configuration of @ref NEDepthwiseConvolutionAssemblyDispatch
- *
- * @note Supports only NHWC format
- *
- * @param[in] input Source tensor. Data type supported: QASYMM8/F16/F32. (Written to only for border filling).
- * @param[in] weights Weights tensor. These are 3D tensors with shape [W, H, IFM]. Data type supported: Same as @p input.
- * @param[in] bias (Optional) Biases tensor. A 1D tensor with shape [IFM]. Must be nullptr if not needed.
- * Data type supported: Same as @p input.
- * @param[out] output Destination tensor. Data type supported: same as @p input.
- * @param[in] conv_info Padding and stride information to use for the convolution.
- * @param[in] depth_multiplier (Optional) Multiplier to apply to the input's depth in order to retrieve the output's depth. Defaults to 1.
- * @param[in] act_info (Optional) Activation layer information in case of a fused activation.
- * @param[in] dilation (Optional) Dilation, in elements, across x and y. Defaults to (1, 1).
- *
- * @return An error status
- */
- static Status validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *bias, const ITensorInfo *output,
- const PadStrideInfo &conv_info, unsigned int depth_multiplier = 1, const ActivationLayerInfo &act_info = ActivationLayerInfo(),
- const Size2D &dilation = Size2D(1, 1));
- /** Check if the optimized kernel can be used for the given kernel sizes and strides
- *
- * @warning Even if this return true the inputs and outputs might need to get permuted as the only layout supported is NHWC
- *
- * @param[in] input Input tensor info.
- * @param[in] weights Weights tensor info.
- * @param[in] conv_info Convolution layer metadata.
- * @param[in] depth_multiplier (Optional) Depth multiplier to be used.
- * @param[in] dilation (Optional) Dilation, in elements, across x and y. Defaults to (1, 1).
- *
- * @return True if the assembly kernel could be used else false. Note that transformations of input/output could be needed.
- */
- static bool is_optimized_supported(const ITensorInfo *input, const ITensorInfo *weights, PadStrideInfo conv_info, unsigned int depth_multiplier = 1, const Size2D &dilation = Size2D(1, 1));
-
- // Inherited methods overridden:
- void run() override;
- void prepare() override;
-
-private:
- struct LocalImpl;
-
-private:
- MemoryGroup _memory_group;
- const ITensor *_input;
- const ITensor *_weights;
- const ITensor *_bias;
- ITensor *_output;
- Tensor _packed_weights;
- Tensor _workspace;
- bool _is_prepared;
- std::unique_ptr<LocalImpl> _pImpl;
-};
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_NEDEPTHWISECONVOLUTIONASSEMBLYDISPATCH_H */
diff --git a/arm_compute/runtime/OMP/OMPScheduler.h b/arm_compute/runtime/OMP/OMPScheduler.h
index ed00833a9c..c718e74359 100644
--- a/arm_compute/runtime/OMP/OMPScheduler.h
+++ b/arm_compute/runtime/OMP/OMPScheduler.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2021, 2024 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -21,8 +21,8 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
-#ifndef ARM_COMPUTE_OMPSCHEDULER_H
-#define ARM_COMPUTE_OMPSCHEDULER_H
+#ifndef ACL_ARM_COMPUTE_RUNTIME_OMP_OMPSCHEDULER_H
+#define ACL_ARM_COMPUTE_RUNTIME_OMP_OMPSCHEDULER_H
#include "arm_compute/runtime/IScheduler.h"
@@ -55,6 +55,19 @@ public:
*/
void schedule(ICPPKernel *kernel, const Hints &hints) override;
+ /** Multithread the execution of the passed kernel if possible.
+ *
+ * The kernel will run on a single thread if any of these conditions is true:
+ * - ICPPKernel::is_parallelisable() returns false
+ * - The scheduler has been initialized with only one thread.
+ *
+ * @param[in] kernel Kernel to execute.
+ * @param[in] hints Hints for the scheduler.
+ * @param[in] window Window to use for kernel execution.
+ * @param[in] tensors Vector containing the tensors to operate on.
+ */
+ void schedule_op(ICPPKernel *kernel, const Hints &hints, const Window &window, ITensorPack &tensors) override;
+
protected:
/** Execute all the passed workloads
*
@@ -66,6 +79,8 @@ protected:
private:
unsigned int _num_threads;
+ bool _has_lmb;
+ unsigned int _nonlittle_num_cpus;
};
} // namespace arm_compute
-#endif /* ARM_COMPUTE_OMPSCHEDULER_H */
+#endif // ACL_ARM_COMPUTE_RUNTIME_OMP_OMPSCHEDULER_H
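For orientation, a hedged sketch of driving the new schedule_op() entry point through the scheduler singleton; kernel, win, src and dst are hypothetical names for a configured ICPPKernel, its execution window and its tensors:

    #include "arm_compute/core/experimental/Types.h"
    #include "arm_compute/core/ITensorPack.h"
    #include "arm_compute/runtime/Scheduler.h"

    using namespace arm_compute;

    // Assumes kernel (ICPPKernel), win (Window), src/dst (ITensor) were configured elsewhere
    ITensorPack pack;
    pack.add_tensor(TensorType::ACL_SRC, &src);
    pack.add_tensor(TensorType::ACL_DST, &dst);
    Scheduler::get().schedule_op(&kernel, IScheduler::Hints(Window::DimY), win, pack);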
diff --git a/arm_compute/runtime/OffsetLifetimeManager.h b/arm_compute/runtime/OffsetLifetimeManager.h
index 26ade0129b..13ebb9fbe3 100644
--- a/arm_compute/runtime/OffsetLifetimeManager.h
+++ b/arm_compute/runtime/OffsetLifetimeManager.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2019 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -25,7 +25,6 @@
#define ARM_COMPUTE_OFFSETLIFETIMEMANAGER_H
#include "arm_compute/runtime/ISimpleLifetimeManager.h"
-
#include "arm_compute/runtime/Types.h"
#include <map>
@@ -62,7 +61,7 @@ public:
// Inherited methods overridden:
std::unique_ptr<IMemoryPool> create_pool(IAllocator *allocator) override;
- MappingType mapping_type() const override;
+ MappingType mapping_type() const override;
private:
// Inherited methods overridden:
diff --git a/arm_compute/runtime/OffsetMemoryPool.h b/arm_compute/runtime/OffsetMemoryPool.h
index dff3f23af3..7250194f85 100644
--- a/arm_compute/runtime/OffsetMemoryPool.h
+++ b/arm_compute/runtime/OffsetMemoryPool.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2019 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -25,7 +25,6 @@
#define ARM_COMPUTE_OFFSETMEMORYPOOL_H
#include "arm_compute/runtime/IMemoryPool.h"
-
#include "arm_compute/runtime/IMemoryRegion.h"
#include "arm_compute/runtime/Types.h"
@@ -65,8 +64,8 @@ public:
const BlobInfo &info() const;
// Inherited methods overridden:
- void acquire(MemoryMappings &handles) override;
- void release(MemoryMappings &handles) override;
+ void acquire(MemoryMappings &handles) override;
+ void release(MemoryMappings &handles) override;
MappingType mapping_type() const override;
std::unique_ptr<IMemoryPool> duplicate() override;
diff --git a/arm_compute/runtime/OperatorList.h b/arm_compute/runtime/OperatorList.h
new file mode 100644
index 0000000000..8bcdf5d3bf
--- /dev/null
+++ b/arm_compute/runtime/OperatorList.h
@@ -0,0 +1,1052 @@
+/*
+ * Copyright (c) 2021-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ACL_ARM_COMPUTE_RUNTIME_OPERATORLIST_H
+#define ACL_ARM_COMPUTE_RUNTIME_OPERATORLIST_H
+
+/** ActivationLayer
+ *
+ * Description:
+ * Function to simulate an activation layer with the specified activation function.
+ *
+ * Equivalent Android NNAPI Op:
+ * ANEURALNETWORKS_ELU
+ * ANEURALNETWORKS_HARD_SWISH
+ * ANEURALNETWORKS_LOGISTIC
+ * ANEURALNETWORKS_RELU
+ * ANEURALNETWORKS_RELU1
+ * ANEURALNETWORKS_RELU6
+ * ANEURALNETWORKS_TANH
+ *
+ */
+
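To ground the entry above, a hedged NEON usage sketch (editorial example, not part of the generated operator list; shapes are illustrative):

    #include "arm_compute/runtime/NEON/functions/NEActivationLayer.h"
    #include "arm_compute/runtime/Tensor.h"

    using namespace arm_compute;

    Tensor src, dst;
    src.allocator()->init(TensorInfo(TensorShape(16U), 1, DataType::F32));
    dst.allocator()->init(TensorInfo(TensorShape(16U), 1, DataType::F32));

    NEActivationLayer act;
    act.configure(&src, &dst, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU));

    src.allocator()->allocate();
    dst.allocator()->allocate();
    act.run();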
+/** AddMulAdd
+ *
+ * Description:
+ * Performs a fused Add + Mul + Add [+ Relu-based-Activation] operation.
+ *
+ * Equivalent Android NNAPI Op:
+ * n/a
+ *
+ */
+
+/** ArgMinMaxLayer
+ *
+ * Description:
+ * Function to calculate the index of the minimum or maximum values in a tensor based on an axis.
+ *
+ * Equivalent Android NNAPI Op:
+ * ANEURALNETWORKS_ARGMAX
+ * ANEURALNETWORKS_ARGMIN
+ *
+ */
+
+/** ArithmeticAddition
+ *
+ * Description:
+ * Function to add 2 tensors.
+ *
+ * Equivalent Android NNAPI Op:
+ * ANEURALNETWORKS_ADD
+ *
+ */
+
+/** ArithmeticSubtraction
+ *
+ * Description:
+ * Function to subtract 2 tensors.
+ *
+ * Equivalent Android NNAPI Op:
+ * ANEURALNETWORKS_SUB
+ *
+ */
+
+/** BatchNormalizationLayer
+ *
+ * Description:
+ * Function to perform batch normalization.
+ *
+ * Equivalent Android NNAPI Op:
+ * n/a
+ *
+ */
+
+/** BatchToSpaceLayer
+ *
+ * Description:
+ * Batch to space transformation.
+ *
+ * Equivalent Android NNAPI Op:
+ * ANEURALNETWORKS_BATCH_TO_SPACE_ND
+ *
+ */
+
+/** BitwiseAnd
+ *
+ * Description:
+ * Function to perform bitwise AND between 2 tensors.
+ *
+ * Equivalent Android NNAPI Op:
+ * ANEURALNETWORKS_LOGICAL_AND
+ *
+ */
+
+/** BitwiseNot
+ *
+ * Description:
+ * Function to perform bitwise NOT.
+ *
+ * Equivalent Android NNAPI Op:
+ * ANEURALNETWORKS_LOGICAL_NOT
+ *
+ */
+
+/** BitwiseOr
+ *
+ * Description:
+ * Function to perform bitwise OR between 2 tensors.
+ *
+ * Equivalent Android NNAPI Op:
+ * ANEURALNETWORKS_LOGICAL_OR
+ *
+ */
+
+/** BitwiseXor
+ *
+ * Description:
+ * Function to perform bitwise XOR between 2 tensors.
+ *
+ * Equivalent Android NNAPI Op:
+ * n/a
+ *
+ */
+
+/** BoundingBoxTransform
+ *
+ * Description:
+ * Transform proposal bounding boxes to target bounding boxes using bounding box deltas.
+ *
+ * Equivalent Android NNAPI Op:
+ * n/a
+ *
+ */
+
+/** Cast
+ *
+ * Description:
+ * Function to cast a tensor.
+ *
+ * Equivalent Android NNAPI Op:
+ * ANEURALNETWORKS_CAST
+ *
+ */
+
+/** ChannelShuffleLayer
+ *
+ * Description:
+ * Function to shuffle the channels of the input tensor.
+ *
+ * Equivalent Android NNAPI Op:
+ * ANEURALNETWORKS_CHANNEL_SHUFFLE
+ *
+ */
+
+/** Comparison
+ *
+ * Description:
+ * Function to compare 2 tensors.
+ *
+ * Equivalent Android NNAPI Op:
+ * ANEURALNETWORKS_EQUAL
+ * ANEURALNETWORKS_GREATER
+ * ANEURALNETWORKS_GREATER_EQUAL
+ * ANEURALNETWORKS_LESS
+ * ANEURALNETWORKS_LESS_EQUAL
+ * ANEURALNETWORKS_NOT_EQUAL
+ *
+ */
+
+/** ConcatenateLayer
+ *
+ * Description:
+ * Function to concatenate tensors along a given axis.
+ *
+ * Equivalent Android NNAPI Op:
+ * ANEURALNETWORKS_CONCATENATION
+ *
+ */
+
+/** ConvertFullyConnectedWeights
+ *
+ * Description:
+ * Function to transpose the weights for the fully connected layer.
+ *
+ * Equivalent Android NNAPI Op:
+ * n/a
+ *
+ */
+
+/** ConvolutionLayer
+ *
+ * Description:
+ * Function to compute a convolution layer.
+ *
+ * Equivalent Android NNAPI Op:
+ * ANEURALNETWORKS_CONV_2D
+ *
+ */
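+
+/* A minimal usage sketch, assuming the NEON runtime function NEConvolutionLayer
+ * with already initialised and allocated tensors; the stride and padding values
+ * below are arbitrary examples:
+ *
+ *   #include "arm_compute/runtime/NEON/functions/NEConvolutionLayer.h"
+ *
+ *   arm_compute::NEConvolutionLayer conv;
+ *   // stride 1 in x/y, 1-pixel padding on each side (e.g. for 3x3 weights)
+ *   conv.configure(&src, &weights, &biases, &dst, arm_compute::PadStrideInfo(1, 1, 1, 1));
+ *   conv.run();
+ */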
+
+/** Conv3D
+ *
+ * Description:
+ * Function to compute a 3D convolution layer.
+ *
+ * Equivalent Android NNAPI Op:
+ * ANEURALNETWORKS_CONV_3D
+ *
+ */
+
+/** Copy
+ *
+ * Description:
+ * Function to copy a tensor.
+ *
+ * Equivalent Android NNAPI Op:
+ * n/a
+ *
+ */
+
+/** Crop
+ *
+ * Description:
+ * Performs a copy of the input tensor to the output tensor.
+ *
+ * Equivalent Android NNAPI Op:
+ * n/a
+ *
+ */
+
+/** CropResize
+ *
+ * Description:
+ * Function to perform cropping and resizing.
+ *
+ * Equivalent Android NNAPI Op:
+ * n/a
+ *
+ */
+
+/** DeconvolutionLayer
+ *
+ * Description:
+ * Function to compute a deconvolution or transpose convolution.
+ *
+ * Equivalent Android NNAPI Op:
+ * ANEURALNETWORKS_TRANSPOSE_CONV_2D
+ *
+ */
+
+/** DeconvolutionLayerUpsample
+ *
+ * Description:
+ * Function to execute deconvolution upsample on OpenCL.
+ *
+ * Equivalent Android NNAPI Op:
+ * ANEURALNETWORKS_TRANSPOSE_CONV_2D
+ *
+ */
+
+/** DepthConvertLayer
+ *
+ * Description:
+ * Performs a down-scaling depth conversion.
+ *
+ * Equivalent Android NNAPI Op:
+ * n/a
+ *
+ */
+
+/** DepthToSpaceLayer
+ *
+ * Description:
+ * Depth to Space transformation.
+ *
+ * Equivalent Android NNAPI Op:
+ * ANEURALNETWORKS_DEPTH_TO_SPACE
+ *
+ */
+
+/** DepthwiseConvolutionLayer
+ *
+ * Description:
+ * Function to perform depthwise separable convolution.
+ *
+ * Equivalent Android NNAPI Op:
+ * ANEURALNETWORKS_DEPTHWISE_CONV_2D
+ *
+ */
+
+/** DequantizationLayer
+ *
+ * Description:
+ * Function to dequantize the values in a tensor.
+ *
+ * Equivalent Android NNAPI Op:
+ * ANEURALNETWORKS_DEQUANTIZE
+ *
+ */
+
+/** DetectionPostProcessLayer
+ *
+ * Description:
+ * Function to generate the detection output based on center-size encoded boxes, class predictions and anchors by performing non-maximum suppression (NMS).
+ *
+ * Equivalent Android NNAPI Op:
+ * ANEURALNETWORKS_DETECTION_POSTPROCESSING
+ *
+ */
+
+/** DirectConvolutionLayer
+ *
+ * Description:
+ * Function to compute direct convolution.
+ *
+ * Equivalent Android NNAPI Op:
+ * ANEURALNETWORKS_CONV_2D
+ *
+ */
+
+/** DirectDeconvolutionLayer
+ *
+ * Description:
+ * Function to run the deconvolution layer.
+ *
+ * Equivalent Android NNAPI Op:
+ * ANEURALNETWORKS_TRANSPOSE_CONV_2D
+ *
+ */
+
+/** ElementwiseOperations
+ *
+ * Description:
+ * Function to perform the following operations on the CPU:
+ * - Div
+ * - Max
+ * - Min
+ * - Pow
+ * - SquaredDiff
+ * - Comparisons (Equal, greater, greater_equal, less, less_equal, not_equal)
+ * Function to perform the following operations in CL:
+ * - Add
+ * - Sub
+ * - Div
+ * - Max
+ * - Min
+ * - Pow
+ * - SquaredDiff
+ *
+ * Equivalent Android NNAPI Op:
+ * ANEURALNETWORKS_MAXIMUM
+ * ANEURALNETWORKS_MINIMUM
+ * ANEURALNETWORKS_POW
+ * ANEURALNETWORKS_DIV
+ * ANEURALNETWORKS_ADD
+ * ANEURALNETWORKS_SUB
+ * ANEURALNETWORKS_EQUAL
+ * ANEURALNETWORKS_GREATER
+ * ANEURALNETWORKS_GREATER_EQUAL
+ * ANEURALNETWORKS_LESS
+ * ANEURALNETWORKS_LESS_EQUAL
+ * ANEURALNETWORKS_NOT_EQUAL
+ *
+ */
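+
+/* A minimal usage sketch, assuming the NEON runtime function NEElementwiseMax
+ * and same-shape tensors a, b and out:
+ *
+ *   #include "arm_compute/runtime/NEON/functions/NEElementwiseOperations.h"
+ *
+ *   arm_compute::NEElementwiseMax emax;
+ *   emax.configure(&a, &b, &out); // out[i] = max(a[i], b[i])
+ *   emax.run();
+ */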
+
+/** ElementwiseUnaryLayer
+ *
+ * Description:
+ * Function to perform:
+ * - Rsqrt
+ * - Exp
+ * - Neg
+ * - Log
+ * - Abs
+ * - Round
+ * - Sin
+ *
+ * Equivalent Android NNAPI Op:
+ * ANEURALNETWORKS_ABS
+ * ANEURALNETWORKS_EXP
+ * ANEURALNETWORKS_LOG
+ * ANEURALNETWORKS_NEG
+ * ANEURALNETWORKS_RSQRT
+ * ANEURALNETWORKS_SIN
+ *
+ */
+
+/** FFT1D
+ *
+ * Description:
+ * Fast Fourier Transform 1D.
+ *
+ * Equivalent Android NNAPI Op:
+ * n/a
+ *
+ */
+
+/** FFT2D
+ *
+ * Description:
+ * Fast Fourier Transform 2D.
+ *
+ * Equivalent Android NNAPI Op:
+ * n/a
+ *
+ */
+
+/** FFTConvolutionLayer
+ *
+ * Description:
+ * Fast Fourier Transform Convolution.
+ *
+ * Equivalent Android NNAPI Op:
+ * ANEURALNETWORKS_CONV_2D
+ *
+ */
+
+/** Fill
+ *
+ * Description:
+ * Set all the values of a tensor to a given value.
+ *
+ * Equivalent Android NNAPI Op:
+ * ANEURALNETWORKS_FILL
+ *
+ */
+
+/** FillBorder
+ *
+ * Description:
+ * Function to fill the borders within the XY-planes.
+ *
+ * Equivalent Android NNAPI Op:
+ * n/a
+ *
+ */
+
+/** FlattenLayer
+ *
+ * Description:
+ * Reshape a tensor to be 1D
+ *
+ * Equivalent Android NNAPI Op:
+ * ANEURALNETWORKS_RESHAPE
+ *
+ */
+
+/** Floor
+ *
+ * Description:
+ * Round each value down to the nearest integer.
+ *
+ * Equivalent Android NNAPI Op:
+ * ANEURALNETWORKS_FLOOR
+ *
+ */
+
+/** FullyConnectedLayer
+ *
+ * Description:
+ * Function to perform a fully connected / dense layer.
+ *
+ * Equivalent Android NNAPI Op:
+ * ANEURALNETWORKS_FULLY_CONNECTED
+ *
+ */
+
+/** FuseBatchNormalization
+ *
+ * Description:
+ * Function to fuse the batch normalization node to a preceding convolution node.
+ *
+ * Equivalent Android NNAPI Op:
+ * n/a
+ *
+ */
+
+/** Gather
+ *
+ * Description:
+ * Performs the Gather operation along the chosen axis.
+ *
+ * Equivalent Android NNAPI Op:
+ * ANEURALNETWORKS_GATHER
+ *
+ */
+
+/** GEMM
+ *
+ * Description:
+ * General Matrix Multiplication.
+ *
+ * Equivalent Android NNAPI Op:
+ * n/a
+ *
+ */
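+
+/* For reference, the GEMM functions compute D = alpha * A * B + beta * C.
+ * A minimal usage sketch, assuming the NEON runtime function NEGEMM and
+ * pre-allocated tensors a, b, c and d:
+ *
+ *   #include "arm_compute/runtime/NEON/functions/NEGEMM.h"
+ *
+ *   arm_compute::NEGEMM gemm;
+ *   gemm.configure(&a, &b, &c, &d, 1.0f, 1.0f); // alpha = beta = 1
+ *   gemm.run();
+ */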
+
+/** GEMMConv2d
+ *
+ * Description:
+ * General Matrix Multiplication.
+ *
+ * Equivalent Android NNAPI Op:
+ * ANEURALNETWORKS_CONV_2D
+ *
+ */
+
+/** GEMMConvolutionLayer
+ *
+ * Description:
+ * General Matrix Multiplication.
+ *
+ * Equivalent Android NNAPI Op:
+ * ANEURALNETWORKS_CONV_2D
+ *
+ */
+
+/** GEMMDeconvolutionLayer
+ *
+ * Description:
+ * General Matrix Multiplication.
+ *
+ * Equivalent Android NNAPI Op:
+ * ANEURALNETWORKS_TRANSPOSE_CONV_2D
+ *
+ */
+
+/** GEMMLowpMatrixMultiplyCore
+ *
+ * Description:
+ * General Matrix Multiplication.
+ *
+ * Equivalent Android NNAPI Op:
+ * n/a
+ *
+ */
+
+/** GEMMLowpOutputStage
+ *
+ * Description:
+ * General Matrix Multiplication.
+ *
+ * Equivalent Android NNAPI Op:
+ * n/a
+ *
+ */
+
+/** GenerateProposalsLayer
+ *
+ * Description:
+ * Function to generate proposals for an RPN (Region Proposal Network).
+ *
+ * Equivalent Android NNAPI Op:
+ * ANEURALNETWORKS_GENERATE_PROPOSALS
+ *
+ */
+
+/** InstanceNormalizationLayer
+ *
+ * Description:
+ * Function to perform an instance normalization on a given axis.
+ *
+ * Equivalent Android NNAPI Op:
+ * ANEURALNETWORKS_INSTANCE_NORMALIZATION
+ *
+ */
+
+/** L2NormalizeLayer
+ *
+ * Description:
+ * Function to perform an L2 normalization on a given axis.
+ *
+ * Equivalent Android NNAPI Op:
+ * ANEURALNETWORKS_L2_NORMALIZATION
+ *
+ */
+
+/** Logical
+ *
+ * Description:
+ * Function to perform:
+ * - Logical AND
+ * - Logical OR
+ * - Logical NOT
+ *
+ * Equivalent Android NNAPI Op:
+ * n/a
+ *
+ */
+
+/** LogicalAnd
+ *
+ * Description:
+ * Function to perform Logical AND.
+ *
+ * Equivalent Android NNAPI Op:
+ * n/a
+ *
+ */
+
+/** LogicalOr
+ *
+ * Description:
+ * Function to perform Logical OR.
+ *
+ * Equivalent Android NNAPI Op:
+ * n/a
+ *
+ */
+
+/** LogicalNot
+ *
+ * Description:
+ * Function to perform Logical NOT.
+ *
+ * Equivalent Android NNAPI Op:
+ * n/a
+ *
+ */
+
+/** LSTMLayer
+ *
+ * Description:
+ * Function to perform a single time step in a Long Short-Term Memory (LSTM) layer.
+ *
+ * Equivalent Android NNAPI Op:
+ * ANEURALNETWORKS_LSTM
+ *
+ */
+
+/** LSTMLayerQuantized
+ *
+ * Description:
+ * Function to perform quantized LSTM (Long Short-Term Memory).
+ *
+ * Equivalent Android NNAPI Op:
+ * ANEURALNETWORKS_QUANTIZED_LSTM
+ * ANEURALNETWORKS_QUANTIZED_16BIT_LSTM
+ *
+ */
+
+/** MatMul
+ *
+ * Description:
+ * Computes a matrix multiplication in batches.
+ *
+ * Equivalent Android NNAPI Op:
+ * ANEURALNETWORKS_BATCH_MATMUL
+ *
+ */
+
+/** MaxUnpoolingLayer
+ *
+ * Description:
+ * Function to perform MaxUnpooling.
+ *
+ * Equivalent Android NNAPI Op:
+ * n/a
+ *
+ */
+
+/** MeanStdDevNormalizationLayer
+ *
+ * Description:
+ * Function to execute mean and standard deviation normalization.
+ *
+ * Equivalent Android NNAPI Op:
+ * n/a
+ *
+ */
+
+/** NormalizationLayer
+ *
+ * Description:
+ * Function to compute a normalization layer.
+ *
+ * Equivalent Android NNAPI Op:
+ * ANEURALNETWORKS_LOCAL_RESPONSE_NORMALIZATION
+ *
+ */
+
+/** NormalizePlanarYUVLayer
+ *
+ * Description:
+ * Function to compute a planar YUV normalization layer.
+ *
+ * Equivalent Android NNAPI Op:
+ * n/a
+ *
+ */
+
+/** PadLayer
+ *
+ * Description:
+ * Function to pad a tensor.
+ *
+ * Equivalent Android NNAPI Op:
+ * ANEURALNETWORKS_PAD
+ * ANEURALNETWORKS_PAD_V2
+ *
+ */
+
+/** Permute
+ *
+ * Description:
+ * Function to transpose an ND tensor.
+ *
+ * Equivalent Android NNAPI Op:
+ * ANEURALNETWORKS_TRANSPOSE
+ *
+ */
+
+/** PixelWiseMultiplication
+ *
+ * Description:
+ * Function to perform a multiplication.
+ *
+ * Equivalent Android NNAPI Op:
+ * ANEURALNETWORKS_MUL
+ *
+ */
+
+/** PoolingLayer
+ *
+ * Description:
+ * Function to perform pooling with the specified pooling operation.
+ *
+ * Equivalent Android NNAPI Op:
+ * ANEURALNETWORKS_AVERAGE_POOL_2D
+ * ANEURALNETWORKS_L2_POOL_2D
+ * ANEURALNETWORKS_MAX_POOL_2D
+ *
+ */
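+
+/* A minimal usage sketch, assuming the NEON runtime function NEPoolingLayer;
+ * the PoolingLayerInfo constructor shown follows recent releases and may
+ * differ in older ones:
+ *
+ *   #include "arm_compute/runtime/NEON/functions/NEPoolingLayer.h"
+ *
+ *   arm_compute::NEPoolingLayer pool;
+ *   // 2x2 max pooling, stride 2, no padding
+ *   pool.configure(&src, &dst,
+ *                  arm_compute::PoolingLayerInfo(arm_compute::PoolingType::MAX, 2,
+ *                                                src.info()->data_layout(),
+ *                                                arm_compute::PadStrideInfo(2, 2, 0, 0)));
+ *   pool.run();
+ */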
+
+/** Pooling3dLayer
+ *
+ * Description:
+ * Function to perform 3D pooling with the specified pooling operation.
+ *
+ * Equivalent Android NNAPI Op:
+ * n/a
+ *
+ */
+
+/** PReluLayer
+ *
+ * Description:
+ * Function to compute the activation layer with the PRELU activation function.
+ *
+ * Equivalent Android NNAPI Op:
+ * ANEURALNETWORKS_PRELU
+ *
+ */
+
+/** PriorBoxLayer
+ *
+ * Description:
+ * Function to compute prior boxes and clip.
+ *
+ * Equivalent Android NNAPI Op:
+ * n/a
+ *
+ */
+
+/** QLSTMLayer
+ *
+ * Description:
+ * Function to perform quantized LSTM (Long Short-Term Memory).
+ *
+ * Equivalent Android NNAPI Op:
+ * ANEURALNETWORKS_QUANTIZED_LSTM
+ * ANEURALNETWORKS_QUANTIZED_16BIT_LSTM
+ *
+ */
+
+/** QuantizationLayer
+ *
+ * Description:
+ * Function to perform a quantization layer.
+ *
+ * Equivalent Android NNAPI Op:
+ * ANEURALNETWORKS_QUANTIZE
+ *
+ */
+
+/** Range
+ *
+ * Description:
+ * Function to generate a sequence of numbers starting from START and extending by increments of 'STEP' up to but not including 'END'.
+ *
+ * Equivalent Android NNAPI Op:
+ * n/a
+ *
+ */
+
+/** ReduceMean
+ *
+ * Description:
+ * Function to perform a reduce mean operation.
+ *
+ * Equivalent Android NNAPI Op:
+ * ANEURALNETWORKS_MEAN
+ *
+ */
+
+/** ReductionOperation
+ *
+ * Description:
+ * Function to perform a reduction with one of the following operations:
+ * - ARG_IDX_MAX: Index of the max value
+ * - ARG_IDX_MIN: Index of the min value
+ * - MEAN_SUM: Mean of sum
+ * - PROD: Product
+ * - SUM_SQUARE: Sum of squares
+ * - SUM: Sum
+ * - MIN: Min
+ * - MAX: Max
+ *
+ * Equivalent Android NNAPI Op:
+ * ANEURALNETWORKS_REDUCE_ALL
+ * ANEURALNETWORKS_REDUCE_ANY
+ * ANEURALNETWORKS_REDUCE_MAX
+ * ANEURALNETWORKS_REDUCE_MIN
+ * ANEURALNETWORKS_REDUCE_PROD
+ * ANEURALNETWORKS_REDUCE_SUM
+ *
+ */
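+
+/* A minimal usage sketch, assuming the NEON runtime function
+ * NEReductionOperation and tensors with compatible shapes:
+ *
+ *   #include "arm_compute/runtime/NEON/functions/NEReductionOperation.h"
+ *
+ *   arm_compute::NEReductionOperation reduce;
+ *   reduce.configure(&src, &dst, 0, arm_compute::ReductionOperation::SUM); // sum over axis 0
+ *   reduce.run();
+ */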
+
+/** ReorderLayer
+ *
+ * Description:
+ * Reorders a tensor to a different weights format.
+ *
+ * Equivalent Android NNAPI Op:
+ * n/a
+ *
+ */
+
+/** ReorgLayer
+ *
+ * Description:
+ * Performs a reorganization of the input tensor into the output tensor.
+ *
+ * Equivalent Android NNAPI Op:
+ * n/a
+ *
+ */
+
+/** ReshapeLayer
+ *
+ * Description:
+ * Function to reshape a tensor.
+ *
+ * Equivalent Android NNAPI Op:
+ * ANEURALNETWORKS_RESHAPE
+ * ANEURALNETWORKS_SQUEEZE
+ *
+ */
+
+/** Reverse
+ *
+ * Description:
+ * Function to reverse a tensor according to an axis.
+ *
+ * Equivalent Android NNAPI Op:
+ * n/a
+ *
+ */
+
+/** RNNLayer
+ *
+ * Description:
+ * Function to perform a recurrent neural network layer.
+ *
+ * Equivalent Android NNAPI Op:
+ * ANEURALNETWORKS_RNN
+ *
+ */
+
+/** ROIAlignLayer
+ *
+ * Description:
+ * Function to perform ROI alignment.
+ *
+ * Equivalent Android NNAPI Op:
+ * ANEURALNETWORKS_ROI_ALIGN
+ *
+ */
+
+/** ROIPoolingLayer
+ *
+ * Description:
+ * Function to perform ROI pooling.
+ *
+ * Equivalent Android NNAPI Op:
+ * ANEURALNETWORKS_ROI_POOLING
+ *
+ */
+
+/** Scale
+ *
+ * Description:
+ * Function to resize a tensor using one of the following interpolation methods:
+ * - Bilinear
+ * - Nearest neighbor
+ *
+ * Equivalent Android NNAPI Op:
+ * ANEURALNETWORKS_RESIZE_BILINEAR
+ * ANEURALNETWORKS_RESIZE_NEAREST_NEIGHBOR
+ *
+ */
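+
+/* A minimal usage sketch, assuming the NEON runtime function NEScale and the
+ * ScaleKernelInfo-based configure() overload from recent releases:
+ *
+ *   #include "arm_compute/runtime/NEON/functions/NEScale.h"
+ *
+ *   arm_compute::NEScale scale;
+ *   scale.configure(&src, &dst,
+ *                   arm_compute::ScaleKernelInfo{arm_compute::InterpolationPolicy::BILINEAR,
+ *                                                arm_compute::BorderMode::REPLICATE});
+ *   scale.run();
+ */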
+
+/** Select
+ *
+ * Description:
+ * Function to select values from 2 tensors depending on an input tensor of booleans.
+ *
+ * Equivalent Android NNAPI Op:
+ * ANEURALNETWORKS_SELECT
+ *
+ */
+
+/** Slice
+ *
+ * Description:
+ * Function to perform tensor slicing.
+ *
+ * Equivalent Android NNAPI Op:
+ * ANEURALNETWORKS_SLICE
+ *
+ */
+
+/** SoftmaxLayer
+ *
+ * Description:
+ * Function to compute a SoftmaxLayer and a Log SoftmaxLayer.
+ *
+ * Equivalent Android NNAPI Op:
+ * ANEURALNETWORKS_LOG_SOFTMAX
+ * ANEURALNETWORKS_SOFTMAX
+ *
+ */
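+
+/* For reference, softmax along the selected axis computes
+ * softmax(x_i) = exp(x_i - max(x)) / sum_j exp(x_j - max(x)), and the log
+ * variant returns its natural logarithm. A minimal usage sketch, assuming
+ * the NEON runtime function NESoftmaxLayer:
+ *
+ *   #include "arm_compute/runtime/NEON/functions/NESoftmaxLayer.h"
+ *
+ *   arm_compute::NESoftmaxLayer softmax;
+ *   softmax.configure(&logits, &probs); // beta and axis keep their defaults
+ *   softmax.run();
+ */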
+
+/** SpaceToBatchLayer
+ *
+ * Description:
+ * Function to divide a tensor spatially.
+ *
+ * Equivalent Android NNAPI Op:
+ * ANEURALNETWORKS_SPACE_TO_BATCH_ND
+ *
+ */
+
+/** SpaceToDepthLayer
+ *
+ * Description:
+ * Function to rearrange blocks of spatial data into depth.
+ *
+ * Equivalent Android NNAPI Op:
+ * ANEURALNETWORKS_SPACE_TO_DEPTH
+ *
+ */
+
+/** Split
+ *
+ * Description:
+ * Function to split a tensor along a given axis.
+ *
+ * Equivalent Android NNAPI Op:
+ * ANEURALNETWORKS_SPLIT
+ *
+ */
+
+/** StackLayer
+ *
+ * Description:
+ * Function to stack tensors along an axis.
+ *
+ * Equivalent Android NNAPI Op:
+ * n/a
+ *
+ */
+
+/** StridedSlice
+ *
+ * Description:
+ * Function to extract a strided slice of a tensor.
+ *
+ * Equivalent Android NNAPI Op:
+ * ANEURALNETWORKS_STRIDED_SLICE
+ *
+ */
+
+/** Tile
+ *
+ * Description:
+ * Function to construct a tensor by tiling a given tensor.
+ *
+ * Equivalent Android NNAPI Op:
+ * ANEURALNETWORKS_TILE
+ *
+ */
+
+/** Transpose
+ *
+ * Description:
+ * Function to transpose a 2D tensor.
+ *
+ * Equivalent Android NNAPI Op:
+ * ANEURALNETWORKS_TRANSPOSE
+ *
+ */
+
+/** Unstack
+ *
+ * Description:
+ * Function to unpack a rank-R tensor into rank-(R-1) tensors.
+ *
+ * Equivalent Android NNAPI Op:
+ * n/a
+ *
+ */
+
+/** WinogradConvolutionLayer
+ *
+ * Description:
+ * Function to perform Winograd convolution.
+ *
+ * Equivalent Android NNAPI Op:
+ * ANEURALNETWORKS_CONV_2D
+ *
+ */
+
+#endif // ACL_ARM_COMPUTE_RUNTIME_OPERATORLIST_H
diff --git a/arm_compute/runtime/OperatorTensor.h b/arm_compute/runtime/OperatorTensor.h
new file mode 100644
index 0000000000..237585bec2
--- /dev/null
+++ b/arm_compute/runtime/OperatorTensor.h
@@ -0,0 +1,75 @@
+/*
+ * Copyright (c) 2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_OPERATORTENSOR_H
+#define ARM_COMPUTE_OPERATORTENSOR_H
+
+#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/runtime/experimental/Types.h"
+#include "arm_compute/runtime/Types.h"
+
+#include <cstdint>
+
+namespace arm_compute
+{
+class TensorInfo;
+class IRuntimeContext;
+class IMemory;
+namespace experimental
+{
+/** Basic implementation of the tensor interface */
+class OperatorTensor : public ITensor
+{
+public:
+ /** Constructor
+ *
+ * @param[in] info Pointer to the tensor info.
+ * @param[in] memory Pointer to the memory info.
+ *
+ */
+ OperatorTensor(ITensorInfo *info, IMemory *memory);
+ /** Destructor: free the tensor's memory */
+ ~OperatorTensor() = default;
+ /** Allow instances of this class to be move constructed */
+ OperatorTensor(OperatorTensor &&) = default;
+ /** Allow instances of this class to be moved */
+ OperatorTensor &operator=(OperatorTensor &&) = default;
+ /** Prevent instances of this class from being copy assigned */
+ OperatorTensor &operator=(const OperatorTensor &) = delete;
+ /** Prevent instances of this class from being copy constructed */
+ OperatorTensor(const OperatorTensor &) = delete;
+
+ // Inherited methods overridden:
+ arm_compute::ITensorInfo *info() const override;
+ arm_compute::ITensorInfo *info() override;
+ uint8_t *buffer() const override;
+
+private:
+ arm_compute::ITensorInfo *_info;
+ arm_compute::IMemory *_memory;
+ MemoryType _mem_type;
+};
+} // namespace experimental
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_OPERATORTENSOR_H */
diff --git a/arm_compute/runtime/PoolManager.h b/arm_compute/runtime/PoolManager.h
index 2ee00bceab..6aa6aef6e2 100644
--- a/arm_compute/runtime/PoolManager.h
+++ b/arm_compute/runtime/PoolManager.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -24,10 +24,10 @@
#ifndef ARM_COMPUTE_POOLMANAGER_H
#define ARM_COMPUTE_POOLMANAGER_H
-#include "arm_compute/runtime/IPoolManager.h"
-
#include "arm_compute/core/Error.h"
#include "arm_compute/runtime/IMemoryPool.h"
+#include "arm_compute/runtime/IPoolManager.h"
+
#include "support/Mutex.h"
#include "support/Semaphore.h"
@@ -53,9 +53,9 @@ public:
PoolManager &operator=(PoolManager &&) = delete;
// Inherited methods overridden:
- IMemoryPool *lock_pool() override;
- void unlock_pool(IMemoryPool *pool) override;
- void register_pool(std::unique_ptr<IMemoryPool> pool) override;
+ IMemoryPool *lock_pool() override;
+ void unlock_pool(IMemoryPool *pool) override;
+ void register_pool(std::unique_ptr<IMemoryPool> pool) override;
std::unique_ptr<IMemoryPool> release_pool() override;
void clear_pools() override;
size_t num_pools() const override;
@@ -66,5 +66,5 @@ private:
std::unique_ptr<arm_compute::Semaphore> _sem; /**< Semaphore to control the queues */
mutable arm_compute::Mutex _mtx; /**< Mutex to control access to the queues */
};
-} // arm_compute
+} // namespace arm_compute
#endif /*ARM_COMPUTE_POOLMANAGER_H */
diff --git a/arm_compute/runtime/Pyramid.h b/arm_compute/runtime/Pyramid.h
deleted file mode 100644
index 3dc7259a71..0000000000
--- a/arm_compute/runtime/Pyramid.h
+++ /dev/null
@@ -1,76 +0,0 @@
-/*
- * Copyright (c) 2016-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_PYRAMID_H
-#define ARM_COMPUTE_PYRAMID_H
-
-#include "arm_compute/core/IPyramid.h"
-#include "arm_compute/core/PyramidInfo.h"
-#include "arm_compute/core/Types.h"
-#include "arm_compute/runtime/Tensor.h"
-
-#include <cstddef>
-#include <memory>
-
-namespace arm_compute
-{
-class Tensor;
-
-/** Basic implementation of the pyramid interface */
-class Pyramid : public IPyramid
-{
-public:
- /** Initialize pyramid data-object using the given Pyramid's metadata
- *
- * @param[in] info Pyramid's metadata
- */
- void init(const PyramidInfo &info);
-
- /** Initialize pyramid data-object using the given Pyramid's metadata
- *
- * @note Uses conservative padding strategy which fits all kernels.
- *
- * @param[in] info Pyramid's metadata
- */
- void init_auto_padding(const PyramidInfo &info);
-
- /** Allocate the planes in the pyramid */
- void allocate();
-
- // Inherited method overridden
- const PyramidInfo *info() const override;
- Tensor *get_pyramid_level(size_t index) const override;
-
-private:
- /** Initialize pyramid data-object using the given Pyramid's metadata
- *
- * @param[in] info Pyramid's metadata
- * @param[in] auto_padding Specifies whether the image in the pyramid use auto padding
- */
- void internal_init(const PyramidInfo &info, bool auto_padding);
-
- PyramidInfo _info{};
- mutable std::vector<Tensor> _pyramid{};
-};
-}
-#endif /*ARM_COMPUTE_PYRAMID_H */
diff --git a/arm_compute/runtime/RuntimeContext.h b/arm_compute/runtime/RuntimeContext.h
index 361d601700..d64e609196 100644
--- a/arm_compute/runtime/RuntimeContext.h
+++ b/arm_compute/runtime/RuntimeContext.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019 ARM Limited.
+ * Copyright (c) 2019, 2021 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -24,7 +24,6 @@
#ifndef ARM_COMPUTE_RUNTIME_CONTEXT_H
#define ARM_COMPUTE_RUNTIME_CONTEXT_H
-#include "arm_compute/runtime/DeviceProperties.h"
#include "arm_compute/runtime/IRuntimeContext.h"
#include <memory>
@@ -51,14 +50,12 @@ public:
void set_scheduler(IScheduler *scheduler);
// Inherited overridden methods
- IScheduler *scheduler() override;
- IAssetManager *asset_manager() override;
- const DeviceProperties &properties() override;
+ IScheduler *scheduler() override;
+ IAssetManager *asset_manager() override;
private:
- std::unique_ptr<IScheduler> _owned_scheduler{ nullptr };
- IScheduler *_scheduler{ nullptr };
- DeviceProperties _device_props{};
+ std::unique_ptr<IScheduler> _owned_scheduler{nullptr};
+ IScheduler *_scheduler{nullptr};
};
} // namespace arm_compute
#endif /*ARM_COMPUTE_RUNTIME_CONTEXT_H */
diff --git a/arm_compute/runtime/Scheduler.h b/arm_compute/runtime/Scheduler.h
index f8e9580710..7c83f86caa 100644
--- a/arm_compute/runtime/Scheduler.h
+++ b/arm_compute/runtime/Scheduler.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2019, 2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -21,8 +21,8 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
-#ifndef ARM_COMPUTE_SCHEDULER_H
-#define ARM_COMPUTE_SCHEDULER_H
+#ifndef ACL_ARM_COMPUTE_RUNTIME_SCHEDULER_H
+#define ACL_ARM_COMPUTE_RUNTIME_SCHEDULER_H
#include "arm_compute/runtime/IScheduler.h"
@@ -74,11 +74,11 @@ public:
static bool is_available(Type t);
private:
- static Type _scheduler_type;
- static std::shared_ptr<IScheduler> _custom_scheduler;
+ static Type _scheduler_type;
+ static std::shared_ptr<IScheduler> _custom_scheduler;
static std::map<Type, std::unique_ptr<IScheduler>> _schedulers;
Scheduler();
};
} // namespace arm_compute
-#endif /* ARM_COMPUTE_SCHEDULER_H */
+#endif // ACL_ARM_COMPUTE_RUNTIME_SCHEDULER_H
diff --git a/arm_compute/runtime/SchedulerFactory.h b/arm_compute/runtime/SchedulerFactory.h
index 5c24857b0b..647e486d42 100644
--- a/arm_compute/runtime/SchedulerFactory.h
+++ b/arm_compute/runtime/SchedulerFactory.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019 ARM Limited.
+ * Copyright (c) 2019 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
diff --git a/arm_compute/runtime/SingleThreadScheduler.h b/arm_compute/runtime/SingleThreadScheduler.h
index 3f279ebb19..a9541d15a7 100644
--- a/arm_compute/runtime/SingleThreadScheduler.h
+++ b/arm_compute/runtime/SingleThreadScheduler.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2021 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -50,6 +50,14 @@ public:
* @param[in] hints Hints for the scheduler.
*/
void schedule(ICPPKernel *kernel, const Hints &hints) override;
+ /** Runs the kernel in the same thread as the caller synchronously.
+ *
+ * @param[in] kernel Kernel to execute.
+ * @param[in] hints Hints for the scheduler.
+ * @param[in] window Window to use for kernel execution.
+ * @param[in] tensors Vector containing the tensors to operate on.
+ */
+ void schedule_op(ICPPKernel *kernel, const Hints &hints, const Window &window, ITensorPack &tensors) override;
protected:
/** Will run the workloads sequentially and in order.
diff --git a/arm_compute/runtime/SubTensor.h b/arm_compute/runtime/SubTensor.h
index b296877927..2badb31b26 100644
--- a/arm_compute/runtime/SubTensor.h
+++ b/arm_compute/runtime/SubTensor.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2019 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -72,5 +72,5 @@ private:
ITensor *_parent;
mutable SubTensorInfo _info;
};
-}
+} // namespace arm_compute
#endif /*ARM_COMPUTE_SUBTENSOR_H */
diff --git a/arm_compute/runtime/Tensor.h b/arm_compute/runtime/Tensor.h
index 71ed294244..e71fbd4a96 100644
--- a/arm_compute/runtime/Tensor.h
+++ b/arm_compute/runtime/Tensor.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2016-2019 ARM Limited.
+ * Copyright (c) 2016-2019 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -59,7 +59,7 @@ public:
ITensorInfo *info() const override;
ITensorInfo *info() override;
uint8_t *buffer() const override;
- void associate_memory_group(IMemoryGroup *memory_group) override;
+ void associate_memory_group(IMemoryGroup *memory_group) override;
private:
mutable TensorAllocator _allocator; /**< Instance of the basic CPU allocator.*/
diff --git a/arm_compute/runtime/TensorAllocator.h b/arm_compute/runtime/TensorAllocator.h
index c0c6f2a5d2..d819931415 100644
--- a/arm_compute/runtime/TensorAllocator.h
+++ b/arm_compute/runtime/TensorAllocator.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2016-2019 ARM Limited.
+ * Copyright (c) 2016-2019 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -24,7 +24,6 @@
#ifndef ARM_COMPUTE_TENSORALLOCATOR_H
#define ARM_COMPUTE_TENSORALLOCATOR_H
#include "arm_compute/runtime/ITensorAllocator.h"
-
#include "arm_compute/runtime/Memory.h"
#include "arm_compute/runtime/MemoryGroup.h"
diff --git a/arm_compute/runtime/Types.h b/arm_compute/runtime/Types.h
index 1b6e1bdba7..f5b7b06d2e 100644
--- a/arm_compute/runtime/Types.h
+++ b/arm_compute/runtime/Types.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2016-2019 ARM Limited.
+ * Copyright (c) 2016-2019 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
diff --git a/arm_compute/runtime/common/LSTMParams.h b/arm_compute/runtime/common/LSTMParams.h
index 5e4a76afae..6800faf87f 100644
--- a/arm_compute/runtime/common/LSTMParams.h
+++ b/arm_compute/runtime/common/LSTMParams.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018-2020 ARM Limited.
+ * Copyright (c) 2018-2021 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -24,8 +24,6 @@
#ifndef ARM_COMPUTE_LSTMPARAMS_H
#define ARM_COMPUTE_LSTMPARAMS_H
-#include "arm_compute/core/IPyramid.h"
-#include "arm_compute/core/PyramidInfo.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/runtime/Tensor.h"
@@ -81,7 +79,10 @@ public:
*
* @return Reference to this LSTMParams object
*/
- LSTMParams &set_cifg_params(const T *input_to_input_weights, const T *recurrent_to_input_weights, const T *cell_to_input_weights, const T *input_gate_bias)
+ LSTMParams &set_cifg_params(const T *input_to_input_weights,
+ const T *recurrent_to_input_weights,
+ T *cell_to_input_weights,
+ const T *input_gate_bias)
{
_input_to_input_weights = input_to_input_weights;
_recurrent_to_input_weights = recurrent_to_input_weights;
@@ -111,7 +112,7 @@ public:
*
* @return Reference to this LSTMParams object
*/
- LSTMParams &set_peephole_params(const T *cell_to_forget_weights, const T *cell_to_output_weights)
+ LSTMParams &set_peephole_params(T *cell_to_forget_weights, T *cell_to_output_weights)
{
_cell_to_forget_weights = cell_to_forget_weights;
_cell_to_output_weights = cell_to_output_weights;
@@ -127,8 +128,10 @@ public:
*
* @return Reference to this LSTMParams object
*/
- LSTMParams &set_layer_normalization_params(const T *input_layer_norm_weights, const T *forget_layer_norm_weights,
- const T *cell_layer_norm_weights, const T *output_layer_norm_weights)
+ LSTMParams &set_layer_normalization_params(T *input_layer_norm_weights,
+ T *forget_layer_norm_weights,
+ T *cell_layer_norm_weights,
+ T *output_layer_norm_weights)
{
_input_layer_norm_weights = input_layer_norm_weights;
_forget_layer_norm_weights = forget_layer_norm_weights;
@@ -171,7 +174,10 @@ public:
*
* @return Reference to this LSTMParams object
*/
- LSTMParams &set_matmul_scale_params(float input_intermediate_scale, float forget_intermediate_scale, float cell_intermediate_scale, float output_intermediate_scale)
+ LSTMParams &set_matmul_scale_params(float input_intermediate_scale,
+ float forget_intermediate_scale,
+ float cell_intermediate_scale,
+ float output_intermediate_scale)
{
_input_intermediate_scale = input_intermediate_scale;
_forget_intermediate_scale = forget_intermediate_scale;
@@ -204,7 +210,7 @@ public:
return _recurrent_to_input_weights;
}
- const T *cell_to_input_weights() const
+ T *cell_to_input_weights() const
{
return _cell_to_input_weights;
}
@@ -214,12 +220,12 @@ public:
return _input_gate_bias;
}
- const T *cell_to_forget_weights() const
+ T *cell_to_forget_weights() const
{
return _cell_to_forget_weights;
}
- const T *cell_to_output_weights() const
+ T *cell_to_output_weights() const
{
return _cell_to_output_weights;
}
@@ -234,22 +240,22 @@ public:
return _projection_bias;
}
- const T *input_layer_norm_weights() const
+ T *input_layer_norm_weights() const
{
return _input_layer_norm_weights;
}
- const T *forget_layer_norm_weights() const
+ T *forget_layer_norm_weights() const
{
return _forget_layer_norm_weights;
}
- const T *cell_layer_norm_weights() const
+ T *cell_layer_norm_weights() const
{
return _cell_layer_norm_weights;
}
- const T *output_layer_norm_weights() const
+ T *output_layer_norm_weights() const
{
return _output_layer_norm_weights;
}
@@ -317,16 +323,16 @@ public:
private:
const T *_input_to_input_weights;
const T *_recurrent_to_input_weights;
- const T *_cell_to_input_weights;
+ T *_cell_to_input_weights;
const T *_input_gate_bias;
- const T *_cell_to_forget_weights;
- const T *_cell_to_output_weights;
+ T *_cell_to_forget_weights;
+ T *_cell_to_output_weights;
const T *_projection_weights;
const T *_projection_bias;
- const T *_input_layer_norm_weights;
- const T *_forget_layer_norm_weights;
- const T *_cell_layer_norm_weights;
- const T *_output_layer_norm_weights;
+ T *_input_layer_norm_weights;
+ T *_forget_layer_norm_weights;
+ T *_cell_layer_norm_weights;
+ T *_output_layer_norm_weights;
float _cell_clip;
float _projection_clip;
float _input_intermediate_scale;
@@ -340,5 +346,5 @@ private:
bool _has_cifg_opt;
bool _use_layer_norm;
};
-}
+} // namespace arm_compute
#endif /*ARM_COMPUTE_LSTMPARAMS_H */
diff --git a/arm_compute/core/NEON/INEKernel.h b/arm_compute/runtime/experimental/Types.h
index c09972353c..ce44253f82 100644
--- a/arm_compute/core/NEON/INEKernel.h
+++ b/arm_compute/runtime/experimental/Types.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2020-2021 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -21,14 +21,21 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
-#ifndef ARM_COMPUTE_INEKERNEL_H
-#define ARM_COMPUTE_INEKERNEL_H
+#ifndef ARM_COMPUTE_RUNTIME_EXPERIMENTAL_TYPES_H
+#define ARM_COMPUTE_RUNTIME_EXPERIMENTAL_TYPES_H
-#include "arm_compute/core/CPP/ICPPKernel.h"
+#include <vector>
namespace arm_compute
{
-/** Common interface for all kernels implemented in NEON. */
-using INEKernel = ICPPKernel;
+namespace experimental
+{
+/** Memory type */
+enum class MemoryType
+{
+ CPU,
+ CL,
+};
+} // namespace experimental
} // namespace arm_compute
-#endif /*ARM_COMPUTE_INEKERNEL_H */
+#endif /* ARM_COMPUTE_RUNTIME_EXPERIMENTAL_TYPES_H */