From c3c352e60050f3deacad767e429a88dc24b31af0 Mon Sep 17 00:00:00 2001 From: Georgios Pinitas Date: Thu, 18 Mar 2021 10:59:40 +0000 Subject: Add Queue support Queues are responsible for scheduling operators and performing other runtime related activities like for example tuning. Signed-off-by: Georgios Pinitas Change-Id: I0366d9048470d277b8cbf59fa42f95c0ae57c5c9 Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/5487 Tested-by: Arm Jenkins Reviewed-by: Michele Di Giorgio Reviewed-by: Michalis Spyrou Comments-Addressed: Arm Jenkins --- Android.bp | 25 +- SConscript | 89 +++- arm_compute/Acl.hpp | 104 ++++- arm_compute/AclEntrypoints.h | 43 ++ arm_compute/AclOpenClExt.h | 38 +- arm_compute/AclTypes.h | 18 + arm_compute/core/CL/OpenCL.h | 1 + docs/01_library.dox | 22 +- src/c/AclQueue.cpp | 99 ++++ src/c/cl/AclOpenClExt.cpp | 75 +++ src/common/IContext.h | 16 + src/common/IQueue.h | 100 ++++ src/core/CL/OpenCL.cpp | 18 + .../CpuDirectConvolutionOutputStageKernel.cpp | 513 +++++++++++++++++++++ .../kernels/CpuDirectConvolutionStageKernel.cpp | 513 --------------------- src/core/cpu/kernels/activation/NEON/fp16.cpp | 217 --------- src/core/cpu/kernels/activation/NEON/fp32.cpp | 212 --------- src/core/cpu/kernels/activation/NEON/qasymm8.cpp | 262 ----------- .../cpu/kernels/activation/NEON/qasymm8_signed.cpp | 261 ----------- src/core/cpu/kernels/activation/NEON/qsymm16.cpp | 138 ------ src/core/cpu/kernels/activation/SVE/fp16.cpp | 130 ------ src/core/cpu/kernels/activation/SVE/fp32.cpp | 131 ------ src/core/cpu/kernels/activation/SVE/qasymm8.cpp | 254 ---------- .../cpu/kernels/activation/SVE/qasymm8_signed.cpp | 253 ---------- src/core/cpu/kernels/activation/SVE/qsymm16.cpp | 120 ----- src/core/cpu/kernels/activation/neon/fp16.cpp | 217 +++++++++ src/core/cpu/kernels/activation/neon/fp32.cpp | 212 +++++++++ src/core/cpu/kernels/activation/neon/qasymm8.cpp | 262 +++++++++++ .../cpu/kernels/activation/neon/qasymm8_signed.cpp | 261 +++++++++++ src/core/cpu/kernels/activation/neon/qsymm16.cpp | 138 ++++++ src/core/cpu/kernels/activation/sve/fp16.cpp | 130 ++++++ src/core/cpu/kernels/activation/sve/fp32.cpp | 131 ++++++ src/core/cpu/kernels/activation/sve/qasymm8.cpp | 254 ++++++++++ .../cpu/kernels/activation/sve/qasymm8_signed.cpp | 253 ++++++++++ src/core/cpu/kernels/activation/sve/qsymm16.cpp | 120 +++++ src/cpu/CpuContext.cpp | 6 + src/cpu/CpuContext.h | 1 + src/cpu/CpuQueue.cpp | 48 ++ src/cpu/CpuQueue.h | 56 +++ src/gpu/cl/ClContext.cpp | 23 +- src/gpu/cl/ClContext.h | 12 +- src/gpu/cl/ClQueue.cpp | 102 ++++ src/gpu/cl/ClQueue.h | 84 ++++ tests/framework/Macros.h | 5 + tests/validation/cpu/unit/Context.cpp | 18 +- tests/validation/cpu/unit/Queue.cpp | 48 ++ tests/validation/cpu/unit/Tensor.cpp | 35 +- tests/validation/cpu/unit/TensorPack.cpp | 22 +- tests/validation/fixtures/UNIT/Context.h | 148 ------ tests/validation/fixtures/UNIT/ContextFixture.h | 148 ++++++ tests/validation/fixtures/UNIT/QueueFixture.h | 144 ++++++ tests/validation/fixtures/UNIT/Tensor.h | 424 ----------------- tests/validation/fixtures/UNIT/TensorFixture.h | 424 +++++++++++++++++ tests/validation/fixtures/UNIT/TensorPack.h | 184 -------- tests/validation/fixtures/UNIT/TensorPackFixture.h | 184 ++++++++ tests/validation/gpu/unit/Context.cpp | 14 +- tests/validation/gpu/unit/Queue.cpp | 81 ++++ tests/validation/gpu/unit/Tensor.cpp | 31 +- tests/validation/gpu/unit/TensorPack.cpp | 22 +- 59 files changed, 4516 insertions(+), 3378 deletions(-) create mode 100644 src/c/AclQueue.cpp create mode 100644 
src/common/IQueue.h create mode 100644 src/core/cpu/kernels/CpuDirectConvolutionOutputStageKernel.cpp delete mode 100644 src/core/cpu/kernels/CpuDirectConvolutionStageKernel.cpp delete mode 100644 src/core/cpu/kernels/activation/NEON/fp16.cpp delete mode 100644 src/core/cpu/kernels/activation/NEON/fp32.cpp delete mode 100644 src/core/cpu/kernels/activation/NEON/qasymm8.cpp delete mode 100644 src/core/cpu/kernels/activation/NEON/qasymm8_signed.cpp delete mode 100644 src/core/cpu/kernels/activation/NEON/qsymm16.cpp delete mode 100644 src/core/cpu/kernels/activation/SVE/fp16.cpp delete mode 100644 src/core/cpu/kernels/activation/SVE/fp32.cpp delete mode 100644 src/core/cpu/kernels/activation/SVE/qasymm8.cpp delete mode 100644 src/core/cpu/kernels/activation/SVE/qasymm8_signed.cpp delete mode 100644 src/core/cpu/kernels/activation/SVE/qsymm16.cpp create mode 100644 src/core/cpu/kernels/activation/neon/fp16.cpp create mode 100644 src/core/cpu/kernels/activation/neon/fp32.cpp create mode 100644 src/core/cpu/kernels/activation/neon/qasymm8.cpp create mode 100644 src/core/cpu/kernels/activation/neon/qasymm8_signed.cpp create mode 100644 src/core/cpu/kernels/activation/neon/qsymm16.cpp create mode 100644 src/core/cpu/kernels/activation/sve/fp16.cpp create mode 100644 src/core/cpu/kernels/activation/sve/fp32.cpp create mode 100644 src/core/cpu/kernels/activation/sve/qasymm8.cpp create mode 100644 src/core/cpu/kernels/activation/sve/qasymm8_signed.cpp create mode 100644 src/core/cpu/kernels/activation/sve/qsymm16.cpp create mode 100644 src/cpu/CpuQueue.cpp create mode 100644 src/cpu/CpuQueue.h create mode 100644 src/gpu/cl/ClQueue.cpp create mode 100644 src/gpu/cl/ClQueue.h create mode 100644 tests/validation/cpu/unit/Queue.cpp delete mode 100644 tests/validation/fixtures/UNIT/Context.h create mode 100644 tests/validation/fixtures/UNIT/ContextFixture.h create mode 100644 tests/validation/fixtures/UNIT/QueueFixture.h delete mode 100644 tests/validation/fixtures/UNIT/Tensor.h create mode 100644 tests/validation/fixtures/UNIT/TensorFixture.h delete mode 100644 tests/validation/fixtures/UNIT/TensorPack.h create mode 100644 tests/validation/fixtures/UNIT/TensorPackFixture.h create mode 100644 tests/validation/gpu/unit/Queue.cpp diff --git a/Android.bp b/Android.bp index f542b20104..78ac7f130f 100644 --- a/Android.bp +++ b/Android.bp @@ -52,6 +52,7 @@ cc_library_static { export_include_dirs: [".", "./include"], srcs: [ "src/c/AclContext.cpp", + "src/c/AclQueue.cpp", "src/c/AclTensor.cpp", "src/c/AclTensorPack.cpp", "src/c/AclVersion.cpp", @@ -300,7 +301,7 @@ cc_library_static { "src/core/cpu/kernels/CpuDepthwiseConvolutionNativeKernel.cpp", "src/core/cpu/kernels/CpuDequantizationKernel.cpp", "src/core/cpu/kernels/CpuDirectConvolutionKernel.cpp", - "src/core/cpu/kernels/CpuDirectConvolutionStageKernel.cpp", + "src/core/cpu/kernels/CpuDirectConvolutionOutputStageKernel.cpp", "src/core/cpu/kernels/CpuElementwiseKernel.cpp", "src/core/cpu/kernels/CpuElementwiseUnaryKernel.cpp", "src/core/cpu/kernels/CpuFillKernel.cpp", @@ -315,16 +316,16 @@ cc_library_static { "src/core/cpu/kernels/CpuSoftmaxKernel.cpp", "src/core/cpu/kernels/CpuSubKernel.cpp", "src/core/cpu/kernels/CpuTransposeKernel.cpp", - "src/core/cpu/kernels/activation/NEON/fp16.cpp", - "src/core/cpu/kernels/activation/NEON/fp32.cpp", - "src/core/cpu/kernels/activation/NEON/qasymm8.cpp", - "src/core/cpu/kernels/activation/NEON/qasymm8_signed.cpp", - "src/core/cpu/kernels/activation/NEON/qsymm16.cpp", - "src/core/cpu/kernels/activation/SVE/fp16.cpp", - 
"src/core/cpu/kernels/activation/SVE/fp32.cpp", - "src/core/cpu/kernels/activation/SVE/qasymm8.cpp", - "src/core/cpu/kernels/activation/SVE/qasymm8_signed.cpp", - "src/core/cpu/kernels/activation/SVE/qsymm16.cpp", + "src/core/cpu/kernels/activation/neon/fp16.cpp", + "src/core/cpu/kernels/activation/neon/fp32.cpp", + "src/core/cpu/kernels/activation/neon/qasymm8.cpp", + "src/core/cpu/kernels/activation/neon/qasymm8_signed.cpp", + "src/core/cpu/kernels/activation/neon/qsymm16.cpp", + "src/core/cpu/kernels/activation/sve/fp16.cpp", + "src/core/cpu/kernels/activation/sve/fp32.cpp", + "src/core/cpu/kernels/activation/sve/qasymm8.cpp", + "src/core/cpu/kernels/activation/sve/qasymm8_signed.cpp", + "src/core/cpu/kernels/activation/sve/qsymm16.cpp", "src/core/cpu/kernels/add/neon/integer.cpp", "src/core/cpu/kernels/add/neon/qasymm8.cpp", "src/core/cpu/kernels/add/neon/qasymm8_signed.cpp", @@ -390,8 +391,10 @@ cc_library_static { "src/core/utils/misc/MMappedFile.cpp", "src/core/utils/quantization/AsymmHelpers.cpp", "src/cpu/CpuContext.cpp", + "src/cpu/CpuQueue.cpp", "src/cpu/CpuTensor.cpp", "src/gpu/cl/ClContext.cpp", + "src/gpu/cl/ClQueue.cpp", "src/gpu/cl/ClTensor.cpp", "src/runtime/Allocator.cpp", "src/runtime/BlobLifetimeManager.cpp", diff --git a/SConscript b/SConscript index b09551fc82..63c2a483dc 100644 --- a/SConscript +++ b/SConscript @@ -188,11 +188,25 @@ runtime_files = Glob('src/runtime/*.cpp') runtime_files += Glob('src/runtime/CPP/ICPPSimpleFunction.cpp') runtime_files += Glob('src/runtime/CPP/functions/*.cpp') -runtime_files += Glob('src/c/*.cpp') -runtime_files += Glob('src/common/*.cpp') -runtime_files += Glob('src/common/utils/*.cpp') -runtime_files += Glob('src/cpu/*.cpp') +# C API files +c_api_files = ['src/c/AclContext.cpp', + 'src/c/AclQueue.cpp', + 'src/c/AclTensor.cpp', + 'src/c/AclTensorPack.cpp', + 'src/c/AclVersion.cpp', + ] +if env['opencl']: + c_api_files += ['src/c/cl/AclOpenClExt.cpp'] + +# Common backend files +common_backend_files = ['src/common/utils/LegacySupport.cpp', + 'src/common/AllocatorWrapper.cpp', + 'src/common/ITensorV2.cpp', + 'src/common/TensorPack.cpp', + ] +core_files += common_backend_files +runtime_files += c_api_files # CLHarrisCorners uses the Scheduler to run CPP kernels runtime_files += Glob('src/runtime/CPP/SingleThreadScheduler.cpp') @@ -225,7 +239,6 @@ if env['opencl']: runtime_files += Glob('src/runtime/CL/gemm_auto_heuristics/*.cpp') runtime_files += Glob('src/gpu/cl/*.cpp') - runtime_files += Glob('src/c/cl/*.cpp') graph_files += Glob('src/graph/backends/CL/*.cpp') @@ -278,8 +291,36 @@ if env['neon']: runtime_files += Glob('src/runtime/NEON/functions/*.cpp') runtime_files += Glob('src/runtime/NEON/functions/assembly/*.cpp') - core_files += Glob('src/core/cpu/*.cpp') - core_files += Glob('src/core/cpu/kernels/*.cpp') + cpu_kernel_hp_files = ['src/core/cpu/kernels/CpuActivationKernel.cpp', + 'src/core/cpu/kernels/CpuDepthwiseConvolutionNativeKernel.cpp', + 'src/core/cpu/kernels/CpuDirectConvolutionKernel.cpp', + 'src/core/cpu/kernels/CpuDirectConvolutionOutputStageKernel.cpp', + 'src/core/cpu/kernels/CpuPermuteKernel.cpp', + 'src/core/cpu/kernels/CpuPoolingAssemblyWrapperKernel.cpp', + 'src/core/cpu/kernels/CpuPoolingKernel.cpp', + 'src/core/cpu/kernels/CpuReshapeKernel.cpp', + ] + cpu_kernel_files = ['src/core/cpu/kernels/CpuAddKernel.cpp', + 'src/core/cpu/kernels/CpuConcatenateBatchKernel.cpp', + 'src/core/cpu/kernels/CpuConcatenateDepthKernel.cpp', + 'src/core/cpu/kernels/CpuConcatenateHeightKernel.cpp', + 
'src/core/cpu/kernels/CpuConcatenateWidthKernel.cpp', + 'src/core/cpu/kernels/CpuConvertFullyConnectedWeightsKernel.cpp', + 'src/core/cpu/kernels/CpuCopyKernel.cpp', + 'src/core/cpu/kernels/CpuDequantizationKernel.cpp', + 'src/core/cpu/kernels/CpuElementwiseKernel.cpp', + 'src/core/cpu/kernels/CpuElementwiseUnaryKernel.cpp', + 'src/core/cpu/kernels/CpuFillKernel.cpp', + 'src/core/cpu/kernels/CpuFloorKernel.cpp', + 'src/core/cpu/kernels/CpuPixelWiseMultiplicationKernel.cpp', + 'src/core/cpu/kernels/CpuQuantizationKernel.cpp', + 'src/core/cpu/kernels/CpuScaleKernel.cpp', + 'src/core/cpu/kernels/CpuSoftmaxKernel.cpp', + 'src/core/cpu/kernels/CpuSubKernel.cpp', + 'src/core/cpu/kernels/CpuTransposeKernel.cpp', + ] + core_files += [cpu_kernel_hp_files, cpu_kernel_files] + core_files += Glob('src/core/cpu/kernels/*/*.cpp') if any(i in env['data_type_support'] for i in ['all', 'fp16']): core_files += Glob('src/core/cpu/kernels/*/*/fp16.cpp') @@ -293,12 +334,40 @@ if env['neon']: core_files += Glob('src/core/cpu/kernels/*/*/qsymm16.cpp') if any(i in env['data_type_support'] for i in ['all', 'integer']): core_files += Glob('src/core/cpu/kernels/*/*/integer.cpp') - + if any(i in env['data_layout_support'] for i in ['all', 'nchw']): core_files += Glob('src/core/cpu/kernels/*/*/nchw/all.cpp') - runtime_files += Glob('src/runtime/cpu/*.cpp') - runtime_files += Glob('src/runtime/cpu/operators/*.cpp') + cpu_rt_files = ['src/cpu/CpuContext.cpp', + 'src/cpu/CpuQueue.cpp', + 'src/cpu/CpuTensor.cpp' + ] + cpu_operator_hp_files = ['src/runtime/cpu/operators/CpuActivation.cpp', + 'src/runtime/cpu/operators/CpuDepthwiseConvolution.cpp', + 'src/runtime/cpu/operators/CpuDepthwiseConvolutionAssemblyDispatch.cpp', + 'src/runtime/cpu/operators/CpuDirectConvolution.cpp', + 'src/runtime/cpu/operators/CpuPermute.cpp', + 'src/runtime/cpu/operators/CpuPooling.cpp', + 'src/runtime/cpu/operators/CpuPoolingAssemblyDispatch.cpp', + ] + cpu_operator_files = ['src/runtime/cpu/operators/CpuAdd.cpp', + 'src/runtime/cpu/operators/CpuConcatenate.cpp', + 'src/runtime/cpu/operators/CpuConvertFullyConnectedWeights.cpp', + 'src/runtime/cpu/operators/CpuCopy.cpp', + 'src/runtime/cpu/operators/CpuDequantization.cpp', + 'src/runtime/cpu/operators/CpuElementwise.cpp', + 'src/runtime/cpu/operators/CpuElementwiseUnary.cpp', + 'src/runtime/cpu/operators/CpuFill.cpp', + 'src/runtime/cpu/operators/CpuFloor.cpp', + 'src/runtime/cpu/operators/CpuPixelWiseMultiplication.cpp', + 'src/runtime/cpu/operators/CpuQuantization.cpp', + 'src/runtime/cpu/operators/CpuReshape.cpp', + 'src/runtime/cpu/operators/CpuScale.cpp', + 'src/runtime/cpu/operators/CpuSoftmax.cpp', + 'src/runtime/cpu/operators/CpuSub.cpp', + 'src/runtime/cpu/operators/CpuTranspose.cpp', + ] + runtime_files += [ cpu_rt_files, cpu_operator_hp_files, cpu_operator_files ] bootcode_o = [] if env['os'] == 'bare_metal': diff --git a/arm_compute/Acl.hpp b/arm_compute/Acl.hpp index 01f7179c2f..93ac2d8ed9 100644 --- a/arm_compute/Acl.hpp +++ b/arm_compute/Acl.hpp @@ -42,6 +42,7 @@ namespace acl { // Forward declarations class Context; +class Queue; class Tensor; class TensorPack; @@ -83,6 +84,7 @@ struct ObjectDeleter }; OBJECT_DELETER(AclContext, AclDestroyContext) +OBJECT_DELETER(AclQueue, AclDestroyQueue) OBJECT_DELETER(AclTensor, AclDestroyTensor) OBJECT_DELETER(AclTensorPack, AclDestroyTensorPack) @@ -384,7 +386,7 @@ public: AclContext ctx; const auto st = detail::as_enum(AclCreateContext(&ctx, detail::as_cenum(target), &options.copts)); reset(ctx); - report_status(st, "[Arm Compute Library] 
Failed to create context"); + report_status(st, "[Compute Library] Failed to create context"); if(status) { *status = st; @@ -392,6 +394,92 @@ public: } }; +/**< Available tuning modes */ +enum class TuningMode +{ + Rapid = AclRapid, + Normal = AclNormal, + Exhaustive = AclExhaustive +}; + +/** Queue class + * + * Queue is responsible for the execution related aspects, with main responsibilities those of + * scheduling and tuning operators. + * + * Multiple queues can be created from the same context, and the same operator can be scheduled on each concurrently. + * + * @note An operator might depend on the maximum possible compute units that are provided in the context, + * thus in cases where the number of the scheduling units of the queue are greater might lead to errors. + */ +class Queue : public detail::ObjectBase +{ +public: + /**< Queue options */ + struct Options + { + /** Default Constructor + * + * As default options, no tuning will be performed, and the number of scheduling units will + * depends on internal device discovery functionality + */ + Options() + : opts{ AclTuningModeNone, 0 } {}; + /** Constructor + * + * @param[in] mode Tuning mode to be used + * @param[in] compute_units Number of scheduling units to be used + */ + Options(TuningMode mode, int32_t compute_units) + : opts{ detail::as_cenum(mode), compute_units } + { + } + + AclQueueOptions opts; + }; + +public: + /** Constructor + * + * @note Serves as a simpler delegate constructor + * @note As queue options, default conservative options will be used + * + * @param[in] ctx Context to create queue for + * @param[out] status Status information if requested + */ + explicit Queue(Context &ctx, StatusCode *status = nullptr) + : Queue(ctx, Options(), status) + { + } + /** Constructor + * + * @note As queue options, default conservative options will be used + * + * @param[in] ctx Context from where the queue will be created from + * @param[in] options Queue options to be used + * @param[out] status Status information if requested + */ + explicit Queue(Context &ctx, const Options &options = Options(), StatusCode *status = nullptr) + { + AclQueue queue; + const auto st = detail::as_enum(AclCreateQueue(&queue, ctx.get(), &options.opts)); + reset(queue); + report_status(st, "[Compute Library] Failed to create queue!"); + if(status) + { + *status = st; + } + } + /** Block until all the tasks of the queue have been marked as finished + * + * @return Status code + */ + StatusCode finish() + { + return detail::as_enum(AclQueueFinish(_object.get())); + } +}; + /**< Data type enumeration */ enum class DataType { @@ -519,7 +607,7 @@ public: AclTensor tensor; const auto st = detail::as_enum(AclCreateTensor(&tensor, ctx.get(), desc.get(), allocate)); reset(tensor); - report_status(st, "[Arm Compute Library] Failed to create tensor!"); + report_status(st, "[Compute Library] Failed to create tensor!"); if(status) { *status = st; @@ -533,7 +621,7 @@ public: { void *handle = nullptr; const auto st = detail::as_enum(AclMapTensor(_object.get(), &handle)); - report_status(st, "[Arm Compute Library] Failed to map the tensor and extract the tensor's backing memory!"); + report_status(st, "[Compute Library] Failed to map the tensor and extract the tensor's backing memory!"); return handle; } /** Unmaps tensor's memory @@ -545,7 +633,7 @@ public: StatusCode unmap(void *handle) { const auto st = detail::as_enum(AclUnmapTensor(_object.get(), handle)); - report_status(st, "[Arm Compute Library] Failed to unmap the tensor!"); + report_status(st, 
"[Compute Library] Failed to unmap the tensor!"); return st; } /** Import external memory to a given tensor object @@ -558,7 +646,7 @@ public: StatusCode import(void *handle, ImportType type) { const auto st = detail::as_enum(AclTensorImport(_object.get(), handle, detail::as_cenum(type))); - report_status(st, "[Arm Compute Library] Failed to import external memory to tensor!"); + report_status(st, "[Compute Library] Failed to import external memory to tensor!"); return st; } /** Get the size of the tensor in byte @@ -571,7 +659,7 @@ public: { uint64_t size{ 0 }; const auto st = detail::as_enum(AclGetTensorSize(_object.get(), &size)); - report_status(st, "[Arm Compute Library] Failed to get the size of the tensor"); + report_status(st, "[Compute Library] Failed to get the size of the tensor"); return size; } /** Get the descriptor of this tensor @@ -582,7 +670,7 @@ public: { AclTensorDescriptor desc; const auto st = detail::as_enum(AclGetTensorDescriptor(_object.get(), &desc)); - report_status(st, "[Arm Compute Library] Failed to get the descriptor of the tensor"); + report_status(st, "[Compute Library] Failed to get the descriptor of the tensor"); return TensorDescriptor(desc); } }; @@ -623,7 +711,7 @@ public: AclTensorPack pack; const auto st = detail::as_enum(AclCreateTensorPack(&pack, ctx.get())); reset(pack); - report_status(st, "[Arm Compute Library] Failure during tensor pack creation"); + report_status(st, "[Compute Library] Failure during tensor pack creation"); if(status) { *status = st; diff --git a/arm_compute/AclEntrypoints.h b/arm_compute/AclEntrypoints.h index cd974341c2..cf4a237a44 100644 --- a/arm_compute/AclEntrypoints.h +++ b/arm_compute/AclEntrypoints.h @@ -62,6 +62,49 @@ AclStatus AclCreateContext(AclContext *ctx, */ AclStatus AclDestroyContext(AclContext ctx); +/** Create an operator queue + * + * Queue is responsible for any scheduling related activities + * + * @param[in, out] queue A valid non-zero queue object is not failures occur + * @param[in] ctx Context to be used + * @param[in] options Queue options to be used for the operators using the queue + * + * @return Status code + * + * Returns: + * - @ref AclSuccess if function was completed successfully + * - @ref AclOutOfMemory if there was a failure allocating memory resources + * - @ref AclUnsupportedTarget if the requested target is unsupported + * - @ref AclInvalidArgument if a given argument is invalid + */ +AclStatus AclCreateQueue(AclQueue *queue, AclContext ctx, const AclQueueOptions *options); + +/** Wait until all elements on the queue have been completed + * + * @param[in] queue Queue to wait on completion + * + * @return Status code + * + * Returns: + * - @ref AclSuccess if functions was completed successfully + * - @ref AclInvalidArgument if the provided queue is invalid + * - @ref AclRuntimeError on any other runtime related error + */ +AclStatus AclQueueFinish(AclQueue queue); + +/** Destroy a given queue object + * + * @param[in] queue A valid context object to destroy + * + * @return Status code + * + * Returns: + * - @ref AclSuccess if functions was completed successfully + * - @ref AclInvalidArgument if the provided context is invalid + */ +AclStatus AclDestroyQueue(AclQueue queue); + /** Create a Tensor object * * Tensor is a generalized matrix construct that can represent up to ND dimensionality (where N = 6 for Compute Library) diff --git a/arm_compute/AclOpenClExt.h b/arm_compute/AclOpenClExt.h index 15b233ca12..b9080dabf2 100644 --- a/arm_compute/AclOpenClExt.h +++ 
b/arm_compute/AclOpenClExt.h @@ -43,7 +43,6 @@ extern "C" { /** Extract the underlying OpenCL context used by a given Compute Library context object * * @note @ref AclContext should be of an OpenCL backend target - * @note @ref AclContext refcount should be 0, meaning not used by other objects * * @param[in] ctx A valid non-zero context * @param[out] opencl_context Underlying OpenCL context used * * @return Status code */ AclStatus AclGetClContext(AclContext ctx, cl_context *opencl_context); -/** Set the underlying OpenCL context used by a given Compute Library context object +/** Extract the underlying OpenCL device id used by a given Compute Library context object + * + * @note @ref AclContext should be of an OpenCL backend target + * + * @param[in] ctx A valid non-zero context + * @param[out] opencl_device Underlying OpenCL device used + * + * @return Status code + */ +AclStatus AclGetClDevice(AclContext ctx, cl_device_id *opencl_device); + +/** Set the underlying OpenCL context to be used by a given Compute Library context object * * @note @ref AclContext should be of an OpenCL backend target * * @@ -63,6 +73,30 @@ AclStatus AclGetClContext(AclContext ctx, cl_context *opencl_context); AclStatus AclSetClContext(AclContext ctx, cl_context opencl_context); +/** Extract the underlying OpenCL queue used by a given Compute Library queue object + * + * @note @ref AclQueue should be of an OpenCL backend target + * @note @ref AclQueue refcount should be 0, meaning not used by other objects + * + * @param[in] queue A valid non-zero queue + * @param[out] opencl_queue Underlying OpenCL queue used + * + * @return Status code + */ +AclStatus AclGetClQueue(AclQueue queue, cl_command_queue *opencl_queue); + +/** Set the underlying OpenCL queue to be used by a given Compute Library queue object + * + * @note @ref AclQueue should be of an OpenCL backend target + * @note opencl_queue needs to be created from the same context as the one used by the AclContext that the queue will use + * + * @param[in] queue A valid non-zero queue object + * @param[in] opencl_queue Underlying OpenCL queue to be used + * + * @return Status code + */ +AclStatus AclSetClQueue(AclQueue queue, cl_command_queue opencl_queue); + /** Extract the underlying OpenCL memory object by a given Compute Library tensor object * * @param[in] tensor A valid non-zero tensor diff --git a/arm_compute/AclTypes.h b/arm_compute/AclTypes.h index 69717ec8a8..902a508b91 100644 --- a/arm_compute/AclTypes.h +++ b/arm_compute/AclTypes.h @@ -33,6 +33,8 @@ extern "C" { /**< Opaque Context object */ typedef struct AclContext_ *AclContext; +/**< Opaque Queue object */ +typedef struct AclQueue_ *AclQueue; /**< Opaque Tensor object */ typedef struct AclTensor_ *AclTensor; /**< Opaque Tensor pack object */ @@ -138,6 +140,22 @@ typedef struct AclContextOptions AclAllocator *allocator; /**< Allocator to be used by all the memory internally */ } AclContextOptions; +/**< Supported tuning modes */ +typedef enum +{ + AclTuningModeNone = 0, /**< No tuning */ + AclRapid = 1, /**< Fast tuning mode, testing a small portion of the tuning space */ + AclNormal = 2, /**< Normal tuning mode, gives a good balance between tuning time and performance */ + AclExhaustive = 3, /**< Exhaustive tuning mode, increased tuning time but with best results */ +} AclTuningMode; + +/**< Queue options */ +typedef struct +{ + AclTuningMode mode; /**< Tuning mode */ + int32_t compute_units; /**< Compute Units that the queue will deploy */ +} AclQueueOptions; + /**< Supported data types */ typedef
enum AclDataType { diff --git a/arm_compute/core/CL/OpenCL.h b/arm_compute/core/CL/OpenCL.h index 1e6b04c042..bbe469f1a8 100644 --- a/arm_compute/core/CL/OpenCL.h +++ b/arm_compute/core/CL/OpenCL.h @@ -92,6 +92,7 @@ public: DECLARE_FUNCTION_PTR(clCreateContext); DECLARE_FUNCTION_PTR(clCreateContextFromType); DECLARE_FUNCTION_PTR(clCreateCommandQueue); + DECLARE_FUNCTION_PTR(clCreateCommandQueueWithProperties); DECLARE_FUNCTION_PTR(clGetContextInfo); DECLARE_FUNCTION_PTR(clBuildProgram); DECLARE_FUNCTION_PTR(clEnqueueNDRangeKernel); diff --git a/docs/01_library.dox b/docs/01_library.dox index 722a07f958..25535d111a 100644 --- a/docs/01_library.dox +++ b/docs/01_library.dox @@ -508,7 +508,27 @@ However this process takes quite a lot of time, which is why it cannot be enable But, when the @ref CLTuner is disabled ( Target = 1 for the graph examples), the @ref graph::Graph will try to reload the file containing the tuning parameters, then for each executed kernel the Compute Library will use the fine tuned LWS if it was present in the file or use a default LWS value if it's not. -@section S4_10_weights_manager Weights Manager +@section S4_10_cl_queue_prioritites OpenCL Queue Priorities + +OpenCL 2.1 exposes the `cl_khr_priority_hints` extension which, if supported by the underlying implementation, allows the user to specify priority hints for the created command queues. +It is important to note that this does not provide guarantees or specify the explicit scheduling behavior; this is something that each implementation needs to expose. + +In some cases, priority queues can be used when there is an implicit internal priority between graphics and compute queues and thus allow some level of priority control between them. +At the moment three priority levels can be specified: +- CL_QUEUE_PRIORITY_HIGH_KHR +- CL_QUEUE_PRIORITY_MED_KHR +- CL_QUEUE_PRIORITY_LOW_KHR + +Compute Library allows extraction of the internal OpenCL queue and also allows a user-defined queue to be injected directly into the @ref CLScheduler. +This way the user can utilize this extension to define priorities between the queues and set up the OpenCL scheduler mechanism to utilize them. + +@code{.cpp} +cl_queue_properties queue_properties[] = {CL_QUEUE_PRIORITY_KHR, CL_QUEUE_PRIORITY_HIGH_KHR, 0}; +cl_command_queue priority_queue = clCreateCommandQueueWithProperties(ctx, dev, queue_properties, &error); +CLScheduler::get().set_queue(::cl::CommandQueue(priority_queue)); +@endcode + +@section S4_11_weights_manager Weights Manager @ref IWeightsManager is a weights managing interface that can be used to reduce the memory requirements of a given pipeline by reusing transformed weights across multiple function executions. @ref IWeightsManager is responsible for managing weight tensors alongside with their transformations. diff --git a/src/c/AclQueue.cpp b/src/c/AclQueue.cpp new file mode 100644 index 0000000000..020c6ed531 --- /dev/null +++ b/src/c/AclQueue.cpp @@ -0,0 +1,99 @@ +/* + * Copyright (c) 2021 Arm Limited.
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "arm_compute/AclEntrypoints.h" + +#include "src/common/IQueue.h" +#include "src/common/utils/Macros.h" +#include "src/common/utils/Validate.h" + +namespace +{ +/** Check if queue options are valid + * + * @param[in] options Queue options + * + * @return true in case of success else false + */ +bool is_mode_valid(const AclQueueOptions *options) +{ + ARM_COMPUTE_ASSERT_NOT_NULLPTR(options); + return arm_compute::utils::is_in(options->mode, { AclTuningModeNone, AclRapid, AclNormal, AclExhaustive }); +} +} // namespace + +extern "C" AclStatus AclCreateQueue(AclQueue *external_queue, AclContext external_ctx, const AclQueueOptions *options) +{ + using namespace arm_compute; + + auto ctx = get_internal(external_ctx); + + StatusCode status = detail::validate_internal_context(ctx); + ARM_COMPUTE_RETURN_CENUM_ON_FAILURE(status); + + if(options != nullptr && !is_mode_valid(options)) + { + ARM_COMPUTE_LOG_ERROR_ACL("Queue options are invalid"); + return AclInvalidArgument; + } + + auto queue = ctx->create_queue(options); + if(queue == nullptr) + { + ARM_COMPUTE_LOG_ERROR_ACL("Couldn't allocate internal resources"); + return AclOutOfMemory; + } + + *external_queue = queue; + + return AclSuccess; +} + +extern "C" AclStatus AclQueueFinish(AclQueue external_queue) +{ + using namespace arm_compute; + + auto queue = get_internal(external_queue); + + StatusCode status = detail::validate_internal_queue(queue); + ARM_COMPUTE_RETURN_CENUM_ON_FAILURE(status); + + status = queue->finish(); + ARM_COMPUTE_RETURN_CENUM_ON_FAILURE(status); + + return AclSuccess; +} + +extern "C" AclStatus AclDestroyQueue(AclQueue external_queue) +{ + using namespace arm_compute; + + auto queue = get_internal(external_queue); + + StatusCode status = detail::validate_internal_queue(queue); + ARM_COMPUTE_RETURN_CENUM_ON_FAILURE(status); + + delete queue; + + return AclSuccess; +} diff --git a/src/c/cl/AclOpenClExt.cpp b/src/c/cl/AclOpenClExt.cpp index ce6d2969de..e72babcae8 100644 --- a/src/c/cl/AclOpenClExt.cpp +++ b/src/c/cl/AclOpenClExt.cpp @@ -26,6 +26,7 @@ #include "src/common/ITensorV2.h" #include "src/common/Types.h" #include "src/gpu/cl/ClContext.h" +#include "src/gpu/cl/ClQueue.h" #include "arm_compute/core/CL/ICLTensor.h" @@ -85,6 +86,80 @@ extern "C" AclStatus AclSetClContext(AclContext external_ctx, cl_context opencl_ return AclStatus::AclSuccess; } +extern "C" AclStatus AclGetClDevice(AclContext 
external_ctx, cl_device_id *opencl_device) +{ + using namespace arm_compute; + IContext *ctx = get_internal(external_ctx); + + if(detail::validate_internal_context(ctx) != StatusCode::Success) + { + return AclStatus::AclInvalidArgument; + } + + if(ctx->type() != Target::GpuOcl) + { + return AclStatus::AclInvalidTarget; + } + + if(opencl_device == nullptr) + { + return AclStatus::AclInvalidArgument; + } + + *opencl_device = utils::cast::polymorphic_downcast(ctx)->cl_dev().get(); + + return AclStatus::AclSuccess; +} + +extern "C" AclStatus AclGetClQueue(AclQueue external_queue, cl_command_queue *opencl_queue) +{ + using namespace arm_compute; + IQueue *queue = get_internal(external_queue); + + if(detail::validate_internal_queue(queue) != StatusCode::Success) + { + return AclStatus::AclInvalidArgument; + } + + if(queue->header.ctx->type() != Target::GpuOcl) + { + return AclStatus::AclInvalidTarget; + } + + if(opencl_queue == nullptr) + { + return AclStatus::AclInvalidArgument; + } + + *opencl_queue = utils::cast::polymorphic_downcast(queue)->cl_queue().get(); + + return AclStatus::AclSuccess; +} + +extern "C" AclStatus AclSetClQueue(AclQueue external_queue, cl_command_queue opencl_queue) +{ + using namespace arm_compute; + IQueue *queue = get_internal(external_queue); + + if(detail::validate_internal_queue(queue) != StatusCode::Success) + { + return AclStatus::AclInvalidArgument; + } + + if(queue->header.ctx->type() != Target::GpuOcl) + { + return AclStatus::AclInvalidTarget; + } + + auto cl_queue = utils::cast::polymorphic_downcast(queue); + if(!cl_queue->set_cl_queue(::cl::CommandQueue(opencl_queue))) + { + return AclStatus::AclRuntimeError; + } + + return AclStatus::AclSuccess; +} + extern "C" AclStatus AclGetClMem(AclTensor external_tensor, cl_mem *opencl_mem) { using namespace arm_compute; diff --git a/src/common/IContext.h b/src/common/IContext.h index ee234795cf..31f39da06d 100644 --- a/src/common/IContext.h +++ b/src/common/IContext.h @@ -43,6 +43,7 @@ namespace arm_compute { // Forward declarations class ITensorV2; +class IQueue; /**< Context interface */ class IContext : public AclContext_ @@ -52,11 +53,13 @@ public: : AclContext_(), _target(target), _refcount(0) { } + /** Virtual Destructor */ virtual ~IContext() { header.type = detail::ObjectType::Invalid; }; + /** Target type accessor * * @return Target that the context is associated with @@ -65,16 +68,19 @@ public: { return _target; } + /** Increment context refcount */ void inc_ref() const { ++_refcount; } + /** Decrement context refcount */ void dec_ref() const { --_refcount; } + /** Reference counter accessor * * @return The number of references pointing to this object @@ -83,6 +89,7 @@ public: { return _refcount; } + /** Checks if an object is valid * * @return True if sucessful otherwise false @@ -91,6 +98,7 @@ public: { return header.type == detail::ObjectType::Context; } + /** Create a tensor object * * @param[in] desc Descriptor to use @@ -100,6 +108,14 @@ public: */ virtual ITensorV2 *create_tensor(const AclTensorDescriptor &desc, bool allocate) = 0; + /** Create a queue object + * + * @param[in] options Queue options to be used + * + * @return A pointer to the created queue object + */ + virtual IQueue *create_queue(const AclQueueOptions *options) = 0; + private: Target _target; /**< Target type of context */ mutable std::atomic _refcount; /**< Reference counter */ diff --git a/src/common/IQueue.h b/src/common/IQueue.h new file mode 100644 index 0000000000..6a0cbc75da --- /dev/null +++ b/src/common/IQueue.h @@ -0,0 
+1,100 @@ +/* + * Copyright (c) 2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef SRC_COMMON_IQUEUE_H_ +#define SRC_COMMON_IQUEUE_H_ + +#include "src/common/IContext.h" + +struct AclQueue_ +{ + arm_compute::detail::Header header{ arm_compute::detail::ObjectType::Queue, nullptr }; + +protected: + AclQueue_() = default; + ~AclQueue_() = default; +}; + +namespace arm_compute +{ +/** Base class specifying the queue interface */ +class IQueue : public AclQueue_ +{ +public: + /** Explict Operator Constructor + * + * @param[in] ctx Context to be used by the operator + */ + explicit IQueue(IContext *ctx) + { + this->header.ctx = ctx; + this->header.ctx->inc_ref(); + } + /** Destructor */ + virtual ~IQueue() + { + this->header.ctx->dec_ref(); + this->header.type = detail::ObjectType::Invalid; + }; + /** Checks if a queue is valid + * + * @return True if successful otherwise false + */ + bool is_valid() const + { + return this->header.type == detail::ObjectType::Queue; + }; + virtual StatusCode finish() = 0; +}; + +/** Extract internal representation of a Queue + * + * @param[in] queue Opaque queue pointer + * + * @return The internal representation as an IQueue + */ +inline IQueue *get_internal(AclQueue queue) +{ + return static_cast(queue); +} + +namespace detail +{ +/** Check if an internal queue is valid + * + * @param[in] queue Internal queue to check + * + * @return A status code + */ +inline StatusCode validate_internal_queue(const IQueue *queue) +{ + if(queue == nullptr || !queue->is_valid()) + { + ARM_COMPUTE_LOG_ERROR_ACL("[IQueue]: Invalid queue object"); + return StatusCode::InvalidArgument; + } + return StatusCode::Success; +} +} // namespace detail +} // namespace arm_compute +#endif /* SRC_COMMON_IQUEUE_H_ */ diff --git a/src/core/CL/OpenCL.cpp b/src/core/CL/OpenCL.cpp index a7be534397..d8c2736ef7 100644 --- a/src/core/CL/OpenCL.cpp +++ b/src/core/CL/OpenCL.cpp @@ -91,6 +91,7 @@ bool CLSymbols::load(const std::string &library) LOAD_FUNCTION_PTR(clCreateContext, handle); LOAD_FUNCTION_PTR(clCreateContextFromType, handle); LOAD_FUNCTION_PTR(clCreateCommandQueue, handle); + LOAD_FUNCTION_PTR(clCreateCommandQueueWithProperties, handle); LOAD_FUNCTION_PTR(clGetContextInfo, handle); LOAD_FUNCTION_PTR(clBuildProgram, handle); LOAD_FUNCTION_PTR(clEnqueueNDRangeKernel, handle); @@ -293,6 +294,23 @@ cl_command_queue clCreateCommandQueue(cl_context context, } } +cl_command_queue 
clCreateCommandQueueWithProperties(cl_context context, + cl_device_id device, + const cl_queue_properties *properties, + cl_int *errcode_ret) +{ + arm_compute::CLSymbols::get().load_default(); + auto func = arm_compute::CLSymbols::get().clCreateCommandQueueWithProperties_ptr; + if(func != nullptr) + { + return func(context, device, properties, errcode_ret); + } + else + { + return nullptr; + } +} + cl_context clCreateContext( const cl_context_properties *properties, cl_uint num_devices, diff --git a/src/core/cpu/kernels/CpuDirectConvolutionOutputStageKernel.cpp b/src/core/cpu/kernels/CpuDirectConvolutionOutputStageKernel.cpp new file mode 100644 index 0000000000..5f7a574e5a --- /dev/null +++ b/src/core/cpu/kernels/CpuDirectConvolutionOutputStageKernel.cpp @@ -0,0 +1,513 @@ +/* + * Copyright (c) 2017-2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#include "src/core/cpu/kernels/CpuDirectConvolutionOutputStageKernel.h" + +#include "arm_compute/core/Error.h" +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/ITensor.h" +#include "arm_compute/core/Types.h" +#include "arm_compute/core/Validate.h" +#include "arm_compute/core/Window.h" +#include "arm_compute/core/utils/misc/Traits.h" +#include "src/core/CPP/Validate.h" +#include "src/core/NEON/NEAsymm.h" +#include "src/core/NEON/NEFixedPoint.h" +#include "src/core/NEON/wrapper/wrapper.h" +#include "src/core/helpers/AutoConfiguration.h" +#include "src/core/helpers/WindowHelpers.h" + +#include +#include +#include + +namespace arm_compute +{ +namespace cpu +{ +namespace kernels +{ +namespace +{ +Status validate_arguments(const ITensorInfo *src, const ITensorInfo *bias, const ITensorInfo *dst, + const DirectConvolutionLayerOutputStageKernelInfo &info) +{ + ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src); + ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(src); + ARM_COMPUTE_RETURN_ERROR_ON(src->data_layout() == DataLayout::UNKNOWN); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::F16, DataType::S32, DataType::F32); + + if(bias != nullptr) + { + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, bias); + ARM_COMPUTE_RETURN_ERROR_ON(bias->dimension(0) != src->dimension(get_data_layout_dimension_index(src->data_layout(), DataLayoutDimension::CHANNEL))); + ARM_COMPUTE_RETURN_ERROR_ON(bias->num_dimensions() > 1); + } + + if(src->data_type() == DataType::S32) + { + ARM_COMPUTE_RETURN_ERROR_ON_MSG(dst == nullptr, "In-place computation not allowed for quantized output"); + } + + // Checks performed when output is configured + if((dst != nullptr) && (dst->total_size() != 0)) + { + if(is_data_type_float(src->data_type())) + { + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst); + } + else + { + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(dst, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED); + } + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(src, dst); + } + else if(src->data_type() == DataType::S32) + { + // In case of quantized computation and unconfigured output, the output data type must be provided through DirectConvolutionLayerOutputStageKernelInfo + ARM_COMPUTE_RETURN_ERROR_ON((info.output_data_type != DataType::QASYMM8) && (info.output_data_type != DataType::QASYMM8_SIGNED)); + } + + return Status{}; +} + +template +typename std::enable_if::value, void>::type +output_stage_nchw(ITensor *src, const ITensor *bias, const Window &window, ITensor *dst, + int result_fixedpoint_multiplier, int result_shift, int result_offset_after_shift) +{ + const bool has_bias = bias != nullptr; + /** SIMD vector tag type. 
*/ + using ExactTagType = typename wrapper::traits::neon_bitvector_tag_t; + + ARM_COMPUTE_ERROR_ON(src->info()->data_layout() == DataLayout::UNKNOWN); + ARM_COMPUTE_UNUSED(result_fixedpoint_multiplier); + ARM_COMPUTE_UNUSED(result_shift); + ARM_COMPUTE_UNUSED(result_offset_after_shift); + + const int window_start_x = window.x().start(); + const int window_end_x = window.x().end(); + const int window_step_x = 16 / src->info()->element_size(); + Window win = window; + win.set(Window::DimX, Window::Dimension(0, 1, 1)); + + Iterator in(src, win); + Iterator out(dst, win); + execute_window_loop(win, [&](const Coordinates & id) + { + int x = window_start_x; + for(; x <= (window_end_x - window_step_x); x += window_step_x) + { + // Get bias and pointer to input + const auto in_ptr = reinterpret_cast(in.ptr()) + x; + auto v_in = wrapper::vloadq(in_ptr); + + // Accumulate bias + if(has_bias) + { + const auto vb = wrapper::vdup_n(*reinterpret_cast(bias->ptr_to_element(Coordinates(id.z()))), ExactTagType{}); + v_in = wrapper::vadd(v_in, vb); + } + + const auto out_ptr = reinterpret_cast(out.ptr()) + x; + wrapper::vstore(out_ptr, v_in); + } + + // Left-overs loop + for(; x < window_end_x; ++x) + { + // Get bias and pointer to input + auto s_in = *(reinterpret_cast(in.ptr()) + x); + + // Accumulate bias + if(has_bias) + { + const auto b = *reinterpret_cast(bias->ptr_to_element(Coordinates(id.z()))); + s_in += b; + } + + *(reinterpret_cast(out.ptr()) + x) = s_in; + } + + }, + in, out); +} + +template +typename std::enable_if::value, void>::type +output_stage_nhwc(ITensor *src, const ITensor *bias, const Window &window, ITensor *dst, + int result_fixedpoint_multiplier, int result_shift, int result_offset_after_shift) +{ + const bool has_bias = bias != nullptr; + ARM_COMPUTE_UNUSED(result_fixedpoint_multiplier); + ARM_COMPUTE_UNUSED(result_shift); + ARM_COMPUTE_UNUSED(result_offset_after_shift); + + Window window_bias = window; + window_bias.set(Window::DimX, Window::Dimension(0, 1, 1)); + window_bias.set(Window::DimY, Window::Dimension(0, 0, 0)); + window_bias.set(Window::DimZ, Window::Dimension(0, 0, 0)); + window_bias.set(3, Window::Dimension(0, 0, 0)); + + const int window_start_x = window.x().start(); + const int window_end_x = window.x().end(); + const int window_step_x = 16 / src->info()->element_size(); + Window win = window; + win.set(Window::DimX, Window::Dimension(0, 1, 1)); + + Iterator in(src, win); + Iterator bi(bias, window_bias); + Iterator out(dst, win); + + execute_window_loop(win, [&](const Coordinates &) + { + int x = window_start_x; + for(; x <= (window_end_x - window_step_x); x += window_step_x) + { + // Get bias and pointer to input + const auto in_ptr = reinterpret_cast(in.ptr()); + auto v_in = wrapper::vloadq(in_ptr + x); + + // Accumulate bias + if(has_bias) + { + const auto bias_ptr = reinterpret_cast(bi.ptr()) + x; + v_in = wrapper::vadd(v_in, wrapper::vloadq(bias_ptr)); + } + + const auto out_ptr = reinterpret_cast(out.ptr()); + wrapper::vstore(out_ptr + x, v_in); + } + + // Left-overs loop + for(; x < window_end_x; ++x) + { + // Get bias and pointer to input + auto s_in = *(reinterpret_cast(in.ptr()) + x); + + // Accumulate bias + if(has_bias) + { + const auto bias_ptr = reinterpret_cast(bi.ptr()) + x; + s_in += *bias_ptr; + } + + const auto out_ptr = reinterpret_cast(out.ptr()); + *(out_ptr + x) = s_in; + } + }, + in, bi, out); +} + +// Quantized case +template < typename TOut, typename std::enable_if < std::is_same::value || std::is_same::value, int >::type = 0 > +void 
output_stage_nchw(ITensor *src, const ITensor *bias, const Window &window, ITensor *dst, + int result_fixedpoint_multiplier, int result_shift, int result_offset_after_shift) +{ + const bool has_bias = bias != nullptr; + using VectorType = typename wrapper::traits::neon_bitvector_t; + using TagType = typename wrapper::traits::neon_bitvector_tag_t; + + const int32x4_t result_offset_after_shift_s32 = vdupq_n_s32(result_offset_after_shift); + + const VectorType min = wrapper::vdup_n(std::numeric_limits::lowest(), TagType{}); + const VectorType max = wrapper::vdup_n(std::numeric_limits::max(), TagType{}); + + const int window_start_x = window.x().start(); + const int window_end_x = window.x().end(); + const int window_step_x = 16 / src->info()->element_size(); + Window win = window; + win.set(Window::DimX, Window::Dimension(0, 1, 1)); + + Iterator in(src, win); + Iterator out(dst, win); + + execute_window_loop(win, [&](const Coordinates & id) + { + + int x = window_start_x; + for(; x <= (window_end_x - window_step_x); x += window_step_x) + { + // Get bias and pointer to input + const auto in_ptr = reinterpret_cast(in.ptr()) + x; + int32x4x4_t v_in = + { + { + wrapper::vloadq(in_ptr), + wrapper::vloadq(in_ptr + 4), + wrapper::vloadq(in_ptr + 8), + wrapper::vloadq(in_ptr + 12) + } + }; + + // Accumulate bias + if(has_bias) + { + const auto vb = wrapper::vdup_n(*reinterpret_cast(bias->ptr_to_element(Coordinates(id.z()))), TagType{}); + v_in = + { + { + wrapper::vadd(v_in.val[0], vb), + wrapper::vadd(v_in.val[1], vb), + wrapper::vadd(v_in.val[2], vb), + wrapper::vadd(v_in.val[3], vb) + } + }; + } + + const auto out_ptr = reinterpret_cast(out.ptr()) + x; + wrapper::vstore(out_ptr, finalize_quantization(v_in, result_fixedpoint_multiplier, result_shift, result_offset_after_shift_s32, + min, max, false)); + } + + // Left-overs loop + for(; x < window_end_x; ++x) + { + // Get bias and pointer to input + int32_t s_in = *(reinterpret_cast(in.ptr()) + x); + + // Accumulate bias + if(has_bias) + { + const auto b = *reinterpret_cast(bias->ptr_to_element(Coordinates(id.z()))); + s_in += b; + } + + const auto out_ptr = reinterpret_cast(out.ptr()) + x; + *out_ptr = finalize_quantization(s_in, result_fixedpoint_multiplier, result_shift, result_offset_after_shift, + std::numeric_limits::lowest(), std::numeric_limits::max(), false); + } + }, + in, out); +} +template < typename TOut, typename std::enable_if < std::is_same::value || std::is_same::value, int >::type = 0 > +void output_stage_nhwc(ITensor *src, const ITensor *bias, const Window &window, ITensor *dst, + int result_fixedpoint_multiplier, int result_shift, int result_offset_after_shift) +{ + const bool has_bias = bias != nullptr; + using VectorType = typename wrapper::traits::neon_bitvector_t; + using TagType = typename wrapper::traits::neon_bitvector_tag_t; + + const int32x4_t result_offset_after_shift_s32 = vdupq_n_s32(result_offset_after_shift); + + const VectorType min = wrapper::vdup_n(std::numeric_limits::lowest(), TagType{}); + const VectorType max = wrapper::vdup_n(std::numeric_limits::max(), TagType{}); + + Window window_bias = window; + window_bias.set(Window::DimX, Window::Dimension(0, 1, 1)); + window_bias.set(Window::DimY, Window::Dimension(0, 0, 0)); + window_bias.set(Window::DimZ, Window::Dimension(0, 0, 0)); + window_bias.set(3, Window::Dimension(0, 0, 0)); + + const int window_start_x = window.x().start(); + const int window_end_x = window.x().end(); + const int window_step_x = 16 / src->info()->element_size(); + Window win = window; + 
win.set(Window::DimX, Window::Dimension(0, 1, 1)); + + Iterator in(src, win); + Iterator bi(bias, window_bias); + Iterator out(dst, win); + + execute_window_loop(win, [&](const Coordinates &) + { + int x = window_start_x; + for(; x <= (window_end_x - window_step_x); x += window_step_x) + { + // Get bias and pointer to input + const auto in_ptr = reinterpret_cast(in.ptr()) + x; + int32x4x4_t v_in = + { + { + wrapper::vloadq(in_ptr), + wrapper::vloadq(in_ptr + 4), + wrapper::vloadq(in_ptr + 8), + wrapper::vloadq(in_ptr + 12), + } + }; + + // Accumulate bias + if(has_bias) + { + const auto bias_ptr = reinterpret_cast(bi.ptr()) + x; + + wrapper::vadd(v_in.val[0], wrapper::vloadq(bias_ptr)); + wrapper::vadd(v_in.val[1], wrapper::vloadq(bias_ptr + 4)); + wrapper::vadd(v_in.val[2], wrapper::vloadq(bias_ptr + 8)); + wrapper::vadd(v_in.val[3], wrapper::vloadq(bias_ptr + 12)); + } + + const auto out_ptr = reinterpret_cast(out.ptr()) + x; + wrapper::vstore(out_ptr, finalize_quantization(v_in, result_fixedpoint_multiplier, result_shift, result_offset_after_shift_s32, min, max, false)); + } + + // Left-overs loop + for(; x < window_end_x; ++x) + { + // Get bias and pointer to input + const auto in_ptr = reinterpret_cast(in.ptr()) + x; + int32_t s_in = *in_ptr; + + // Accumulate bias + if(has_bias) + { + const auto bias_ptr = reinterpret_cast(bi.ptr()) + x; + s_in += *bias_ptr; + } + + const auto out_ptr = reinterpret_cast(out.ptr()) + x; + *out_ptr = finalize_quantization(s_in, result_fixedpoint_multiplier, result_shift, result_offset_after_shift, + std::numeric_limits::lowest(), std::numeric_limits::max(), false); + } + }, + in, bi, out); +} +} // namespace + +void CpuDirectConvolutionOutputStageKernel::configure(ITensorInfo *src, const ITensorInfo *bias, ITensorInfo *dst, + const DirectConvolutionLayerOutputStageKernelInfo &info) +{ + ARM_COMPUTE_UNUSED(bias); + // Perform validation step + ARM_COMPUTE_ERROR_ON_NULLPTR(src); + ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, bias, dst, info)); + + _func = nullptr; + _result_fixedpoint_multiplier = info.result_fixedpoint_multiplier; + _result_shift = info.result_shift; + _result_offset_after_shift = info.result_offset_after_shift; + + // Auto-initialize output output if required + if(dst != nullptr) + { + // Work out expected output data type + const DataType output_dt = (src->data_type() == DataType::S32) ? info.output_data_type : DataType::S32; + // Output tensor auto initialization if not yet initialized + auto_init_if_empty(*dst, src->clone()->set_data_type(output_dt)); + } + + Window win = calculate_max_window(*src, Steps()); + + ICpuKernel::configure(win); + + const bool is_qasymm8_signed = (dst != nullptr) ? 
is_data_type_quantized_asymmetric_signed(dst->data_type()) : false; + + // Set appropriate function + if(src->data_layout() == DataLayout::NCHW) + { + switch(src->data_type()) + { + case DataType::S32: + { + if(is_qasymm8_signed) + { + _func = &output_stage_nchw; + } + else + { + _func = &output_stage_nchw; + } + break; + } +#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC + case DataType::F16: + { + _func = &output_stage_nchw; + break; + } +#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */ + case DataType::F32: + { + _func = &output_stage_nchw; + break; + } + default: + { + ARM_COMPUTE_ERROR("Unsupported combination of types among the inputs."); + } + } + } + else + { + switch(src->data_type()) + { + case DataType::S32: + { + if(is_qasymm8_signed) + { + _func = &output_stage_nhwc; + } + else + { + _func = &output_stage_nhwc; + } + break; + } +#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC + case DataType::F16: + { + _func = &output_stage_nhwc; + break; + } +#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */ + case DataType::F32: + { + _func = &output_stage_nhwc; + break; + } + default: + { + ARM_COMPUTE_ERROR("Unsupported combination of types among the inputs."); + } + } + } +} + +Status CpuDirectConvolutionOutputStageKernel::validate(const ITensorInfo *src, const ITensorInfo *bias, const ITensorInfo *dst, + const DirectConvolutionLayerOutputStageKernelInfo &info) +{ + ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, bias, dst, info)); + return Status{}; +} + +void CpuDirectConvolutionOutputStageKernel::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) +{ + ARM_COMPUTE_UNUSED(info); + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICpuKernel::window(), window); + ARM_COMPUTE_ERROR_ON(_func == nullptr); + + auto src = tensors.get_tensor(TensorType::ACL_SRC_0); + auto bias = tensors.get_const_tensor(TensorType::ACL_SRC_1); + auto dst = tensors.get_tensor(TensorType::ACL_DST); + + (*_func)(src, bias, window, dst, _result_fixedpoint_multiplier, _result_shift, _result_offset_after_shift); +} + +const char *CpuDirectConvolutionOutputStageKernel::name() const +{ + return "CpuDirectConvolutionOutputStageKernel"; +} +} // namespace kernels +} // namespace cpu +} // namespace arm_compute diff --git a/src/core/cpu/kernels/CpuDirectConvolutionStageKernel.cpp b/src/core/cpu/kernels/CpuDirectConvolutionStageKernel.cpp deleted file mode 100644 index 5f7a574e5a..0000000000 --- a/src/core/cpu/kernels/CpuDirectConvolutionStageKernel.cpp +++ /dev/null @@ -1,513 +0,0 @@ -/* - * Copyright (c) 2017-2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "src/core/cpu/kernels/CpuDirectConvolutionOutputStageKernel.h" - -#include "arm_compute/core/Error.h" -#include "arm_compute/core/Helpers.h" -#include "arm_compute/core/ITensor.h" -#include "arm_compute/core/Types.h" -#include "arm_compute/core/Validate.h" -#include "arm_compute/core/Window.h" -#include "arm_compute/core/utils/misc/Traits.h" -#include "src/core/CPP/Validate.h" -#include "src/core/NEON/NEAsymm.h" -#include "src/core/NEON/NEFixedPoint.h" -#include "src/core/NEON/wrapper/wrapper.h" -#include "src/core/helpers/AutoConfiguration.h" -#include "src/core/helpers/WindowHelpers.h" - -#include -#include -#include - -namespace arm_compute -{ -namespace cpu -{ -namespace kernels -{ -namespace -{ -Status validate_arguments(const ITensorInfo *src, const ITensorInfo *bias, const ITensorInfo *dst, - const DirectConvolutionLayerOutputStageKernelInfo &info) -{ - ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src); - ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(src); - ARM_COMPUTE_RETURN_ERROR_ON(src->data_layout() == DataLayout::UNKNOWN); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::F16, DataType::S32, DataType::F32); - - if(bias != nullptr) - { - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, bias); - ARM_COMPUTE_RETURN_ERROR_ON(bias->dimension(0) != src->dimension(get_data_layout_dimension_index(src->data_layout(), DataLayoutDimension::CHANNEL))); - ARM_COMPUTE_RETURN_ERROR_ON(bias->num_dimensions() > 1); - } - - if(src->data_type() == DataType::S32) - { - ARM_COMPUTE_RETURN_ERROR_ON_MSG(dst == nullptr, "In-place computation not allowed for quantized output"); - } - - // Checks performed when output is configured - if((dst != nullptr) && (dst->total_size() != 0)) - { - if(is_data_type_float(src->data_type())) - { - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst); - } - else - { - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(dst, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED); - } - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(src, dst); - } - else if(src->data_type() == DataType::S32) - { - // In case of quantized computation and unconfigured output, the output data type must be provided through DirectConvolutionLayerOutputStageKernelInfo - ARM_COMPUTE_RETURN_ERROR_ON((info.output_data_type != DataType::QASYMM8) && (info.output_data_type != DataType::QASYMM8_SIGNED)); - } - - return Status{}; -} - -template -typename std::enable_if::value, void>::type -output_stage_nchw(ITensor *src, const ITensor *bias, const Window &window, ITensor *dst, - int result_fixedpoint_multiplier, int result_shift, int result_offset_after_shift) -{ - const bool has_bias = bias != nullptr; - /** SIMD vector tag type. 
*/ - using ExactTagType = typename wrapper::traits::neon_bitvector_tag_t; - - ARM_COMPUTE_ERROR_ON(src->info()->data_layout() == DataLayout::UNKNOWN); - ARM_COMPUTE_UNUSED(result_fixedpoint_multiplier); - ARM_COMPUTE_UNUSED(result_shift); - ARM_COMPUTE_UNUSED(result_offset_after_shift); - - const int window_start_x = window.x().start(); - const int window_end_x = window.x().end(); - const int window_step_x = 16 / src->info()->element_size(); - Window win = window; - win.set(Window::DimX, Window::Dimension(0, 1, 1)); - - Iterator in(src, win); - Iterator out(dst, win); - execute_window_loop(win, [&](const Coordinates & id) - { - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) - { - // Get bias and pointer to input - const auto in_ptr = reinterpret_cast(in.ptr()) + x; - auto v_in = wrapper::vloadq(in_ptr); - - // Accumulate bias - if(has_bias) - { - const auto vb = wrapper::vdup_n(*reinterpret_cast(bias->ptr_to_element(Coordinates(id.z()))), ExactTagType{}); - v_in = wrapper::vadd(v_in, vb); - } - - const auto out_ptr = reinterpret_cast(out.ptr()) + x; - wrapper::vstore(out_ptr, v_in); - } - - // Left-overs loop - for(; x < window_end_x; ++x) - { - // Get bias and pointer to input - auto s_in = *(reinterpret_cast(in.ptr()) + x); - - // Accumulate bias - if(has_bias) - { - const auto b = *reinterpret_cast(bias->ptr_to_element(Coordinates(id.z()))); - s_in += b; - } - - *(reinterpret_cast(out.ptr()) + x) = s_in; - } - - }, - in, out); -} - -template -typename std::enable_if::value, void>::type -output_stage_nhwc(ITensor *src, const ITensor *bias, const Window &window, ITensor *dst, - int result_fixedpoint_multiplier, int result_shift, int result_offset_after_shift) -{ - const bool has_bias = bias != nullptr; - ARM_COMPUTE_UNUSED(result_fixedpoint_multiplier); - ARM_COMPUTE_UNUSED(result_shift); - ARM_COMPUTE_UNUSED(result_offset_after_shift); - - Window window_bias = window; - window_bias.set(Window::DimX, Window::Dimension(0, 1, 1)); - window_bias.set(Window::DimY, Window::Dimension(0, 0, 0)); - window_bias.set(Window::DimZ, Window::Dimension(0, 0, 0)); - window_bias.set(3, Window::Dimension(0, 0, 0)); - - const int window_start_x = window.x().start(); - const int window_end_x = window.x().end(); - const int window_step_x = 16 / src->info()->element_size(); - Window win = window; - win.set(Window::DimX, Window::Dimension(0, 1, 1)); - - Iterator in(src, win); - Iterator bi(bias, window_bias); - Iterator out(dst, win); - - execute_window_loop(win, [&](const Coordinates &) - { - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) - { - // Get bias and pointer to input - const auto in_ptr = reinterpret_cast(in.ptr()); - auto v_in = wrapper::vloadq(in_ptr + x); - - // Accumulate bias - if(has_bias) - { - const auto bias_ptr = reinterpret_cast(bi.ptr()) + x; - v_in = wrapper::vadd(v_in, wrapper::vloadq(bias_ptr)); - } - - const auto out_ptr = reinterpret_cast(out.ptr()); - wrapper::vstore(out_ptr + x, v_in); - } - - // Left-overs loop - for(; x < window_end_x; ++x) - { - // Get bias and pointer to input - auto s_in = *(reinterpret_cast(in.ptr()) + x); - - // Accumulate bias - if(has_bias) - { - const auto bias_ptr = reinterpret_cast(bi.ptr()) + x; - s_in += *bias_ptr; - } - - const auto out_ptr = reinterpret_cast(out.ptr()); - *(out_ptr + x) = s_in; - } - }, - in, bi, out); -} - -// Quantized case -template < typename TOut, typename std::enable_if < std::is_same::value || std::is_same::value, int >::type = 0 > -void 
output_stage_nchw(ITensor *src, const ITensor *bias, const Window &window, ITensor *dst, - int result_fixedpoint_multiplier, int result_shift, int result_offset_after_shift) -{ - const bool has_bias = bias != nullptr; - using VectorType = typename wrapper::traits::neon_bitvector_t; - using TagType = typename wrapper::traits::neon_bitvector_tag_t; - - const int32x4_t result_offset_after_shift_s32 = vdupq_n_s32(result_offset_after_shift); - - const VectorType min = wrapper::vdup_n(std::numeric_limits::lowest(), TagType{}); - const VectorType max = wrapper::vdup_n(std::numeric_limits::max(), TagType{}); - - const int window_start_x = window.x().start(); - const int window_end_x = window.x().end(); - const int window_step_x = 16 / src->info()->element_size(); - Window win = window; - win.set(Window::DimX, Window::Dimension(0, 1, 1)); - - Iterator in(src, win); - Iterator out(dst, win); - - execute_window_loop(win, [&](const Coordinates & id) - { - - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) - { - // Get bias and pointer to input - const auto in_ptr = reinterpret_cast(in.ptr()) + x; - int32x4x4_t v_in = - { - { - wrapper::vloadq(in_ptr), - wrapper::vloadq(in_ptr + 4), - wrapper::vloadq(in_ptr + 8), - wrapper::vloadq(in_ptr + 12) - } - }; - - // Accumulate bias - if(has_bias) - { - const auto vb = wrapper::vdup_n(*reinterpret_cast(bias->ptr_to_element(Coordinates(id.z()))), TagType{}); - v_in = - { - { - wrapper::vadd(v_in.val[0], vb), - wrapper::vadd(v_in.val[1], vb), - wrapper::vadd(v_in.val[2], vb), - wrapper::vadd(v_in.val[3], vb) - } - }; - } - - const auto out_ptr = reinterpret_cast(out.ptr()) + x; - wrapper::vstore(out_ptr, finalize_quantization(v_in, result_fixedpoint_multiplier, result_shift, result_offset_after_shift_s32, - min, max, false)); - } - - // Left-overs loop - for(; x < window_end_x; ++x) - { - // Get bias and pointer to input - int32_t s_in = *(reinterpret_cast(in.ptr()) + x); - - // Accumulate bias - if(has_bias) - { - const auto b = *reinterpret_cast(bias->ptr_to_element(Coordinates(id.z()))); - s_in += b; - } - - const auto out_ptr = reinterpret_cast(out.ptr()) + x; - *out_ptr = finalize_quantization(s_in, result_fixedpoint_multiplier, result_shift, result_offset_after_shift, - std::numeric_limits::lowest(), std::numeric_limits::max(), false); - } - }, - in, out); -} -template < typename TOut, typename std::enable_if < std::is_same::value || std::is_same::value, int >::type = 0 > -void output_stage_nhwc(ITensor *src, const ITensor *bias, const Window &window, ITensor *dst, - int result_fixedpoint_multiplier, int result_shift, int result_offset_after_shift) -{ - const bool has_bias = bias != nullptr; - using VectorType = typename wrapper::traits::neon_bitvector_t; - using TagType = typename wrapper::traits::neon_bitvector_tag_t; - - const int32x4_t result_offset_after_shift_s32 = vdupq_n_s32(result_offset_after_shift); - - const VectorType min = wrapper::vdup_n(std::numeric_limits::lowest(), TagType{}); - const VectorType max = wrapper::vdup_n(std::numeric_limits::max(), TagType{}); - - Window window_bias = window; - window_bias.set(Window::DimX, Window::Dimension(0, 1, 1)); - window_bias.set(Window::DimY, Window::Dimension(0, 0, 0)); - window_bias.set(Window::DimZ, Window::Dimension(0, 0, 0)); - window_bias.set(3, Window::Dimension(0, 0, 0)); - - const int window_start_x = window.x().start(); - const int window_end_x = window.x().end(); - const int window_step_x = 16 / src->info()->element_size(); - Window win = window; - 
win.set(Window::DimX, Window::Dimension(0, 1, 1)); - - Iterator in(src, win); - Iterator bi(bias, window_bias); - Iterator out(dst, win); - - execute_window_loop(win, [&](const Coordinates &) - { - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) - { - // Get bias and pointer to input - const auto in_ptr = reinterpret_cast(in.ptr()) + x; - int32x4x4_t v_in = - { - { - wrapper::vloadq(in_ptr), - wrapper::vloadq(in_ptr + 4), - wrapper::vloadq(in_ptr + 8), - wrapper::vloadq(in_ptr + 12), - } - }; - - // Accumulate bias - if(has_bias) - { - const auto bias_ptr = reinterpret_cast(bi.ptr()) + x; - - wrapper::vadd(v_in.val[0], wrapper::vloadq(bias_ptr)); - wrapper::vadd(v_in.val[1], wrapper::vloadq(bias_ptr + 4)); - wrapper::vadd(v_in.val[2], wrapper::vloadq(bias_ptr + 8)); - wrapper::vadd(v_in.val[3], wrapper::vloadq(bias_ptr + 12)); - } - - const auto out_ptr = reinterpret_cast(out.ptr()) + x; - wrapper::vstore(out_ptr, finalize_quantization(v_in, result_fixedpoint_multiplier, result_shift, result_offset_after_shift_s32, min, max, false)); - } - - // Left-overs loop - for(; x < window_end_x; ++x) - { - // Get bias and pointer to input - const auto in_ptr = reinterpret_cast(in.ptr()) + x; - int32_t s_in = *in_ptr; - - // Accumulate bias - if(has_bias) - { - const auto bias_ptr = reinterpret_cast(bi.ptr()) + x; - s_in += *bias_ptr; - } - - const auto out_ptr = reinterpret_cast(out.ptr()) + x; - *out_ptr = finalize_quantization(s_in, result_fixedpoint_multiplier, result_shift, result_offset_after_shift, - std::numeric_limits::lowest(), std::numeric_limits::max(), false); - } - }, - in, bi, out); -} -} // namespace - -void CpuDirectConvolutionOutputStageKernel::configure(ITensorInfo *src, const ITensorInfo *bias, ITensorInfo *dst, - const DirectConvolutionLayerOutputStageKernelInfo &info) -{ - ARM_COMPUTE_UNUSED(bias); - // Perform validation step - ARM_COMPUTE_ERROR_ON_NULLPTR(src); - ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, bias, dst, info)); - - _func = nullptr; - _result_fixedpoint_multiplier = info.result_fixedpoint_multiplier; - _result_shift = info.result_shift; - _result_offset_after_shift = info.result_offset_after_shift; - - // Auto-initialize output output if required - if(dst != nullptr) - { - // Work out expected output data type - const DataType output_dt = (src->data_type() == DataType::S32) ? info.output_data_type : DataType::S32; - // Output tensor auto initialization if not yet initialized - auto_init_if_empty(*dst, src->clone()->set_data_type(output_dt)); - } - - Window win = calculate_max_window(*src, Steps()); - - ICpuKernel::configure(win); - - const bool is_qasymm8_signed = (dst != nullptr) ? 
is_data_type_quantized_asymmetric_signed(dst->data_type()) : false; - - // Set appropriate function - if(src->data_layout() == DataLayout::NCHW) - { - switch(src->data_type()) - { - case DataType::S32: - { - if(is_qasymm8_signed) - { - _func = &output_stage_nchw; - } - else - { - _func = &output_stage_nchw; - } - break; - } -#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC - case DataType::F16: - { - _func = &output_stage_nchw; - break; - } -#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */ - case DataType::F32: - { - _func = &output_stage_nchw; - break; - } - default: - { - ARM_COMPUTE_ERROR("Unsupported combination of types among the inputs."); - } - } - } - else - { - switch(src->data_type()) - { - case DataType::S32: - { - if(is_qasymm8_signed) - { - _func = &output_stage_nhwc; - } - else - { - _func = &output_stage_nhwc; - } - break; - } -#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC - case DataType::F16: - { - _func = &output_stage_nhwc; - break; - } -#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */ - case DataType::F32: - { - _func = &output_stage_nhwc; - break; - } - default: - { - ARM_COMPUTE_ERROR("Unsupported combination of types among the inputs."); - } - } - } -} - -Status CpuDirectConvolutionOutputStageKernel::validate(const ITensorInfo *src, const ITensorInfo *bias, const ITensorInfo *dst, - const DirectConvolutionLayerOutputStageKernelInfo &info) -{ - ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, bias, dst, info)); - return Status{}; -} - -void CpuDirectConvolutionOutputStageKernel::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) -{ - ARM_COMPUTE_UNUSED(info); - ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); - ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICpuKernel::window(), window); - ARM_COMPUTE_ERROR_ON(_func == nullptr); - - auto src = tensors.get_tensor(TensorType::ACL_SRC_0); - auto bias = tensors.get_const_tensor(TensorType::ACL_SRC_1); - auto dst = tensors.get_tensor(TensorType::ACL_DST); - - (*_func)(src, bias, window, dst, _result_fixedpoint_multiplier, _result_shift, _result_offset_after_shift); -} - -const char *CpuDirectConvolutionOutputStageKernel::name() const -{ - return "CpuDirectConvolutionOutputStageKernel"; -} -} // namespace kernels -} // namespace cpu -} // namespace arm_compute diff --git a/src/core/cpu/kernels/activation/NEON/fp16.cpp b/src/core/cpu/kernels/activation/NEON/fp16.cpp deleted file mode 100644 index 6f2d5d8533..0000000000 --- a/src/core/cpu/kernels/activation/NEON/fp16.cpp +++ /dev/null @@ -1,217 +0,0 @@ -/* - * Copyright (c) 2020-2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "src/core/NEON/NEMath.h" - -#include "arm_compute/core/Helpers.h" -#include "arm_compute/core/Validate.h" -#include "src/core/NEON/wrapper/wrapper.h" - -#include -#include -#include - -#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS) - -namespace arm_compute -{ -namespace cpu -{ -namespace -{ -#ifndef __aarch64__ -inline float16x8_t mask_float_vector(const float16x8_t &in, const uint16x8_t &mask) -{ - auto int_in = vreinterpretq_u16_f16(in); - return vreinterpretq_f16_u16(wrapper::vand(int_in, mask)); -} -#endif /* __aarch64__ */ -} // namespace - -void fp16_neon_activation(const ITensor *src, ITensor *dst, const ActivationLayerInfo &act_info, const Window &window) -{ - /** SIMD vector tag type. */ - using ExactTagType = typename wrapper::traits::neon_bitvector_tag_t; - const ActivationLayerInfo::ActivationFunction act = act_info.activation(); - - constexpr int window_step_x = 8; - const auto window_start_x = static_cast(window.x().start()); - const auto window_end_x = static_cast(window.x().end()); - - Window win_collapsed = window.collapse_if_possible(window, Window::DimZ); - win_collapsed.set(Window::DimX, Window::Dimension(0, 1, 1)); - - Iterator input(src, win_collapsed); - Iterator output(dst, win_collapsed); - - // In case of non-aarch64, a small delta value is added to the input - // to prevent NAN values caused by zeros in inputs to SQRT. - // In case of aarh64, we call vsqrt directly, so we don't use delta. -#ifndef __aarch64__ - const auto delta = wrapper::vdup_n(static_cast((1e-7), ExactTagType {})); -#endif /* __aarch64__ */ - - const auto const_1 = wrapper::vdup_n(static_cast(1.f), ExactTagType{}); - const auto const_0 = wrapper::vdup_n(static_cast(0.f), ExactTagType{}); - const auto const_6 = wrapper::vdup_n(static_cast(6.f), ExactTagType{}); - const auto const_3 = wrapper::vdup_n(static_cast(3.f), ExactTagType{}); - const auto const_inv_6 = wrapper::vdup_n(static_cast(0.166666667f), ExactTagType{}); - - constexpr float soft_relu_thresh = 12.f; - const auto vsoft_relu_thresh = wrapper::vdup_n(static_cast(soft_relu_thresh), ExactTagType{}); - - const auto va = wrapper::vdup_n(static_cast(act_info.a()), ExactTagType{}); - const auto vb = wrapper::vdup_n(static_cast(act_info.b()), ExactTagType{}); - const auto a = static_cast(act_info.a()); - const auto b = static_cast(act_info.b()); - execute_window_loop(win_collapsed, [&](const Coordinates &) - { - const auto input_ptr = reinterpret_cast(input.ptr()); - const auto output_ptr = reinterpret_cast(output.ptr()); - - wrapper::traits::neon_bitvector_t tmp; - - // Compute S elements per iteration - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) - { - const auto vin = wrapper::vloadq(input_ptr + x); - switch(act) - { - case ActivationLayerInfo::ActivationFunction::ABS: - tmp = wrapper::vabs(vin); - break; - case ActivationLayerInfo::ActivationFunction::LINEAR: - tmp = wrapper::vmla(vb, va, vin); - break; - case ActivationLayerInfo::ActivationFunction::LOGISTIC: - tmp = wrapper::vinv(wrapper::vadd(const_1, wrapper::vexpq(wrapper::vneg(vin)))); - break; - case ActivationLayerInfo::ActivationFunction::RELU: - tmp = wrapper::vmax(const_0, vin); - break; - case 
ActivationLayerInfo::ActivationFunction::BOUNDED_RELU: - tmp = wrapper::vmin(va, wrapper::vmax(const_0, vin)); - break; - case ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU: - tmp = wrapper::vmin(va, wrapper::vmax(vb, vin)); - break; - case ActivationLayerInfo::ActivationFunction::LEAKY_RELU: - tmp = wrapper::vbsl(wrapper::vcgt(vin, const_0), vin, wrapper::vmul(va, vin)); - break; - case ActivationLayerInfo::ActivationFunction::SOFT_RELU: - tmp = wrapper::vbsl(wrapper::vcgt(vin, vsoft_relu_thresh), vin, wrapper::vlog(wrapper::vadd(const_1, wrapper::vexpq(vin)))); - break; - case ActivationLayerInfo::ActivationFunction::ELU: - tmp = wrapper::vbsl(wrapper::vcge(vin, const_0), vin, wrapper::vmul(va, wrapper::vsub(wrapper::vexpq(vin), const_1))); - break; - case ActivationLayerInfo::ActivationFunction::SQRT: -#ifdef __aarch64__ - tmp = wrapper::vsqrt(vin); -#else /* __aarch64__ */ - { - const auto bitmask = wrapper::vceq(vin, wrapper::vdup_n(0, ExactTagType{})); - tmp = wrapper::vinv(wrapper::vinvsqrt(wrapper::vadd(vin, mask_float_vector(delta, bitmask)))); - tmp = mask_float_vector(tmp, wrapper::vnot(bitmask)); - } -#endif /* __aarch64__ */ - break; - case ActivationLayerInfo::ActivationFunction::SQUARE: - tmp = wrapper::vmul(vin, vin); - break; - case ActivationLayerInfo::ActivationFunction::TANH: - tmp = wrapper::vmul(va, wrapper::vtanh(wrapper::vmul(vb, vin))); - break; - case ActivationLayerInfo::ActivationFunction::IDENTITY: - tmp = vin; - break; - case ActivationLayerInfo::ActivationFunction::HARD_SWISH: - tmp = wrapper::vmul(vin, wrapper::vmul(const_inv_6, wrapper::vmin(const_6, wrapper::vmax(const_0, wrapper::vadd(vin, const_3))))); - break; - default: - ARM_COMPUTE_ERROR("Unsupported activation function"); - } - wrapper::vstore(output_ptr + x, tmp); - } - - // Compute left-over elements - for(; x < window_end_x; ++x) - { - const float16_t in = *(reinterpret_cast(input_ptr + x)); - float16_t tmp; - switch(act) - { - case ActivationLayerInfo::ActivationFunction::ABS: - tmp = std::abs(in); - break; - case ActivationLayerInfo::ActivationFunction::LINEAR: - tmp = a * in + b; - break; - case ActivationLayerInfo::ActivationFunction::LOGISTIC: - tmp = static_cast(1) / (static_cast(1) + std::exp(-in)); - break; - case ActivationLayerInfo::ActivationFunction::RELU: - tmp = std::max(static_cast(0), in); - break; - case ActivationLayerInfo::ActivationFunction::BOUNDED_RELU: - tmp = std::min(a, std::max(static_cast(0), in)); - break; - case ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU: - tmp = std::min(a, std::max(b, in)); - break; - case ActivationLayerInfo::ActivationFunction::LEAKY_RELU: - tmp = (in > 0) ? in : a * in; - break; - case ActivationLayerInfo::ActivationFunction::SOFT_RELU: - tmp = (in > soft_relu_thresh) ? in : std::log(static_cast(1) + std::exp(in)); - break; - case ActivationLayerInfo::ActivationFunction::ELU: - tmp = (in >= 0) ? 
in : a * (std::exp(in) - 1); - break; - case ActivationLayerInfo::ActivationFunction::SQRT: - tmp = std::sqrt(in); - break; - case ActivationLayerInfo::ActivationFunction::SQUARE: - tmp = in * in; - break; - case ActivationLayerInfo::ActivationFunction::TANH: - tmp = a * std::tanh(b * in); - break; - case ActivationLayerInfo::ActivationFunction::IDENTITY: - tmp = in; - break; - case ActivationLayerInfo::ActivationFunction::HARD_SWISH: - tmp = in * ((std::min(std::max((in + 3), 0.0f), 6.0f)) * 0.166666667f); - break; - default: - ARM_COMPUTE_ERROR("Unsupported activation function"); - } - *(output_ptr + x) = tmp; - } - }, - input, output); -} -} // namespace cpu -} // namespace arm_compute - -#endif /* defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS) */ diff --git a/src/core/cpu/kernels/activation/NEON/fp32.cpp b/src/core/cpu/kernels/activation/NEON/fp32.cpp deleted file mode 100644 index 54301d45ad..0000000000 --- a/src/core/cpu/kernels/activation/NEON/fp32.cpp +++ /dev/null @@ -1,212 +0,0 @@ -/* - * Copyright (c) 2020-2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "arm_compute/core/Helpers.h" -#include "arm_compute/core/ITensorPack.h" -#include "arm_compute/core/Window.h" -#include "src/core/NEON/NEMath.h" -#include "src/core/NEON/wrapper/wrapper.h" - -#include -#include -#include - -namespace arm_compute -{ -namespace cpu -{ -namespace -{ -#ifndef __aarch64__ -inline float32x4_t mask_float_vector(const float32x4_t &in, const uint32x4_t &mask) -{ - auto int_in = vreinterpretq_u32_f32(in); - return vreinterpretq_f32_u32(wrapper::vand(int_in, mask)); -} -#endif /* __aarch64__ */ -} // namespace - -void fp32_neon_activation(const ITensor *src, ITensor *dst, const ActivationLayerInfo &act_info, const Window &window) -{ - /** SIMD vector tag type. 
*/ - using ExactTagType = typename arm_compute::wrapper::traits::neon_bitvector_tag_t; - - constexpr int window_step_x = 4; - const auto window_start_x = static_cast(window.x().start()); - const auto window_end_x = static_cast(window.x().end()); - const ActivationLayerInfo::ActivationFunction act = act_info.activation(); - - Window win_collapsed = window.collapse_if_possible(window, Window::DimZ); - win_collapsed.set(Window::DimX, Window::Dimension(0, 1, 1)); - - Iterator input(src, win_collapsed); - Iterator output(dst, win_collapsed); - - // In case of non-aarch64, a small delta value is added to the input - // to prevent NAN values caused by zeros in inputs to SQRT. - // In case of aarh64, we call vsqrt directly, so we don't use delta. -#ifndef __aarch64__ - const auto delta = wrapper::vdup_n(static_cast(1e-24), ExactTagType {}); -#endif /* __aarch64__ */ - const auto const_1 = wrapper::vdup_n(static_cast(1.f), ExactTagType {}); - const auto const_0 = wrapper::vdup_n(static_cast(0.f), ExactTagType{}); - const auto const_6 = wrapper::vdup_n(static_cast(6.f), ExactTagType{}); - const auto const_3 = wrapper::vdup_n(static_cast(3.f), ExactTagType{}); - const auto const_inv_6 = wrapper::vdup_n(static_cast(0.166666667f), ExactTagType{}); - - constexpr float soft_relu_thresh = 12.f; - const auto vsoft_relu_thresh = wrapper::vdup_n(static_cast(soft_relu_thresh), ExactTagType{}); - - const auto va = wrapper::vdup_n(static_cast(act_info.a()), ExactTagType{}); - const auto vb = wrapper::vdup_n(static_cast(act_info.b()), ExactTagType{}); - const auto a = static_cast(act_info.a()); - const auto b = static_cast(act_info.b()); - execute_window_loop(win_collapsed, [&](const Coordinates &) - { - const auto input_ptr = reinterpret_cast(input.ptr()); - const auto output_ptr = reinterpret_cast(output.ptr()); - - wrapper::traits::neon_bitvector_t tmp; - - // Compute S elements per iteration - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) - { - const auto vin = wrapper::vloadq(input_ptr + x); - switch(act) - { - case ActivationLayerInfo::ActivationFunction::ABS: - tmp = wrapper::vabs(vin); - break; - case ActivationLayerInfo::ActivationFunction::LINEAR: - tmp = wrapper::vmla(vb, va, vin); - break; - case ActivationLayerInfo::ActivationFunction::LOGISTIC: - tmp = wrapper::vinv(wrapper::vadd(const_1, wrapper::vexpq(wrapper::vneg(vin)))); - break; - case ActivationLayerInfo::ActivationFunction::RELU: - tmp = wrapper::vmax(const_0, vin); - break; - case ActivationLayerInfo::ActivationFunction::BOUNDED_RELU: - tmp = wrapper::vmin(va, wrapper::vmax(const_0, vin)); - break; - case ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU: - tmp = wrapper::vmin(va, wrapper::vmax(vb, vin)); - break; - case ActivationLayerInfo::ActivationFunction::LEAKY_RELU: - tmp = wrapper::vbsl(wrapper::vcgt(vin, const_0), vin, wrapper::vmul(va, vin)); - break; - case ActivationLayerInfo::ActivationFunction::SOFT_RELU: - tmp = wrapper::vbsl(wrapper::vcgt(vin, vsoft_relu_thresh), vin, wrapper::vlog(wrapper::vadd(const_1, wrapper::vexpq(vin)))); - break; - case ActivationLayerInfo::ActivationFunction::ELU: - tmp = wrapper::vbsl(wrapper::vcge(vin, const_0), vin, wrapper::vmul(va, wrapper::vsub(wrapper::vexpq(vin), const_1))); - break; - case ActivationLayerInfo::ActivationFunction::SQRT: -#ifdef __aarch64__ - tmp = wrapper::vsqrt(vin); -#else /* __aarch64__ */ - { - const auto bitmask = wrapper::vceq(vin, wrapper::vdup_n(0.f, ExactTagType{})); - tmp = 
wrapper::vinv(wrapper::vinvsqrt(wrapper::vadd(vin, mask_float_vector(delta, bitmask)))); - tmp = mask_float_vector(tmp, wrapper::vnot(bitmask)); - } -#endif /* __aarch64__ */ - break; - case ActivationLayerInfo::ActivationFunction::SQUARE: - tmp = wrapper::vmul(vin, vin); - break; - case ActivationLayerInfo::ActivationFunction::TANH: - tmp = wrapper::vmul(va, wrapper::vtanh(wrapper::vmul(vb, vin))); - break; - case ActivationLayerInfo::ActivationFunction::IDENTITY: - tmp = vin; - break; - case ActivationLayerInfo::ActivationFunction::HARD_SWISH: - tmp = wrapper::vmul(vin, wrapper::vmul(const_inv_6, wrapper::vmin(const_6, wrapper::vmax(const_0, wrapper::vadd(vin, const_3))))); - break; - default: - ARM_COMPUTE_ERROR("Unsupported activation function"); - } - wrapper::vstore(output_ptr + x, tmp); - } - - // Compute left-over elements - for(; x < window_end_x; ++x) - { - const float in = *(reinterpret_cast(input_ptr + x)); - float tmp; - switch(act) - { - case ActivationLayerInfo::ActivationFunction::ABS: - tmp = std::abs(in); - break; - case ActivationLayerInfo::ActivationFunction::LINEAR: - tmp = a * in + b; - break; - case ActivationLayerInfo::ActivationFunction::LOGISTIC: - tmp = static_cast(1) / (static_cast(1) + std::exp(-in)); - break; - case ActivationLayerInfo::ActivationFunction::RELU: - tmp = std::max(static_cast(0), in); - break; - case ActivationLayerInfo::ActivationFunction::BOUNDED_RELU: - tmp = std::min(a, std::max(static_cast(0), in)); - break; - case ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU: - tmp = std::min(a, std::max(b, in)); - break; - case ActivationLayerInfo::ActivationFunction::LEAKY_RELU: - tmp = (in > 0) ? in : a * in; - break; - case ActivationLayerInfo::ActivationFunction::SOFT_RELU: - tmp = (in > soft_relu_thresh) ? in : std::log(static_cast(1) + std::exp(in)); - break; - case ActivationLayerInfo::ActivationFunction::ELU: - tmp = (in >= 0) ? in : a * (std::exp(in) - 1); - break; - case ActivationLayerInfo::ActivationFunction::SQRT: - tmp = std::sqrt(in); - break; - case ActivationLayerInfo::ActivationFunction::SQUARE: - tmp = in * in; - break; - case ActivationLayerInfo::ActivationFunction::TANH: - tmp = a * std::tanh(b * in); - break; - case ActivationLayerInfo::ActivationFunction::IDENTITY: - tmp = in; - break; - case ActivationLayerInfo::ActivationFunction::HARD_SWISH: - tmp = in * ((std::min(std::max((in + 3), 0.0f), 6.0f)) * 0.166666667f); - break; - default: - ARM_COMPUTE_ERROR("Unsupported activation function"); - } - *(output_ptr + x) = tmp; - } - }, - input, output); -} -} // namespace cpu -} // namespace arm_compute diff --git a/src/core/cpu/kernels/activation/NEON/qasymm8.cpp b/src/core/cpu/kernels/activation/NEON/qasymm8.cpp deleted file mode 100644 index a1217435b6..0000000000 --- a/src/core/cpu/kernels/activation/NEON/qasymm8.cpp +++ /dev/null @@ -1,262 +0,0 @@ -/* - * Copyright (c) 2020-2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. 
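// Illustrative sketch, not part of this patch: the scalar tail loops of the fp16/fp32
// activation kernels above reduce to these closed forms. Function names are assumed
// for illustration only; constants mirror the const_3/const_6/const_inv_6 and
// soft_relu_thresh values used by the vectorized paths.
#include <algorithm>
#include <cmath>

static inline float hard_swish_ref(float x)
{
    // x * relu6(x + 3) / 6
    return x * (std::min(std::max(x + 3.0f, 0.0f), 6.0f) * 0.166666667f);
}

static inline float soft_relu_ref(float x)
{
    // Above ~12 the kernel passes x through, since log(1 + exp(x)) ~= x and exp() would overflow
    constexpr float soft_relu_thresh = 12.0f;
    return (x > soft_relu_thresh) ? x : std::log(1.0f + std::exp(x));
}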
- * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ - -#include "arm_compute/core/Helpers.h" -#include "arm_compute/core/Window.h" -#include "src/core/NEON/NEAsymm.h" -#include "src/core/NEON/NEMath.h" -#include "src/core/NEON/wrapper/wrapper.h" - -#include -#include -#include - -namespace arm_compute -{ -namespace cpu -{ -void qasymm8_neon_activation(const ITensor *src, ITensor *dst, const ActivationLayerInfo &act_info, const Window &window) -{ - constexpr int window_step_x = 16; - const auto window_start_x = static_cast(window.x().start()); - const auto window_end_x = static_cast(window.x().end()); - const ActivationLayerInfo::ActivationFunction act = act_info.activation(); - - Window win_collapsed = window.collapse_if_possible(window, Window::DimZ); - win_collapsed.set(Window::DimX, Window::Dimension(0, 1, 1)); - - Iterator input(src, win_collapsed); - Iterator output(dst, win_collapsed); - - const UniformQuantizationInfo qi_in = src->info()->quantization_info().uniform(); - const UniformQuantizationInfo qi_out = dst->info()->quantization_info().uniform(); - const qasymm8x16_t va = vdupq_n_u8(quantize_qasymm8(act_info.a(), qi_in)); - const qasymm8x16_t vb = vdupq_n_u8(quantize_qasymm8(act_info.b(), qi_in)); - const qasymm8_t a = quantize_qasymm8(act_info.a(), qi_in); - const qasymm8_t b = quantize_qasymm8(act_info.b(), qi_in); - const qasymm8_t const_0 = quantize_qasymm8(0.f, qi_in); - const qasymm8x16_t vconst_0 = vdupq_n_u8(const_0); - const auto vconst_1 = vdupq_n_f32(1.f); -#ifndef __aarch64__ - const auto vconst_0_f32 = vdupq_n_f32(0); -#endif // __aarch64__ - const float32x4_t va_f32 = vdupq_n_f32(act_info.a()); - const float32x4_t vb_f32 = vdupq_n_f32(act_info.b()); - const float a_f32 = act_info.a(); - const float b_f32 = act_info.b(); - const auto const_6_f32 = vdupq_n_f32(6.f); - const auto const_0_f32 = vdupq_n_f32(0.f); - const auto const_3_f32 = vdupq_n_f32(3.f); - const auto const_inv_6_f32 = vdupq_n_f32(0.166666667f); - - // Initialise scale/offset for re-quantization - float s = qi_in.scale / qi_out.scale; - float o = -qi_in.offset * s + qi_out.offset; - float32x4_t vs = vdupq_n_f32(s); - float32x4_t vo = vdupq_n_f32(o); - - execute_window_loop(win_collapsed, [&](const Coordinates &) - { - const auto input_ptr = reinterpret_cast(input.ptr()); - const auto output_ptr = reinterpret_cast(output.ptr()); - - wrapper::traits::neon_bitvector_t tmp; - - // Compute S elements per iteration - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) - { - const auto vin = wrapper::vloadq(input_ptr + x); - if(act == ActivationLayerInfo::ActivationFunction::RELU) - { - // Perform activation - tmp = vmaxq_u8(vconst_0, vin); - // Re-quantize to new output space - tmp = vmlaq_qasymm8(tmp, vs, vo); - } - else if(act == ActivationLayerInfo::ActivationFunction::BOUNDED_RELU) - { - // Perform activation - tmp = vminq_u8(va, vmaxq_u8(vconst_0, vin)); - // Re-quantize to new output space - tmp = vmlaq_qasymm8(tmp, vs, vo); - } - else if(act == ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU) - { - // Perform 
activation - tmp = vminq_u8(va, vmaxq_u8(vb, vin)); - // Re-quantize to new output space - tmp = vmlaq_qasymm8(tmp, vs, vo); - } - else if(act == ActivationLayerInfo::ActivationFunction::LOGISTIC) - { - // De-quantize - const auto vin_deq = vdequantize(vin, qi_in); - // Perform activation - const float32x4x4_t tmp_dep = - { - { - wrapper::vdiv(vconst_1, wrapper::vadd(vconst_1, wrapper::vexpq(wrapper::vneg(vin_deq.val[0])))), - wrapper::vdiv(vconst_1, wrapper::vadd(vconst_1, wrapper::vexpq(wrapper::vneg(vin_deq.val[1])))), - wrapper::vdiv(vconst_1, wrapper::vadd(vconst_1, wrapper::vexpq(wrapper::vneg(vin_deq.val[2])))), - wrapper::vdiv(vconst_1, wrapper::vadd(vconst_1, wrapper::vexpq(wrapper::vneg(vin_deq.val[3])))), - } - }; - // Re-quantize to new output space - tmp = vquantize(tmp_dep, qi_out); - } - else if(act == ActivationLayerInfo::ActivationFunction::TANH) - { - // De-quantize - const auto vin_deq = vdequantize(vin, qi_in); - // Perform activation - const float32x4x4_t tmp_dep = - { - { - wrapper::vmul(va_f32, wrapper::vtanh(wrapper::vmul(vin_deq.val[0], vb_f32))), - wrapper::vmul(va_f32, wrapper::vtanh(wrapper::vmul(vin_deq.val[1], vb_f32))), - wrapper::vmul(va_f32, wrapper::vtanh(wrapper::vmul(vin_deq.val[2], vb_f32))), - wrapper::vmul(va_f32, wrapper::vtanh(wrapper::vmul(vin_deq.val[3], vb_f32))), - } - }; - // Re-quantize to new output space - tmp = vquantize(tmp_dep, qi_out); - } - else if(act == ActivationLayerInfo::ActivationFunction::HARD_SWISH) - { - // De-quantize - const auto vin_deq = vdequantize(vin, qi_in); - // Perform activation - const float32x4x4_t tmp_dep = - { - { - wrapper::vmul(vin_deq.val[0], wrapper::vmul(const_inv_6_f32, wrapper::vmin(const_6_f32, wrapper::vmax(const_0_f32, wrapper::vadd(vin_deq.val[0], const_3_f32))))), - wrapper::vmul(vin_deq.val[1], wrapper::vmul(const_inv_6_f32, wrapper::vmin(const_6_f32, wrapper::vmax(const_0_f32, wrapper::vadd(vin_deq.val[1], const_3_f32))))), - wrapper::vmul(vin_deq.val[2], wrapper::vmul(const_inv_6_f32, wrapper::vmin(const_6_f32, wrapper::vmax(const_0_f32, wrapper::vadd(vin_deq.val[2], const_3_f32))))), - wrapper::vmul(vin_deq.val[3], wrapper::vmul(const_inv_6_f32, wrapper::vmin(const_6_f32, wrapper::vmax(const_0_f32, wrapper::vadd(vin_deq.val[3], const_3_f32))))), - } - }; - // Re-quantize to new output space - tmp = vquantize(tmp_dep, qi_out); - } - else if(act == ActivationLayerInfo::ActivationFunction::LEAKY_RELU) - { - const auto vin_deq = vdequantize(vin, qi_in); - -#ifdef __aarch64__ - const uint32x4x4_t pos_mask = - { - { - wrapper::vcgtz(vin_deq.val[0]), - wrapper::vcgtz(vin_deq.val[1]), - wrapper::vcgtz(vin_deq.val[2]), - wrapper::vcgtz(vin_deq.val[3]), - } - }; -#else // __aarch64__ - const uint32x4x4_t pos_mask = - { - { - wrapper::vcgt(vin_deq.val[0], vconst_0_f32), - wrapper::vcgt(vin_deq.val[1], vconst_0_f32), - wrapper::vcgt(vin_deq.val[2], vconst_0_f32), - wrapper::vcgt(vin_deq.val[3], vconst_0_f32), - } - }; -#endif // __aarch64__ - - const float32x4x4_t tmp_dep = - { - { - wrapper::vbsl(pos_mask.val[0], vin_deq.val[0], wrapper::vmul(va_f32, vin_deq.val[0])), - wrapper::vbsl(pos_mask.val[1], vin_deq.val[1], wrapper::vmul(va_f32, vin_deq.val[1])), - wrapper::vbsl(pos_mask.val[2], vin_deq.val[2], wrapper::vmul(va_f32, vin_deq.val[2])), - wrapper::vbsl(pos_mask.val[3], vin_deq.val[3], wrapper::vmul(va_f32, vin_deq.val[3])), - } - }; - - tmp = vquantize(tmp_dep, qi_out); - } - else - { - ARM_COMPUTE_ERROR("Unsupported activation function"); - } - wrapper::vstore(output_ptr + x, tmp); - } - - // Compute 
left-over elements - for(; x < window_end_x; ++x) - { - qasymm8_t in = *(reinterpret_cast(input_ptr + x)); - qasymm8_t tmp = 0; - if(act == ActivationLayerInfo::ActivationFunction::RELU) - { - tmp = std::max(const_0, in); - tmp = utility::clamp(tmp * s + o); - } - else if(act == ActivationLayerInfo::ActivationFunction::BOUNDED_RELU) - { - tmp = std::min(a, std::max(const_0, in)); - tmp = utility::clamp(tmp * s + o); - } - else if(act == ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU) - { - tmp = std::min(a, std::max(b, in)); - tmp = utility::clamp(tmp * s + o); - } - else if(act == ActivationLayerInfo::ActivationFunction::LOGISTIC) - { - float tmp_f = dequantize_qasymm8(in, qi_in); - tmp_f = 1.f / (1.f + std::exp(-tmp_f)); - tmp = quantize_qasymm8(tmp_f, qi_out); - } - else if(act == ActivationLayerInfo::ActivationFunction::TANH) - { - float tmp_f = dequantize_qasymm8(in, qi_in); - tmp_f = a_f32 * std::tanh(b_f32 * tmp_f); - tmp = quantize_qasymm8(tmp_f, qi_out); - } - else if(act == ActivationLayerInfo::ActivationFunction::HARD_SWISH) - { - float tmp_f = dequantize_qasymm8(in, qi_in); - tmp_f = tmp_f * ((std::min(std::max((tmp_f + 3), 0.0f), 6.0f)) * 0.166666667f); - tmp = quantize_qasymm8(tmp_f, qi_out); - } - else if(act == ActivationLayerInfo::ActivationFunction::LEAKY_RELU) - { - float tmp_f = dequantize_qasymm8(in, qi_in); - tmp_f = tmp_f > 0 ? tmp_f : tmp_f * a_f32; - tmp = quantize_qasymm8(tmp_f, qi_out); - } - else - { - ARM_COMPUTE_ERROR("Unsupported activation function"); - } - *(output_ptr + x) = tmp; - } - }, - input, output); -} -} // namespace cpu -} // namespace arm_compute diff --git a/src/core/cpu/kernels/activation/NEON/qasymm8_signed.cpp b/src/core/cpu/kernels/activation/NEON/qasymm8_signed.cpp deleted file mode 100644 index 8b40bf8e72..0000000000 --- a/src/core/cpu/kernels/activation/NEON/qasymm8_signed.cpp +++ /dev/null @@ -1,261 +0,0 @@ -/* - * Copyright (c) 2020-2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
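// Illustrative sketch, not part of this patch: how the QASYMM8 ReLU paths above fold the
// input->output requantization into a single multiply-add on the raw uint8 value, using
// s = scale_in / scale_out and o = -offset_in * s + offset_out. The helper name and
// signature are assumptions made for the example.
#include <algorithm>
#include <cmath>
#include <cstdint>

static inline uint8_t relu_qasymm8_ref(uint8_t q_in, float scale_in, int offset_in, float scale_out, int offset_out)
{
    const float s      = scale_in / scale_out;
    const float o      = -offset_in * s + offset_out;
    const int   zero_q = std::min(std::max(offset_in, 0), 255);    // quantized representation of 0.f in the input space
    const float act    = std::max<int>(zero_q, q_in) * s + o;      // ReLU followed by folded requantization
    return static_cast<uint8_t>(std::min(255.0f, std::max(0.0f, std::round(act))));
}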
- */ -#include "arm_compute/core/Helpers.h" -#include "arm_compute/core/Window.h" -#include "src/core/NEON/NEAsymm.h" -#include "src/core/NEON/NEMath.h" -#include "src/core/NEON/wrapper/wrapper.h" - -#include -#include -#include - -namespace arm_compute -{ -namespace cpu -{ -void qasymm8_signed_neon_activation(const ITensor *src, ITensor *dst, const ActivationLayerInfo &act_info, const Window &window) -{ - constexpr int window_step_x = 16; - const auto window_start_x = static_cast(window.x().start()); - const auto window_end_x = static_cast(window.x().end()); - const ActivationLayerInfo::ActivationFunction act = act_info.activation(); - - Window win_collapsed = window.collapse_if_possible(window, Window::DimZ); - win_collapsed.set(Window::DimX, Window::Dimension(0, 1, 1)); - - Iterator input(src, win_collapsed); - Iterator output(dst, win_collapsed); - - const UniformQuantizationInfo qi_in = src->info()->quantization_info().uniform(); - const UniformQuantizationInfo qi_out = dst->info()->quantization_info().uniform(); - const qasymm8x16_signed_t va = vdupq_n_s8(quantize_qasymm8_signed(act_info.a(), qi_in)); - const qasymm8x16_signed_t vb = vdupq_n_s8(quantize_qasymm8_signed(act_info.b(), qi_in)); - const qasymm8_signed_t a = quantize_qasymm8_signed(act_info.a(), qi_in); - const qasymm8_signed_t b = quantize_qasymm8_signed(act_info.b(), qi_in); - const qasymm8_signed_t const_0 = quantize_qasymm8_signed(0.f, qi_in); - const qasymm8x16_signed_t vconst_0 = vdupq_n_s8(const_0); - const auto vconst_1 = vdupq_n_f32(1.f); -#ifndef __aarch64__ - const auto vconst_0_f32 = vdupq_n_f32(1.f); -#endif // __aarch64__ - const float32x4_t va_f32 = vdupq_n_f32(act_info.a()); - const float32x4_t vb_f32 = vdupq_n_f32(act_info.b()); - const float a_f32 = act_info.a(); - const float b_f32 = act_info.b(); - const auto const_6_f32 = vdupq_n_f32(6.f); - const auto const_0_f32 = vdupq_n_f32(0.f); - const auto const_3_f32 = vdupq_n_f32(3.f); - const auto const_inv_6_f32 = vdupq_n_f32(0.166666667f); - - // Initialise scale/offset for re-quantization - float s = qi_in.scale / qi_out.scale; - float o = -qi_in.offset * s + qi_out.offset; - float32x4_t vs = vdupq_n_f32(s); - float32x4_t vo = vdupq_n_f32(o); - - execute_window_loop(win_collapsed, [&](const Coordinates &) - { - const auto input_ptr = reinterpret_cast(input.ptr()); - const auto output_ptr = reinterpret_cast(output.ptr()); - - wrapper::traits::neon_bitvector_t tmp; - - // Compute S elements per iteration - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) - { - const auto vin = wrapper::vloadq(input_ptr + x); - if(act == ActivationLayerInfo::ActivationFunction::RELU) - { - // Perform activation - tmp = vmaxq_s8(vconst_0, vin); - // Re-quantize to new output space - tmp = vmlaq_qasymm8_signed(tmp, vs, vo); - } - else if(act == ActivationLayerInfo::ActivationFunction::BOUNDED_RELU) - { - // Perform activation - tmp = vminq_s8(va, vmaxq_s8(vconst_0, vin)); - // Re-quantize to new output space - tmp = vmlaq_qasymm8_signed(tmp, vs, vo); - } - else if(act == ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU) - { - // Perform activation - tmp = vminq_s8(va, vmaxq_s8(vb, vin)); - // Re-quantize to new output space - tmp = vmlaq_qasymm8_signed(tmp, vs, vo); - } - else if(act == ActivationLayerInfo::ActivationFunction::LOGISTIC) - { - // De-quantize - const auto vin_deq = vdequantize(vin, qi_in); - // Perform activation - const float32x4x4_t tmp_dep = - { - { - wrapper::vdiv(vconst_1, wrapper::vadd(vconst_1, 
wrapper::vexpq(wrapper::vneg(vin_deq.val[0])))), - wrapper::vdiv(vconst_1, wrapper::vadd(vconst_1, wrapper::vexpq(wrapper::vneg(vin_deq.val[1])))), - wrapper::vdiv(vconst_1, wrapper::vadd(vconst_1, wrapper::vexpq(wrapper::vneg(vin_deq.val[2])))), - wrapper::vdiv(vconst_1, wrapper::vadd(vconst_1, wrapper::vexpq(wrapper::vneg(vin_deq.val[3])))), - } - }; - // Re-quantize to new output space - tmp = vquantize_signed(tmp_dep, qi_out); - } - else if(act == ActivationLayerInfo::ActivationFunction::TANH) - { - // De-quantize - const auto vin_deq = vdequantize(vin, qi_in); - // Perform activation - const float32x4x4_t tmp_dep = - { - { - wrapper::vmul(va_f32, wrapper::vtanh(wrapper::vmul(vin_deq.val[0], vb_f32))), - wrapper::vmul(va_f32, wrapper::vtanh(wrapper::vmul(vin_deq.val[1], vb_f32))), - wrapper::vmul(va_f32, wrapper::vtanh(wrapper::vmul(vin_deq.val[2], vb_f32))), - wrapper::vmul(va_f32, wrapper::vtanh(wrapper::vmul(vin_deq.val[3], vb_f32))), - } - }; - // Re-quantize to new output space - tmp = vquantize_signed(tmp_dep, qi_out); - } - else if(act == ActivationLayerInfo::ActivationFunction::HARD_SWISH) - { - // De-quantize - const auto vin_deq = vdequantize(vin, qi_in); - // Perform activation - const float32x4x4_t tmp_dep = - { - { - wrapper::vmul(vin_deq.val[0], wrapper::vmul(const_inv_6_f32, wrapper::vmin(const_6_f32, wrapper::vmax(const_0_f32, wrapper::vadd(vin_deq.val[0], const_3_f32))))), - wrapper::vmul(vin_deq.val[1], wrapper::vmul(const_inv_6_f32, wrapper::vmin(const_6_f32, wrapper::vmax(const_0_f32, wrapper::vadd(vin_deq.val[1], const_3_f32))))), - wrapper::vmul(vin_deq.val[2], wrapper::vmul(const_inv_6_f32, wrapper::vmin(const_6_f32, wrapper::vmax(const_0_f32, wrapper::vadd(vin_deq.val[2], const_3_f32))))), - wrapper::vmul(vin_deq.val[3], wrapper::vmul(const_inv_6_f32, wrapper::vmin(const_6_f32, wrapper::vmax(const_0_f32, wrapper::vadd(vin_deq.val[3], const_3_f32))))), - } - }; - // Re-quantize to new output space - tmp = vquantize_signed(tmp_dep, qi_out); - } - else if(act == ActivationLayerInfo::ActivationFunction::LEAKY_RELU) - { - const auto vin_deq = vdequantize(vin, qi_in); - -#ifdef __aarch64__ - const uint32x4x4_t pos_mask = - { - { - wrapper::vcgtz(vin_deq.val[0]), - wrapper::vcgtz(vin_deq.val[1]), - wrapper::vcgtz(vin_deq.val[2]), - wrapper::vcgtz(vin_deq.val[3]), - } - }; -#else // __aarch64__ - const uint32x4x4_t pos_mask = - { - { - wrapper::vcgt(vin_deq.val[0], vconst_0_f32), - wrapper::vcgt(vin_deq.val[1], vconst_0_f32), - wrapper::vcgt(vin_deq.val[2], vconst_0_f32), - wrapper::vcgt(vin_deq.val[3], vconst_0_f32), - } - }; -#endif // __aarch64__ - - const float32x4x4_t tmp_dep = - { - { - wrapper::vbsl(pos_mask.val[0], vin_deq.val[0], wrapper::vmul(va_f32, vin_deq.val[0])), - wrapper::vbsl(pos_mask.val[1], vin_deq.val[1], wrapper::vmul(va_f32, vin_deq.val[1])), - wrapper::vbsl(pos_mask.val[2], vin_deq.val[2], wrapper::vmul(va_f32, vin_deq.val[2])), - wrapper::vbsl(pos_mask.val[3], vin_deq.val[3], wrapper::vmul(va_f32, vin_deq.val[3])), - } - }; - - tmp = vquantize_signed(tmp_dep, qi_out); - } - else - { - ARM_COMPUTE_ERROR("Unsupported activation function"); - } - wrapper::vstore(output_ptr + x, tmp); - } - - // Compute left-over elements - for(; x < window_end_x; ++x) - { - qasymm8_signed_t in = *(reinterpret_cast(input_ptr + x)); - qasymm8_signed_t tmp = 0; - if(act == ActivationLayerInfo::ActivationFunction::RELU) - { - tmp = std::max(const_0, in); - tmp = utility::clamp(tmp * s + o); - } - else if(act == ActivationLayerInfo::ActivationFunction::BOUNDED_RELU) - { 
- tmp = std::min(a, std::max(const_0, in)); - tmp = utility::clamp(tmp * s + o); - } - else if(act == ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU) - { - tmp = std::min(a, std::max(b, in)); - tmp = utility::clamp(tmp * s + o); - } - else if(act == ActivationLayerInfo::ActivationFunction::LOGISTIC) - { - float tmp_f = dequantize_qasymm8_signed(in, qi_in); - tmp_f = 1.f / (1.f + std::exp(-tmp_f)); - tmp = quantize_qasymm8_signed(tmp_f, qi_out); - } - else if(act == ActivationLayerInfo::ActivationFunction::TANH) - { - float tmp_f = dequantize_qasymm8_signed(in, qi_in); - tmp_f = a_f32 * std::tanh(b_f32 * tmp_f); - tmp = quantize_qasymm8_signed(tmp_f, qi_out); - } - else if(act == ActivationLayerInfo::ActivationFunction::HARD_SWISH) - { - float tmp_f = dequantize_qasymm8_signed(in, qi_in); - tmp_f = tmp_f * ((std::min(std::max((tmp_f + 3), 0.0f), 6.0f)) * 0.166666667f); - tmp = quantize_qasymm8_signed(tmp_f, qi_out); - } - else if(act == ActivationLayerInfo::ActivationFunction::LEAKY_RELU) - { - float tmp_f = dequantize_qasymm8_signed(in, qi_in); - tmp_f = tmp_f > 0 ? tmp_f : tmp_f * a_f32; - tmp = quantize_qasymm8_signed(tmp_f, qi_out); - } - else - { - ARM_COMPUTE_ERROR("Unsupported activation function"); - } - *(output_ptr + x) = tmp; - } - }, - input, output); -} -} // namespace cpu -} // namespace arm_compute diff --git a/src/core/cpu/kernels/activation/NEON/qsymm16.cpp b/src/core/cpu/kernels/activation/NEON/qsymm16.cpp deleted file mode 100644 index 54b41820f2..0000000000 --- a/src/core/cpu/kernels/activation/NEON/qsymm16.cpp +++ /dev/null @@ -1,138 +0,0 @@ -/* - * Copyright (c) 2020-2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
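// Illustrative sketch, not part of this patch: the non-linear QASYMM8_SIGNED branches above
// follow the same three-step pattern - dequantize to float, apply the activation, quantize
// back with the output scale/offset and saturate to [-128, 127]. Names and the rounding
// helper are assumptions; the library's own quantize helpers handle rounding internally.
#include <algorithm>
#include <cmath>
#include <cstdint>

static inline int8_t logistic_qasymm8_signed_ref(int8_t q_in, float scale_in, int offset_in, float scale_out, int offset_out)
{
    const float x = (q_in - offset_in) * scale_in;                   // dequantize
    const float y = 1.0f / (1.0f + std::exp(-x));                    // logistic activation
    const long  q = std::lround(y / scale_out) + offset_out;         // quantize to the output space
    return static_cast<int8_t>(std::min(127L, std::max(-128L, q)))); // saturate to int8
}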
- */ -#include "arm_compute/core/Helpers.h" -#include "arm_compute/core/ITensorPack.h" -#include "arm_compute/core/Window.h" -#include "arm_compute/core/experimental/Types.h" -#include "src/core/NEON/NEMath.h" -#include "src/core/NEON/NESymm.h" -#include "src/core/NEON/wrapper/wrapper.h" - -#include -#include -#include - -namespace arm_compute -{ -namespace cpu -{ -void qsymm16_neon_activation(const ITensor *src, ITensor *dst, const ActivationLayerInfo &act_info, const Window &window) -{ - constexpr int window_step_x = 8; - const auto window_start_x = static_cast(window.x().start()); - const auto window_end_x = static_cast(window.x().end()); - const ActivationLayerInfo::ActivationFunction act = act_info.activation(); - - Window win_collapsed = window.collapse_if_possible(window, Window::DimZ); - win_collapsed.set(Window::DimX, Window::Dimension(0, 1, 1)); - - Iterator input(src, win_collapsed); - Iterator output(dst, win_collapsed); - - const UniformQuantizationInfo qi_in = src->info()->quantization_info().uniform(); - const UniformQuantizationInfo qi_out = dst->info()->quantization_info().uniform(); - const auto vconst_1 = vdupq_n_f32(1.f); - const float32x4_t va_f32 = vdupq_n_f32(act_info.a()); - const float32x4_t vb_f32 = vdupq_n_f32(act_info.b()); - const float a_f32 = act_info.a(); - const float b_f32 = act_info.b(); - - execute_window_loop(win_collapsed, [&](const Coordinates &) - { - const auto input_ptr = reinterpret_cast(input.ptr()); - const auto output_ptr = reinterpret_cast(output.ptr()); - - wrapper::traits::neon_bitvector_t tmp; - ARM_COMPUTE_UNUSED(tmp); - - // Compute S elements per iteration - int x = window_start_x; - for(; x <= (window_end_x - window_step_x); x += window_step_x) - { - const auto vin = wrapper::vloadq(input_ptr + x); - if(act == ActivationLayerInfo::ActivationFunction::LOGISTIC) - { - // De-quantize - const auto vin_deq = vdequantize_int16(vin, qi_in.scale); - // Perform activation - const float32x4x2_t tmp_dep = - { - { - wrapper::vdiv(vconst_1, wrapper::vadd(vconst_1, wrapper::vexpq(wrapper::vneg(vin_deq.val[0])))), - wrapper::vdiv(vconst_1, wrapper::vadd(vconst_1, wrapper::vexpq(wrapper::vneg(vin_deq.val[1])))), - } - }; - // Re-quantize to new output space - tmp = vquantize_int16(tmp_dep, qi_out.scale); - } - else if(act == ActivationLayerInfo::ActivationFunction::TANH) - { - // De-quantize - const auto vin_deq = vdequantize_int16(vin, qi_in.scale); - // Perform activation - const float32x4x2_t tmp_dep = - { - { - wrapper::vmul(va_f32, wrapper::vtanh(wrapper::vmul(vin_deq.val[0], vb_f32))), - wrapper::vmul(va_f32, wrapper::vtanh(wrapper::vmul(vin_deq.val[1], vb_f32))), - } - }; - // Re-quantize to new output space - tmp = vquantize_int16(tmp_dep, qi_out.scale); - } - else - { - ARM_COMPUTE_ERROR("Unsupported activation function"); - } - wrapper::vstore(output_ptr + x, tmp); - } - - // Compute left-over elements - for(; x < window_end_x; ++x) - { - qsymm16_t in = *(reinterpret_cast(input_ptr + x)); - qsymm16_t tmp = 0; - if(act == ActivationLayerInfo::ActivationFunction::LOGISTIC) - { - float tmp_f = dequantize_qsymm16(in, qi_in.scale); - tmp_f = 1.f / (1.f + std::exp(-tmp_f)); - tmp = quantize_qsymm16(tmp_f, qi_out); - } - else if(act == ActivationLayerInfo::ActivationFunction::TANH) - { - float tmp_f = dequantize_qsymm16(in, qi_in.scale); - tmp_f = a_f32 * std::tanh(b_f32 * tmp_f); - tmp = quantize_qsymm16(tmp_f, qi_out); - } - else - { - ARM_COMPUTE_ERROR("Unsupported activation function"); - } - *(output_ptr + x) = tmp; - } - }, - input, output); 
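// Illustrative sketch, not part of this patch: QSYMM16 is a symmetric format, so the kernel
// above only needs the input/output scales - no zero-point handling - when mapping between
// int16 and float. The helper name and rounding choice are assumptions for the example.
#include <algorithm>
#include <cmath>
#include <cstdint>

static inline int16_t tanh_qsymm16_ref(int16_t q_in, float scale_in, float scale_out, float a, float b)
{
    const float x = q_in * scale_in;                                      // dequantize (symmetric: no offset)
    const float y = a * std::tanh(b * x);                                 // activation, a/b taken from ActivationLayerInfo
    const long  q = std::lround(y / scale_out);                           // quantize with the output scale only
    return static_cast<int16_t>(std::min(32767L, std::max(-32768L, q)))); // saturate to int16
}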
-} -} // namespace cpu -} // namespace arm_compute diff --git a/src/core/cpu/kernels/activation/SVE/fp16.cpp b/src/core/cpu/kernels/activation/SVE/fp16.cpp deleted file mode 100644 index bf31fd7d93..0000000000 --- a/src/core/cpu/kernels/activation/SVE/fp16.cpp +++ /dev/null @@ -1,130 +0,0 @@ -/* - * Copyright (c) 2020-2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "arm_compute/core/Helpers.h" -#include "arm_compute/core/ITensorPack.h" -#include "arm_compute/core/Window.h" - -#include -#include - -#if defined(__ARM_FEATURE_SVE) -#include "src/core/NEON/SVEMath.h" -#include - -namespace arm_compute -{ -namespace cpu -{ -void fp16_sve_activation(const ITensor *src, ITensor *dst, const ActivationLayerInfo &act_info, const Window &window) -{ - const auto window_start_x = static_cast(window.x().start()); - const auto window_end_x = static_cast(window.x().end()); - const ActivationLayerInfo::ActivationFunction act = act_info.activation(); - - Window win_collapsed = window.collapse_if_possible(window, Window::DimZ); - win_collapsed.set(Window::DimX, Window::Dimension(0, 1, 1)); - - Iterator input(src, win_collapsed); - Iterator output(dst, win_collapsed); - - const auto const_1 = svdup_n_f16(1.f); - const auto const_0 = svdup_n_f16(0.f); - const auto const_6 = svdup_n_f16(6.f); - const auto const_3 = svdup_n_f16(3.f); - const auto const_inv_6 = svdup_n_f16(0.166666667f); - - const auto va = svdup_n_f16(act_info.a()); - const auto vb = svdup_n_f16(act_info.b()); - execute_window_loop(win_collapsed, [&](const Coordinates &) - { - const auto input_ptr = reinterpret_cast(input.ptr()); - const auto output_ptr = reinterpret_cast(output.ptr()); - - svfloat16_t tmp; - - int x = window_start_x; - svbool_t pg = svwhilelt_b16(x, window_end_x); - do - { - const auto vin = svld1_f16(pg, input_ptr + x); - switch(act) - { - case ActivationLayerInfo::ActivationFunction::ABS: - tmp = svabs_f16_z(pg, vin); - break; - case ActivationLayerInfo::ActivationFunction::LINEAR: - tmp = svmla_f16_z(pg, vb, va, vin); - break; - case ActivationLayerInfo::ActivationFunction::LOGISTIC: - tmp = svinv_f16_z(pg, svadd_f16_z(pg, const_1, svexp_f16_z(pg, svneg_f16_z(pg, vin)))); - break; - case ActivationLayerInfo::ActivationFunction::RELU: - tmp = svmax_f16_z(pg, const_0, vin); - break; - case ActivationLayerInfo::ActivationFunction::BOUNDED_RELU: - tmp = svmin_f16_z(pg, va, svmax_f16_z(pg, const_0, vin)); - break; - case 
ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU: - tmp = svmin_f16_z(pg, va, svmax_f16_z(pg, vb, vin)); - break; - case ActivationLayerInfo::ActivationFunction::LEAKY_RELU: - tmp = svadd_f16_z(pg, svmul_f16_z(pg, svmin_f16_z(pg, vin, const_0), va), svmax_f16_z(pg, vin, const_0)); - break; - case ActivationLayerInfo::ActivationFunction::SOFT_RELU: - tmp = svlog_f16_z(pg, svadd_f16_z(pg, const_1, svexp_f16_z(pg, vin))); - break; - case ActivationLayerInfo::ActivationFunction::ELU: - tmp = svsel_f16(svcmpgt_f16(pg, vin, const_0), vin, svmul_f16_z(pg, va, svsub_f16_z(pg, svexp_f16_z(pg, vin), const_1))); - break; - case ActivationLayerInfo::ActivationFunction::SQRT: - tmp = svsqrt_f16_z(pg, vin); - break; - case ActivationLayerInfo::ActivationFunction::SQUARE: - tmp = svmul_f16_z(pg, vin, vin); - break; - case ActivationLayerInfo::ActivationFunction::TANH: - tmp = svmul_f16_z(pg, va, svtanh_f16_z(pg, svmul_f16_z(pg, vb, vin))); - break; - case ActivationLayerInfo::ActivationFunction::IDENTITY: - tmp = vin; - break; - case ActivationLayerInfo::ActivationFunction::HARD_SWISH: - tmp = svmul_f16_z(pg, vin, svmul_f16_z(pg, const_inv_6, svmin_f16_z(pg, const_6, svmax_f16_z(pg, const_0, svadd_f16_z(pg, vin, const_3))))); - break; - default: - ARM_COMPUTE_ERROR("Unsupported activation function"); - } - svst1_f16(pg, output_ptr + x, tmp); - - x += svcnth(); - pg = svwhilelt_b16(x, window_end_x); - - } - while(svptest_any(svptrue_b16(), pg)); - }, - input, output); -} -} // namespace cpu -} // namespace arm_compute -#endif // __ARM_FEATURE_SVE \ No newline at end of file diff --git a/src/core/cpu/kernels/activation/SVE/fp32.cpp b/src/core/cpu/kernels/activation/SVE/fp32.cpp deleted file mode 100644 index 75f9f8a4c3..0000000000 --- a/src/core/cpu/kernels/activation/SVE/fp32.cpp +++ /dev/null @@ -1,131 +0,0 @@ -/* - * Copyright (c) 2020-2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
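// Illustrative sketch, not part of this patch: the SVE activation kernels above avoid a scalar
// tail loop by recomputing the governing predicate every iteration; inactive lanes are masked
// off by the _z (zeroing) intrinsics and the predicated store. A minimal fp32 ReLU with an
// assumed function name, using only intrinsics that also appear in the kernels above.
#if defined(__ARM_FEATURE_SVE)
#include <arm_sve.h>

static void relu_f32_sve_sketch(const float *in, float *out, int len)
{
    const svfloat32_t zero = svdup_n_f32(0.f);
    int      x  = 0;
    svbool_t pg = svwhilelt_b32(x, len);     // active lanes cover x .. min(x + svcntw(), len)
    do
    {
        const svfloat32_t v = svld1_f32(pg, in + x);
        svst1_f32(pg, out + x, svmax_f32_z(pg, zero, v));
        x += svcntw();                       // advance by the hardware vector length
        pg = svwhilelt_b32(x, len);
    }
    while(svptest_any(svptrue_b32(), pg));
}
#endif // __ARM_FEATURE_SVE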
- */ -#include "arm_compute/core/Helpers.h" -#include "arm_compute/core/ITensorPack.h" -#include "arm_compute/core/Window.h" -#include "src/core/NEON/SVEMath.h" - -#include -#include - -#if defined(__ARM_FEATURE_SVE) -#include - -namespace arm_compute -{ -namespace cpu -{ -void fp32_sve_activation(const ITensor *src, ITensor *dst, const ActivationLayerInfo &act_info, const Window &window) -{ - const auto window_start_x = static_cast(window.x().start()); - const auto window_end_x = static_cast(window.x().end()); - const ActivationLayerInfo::ActivationFunction act = act_info.activation(); - - Window win_collapsed = window.collapse_if_possible(window, Window::DimZ); - win_collapsed.set(Window::DimX, Window::Dimension(0, 1, 1)); - - Iterator input(src, win_collapsed); - Iterator output(dst, win_collapsed); - - const auto const_1 = svdup_n_f32(1.f); - const auto const_0 = svdup_n_f32(0.f); - const auto const_6 = svdup_n_f32(6.f); - const auto const_3 = svdup_n_f32(3.f); - const auto const_inv_6 = svdup_n_f32(0.166666667f); - - const auto va = svdup_n_f32(act_info.a()); - const auto vb = svdup_n_f32(act_info.b()); - execute_window_loop(win_collapsed, [&](const Coordinates &) - { - const auto input_ptr = reinterpret_cast(input.ptr()); - const auto output_ptr = reinterpret_cast(output.ptr()); - - svfloat32_t tmp; - - // Compute S elements per iteration - int x = window_start_x; - svbool_t pg = svwhilelt_b32(x, window_end_x); - do - { - const auto vin = svld1_f32(pg, input_ptr + x); - switch(act) - { - case ActivationLayerInfo::ActivationFunction::ABS: - tmp = svabs_f32_z(pg, vin); - break; - case ActivationLayerInfo::ActivationFunction::LINEAR: - tmp = svmla_f32_z(pg, vb, va, vin); - break; - case ActivationLayerInfo::ActivationFunction::LOGISTIC: - tmp = svinv_f32_z(pg, svadd_f32_z(pg, const_1, svexp_f32_z(pg, svneg_f32_z(pg, vin)))); - break; - case ActivationLayerInfo::ActivationFunction::RELU: - tmp = svmax_f32_z(pg, const_0, vin); - break; - case ActivationLayerInfo::ActivationFunction::BOUNDED_RELU: - tmp = svmin_f32_z(pg, va, svmax_f32_z(pg, const_0, vin)); - break; - case ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU: - tmp = svmin_f32_z(pg, va, svmax_f32_z(pg, vb, vin)); - break; - case ActivationLayerInfo::ActivationFunction::LEAKY_RELU: - tmp = svadd_f32_z(pg, svmul_f32_z(pg, svmin_f32_z(pg, vin, const_0), va), svmax_f32_z(pg, vin, const_0)); - break; - case ActivationLayerInfo::ActivationFunction::SOFT_RELU: - tmp = svlog_f32_z(pg, svadd_f32_z(pg, const_1, svexp_f32_z(pg, vin))); - break; - case ActivationLayerInfo::ActivationFunction::ELU: - tmp = svsel_f32(svcmpgt_f32(pg, vin, const_0), vin, svmul_f32_z(pg, va, svsub_f32_z(pg, svexp_f32_z(pg, vin), const_1))); - break; - case ActivationLayerInfo::ActivationFunction::SQRT: - tmp = svsqrt_f32_z(pg, vin); - break; - case ActivationLayerInfo::ActivationFunction::SQUARE: - tmp = svmul_f32_z(pg, vin, vin); - break; - case ActivationLayerInfo::ActivationFunction::TANH: - tmp = svmul_f32_z(pg, va, svtanh_f32_z(pg, svmul_f32_z(pg, vb, vin))); - break; - case ActivationLayerInfo::ActivationFunction::IDENTITY: - tmp = vin; - break; - case ActivationLayerInfo::ActivationFunction::HARD_SWISH: - tmp = svmul_f32_z(pg, vin, svmul_f32_z(pg, const_inv_6, svmin_f32_z(pg, const_6, svmax_f32_z(pg, const_0, svadd_f32_z(pg, vin, const_3))))); - break; - default: - ARM_COMPUTE_ERROR("Unsupported activation function"); - } - svst1_f32(pg, output_ptr + x, tmp); - - x += svcntw(); - pg = svwhilelt_b32(x, window_end_x); - - } - 
while(svptest_any(svptrue_b32(), pg)); - }, - input, output); -} -} // namespace cpu -} // namespace arm_compute -#endif // __ARM_FEATURE_SVE \ No newline at end of file diff --git a/src/core/cpu/kernels/activation/SVE/qasymm8.cpp b/src/core/cpu/kernels/activation/SVE/qasymm8.cpp deleted file mode 100644 index 228b4ae530..0000000000 --- a/src/core/cpu/kernels/activation/SVE/qasymm8.cpp +++ /dev/null @@ -1,254 +0,0 @@ -/* - * Copyright (c) 2020-2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ - -#include "arm_compute/core/Helpers.h" -#include "arm_compute/core/Window.h" - -#include -#include - -#if defined(__ARM_FEATURE_SVE2) -#include "src/core/NEON/SVEAsymm.h" -#include "src/core/NEON/SVEMath.h" -#include - -namespace arm_compute -{ -namespace cpu -{ -void qasymm8_sve_activation(const ITensor *src, ITensor *dst, const ActivationLayerInfo &act_info, const Window &window) -{ - const auto window_start_x = static_cast(window.x().start()); - const auto window_end_x = static_cast(window.x().end()); - const ActivationLayerInfo::ActivationFunction act = act_info.activation(); - - Window win_collapsed = window.collapse_if_possible(window, Window::DimZ); - win_collapsed.set(Window::DimX, Window::Dimension(0, 1, 1)); - - Iterator input(src, win_collapsed); - Iterator output(dst, win_collapsed); - - const UniformQuantizationInfo qi_in = src->info()->quantization_info().uniform(); - const UniformQuantizationInfo qi_out = dst->info()->quantization_info().uniform(); - const auto va = svdup_n_u8(quantize_qasymm8(act_info.a(), qi_in)); - const auto vb = svdup_n_u8(quantize_qasymm8(act_info.b(), qi_in)); - const auto const_0 = quantize_qasymm8(0.f, qi_in); - const auto vconst_0 = svdup_n_u8(const_0); - const auto vconst_1 = svdup_n_f32(1.f); - const auto va_f32 = svdup_n_f32(act_info.a()); - const auto vb_f32 = svdup_n_f32(act_info.b()); - const auto const_6_f32 = svdup_n_f32(6.f); - const auto const_0_f32 = svdup_n_f32(0.f); - const auto const_3_f32 = svdup_n_f32(3.f); - const auto const_inv_6_f32 = svdup_n_f32(0.166666667f); - - // Initialise scale/offset for re-quantization - bool requant = true; - if(qi_in.scale == qi_out.scale && qi_in.offset == qi_out.offset) - { - requant = false; - } - float s = qi_in.scale / qi_out.scale; - float o = -qi_in.offset * s + qi_out.offset; - auto vs = svdup_n_f32(s); - auto vo = svdup_n_f32(o); - - // Initialise scale/offset for re-quantization with int32_t - const auto voffset_in = 
svdup_n_s32(qi_in.offset); - int32_t s_s32 = round(s * (1 << 8), arm_compute::RoundingPolicy::TO_NEAREST_EVEN); - int32_t o_s32 = round(o * (1 << 8), arm_compute::RoundingPolicy::TO_NEAREST_EVEN); - const auto vs_s32 = svdup_n_s32(s_s32); - const auto vo_s32 = svdup_n_s32(o_s32); - - // Initialise scale/offset for re-quantization for leaky relu - int32_t s_leaky_s32 = round(s * act_info.a() * (1 << 8), arm_compute::RoundingPolicy::TO_NEAREST_EVEN); - int32_t o_leaky_s32 = round((-qi_in.offset * s * act_info.a() + qi_out.offset) * (1 << 8), - arm_compute::RoundingPolicy::TO_NEAREST_EVEN); - const auto vs_leaky_s32 = svdup_n_s32(s_leaky_s32); - const auto vo_leaky_s32 = svdup_n_s32(o_leaky_s32); - - execute_window_loop(win_collapsed, [&](const Coordinates &) - { - const auto input_ptr = reinterpret_cast(input.ptr()); - const auto output_ptr = reinterpret_cast(output.ptr()); - - svuint8_t tmp; - - int x = window_start_x; - svbool_t pg = svwhilelt_b8(x, window_end_x); - do - { - const auto vin = svld1_u8(pg, input_ptr + x); - if(act == ActivationLayerInfo::ActivationFunction::RELU) - { - // Perform activation - tmp = svmax_u8_z(pg, vconst_0, vin); - // Re-quantize to new output space - tmp = requant ? svmla_qasymm8_z(pg, tmp, vs, vo) : tmp; - } - else if(act == ActivationLayerInfo::ActivationFunction::BOUNDED_RELU) - { - // Perform activation - tmp = svmin_u8_z(pg, va, svmax_u8_z(pg, vconst_0, vin)); - // Re-quantize to new output space - tmp = requant ? svmla_qasymm8_z(pg, tmp, vs, vo) : tmp; - } - else if(act == ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU) - { - // Perform activation - tmp = svmin_u8_z(pg, va, svmax_u8_z(pg, vb, vin)); - // Re-quantize to new output space - tmp = svmla_qasymm8_z(pg, tmp, vs, vo); - } - else if(act == ActivationLayerInfo::ActivationFunction::LOGISTIC) - { - // De-quantize - const auto vin_deq = svdequantize_z(pg, vin, qi_in); - // Perform activation - const svfloat32x4_t tmp_dep = - { - { { - svdiv_f32_z(pg, vconst_1, svadd_f32_z(pg, vconst_1, svexp_f32_z(pg, svneg_f32_z(pg, svget4_f32(vin_deq, 0))))), - svdiv_f32_z(pg, vconst_1, svadd_f32_z(pg, vconst_1, svexp_f32_z(pg, svneg_f32_z(pg, svget4_f32(vin_deq, 1))))), - svdiv_f32_z(pg, vconst_1, svadd_f32_z(pg, vconst_1, svexp_f32_z(pg, svneg_f32_z(pg, svget4_f32(vin_deq, 2))))), - svdiv_f32_z(pg, vconst_1, svadd_f32_z(pg, vconst_1, svexp_f32_z(pg, svneg_f32_z(pg, svget4_f32(vin_deq, 3))))), - } - } - }; - // Re-quantize to new output space - tmp = svquantize_z(pg, tmp_dep, qi_out); - } - else if(act == ActivationLayerInfo::ActivationFunction::TANH) - { - // De-quantize - const auto vin_deq = svdequantize_z(pg, vin, qi_in); - // Perform activation - const svfloat32x4_t tmp_dep = - { - { { - svmul_f32_z(pg, va_f32, svtanh_f32_z(pg, svmul_f32_z(pg, svget4_f32(vin_deq, 0), vb_f32))), - svmul_f32_z(pg, va_f32, svtanh_f32_z(pg, svmul_f32_z(pg, svget4_f32(vin_deq, 1), vb_f32))), - svmul_f32_z(pg, va_f32, svtanh_f32_z(pg, svmul_f32_z(pg, svget4_f32(vin_deq, 2), vb_f32))), - svmul_f32_z(pg, va_f32, svtanh_f32_z(pg, svmul_f32_z(pg, svget4_f32(vin_deq, 3), vb_f32))), - } - } - }; - // Re-quantize to new output space - tmp = svquantize_z(pg, tmp_dep, qi_out); - } - else if(act == ActivationLayerInfo::ActivationFunction::HARD_SWISH) - { - // De-quantize - const auto vin_deq = svdequantize_z(pg, vin, qi_in); - // Perform activation - const svfloat32x4_t tmp_dep = - { - { { - svmul_f32_z(pg, svget4_f32(vin_deq, 0), svmul_f32_z(pg, const_inv_6_f32, svmin_f32_z(pg, const_6_f32, svmax_f32_z(pg, const_0_f32, 
svadd_f32_z(pg, svget4_f32(vin_deq, 0), const_3_f32))))), - svmul_f32_z(pg, svget4_f32(vin_deq, 1), svmul_f32_z(pg, const_inv_6_f32, svmin_f32_z(pg, const_6_f32, svmax_f32_z(pg, const_0_f32, svadd_f32_z(pg, svget4_f32(vin_deq, 1), const_3_f32))))), - svmul_f32_z(pg, svget4_f32(vin_deq, 2), svmul_f32_z(pg, const_inv_6_f32, svmin_f32_z(pg, const_6_f32, svmax_f32_z(pg, const_0_f32, svadd_f32_z(pg, svget4_f32(vin_deq, 2), const_3_f32))))), - svmul_f32_z(pg, svget4_f32(vin_deq, 3), svmul_f32_z(pg, const_inv_6_f32, svmin_f32_z(pg, const_6_f32, svmax_f32_z(pg, const_0_f32, svadd_f32_z(pg, svget4_f32(vin_deq, 3), const_3_f32))))), - } - } - }; - // Re-quantize to new output space - tmp = svquantize_z(pg, tmp_dep, qi_out); - } - else if(act == ActivationLayerInfo::ActivationFunction::LEAKY_RELU) - { - svbool_t p0, p1, p2, p3; - svint32x4_t tmp_dep; - - // Expand to int32 - const svint32x4_t vin_s32 = - { - { { - svreinterpret_s32_u32(svmovlb_u32(svmovlb_u16(vin))), - svreinterpret_s32_u32(svmovlt_u32(svmovlb_u16(vin))), - svreinterpret_s32_u32(svmovlb_u32(svmovlt_u16(vin))), - svreinterpret_s32_u32(svmovlt_u32(svmovlt_u16(vin))), - } - } - }; - - // Compare elements to input offset - if(qi_in.scale >= 0) - { - p0 = svcmplt_s32(pg, svget4_s32(vin_s32, 0), voffset_in); - p1 = svcmplt_s32(pg, svget4_s32(vin_s32, 1), voffset_in); - p2 = svcmplt_s32(pg, svget4_s32(vin_s32, 2), voffset_in); - p3 = svcmplt_s32(pg, svget4_s32(vin_s32, 3), voffset_in); - } - else - { - p0 = svcmpgt_s32(pg, svget4_s32(vin_s32, 0), voffset_in); - p1 = svcmpgt_s32(pg, svget4_s32(vin_s32, 1), voffset_in); - p2 = svcmpgt_s32(pg, svget4_s32(vin_s32, 2), voffset_in); - p3 = svcmpgt_s32(pg, svget4_s32(vin_s32, 3), voffset_in); - } - - // Multiply negative elements and requantize if necessary - if(requant) - { - tmp_dep = svcreate4_s32( - svasr_n_s32_m(pg, svmla_s32_m(pg, svsel(p0, vo_leaky_s32, vo_s32), svget4_s32(vin_s32, 0), svsel(p0, vs_leaky_s32, vs_s32)), 8), - svasr_n_s32_m(pg, svmla_s32_m(pg, svsel(p1, vo_leaky_s32, vo_s32), svget4_s32(vin_s32, 1), svsel(p1, vs_leaky_s32, vs_s32)), 8), - svasr_n_s32_m(pg, svmla_s32_m(pg, svsel(p2, vo_leaky_s32, vo_s32), svget4_s32(vin_s32, 2), svsel(p2, vs_leaky_s32, vs_s32)), 8), - svasr_n_s32_m(pg, svmla_s32_m(pg, svsel(p3, vo_leaky_s32, vo_s32), svget4_s32(vin_s32, 3), svsel(p3, vs_leaky_s32, vs_s32)), 8)); - } - else - { - tmp_dep = svcreate4_s32( - svasr_n_s32_m(p0, svmad_s32_m(p0, svget4_s32(vin_s32, 0), vs_leaky_s32, vo_leaky_s32), 8), - svasr_n_s32_m(p1, svmad_s32_m(p1, svget4_s32(vin_s32, 1), vs_leaky_s32, vo_leaky_s32), 8), - svasr_n_s32_m(p2, svmad_s32_m(p2, svget4_s32(vin_s32, 2), vs_leaky_s32, vo_leaky_s32), 8), - svasr_n_s32_m(p3, svmad_s32_m(p3, svget4_s32(vin_s32, 3), vs_leaky_s32, vo_leaky_s32), 8)); - } - - // Convert uint32 vectors to uint16 vectors (with saturation) - const auto v_low_u16 = svqxtunt_s32(svqxtunb_s32(svget4_s32(tmp_dep, 0)), svget4_s32(tmp_dep, 1)); - const auto v_high_u16 = svqxtunt_s32(svqxtunb_s32(svget4_s32(tmp_dep, 2)), svget4_s32(tmp_dep, 3)); - - // convert uint16 vectors to uint8 vectors (with saturation) - tmp = svqxtnt_u16(svqxtnb_u16(v_low_u16), v_high_u16); - } - else - { - ARM_COMPUTE_ERROR("Unsupported activation function"); - } - - svst1_u8(pg, output_ptr + x, tmp); - - x += svcntb(); - pg = svwhilelt_b8(x, window_end_x); - - } - while(svptest_any(svptrue_b8(), pg)); - - }, - input, output); -} -} // namespace cpu -} // namespace arm_compute -#endif /* defined(__ARM_FEATURE_SVE2) */ \ No newline at end of file diff --git 
a/src/core/cpu/kernels/activation/SVE/qasymm8_signed.cpp b/src/core/cpu/kernels/activation/SVE/qasymm8_signed.cpp deleted file mode 100644 index 989f825eb9..0000000000 --- a/src/core/cpu/kernels/activation/SVE/qasymm8_signed.cpp +++ /dev/null @@ -1,253 +0,0 @@ -/* - * Copyright (c) 2020-2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "arm_compute/core/Helpers.h" -#include "arm_compute/core/Window.h" -#include "src/core/NEON/wrapper/wrapper.h" - -#include -#include - -#if defined(__ARM_FEATURE_SVE2) -#include "src/core/NEON/SVEAsymm.h" -#include "src/core/NEON/SVEMath.h" -#include - -namespace arm_compute -{ -namespace cpu -{ -void qasymm8_signed_sve_activation(const ITensor *src, ITensor *dst, const ActivationLayerInfo &act_info, const Window &window) -{ - const auto window_start_x = static_cast(window.x().start()); - const auto window_end_x = static_cast(window.x().end()); - const ActivationLayerInfo::ActivationFunction act = act_info.activation(); - - Window win_collapsed = window.collapse_if_possible(window, Window::DimZ); - win_collapsed.set(Window::DimX, Window::Dimension(0, 1, 1)); - - Iterator input(src, win_collapsed); - Iterator output(dst, win_collapsed); - - const UniformQuantizationInfo qi_in = src->info()->quantization_info().uniform(); - const UniformQuantizationInfo qi_out = dst->info()->quantization_info().uniform(); - const auto va = svdup_n_s8(quantize_qasymm8_signed(act_info.a(), qi_in)); - const auto vb = svdup_n_s8(quantize_qasymm8_signed(act_info.b(), qi_in)); - const auto const_0 = quantize_qasymm8_signed(0.f, qi_in); - const auto vconst_0 = svdup_n_s8(const_0); - const auto vconst_1 = svdup_n_f32(1.f); - const auto va_f32 = svdup_n_f32(act_info.a()); - const auto vb_f32 = svdup_n_f32(act_info.b()); - const auto const_6_f32 = svdup_n_f32(6.f); - const auto const_0_f32 = svdup_n_f32(0.f); - const auto const_3_f32 = svdup_n_f32(3.f); - const auto const_inv_6_f32 = svdup_n_f32(0.166666667f); - - // Initialise scale/offset for re-quantization - bool requant = true; - if(qi_in.scale == qi_out.scale && qi_in.offset == qi_out.offset) - { - requant = false; - } - float s = qi_in.scale / qi_out.scale; - float o = -qi_in.offset * s + qi_out.offset; - auto vs = svdup_n_f32(s); - auto vo = svdup_n_f32(o); - - // Initialise scale/offset for re-quantization with int32_t - const auto voffset_in = svdup_n_s32(qi_in.offset); - int32_t s_s32 = round(s * (1 << 8), 
arm_compute::RoundingPolicy::TO_NEAREST_EVEN); - int32_t o_s32 = round(o * (1 << 8), arm_compute::RoundingPolicy::TO_NEAREST_EVEN); - const auto vs_s32 = svdup_n_s32(s_s32); - const auto vo_s32 = svdup_n_s32(o_s32); - - // Initialise scale/offset for re-quantization for leaky relu - int32_t s_leaky_s32 = round(s * act_info.a() * (1 << 8), arm_compute::RoundingPolicy::TO_NEAREST_EVEN); - int32_t o_leaky_s32 = round((-qi_in.offset * s * act_info.a() + qi_out.offset) * (1 << 8), - arm_compute::RoundingPolicy::TO_NEAREST_EVEN); - const auto vs_leaky_s32 = svdup_n_s32(s_leaky_s32); - const auto vo_leaky_s32 = svdup_n_s32(o_leaky_s32); - - execute_window_loop(win_collapsed, [&](const Coordinates &) - { - const auto input_ptr = reinterpret_cast(input.ptr()); - const auto output_ptr = reinterpret_cast(output.ptr()); - - svint8_t tmp; - - int x = window_start_x; - svbool_t pg = svwhilelt_b8(x, window_end_x); - do - { - const auto vin = svld1_s8(pg, input_ptr + x); - if(act == ActivationLayerInfo::ActivationFunction::RELU) - { - // Perform activation - tmp = svmax_s8_z(pg, vconst_0, vin); - // Re-quantize to new output space - tmp = requant ? svmla_qasymm8_signed_z(pg, tmp, vs, vo) : tmp; - } - else if(act == ActivationLayerInfo::ActivationFunction::BOUNDED_RELU) - { - // Perform activation - tmp = svmin_s8_z(pg, va, svmax_s8_z(pg, vconst_0, vin)); - // Re-quantize to new output space - tmp = requant ? svmla_qasymm8_signed_z(pg, tmp, vs, vo) : tmp; - } - else if(act == ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU) - { - // Perform activation - tmp = svmin_s8_z(pg, va, svmax_s8_z(pg, vb, vin)); - // Re-quantize to new output space - tmp = requant ? svmla_qasymm8_signed_z(pg, tmp, vs, vo) : tmp; - } - else if(act == ActivationLayerInfo::ActivationFunction::LOGISTIC) - { - // De-quantize - const auto vin_deq = svdequantize_z(pg, vin, qi_in); - // Perform activation - const svfloat32x4_t tmp_dep = - { - { { - svdiv_f32_z(pg, vconst_1, svadd_f32_z(pg, vconst_1, svexp_f32_z(pg, svneg_f32_z(pg, svget4_f32(vin_deq, 0))))), - svdiv_f32_z(pg, vconst_1, svadd_f32_z(pg, vconst_1, svexp_f32_z(pg, svneg_f32_z(pg, svget4_f32(vin_deq, 1))))), - svdiv_f32_z(pg, vconst_1, svadd_f32_z(pg, vconst_1, svexp_f32_z(pg, svneg_f32_z(pg, svget4_f32(vin_deq, 2))))), - svdiv_f32_z(pg, vconst_1, svadd_f32_z(pg, vconst_1, svexp_f32_z(pg, svneg_f32_z(pg, svget4_f32(vin_deq, 3))))), - } - } - }; - // Re-quantize to new output space - tmp = svquantize_signed_z(pg, tmp_dep, qi_out); - } - else if(act == ActivationLayerInfo::ActivationFunction::TANH) - { - // De-quantize - const auto vin_deq = svdequantize_z(pg, vin, qi_in); - // Perform activation - const svfloat32x4_t tmp_dep = - { - { { - svmul_f32_z(pg, va_f32, svtanh_f32_z(pg, svmul_f32_z(pg, svget4_f32(vin_deq, 0), vb_f32))), - svmul_f32_z(pg, va_f32, svtanh_f32_z(pg, svmul_f32_z(pg, svget4_f32(vin_deq, 1), vb_f32))), - svmul_f32_z(pg, va_f32, svtanh_f32_z(pg, svmul_f32_z(pg, svget4_f32(vin_deq, 2), vb_f32))), - svmul_f32_z(pg, va_f32, svtanh_f32_z(pg, svmul_f32_z(pg, svget4_f32(vin_deq, 3), vb_f32))), - } - } - }; - // Re-quantize to new output space - tmp = svquantize_signed_z(pg, tmp_dep, qi_out); - } - else if(act == ActivationLayerInfo::ActivationFunction::HARD_SWISH) - { - // De-quantize - const auto vin_deq = svdequantize_z(pg, vin, qi_in); - // Perform activation - const svfloat32x4_t tmp_dep = - { - { { - svmul_f32_z(pg, svget4_f32(vin_deq, 0), svmul_f32_z(pg, const_inv_6_f32, svmin_f32_z(pg, const_6_f32, svmax_f32_z(pg, const_0_f32, svadd_f32_z(pg, 
svget4_f32(vin_deq, 0), const_3_f32))))), - svmul_f32_z(pg, svget4_f32(vin_deq, 1), svmul_f32_z(pg, const_inv_6_f32, svmin_f32_z(pg, const_6_f32, svmax_f32_z(pg, const_0_f32, svadd_f32_z(pg, svget4_f32(vin_deq, 1), const_3_f32))))), - svmul_f32_z(pg, svget4_f32(vin_deq, 2), svmul_f32_z(pg, const_inv_6_f32, svmin_f32_z(pg, const_6_f32, svmax_f32_z(pg, const_0_f32, svadd_f32_z(pg, svget4_f32(vin_deq, 2), const_3_f32))))), - svmul_f32_z(pg, svget4_f32(vin_deq, 3), svmul_f32_z(pg, const_inv_6_f32, svmin_f32_z(pg, const_6_f32, svmax_f32_z(pg, const_0_f32, svadd_f32_z(pg, svget4_f32(vin_deq, 3), const_3_f32))))), - } - } - }; - // Re-quantize to new output space - tmp = svquantize_signed_z(pg, tmp_dep, qi_out); - } - else if(act == ActivationLayerInfo::ActivationFunction::LEAKY_RELU) - { - svbool_t p0, p1, p2, p3; - svint32x4_t tmp_dep; - - // Expand to int32 - const svint32x4_t vin_s32 = - { - { { - svmovlb_s32(svmovlb_s16(vin)), - svmovlt_s32(svmovlb_s16(vin)), - svmovlb_s32(svmovlt_s16(vin)), - svmovlt_s32(svmovlt_s16(vin)), - } - } - }; - - // Compare elements to input offset - if(qi_in.scale >= 0) - { - p0 = svcmplt_s32(pg, svget4_s32(vin_s32, 0), voffset_in); - p1 = svcmplt_s32(pg, svget4_s32(vin_s32, 1), voffset_in); - p2 = svcmplt_s32(pg, svget4_s32(vin_s32, 2), voffset_in); - p3 = svcmplt_s32(pg, svget4_s32(vin_s32, 3), voffset_in); - } - else - { - p0 = svcmpgt_s32(pg, svget4_s32(vin_s32, 0), voffset_in); - p1 = svcmpgt_s32(pg, svget4_s32(vin_s32, 1), voffset_in); - p2 = svcmpgt_s32(pg, svget4_s32(vin_s32, 2), voffset_in); - p3 = svcmpgt_s32(pg, svget4_s32(vin_s32, 3), voffset_in); - } - - // Multiply negative elements and requantize if necessary - if(requant) - { - tmp_dep = svcreate4_s32( - svasr_n_s32_m(pg, svmla_s32_m(pg, svsel(p0, vo_leaky_s32, vo_s32), svget4_s32(vin_s32, 0), svsel(p0, vs_leaky_s32, vs_s32)), 8), - svasr_n_s32_m(pg, svmla_s32_m(pg, svsel(p1, vo_leaky_s32, vo_s32), svget4_s32(vin_s32, 1), svsel(p1, vs_leaky_s32, vs_s32)), 8), - svasr_n_s32_m(pg, svmla_s32_m(pg, svsel(p2, vo_leaky_s32, vo_s32), svget4_s32(vin_s32, 2), svsel(p2, vs_leaky_s32, vs_s32)), 8), - svasr_n_s32_m(pg, svmla_s32_m(pg, svsel(p3, vo_leaky_s32, vo_s32), svget4_s32(vin_s32, 3), svsel(p3, vs_leaky_s32, vs_s32)), 8)); - } - else - { - tmp_dep = svcreate4_s32( - svasr_n_s32_m(p0, svmad_s32_m(p0, svget4_s32(vin_s32, 0), vs_leaky_s32, vo_leaky_s32), 8), - svasr_n_s32_m(p1, svmad_s32_m(p1, svget4_s32(vin_s32, 1), vs_leaky_s32, vo_leaky_s32), 8), - svasr_n_s32_m(p2, svmad_s32_m(p2, svget4_s32(vin_s32, 2), vs_leaky_s32, vo_leaky_s32), 8), - svasr_n_s32_m(p3, svmad_s32_m(p3, svget4_s32(vin_s32, 3), vs_leaky_s32, vo_leaky_s32), 8)); - } - - // Convert uint32 vectors to uint16 vectors (with saturation) - const auto v_low_s16 = svqxtnt_s32(svqxtnb_s32(svget4_s32(tmp_dep, 0)), svget4_s32(tmp_dep, 1)); - const auto v_high_s16 = svqxtnt_s32(svqxtnb_s32(svget4_s32(tmp_dep, 2)), svget4_s32(tmp_dep, 3)); - - // convert uint16 vectors to uint8 vectors (with saturation) - tmp = svqxtnt_s16(svqxtnb_s16(v_low_s16), v_high_s16); - } - else - { - ARM_COMPUTE_ERROR("Unsupported activation function"); - } - - svst1_s8(pg, output_ptr + x, tmp); - - x += svcntb(); - pg = svwhilelt_b8(x, window_end_x); - - } - while(svptest_any(svptrue_b8(), pg)); - }, - input, output); -} -} // namespace cpu -} // namespace arm_compute -#endif /* defined(__ARM_FEATURE_SVE2) */ diff --git a/src/core/cpu/kernels/activation/SVE/qsymm16.cpp b/src/core/cpu/kernels/activation/SVE/qsymm16.cpp deleted file mode 100644 index 66974875da..0000000000 
--- a/src/core/cpu/kernels/activation/SVE/qsymm16.cpp +++ /dev/null @@ -1,120 +0,0 @@ -/* - * Copyright (c) 2020-2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "arm_compute/core/Helpers.h" -#include "arm_compute/core/ITensorPack.h" -#include "arm_compute/core/Window.h" -#include "arm_compute/core/experimental/Types.h" - -#include -#include - -#if defined(__ARM_FEATURE_SVE2) -#include "src/core/NEON/SVEMath.h" -#include "src/core/NEON/SVESymm.h" -#include - -namespace arm_compute -{ -namespace cpu -{ -void qsymm16_sve_activation(const ITensor *src, ITensor *dst, const ActivationLayerInfo &act_info, const Window &window) -{ - const auto window_start_x = static_cast(window.x().start()); - const auto window_end_x = static_cast(window.x().end()); - const ActivationLayerInfo::ActivationFunction act = act_info.activation(); - - Window win_collapsed = window.collapse_if_possible(window, Window::DimZ); - win_collapsed.set(Window::DimX, Window::Dimension(0, 1, 1)); - - Iterator input(src, win_collapsed); - Iterator output(dst, win_collapsed); - - const UniformQuantizationInfo qi_in = src->info()->quantization_info().uniform(); - const UniformQuantizationInfo qi_out = dst->info()->quantization_info().uniform(); - const auto vconst_1 = svdup_n_f32(1.f); - const auto va_f32 = svdup_n_f32(act_info.a()); - const auto vb_f32 = svdup_n_f32(act_info.b()); - - execute_window_loop(win_collapsed, [&](const Coordinates &) - { - const auto input_ptr = reinterpret_cast(input.ptr()); - const auto output_ptr = reinterpret_cast(output.ptr()); - - svint16_t tmp; - - int x = window_start_x; - svbool_t pg = svwhilelt_b16(x, window_end_x); - do - { - const auto vin = svld1_s16(pg, input_ptr + x); - if(act == ActivationLayerInfo::ActivationFunction::LOGISTIC) - { - // De-quantize - auto vin_deq = svdequantize_qsymm16_z(pg, vin, qi_in.scale); - // Perform activation - const svfloat32x2_t tmp_dep = - { - { { - svdiv_f32_z(pg, vconst_1, svadd_f32_z(pg, vconst_1, svexp_f32_z(pg, svneg_f32_z(pg, svget2_f32(vin_deq, 0))))), - svdiv_f32_z(pg, vconst_1, svadd_f32_z(pg, vconst_1, svexp_f32_z(pg, svneg_f32_z(pg, svget2_f32(vin_deq, 1))))), - } - } - }; - // Re-quantize to new output space - tmp = svquantize_qsymm16_z(pg, tmp_dep, qi_out.scale); - } - else if(act == ActivationLayerInfo::ActivationFunction::TANH) - { - // De-quantize - auto vin_deq = svdequantize_qsymm16_z(pg, vin, qi_in.scale); - // Perform activation - const 
svfloat32x2_t tmp_dep = - { - { { - svmul_f32_z(pg, va_f32, svtanh_f32_z(pg, svmul_f32_z(pg, svget2_f32(vin_deq, 0), vb_f32))), - svmul_f32_z(pg, va_f32, svtanh_f32_z(pg, svmul_f32_z(pg, svget2_f32(vin_deq, 1), vb_f32))), - } - } - }; - // Re-quantize to new output space - tmp = svquantize_qsymm16_z(pg, tmp_dep, qi_out.scale); - } - else - { - ARM_COMPUTE_ERROR("Unsupported activation function"); - } - - svst1_s16(pg, output_ptr + x, tmp); - - x += svcnth(); - pg = svwhilelt_b16(x, window_end_x); - - } - while(svptest_any(svptrue_b16(), pg)); - }, - input, output); -} -} // namespace cpu -} // namespace arm_compute -#endif /* defined(__ARM_FEATURE_SVE2) */ diff --git a/src/core/cpu/kernels/activation/neon/fp16.cpp b/src/core/cpu/kernels/activation/neon/fp16.cpp new file mode 100644 index 0000000000..6f2d5d8533 --- /dev/null +++ b/src/core/cpu/kernels/activation/neon/fp16.cpp @@ -0,0 +1,217 @@ +/* + * Copyright (c) 2020-2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "src/core/NEON/NEMath.h" + +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/Validate.h" +#include "src/core/NEON/wrapper/wrapper.h" + +#include +#include +#include + +#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS) + +namespace arm_compute +{ +namespace cpu +{ +namespace +{ +#ifndef __aarch64__ +inline float16x8_t mask_float_vector(const float16x8_t &in, const uint16x8_t &mask) +{ + auto int_in = vreinterpretq_u16_f16(in); + return vreinterpretq_f16_u16(wrapper::vand(int_in, mask)); +} +#endif /* __aarch64__ */ +} // namespace + +void fp16_neon_activation(const ITensor *src, ITensor *dst, const ActivationLayerInfo &act_info, const Window &window) +{ + /** SIMD vector tag type. */ + using ExactTagType = typename wrapper::traits::neon_bitvector_tag_t; + const ActivationLayerInfo::ActivationFunction act = act_info.activation(); + + constexpr int window_step_x = 8; + const auto window_start_x = static_cast(window.x().start()); + const auto window_end_x = static_cast(window.x().end()); + + Window win_collapsed = window.collapse_if_possible(window, Window::DimZ); + win_collapsed.set(Window::DimX, Window::Dimension(0, 1, 1)); + + Iterator input(src, win_collapsed); + Iterator output(dst, win_collapsed); + + // In case of non-aarch64, a small delta value is added to the input + // to prevent NAN values caused by zeros in inputs to SQRT. 
+ // In case of aarh64, we call vsqrt directly, so we don't use delta. +#ifndef __aarch64__ + const auto delta = wrapper::vdup_n(static_cast((1e-7), ExactTagType {})); +#endif /* __aarch64__ */ + + const auto const_1 = wrapper::vdup_n(static_cast(1.f), ExactTagType{}); + const auto const_0 = wrapper::vdup_n(static_cast(0.f), ExactTagType{}); + const auto const_6 = wrapper::vdup_n(static_cast(6.f), ExactTagType{}); + const auto const_3 = wrapper::vdup_n(static_cast(3.f), ExactTagType{}); + const auto const_inv_6 = wrapper::vdup_n(static_cast(0.166666667f), ExactTagType{}); + + constexpr float soft_relu_thresh = 12.f; + const auto vsoft_relu_thresh = wrapper::vdup_n(static_cast(soft_relu_thresh), ExactTagType{}); + + const auto va = wrapper::vdup_n(static_cast(act_info.a()), ExactTagType{}); + const auto vb = wrapper::vdup_n(static_cast(act_info.b()), ExactTagType{}); + const auto a = static_cast(act_info.a()); + const auto b = static_cast(act_info.b()); + execute_window_loop(win_collapsed, [&](const Coordinates &) + { + const auto input_ptr = reinterpret_cast(input.ptr()); + const auto output_ptr = reinterpret_cast(output.ptr()); + + wrapper::traits::neon_bitvector_t tmp; + + // Compute S elements per iteration + int x = window_start_x; + for(; x <= (window_end_x - window_step_x); x += window_step_x) + { + const auto vin = wrapper::vloadq(input_ptr + x); + switch(act) + { + case ActivationLayerInfo::ActivationFunction::ABS: + tmp = wrapper::vabs(vin); + break; + case ActivationLayerInfo::ActivationFunction::LINEAR: + tmp = wrapper::vmla(vb, va, vin); + break; + case ActivationLayerInfo::ActivationFunction::LOGISTIC: + tmp = wrapper::vinv(wrapper::vadd(const_1, wrapper::vexpq(wrapper::vneg(vin)))); + break; + case ActivationLayerInfo::ActivationFunction::RELU: + tmp = wrapper::vmax(const_0, vin); + break; + case ActivationLayerInfo::ActivationFunction::BOUNDED_RELU: + tmp = wrapper::vmin(va, wrapper::vmax(const_0, vin)); + break; + case ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU: + tmp = wrapper::vmin(va, wrapper::vmax(vb, vin)); + break; + case ActivationLayerInfo::ActivationFunction::LEAKY_RELU: + tmp = wrapper::vbsl(wrapper::vcgt(vin, const_0), vin, wrapper::vmul(va, vin)); + break; + case ActivationLayerInfo::ActivationFunction::SOFT_RELU: + tmp = wrapper::vbsl(wrapper::vcgt(vin, vsoft_relu_thresh), vin, wrapper::vlog(wrapper::vadd(const_1, wrapper::vexpq(vin)))); + break; + case ActivationLayerInfo::ActivationFunction::ELU: + tmp = wrapper::vbsl(wrapper::vcge(vin, const_0), vin, wrapper::vmul(va, wrapper::vsub(wrapper::vexpq(vin), const_1))); + break; + case ActivationLayerInfo::ActivationFunction::SQRT: +#ifdef __aarch64__ + tmp = wrapper::vsqrt(vin); +#else /* __aarch64__ */ + { + const auto bitmask = wrapper::vceq(vin, wrapper::vdup_n(0, ExactTagType{})); + tmp = wrapper::vinv(wrapper::vinvsqrt(wrapper::vadd(vin, mask_float_vector(delta, bitmask)))); + tmp = mask_float_vector(tmp, wrapper::vnot(bitmask)); + } +#endif /* __aarch64__ */ + break; + case ActivationLayerInfo::ActivationFunction::SQUARE: + tmp = wrapper::vmul(vin, vin); + break; + case ActivationLayerInfo::ActivationFunction::TANH: + tmp = wrapper::vmul(va, wrapper::vtanh(wrapper::vmul(vb, vin))); + break; + case ActivationLayerInfo::ActivationFunction::IDENTITY: + tmp = vin; + break; + case ActivationLayerInfo::ActivationFunction::HARD_SWISH: + tmp = wrapper::vmul(vin, wrapper::vmul(const_inv_6, wrapper::vmin(const_6, wrapper::vmax(const_0, wrapper::vadd(vin, const_3))))); + break; + default: + 
ARM_COMPUTE_ERROR("Unsupported activation function"); + } + wrapper::vstore(output_ptr + x, tmp); + } + + // Compute left-over elements + for(; x < window_end_x; ++x) + { + const float16_t in = *(reinterpret_cast(input_ptr + x)); + float16_t tmp; + switch(act) + { + case ActivationLayerInfo::ActivationFunction::ABS: + tmp = std::abs(in); + break; + case ActivationLayerInfo::ActivationFunction::LINEAR: + tmp = a * in + b; + break; + case ActivationLayerInfo::ActivationFunction::LOGISTIC: + tmp = static_cast(1) / (static_cast(1) + std::exp(-in)); + break; + case ActivationLayerInfo::ActivationFunction::RELU: + tmp = std::max(static_cast(0), in); + break; + case ActivationLayerInfo::ActivationFunction::BOUNDED_RELU: + tmp = std::min(a, std::max(static_cast(0), in)); + break; + case ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU: + tmp = std::min(a, std::max(b, in)); + break; + case ActivationLayerInfo::ActivationFunction::LEAKY_RELU: + tmp = (in > 0) ? in : a * in; + break; + case ActivationLayerInfo::ActivationFunction::SOFT_RELU: + tmp = (in > soft_relu_thresh) ? in : std::log(static_cast(1) + std::exp(in)); + break; + case ActivationLayerInfo::ActivationFunction::ELU: + tmp = (in >= 0) ? in : a * (std::exp(in) - 1); + break; + case ActivationLayerInfo::ActivationFunction::SQRT: + tmp = std::sqrt(in); + break; + case ActivationLayerInfo::ActivationFunction::SQUARE: + tmp = in * in; + break; + case ActivationLayerInfo::ActivationFunction::TANH: + tmp = a * std::tanh(b * in); + break; + case ActivationLayerInfo::ActivationFunction::IDENTITY: + tmp = in; + break; + case ActivationLayerInfo::ActivationFunction::HARD_SWISH: + tmp = in * ((std::min(std::max((in + 3), 0.0f), 6.0f)) * 0.166666667f); + break; + default: + ARM_COMPUTE_ERROR("Unsupported activation function"); + } + *(output_ptr + x) = tmp; + } + }, + input, output); +} +} // namespace cpu +} // namespace arm_compute + +#endif /* defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS) */ diff --git a/src/core/cpu/kernels/activation/neon/fp32.cpp b/src/core/cpu/kernels/activation/neon/fp32.cpp new file mode 100644 index 0000000000..54301d45ad --- /dev/null +++ b/src/core/cpu/kernels/activation/neon/fp32.cpp @@ -0,0 +1,212 @@ +/* + * Copyright (c) 2020-2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/ITensorPack.h" +#include "arm_compute/core/Window.h" +#include "src/core/NEON/NEMath.h" +#include "src/core/NEON/wrapper/wrapper.h" + +#include +#include +#include + +namespace arm_compute +{ +namespace cpu +{ +namespace +{ +#ifndef __aarch64__ +inline float32x4_t mask_float_vector(const float32x4_t &in, const uint32x4_t &mask) +{ + auto int_in = vreinterpretq_u32_f32(in); + return vreinterpretq_f32_u32(wrapper::vand(int_in, mask)); +} +#endif /* __aarch64__ */ +} // namespace + +void fp32_neon_activation(const ITensor *src, ITensor *dst, const ActivationLayerInfo &act_info, const Window &window) +{ + /** SIMD vector tag type. */ + using ExactTagType = typename arm_compute::wrapper::traits::neon_bitvector_tag_t; + + constexpr int window_step_x = 4; + const auto window_start_x = static_cast(window.x().start()); + const auto window_end_x = static_cast(window.x().end()); + const ActivationLayerInfo::ActivationFunction act = act_info.activation(); + + Window win_collapsed = window.collapse_if_possible(window, Window::DimZ); + win_collapsed.set(Window::DimX, Window::Dimension(0, 1, 1)); + + Iterator input(src, win_collapsed); + Iterator output(dst, win_collapsed); + + // In case of non-aarch64, a small delta value is added to the input + // to prevent NAN values caused by zeros in inputs to SQRT. + // In case of aarh64, we call vsqrt directly, so we don't use delta. +#ifndef __aarch64__ + const auto delta = wrapper::vdup_n(static_cast(1e-24), ExactTagType {}); +#endif /* __aarch64__ */ + const auto const_1 = wrapper::vdup_n(static_cast(1.f), ExactTagType {}); + const auto const_0 = wrapper::vdup_n(static_cast(0.f), ExactTagType{}); + const auto const_6 = wrapper::vdup_n(static_cast(6.f), ExactTagType{}); + const auto const_3 = wrapper::vdup_n(static_cast(3.f), ExactTagType{}); + const auto const_inv_6 = wrapper::vdup_n(static_cast(0.166666667f), ExactTagType{}); + + constexpr float soft_relu_thresh = 12.f; + const auto vsoft_relu_thresh = wrapper::vdup_n(static_cast(soft_relu_thresh), ExactTagType{}); + + const auto va = wrapper::vdup_n(static_cast(act_info.a()), ExactTagType{}); + const auto vb = wrapper::vdup_n(static_cast(act_info.b()), ExactTagType{}); + const auto a = static_cast(act_info.a()); + const auto b = static_cast(act_info.b()); + execute_window_loop(win_collapsed, [&](const Coordinates &) + { + const auto input_ptr = reinterpret_cast(input.ptr()); + const auto output_ptr = reinterpret_cast(output.ptr()); + + wrapper::traits::neon_bitvector_t tmp; + + // Compute S elements per iteration + int x = window_start_x; + for(; x <= (window_end_x - window_step_x); x += window_step_x) + { + const auto vin = wrapper::vloadq(input_ptr + x); + switch(act) + { + case ActivationLayerInfo::ActivationFunction::ABS: + tmp = wrapper::vabs(vin); + break; + case ActivationLayerInfo::ActivationFunction::LINEAR: + tmp = wrapper::vmla(vb, va, vin); + break; + case ActivationLayerInfo::ActivationFunction::LOGISTIC: + tmp = wrapper::vinv(wrapper::vadd(const_1, wrapper::vexpq(wrapper::vneg(vin)))); + break; + case ActivationLayerInfo::ActivationFunction::RELU: + tmp = wrapper::vmax(const_0, vin); + break; + case ActivationLayerInfo::ActivationFunction::BOUNDED_RELU: + tmp = wrapper::vmin(va, wrapper::vmax(const_0, vin)); + break; + case ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU: + tmp = wrapper::vmin(va, wrapper::vmax(vb, vin)); + break; + case ActivationLayerInfo::ActivationFunction::LEAKY_RELU: + tmp = 
wrapper::vbsl(wrapper::vcgt(vin, const_0), vin, wrapper::vmul(va, vin)); + break; + case ActivationLayerInfo::ActivationFunction::SOFT_RELU: + tmp = wrapper::vbsl(wrapper::vcgt(vin, vsoft_relu_thresh), vin, wrapper::vlog(wrapper::vadd(const_1, wrapper::vexpq(vin)))); + break; + case ActivationLayerInfo::ActivationFunction::ELU: + tmp = wrapper::vbsl(wrapper::vcge(vin, const_0), vin, wrapper::vmul(va, wrapper::vsub(wrapper::vexpq(vin), const_1))); + break; + case ActivationLayerInfo::ActivationFunction::SQRT: +#ifdef __aarch64__ + tmp = wrapper::vsqrt(vin); +#else /* __aarch64__ */ + { + const auto bitmask = wrapper::vceq(vin, wrapper::vdup_n(0.f, ExactTagType{})); + tmp = wrapper::vinv(wrapper::vinvsqrt(wrapper::vadd(vin, mask_float_vector(delta, bitmask)))); + tmp = mask_float_vector(tmp, wrapper::vnot(bitmask)); + } +#endif /* __aarch64__ */ + break; + case ActivationLayerInfo::ActivationFunction::SQUARE: + tmp = wrapper::vmul(vin, vin); + break; + case ActivationLayerInfo::ActivationFunction::TANH: + tmp = wrapper::vmul(va, wrapper::vtanh(wrapper::vmul(vb, vin))); + break; + case ActivationLayerInfo::ActivationFunction::IDENTITY: + tmp = vin; + break; + case ActivationLayerInfo::ActivationFunction::HARD_SWISH: + tmp = wrapper::vmul(vin, wrapper::vmul(const_inv_6, wrapper::vmin(const_6, wrapper::vmax(const_0, wrapper::vadd(vin, const_3))))); + break; + default: + ARM_COMPUTE_ERROR("Unsupported activation function"); + } + wrapper::vstore(output_ptr + x, tmp); + } + + // Compute left-over elements + for(; x < window_end_x; ++x) + { + const float in = *(reinterpret_cast(input_ptr + x)); + float tmp; + switch(act) + { + case ActivationLayerInfo::ActivationFunction::ABS: + tmp = std::abs(in); + break; + case ActivationLayerInfo::ActivationFunction::LINEAR: + tmp = a * in + b; + break; + case ActivationLayerInfo::ActivationFunction::LOGISTIC: + tmp = static_cast(1) / (static_cast(1) + std::exp(-in)); + break; + case ActivationLayerInfo::ActivationFunction::RELU: + tmp = std::max(static_cast(0), in); + break; + case ActivationLayerInfo::ActivationFunction::BOUNDED_RELU: + tmp = std::min(a, std::max(static_cast(0), in)); + break; + case ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU: + tmp = std::min(a, std::max(b, in)); + break; + case ActivationLayerInfo::ActivationFunction::LEAKY_RELU: + tmp = (in > 0) ? in : a * in; + break; + case ActivationLayerInfo::ActivationFunction::SOFT_RELU: + tmp = (in > soft_relu_thresh) ? in : std::log(static_cast(1) + std::exp(in)); + break; + case ActivationLayerInfo::ActivationFunction::ELU: + tmp = (in >= 0) ? 
in : a * (std::exp(in) - 1); + break; + case ActivationLayerInfo::ActivationFunction::SQRT: + tmp = std::sqrt(in); + break; + case ActivationLayerInfo::ActivationFunction::SQUARE: + tmp = in * in; + break; + case ActivationLayerInfo::ActivationFunction::TANH: + tmp = a * std::tanh(b * in); + break; + case ActivationLayerInfo::ActivationFunction::IDENTITY: + tmp = in; + break; + case ActivationLayerInfo::ActivationFunction::HARD_SWISH: + tmp = in * ((std::min(std::max((in + 3), 0.0f), 6.0f)) * 0.166666667f); + break; + default: + ARM_COMPUTE_ERROR("Unsupported activation function"); + } + *(output_ptr + x) = tmp; + } + }, + input, output); +} +} // namespace cpu +} // namespace arm_compute diff --git a/src/core/cpu/kernels/activation/neon/qasymm8.cpp b/src/core/cpu/kernels/activation/neon/qasymm8.cpp new file mode 100644 index 0000000000..a1217435b6 --- /dev/null +++ b/src/core/cpu/kernels/activation/neon/qasymm8.cpp @@ -0,0 +1,262 @@ +/* + * Copyright (c) 2020-2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/Window.h" +#include "src/core/NEON/NEAsymm.h" +#include "src/core/NEON/NEMath.h" +#include "src/core/NEON/wrapper/wrapper.h" + +#include +#include +#include + +namespace arm_compute +{ +namespace cpu +{ +void qasymm8_neon_activation(const ITensor *src, ITensor *dst, const ActivationLayerInfo &act_info, const Window &window) +{ + constexpr int window_step_x = 16; + const auto window_start_x = static_cast(window.x().start()); + const auto window_end_x = static_cast(window.x().end()); + const ActivationLayerInfo::ActivationFunction act = act_info.activation(); + + Window win_collapsed = window.collapse_if_possible(window, Window::DimZ); + win_collapsed.set(Window::DimX, Window::Dimension(0, 1, 1)); + + Iterator input(src, win_collapsed); + Iterator output(dst, win_collapsed); + + const UniformQuantizationInfo qi_in = src->info()->quantization_info().uniform(); + const UniformQuantizationInfo qi_out = dst->info()->quantization_info().uniform(); + const qasymm8x16_t va = vdupq_n_u8(quantize_qasymm8(act_info.a(), qi_in)); + const qasymm8x16_t vb = vdupq_n_u8(quantize_qasymm8(act_info.b(), qi_in)); + const qasymm8_t a = quantize_qasymm8(act_info.a(), qi_in); + const qasymm8_t b = quantize_qasymm8(act_info.b(), qi_in); + const qasymm8_t const_0 = quantize_qasymm8(0.f, qi_in); + const qasymm8x16_t vconst_0 = vdupq_n_u8(const_0); + const auto vconst_1 = vdupq_n_f32(1.f); +#ifndef __aarch64__ + const auto vconst_0_f32 = vdupq_n_f32(0); +#endif // __aarch64__ + const float32x4_t va_f32 = vdupq_n_f32(act_info.a()); + const float32x4_t vb_f32 = vdupq_n_f32(act_info.b()); + const float a_f32 = act_info.a(); + const float b_f32 = act_info.b(); + const auto const_6_f32 = vdupq_n_f32(6.f); + const auto const_0_f32 = vdupq_n_f32(0.f); + const auto const_3_f32 = vdupq_n_f32(3.f); + const auto const_inv_6_f32 = vdupq_n_f32(0.166666667f); + + // Initialise scale/offset for re-quantization + float s = qi_in.scale / qi_out.scale; + float o = -qi_in.offset * s + qi_out.offset; + float32x4_t vs = vdupq_n_f32(s); + float32x4_t vo = vdupq_n_f32(o); + + execute_window_loop(win_collapsed, [&](const Coordinates &) + { + const auto input_ptr = reinterpret_cast(input.ptr()); + const auto output_ptr = reinterpret_cast(output.ptr()); + + wrapper::traits::neon_bitvector_t tmp; + + // Compute S elements per iteration + int x = window_start_x; + for(; x <= (window_end_x - window_step_x); x += window_step_x) + { + const auto vin = wrapper::vloadq(input_ptr + x); + if(act == ActivationLayerInfo::ActivationFunction::RELU) + { + // Perform activation + tmp = vmaxq_u8(vconst_0, vin); + // Re-quantize to new output space + tmp = vmlaq_qasymm8(tmp, vs, vo); + } + else if(act == ActivationLayerInfo::ActivationFunction::BOUNDED_RELU) + { + // Perform activation + tmp = vminq_u8(va, vmaxq_u8(vconst_0, vin)); + // Re-quantize to new output space + tmp = vmlaq_qasymm8(tmp, vs, vo); + } + else if(act == ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU) + { + // Perform activation + tmp = vminq_u8(va, vmaxq_u8(vb, vin)); + // Re-quantize to new output space + tmp = vmlaq_qasymm8(tmp, vs, vo); + } + else if(act == ActivationLayerInfo::ActivationFunction::LOGISTIC) + { + // De-quantize + const auto vin_deq = vdequantize(vin, qi_in); + // Perform activation + const float32x4x4_t tmp_dep = + { + { + wrapper::vdiv(vconst_1, wrapper::vadd(vconst_1, wrapper::vexpq(wrapper::vneg(vin_deq.val[0])))), + wrapper::vdiv(vconst_1, wrapper::vadd(vconst_1, 
wrapper::vexpq(wrapper::vneg(vin_deq.val[1])))), + wrapper::vdiv(vconst_1, wrapper::vadd(vconst_1, wrapper::vexpq(wrapper::vneg(vin_deq.val[2])))), + wrapper::vdiv(vconst_1, wrapper::vadd(vconst_1, wrapper::vexpq(wrapper::vneg(vin_deq.val[3])))), + } + }; + // Re-quantize to new output space + tmp = vquantize(tmp_dep, qi_out); + } + else if(act == ActivationLayerInfo::ActivationFunction::TANH) + { + // De-quantize + const auto vin_deq = vdequantize(vin, qi_in); + // Perform activation + const float32x4x4_t tmp_dep = + { + { + wrapper::vmul(va_f32, wrapper::vtanh(wrapper::vmul(vin_deq.val[0], vb_f32))), + wrapper::vmul(va_f32, wrapper::vtanh(wrapper::vmul(vin_deq.val[1], vb_f32))), + wrapper::vmul(va_f32, wrapper::vtanh(wrapper::vmul(vin_deq.val[2], vb_f32))), + wrapper::vmul(va_f32, wrapper::vtanh(wrapper::vmul(vin_deq.val[3], vb_f32))), + } + }; + // Re-quantize to new output space + tmp = vquantize(tmp_dep, qi_out); + } + else if(act == ActivationLayerInfo::ActivationFunction::HARD_SWISH) + { + // De-quantize + const auto vin_deq = vdequantize(vin, qi_in); + // Perform activation + const float32x4x4_t tmp_dep = + { + { + wrapper::vmul(vin_deq.val[0], wrapper::vmul(const_inv_6_f32, wrapper::vmin(const_6_f32, wrapper::vmax(const_0_f32, wrapper::vadd(vin_deq.val[0], const_3_f32))))), + wrapper::vmul(vin_deq.val[1], wrapper::vmul(const_inv_6_f32, wrapper::vmin(const_6_f32, wrapper::vmax(const_0_f32, wrapper::vadd(vin_deq.val[1], const_3_f32))))), + wrapper::vmul(vin_deq.val[2], wrapper::vmul(const_inv_6_f32, wrapper::vmin(const_6_f32, wrapper::vmax(const_0_f32, wrapper::vadd(vin_deq.val[2], const_3_f32))))), + wrapper::vmul(vin_deq.val[3], wrapper::vmul(const_inv_6_f32, wrapper::vmin(const_6_f32, wrapper::vmax(const_0_f32, wrapper::vadd(vin_deq.val[3], const_3_f32))))), + } + }; + // Re-quantize to new output space + tmp = vquantize(tmp_dep, qi_out); + } + else if(act == ActivationLayerInfo::ActivationFunction::LEAKY_RELU) + { + const auto vin_deq = vdequantize(vin, qi_in); + +#ifdef __aarch64__ + const uint32x4x4_t pos_mask = + { + { + wrapper::vcgtz(vin_deq.val[0]), + wrapper::vcgtz(vin_deq.val[1]), + wrapper::vcgtz(vin_deq.val[2]), + wrapper::vcgtz(vin_deq.val[3]), + } + }; +#else // __aarch64__ + const uint32x4x4_t pos_mask = + { + { + wrapper::vcgt(vin_deq.val[0], vconst_0_f32), + wrapper::vcgt(vin_deq.val[1], vconst_0_f32), + wrapper::vcgt(vin_deq.val[2], vconst_0_f32), + wrapper::vcgt(vin_deq.val[3], vconst_0_f32), + } + }; +#endif // __aarch64__ + + const float32x4x4_t tmp_dep = + { + { + wrapper::vbsl(pos_mask.val[0], vin_deq.val[0], wrapper::vmul(va_f32, vin_deq.val[0])), + wrapper::vbsl(pos_mask.val[1], vin_deq.val[1], wrapper::vmul(va_f32, vin_deq.val[1])), + wrapper::vbsl(pos_mask.val[2], vin_deq.val[2], wrapper::vmul(va_f32, vin_deq.val[2])), + wrapper::vbsl(pos_mask.val[3], vin_deq.val[3], wrapper::vmul(va_f32, vin_deq.val[3])), + } + }; + + tmp = vquantize(tmp_dep, qi_out); + } + else + { + ARM_COMPUTE_ERROR("Unsupported activation function"); + } + wrapper::vstore(output_ptr + x, tmp); + } + + // Compute left-over elements + for(; x < window_end_x; ++x) + { + qasymm8_t in = *(reinterpret_cast(input_ptr + x)); + qasymm8_t tmp = 0; + if(act == ActivationLayerInfo::ActivationFunction::RELU) + { + tmp = std::max(const_0, in); + tmp = utility::clamp(tmp * s + o); + } + else if(act == ActivationLayerInfo::ActivationFunction::BOUNDED_RELU) + { + tmp = std::min(a, std::max(const_0, in)); + tmp = utility::clamp(tmp * s + o); + } + else if(act == 
ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU) + { + tmp = std::min(a, std::max(b, in)); + tmp = utility::clamp(tmp * s + o); + } + else if(act == ActivationLayerInfo::ActivationFunction::LOGISTIC) + { + float tmp_f = dequantize_qasymm8(in, qi_in); + tmp_f = 1.f / (1.f + std::exp(-tmp_f)); + tmp = quantize_qasymm8(tmp_f, qi_out); + } + else if(act == ActivationLayerInfo::ActivationFunction::TANH) + { + float tmp_f = dequantize_qasymm8(in, qi_in); + tmp_f = a_f32 * std::tanh(b_f32 * tmp_f); + tmp = quantize_qasymm8(tmp_f, qi_out); + } + else if(act == ActivationLayerInfo::ActivationFunction::HARD_SWISH) + { + float tmp_f = dequantize_qasymm8(in, qi_in); + tmp_f = tmp_f * ((std::min(std::max((tmp_f + 3), 0.0f), 6.0f)) * 0.166666667f); + tmp = quantize_qasymm8(tmp_f, qi_out); + } + else if(act == ActivationLayerInfo::ActivationFunction::LEAKY_RELU) + { + float tmp_f = dequantize_qasymm8(in, qi_in); + tmp_f = tmp_f > 0 ? tmp_f : tmp_f * a_f32; + tmp = quantize_qasymm8(tmp_f, qi_out); + } + else + { + ARM_COMPUTE_ERROR("Unsupported activation function"); + } + *(output_ptr + x) = tmp; + } + }, + input, output); +} +} // namespace cpu +} // namespace arm_compute diff --git a/src/core/cpu/kernels/activation/neon/qasymm8_signed.cpp b/src/core/cpu/kernels/activation/neon/qasymm8_signed.cpp new file mode 100644 index 0000000000..8b40bf8e72 --- /dev/null +++ b/src/core/cpu/kernels/activation/neon/qasymm8_signed.cpp @@ -0,0 +1,261 @@ +/* + * Copyright (c) 2020-2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/Window.h" +#include "src/core/NEON/NEAsymm.h" +#include "src/core/NEON/NEMath.h" +#include "src/core/NEON/wrapper/wrapper.h" + +#include +#include +#include + +namespace arm_compute +{ +namespace cpu +{ +void qasymm8_signed_neon_activation(const ITensor *src, ITensor *dst, const ActivationLayerInfo &act_info, const Window &window) +{ + constexpr int window_step_x = 16; + const auto window_start_x = static_cast(window.x().start()); + const auto window_end_x = static_cast(window.x().end()); + const ActivationLayerInfo::ActivationFunction act = act_info.activation(); + + Window win_collapsed = window.collapse_if_possible(window, Window::DimZ); + win_collapsed.set(Window::DimX, Window::Dimension(0, 1, 1)); + + Iterator input(src, win_collapsed); + Iterator output(dst, win_collapsed); + + const UniformQuantizationInfo qi_in = src->info()->quantization_info().uniform(); + const UniformQuantizationInfo qi_out = dst->info()->quantization_info().uniform(); + const qasymm8x16_signed_t va = vdupq_n_s8(quantize_qasymm8_signed(act_info.a(), qi_in)); + const qasymm8x16_signed_t vb = vdupq_n_s8(quantize_qasymm8_signed(act_info.b(), qi_in)); + const qasymm8_signed_t a = quantize_qasymm8_signed(act_info.a(), qi_in); + const qasymm8_signed_t b = quantize_qasymm8_signed(act_info.b(), qi_in); + const qasymm8_signed_t const_0 = quantize_qasymm8_signed(0.f, qi_in); + const qasymm8x16_signed_t vconst_0 = vdupq_n_s8(const_0); + const auto vconst_1 = vdupq_n_f32(1.f); +#ifndef __aarch64__ + const auto vconst_0_f32 = vdupq_n_f32(1.f); +#endif // __aarch64__ + const float32x4_t va_f32 = vdupq_n_f32(act_info.a()); + const float32x4_t vb_f32 = vdupq_n_f32(act_info.b()); + const float a_f32 = act_info.a(); + const float b_f32 = act_info.b(); + const auto const_6_f32 = vdupq_n_f32(6.f); + const auto const_0_f32 = vdupq_n_f32(0.f); + const auto const_3_f32 = vdupq_n_f32(3.f); + const auto const_inv_6_f32 = vdupq_n_f32(0.166666667f); + + // Initialise scale/offset for re-quantization + float s = qi_in.scale / qi_out.scale; + float o = -qi_in.offset * s + qi_out.offset; + float32x4_t vs = vdupq_n_f32(s); + float32x4_t vo = vdupq_n_f32(o); + + execute_window_loop(win_collapsed, [&](const Coordinates &) + { + const auto input_ptr = reinterpret_cast(input.ptr()); + const auto output_ptr = reinterpret_cast(output.ptr()); + + wrapper::traits::neon_bitvector_t tmp; + + // Compute S elements per iteration + int x = window_start_x; + for(; x <= (window_end_x - window_step_x); x += window_step_x) + { + const auto vin = wrapper::vloadq(input_ptr + x); + if(act == ActivationLayerInfo::ActivationFunction::RELU) + { + // Perform activation + tmp = vmaxq_s8(vconst_0, vin); + // Re-quantize to new output space + tmp = vmlaq_qasymm8_signed(tmp, vs, vo); + } + else if(act == ActivationLayerInfo::ActivationFunction::BOUNDED_RELU) + { + // Perform activation + tmp = vminq_s8(va, vmaxq_s8(vconst_0, vin)); + // Re-quantize to new output space + tmp = vmlaq_qasymm8_signed(tmp, vs, vo); + } + else if(act == ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU) + { + // Perform activation + tmp = vminq_s8(va, vmaxq_s8(vb, vin)); + // Re-quantize to new output space + tmp = vmlaq_qasymm8_signed(tmp, vs, vo); + } + else if(act == ActivationLayerInfo::ActivationFunction::LOGISTIC) + { + // De-quantize + const auto vin_deq = vdequantize(vin, qi_in); + // Perform activation + const float32x4x4_t tmp_dep = + { + { + wrapper::vdiv(vconst_1, wrapper::vadd(vconst_1, 
wrapper::vexpq(wrapper::vneg(vin_deq.val[0])))), + wrapper::vdiv(vconst_1, wrapper::vadd(vconst_1, wrapper::vexpq(wrapper::vneg(vin_deq.val[1])))), + wrapper::vdiv(vconst_1, wrapper::vadd(vconst_1, wrapper::vexpq(wrapper::vneg(vin_deq.val[2])))), + wrapper::vdiv(vconst_1, wrapper::vadd(vconst_1, wrapper::vexpq(wrapper::vneg(vin_deq.val[3])))), + } + }; + // Re-quantize to new output space + tmp = vquantize_signed(tmp_dep, qi_out); + } + else if(act == ActivationLayerInfo::ActivationFunction::TANH) + { + // De-quantize + const auto vin_deq = vdequantize(vin, qi_in); + // Perform activation + const float32x4x4_t tmp_dep = + { + { + wrapper::vmul(va_f32, wrapper::vtanh(wrapper::vmul(vin_deq.val[0], vb_f32))), + wrapper::vmul(va_f32, wrapper::vtanh(wrapper::vmul(vin_deq.val[1], vb_f32))), + wrapper::vmul(va_f32, wrapper::vtanh(wrapper::vmul(vin_deq.val[2], vb_f32))), + wrapper::vmul(va_f32, wrapper::vtanh(wrapper::vmul(vin_deq.val[3], vb_f32))), + } + }; + // Re-quantize to new output space + tmp = vquantize_signed(tmp_dep, qi_out); + } + else if(act == ActivationLayerInfo::ActivationFunction::HARD_SWISH) + { + // De-quantize + const auto vin_deq = vdequantize(vin, qi_in); + // Perform activation + const float32x4x4_t tmp_dep = + { + { + wrapper::vmul(vin_deq.val[0], wrapper::vmul(const_inv_6_f32, wrapper::vmin(const_6_f32, wrapper::vmax(const_0_f32, wrapper::vadd(vin_deq.val[0], const_3_f32))))), + wrapper::vmul(vin_deq.val[1], wrapper::vmul(const_inv_6_f32, wrapper::vmin(const_6_f32, wrapper::vmax(const_0_f32, wrapper::vadd(vin_deq.val[1], const_3_f32))))), + wrapper::vmul(vin_deq.val[2], wrapper::vmul(const_inv_6_f32, wrapper::vmin(const_6_f32, wrapper::vmax(const_0_f32, wrapper::vadd(vin_deq.val[2], const_3_f32))))), + wrapper::vmul(vin_deq.val[3], wrapper::vmul(const_inv_6_f32, wrapper::vmin(const_6_f32, wrapper::vmax(const_0_f32, wrapper::vadd(vin_deq.val[3], const_3_f32))))), + } + }; + // Re-quantize to new output space + tmp = vquantize_signed(tmp_dep, qi_out); + } + else if(act == ActivationLayerInfo::ActivationFunction::LEAKY_RELU) + { + const auto vin_deq = vdequantize(vin, qi_in); + +#ifdef __aarch64__ + const uint32x4x4_t pos_mask = + { + { + wrapper::vcgtz(vin_deq.val[0]), + wrapper::vcgtz(vin_deq.val[1]), + wrapper::vcgtz(vin_deq.val[2]), + wrapper::vcgtz(vin_deq.val[3]), + } + }; +#else // __aarch64__ + const uint32x4x4_t pos_mask = + { + { + wrapper::vcgt(vin_deq.val[0], vconst_0_f32), + wrapper::vcgt(vin_deq.val[1], vconst_0_f32), + wrapper::vcgt(vin_deq.val[2], vconst_0_f32), + wrapper::vcgt(vin_deq.val[3], vconst_0_f32), + } + }; +#endif // __aarch64__ + + const float32x4x4_t tmp_dep = + { + { + wrapper::vbsl(pos_mask.val[0], vin_deq.val[0], wrapper::vmul(va_f32, vin_deq.val[0])), + wrapper::vbsl(pos_mask.val[1], vin_deq.val[1], wrapper::vmul(va_f32, vin_deq.val[1])), + wrapper::vbsl(pos_mask.val[2], vin_deq.val[2], wrapper::vmul(va_f32, vin_deq.val[2])), + wrapper::vbsl(pos_mask.val[3], vin_deq.val[3], wrapper::vmul(va_f32, vin_deq.val[3])), + } + }; + + tmp = vquantize_signed(tmp_dep, qi_out); + } + else + { + ARM_COMPUTE_ERROR("Unsupported activation function"); + } + wrapper::vstore(output_ptr + x, tmp); + } + + // Compute left-over elements + for(; x < window_end_x; ++x) + { + qasymm8_signed_t in = *(reinterpret_cast(input_ptr + x)); + qasymm8_signed_t tmp = 0; + if(act == ActivationLayerInfo::ActivationFunction::RELU) + { + tmp = std::max(const_0, in); + tmp = utility::clamp(tmp * s + o); + } + else if(act == ActivationLayerInfo::ActivationFunction::BOUNDED_RELU) + { 
+ tmp = std::min(a, std::max(const_0, in)); + tmp = utility::clamp(tmp * s + o); + } + else if(act == ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU) + { + tmp = std::min(a, std::max(b, in)); + tmp = utility::clamp(tmp * s + o); + } + else if(act == ActivationLayerInfo::ActivationFunction::LOGISTIC) + { + float tmp_f = dequantize_qasymm8_signed(in, qi_in); + tmp_f = 1.f / (1.f + std::exp(-tmp_f)); + tmp = quantize_qasymm8_signed(tmp_f, qi_out); + } + else if(act == ActivationLayerInfo::ActivationFunction::TANH) + { + float tmp_f = dequantize_qasymm8_signed(in, qi_in); + tmp_f = a_f32 * std::tanh(b_f32 * tmp_f); + tmp = quantize_qasymm8_signed(tmp_f, qi_out); + } + else if(act == ActivationLayerInfo::ActivationFunction::HARD_SWISH) + { + float tmp_f = dequantize_qasymm8_signed(in, qi_in); + tmp_f = tmp_f * ((std::min(std::max((tmp_f + 3), 0.0f), 6.0f)) * 0.166666667f); + tmp = quantize_qasymm8_signed(tmp_f, qi_out); + } + else if(act == ActivationLayerInfo::ActivationFunction::LEAKY_RELU) + { + float tmp_f = dequantize_qasymm8_signed(in, qi_in); + tmp_f = tmp_f > 0 ? tmp_f : tmp_f * a_f32; + tmp = quantize_qasymm8_signed(tmp_f, qi_out); + } + else + { + ARM_COMPUTE_ERROR("Unsupported activation function"); + } + *(output_ptr + x) = tmp; + } + }, + input, output); +} +} // namespace cpu +} // namespace arm_compute diff --git a/src/core/cpu/kernels/activation/neon/qsymm16.cpp b/src/core/cpu/kernels/activation/neon/qsymm16.cpp new file mode 100644 index 0000000000..54b41820f2 --- /dev/null +++ b/src/core/cpu/kernels/activation/neon/qsymm16.cpp @@ -0,0 +1,138 @@ +/* + * Copyright (c) 2020-2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/ITensorPack.h" +#include "arm_compute/core/Window.h" +#include "arm_compute/core/experimental/Types.h" +#include "src/core/NEON/NEMath.h" +#include "src/core/NEON/NESymm.h" +#include "src/core/NEON/wrapper/wrapper.h" + +#include +#include +#include + +namespace arm_compute +{ +namespace cpu +{ +void qsymm16_neon_activation(const ITensor *src, ITensor *dst, const ActivationLayerInfo &act_info, const Window &window) +{ + constexpr int window_step_x = 8; + const auto window_start_x = static_cast(window.x().start()); + const auto window_end_x = static_cast(window.x().end()); + const ActivationLayerInfo::ActivationFunction act = act_info.activation(); + + Window win_collapsed = window.collapse_if_possible(window, Window::DimZ); + win_collapsed.set(Window::DimX, Window::Dimension(0, 1, 1)); + + Iterator input(src, win_collapsed); + Iterator output(dst, win_collapsed); + + const UniformQuantizationInfo qi_in = src->info()->quantization_info().uniform(); + const UniformQuantizationInfo qi_out = dst->info()->quantization_info().uniform(); + const auto vconst_1 = vdupq_n_f32(1.f); + const float32x4_t va_f32 = vdupq_n_f32(act_info.a()); + const float32x4_t vb_f32 = vdupq_n_f32(act_info.b()); + const float a_f32 = act_info.a(); + const float b_f32 = act_info.b(); + + execute_window_loop(win_collapsed, [&](const Coordinates &) + { + const auto input_ptr = reinterpret_cast(input.ptr()); + const auto output_ptr = reinterpret_cast(output.ptr()); + + wrapper::traits::neon_bitvector_t tmp; + ARM_COMPUTE_UNUSED(tmp); + + // Compute S elements per iteration + int x = window_start_x; + for(; x <= (window_end_x - window_step_x); x += window_step_x) + { + const auto vin = wrapper::vloadq(input_ptr + x); + if(act == ActivationLayerInfo::ActivationFunction::LOGISTIC) + { + // De-quantize + const auto vin_deq = vdequantize_int16(vin, qi_in.scale); + // Perform activation + const float32x4x2_t tmp_dep = + { + { + wrapper::vdiv(vconst_1, wrapper::vadd(vconst_1, wrapper::vexpq(wrapper::vneg(vin_deq.val[0])))), + wrapper::vdiv(vconst_1, wrapper::vadd(vconst_1, wrapper::vexpq(wrapper::vneg(vin_deq.val[1])))), + } + }; + // Re-quantize to new output space + tmp = vquantize_int16(tmp_dep, qi_out.scale); + } + else if(act == ActivationLayerInfo::ActivationFunction::TANH) + { + // De-quantize + const auto vin_deq = vdequantize_int16(vin, qi_in.scale); + // Perform activation + const float32x4x2_t tmp_dep = + { + { + wrapper::vmul(va_f32, wrapper::vtanh(wrapper::vmul(vin_deq.val[0], vb_f32))), + wrapper::vmul(va_f32, wrapper::vtanh(wrapper::vmul(vin_deq.val[1], vb_f32))), + } + }; + // Re-quantize to new output space + tmp = vquantize_int16(tmp_dep, qi_out.scale); + } + else + { + ARM_COMPUTE_ERROR("Unsupported activation function"); + } + wrapper::vstore(output_ptr + x, tmp); + } + + // Compute left-over elements + for(; x < window_end_x; ++x) + { + qsymm16_t in = *(reinterpret_cast(input_ptr + x)); + qsymm16_t tmp = 0; + if(act == ActivationLayerInfo::ActivationFunction::LOGISTIC) + { + float tmp_f = dequantize_qsymm16(in, qi_in.scale); + tmp_f = 1.f / (1.f + std::exp(-tmp_f)); + tmp = quantize_qsymm16(tmp_f, qi_out); + } + else if(act == ActivationLayerInfo::ActivationFunction::TANH) + { + float tmp_f = dequantize_qsymm16(in, qi_in.scale); + tmp_f = a_f32 * std::tanh(b_f32 * tmp_f); + tmp = quantize_qsymm16(tmp_f, qi_out); + } + else + { + ARM_COMPUTE_ERROR("Unsupported activation function"); + } + *(output_ptr + x) = tmp; + } + }, + input, output); 
+} +} // namespace cpu +} // namespace arm_compute diff --git a/src/core/cpu/kernels/activation/sve/fp16.cpp b/src/core/cpu/kernels/activation/sve/fp16.cpp new file mode 100644 index 0000000000..bf31fd7d93 --- /dev/null +++ b/src/core/cpu/kernels/activation/sve/fp16.cpp @@ -0,0 +1,130 @@ +/* + * Copyright (c) 2020-2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/ITensorPack.h" +#include "arm_compute/core/Window.h" + +#include +#include + +#if defined(__ARM_FEATURE_SVE) +#include "src/core/NEON/SVEMath.h" +#include + +namespace arm_compute +{ +namespace cpu +{ +void fp16_sve_activation(const ITensor *src, ITensor *dst, const ActivationLayerInfo &act_info, const Window &window) +{ + const auto window_start_x = static_cast(window.x().start()); + const auto window_end_x = static_cast(window.x().end()); + const ActivationLayerInfo::ActivationFunction act = act_info.activation(); + + Window win_collapsed = window.collapse_if_possible(window, Window::DimZ); + win_collapsed.set(Window::DimX, Window::Dimension(0, 1, 1)); + + Iterator input(src, win_collapsed); + Iterator output(dst, win_collapsed); + + const auto const_1 = svdup_n_f16(1.f); + const auto const_0 = svdup_n_f16(0.f); + const auto const_6 = svdup_n_f16(6.f); + const auto const_3 = svdup_n_f16(3.f); + const auto const_inv_6 = svdup_n_f16(0.166666667f); + + const auto va = svdup_n_f16(act_info.a()); + const auto vb = svdup_n_f16(act_info.b()); + execute_window_loop(win_collapsed, [&](const Coordinates &) + { + const auto input_ptr = reinterpret_cast(input.ptr()); + const auto output_ptr = reinterpret_cast(output.ptr()); + + svfloat16_t tmp; + + int x = window_start_x; + svbool_t pg = svwhilelt_b16(x, window_end_x); + do + { + const auto vin = svld1_f16(pg, input_ptr + x); + switch(act) + { + case ActivationLayerInfo::ActivationFunction::ABS: + tmp = svabs_f16_z(pg, vin); + break; + case ActivationLayerInfo::ActivationFunction::LINEAR: + tmp = svmla_f16_z(pg, vb, va, vin); + break; + case ActivationLayerInfo::ActivationFunction::LOGISTIC: + tmp = svinv_f16_z(pg, svadd_f16_z(pg, const_1, svexp_f16_z(pg, svneg_f16_z(pg, vin)))); + break; + case ActivationLayerInfo::ActivationFunction::RELU: + tmp = svmax_f16_z(pg, const_0, vin); + break; + case ActivationLayerInfo::ActivationFunction::BOUNDED_RELU: + tmp = svmin_f16_z(pg, va, svmax_f16_z(pg, const_0, vin)); + break; + case 
ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU: + tmp = svmin_f16_z(pg, va, svmax_f16_z(pg, vb, vin)); + break; + case ActivationLayerInfo::ActivationFunction::LEAKY_RELU: + tmp = svadd_f16_z(pg, svmul_f16_z(pg, svmin_f16_z(pg, vin, const_0), va), svmax_f16_z(pg, vin, const_0)); + break; + case ActivationLayerInfo::ActivationFunction::SOFT_RELU: + tmp = svlog_f16_z(pg, svadd_f16_z(pg, const_1, svexp_f16_z(pg, vin))); + break; + case ActivationLayerInfo::ActivationFunction::ELU: + tmp = svsel_f16(svcmpgt_f16(pg, vin, const_0), vin, svmul_f16_z(pg, va, svsub_f16_z(pg, svexp_f16_z(pg, vin), const_1))); + break; + case ActivationLayerInfo::ActivationFunction::SQRT: + tmp = svsqrt_f16_z(pg, vin); + break; + case ActivationLayerInfo::ActivationFunction::SQUARE: + tmp = svmul_f16_z(pg, vin, vin); + break; + case ActivationLayerInfo::ActivationFunction::TANH: + tmp = svmul_f16_z(pg, va, svtanh_f16_z(pg, svmul_f16_z(pg, vb, vin))); + break; + case ActivationLayerInfo::ActivationFunction::IDENTITY: + tmp = vin; + break; + case ActivationLayerInfo::ActivationFunction::HARD_SWISH: + tmp = svmul_f16_z(pg, vin, svmul_f16_z(pg, const_inv_6, svmin_f16_z(pg, const_6, svmax_f16_z(pg, const_0, svadd_f16_z(pg, vin, const_3))))); + break; + default: + ARM_COMPUTE_ERROR("Unsupported activation function"); + } + svst1_f16(pg, output_ptr + x, tmp); + + x += svcnth(); + pg = svwhilelt_b16(x, window_end_x); + + } + while(svptest_any(svptrue_b16(), pg)); + }, + input, output); +} +} // namespace cpu +} // namespace arm_compute +#endif // __ARM_FEATURE_SVE \ No newline at end of file diff --git a/src/core/cpu/kernels/activation/sve/fp32.cpp b/src/core/cpu/kernels/activation/sve/fp32.cpp new file mode 100644 index 0000000000..75f9f8a4c3 --- /dev/null +++ b/src/core/cpu/kernels/activation/sve/fp32.cpp @@ -0,0 +1,131 @@ +/* + * Copyright (c) 2020-2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/ITensorPack.h" +#include "arm_compute/core/Window.h" +#include "src/core/NEON/SVEMath.h" + +#include +#include + +#if defined(__ARM_FEATURE_SVE) +#include + +namespace arm_compute +{ +namespace cpu +{ +void fp32_sve_activation(const ITensor *src, ITensor *dst, const ActivationLayerInfo &act_info, const Window &window) +{ + const auto window_start_x = static_cast(window.x().start()); + const auto window_end_x = static_cast(window.x().end()); + const ActivationLayerInfo::ActivationFunction act = act_info.activation(); + + Window win_collapsed = window.collapse_if_possible(window, Window::DimZ); + win_collapsed.set(Window::DimX, Window::Dimension(0, 1, 1)); + + Iterator input(src, win_collapsed); + Iterator output(dst, win_collapsed); + + const auto const_1 = svdup_n_f32(1.f); + const auto const_0 = svdup_n_f32(0.f); + const auto const_6 = svdup_n_f32(6.f); + const auto const_3 = svdup_n_f32(3.f); + const auto const_inv_6 = svdup_n_f32(0.166666667f); + + const auto va = svdup_n_f32(act_info.a()); + const auto vb = svdup_n_f32(act_info.b()); + execute_window_loop(win_collapsed, [&](const Coordinates &) + { + const auto input_ptr = reinterpret_cast(input.ptr()); + const auto output_ptr = reinterpret_cast(output.ptr()); + + svfloat32_t tmp; + + // Compute S elements per iteration + int x = window_start_x; + svbool_t pg = svwhilelt_b32(x, window_end_x); + do + { + const auto vin = svld1_f32(pg, input_ptr + x); + switch(act) + { + case ActivationLayerInfo::ActivationFunction::ABS: + tmp = svabs_f32_z(pg, vin); + break; + case ActivationLayerInfo::ActivationFunction::LINEAR: + tmp = svmla_f32_z(pg, vb, va, vin); + break; + case ActivationLayerInfo::ActivationFunction::LOGISTIC: + tmp = svinv_f32_z(pg, svadd_f32_z(pg, const_1, svexp_f32_z(pg, svneg_f32_z(pg, vin)))); + break; + case ActivationLayerInfo::ActivationFunction::RELU: + tmp = svmax_f32_z(pg, const_0, vin); + break; + case ActivationLayerInfo::ActivationFunction::BOUNDED_RELU: + tmp = svmin_f32_z(pg, va, svmax_f32_z(pg, const_0, vin)); + break; + case ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU: + tmp = svmin_f32_z(pg, va, svmax_f32_z(pg, vb, vin)); + break; + case ActivationLayerInfo::ActivationFunction::LEAKY_RELU: + tmp = svadd_f32_z(pg, svmul_f32_z(pg, svmin_f32_z(pg, vin, const_0), va), svmax_f32_z(pg, vin, const_0)); + break; + case ActivationLayerInfo::ActivationFunction::SOFT_RELU: + tmp = svlog_f32_z(pg, svadd_f32_z(pg, const_1, svexp_f32_z(pg, vin))); + break; + case ActivationLayerInfo::ActivationFunction::ELU: + tmp = svsel_f32(svcmpgt_f32(pg, vin, const_0), vin, svmul_f32_z(pg, va, svsub_f32_z(pg, svexp_f32_z(pg, vin), const_1))); + break; + case ActivationLayerInfo::ActivationFunction::SQRT: + tmp = svsqrt_f32_z(pg, vin); + break; + case ActivationLayerInfo::ActivationFunction::SQUARE: + tmp = svmul_f32_z(pg, vin, vin); + break; + case ActivationLayerInfo::ActivationFunction::TANH: + tmp = svmul_f32_z(pg, va, svtanh_f32_z(pg, svmul_f32_z(pg, vb, vin))); + break; + case ActivationLayerInfo::ActivationFunction::IDENTITY: + tmp = vin; + break; + case ActivationLayerInfo::ActivationFunction::HARD_SWISH: + tmp = svmul_f32_z(pg, vin, svmul_f32_z(pg, const_inv_6, svmin_f32_z(pg, const_6, svmax_f32_z(pg, const_0, svadd_f32_z(pg, vin, const_3))))); + break; + default: + ARM_COMPUTE_ERROR("Unsupported activation function"); + } + svst1_f32(pg, output_ptr + x, tmp); + + x += svcntw(); + pg = svwhilelt_b32(x, window_end_x); + + } + 
while(svptest_any(svptrue_b32(), pg)); + }, + input, output); +} +} // namespace cpu +} // namespace arm_compute +#endif // __ARM_FEATURE_SVE \ No newline at end of file diff --git a/src/core/cpu/kernels/activation/sve/qasymm8.cpp b/src/core/cpu/kernels/activation/sve/qasymm8.cpp new file mode 100644 index 0000000000..228b4ae530 --- /dev/null +++ b/src/core/cpu/kernels/activation/sve/qasymm8.cpp @@ -0,0 +1,254 @@ +/* + * Copyright (c) 2020-2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/Window.h" + +#include +#include + +#if defined(__ARM_FEATURE_SVE2) +#include "src/core/NEON/SVEAsymm.h" +#include "src/core/NEON/SVEMath.h" +#include + +namespace arm_compute +{ +namespace cpu +{ +void qasymm8_sve_activation(const ITensor *src, ITensor *dst, const ActivationLayerInfo &act_info, const Window &window) +{ + const auto window_start_x = static_cast(window.x().start()); + const auto window_end_x = static_cast(window.x().end()); + const ActivationLayerInfo::ActivationFunction act = act_info.activation(); + + Window win_collapsed = window.collapse_if_possible(window, Window::DimZ); + win_collapsed.set(Window::DimX, Window::Dimension(0, 1, 1)); + + Iterator input(src, win_collapsed); + Iterator output(dst, win_collapsed); + + const UniformQuantizationInfo qi_in = src->info()->quantization_info().uniform(); + const UniformQuantizationInfo qi_out = dst->info()->quantization_info().uniform(); + const auto va = svdup_n_u8(quantize_qasymm8(act_info.a(), qi_in)); + const auto vb = svdup_n_u8(quantize_qasymm8(act_info.b(), qi_in)); + const auto const_0 = quantize_qasymm8(0.f, qi_in); + const auto vconst_0 = svdup_n_u8(const_0); + const auto vconst_1 = svdup_n_f32(1.f); + const auto va_f32 = svdup_n_f32(act_info.a()); + const auto vb_f32 = svdup_n_f32(act_info.b()); + const auto const_6_f32 = svdup_n_f32(6.f); + const auto const_0_f32 = svdup_n_f32(0.f); + const auto const_3_f32 = svdup_n_f32(3.f); + const auto const_inv_6_f32 = svdup_n_f32(0.166666667f); + + // Initialise scale/offset for re-quantization + bool requant = true; + if(qi_in.scale == qi_out.scale && qi_in.offset == qi_out.offset) + { + requant = false; + } + float s = qi_in.scale / qi_out.scale; + float o = -qi_in.offset * s + qi_out.offset; + auto vs = svdup_n_f32(s); + auto vo = svdup_n_f32(o); + + // Initialise scale/offset for re-quantization with int32_t + const auto voffset_in = 
svdup_n_s32(qi_in.offset); + int32_t s_s32 = round(s * (1 << 8), arm_compute::RoundingPolicy::TO_NEAREST_EVEN); + int32_t o_s32 = round(o * (1 << 8), arm_compute::RoundingPolicy::TO_NEAREST_EVEN); + const auto vs_s32 = svdup_n_s32(s_s32); + const auto vo_s32 = svdup_n_s32(o_s32); + + // Initialise scale/offset for re-quantization for leaky relu + int32_t s_leaky_s32 = round(s * act_info.a() * (1 << 8), arm_compute::RoundingPolicy::TO_NEAREST_EVEN); + int32_t o_leaky_s32 = round((-qi_in.offset * s * act_info.a() + qi_out.offset) * (1 << 8), + arm_compute::RoundingPolicy::TO_NEAREST_EVEN); + const auto vs_leaky_s32 = svdup_n_s32(s_leaky_s32); + const auto vo_leaky_s32 = svdup_n_s32(o_leaky_s32); + + execute_window_loop(win_collapsed, [&](const Coordinates &) + { + const auto input_ptr = reinterpret_cast(input.ptr()); + const auto output_ptr = reinterpret_cast(output.ptr()); + + svuint8_t tmp; + + int x = window_start_x; + svbool_t pg = svwhilelt_b8(x, window_end_x); + do + { + const auto vin = svld1_u8(pg, input_ptr + x); + if(act == ActivationLayerInfo::ActivationFunction::RELU) + { + // Perform activation + tmp = svmax_u8_z(pg, vconst_0, vin); + // Re-quantize to new output space + tmp = requant ? svmla_qasymm8_z(pg, tmp, vs, vo) : tmp; + } + else if(act == ActivationLayerInfo::ActivationFunction::BOUNDED_RELU) + { + // Perform activation + tmp = svmin_u8_z(pg, va, svmax_u8_z(pg, vconst_0, vin)); + // Re-quantize to new output space + tmp = requant ? svmla_qasymm8_z(pg, tmp, vs, vo) : tmp; + } + else if(act == ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU) + { + // Perform activation + tmp = svmin_u8_z(pg, va, svmax_u8_z(pg, vb, vin)); + // Re-quantize to new output space + tmp = svmla_qasymm8_z(pg, tmp, vs, vo); + } + else if(act == ActivationLayerInfo::ActivationFunction::LOGISTIC) + { + // De-quantize + const auto vin_deq = svdequantize_z(pg, vin, qi_in); + // Perform activation + const svfloat32x4_t tmp_dep = + { + { { + svdiv_f32_z(pg, vconst_1, svadd_f32_z(pg, vconst_1, svexp_f32_z(pg, svneg_f32_z(pg, svget4_f32(vin_deq, 0))))), + svdiv_f32_z(pg, vconst_1, svadd_f32_z(pg, vconst_1, svexp_f32_z(pg, svneg_f32_z(pg, svget4_f32(vin_deq, 1))))), + svdiv_f32_z(pg, vconst_1, svadd_f32_z(pg, vconst_1, svexp_f32_z(pg, svneg_f32_z(pg, svget4_f32(vin_deq, 2))))), + svdiv_f32_z(pg, vconst_1, svadd_f32_z(pg, vconst_1, svexp_f32_z(pg, svneg_f32_z(pg, svget4_f32(vin_deq, 3))))), + } + } + }; + // Re-quantize to new output space + tmp = svquantize_z(pg, tmp_dep, qi_out); + } + else if(act == ActivationLayerInfo::ActivationFunction::TANH) + { + // De-quantize + const auto vin_deq = svdequantize_z(pg, vin, qi_in); + // Perform activation + const svfloat32x4_t tmp_dep = + { + { { + svmul_f32_z(pg, va_f32, svtanh_f32_z(pg, svmul_f32_z(pg, svget4_f32(vin_deq, 0), vb_f32))), + svmul_f32_z(pg, va_f32, svtanh_f32_z(pg, svmul_f32_z(pg, svget4_f32(vin_deq, 1), vb_f32))), + svmul_f32_z(pg, va_f32, svtanh_f32_z(pg, svmul_f32_z(pg, svget4_f32(vin_deq, 2), vb_f32))), + svmul_f32_z(pg, va_f32, svtanh_f32_z(pg, svmul_f32_z(pg, svget4_f32(vin_deq, 3), vb_f32))), + } + } + }; + // Re-quantize to new output space + tmp = svquantize_z(pg, tmp_dep, qi_out); + } + else if(act == ActivationLayerInfo::ActivationFunction::HARD_SWISH) + { + // De-quantize + const auto vin_deq = svdequantize_z(pg, vin, qi_in); + // Perform activation + const svfloat32x4_t tmp_dep = + { + { { + svmul_f32_z(pg, svget4_f32(vin_deq, 0), svmul_f32_z(pg, const_inv_6_f32, svmin_f32_z(pg, const_6_f32, svmax_f32_z(pg, const_0_f32, 
svadd_f32_z(pg, svget4_f32(vin_deq, 0), const_3_f32))))), + svmul_f32_z(pg, svget4_f32(vin_deq, 1), svmul_f32_z(pg, const_inv_6_f32, svmin_f32_z(pg, const_6_f32, svmax_f32_z(pg, const_0_f32, svadd_f32_z(pg, svget4_f32(vin_deq, 1), const_3_f32))))), + svmul_f32_z(pg, svget4_f32(vin_deq, 2), svmul_f32_z(pg, const_inv_6_f32, svmin_f32_z(pg, const_6_f32, svmax_f32_z(pg, const_0_f32, svadd_f32_z(pg, svget4_f32(vin_deq, 2), const_3_f32))))), + svmul_f32_z(pg, svget4_f32(vin_deq, 3), svmul_f32_z(pg, const_inv_6_f32, svmin_f32_z(pg, const_6_f32, svmax_f32_z(pg, const_0_f32, svadd_f32_z(pg, svget4_f32(vin_deq, 3), const_3_f32))))), + } + } + }; + // Re-quantize to new output space + tmp = svquantize_z(pg, tmp_dep, qi_out); + } + else if(act == ActivationLayerInfo::ActivationFunction::LEAKY_RELU) + { + svbool_t p0, p1, p2, p3; + svint32x4_t tmp_dep; + + // Expand to int32 + const svint32x4_t vin_s32 = + { + { { + svreinterpret_s32_u32(svmovlb_u32(svmovlb_u16(vin))), + svreinterpret_s32_u32(svmovlt_u32(svmovlb_u16(vin))), + svreinterpret_s32_u32(svmovlb_u32(svmovlt_u16(vin))), + svreinterpret_s32_u32(svmovlt_u32(svmovlt_u16(vin))), + } + } + }; + + // Compare elements to input offset + if(qi_in.scale >= 0) + { + p0 = svcmplt_s32(pg, svget4_s32(vin_s32, 0), voffset_in); + p1 = svcmplt_s32(pg, svget4_s32(vin_s32, 1), voffset_in); + p2 = svcmplt_s32(pg, svget4_s32(vin_s32, 2), voffset_in); + p3 = svcmplt_s32(pg, svget4_s32(vin_s32, 3), voffset_in); + } + else + { + p0 = svcmpgt_s32(pg, svget4_s32(vin_s32, 0), voffset_in); + p1 = svcmpgt_s32(pg, svget4_s32(vin_s32, 1), voffset_in); + p2 = svcmpgt_s32(pg, svget4_s32(vin_s32, 2), voffset_in); + p3 = svcmpgt_s32(pg, svget4_s32(vin_s32, 3), voffset_in); + } + + // Multiply negative elements and requantize if necessary + if(requant) + { + tmp_dep = svcreate4_s32( + svasr_n_s32_m(pg, svmla_s32_m(pg, svsel(p0, vo_leaky_s32, vo_s32), svget4_s32(vin_s32, 0), svsel(p0, vs_leaky_s32, vs_s32)), 8), + svasr_n_s32_m(pg, svmla_s32_m(pg, svsel(p1, vo_leaky_s32, vo_s32), svget4_s32(vin_s32, 1), svsel(p1, vs_leaky_s32, vs_s32)), 8), + svasr_n_s32_m(pg, svmla_s32_m(pg, svsel(p2, vo_leaky_s32, vo_s32), svget4_s32(vin_s32, 2), svsel(p2, vs_leaky_s32, vs_s32)), 8), + svasr_n_s32_m(pg, svmla_s32_m(pg, svsel(p3, vo_leaky_s32, vo_s32), svget4_s32(vin_s32, 3), svsel(p3, vs_leaky_s32, vs_s32)), 8)); + } + else + { + tmp_dep = svcreate4_s32( + svasr_n_s32_m(p0, svmad_s32_m(p0, svget4_s32(vin_s32, 0), vs_leaky_s32, vo_leaky_s32), 8), + svasr_n_s32_m(p1, svmad_s32_m(p1, svget4_s32(vin_s32, 1), vs_leaky_s32, vo_leaky_s32), 8), + svasr_n_s32_m(p2, svmad_s32_m(p2, svget4_s32(vin_s32, 2), vs_leaky_s32, vo_leaky_s32), 8), + svasr_n_s32_m(p3, svmad_s32_m(p3, svget4_s32(vin_s32, 3), vs_leaky_s32, vo_leaky_s32), 8)); + } + + // Convert uint32 vectors to uint16 vectors (with saturation) + const auto v_low_u16 = svqxtunt_s32(svqxtunb_s32(svget4_s32(tmp_dep, 0)), svget4_s32(tmp_dep, 1)); + const auto v_high_u16 = svqxtunt_s32(svqxtunb_s32(svget4_s32(tmp_dep, 2)), svget4_s32(tmp_dep, 3)); + + // convert uint16 vectors to uint8 vectors (with saturation) + tmp = svqxtnt_u16(svqxtnb_u16(v_low_u16), v_high_u16); + } + else + { + ARM_COMPUTE_ERROR("Unsupported activation function"); + } + + svst1_u8(pg, output_ptr + x, tmp); + + x += svcntb(); + pg = svwhilelt_b8(x, window_end_x); + + } + while(svptest_any(svptrue_b8(), pg)); + + }, + input, output); +} +} // namespace cpu +} // namespace arm_compute +#endif /* defined(__ARM_FEATURE_SVE2) */ \ No newline at end of file diff --git 
a/src/core/cpu/kernels/activation/sve/qasymm8_signed.cpp b/src/core/cpu/kernels/activation/sve/qasymm8_signed.cpp new file mode 100644 index 0000000000..989f825eb9 --- /dev/null +++ b/src/core/cpu/kernels/activation/sve/qasymm8_signed.cpp @@ -0,0 +1,253 @@ +/* + * Copyright (c) 2020-2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/Window.h" +#include "src/core/NEON/wrapper/wrapper.h" + +#include +#include + +#if defined(__ARM_FEATURE_SVE2) +#include "src/core/NEON/SVEAsymm.h" +#include "src/core/NEON/SVEMath.h" +#include + +namespace arm_compute +{ +namespace cpu +{ +void qasymm8_signed_sve_activation(const ITensor *src, ITensor *dst, const ActivationLayerInfo &act_info, const Window &window) +{ + const auto window_start_x = static_cast(window.x().start()); + const auto window_end_x = static_cast(window.x().end()); + const ActivationLayerInfo::ActivationFunction act = act_info.activation(); + + Window win_collapsed = window.collapse_if_possible(window, Window::DimZ); + win_collapsed.set(Window::DimX, Window::Dimension(0, 1, 1)); + + Iterator input(src, win_collapsed); + Iterator output(dst, win_collapsed); + + const UniformQuantizationInfo qi_in = src->info()->quantization_info().uniform(); + const UniformQuantizationInfo qi_out = dst->info()->quantization_info().uniform(); + const auto va = svdup_n_s8(quantize_qasymm8_signed(act_info.a(), qi_in)); + const auto vb = svdup_n_s8(quantize_qasymm8_signed(act_info.b(), qi_in)); + const auto const_0 = quantize_qasymm8_signed(0.f, qi_in); + const auto vconst_0 = svdup_n_s8(const_0); + const auto vconst_1 = svdup_n_f32(1.f); + const auto va_f32 = svdup_n_f32(act_info.a()); + const auto vb_f32 = svdup_n_f32(act_info.b()); + const auto const_6_f32 = svdup_n_f32(6.f); + const auto const_0_f32 = svdup_n_f32(0.f); + const auto const_3_f32 = svdup_n_f32(3.f); + const auto const_inv_6_f32 = svdup_n_f32(0.166666667f); + + // Initialise scale/offset for re-quantization + bool requant = true; + if(qi_in.scale == qi_out.scale && qi_in.offset == qi_out.offset) + { + requant = false; + } + float s = qi_in.scale / qi_out.scale; + float o = -qi_in.offset * s + qi_out.offset; + auto vs = svdup_n_f32(s); + auto vo = svdup_n_f32(o); + + // Initialise scale/offset for re-quantization with int32_t + const auto voffset_in = svdup_n_s32(qi_in.offset); + int32_t s_s32 = round(s * (1 << 8), 
arm_compute::RoundingPolicy::TO_NEAREST_EVEN); + int32_t o_s32 = round(o * (1 << 8), arm_compute::RoundingPolicy::TO_NEAREST_EVEN); + const auto vs_s32 = svdup_n_s32(s_s32); + const auto vo_s32 = svdup_n_s32(o_s32); + + // Initialise scale/offset for re-quantization for leaky relu + int32_t s_leaky_s32 = round(s * act_info.a() * (1 << 8), arm_compute::RoundingPolicy::TO_NEAREST_EVEN); + int32_t o_leaky_s32 = round((-qi_in.offset * s * act_info.a() + qi_out.offset) * (1 << 8), + arm_compute::RoundingPolicy::TO_NEAREST_EVEN); + const auto vs_leaky_s32 = svdup_n_s32(s_leaky_s32); + const auto vo_leaky_s32 = svdup_n_s32(o_leaky_s32); + + execute_window_loop(win_collapsed, [&](const Coordinates &) + { + const auto input_ptr = reinterpret_cast(input.ptr()); + const auto output_ptr = reinterpret_cast(output.ptr()); + + svint8_t tmp; + + int x = window_start_x; + svbool_t pg = svwhilelt_b8(x, window_end_x); + do + { + const auto vin = svld1_s8(pg, input_ptr + x); + if(act == ActivationLayerInfo::ActivationFunction::RELU) + { + // Perform activation + tmp = svmax_s8_z(pg, vconst_0, vin); + // Re-quantize to new output space + tmp = requant ? svmla_qasymm8_signed_z(pg, tmp, vs, vo) : tmp; + } + else if(act == ActivationLayerInfo::ActivationFunction::BOUNDED_RELU) + { + // Perform activation + tmp = svmin_s8_z(pg, va, svmax_s8_z(pg, vconst_0, vin)); + // Re-quantize to new output space + tmp = requant ? svmla_qasymm8_signed_z(pg, tmp, vs, vo) : tmp; + } + else if(act == ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU) + { + // Perform activation + tmp = svmin_s8_z(pg, va, svmax_s8_z(pg, vb, vin)); + // Re-quantize to new output space + tmp = requant ? svmla_qasymm8_signed_z(pg, tmp, vs, vo) : tmp; + } + else if(act == ActivationLayerInfo::ActivationFunction::LOGISTIC) + { + // De-quantize + const auto vin_deq = svdequantize_z(pg, vin, qi_in); + // Perform activation + const svfloat32x4_t tmp_dep = + { + { { + svdiv_f32_z(pg, vconst_1, svadd_f32_z(pg, vconst_1, svexp_f32_z(pg, svneg_f32_z(pg, svget4_f32(vin_deq, 0))))), + svdiv_f32_z(pg, vconst_1, svadd_f32_z(pg, vconst_1, svexp_f32_z(pg, svneg_f32_z(pg, svget4_f32(vin_deq, 1))))), + svdiv_f32_z(pg, vconst_1, svadd_f32_z(pg, vconst_1, svexp_f32_z(pg, svneg_f32_z(pg, svget4_f32(vin_deq, 2))))), + svdiv_f32_z(pg, vconst_1, svadd_f32_z(pg, vconst_1, svexp_f32_z(pg, svneg_f32_z(pg, svget4_f32(vin_deq, 3))))), + } + } + }; + // Re-quantize to new output space + tmp = svquantize_signed_z(pg, tmp_dep, qi_out); + } + else if(act == ActivationLayerInfo::ActivationFunction::TANH) + { + // De-quantize + const auto vin_deq = svdequantize_z(pg, vin, qi_in); + // Perform activation + const svfloat32x4_t tmp_dep = + { + { { + svmul_f32_z(pg, va_f32, svtanh_f32_z(pg, svmul_f32_z(pg, svget4_f32(vin_deq, 0), vb_f32))), + svmul_f32_z(pg, va_f32, svtanh_f32_z(pg, svmul_f32_z(pg, svget4_f32(vin_deq, 1), vb_f32))), + svmul_f32_z(pg, va_f32, svtanh_f32_z(pg, svmul_f32_z(pg, svget4_f32(vin_deq, 2), vb_f32))), + svmul_f32_z(pg, va_f32, svtanh_f32_z(pg, svmul_f32_z(pg, svget4_f32(vin_deq, 3), vb_f32))), + } + } + }; + // Re-quantize to new output space + tmp = svquantize_signed_z(pg, tmp_dep, qi_out); + } + else if(act == ActivationLayerInfo::ActivationFunction::HARD_SWISH) + { + // De-quantize + const auto vin_deq = svdequantize_z(pg, vin, qi_in); + // Perform activation + const svfloat32x4_t tmp_dep = + { + { { + svmul_f32_z(pg, svget4_f32(vin_deq, 0), svmul_f32_z(pg, const_inv_6_f32, svmin_f32_z(pg, const_6_f32, svmax_f32_z(pg, const_0_f32, svadd_f32_z(pg, 
svget4_f32(vin_deq, 0), const_3_f32))))), + svmul_f32_z(pg, svget4_f32(vin_deq, 1), svmul_f32_z(pg, const_inv_6_f32, svmin_f32_z(pg, const_6_f32, svmax_f32_z(pg, const_0_f32, svadd_f32_z(pg, svget4_f32(vin_deq, 1), const_3_f32))))), + svmul_f32_z(pg, svget4_f32(vin_deq, 2), svmul_f32_z(pg, const_inv_6_f32, svmin_f32_z(pg, const_6_f32, svmax_f32_z(pg, const_0_f32, svadd_f32_z(pg, svget4_f32(vin_deq, 2), const_3_f32))))), + svmul_f32_z(pg, svget4_f32(vin_deq, 3), svmul_f32_z(pg, const_inv_6_f32, svmin_f32_z(pg, const_6_f32, svmax_f32_z(pg, const_0_f32, svadd_f32_z(pg, svget4_f32(vin_deq, 3), const_3_f32))))), + } + } + }; + // Re-quantize to new output space + tmp = svquantize_signed_z(pg, tmp_dep, qi_out); + } + else if(act == ActivationLayerInfo::ActivationFunction::LEAKY_RELU) + { + svbool_t p0, p1, p2, p3; + svint32x4_t tmp_dep; + + // Expand to int32 + const svint32x4_t vin_s32 = + { + { { + svmovlb_s32(svmovlb_s16(vin)), + svmovlt_s32(svmovlb_s16(vin)), + svmovlb_s32(svmovlt_s16(vin)), + svmovlt_s32(svmovlt_s16(vin)), + } + } + }; + + // Compare elements to input offset + if(qi_in.scale >= 0) + { + p0 = svcmplt_s32(pg, svget4_s32(vin_s32, 0), voffset_in); + p1 = svcmplt_s32(pg, svget4_s32(vin_s32, 1), voffset_in); + p2 = svcmplt_s32(pg, svget4_s32(vin_s32, 2), voffset_in); + p3 = svcmplt_s32(pg, svget4_s32(vin_s32, 3), voffset_in); + } + else + { + p0 = svcmpgt_s32(pg, svget4_s32(vin_s32, 0), voffset_in); + p1 = svcmpgt_s32(pg, svget4_s32(vin_s32, 1), voffset_in); + p2 = svcmpgt_s32(pg, svget4_s32(vin_s32, 2), voffset_in); + p3 = svcmpgt_s32(pg, svget4_s32(vin_s32, 3), voffset_in); + } + + // Multiply negative elements and requantize if necessary + if(requant) + { + tmp_dep = svcreate4_s32( + svasr_n_s32_m(pg, svmla_s32_m(pg, svsel(p0, vo_leaky_s32, vo_s32), svget4_s32(vin_s32, 0), svsel(p0, vs_leaky_s32, vs_s32)), 8), + svasr_n_s32_m(pg, svmla_s32_m(pg, svsel(p1, vo_leaky_s32, vo_s32), svget4_s32(vin_s32, 1), svsel(p1, vs_leaky_s32, vs_s32)), 8), + svasr_n_s32_m(pg, svmla_s32_m(pg, svsel(p2, vo_leaky_s32, vo_s32), svget4_s32(vin_s32, 2), svsel(p2, vs_leaky_s32, vs_s32)), 8), + svasr_n_s32_m(pg, svmla_s32_m(pg, svsel(p3, vo_leaky_s32, vo_s32), svget4_s32(vin_s32, 3), svsel(p3, vs_leaky_s32, vs_s32)), 8)); + } + else + { + tmp_dep = svcreate4_s32( + svasr_n_s32_m(p0, svmad_s32_m(p0, svget4_s32(vin_s32, 0), vs_leaky_s32, vo_leaky_s32), 8), + svasr_n_s32_m(p1, svmad_s32_m(p1, svget4_s32(vin_s32, 1), vs_leaky_s32, vo_leaky_s32), 8), + svasr_n_s32_m(p2, svmad_s32_m(p2, svget4_s32(vin_s32, 2), vs_leaky_s32, vo_leaky_s32), 8), + svasr_n_s32_m(p3, svmad_s32_m(p3, svget4_s32(vin_s32, 3), vs_leaky_s32, vo_leaky_s32), 8)); + } + + // Convert int32 vectors to int16 vectors (with saturation) + const auto v_low_s16 = svqxtnt_s32(svqxtnb_s32(svget4_s32(tmp_dep, 0)), svget4_s32(tmp_dep, 1)); + const auto v_high_s16 = svqxtnt_s32(svqxtnb_s32(svget4_s32(tmp_dep, 2)), svget4_s32(tmp_dep, 3)); + + // Convert int16 vectors to int8 vectors (with saturation) + tmp = svqxtnt_s16(svqxtnb_s16(v_low_s16), v_high_s16); + } + else + { + ARM_COMPUTE_ERROR("Unsupported activation function"); + } + + svst1_s8(pg, output_ptr + x, tmp); + + x += svcntb(); + pg = svwhilelt_b8(x, window_end_x); + + } + while(svptest_any(svptrue_b8(), pg)); + }, + input, output); +} +} // namespace cpu +} // namespace arm_compute +#endif /* defined(__ARM_FEATURE_SVE2) */ diff --git a/src/core/cpu/kernels/activation/sve/qsymm16.cpp b/src/core/cpu/kernels/activation/sve/qsymm16.cpp new file mode 100644 index 0000000000..66974875da ---
/dev/null +++ b/src/core/cpu/kernels/activation/sve/qsymm16.cpp @@ -0,0 +1,120 @@ +/* + * Copyright (c) 2020-2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/ITensorPack.h" +#include "arm_compute/core/Window.h" +#include "arm_compute/core/experimental/Types.h" + +#include +#include + +#if defined(__ARM_FEATURE_SVE2) +#include "src/core/NEON/SVEMath.h" +#include "src/core/NEON/SVESymm.h" +#include + +namespace arm_compute +{ +namespace cpu +{ +void qsymm16_sve_activation(const ITensor *src, ITensor *dst, const ActivationLayerInfo &act_info, const Window &window) +{ + const auto window_start_x = static_cast(window.x().start()); + const auto window_end_x = static_cast(window.x().end()); + const ActivationLayerInfo::ActivationFunction act = act_info.activation(); + + Window win_collapsed = window.collapse_if_possible(window, Window::DimZ); + win_collapsed.set(Window::DimX, Window::Dimension(0, 1, 1)); + + Iterator input(src, win_collapsed); + Iterator output(dst, win_collapsed); + + const UniformQuantizationInfo qi_in = src->info()->quantization_info().uniform(); + const UniformQuantizationInfo qi_out = dst->info()->quantization_info().uniform(); + const auto vconst_1 = svdup_n_f32(1.f); + const auto va_f32 = svdup_n_f32(act_info.a()); + const auto vb_f32 = svdup_n_f32(act_info.b()); + + execute_window_loop(win_collapsed, [&](const Coordinates &) + { + const auto input_ptr = reinterpret_cast(input.ptr()); + const auto output_ptr = reinterpret_cast(output.ptr()); + + svint16_t tmp; + + int x = window_start_x; + svbool_t pg = svwhilelt_b16(x, window_end_x); + do + { + const auto vin = svld1_s16(pg, input_ptr + x); + if(act == ActivationLayerInfo::ActivationFunction::LOGISTIC) + { + // De-quantize + auto vin_deq = svdequantize_qsymm16_z(pg, vin, qi_in.scale); + // Perform activation + const svfloat32x2_t tmp_dep = + { + { { + svdiv_f32_z(pg, vconst_1, svadd_f32_z(pg, vconst_1, svexp_f32_z(pg, svneg_f32_z(pg, svget2_f32(vin_deq, 0))))), + svdiv_f32_z(pg, vconst_1, svadd_f32_z(pg, vconst_1, svexp_f32_z(pg, svneg_f32_z(pg, svget2_f32(vin_deq, 1))))), + } + } + }; + // Re-quantize to new output space + tmp = svquantize_qsymm16_z(pg, tmp_dep, qi_out.scale); + } + else if(act == ActivationLayerInfo::ActivationFunction::TANH) + { + // De-quantize + auto vin_deq = svdequantize_qsymm16_z(pg, vin, qi_in.scale); + // Perform activation + const svfloat32x2_t 
tmp_dep = + { + { { + svmul_f32_z(pg, va_f32, svtanh_f32_z(pg, svmul_f32_z(pg, svget2_f32(vin_deq, 0), vb_f32))), + svmul_f32_z(pg, va_f32, svtanh_f32_z(pg, svmul_f32_z(pg, svget2_f32(vin_deq, 1), vb_f32))), + } + } + }; + // Re-quantize to new output space + tmp = svquantize_qsymm16_z(pg, tmp_dep, qi_out.scale); + } + else + { + ARM_COMPUTE_ERROR("Unsupported activation function"); + } + + svst1_s16(pg, output_ptr + x, tmp); + + x += svcnth(); + pg = svwhilelt_b16(x, window_end_x); + + } + while(svptest_any(svptrue_b16(), pg)); + }, + input, output); +} +} // namespace cpu +} // namespace arm_compute +#endif /* defined(__ARM_FEATURE_SVE2) */ diff --git a/src/cpu/CpuContext.cpp b/src/cpu/CpuContext.cpp index d62c1b6310..b9a6999f84 100644 --- a/src/cpu/CpuContext.cpp +++ b/src/cpu/CpuContext.cpp @@ -24,6 +24,7 @@ #include "src/cpu/CpuContext.h" #include "arm_compute/core/CPP/CPPTypes.h" +#include "src/cpu/CpuQueue.h" #include "src/cpu/CpuTensor.h" #include "src/runtime/CPUUtils.h" @@ -196,5 +197,10 @@ ITensorV2 *CpuContext::create_tensor(const AclTensorDescriptor &desc, bool alloc } return tensor; } + +IQueue *CpuContext::create_queue(const AclQueueOptions *options) +{ + return new CpuQueue(this, options); +} } // namespace cpu } // namespace arm_compute diff --git a/src/cpu/CpuContext.h b/src/cpu/CpuContext.h index d2062e4bdd..e909767a7b 100644 --- a/src/cpu/CpuContext.h +++ b/src/cpu/CpuContext.h @@ -69,6 +69,7 @@ public: // Inherrited methods overridden ITensorV2 *create_tensor(const AclTensorDescriptor &desc, bool allocate) override; + IQueue *create_queue(const AclQueueOptions *options) override; private: AllocatorWrapper _allocator; diff --git a/src/cpu/CpuQueue.cpp b/src/cpu/CpuQueue.cpp new file mode 100644 index 0000000000..0f0097b3f4 --- /dev/null +++ b/src/cpu/CpuQueue.cpp @@ -0,0 +1,48 @@ +/* + * Copyright (c) 2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#include "src/cpu/CpuQueue.h" + +#include "arm_compute/runtime/Scheduler.h" + +namespace arm_compute +{ +namespace cpu +{ +CpuQueue::CpuQueue(IContext *ctx, const AclQueueOptions *options) + : IQueue(ctx) +{ + ARM_COMPUTE_UNUSED(options); +} + +arm_compute::IScheduler &CpuQueue::scheduler() +{ + return arm_compute::Scheduler::get(); +} + +StatusCode CpuQueue::finish() +{ + return StatusCode::Success; +} +} // namespace cpu +} // namespace arm_compute diff --git a/src/cpu/CpuQueue.h b/src/cpu/CpuQueue.h new file mode 100644 index 0000000000..871a36c85b --- /dev/null +++ b/src/cpu/CpuQueue.h @@ -0,0 +1,56 @@ +/* + * Copyright (c) 2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#ifndef SRC_CPU_CPUQUEUE_H +#define SRC_CPU_CPUQUEUE_H + +#include "src/common/IQueue.h" + +#include "arm_compute/runtime/IScheduler.h" + +namespace arm_compute +{ +namespace cpu +{ +/** CPU queue implementation class */ +class CpuQueue final : public IQueue +{ +public: + /** Construct a new CpuQueue object + * + * @param[in] ctx Context to be used + * @param[in] options Command queue options + */ + CpuQueue(IContext *ctx, const AclQueueOptions *options); + /** Return legacy scheduler + * + * @return arm_compute::IScheduler& + */ + arm_compute::IScheduler &scheduler(); + + // Inherited functions overridden + StatusCode finish() override; +}; +} // namespace cpu +} // namespace arm_compute +#endif /* SRC_CPU_CPUQUEUE_H */ diff --git a/src/gpu/cl/ClContext.cpp b/src/gpu/cl/ClContext.cpp index 2e04e1d593..d8ef18e62e 100644 --- a/src/gpu/cl/ClContext.cpp +++ b/src/gpu/cl/ClContext.cpp @@ -23,8 +23,11 @@ */ #include "src/gpu/cl/ClContext.h" +#include "src/gpu/cl/ClQueue.h" #include "src/gpu/cl/ClTensor.h" +#include "arm_compute/core/CL/CLKernelLibrary.h" + namespace arm_compute { namespace gpu @@ -49,12 +52,15 @@ mlgo::MLGOHeuristics populate_mlgo(const char *filename) ClContext::ClContext(const AclContextOptions *options) : IContext(Target::GpuOcl), _mlgo_heuristics(), - _cl_context() + _cl_ctx(), + _cl_dev() { if(options != nullptr) { _mlgo_heuristics = populate_mlgo(options->kernel_config_file); } + _cl_ctx = CLKernelLibrary::get().context(); + _cl_dev = CLKernelLibrary::get().get_device(); } const mlgo::MLGOHeuristics &ClContext::mlgo() const @@ -64,14 +70,20 @@ const mlgo::MLGOHeuristics &ClContext::mlgo() const ::cl::Context ClContext::cl_ctx() { - return _cl_context; + return _cl_ctx; +} + +::cl::Device ClContext::cl_dev() +{ + return _cl_dev; } bool ClContext::set_cl_ctx(::cl::Context ctx) { if(this->refcount() == 0) { - _cl_context = ctx; + _cl_ctx = ctx; + CLScheduler::get().set_context(ctx); return true; } return false; @@ -86,6 +98,11 @@ ITensorV2 *ClContext::create_tensor(const AclTensorDescriptor &desc, bool alloca } return tensor; } + +IQueue *ClContext::create_queue(const AclQueueOptions *options) +{ + return new ClQueue(this, options); +} } // namespace opencl } // namespace gpu } // namespace arm_compute diff --git a/src/gpu/cl/ClContext.h b/src/gpu/cl/ClContext.h index dd6699a0c9..2a0d4ee1c8 100644 --- a/src/gpu/cl/ClContext.h +++ b/src/gpu/cl/ClContext.h @@ -44,6 +44,7 @@ public: * @param[in] options Creational options */ explicit ClContext(const AclContextOptions *options); + /** Extract MLGO heuristics * * @return Heuristics tree @@ -55,6 +56,13 @@ public: * @return the cl context used */ ::cl::Context cl_ctx(); + + /** Underlying cl device accessor + * + * @return the cl device used + */ + ::cl::Device cl_dev(); + /** Update/inject an underlying cl context object * * @warning Context will be able to set if the object doesn't have any pending reference to other objects @@ -67,10 +75,12 @@ public: // Inherrited methods overridden ITensorV2 *create_tensor(const AclTensorDescriptor &desc, bool allocate) override; + IQueue *create_queue(const AclQueueOptions *options) override; private: mlgo::MLGOHeuristics _mlgo_heuristics; - ::cl::Context _cl_context; + ::cl::Context _cl_ctx; + ::cl::Device _cl_dev; }; } // namespace opencl } // namespace gpu diff --git a/src/gpu/cl/ClQueue.cpp b/src/gpu/cl/ClQueue.cpp new file mode 100644 index 0000000000..2123adcf39 --- /dev/null +++ b/src/gpu/cl/ClQueue.cpp @@ -0,0 +1,102 @@ +/* + * Copyright (c) 2021 Arm Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "src/gpu/cl/ClQueue.h" + +#include "arm_compute/runtime/CL/CLScheduler.h" +#include "arm_compute/runtime/CL/CLTuner.h" + +namespace arm_compute +{ +namespace gpu +{ +namespace opencl +{ +namespace +{ +CLTunerMode map_tuner_mode(AclTuningMode mode) +{ + switch(mode) + { + case AclRapid: + return CLTunerMode::RAPID; + break; + case AclNormal: + return CLTunerMode::NORMAL; + break; + case AclExhaustive: + return CLTunerMode::EXHAUSTIVE; + break; + default: + ARM_COMPUTE_ERROR("Invalid tuner mode"); + break; + } +} + +std::unique_ptr populate_tuner(const AclQueueOptions *options) +{ + if(options == nullptr || options->mode == AclTuningModeNone) + { + return nullptr; + } + + CLTuningInfo tune_info; + tune_info.tuner_mode = map_tuner_mode(options->mode); + tune_info.tune_wbsm = false; + + return std::make_unique(true /* tune_new_kernels */, tune_info); +} +} // namespace + +ClQueue::ClQueue(IContext *ctx, const AclQueueOptions *options) + : IQueue(ctx), _tuner(nullptr) +{ + _tuner = populate_tuner(options); +} + +arm_compute::CLScheduler &ClQueue::scheduler() +{ + return arm_compute::CLScheduler::get(); +} + +::cl::CommandQueue ClQueue::cl_queue() +{ + return arm_compute::CLScheduler::get().queue(); +} + +bool ClQueue::set_cl_queue(::cl::CommandQueue queue) +{ + // TODO: Check queue is from the same context + arm_compute::CLScheduler::get().set_queue(queue); + return true; +} + +StatusCode ClQueue::finish() +{ + arm_compute::CLScheduler::get().queue().finish(); + return StatusCode::Success; +} + +} // namespace opencl +} // namespace gpu +} // namespace arm_compute diff --git a/src/gpu/cl/ClQueue.h b/src/gpu/cl/ClQueue.h new file mode 100644 index 0000000000..b16a0f4e83 --- /dev/null +++ b/src/gpu/cl/ClQueue.h @@ -0,0 +1,84 @@ +/* + * Copyright (c) 2021 Arm Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef SRC_GPU_CLQUEUE_H +#define SRC_GPU_CLQUEUE_H + +#include "src/common/IQueue.h" + +#include "arm_compute/runtime/CL/CLScheduler.h" + +#include <memory> + +namespace arm_compute +{ +// Forward declarations +class CLTuner; + +namespace gpu +{ +namespace opencl +{ +/** OpenCL queue implementation class */ +class ClQueue final : public IQueue +{ +public: + /** Construct a new ClQueue object + * + * @param[in] ctx Context to be used + * @param[in] options Command queue options + */ + ClQueue(IContext *ctx, const AclQueueOptions *options); + + /** Return legacy scheduler + * + * @return arm_compute::CLScheduler& + */ + arm_compute::CLScheduler &scheduler(); + + /** Underlying cl command queue accessor + * + * @return the cl command queue used + */ + ::cl::CommandQueue cl_queue(); + + /** Update/inject an underlying cl command queue object + * + * @warning Command queue needs to come from the same context as the AclQueue + * + * @param[in] queue Underlying cl command queue to be used + * + * @return true if the queue was set successfully else false + */ + bool set_cl_queue(::cl::CommandQueue queue); + + // Inherited functions overridden + StatusCode finish() override; + +private: + std::unique_ptr<CLTuner> _tuner; +}; +} // namespace opencl +} // namespace gpu +} // namespace arm_compute +#endif /* SRC_GPU_CLQUEUE_H */ diff --git a/tests/framework/Macros.h b/tests/framework/Macros.h index 23c826657d..a6ba137b59 100644 --- a/tests/framework/Macros.h +++ b/tests/framework/Macros.h @@ -224,6 +224,11 @@ #define DISABLED_FIXTURE_TEST_CASE(TEST_NAME, FIXTURE, MODE) \ FIXTURE_TEST_CASE_IMPL(TEST_NAME, FIXTURE, MODE, arm_compute::test::framework::TestCaseFactory::Status::DISABLED) +#define EMPTY_BODY_FIXTURE_TEST_CASE(TEST_NAME, FIXTURE, MODE) \ + FIXTURE_TEST_CASE(TEST_NAME, FIXTURE, MODE) \ + { \ + } + #define FIXTURE_DATA_TEST_CASE_IMPL(TEST_NAME, FIXTURE, MODE, STATUS, DATASET) \ template <typename T> \ class TEST_NAME; \ diff --git a/tests/validation/cpu/unit/Context.cpp b/tests/validation/cpu/unit/Context.cpp index 519a7bee5f..57ca866032 100644 --- a/tests/validation/cpu/unit/Context.cpp +++ b/tests/validation/cpu/unit/Context.cpp @@ -21,7 +21,7 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE.
*/ -#include "tests/validation/fixtures/UNIT/Context.h" +#include "tests/validation/fixtures/UNIT/ContextFixture.h" #include "src/cpu/CpuContext.h" @@ -74,18 +74,10 @@ TEST_CASE(CreateContextWithInvalidOptions, framework::DatasetMode::ALL) ARM_COMPUTE_ASSERT(ctx == nullptr); } -FIXTURE_TEST_CASE(DestroyInvalidContext, DestroyInvalidContextFixture, framework::DatasetMode::ALL) -{ -} -FIXTURE_TEST_CASE(SimpleContextCApi, SimpleContextCApiFixture, framework::DatasetMode::ALL) -{ -} -FIXTURE_TEST_CASE(SimpleContextCppApi, SimpleContextCppApiFixture, framework::DatasetMode::ALL) -{ -} -FIXTURE_TEST_CASE(MultipleContexts, MultipleContextsFixture, framework::DatasetMode::ALL) -{ -} +EMPTY_BODY_FIXTURE_TEST_CASE(DestroyInvalidContext, DestroyInvalidContextFixture, framework::DatasetMode::ALL) +EMPTY_BODY_FIXTURE_TEST_CASE(SimpleContextCApi, SimpleContextCApiFixture, framework::DatasetMode::ALL) +EMPTY_BODY_FIXTURE_TEST_CASE(SimpleContextCppApi, SimpleContextCppApiFixture, framework::DatasetMode::ALL) +EMPTY_BODY_FIXTURE_TEST_CASE(MultipleContexts, MultipleContextsFixture, framework::DatasetMode::ALL) /** Test-case for CpuCapabilities * diff --git a/tests/validation/cpu/unit/Queue.cpp b/tests/validation/cpu/unit/Queue.cpp new file mode 100644 index 0000000000..7d977cc48e --- /dev/null +++ b/tests/validation/cpu/unit/Queue.cpp @@ -0,0 +1,48 @@ +/* + * Copyright (c) 2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#include "tests/validation/fixtures/UNIT/QueueFixture.h" + +#include "src/cpu/CpuQueue.h" + +namespace arm_compute +{ +namespace test +{ +namespace validation +{ +TEST_SUITE(CPU) +TEST_SUITE(UNIT) +TEST_SUITE(Queue) + +EMPTY_BODY_FIXTURE_TEST_CASE(CreateQueueWithInvalidContext, CreateQueueWithInvalidContextFixture, framework::DatasetMode::ALL) +EMPTY_BODY_FIXTURE_TEST_CASE(CreateQueuerWithInvalidOptions, CreateQueuerWithInvalidOptionsFixture, framework::DatasetMode::ALL) +EMPTY_BODY_FIXTURE_TEST_CASE(DestroyInvalidQueue, DestroyInvalidQueueFixture, framework::DatasetMode::ALL) +EMPTY_BODY_FIXTURE_TEST_CASE(SimpleQueue, SimpleQueueFixture, framework::DatasetMode::ALL) + +TEST_SUITE_END() // Queue +TEST_SUITE_END() // UNIT +TEST_SUITE_END() // CPU +} // namespace validation +} // namespace test +} // namespace arm_compute diff --git a/tests/validation/cpu/unit/Tensor.cpp b/tests/validation/cpu/unit/Tensor.cpp index aa2e3abdf1..cc0c55758f 100644 --- a/tests/validation/cpu/unit/Tensor.cpp +++ b/tests/validation/cpu/unit/Tensor.cpp @@ -21,7 +21,7 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ -#include "tests/validation/fixtures/UNIT/Tensor.h" +#include "tests/validation/fixtures/UNIT/TensorFixture.h" namespace arm_compute { @@ -33,26 +33,19 @@ TEST_SUITE(CPU) TEST_SUITE(UNIT) TEST_SUITE(Tensor) -#define TENSOR_TESE_CASE(name, fixture) \ - FIXTURE_TEST_CASE(name, fixture, framework::DatasetMode::ALL) \ - { \ - } - -TENSOR_TESE_CASE(CreateTensorWithInvalidContext, CreateTensorWithInvalidContextFixture) -TENSOR_TESE_CASE(CreateTensorWithInvalidDescriptor, CreateTensorWithInvalidDescriptorFixture) -TENSOR_TESE_CASE(DestroyInvalidTensor, DestroyInvalidTensorFixture) -TENSOR_TESE_CASE(SimpleTensor, SimpleTensorFixture) -TENSOR_TESE_CASE(TensorStress, TensorStressFixture) -TENSOR_TESE_CASE(MapInvalidTensor, MapInvalidTensorFixture) -TENSOR_TESE_CASE(MapNotAllocatedTensor, MapNotAllocatedTensorFixture) -TENSOR_TESE_CASE(MapAllocatedTensor, MapAllocatedTensorFixture) -TENSOR_TESE_CASE(ImportMemory, ImportMemoryFixture) -TENSOR_TESE_CASE(GetSize, TensorSizeFixture) -TENSOR_TESE_CASE(GetInvalidSize, InvalidTensorSizeFixture) -TENSOR_TESE_CASE(GetDescriptor, DescriptorConversionFixture) -TENSOR_TESE_CASE(GetInvalidDescriptor, InvalidDescriptorConversionFixture) - -#undef TENSOR_TEST_CASE +EMPTY_BODY_FIXTURE_TEST_CASE(CreateTensorWithInvalidContext, CreateTensorWithInvalidContextFixture, framework::DatasetMode::ALL) +EMPTY_BODY_FIXTURE_TEST_CASE(CreateTensorWithInvalidDescriptor, CreateTensorWithInvalidDescriptorFixture, framework::DatasetMode::ALL) +EMPTY_BODY_FIXTURE_TEST_CASE(DestroyInvalidTensor, DestroyInvalidTensorFixture, framework::DatasetMode::ALL) +EMPTY_BODY_FIXTURE_TEST_CASE(SimpleTensor, SimpleTensorFixture, framework::DatasetMode::ALL) +EMPTY_BODY_FIXTURE_TEST_CASE(TensorStress, TensorStressFixture, framework::DatasetMode::ALL) +EMPTY_BODY_FIXTURE_TEST_CASE(MapInvalidTensor, MapInvalidTensorFixture, framework::DatasetMode::ALL) +EMPTY_BODY_FIXTURE_TEST_CASE(MapNotAllocatedTensor, MapNotAllocatedTensorFixture, framework::DatasetMode::ALL) +EMPTY_BODY_FIXTURE_TEST_CASE(MapAllocatedTensor, MapAllocatedTensorFixture, framework::DatasetMode::ALL) +EMPTY_BODY_FIXTURE_TEST_CASE(ImportMemory, ImportMemoryFixture, framework::DatasetMode::ALL) +EMPTY_BODY_FIXTURE_TEST_CASE(GetSize, TensorSizeFixture, framework::DatasetMode::ALL) +EMPTY_BODY_FIXTURE_TEST_CASE(GetInvalidSize, InvalidTensorSizeFixture, framework::DatasetMode::ALL) 
+EMPTY_BODY_FIXTURE_TEST_CASE(GetDescriptor, DescriptorConversionFixture, framework::DatasetMode::ALL) +EMPTY_BODY_FIXTURE_TEST_CASE(GetInvalidDescriptor, InvalidDescriptorConversionFixture, framework::DatasetMode::ALL) TEST_SUITE_END() // Tensor TEST_SUITE_END() // UNIT diff --git a/tests/validation/cpu/unit/TensorPack.cpp b/tests/validation/cpu/unit/TensorPack.cpp index 5436ceb0c1..f019e8e3c4 100644 --- a/tests/validation/cpu/unit/TensorPack.cpp +++ b/tests/validation/cpu/unit/TensorPack.cpp @@ -21,7 +21,7 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ -#include "tests/validation/fixtures/UNIT/TensorPack.h" +#include "tests/validation/fixtures/UNIT/TensorPackFixture.h" namespace arm_compute { @@ -33,21 +33,11 @@ TEST_SUITE(CPU) TEST_SUITE(UNIT) TEST_SUITE(TensorPack) -FIXTURE_TEST_CASE(CreateTensorPackWithInvalidContext, CreateTensorPackWithInvalidContextFixture, framework::DatasetMode::ALL) -{ -} -FIXTURE_TEST_CASE(DestroyInvalidTensorPack, DestroyInvalidTensorPackFixture, framework::DatasetMode::ALL) -{ -} -FIXTURE_TEST_CASE(AddInvalidObjectToTensorPack, AddInvalidObjectToTensorPackFixture, framework::DatasetMode::ALL) -{ -} -FIXTURE_TEST_CASE(SimpleTensorPack, SimpleTensorPackFixture, framework::DatasetMode::ALL) -{ -} -FIXTURE_TEST_CASE(MultipleTensorsInPack, MultipleTensorsInPackFixture, framework::DatasetMode::ALL) -{ -} +EMPTY_BODY_FIXTURE_TEST_CASE(CreateTensorPackWithInvalidContext, CreateTensorPackWithInvalidContextFixture, framework::DatasetMode::ALL) +EMPTY_BODY_FIXTURE_TEST_CASE(DestroyInvalidTensorPack, DestroyInvalidTensorPackFixture, framework::DatasetMode::ALL) +EMPTY_BODY_FIXTURE_TEST_CASE(AddInvalidObjectToTensorPack, AddInvalidObjectToTensorPackFixture, framework::DatasetMode::ALL) +EMPTY_BODY_FIXTURE_TEST_CASE(SimpleTensorPack, SimpleTensorPackFixture, framework::DatasetMode::ALL) +EMPTY_BODY_FIXTURE_TEST_CASE(MultipleTensorsInPack, MultipleTensorsInPackFixture, framework::DatasetMode::ALL) TEST_SUITE_END() // Tensor TEST_SUITE_END() // UNIT diff --git a/tests/validation/fixtures/UNIT/Context.h b/tests/validation/fixtures/UNIT/Context.h deleted file mode 100644 index afa49e00e0..0000000000 --- a/tests/validation/fixtures/UNIT/Context.h +++ /dev/null @@ -1,148 +0,0 @@ -/* - * Copyright (c) 2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */ -#ifndef ARM_COMPUTE_TEST_UNIT_CONTEXT -#define ARM_COMPUTE_TEST_UNIT_CONTEXT - -#include "arm_compute/Acl.hpp" -#include "tests/framework/Asserts.h" -#include "tests/framework/Fixture.h" -#include "tests/framework/Macros.h" -#include "tests/validation/Validation.h" - -namespace arm_compute -{ -namespace test -{ -namespace validation -{ -/** Test-case for AclDestroyContext - * - * Validate that AclDestroyContext behaves as expected when invalid inputs as context are given - * - * Test Steps: - * - Call AclDestroyContext with null context - * - Confirm that AclInvalidArgument is reported - * - Call AclDestroyContext on empty array - * - Confirm that AclInvalidArgument is reported - * - Call AclDestroyContext on an ACL object other than AclContext - * - Confirm that AclInvalidArgument is reported - * - Confirm that context is still nullptr - */ -template -class DestroyInvalidContextFixture : public framework::Fixture -{ -public: - void setup() - { - AclContext ctx = nullptr; - std::array empty_array{}; - AclContext valid_ctx = nullptr; - ARM_COMPUTE_ASSERT(AclCreateContext(&valid_ctx, Target, nullptr) == AclStatus::AclSuccess); - ARM_COMPUTE_ASSERT(AclDestroyContext(ctx) == AclStatus::AclInvalidArgument); - ARM_COMPUTE_ASSERT(AclDestroyContext(reinterpret_cast(empty_array.data())) == AclStatus::AclInvalidArgument); - ARM_COMPUTE_ASSERT(ctx == nullptr); - ARM_COMPUTE_ASSERT(AclDestroyContext(valid_ctx) == AclStatus::AclSuccess); - }; -}; - -/** Test-case for AclCreateContext and AclDestroyContext - * - * Validate that AclCreateContext can create and destroy a context through the C API - * - * Test Steps: - * - Call AclCreateContext with valid target - * - Confirm that context is not nullptr and error code is AclSuccess - * - Destroy context - * - Confirm that AclSuccess is reported - */ -template -class SimpleContextCApiFixture : public framework::Fixture -{ -public: - void setup() - { - AclContext ctx = nullptr; - ARM_COMPUTE_ASSERT(AclCreateContext(&ctx, Target, nullptr) == AclStatus::AclSuccess); - ARM_COMPUTE_ASSERT(ctx != nullptr); - ARM_COMPUTE_ASSERT(AclDestroyContext(ctx) == AclStatus::AclSuccess); - }; -}; - -/** Test-case for Context from the C++ interface - * - * Test Steps: - * - Create a Context obejct - * - Confirm that StatusCode::Success is reported - * - Confirm that equality operator works - * - Confirm that inequality operator works - */ -template -class SimpleContextCppApiFixture : public framework::Fixture -{ -public: - void setup() - { - acl::StatusCode status = acl::StatusCode::Success; - acl::Context ctx(Target, &status); - ARM_COMPUTE_ASSERT(status == acl::StatusCode::Success); - - auto ctx_eq = ctx; - ARM_COMPUTE_ASSERT(ctx_eq == ctx); - - acl::Context ctx_ienq(Target, &status); - ARM_COMPUTE_ASSERT(status == acl::StatusCode::Success); - ARM_COMPUTE_ASSERT(ctx_ienq != ctx); - }; -}; - -/** Test-case for multiple contexes - * - * Validate that AclCreateContext can create/destroy multiple contexts with different options - * - * Test Steps: - * - Call AclCreateContext with different targets - * - Confirm that AclSuccess is reported - * - Destroy all contexts - * - Confirm that AclSuccess is reported - */ -template -class MultipleContextsFixture : public framework::Fixture -{ -public: - void setup() - { - const unsigned int num_tests = 5; - std::array ctxs{}; - for(unsigned int i = 0; i < num_tests; ++i) - { - ARM_COMPUTE_ASSERT(AclCreateContext(&ctxs[i], Target, nullptr) == AclStatus::AclSuccess); - ARM_COMPUTE_ASSERT(ctxs[i] != nullptr); - 
ARM_COMPUTE_ASSERT(AclDestroyContext(ctxs[i]) == AclStatus::AclSuccess); - } - }; -}; -} // namespace validation -} // namespace test -} // namespace arm_compute -#endif /* ARM_COMPUTE_TEST_UNIT_CONTEXT */ diff --git a/tests/validation/fixtures/UNIT/ContextFixture.h b/tests/validation/fixtures/UNIT/ContextFixture.h new file mode 100644 index 0000000000..77cbc12320 --- /dev/null +++ b/tests/validation/fixtures/UNIT/ContextFixture.h @@ -0,0 +1,148 @@ +/* + * Copyright (c) 2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef ARM_COMPUTE_TEST_UNIT_CONTEXT_FIXTURE +#define ARM_COMPUTE_TEST_UNIT_CONTEXT_FIXTURE + +#include "arm_compute/Acl.hpp" +#include "tests/framework/Asserts.h" +#include "tests/framework/Fixture.h" +#include "tests/framework/Macros.h" +#include "tests/validation/Validation.h" + +namespace arm_compute +{ +namespace test +{ +namespace validation +{ +/** Test-case for AclDestroyContext + * + * Validate that AclDestroyContext behaves as expected when invalid inputs as context are given + * + * Test Steps: + * - Call AclDestroyContext with null context + * - Confirm that AclInvalidArgument is reported + * - Call AclDestroyContext on empty array + * - Confirm that AclInvalidArgument is reported + * - Call AclDestroyContext on an ACL object other than AclContext + * - Confirm that AclInvalidArgument is reported + * - Confirm that context is still nullptr + */ +template +class DestroyInvalidContextFixture : public framework::Fixture +{ +public: + void setup() + { + AclContext ctx = nullptr; + std::array empty_array{}; + AclContext valid_ctx = nullptr; + ARM_COMPUTE_ASSERT(AclCreateContext(&valid_ctx, Target, nullptr) == AclStatus::AclSuccess); + ARM_COMPUTE_ASSERT(AclDestroyContext(ctx) == AclStatus::AclInvalidArgument); + ARM_COMPUTE_ASSERT(AclDestroyContext(reinterpret_cast(empty_array.data())) == AclStatus::AclInvalidArgument); + ARM_COMPUTE_ASSERT(ctx == nullptr); + ARM_COMPUTE_ASSERT(AclDestroyContext(valid_ctx) == AclStatus::AclSuccess); + }; +}; + +/** Test-case for AclCreateContext and AclDestroyContext + * + * Validate that AclCreateContext can create and destroy a context through the C API + * + * Test Steps: + * - Call AclCreateContext with valid target + * - Confirm that context is not nullptr and error code is AclSuccess + * - Destroy context + * - Confirm that AclSuccess is reported + */ +template +class SimpleContextCApiFixture : public framework::Fixture +{ +public: + void 
setup() + { + AclContext ctx = nullptr; + ARM_COMPUTE_ASSERT(AclCreateContext(&ctx, Target, nullptr) == AclStatus::AclSuccess); + ARM_COMPUTE_ASSERT(ctx != nullptr); + ARM_COMPUTE_ASSERT(AclDestroyContext(ctx) == AclStatus::AclSuccess); + }; +}; + +/** Test-case for Context from the C++ interface + * + * Test Steps: + * - Create a Context obejct + * - Confirm that StatusCode::Success is reported + * - Confirm that equality operator works + * - Confirm that inequality operator works + */ +template +class SimpleContextCppApiFixture : public framework::Fixture +{ +public: + void setup() + { + acl::StatusCode status = acl::StatusCode::Success; + acl::Context ctx(Target, &status); + ARM_COMPUTE_ASSERT(status == acl::StatusCode::Success); + + auto ctx_eq = ctx; + ARM_COMPUTE_ASSERT(ctx_eq == ctx); + + acl::Context ctx_ienq(Target, &status); + ARM_COMPUTE_ASSERT(status == acl::StatusCode::Success); + ARM_COMPUTE_ASSERT(ctx_ienq != ctx); + }; +}; + +/** Test-case for multiple contexes + * + * Validate that AclCreateContext can create/destroy multiple contexts with different options + * + * Test Steps: + * - Call AclCreateContext with different targets + * - Confirm that AclSuccess is reported + * - Destroy all contexts + * - Confirm that AclSuccess is reported + */ +template +class MultipleContextsFixture : public framework::Fixture +{ +public: + void setup() + { + const unsigned int num_tests = 5; + std::array ctxs{}; + for(unsigned int i = 0; i < num_tests; ++i) + { + ARM_COMPUTE_ASSERT(AclCreateContext(&ctxs[i], Target, nullptr) == AclStatus::AclSuccess); + ARM_COMPUTE_ASSERT(ctxs[i] != nullptr); + ARM_COMPUTE_ASSERT(AclDestroyContext(ctxs[i]) == AclStatus::AclSuccess); + } + }; +}; +} // namespace validation +} // namespace test +} // namespace arm_compute +#endif /* ARM_COMPUTE_TEST_UNIT_CONTEXT_FIXTURE */ diff --git a/tests/validation/fixtures/UNIT/QueueFixture.h b/tests/validation/fixtures/UNIT/QueueFixture.h new file mode 100644 index 0000000000..bc93f5f120 --- /dev/null +++ b/tests/validation/fixtures/UNIT/QueueFixture.h @@ -0,0 +1,144 @@ +/* + * Copyright (c) 2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#ifndef ARM_COMPUTE_TEST_UNIT_QUEUE_FIXTURE +#define ARM_COMPUTE_TEST_UNIT_QUEUE_FIXTURE + +#include "arm_compute/Acl.hpp" +#include "tests/framework/Asserts.h" +#include "tests/framework/Fixture.h" +#include "tests/framework/Macros.h" +#include "tests/validation/Validation.h" + +namespace arm_compute +{ +namespace test +{ +namespace validation +{ +/** Test case for AclCreateQueue + * + * Validate that AclCreateQueue behaves as expected with invalid context + * + * Test Steps: + * - Call AclCreateQueue with an invalid context + * - Confirm that AclInvalidArgument is reported + * - Confirm that the queue is still nullptr + */ +class CreateQueueWithInvalidContextFixture : public framework::Fixture +{ +public: + void setup() + { + AclQueue queue = nullptr; + ARM_COMPUTE_ASSERT(AclCreateQueue(&queue, nullptr, nullptr) == AclStatus::AclInvalidArgument); + ARM_COMPUTE_ASSERT(queue == nullptr); + }; +}; + +/** Test-case for AclCreateQueue + * + * Validate that AclCreateQueue behaves as expected with invalid options + * + * Test Steps: + * - Call AclCreateQueue with valid context but invalid options + * - Confirm that AclInvalidArgument is reported + * - Confirm that queue is still nullptr + */ +template +class CreateQueuerWithInvalidOptionsFixture : public framework::Fixture +{ +public: + void setup() + { + acl::Context ctx(Target); + + // Check invalid tuning mode + AclQueueOptions invalid_queue_opts; + invalid_queue_opts.mode = static_cast(-1); + + AclQueue queue = nullptr; + ARM_COMPUTE_ASSERT(AclCreateQueue(&queue, ctx.get(), &invalid_queue_opts) == AclStatus::AclInvalidArgument); + ARM_COMPUTE_ASSERT(queue == nullptr); + }; +}; + +/** Test case for AclDestroyQueue +* +* Validate that AclDestroyQueue behaves as expected when an invalid queue is given +* +* Test Steps: +* - Call AclDestroyQueue with null queue +* - Confirm that AclInvalidArgument is reported +* - Call AclDestroyQueue on empty array +* - Confirm that AclInvalidArgument is reported +* - Call AclDestroyQueue on an ACL object other than AclQueue +* - Confirm that AclInvalidArgument is reported +* - Confirm that queue is still nullptr +*/ +template +class DestroyInvalidQueueFixture : public framework::Fixture +{ +public: + void setup() + { + acl::Context ctx(Target); + + std::array empty_array{}; + AclQueue queue = nullptr; + + ARM_COMPUTE_ASSERT(AclDestroyQueue(queue) == AclStatus::AclInvalidArgument); + ARM_COMPUTE_ASSERT(AclDestroyQueue(reinterpret_cast(ctx.get())) == AclStatus::AclInvalidArgument); + ARM_COMPUTE_ASSERT(AclDestroyQueue(reinterpret_cast(empty_array.data())) == AclStatus::AclInvalidArgument); + ARM_COMPUTE_ASSERT(queue == nullptr); + }; +}; + +/** Test case for AclCreateQueue + * + * Validate that a queue can be created successfully + * + * Test Steps: + * - Create a valid context + * - Create a valid queue + * - Confirm that AclSuccess is returned + */ +template +class SimpleQueueFixture : public framework::Fixture +{ +public: + void setup() + { + acl::StatusCode err = acl::StatusCode::Success; + + acl::Context ctx(Target, &err); + ARM_COMPUTE_ASSERT(err == acl::StatusCode::Success); + + acl::Queue queue(ctx, &err); + ARM_COMPUTE_ASSERT(err == acl::StatusCode::Success); + }; +}; +} // namespace validation +} // namespace test +} // namespace arm_compute +#endif /* ARM_COMPUTE_TEST_UNIT_QUEUE_FIXTURE */ diff --git a/tests/validation/fixtures/UNIT/Tensor.h b/tests/validation/fixtures/UNIT/Tensor.h deleted file mode 100644 index 32260cb431..0000000000 --- a/tests/validation/fixtures/UNIT/Tensor.h +++ /dev/null 
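Again illustrative rather than part of the patch: a sketch of requesting kernel tuning through AclQueueOptions, the struct the queue fixtures above validate. Per ClQueue.cpp earlier in this patch, AclRapid maps to CLTunerMode::RAPID, while a null options pointer or AclTuningModeNone yields no tuner. The AclGpuOcl target name, the AclEntrypoints.h include, and value-initialising the options struct are assumptions.

#include "arm_compute/AclEntrypoints.h" // assumed include carrying the C entrypoints

void run_with_rapid_tuning()
{
    AclContext ctx   = nullptr;
    AclQueue   queue = nullptr;

    // AclGpuOcl is assumed to be the OpenCL target enumerator
    if(AclCreateContext(&ctx, AclGpuOcl, nullptr) != AclStatus::AclSuccess)
    {
        return;
    }

    AclQueueOptions opts{};   // value-initialise; only the mode member is set here
    opts.mode = AclRapid;     // ClQueue then attaches a CLTuner in RAPID mode

    if(AclCreateQueue(&queue, ctx, &opts) == AclStatus::AclSuccess)
    {
        // ... kernels scheduled through this queue are tuned as new kernels appear ...
        AclDestroyQueue(queue);
    }
    AclDestroyContext(ctx);
}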
@@ -1,424 +0,0 @@ -/* - * Copyright (c) 2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#ifndef ARM_COMPUTE_TEST_UNIT_TENSOR -#define ARM_COMPUTE_TEST_UNIT_TENSOR - -#include "arm_compute/Acl.hpp" -#include "tests/framework/Asserts.h" -#include "tests/framework/Fixture.h" -#include "tests/framework/Macros.h" -#include "tests/validation/Validation.h" - -namespace arm_compute -{ -namespace test -{ -namespace validation -{ -/** Test case for AclCreateTensor - * - * Validate that AclCreateTensor behaves as expected with invalid context - * - * Test Steps: - * - Call AclCreateTensor with an invalid context - * - Confirm that AclInvalidArgument is reported - * - Confirm that the tensor is still nullptr - */ -class CreateTensorWithInvalidContextFixture : public framework::Fixture -{ -public: - void setup() - { - AclTensor tensor = nullptr; - ARM_COMPUTE_ASSERT(AclCreateTensor(&tensor, nullptr, nullptr, false) == AclStatus::AclInvalidArgument); - ARM_COMPUTE_ASSERT(tensor == nullptr); - }; -}; - -/** Test-case for AclCreateTensor - * - * Validate that AclCreateTensor behaves as expected on invalid descriptor - * - * Test Steps: - * - Call AclCreateTensor with valid context but invalid descriptor - * - Confirm that AclInvalidArgument is reported - * - Confirm that tensor is still nullptr - */ -template -class CreateTensorWithInvalidDescriptorFixture : public framework::Fixture -{ -public: - void setup() - { - acl::Context ctx(Target); - AclTensor tensor = nullptr; - ARM_COMPUTE_ASSERT(AclCreateTensor(&tensor, ctx.get(), nullptr, false) == AclStatus::AclInvalidArgument); - ARM_COMPUTE_ASSERT(tensor == nullptr); - - // Check invalid data type - AclTensorDescriptor invalid_desc; - invalid_desc.ndims = 4; - invalid_desc.data_type = static_cast(-1); - ARM_COMPUTE_ASSERT(AclCreateTensor(&tensor, ctx.get(), &invalid_desc, false) == AclStatus::AclInvalidArgument); - ARM_COMPUTE_ASSERT(tensor == nullptr); - - // Check invalid number of dimensions - invalid_desc.data_type = AclDataType::AclFloat32; - invalid_desc.ndims = 15; - ARM_COMPUTE_ASSERT(AclCreateTensor(&tensor, ctx.get(), &invalid_desc, false) == AclStatus::AclInvalidArgument); - ARM_COMPUTE_ASSERT(tensor == nullptr); - }; -}; - -/** Test case for AclDestroyTensor -* -* Validate that AclDestroyTensor behaves as expected when an invalid tensor is given -* -* Test Steps: -* - Call AclDestroyTensor with null tensor -* - Confirm that AclInvalidArgument is reported -* - 
Call AclDestroyTensor on empty array -* - Confirm that AclInvalidArgument is reported -* - Call AclDestroyTensor on an ACL object other than AclTensor -* - Confirm that AclInvalidArgument is reported -* - Confirm that tensor is still nullptr -*/ -template -class DestroyInvalidTensorFixture : public framework::Fixture -{ -public: - void setup() - { - acl::Context ctx(Target); - - std::array empty_array{}; - AclTensor tensor = nullptr; - - ARM_COMPUTE_ASSERT(AclDestroyTensor(tensor) == AclStatus::AclInvalidArgument); - ARM_COMPUTE_ASSERT(AclDestroyTensor(reinterpret_cast(ctx.get())) == AclStatus::AclInvalidArgument); - ARM_COMPUTE_ASSERT(AclDestroyTensor(reinterpret_cast(empty_array.data())) == AclStatus::AclInvalidArgument); - ARM_COMPUTE_ASSERT(tensor == nullptr); - }; -}; - -/** Test case for AclCreateTensor - * - * Validate that a tensor can be created successfully - * - * Test Steps: - * - Create a valid context - * - Create a valid tensor - * - Confirm that AclSuccess is returned - */ -template -class SimpleTensorFixture : public framework::Fixture -{ -public: - void setup() - { - acl::StatusCode err = acl::StatusCode::Success; - acl::Context ctx(Target, &err); - - ARM_COMPUTE_ASSERT(err == acl::StatusCode::Success); - acl::Tensor tensor(ctx, acl::TensorDescriptor({ 2, 3 }, acl::DataType::Float32), &err); - ARM_COMPUTE_ASSERT(err == acl::StatusCode::Success); - }; -}; - -/** Test case for AclTensor - * - * Validate that multiple tensors can be created successfully - * Possibly stress the possibility of memory leaks - * - * Test Steps: - * - Create a valid context - * - Create a lot of tensors - * - Confirm that AclSuccess is returned - */ -template -class TensorStressFixture : public framework::Fixture -{ -public: - void setup() - { - acl::StatusCode err = acl::StatusCode::Success; - - acl::Context ctx(Target, &err); - ARM_COMPUTE_ASSERT(err == acl::StatusCode::Success); - - const unsigned int num_tensors = 1024; - for(unsigned int i = 0; i < num_tensors; ++i) - { - acl::Tensor tensor(ctx, acl::TensorDescriptor({ 1024, 1024 }, acl::DataType::Float32), &err); - ARM_COMPUTE_ASSERT(err == acl::StatusCode::Success); - } - }; -}; - -/** Test case for AclMapTensor - * - * Validate that map on an invalid object fails - * - * Test Steps: - * - Create a valid context - * - Pass and invalid object for mapping - * - Confirm that AclInvalidArgument is returned - */ -template -class MapInvalidTensorFixture : public framework::Fixture -{ -public: - void setup() - { - acl::StatusCode err = acl::StatusCode::Success; - - acl::Context ctx(Target, &err); - ARM_COMPUTE_ASSERT(err == acl::StatusCode::Success); - - void *handle = nullptr; - ARM_COMPUTE_ASSERT(AclMapTensor(reinterpret_cast(ctx.get()), &handle) == AclStatus::AclInvalidArgument); - }; -}; - -/** Test case for AclMapTensor - * - * Validate that map of an unallocated pointer is nullptr - * - * Test Steps: - * - Create a valid context - * - Create a valid tensor without allocating - * - Map tensor - * - Check that mapping is nullptr - */ -template -class MapNotAllocatedTensorFixture : public framework::Fixture -{ -public: - void setup() - { - acl::StatusCode err = acl::StatusCode::Success; - - acl::Context ctx(Target, &err); - ARM_COMPUTE_ASSERT(err == acl::StatusCode::Success); - - acl::Tensor tensor(ctx, acl::TensorDescriptor({ 8, 8 }, acl::DataType::Float32), false /* allocate */, &err); - ARM_COMPUTE_ASSERT(err == acl::StatusCode::Success); - ARM_COMPUTE_ASSERT(tensor.map() == nullptr); - }; -}; - -/** Test case for AclMapTensor - * - * 
Validate that map of a valid tensor return a non-nullptr value - * - * Test Steps: - * - Create a valid context - * - Create a valid tensor while allocating - * - Map tensor - * - Check that mapping is not nullptr - */ -template -class MapAllocatedTensorFixture : public framework::Fixture -{ -public: - void setup() - { - acl::StatusCode err = acl::StatusCode::Success; - - acl::Context ctx(Target, &err); - ARM_COMPUTE_ASSERT(err == acl::StatusCode::Success); - - acl::Tensor tensor(ctx, acl::TensorDescriptor({ 8, 8 }, acl::DataType::Float32), &err); - ARM_COMPUTE_ASSERT(err == acl::StatusCode::Success); - - void *handle = tensor.map(); - ARM_COMPUTE_ASSERT(handle != nullptr); - ARM_COMPUTE_ASSERT(tensor.unmap(handle) == acl::StatusCode::Success); - }; -}; - -/** Test case for AclTensorImport - * - * Validate that an externally memory can be successfully imported - * - * Test Steps: - * - Create a valid context - * - Create a valid tensor without allocating - * - Allocate external memory - * - Import memory to the tensor - * - Check that imported pointer matches - */ -template -class ImportMemoryFixture : public framework::Fixture -{ -public: - void setup() - { - acl::StatusCode err = acl::StatusCode::Success; - - acl::Context ctx(Target, &err); - ARM_COMPUTE_ASSERT(err == acl::StatusCode::Success); - - const int32_t size = 8; - acl::Tensor tensor(ctx, acl::TensorDescriptor({ size }, acl::DataType::Float32), false /* allocate */, &err); - ARM_COMPUTE_ASSERT(err == acl::StatusCode::Success); - - std::vector data(size); - err = tensor.import(data.data(), acl::ImportType::Host); - - void *handle = tensor.map(); - ARM_COMPUTE_ASSERT(handle == data.data()); - ARM_COMPUTE_ASSERT(tensor.unmap(handle) == acl::StatusCode::Success); - } -}; -/** Test case for get_size() interface of Tensor - * - * - * Test Steps: - * - Create a valid context - * - Create a valid tensor - * - Compare the size value returned with the expected value - */ -template -class TensorSizeFixture : public framework::Fixture -{ -public: - void setup() - { - acl::StatusCode err = acl::StatusCode::Success; - acl::Context ctx(Target, &err); - - ARM_COMPUTE_ASSERT(err == acl::StatusCode::Success); - acl::Tensor tensor(ctx, acl::TensorDescriptor({ 2, 3 }, acl::DataType::Float32), &err); - - // size should be 6 elements (2x3) times 4 bytes (float32) = 24 bytes - constexpr size_t expected_size = 24; - ARM_COMPUTE_ASSERT(tensor.get_size() == expected_size); - }; -}; -/** Test case for get_size() dealing with invalid arguments - * - * Test Steps: - * - Test nullptr tensor can return a correct error - * - Create a valid tensor - * - Test C interface with null size argument can return a correct error - */ -template -class InvalidTensorSizeFixture : public framework::Fixture -{ -public: - void setup() - { - // Null tensor - AclTensor null_tensor = nullptr; - uint64_t size{ 0 }; - ARM_COMPUTE_ASSERT(AclGetTensorSize(null_tensor, &size) == AclStatus::AclInvalidArgument); - - // Create valid tensor - acl::StatusCode err = acl::StatusCode::Success; - acl::Context ctx(Target, &err); - ARM_COMPUTE_ASSERT(err == acl::StatusCode::Success); - acl::Tensor tensor(ctx, acl::TensorDescriptor({ 2, 3 }, acl::DataType::Float32), &err); - - // Null size argument - ARM_COMPUTE_ASSERT(AclGetTensorSize(tensor.get(), nullptr) == AclStatus::AclInvalidArgument); - }; -}; - -template -class DescriptorConversionFixture : public framework::Fixture -{ - bool compare_descriptor(const AclTensorDescriptor &desc_a, const AclTensorDescriptor &desc_b) - { - auto 
are_descriptors_same = true; - - are_descriptors_same &= desc_a.ndims == desc_b.ndims; - are_descriptors_same &= desc_a.data_type == desc_b.data_type; - are_descriptors_same &= desc_a.shape != nullptr && desc_b.shape != nullptr; - - for(int32_t d = 0; d < desc_a.ndims; ++d) - { - are_descriptors_same &= desc_a.shape[d] == desc_b.shape[d]; - } - - // other attributes should be added here - - return are_descriptors_same; - } - -public: - void setup() - { - auto err{ acl::StatusCode::Success }; - auto ctx{ acl::Context(Target, &err) }; - ARM_COMPUTE_ASSERT(err == acl::StatusCode::Success); - - auto desc{ acl::TensorDescriptor({ 2, 3 }, acl::DataType::Float32) }; - acl::Tensor tensor(ctx, desc, &err); - - auto desc_from_tensor = tensor.get_descriptor(); - - ARM_COMPUTE_ASSERT(compare_descriptor(*desc.get(), *desc_from_tensor.get())); - ARM_COMPUTE_ASSERT(desc == desc_from_tensor); - - // Test c interface with "prepopulated" descriptor - // Note: When c interface used, there are possibility of memory leak - // if members are not correctly deleted (e.g., shape). - // Since that is considered user's responsibility, we don't test here. - AclTensorDescriptor prepopulated_descriptor - { - 3, nullptr, AclDataType::AclBFloat16, nullptr, 0 - }; - - ARM_COMPUTE_ASSERT(AclGetTensorDescriptor(tensor.get(), &prepopulated_descriptor) == AclStatus::AclSuccess); - ARM_COMPUTE_ASSERT(compare_descriptor(*desc.get(), prepopulated_descriptor)); - ARM_COMPUTE_ASSERT(desc == acl::TensorDescriptor(prepopulated_descriptor)); - }; -}; - -template -class InvalidDescriptorConversionFixture : public framework::Fixture -{ -public: - void setup() - { - // Null tensor - AclTensor null_tensor = nullptr; - AclTensorDescriptor desc{}; - ARM_COMPUTE_ASSERT(AclGetTensorDescriptor(null_tensor, &desc) == AclStatus::AclInvalidArgument); - - // Create valid tensor - acl::StatusCode err = acl::StatusCode::Success; - acl::Context ctx(Target, &err); - ARM_COMPUTE_ASSERT(err == acl::StatusCode::Success); - acl::Tensor tensor(ctx, acl::TensorDescriptor({ 2, 3 }, acl::DataType::Float32), &err); - - // Null size argument - ARM_COMPUTE_ASSERT(AclGetTensorDescriptor(tensor.get(), nullptr) == AclStatus::AclInvalidArgument); - }; -}; -} // namespace validation -} // namespace test -} // namespace arm_compute -#endif /* ARM_COMPUTE_TEST_UNIT_TENSOR */ diff --git a/tests/validation/fixtures/UNIT/TensorFixture.h b/tests/validation/fixtures/UNIT/TensorFixture.h new file mode 100644 index 0000000000..bfe115b3ed --- /dev/null +++ b/tests/validation/fixtures/UNIT/TensorFixture.h @@ -0,0 +1,424 @@ +/* + * Copyright (c) 2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef ARM_COMPUTE_TEST_UNIT_TENSOR_FIXTURE +#define ARM_COMPUTE_TEST_UNIT_TENSOR_FIXTURE + +#include "arm_compute/Acl.hpp" +#include "tests/framework/Asserts.h" +#include "tests/framework/Fixture.h" +#include "tests/framework/Macros.h" +#include "tests/validation/Validation.h" + +namespace arm_compute +{ +namespace test +{ +namespace validation +{ +/** Test case for AclCreateTensor + * + * Validate that AclCreateTensor behaves as expected with invalid context + * + * Test Steps: + * - Call AclCreateTensor with an invalid context + * - Confirm that AclInvalidArgument is reported + * - Confirm that the tensor is still nullptr + */ +class CreateTensorWithInvalidContextFixture : public framework::Fixture +{ +public: + void setup() + { + AclTensor tensor = nullptr; + ARM_COMPUTE_ASSERT(AclCreateTensor(&tensor, nullptr, nullptr, false) == AclStatus::AclInvalidArgument); + ARM_COMPUTE_ASSERT(tensor == nullptr); + }; +}; + +/** Test-case for AclCreateTensor + * + * Validate that AclCreateTensor behaves as expected on invalid descriptor + * + * Test Steps: + * - Call AclCreateTensor with valid context but invalid descriptor + * - Confirm that AclInvalidArgument is reported + * - Confirm that tensor is still nullptr + */ +template +class CreateTensorWithInvalidDescriptorFixture : public framework::Fixture +{ +public: + void setup() + { + acl::Context ctx(Target); + AclTensor tensor = nullptr; + ARM_COMPUTE_ASSERT(AclCreateTensor(&tensor, ctx.get(), nullptr, false) == AclStatus::AclInvalidArgument); + ARM_COMPUTE_ASSERT(tensor == nullptr); + + // Check invalid data type + AclTensorDescriptor invalid_desc; + invalid_desc.ndims = 4; + invalid_desc.data_type = static_cast(-1); + ARM_COMPUTE_ASSERT(AclCreateTensor(&tensor, ctx.get(), &invalid_desc, false) == AclStatus::AclInvalidArgument); + ARM_COMPUTE_ASSERT(tensor == nullptr); + + // Check invalid number of dimensions + invalid_desc.data_type = AclDataType::AclFloat32; + invalid_desc.ndims = 15; + ARM_COMPUTE_ASSERT(AclCreateTensor(&tensor, ctx.get(), &invalid_desc, false) == AclStatus::AclInvalidArgument); + ARM_COMPUTE_ASSERT(tensor == nullptr); + }; +}; + +/** Test case for AclDestroyTensor +* +* Validate that AclDestroyTensor behaves as expected when an invalid tensor is given +* +* Test Steps: +* - Call AclDestroyTensor with null tensor +* - Confirm that AclInvalidArgument is reported +* - Call AclDestroyTensor on empty array +* - Confirm that AclInvalidArgument is reported +* - Call AclDestroyTensor on an ACL object other than AclTensor +* - Confirm that AclInvalidArgument is reported +* - Confirm that tensor is still nullptr +*/ +template +class DestroyInvalidTensorFixture : public framework::Fixture +{ +public: + void setup() + { + acl::Context ctx(Target); + + std::array empty_array{}; + AclTensor tensor = nullptr; + + ARM_COMPUTE_ASSERT(AclDestroyTensor(tensor) == AclStatus::AclInvalidArgument); + ARM_COMPUTE_ASSERT(AclDestroyTensor(reinterpret_cast(ctx.get())) == AclStatus::AclInvalidArgument); + ARM_COMPUTE_ASSERT(AclDestroyTensor(reinterpret_cast(empty_array.data())) == AclStatus::AclInvalidArgument); + ARM_COMPUTE_ASSERT(tensor == nullptr); + }; +}; + +/** Test case for AclCreateTensor + * + * Validate that a tensor can be created successfully + * + * Test 
Steps: + * - Create a valid context + * - Create a valid tensor + * - Confirm that AclSuccess is returned + */ +template +class SimpleTensorFixture : public framework::Fixture +{ +public: + void setup() + { + acl::StatusCode err = acl::StatusCode::Success; + acl::Context ctx(Target, &err); + + ARM_COMPUTE_ASSERT(err == acl::StatusCode::Success); + acl::Tensor tensor(ctx, acl::TensorDescriptor({ 2, 3 }, acl::DataType::Float32), &err); + ARM_COMPUTE_ASSERT(err == acl::StatusCode::Success); + }; +}; + +/** Test case for AclTensor + * + * Validate that multiple tensors can be created successfully + * Stress the possibility of memory leaks + * + * Test Steps: + * - Create a valid context + * - Create a lot of tensors + * - Confirm that AclSuccess is returned + */ +template +class TensorStressFixture : public framework::Fixture +{ +public: + void setup() + { + acl::StatusCode err = acl::StatusCode::Success; + + acl::Context ctx(Target, &err); + ARM_COMPUTE_ASSERT(err == acl::StatusCode::Success); + + const unsigned int num_tensors = 1024; + for(unsigned int i = 0; i < num_tensors; ++i) + { + acl::Tensor tensor(ctx, acl::TensorDescriptor({ 1024, 1024 }, acl::DataType::Float32), &err); + ARM_COMPUTE_ASSERT(err == acl::StatusCode::Success); + } + }; +}; + +/** Test case for AclMapTensor + * + * Validate that map on an invalid object fails + * + * Test Steps: + * - Create a valid context + * - Pass and invalid object for mapping + * - Confirm that AclInvalidArgument is returned + */ +template +class MapInvalidTensorFixture : public framework::Fixture +{ +public: + void setup() + { + acl::StatusCode err = acl::StatusCode::Success; + + acl::Context ctx(Target, &err); + ARM_COMPUTE_ASSERT(err == acl::StatusCode::Success); + + void *handle = nullptr; + ARM_COMPUTE_ASSERT(AclMapTensor(reinterpret_cast(ctx.get()), &handle) == AclStatus::AclInvalidArgument); + }; +}; + +/** Test case for AclMapTensor + * + * Validate that map of an unallocated pointer is nullptr + * + * Test Steps: + * - Create a valid context + * - Create a valid tensor without allocating + * - Map tensor + * - Check that mapping is nullptr + */ +template +class MapNotAllocatedTensorFixture : public framework::Fixture +{ +public: + void setup() + { + acl::StatusCode err = acl::StatusCode::Success; + + acl::Context ctx(Target, &err); + ARM_COMPUTE_ASSERT(err == acl::StatusCode::Success); + + acl::Tensor tensor(ctx, acl::TensorDescriptor({ 8, 8 }, acl::DataType::Float32), false /* allocate */, &err); + ARM_COMPUTE_ASSERT(err == acl::StatusCode::Success); + ARM_COMPUTE_ASSERT(tensor.map() == nullptr); + }; +}; + +/** Test case for AclMapTensor + * + * Validate that map of a valid tensor return a non-nullptr value + * + * Test Steps: + * - Create a valid context + * - Create a valid tensor while allocating + * - Map tensor + * - Check that mapping is not nullptr + */ +template +class MapAllocatedTensorFixture : public framework::Fixture +{ +public: + void setup() + { + acl::StatusCode err = acl::StatusCode::Success; + + acl::Context ctx(Target, &err); + ARM_COMPUTE_ASSERT(err == acl::StatusCode::Success); + + acl::Tensor tensor(ctx, acl::TensorDescriptor({ 8, 8 }, acl::DataType::Float32), &err); + ARM_COMPUTE_ASSERT(err == acl::StatusCode::Success); + + void *handle = tensor.map(); + ARM_COMPUTE_ASSERT(handle != nullptr); + ARM_COMPUTE_ASSERT(tensor.unmap(handle) == acl::StatusCode::Success); + }; +}; + +/** Test case for AclTensorImport + * + * Validate that an externally memory can be successfully imported + * + * Test Steps: + * - Create a 
valid context + * - Create a valid tensor without allocating + * - Allocate external memory + * - Import memory to the tensor + * - Check that imported pointer matches + */ +template +class ImportMemoryFixture : public framework::Fixture +{ +public: + void setup() + { + acl::StatusCode err = acl::StatusCode::Success; + + acl::Context ctx(Target, &err); + ARM_COMPUTE_ASSERT(err == acl::StatusCode::Success); + + const int32_t size = 8; + acl::Tensor tensor(ctx, acl::TensorDescriptor({ size }, acl::DataType::Float32), false /* allocate */, &err); + ARM_COMPUTE_ASSERT(err == acl::StatusCode::Success); + + std::vector data(size); + err = tensor.import(data.data(), acl::ImportType::Host); + + void *handle = tensor.map(); + ARM_COMPUTE_ASSERT(handle == data.data()); + ARM_COMPUTE_ASSERT(tensor.unmap(handle) == acl::StatusCode::Success); + } +}; +/** Test case for get_size() interface of Tensor + * + * + * Test Steps: + * - Create a valid context + * - Create a valid tensor + * - Compare the size value returned with the expected value + */ +template +class TensorSizeFixture : public framework::Fixture +{ +public: + void setup() + { + acl::StatusCode err = acl::StatusCode::Success; + acl::Context ctx(Target, &err); + + ARM_COMPUTE_ASSERT(err == acl::StatusCode::Success); + acl::Tensor tensor(ctx, acl::TensorDescriptor({ 2, 3 }, acl::DataType::Float32), &err); + + // size should be 6 elements (2x3) times 4 bytes (float32) = 24 bytes + constexpr size_t expected_size = 24; + ARM_COMPUTE_ASSERT(tensor.get_size() == expected_size); + }; +}; +/** Test case for get_size() dealing with invalid arguments + * + * Test Steps: + * - Test nullptr tensor can return a correct error + * - Create a valid tensor + * - Test C interface with null size argument can return a correct error + */ +template +class InvalidTensorSizeFixture : public framework::Fixture +{ +public: + void setup() + { + // Null tensor + AclTensor null_tensor = nullptr; + uint64_t size{ 0 }; + ARM_COMPUTE_ASSERT(AclGetTensorSize(null_tensor, &size) == AclStatus::AclInvalidArgument); + + // Create valid tensor + acl::StatusCode err = acl::StatusCode::Success; + acl::Context ctx(Target, &err); + ARM_COMPUTE_ASSERT(err == acl::StatusCode::Success); + acl::Tensor tensor(ctx, acl::TensorDescriptor({ 2, 3 }, acl::DataType::Float32), &err); + + // Null size argument + ARM_COMPUTE_ASSERT(AclGetTensorSize(tensor.get(), nullptr) == AclStatus::AclInvalidArgument); + }; +}; + +template +class DescriptorConversionFixture : public framework::Fixture +{ + bool compare_descriptor(const AclTensorDescriptor &desc_a, const AclTensorDescriptor &desc_b) + { + auto are_descriptors_same = true; + + are_descriptors_same &= desc_a.ndims == desc_b.ndims; + are_descriptors_same &= desc_a.data_type == desc_b.data_type; + are_descriptors_same &= desc_a.shape != nullptr && desc_b.shape != nullptr; + + for(int32_t d = 0; d < desc_a.ndims; ++d) + { + are_descriptors_same &= desc_a.shape[d] == desc_b.shape[d]; + } + + // other attributes should be added here + + return are_descriptors_same; + } + +public: + void setup() + { + auto err{ acl::StatusCode::Success }; + auto ctx{ acl::Context(Target, &err) }; + ARM_COMPUTE_ASSERT(err == acl::StatusCode::Success); + + auto desc{ acl::TensorDescriptor({ 2, 3 }, acl::DataType::Float32) }; + acl::Tensor tensor(ctx, desc, &err); + + auto desc_from_tensor = tensor.get_descriptor(); + + ARM_COMPUTE_ASSERT(compare_descriptor(*desc.get(), *desc_from_tensor.get())); + ARM_COMPUTE_ASSERT(desc == desc_from_tensor); + + // Test c interface with 
"prepopulated" descriptor + // Note: When c interface used, there are possibility of memory leak + // if members are not correctly deleted (e.g., shape). + // Since that is considered user's responsibility, we don't test here. + AclTensorDescriptor prepopulated_descriptor + { + 3, nullptr, AclDataType::AclBFloat16, nullptr, 0 + }; + + ARM_COMPUTE_ASSERT(AclGetTensorDescriptor(tensor.get(), &prepopulated_descriptor) == AclStatus::AclSuccess); + ARM_COMPUTE_ASSERT(compare_descriptor(*desc.get(), prepopulated_descriptor)); + ARM_COMPUTE_ASSERT(desc == acl::TensorDescriptor(prepopulated_descriptor)); + }; +}; + +template +class InvalidDescriptorConversionFixture : public framework::Fixture +{ +public: + void setup() + { + // Null tensor + AclTensor null_tensor = nullptr; + AclTensorDescriptor desc{}; + ARM_COMPUTE_ASSERT(AclGetTensorDescriptor(null_tensor, &desc) == AclStatus::AclInvalidArgument); + + // Create valid tensor + acl::StatusCode err = acl::StatusCode::Success; + acl::Context ctx(Target, &err); + ARM_COMPUTE_ASSERT(err == acl::StatusCode::Success); + acl::Tensor tensor(ctx, acl::TensorDescriptor({ 2, 3 }, acl::DataType::Float32), &err); + + // Null size argument + ARM_COMPUTE_ASSERT(AclGetTensorDescriptor(tensor.get(), nullptr) == AclStatus::AclInvalidArgument); + }; +}; +} // namespace validation +} // namespace test +} // namespace arm_compute +#endif /* ARM_COMPUTE_TEST_UNIT_TENSOR_FIXTURE */ diff --git a/tests/validation/fixtures/UNIT/TensorPack.h b/tests/validation/fixtures/UNIT/TensorPack.h deleted file mode 100644 index 98bffb1665..0000000000 --- a/tests/validation/fixtures/UNIT/TensorPack.h +++ /dev/null @@ -1,184 +0,0 @@ -/* - * Copyright (c) 2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */ -#ifndef ARM_COMPUTE_TEST_UNIT_TENSORPACK -#define ARM_COMPUTE_TEST_UNIT_TENSORPACK - -#include "arm_compute/Acl.hpp" -#include "tests/framework/Asserts.h" -#include "tests/framework/Fixture.h" -#include "tests/framework/Macros.h" -#include "tests/validation/Validation.h" - -namespace arm_compute -{ -namespace test -{ -namespace validation -{ -/** Test case for AclCreateTensorPack - * - * Validate that AclCreateTensorPack behaves as expected with invalid context - * - * Test Steps: - * - Call AclCreateTensorPack with an invalid context - * - Confirm that AclInvalidArgument is reported - * - Confirm that the tensor pack is still nullptr - */ -class CreateTensorPackWithInvalidContextFixture : public framework::Fixture -{ -public: - void setup() - { - AclTensorPack pack = nullptr; - ARM_COMPUTE_ASSERT(AclCreateTensorPack(&pack, nullptr) == AclStatus::AclInvalidArgument); - ARM_COMPUTE_ASSERT(pack == nullptr); - }; -}; - -/** Test case for AclDestroyTensorPack - * - * Validate that AclDestroyTensorPack behaves as expected when an invalid tensor pack is given - * - * Test Steps: - * - Call AclDestroyTensorPack with null tensor pack - * - Confirm that AclInvalidArgument is reported - * - Call AclDestroyTensorPack on empty array - * - Confirm that AclInvalidArgument is reported - * - Call AclDestroyTensorPack on an ACL object other than AclTensorPack - * - Confirm that AclInvalidArgument is reported - * - Confirm that tensor pack is still nullptr - */ -template -class DestroyInvalidTensorPackFixture : public framework::Fixture -{ -public: - void setup() - { - acl::Context ctx(Target); - - std::array empty_array{}; - AclTensorPack pack = nullptr; - - ARM_COMPUTE_ASSERT(AclDestroyTensorPack(pack) == AclStatus::AclInvalidArgument); - ARM_COMPUTE_ASSERT(AclDestroyTensorPack(reinterpret_cast(ctx.get())) == AclStatus::AclInvalidArgument); - ARM_COMPUTE_ASSERT(AclDestroyTensorPack(reinterpret_cast(empty_array.data())) == AclStatus::AclInvalidArgument); - ARM_COMPUTE_ASSERT(pack == nullptr); - }; -}; - -/** Test case for AclPackTensor - * - * Validate that AclPackTensor behaves as expected when an invalid is being passed for packing - * - * Test Steps: - * - Create a valid TensorPack - * - Try to pack an empty object - * - Confirm that AclInvalidArgument is reported - * - Try to pack another API object other than tensor - * - Confirm that AclInvalidArgument is reported - */ -template -class AddInvalidObjectToTensorPackFixture : public framework::Fixture -{ -public: - void setup() - { - auto err = acl::StatusCode::Success; - - acl::Context ctx(Target, &err); - ARM_COMPUTE_ASSERT(err == acl::StatusCode::Success); - - acl::TensorPack pack(ctx, &err); - ARM_COMPUTE_ASSERT(err == acl::StatusCode::Success); - - auto status = AclPackTensor(pack.get(), - reinterpret_cast(ctx.get()), - AclTensorSlot::AclSrc); - ARM_COMPUTE_ASSERT(status == AclInvalidArgument); - - status = AclPackTensor(pack.get(), nullptr, AclTensorSlot::AclSrc); - ARM_COMPUTE_ASSERT(status == AclInvalidArgument); - }; -}; - -/** Test case for AclPackTensor - * - * Validate that a tensor can be added successfully to the TensorPack - * - * Test Steps: - * - Create a valid tensor pack - * - Create a valid tensor - * - Add tensor to the tensor pack - * - Confirm that AclSuccess is returned - */ -template -class SimpleTensorPackFixture : public framework::Fixture -{ -public: - void setup() - { - acl::Context ctx(Target); - acl::TensorPack pack(ctx); - acl::Tensor t(ctx, acl::TensorDescriptor({ 3, 3, 5, 7 }, acl::DataType::Float32)); - - 
ARM_COMPUTE_ASSERT(pack.add(t, AclTensorSlot::AclSrc) == acl::StatusCode::Success); - }; -}; - -/** Test case for AclPackTensor - * - * Validate that multiple tensor can be added successfully to the TensorPack - * - * Test Steps: - * - Create a valid tensor pack - * - Create a list of valid tensors - * - Add tensors to the tensor pack - * - Confirm that AclSuccess is returned - */ -template -class MultipleTensorsInPackFixture : public framework::Fixture -{ -public: - void setup() - { - acl::Context ctx(Target); - acl::TensorPack pack(ctx); - - const acl::TensorDescriptor desc(acl::TensorDescriptor({ 3, 3, 5, 7 }, acl::DataType::Float32)); - const size_t num_tensors = 256; - - std::vector tensors; - for(unsigned int i = 0; i < num_tensors; ++i) - { - auto err = acl::StatusCode::Success; - tensors.emplace_back(acl::Tensor(ctx, desc, &err)); - ARM_COMPUTE_ASSERT(err == acl::StatusCode::Success); - ARM_COMPUTE_ASSERT(pack.add(tensors.back(), static_cast(AclTensorSlot::AclSrcVec) + i) == acl::StatusCode::Success); - } - }; -}; -} // namespace validation -} // namespace test -} // namespace arm_compute -#endif /* ARM_COMPUTE_TEST_UNIT_TENSORPACK */ diff --git a/tests/validation/fixtures/UNIT/TensorPackFixture.h b/tests/validation/fixtures/UNIT/TensorPackFixture.h new file mode 100644 index 0000000000..bc14631936 --- /dev/null +++ b/tests/validation/fixtures/UNIT/TensorPackFixture.h @@ -0,0 +1,184 @@ +/* + * Copyright (c) 2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */
+#ifndef ARM_COMPUTE_TEST_UNIT_TENSORPACK_FIXTURE
+#define ARM_COMPUTE_TEST_UNIT_TENSORPACK_FIXTURE
+
+#include "arm_compute/Acl.hpp"
+#include "tests/framework/Asserts.h"
+#include "tests/framework/Fixture.h"
+#include "tests/framework/Macros.h"
+#include "tests/validation/Validation.h"
+
+namespace arm_compute
+{
+namespace test
+{
+namespace validation
+{
+/** Test case for AclCreateTensorPack
+ *
+ * Validate that AclCreateTensorPack behaves as expected with an invalid context
+ *
+ * Test Steps:
+ * - Call AclCreateTensorPack with an invalid context
+ * - Confirm that AclInvalidArgument is reported
+ * - Confirm that the tensor pack is still nullptr
+ */
+class CreateTensorPackWithInvalidContextFixture : public framework::Fixture
+{
+public:
+    void setup()
+    {
+        AclTensorPack pack = nullptr;
+        ARM_COMPUTE_ASSERT(AclCreateTensorPack(&pack, nullptr) == AclStatus::AclInvalidArgument);
+        ARM_COMPUTE_ASSERT(pack == nullptr);
+    };
+};
+
+/** Test case for AclDestroyTensorPack
+ *
+ * Validate that AclDestroyTensorPack behaves as expected when an invalid tensor pack is given
+ *
+ * Test Steps:
+ * - Call AclDestroyTensorPack with null tensor pack
+ * - Confirm that AclInvalidArgument is reported
+ * - Call AclDestroyTensorPack on empty array
+ * - Confirm that AclInvalidArgument is reported
+ * - Call AclDestroyTensorPack on an ACL object other than AclTensorPack
+ * - Confirm that AclInvalidArgument is reported
+ * - Confirm that tensor pack is still nullptr
+ */
+template <acl::Target Target>
+class DestroyInvalidTensorPackFixture : public framework::Fixture
+{
+public:
+    void setup()
+    {
+        acl::Context ctx(Target);
+
+        std::array<char, 256> empty_array{};
+        AclTensorPack pack = nullptr;
+
+        ARM_COMPUTE_ASSERT(AclDestroyTensorPack(pack) == AclStatus::AclInvalidArgument);
+        ARM_COMPUTE_ASSERT(AclDestroyTensorPack(reinterpret_cast<AclTensorPack>(ctx.get())) == AclStatus::AclInvalidArgument);
+        ARM_COMPUTE_ASSERT(AclDestroyTensorPack(reinterpret_cast<AclTensorPack>(empty_array.data())) == AclStatus::AclInvalidArgument);
+        ARM_COMPUTE_ASSERT(pack == nullptr);
+    };
+};
+
+/** Test case for AclPackTensor
+ *
+ * Validate that AclPackTensor behaves as expected when an invalid object is passed for packing
+ *
+ * Test Steps:
+ * - Create a valid TensorPack
+ * - Try to pack an empty object
+ * - Confirm that AclInvalidArgument is reported
+ * - Try to pack an API object other than a tensor
+ * - Confirm that AclInvalidArgument is reported
+ */
+template <acl::Target Target>
+class AddInvalidObjectToTensorPackFixture : public framework::Fixture
+{
+public:
+    void setup()
+    {
+        auto err = acl::StatusCode::Success;
+
+        acl::Context ctx(Target, &err);
+        ARM_COMPUTE_ASSERT(err == acl::StatusCode::Success);
+
+        acl::TensorPack pack(ctx, &err);
+        ARM_COMPUTE_ASSERT(err == acl::StatusCode::Success);
+
+        auto status = AclPackTensor(pack.get(),
+                                    reinterpret_cast<AclTensor>(ctx.get()),
+                                    AclTensorSlot::AclSrc);
+        ARM_COMPUTE_ASSERT(status == AclInvalidArgument);
+
+        status = AclPackTensor(pack.get(), nullptr, AclTensorSlot::AclSrc);
+        ARM_COMPUTE_ASSERT(status == AclInvalidArgument);
+    };
+};
+
+/** Test case for AclPackTensor
+ *
+ * Validate that a tensor can be added successfully to the TensorPack
+ *
+ * Test Steps:
+ * - Create a valid tensor pack
+ * - Create a valid tensor
+ * - Add tensor to the tensor pack
+ * - Confirm that AclSuccess is returned
+ */
+template <acl::Target Target>
+class SimpleTensorPackFixture : public framework::Fixture
+{
+public:
+    void setup()
+    {
+        acl::Context    ctx(Target);
+        acl::TensorPack pack(ctx);
+        acl::Tensor     t(ctx, acl::TensorDescriptor({ 3, 3, 5, 7 }, acl::DataType::Float32));
+
+        ARM_COMPUTE_ASSERT(pack.add(t, AclTensorSlot::AclSrc) == acl::StatusCode::Success);
+    };
+};
+
+/** Test case for AclPackTensor
+ *
+ * Validate that multiple tensors can be added successfully to the TensorPack
+ *
+ * Test Steps:
+ * - Create a valid tensor pack
+ * - Create a list of valid tensors
+ * - Add tensors to the tensor pack
+ * - Confirm that AclSuccess is returned
+ */
+template <acl::Target Target>
+class MultipleTensorsInPackFixture : public framework::Fixture
+{
+public:
+    void setup()
+    {
+        acl::Context    ctx(Target);
+        acl::TensorPack pack(ctx);
+
+        const acl::TensorDescriptor desc(acl::TensorDescriptor({ 3, 3, 5, 7 }, acl::DataType::Float32));
+        const size_t                num_tensors = 256;
+
+        std::vector<acl::Tensor> tensors;
+        for(unsigned int i = 0; i < num_tensors; ++i)
+        {
+            auto err = acl::StatusCode::Success;
+            tensors.emplace_back(acl::Tensor(ctx, desc, &err));
+            ARM_COMPUTE_ASSERT(err == acl::StatusCode::Success);
+            ARM_COMPUTE_ASSERT(pack.add(tensors.back(), static_cast<int32_t>(AclTensorSlot::AclSrcVec) + i) == acl::StatusCode::Success);
+        }
+    };
+};
+} // namespace validation
+} // namespace test
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_TEST_UNIT_TENSORPACK_FIXTURE */
diff --git a/tests/validation/gpu/unit/Context.cpp b/tests/validation/gpu/unit/Context.cpp
index 523a0283a7..598e219fd9 100644
--- a/tests/validation/gpu/unit/Context.cpp
+++ b/tests/validation/gpu/unit/Context.cpp
@@ -21,7 +21,7 @@
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */
-#include "tests/validation/fixtures/UNIT/Context.h"
+#include "tests/validation/fixtures/UNIT/ContextFixture.h"
 
 #include "src/gpu/cl/ClContext.h"
 
@@ -37,15 +37,9 @@
 TEST_SUITE(CL)
 TEST_SUITE(UNIT)
 TEST_SUITE(Context)
-FIXTURE_TEST_CASE(SimpleContextCApi, SimpleContextCApiFixture, framework::DatasetMode::ALL)
-{
-}
-FIXTURE_TEST_CASE(SimpleContextCppApi, SimpleContextCppApiFixture, framework::DatasetMode::ALL)
-{
-}
-FIXTURE_TEST_CASE(MultipleContexts, MultipleContextsFixture, framework::DatasetMode::ALL)
-{
-}
+EMPTY_BODY_FIXTURE_TEST_CASE(SimpleContextCApi, SimpleContextCApiFixture, framework::DatasetMode::ALL)
+EMPTY_BODY_FIXTURE_TEST_CASE(SimpleContextCppApi, SimpleContextCppApiFixture, framework::DatasetMode::ALL)
+EMPTY_BODY_FIXTURE_TEST_CASE(MultipleContexts, MultipleContextsFixture, framework::DatasetMode::ALL)
 
 /** Test-case for MLGO kernel configuration file
  *
diff --git a/tests/validation/gpu/unit/Queue.cpp b/tests/validation/gpu/unit/Queue.cpp
new file mode 100644
index 0000000000..8154a7954f
--- /dev/null
+++ b/tests/validation/gpu/unit/Queue.cpp
@@ -0,0 +1,81 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "tests/validation/fixtures/UNIT/QueueFixture.h"
+
+#include "arm_compute/AclOpenClExt.h"
+#include "src/gpu/cl/ClQueue.h"
+
+namespace arm_compute
+{
+namespace test
+{
+namespace validation
+{
+TEST_SUITE(CL)
+TEST_SUITE(UNIT)
+TEST_SUITE(Queue)
+
+EMPTY_BODY_FIXTURE_TEST_CASE(CreateQueueWithInvalidContext, CreateQueueWithInvalidContextFixture, framework::DatasetMode::ALL)
+EMPTY_BODY_FIXTURE_TEST_CASE(CreateQueuerWithInvalidOptions, CreateQueuerWithInvalidOptionsFixture, framework::DatasetMode::ALL)
+EMPTY_BODY_FIXTURE_TEST_CASE(DestroyInvalidQueue, DestroyInvalidQueueFixture, framework::DatasetMode::ALL)
+EMPTY_BODY_FIXTURE_TEST_CASE(SimpleQueue, SimpleQueueFixture, framework::DatasetMode::ALL)
+
+TEST_CASE(KhrQueuePriorities, framework::DatasetMode::ALL)
+{
+    acl::StatusCode err = acl::StatusCode::Success;
+
+    acl::Context ctx(acl::Target::GpuOcl, &err);
+    ARM_COMPUTE_ASSERT(err == acl::StatusCode::Success);
+
+    acl::Queue queue(ctx, &err);
+    ARM_COMPUTE_ASSERT(err == acl::StatusCode::Success);
+
+    cl_device_id cl_dev;
+    auto         status = AclGetClDevice(ctx.get(), &cl_dev);
+    ARM_COMPUTE_ASSERT(status == AclSuccess);
+
+    std::string extensions = cl::Device(cl_dev).getInfo<CL_DEVICE_EXTENSIONS>();
+    if(extensions.find("cl_khr_priority_hints") != std::string::npos)
+    {
+        cl_int error = CL_SUCCESS;
+
+        cl_context cl_ctx;
+        auto       status = AclGetClContext(ctx.get(), &cl_ctx);
+        ARM_COMPUTE_ASSERT(status == AclSuccess);
+
+        /* Check a queue with high priority */
+        cl_queue_properties queue_properties[] = { CL_QUEUE_PRIORITY_KHR, CL_QUEUE_PRIORITY_HIGH_KHR, 0 };
+        cl_command_queue    priority_queue     = clCreateCommandQueueWithProperties(cl_ctx, cl_dev, queue_properties, &error);
+        ARM_COMPUTE_ASSERT(error == CL_SUCCESS);
+
+        clReleaseCommandQueue(priority_queue);
+    }
+}
+
+TEST_SUITE_END() // Queue
+TEST_SUITE_END() // UNIT
+TEST_SUITE_END() // CL
+} // namespace validation
+} // namespace test
+} // namespace arm_compute
diff --git a/tests/validation/gpu/unit/Tensor.cpp b/tests/validation/gpu/unit/Tensor.cpp
index b40d6264f5..18102733ce 100644
--- a/tests/validation/gpu/unit/Tensor.cpp
+++ b/tests/validation/gpu/unit/Tensor.cpp
@@ -21,7 +21,7 @@
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */
-#include "tests/validation/fixtures/UNIT/Tensor.h"
+#include "tests/validation/fixtures/UNIT/TensorFixture.h"
 
 namespace arm_compute
 {
@@ -33,24 +33,17 @@
 TEST_SUITE(CL)
 TEST_SUITE(UNIT)
 TEST_SUITE(Tensor)
-#define TENSOR_TESE_CASE(name, fixture)                            \
-    FIXTURE_TEST_CASE(name, fixture, framework::DatasetMode::ALL) \
-    {                                                              \
-    }
-
-TENSOR_TESE_CASE(CreateTensorWithInvalidContext, CreateTensorWithInvalidContextFixture)
-TENSOR_TESE_CASE(CreateTensorWithInvalidDescriptor, CreateTensorWithInvalidDescriptorFixture)
-TENSOR_TESE_CASE(DestroyInvalidTensor, DestroyInvalidTensorFixture)
-TENSOR_TESE_CASE(SimpleTensor, SimpleTensorFixture)
-TENSOR_TESE_CASE(TensorStress, TensorStressFixture)
-TENSOR_TESE_CASE(MapInvalidTensor, MapInvalidTensorFixture)
-TENSOR_TESE_CASE(MapAllocatedTensor, MapAllocatedTensorFixture)
-TENSOR_TESE_CASE(GetSize, TensorSizeFixture)
-TENSOR_TESE_CASE(GetInvalidSize, InvalidTensorSizeFixture)
-TENSOR_TESE_CASE(GetDescriptor, DescriptorConversionFixture)
-TENSOR_TESE_CASE(GetInvalidDescriptor, InvalidDescriptorConversionFixture)
-
-#undef TENSOR_TEST_CASE
+EMPTY_BODY_FIXTURE_TEST_CASE(CreateTensorWithInvalidContext, CreateTensorWithInvalidContextFixture, framework::DatasetMode::ALL)
+EMPTY_BODY_FIXTURE_TEST_CASE(CreateTensorWithInvalidDescriptor, CreateTensorWithInvalidDescriptorFixture, framework::DatasetMode::ALL)
+EMPTY_BODY_FIXTURE_TEST_CASE(DestroyInvalidTensor, DestroyInvalidTensorFixture, framework::DatasetMode::ALL)
+EMPTY_BODY_FIXTURE_TEST_CASE(SimpleTensor, SimpleTensorFixture, framework::DatasetMode::ALL)
+EMPTY_BODY_FIXTURE_TEST_CASE(TensorStress, TensorStressFixture, framework::DatasetMode::ALL)
+EMPTY_BODY_FIXTURE_TEST_CASE(MapInvalidTensor, MapInvalidTensorFixture, framework::DatasetMode::ALL)
+EMPTY_BODY_FIXTURE_TEST_CASE(MapAllocatedTensor, MapAllocatedTensorFixture, framework::DatasetMode::ALL)
+EMPTY_BODY_FIXTURE_TEST_CASE(GetSize, TensorSizeFixture, framework::DatasetMode::ALL)
+EMPTY_BODY_FIXTURE_TEST_CASE(GetInvalidSize, InvalidTensorSizeFixture, framework::DatasetMode::ALL)
+EMPTY_BODY_FIXTURE_TEST_CASE(GetDescriptor, DescriptorConversionFixture, framework::DatasetMode::ALL)
+EMPTY_BODY_FIXTURE_TEST_CASE(GetInvalidDescriptor, InvalidDescriptorConversionFixture, framework::DatasetMode::ALL)
 
 TEST_SUITE_END() // Tensor
 TEST_SUITE_END() // UNIT
diff --git a/tests/validation/gpu/unit/TensorPack.cpp b/tests/validation/gpu/unit/TensorPack.cpp
index b057db44ae..b62426d056 100644
--- a/tests/validation/gpu/unit/TensorPack.cpp
+++ b/tests/validation/gpu/unit/TensorPack.cpp
@@ -21,7 +21,7 @@
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */
-#include "tests/validation/fixtures/UNIT/TensorPack.h"
+#include "tests/validation/fixtures/UNIT/TensorPackFixture.h"
 
 namespace arm_compute
 {
@@ -33,21 +33,11 @@
 TEST_SUITE(CL)
 TEST_SUITE(UNIT)
 TEST_SUITE(TensorPack)
-FIXTURE_TEST_CASE(CreateTensorPackWithInvalidContext, CreateTensorPackWithInvalidContextFixture, framework::DatasetMode::ALL)
-{
-}
-FIXTURE_TEST_CASE(DestroyInvalidTensorPack, DestroyInvalidTensorPackFixture, framework::DatasetMode::ALL)
-{
-}
-FIXTURE_TEST_CASE(AddInvalidObjectToTensorPack, AddInvalidObjectToTensorPackFixture, framework::DatasetMode::ALL)
-{
-}
-FIXTURE_TEST_CASE(SimpleTensorPack, SimpleTensorPackFixture, framework::DatasetMode::ALL)
-{
-}
-FIXTURE_TEST_CASE(MultipleTensorsInPack, MultipleTensorsInPackFixture, framework::DatasetMode::ALL)
-{
-}
+EMPTY_BODY_FIXTURE_TEST_CASE(CreateTensorPackWithInvalidContext, CreateTensorPackWithInvalidContextFixture, framework::DatasetMode::ALL)
+EMPTY_BODY_FIXTURE_TEST_CASE(DestroyInvalidTensorPack, DestroyInvalidTensorPackFixture, framework::DatasetMode::ALL)
+EMPTY_BODY_FIXTURE_TEST_CASE(AddInvalidObjectToTensorPack, AddInvalidObjectToTensorPackFixture, framework::DatasetMode::ALL)
+EMPTY_BODY_FIXTURE_TEST_CASE(SimpleTensorPack, SimpleTensorPackFixture, framework::DatasetMode::ALL)
+EMPTY_BODY_FIXTURE_TEST_CASE(MultipleTensorsInPack, MultipleTensorsInPackFixture, framework::DatasetMode::ALL)
 
 TEST_SUITE_END() // Tensor
 TEST_SUITE_END() // UNIT
-- 
cgit v1.2.1
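
Note on the EMPTY_BODY_FIXTURE_TEST_CASE macro used in the hunks above: it replaces the repeated FIXTURE_TEST_CASE(name, fixture, mode) { } pattern that each test file previously spelled out (or wrapped in a local helper such as TENSOR_TESE_CASE). As a minimal sketch only, assuming the framework keeps FIXTURE_TEST_CASE semantics unchanged, the shared macro presumably expands along these lines; the actual definition in the test framework may differ:

    // Hypothetical expansion: declare a fixture-backed test case with an empty body,
    // so the fixture's setup() performs all of the checks.
    #define EMPTY_BODY_FIXTURE_TEST_CASE(TEST_NAME, FIXTURE, MODE) \
        FIXTURE_TEST_CASE(TEST_NAME, FIXTURE, MODE)                \
        {                                                          \
        }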