From bdcdc39d89b6a6556f5c0483af5379f75eae0c55 Mon Sep 17 00:00:00 2001 From: Georgios Pinitas Date: Thu, 22 Apr 2021 16:42:03 +0100 Subject: Enable fat binary support Changes our build system to allow building both Neon(TM) and SVE kernels and package them in the same binary. This will allow runtime selection of the underlying architecture. Adds new build option, fat_binary, for enabling this feature. Change-Id: I8e8386149773ce28e071a2fb7ddd8c8ae0f28a4a Signed-off-by: Michalis Spyrou Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/5704 Tested-by: Arm Jenkins Reviewed-by: Georgios Pinitas Comments-Addressed: Arm Jenkins --- Android.bp | 8 +- SConscript | 171 ++++----- SConstruct | 13 + filelist.json | 288 +++++++++++++++ scripts/clang_tidy_rules.py | 2 + src/core/NEON/SVEMath.h | 4 +- src/core/NEON/SVEMath.inl | 4 +- .../kernels/NEBatchNormalizationLayerKernel.cpp | 7 +- .../kernels/batchnormalization/impl/SVE/fp16.cpp | 4 +- .../kernels/batchnormalization/impl/SVE/fp32.cpp | 4 +- src/core/NEON/wrapper/intrinsics/svpow.h | 10 +- src/core/NEON/wrapper/svtraits.h | 4 +- src/core/NEON/wrapper/traits.h | 8 +- src/core/common/Registrars.h | 70 +++- src/core/cpu/kernels/CpuActivationKernel.cpp | 12 +- src/core/cpu/kernels/CpuAddKernel.cpp | 10 +- src/core/cpu/kernels/CpuElementwiseKernel.cpp | 63 ++-- src/core/cpu/kernels/CpuElementwiseUnaryKernel.cpp | 6 +- src/core/cpu/kernels/CpuScaleKernel.cpp | 19 +- src/core/cpu/kernels/CpuSoftmaxKernel.cpp | 19 +- src/core/cpu/kernels/activation/sve/fp16.cpp | 4 +- src/core/cpu/kernels/activation/sve/fp32.cpp | 4 +- src/core/cpu/kernels/add/sve/impl.cpp | 137 ++++++++ src/core/cpu/kernels/add/sve/impl.h | 40 +++ src/core/cpu/kernels/add/sve/integer.cpp | 6 +- src/core/cpu/kernels/add/sve/list.h | 97 +----- .../cpu/kernels/elementwise/sve/elementwise.cpp | 309 ++++++++++++++++ .../cpu/kernels/elementwise/sve/elementwise_list.h | 265 ++------------ .../elementwise/sve/elementwise_quantized_list.h | 19 +- .../kernels/elementwise/sve/elementwise_unary.cpp | 111 ++++++ .../elementwise/sve/elementwise_unary_list.h | 78 +---- src/core/cpu/kernels/floor/NEON/fp16.cpp | 64 ---- src/core/cpu/kernels/floor/NEON/fp32.cpp | 61 ---- src/core/cpu/kernels/floor/neon/fp16.cpp | 64 ++++ src/core/cpu/kernels/floor/neon/fp32.cpp | 61 ++++ src/core/cpu/kernels/scale/sve/fp16.cpp | 8 +- src/core/cpu/kernels/scale/sve/fp32.cpp | 4 +- src/core/cpu/kernels/scale/sve/integer.cpp | 7 +- src/core/cpu/kernels/scale/sve/qasymm8.cpp | 11 +- src/core/cpu/kernels/scale/sve/qasymm8_signed.cpp | 11 +- src/core/cpu/kernels/softmax/impl/NEON/list.h | 388 --------------------- src/core/cpu/kernels/softmax/impl/SVE/list.h | 353 ------------------- src/core/cpu/kernels/softmax/impl/neon/list.h | 388 +++++++++++++++++++++ src/core/cpu/kernels/softmax/impl/sve/impl.cpp | 185 ++++++++++ src/core/cpu/kernels/softmax/impl/sve/list.h | 223 ++++++++++++ tests/validation/NEON/ActivationLayer.cpp | 24 +- tests/validation/NEON/ArithmeticAddition.cpp | 6 +- 47 files changed, 2120 insertions(+), 1534 deletions(-) create mode 100644 filelist.json create mode 100644 src/core/cpu/kernels/add/sve/impl.cpp create mode 100644 src/core/cpu/kernels/add/sve/impl.h create mode 100644 src/core/cpu/kernels/elementwise/sve/elementwise.cpp create mode 100644 src/core/cpu/kernels/elementwise/sve/elementwise_unary.cpp delete mode 100644 src/core/cpu/kernels/floor/NEON/fp16.cpp delete mode 100644 src/core/cpu/kernels/floor/NEON/fp32.cpp create mode 100644 src/core/cpu/kernels/floor/neon/fp16.cpp create mode 
100644 src/core/cpu/kernels/floor/neon/fp32.cpp delete mode 100644 src/core/cpu/kernels/softmax/impl/NEON/list.h delete mode 100644 src/core/cpu/kernels/softmax/impl/SVE/list.h create mode 100644 src/core/cpu/kernels/softmax/impl/neon/list.h create mode 100644 src/core/cpu/kernels/softmax/impl/sve/impl.cpp create mode 100644 src/core/cpu/kernels/softmax/impl/sve/list.h diff --git a/Android.bp b/Android.bp index d1003f2d7d..cbb58b1ce9 100644 --- a/Android.bp +++ b/Android.bp @@ -314,12 +314,15 @@ cc_library_static { "src/core/cpu/kernels/add/neon/qasymm8.cpp", "src/core/cpu/kernels/add/neon/qasymm8_signed.cpp", "src/core/cpu/kernels/add/neon/qsymm16.cpp", + "src/core/cpu/kernels/add/sve/impl.cpp", "src/core/cpu/kernels/add/sve/integer.cpp", "src/core/cpu/kernels/add/sve/qasymm8.cpp", "src/core/cpu/kernels/add/sve/qasymm8_signed.cpp", "src/core/cpu/kernels/add/sve/qsymm16.cpp", - "src/core/cpu/kernels/floor/NEON/fp16.cpp", - "src/core/cpu/kernels/floor/NEON/fp32.cpp", + "src/core/cpu/kernels/elementwise/sve/elementwise.cpp", + "src/core/cpu/kernels/elementwise/sve/elementwise_unary.cpp", + "src/core/cpu/kernels/floor/neon/fp16.cpp", + "src/core/cpu/kernels/floor/neon/fp32.cpp", "src/core/cpu/kernels/internal/CpuPool2dAssemblyWrapperKernel.cpp", "src/core/cpu/kernels/pooling/neon/fp16.cpp", "src/core/cpu/kernels/pooling/neon/fp32.cpp", @@ -335,6 +338,7 @@ cc_library_static { "src/core/cpu/kernels/scale/sve/integer.cpp", "src/core/cpu/kernels/scale/sve/qasymm8.cpp", "src/core/cpu/kernels/scale/sve/qasymm8_signed.cpp", + "src/core/cpu/kernels/softmax/impl/sve/impl.cpp", "src/core/cpu/kernels/sub/neon/integer.cpp", "src/core/cpu/kernels/sub/neon/qasymm8.cpp", "src/core/cpu/kernels/sub/neon/qasymm8_signed.cpp", diff --git a/SConscript b/SConscript index 143823d013..94ba6d423f 100644 --- a/SConscript +++ b/SConscript @@ -26,6 +26,7 @@ import subprocess import zlib import base64 import string +import json VERSION = "v0.0-unreleased" LIBRARY_VERSION_MAJOR = 23 @@ -38,13 +39,20 @@ Import('vars') Import('install_lib') def build_bootcode_objs(sources): - arm_compute_env.Append(ASFLAGS = "-I bootcode/") obj = arm_compute_env.Object(sources) obj = install_lib(obj) Default(obj) return obj +def build_sve_objs(sources): + tmp_env = arm_compute_env.Clone() + tmp_env.Append(CXXFLAGS = "-march=armv8.2-a+sve+fp16") + obj = tmp_env.SharedObject(sources) + obj = install_lib(obj) + Default(obj) + return obj + def build_library(name, build_env, sources, static=False, libs=[]): if static: obj = build_env.StaticLibrary(name, source=sources, LIBS = arm_compute_env["LIBS"] + libs) @@ -172,6 +180,9 @@ arm_compute_env.Append(CPPPATH =[Dir("./src/core/").path] ) arm_compute_env.Append(LIBS = ['dl']) +with (open(Dir('#').path + '/filelist.json')) as fp: + filelist = json.load(fp) + core_files = Glob('src/core/*.cpp') core_files += Glob('src/core/CPP/*.cpp') core_files += Glob('src/core/CPP/kernels/*.cpp') @@ -189,25 +200,14 @@ runtime_files += Glob('src/runtime/CPP/ICPPSimpleFunction.cpp') runtime_files += Glob('src/runtime/CPP/functions/*.cpp') # C API files -c_api_files = ['src/c/AclContext.cpp', - 'src/c/AclQueue.cpp', - 'src/c/AclTensor.cpp', - 'src/c/AclTensorPack.cpp', - 'src/c/AclVersion.cpp', - ] +runtime_files += filelist['c_api']['cpu'] + if env['opencl']: - c_api_files += ['src/c/cl/AclOpenClExt.cpp'] + runtime_files += filelist['c_api']['gpu'] # Common backend files -common_backend_files = ['src/common/utils/LegacySupport.cpp', - 'src/common/AllocatorWrapper.cpp', - 'src/common/ITensorV2.cpp', - 
'src/common/TensorPack.cpp', - ] - -core_files += common_backend_files -runtime_files += c_api_files -# CLHarrisCorners uses the Scheduler to run CPP kernels +core_files += filelist['common'] + runtime_files += Glob('src/runtime/CPP/SingleThreadScheduler.cpp') graph_files = Glob('src/graph/*.cpp') @@ -231,9 +231,7 @@ if env['opencl']: ] core_files += cl_kernel_hp_files core_files += Glob('src/core/CL/*.cpp') - core_files += Glob('src/core/CL/kernels/*.cpp') core_files += Glob('src/core/gpu/cl/*.cpp') - core_files += Glob('src/core/gpu/cl/kernels/*.cpp') runtime_files += Glob('src/runtime/CL/*.cpp') runtime_files += Glob('src/runtime/CL/functions/*.cpp') @@ -245,10 +243,12 @@ if env['opencl']: runtime_files += Glob('src/runtime/CL/gemm_auto_heuristics/*.cpp') runtime_files += Glob('src/gpu/cl/*.cpp') - graph_files += Glob('src/graph/backends/CL/*.cpp') + core_files += filelist['gpu']['core']['kernels']['high_priority'] + filelist['gpu']['core']['kernels']['all'] +sve_o = [] +core_files_sve = [] if env['neon']: core_files += Glob('src/core/NEON/*.cpp') core_files += Glob('src/core/NEON/kernels/*.cpp') @@ -277,107 +277,47 @@ if env['neon']: if env['estate'] == '64': core_files += Glob('src/core/NEON/kernels/arm_gemm/kernels/a64_*/*.cpp') core_files += Glob('src/core/NEON/kernels/arm_conv/pooling/kernels/a64_*/*.cpp') - if "sve" in env['arch']: - core_files += Glob('src/core/NEON/kernels/arm_gemm/kernels/sve_*/*.cpp') - core_files += Glob('src/core/NEON/kernels/arm_conv/pooling/kernels/sve_*/*.cpp') + if "sve" in env['arch'] or env['fat_binary']: + core_files_sve += filelist['cpu']['core']['sve']['all'] + core_files_sve += Glob('src/core/NEON/kernels/arm_gemm/kernels/sve_*/*.cpp') + core_files_sve += Glob('src/core/NEON/kernels/arm_conv/pooling/kernels/sve_*/*.cpp') + + if any(i in env['data_layout_support'] for i in ['all', 'nchw']): + core_files += filelist['cpu']['core']['neon']['nchw'] if any(i in env['data_type_support'] for i in ['all', 'fp16']): - core_files += Glob('src/core/NEON/kernels/*/impl/*/fp16.cpp') + if not "sve" in env['arch'] or env['fat_binary']: + core_files += filelist['cpu']['core']['neon']['fp16'] + if "sve" in env['arch'] or env['fat_binary']: + core_files_sve += filelist['cpu']['core']['sve']['fp16'] if any(i in env['data_type_support'] for i in ['all', 'fp32']): - core_files += Glob('src/core/NEON/kernels/*/impl/*/fp32.cpp') + if not "sve" in env['arch'] or env['fat_binary']: + core_files += filelist['cpu']['core']['neon']['fp32'] + if "sve" in env['arch'] or env['fat_binary']: + core_files_sve += filelist['cpu']['core']['sve']['fp32'] if any(i in env['data_type_support'] for i in ['all', 'qasymm8']): - core_files += Glob('src/core/NEON/kernels/*/impl/*/qasymm8.cpp') + core_files += filelist['cpu']['core']['neon']['qasymm8'] + core_files_sve += filelist['cpu']['core']['sve']['qasymm8'] if any(i in env['data_type_support'] for i in ['all', 'qasymm8_signed']): - core_files += Glob('src/core/NEON/kernels/*/impl/*/qasymm8_signed.cpp') + core_files += filelist['cpu']['core']['neon']['qasymm8_signed'] + core_files_sve += filelist['cpu']['core']['sve']['qasymm8_signed'] if any(i in env['data_type_support'] for i in ['all', 'qsymm16']): - core_files += Glob('src/core/NEON/kernels/*/impl/*/qsymm16.cpp') + core_files += filelist['cpu']['core']['neon']['qsymm16'] + core_files_sve += filelist['cpu']['core']['sve']['qsymm16'] if any(i in env['data_type_support'] for i in ['all', 'integer']): - core_files += Glob('src/core/NEON/kernels/*/impl/*/integer.cpp') + if not "sve" in 
env['arch'] or env['fat_binary']: + core_files += filelist['cpu']['core']['neon']['integer'] + if "sve" in env['arch'] or env['fat_binary']: + core_files_sve += filelist['cpu']['core']['sve']['integer'] + + core_files += Glob('src/core/cpu/kernels/*/*.cpp') + core_files += filelist['cpu']['core']['kernels']['high_priority'] + filelist['cpu']['core']['kernels']['all'] runtime_files += Glob('src/runtime/NEON/*.cpp') runtime_files += Glob('src/runtime/NEON/functions/*.cpp') runtime_files += Glob('src/runtime/NEON/functions/assembly/*.cpp') - - cpu_kernel_hp_files = ['src/core/cpu/kernels/CpuActivationKernel.cpp', - 'src/core/cpu/kernels/CpuCastKernel.cpp', - 'src/core/cpu/kernels/CpuDepthwiseConv2dNativeKernel.cpp', - 'src/core/cpu/kernels/CpuDirectConv2dKernel.cpp', - 'src/core/cpu/kernels/CpuDirectConv2dOutputStageKernel.cpp', - 'src/core/cpu/kernels/CpuPermuteKernel.cpp', - 'src/core/cpu/kernels/CpuPool2dKernel.cpp', - 'src/core/cpu/kernels/CpuReshapeKernel.cpp', - ] - cpu_kernel_files = ['src/core/cpu/kernels/CpuAddKernel.cpp', - 'src/core/cpu/kernels/CpuConcatenateBatchKernel.cpp', - 'src/core/cpu/kernels/CpuConcatenateDepthKernel.cpp', - 'src/core/cpu/kernels/CpuConcatenateHeightKernel.cpp', - 'src/core/cpu/kernels/CpuConcatenateWidthKernel.cpp', - 'src/core/cpu/kernels/CpuConvertFullyConnectedWeightsKernel.cpp', - 'src/core/cpu/kernels/CpuCopyKernel.cpp', - 'src/core/cpu/kernels/CpuDequantizeKernel.cpp', - 'src/core/cpu/kernels/CpuElementwiseKernel.cpp', - 'src/core/cpu/kernels/CpuElementwiseUnaryKernel.cpp', - 'src/core/cpu/kernels/CpuFillKernel.cpp', - 'src/core/cpu/kernels/CpuFloorKernel.cpp', - 'src/core/cpu/kernels/CpuMulKernel.cpp', - 'src/core/cpu/kernels/CpuQuantizeKernel.cpp', - 'src/core/cpu/kernels/CpuScaleKernel.cpp', - 'src/core/cpu/kernels/CpuSoftmaxKernel.cpp', - 'src/core/cpu/kernels/CpuSubKernel.cpp', - 'src/core/cpu/kernels/CpuTransposeKernel.cpp', - ] - core_files += [cpu_kernel_hp_files, cpu_kernel_files] - - core_files += Glob('src/core/cpu/kernels/*/*.cpp') - if any(i in env['data_type_support'] for i in ['all', 'fp16']): - core_files += Glob('src/core/cpu/kernels/*/*/fp16.cpp') - if any(i in env['data_type_support'] for i in ['all', 'fp32']): - core_files += Glob('src/core/cpu/kernels/*/*/fp32.cpp') - if any(i in env['data_type_support'] for i in ['all', 'qasymm8']): - core_files += Glob('src/core/cpu/kernels/*/*/qasymm8.cpp') - if any(i in env['data_type_support'] for i in ['all', 'qasymm8_signed']): - core_files += Glob('src/core/cpu/kernels/*/*/qasymm8_signed.cpp') - if any(i in env['data_type_support'] for i in ['all', 'qsymm16']): - core_files += Glob('src/core/cpu/kernels/*/*/qsymm16.cpp') - if any(i in env['data_type_support'] for i in ['all', 'integer']): - core_files += Glob('src/core/cpu/kernels/*/*/integer.cpp') - - if any(i in env['data_layout_support'] for i in ['all', 'nchw']): - core_files += Glob('src/core/cpu/kernels/*/*/nchw/all.cpp') - - cpu_rt_files = ['src/cpu/CpuContext.cpp', - 'src/cpu/CpuQueue.cpp', - 'src/cpu/CpuTensor.cpp' - ] - cpu_operator_hp_files = ['src/runtime/cpu/operators/CpuActivation.cpp', - 'src/runtime/cpu/operators/CpuCast.cpp', - 'src/runtime/cpu/operators/CpuDepthwiseConv2d.cpp', - 'src/runtime/cpu/operators/CpuDepthwiseConv2dAssemblyDispatch.cpp', - 'src/runtime/cpu/operators/CpuDirectConv2d.cpp', - 'src/runtime/cpu/operators/CpuFlatten.cpp', - 'src/runtime/cpu/operators/CpuPermute.cpp', - 'src/runtime/cpu/operators/CpuPool2d.cpp', - ] - cpu_operator_files = ['src/runtime/cpu/operators/CpuAdd.cpp', - 
'src/runtime/cpu/operators/CpuConcatenate.cpp', - 'src/runtime/cpu/operators/CpuConvertFullyConnectedWeights.cpp', - 'src/runtime/cpu/operators/CpuCopy.cpp', - 'src/runtime/cpu/operators/CpuDequantize.cpp', - 'src/runtime/cpu/operators/CpuElementwise.cpp', - 'src/runtime/cpu/operators/CpuElementwiseUnary.cpp', - 'src/runtime/cpu/operators/CpuFill.cpp', - 'src/runtime/cpu/operators/CpuFloor.cpp', - 'src/runtime/cpu/operators/CpuMul.cpp', - 'src/runtime/cpu/operators/CpuQuantize.cpp', - 'src/runtime/cpu/operators/CpuReshape.cpp', - 'src/runtime/cpu/operators/CpuScale.cpp', - 'src/runtime/cpu/operators/CpuSoftmax.cpp', - 'src/runtime/cpu/operators/CpuSub.cpp', - 'src/runtime/cpu/operators/CpuTranspose.cpp', - 'src/runtime/cpu/operators/CpuGemmDirectConv2d.cpp', - ] - cpu_internal_operator_files = ['src/runtime/cpu/operators/internal/CpuGemmAssemblyDispatch.cpp',] - runtime_files += [ cpu_rt_files, cpu_operator_hp_files, cpu_operator_files, cpu_internal_operator_files ] + runtime_files += filelist['cpu']['runtime']['all'] + filelist['cpu']['runtime']['operators']['high_priority'] \ + + filelist['cpu']['runtime']['operators']['all'] + filelist['cpu']['runtime']['operators']['internal'] bootcode_o = [] if env['os'] == 'bare_metal': @@ -385,11 +325,18 @@ if env['os'] == 'bare_metal': bootcode_o = build_bootcode_objs(bootcode_files) Export('bootcode_o') -arm_compute_core_a = build_library('arm_compute_core-static', arm_compute_env, core_files, static=True) +if (env['fat_binary']): + sve_o = build_sve_objs(core_files_sve) + arm_compute_core_a = build_library('arm_compute_core-static', arm_compute_env, core_files + sve_o, static=True) +else: + arm_compute_core_a = build_library('arm_compute_core-static', arm_compute_env, core_files + core_files_sve, static=True) Export('arm_compute_core_a') if env['os'] != 'bare_metal' and not env['standalone']: - arm_compute_core_so = build_library('arm_compute_core', arm_compute_env, core_files, static=False) + if (env['fat_binary']): + arm_compute_core_so = build_library('arm_compute_core', arm_compute_env, core_files + sve_o, static=False) + else: + arm_compute_core_so = build_library('arm_compute_core', arm_compute_env, core_files + core_files_sve, static=False) Export('arm_compute_core_so') arm_compute_a = build_library('arm_compute-static', arm_compute_env, runtime_files, static=True, libs = [ arm_compute_core_a ]) diff --git a/SConstruct b/SConstruct index d36fbab275..f800d9d105 100644 --- a/SConstruct +++ b/SConstruct @@ -53,6 +53,7 @@ vars.AddVariables( BoolVariable("examples", "Build example programs", True), BoolVariable("gemm_tuner", "Build gemm_tuner programs", True), BoolVariable("Werror", "Enable/disable the -Werror compilation flag", True), + BoolVariable("fat_binary", "Build fat binary version of library. 
Note works only for armv8.2-a", False), BoolVariable("standalone", "Builds the tests as standalone executables, links statically with libgcc, libstdc++ and libarm_compute", False), BoolVariable("opencl", "Enable OpenCL support", True), BoolVariable("neon", "Enable Arm® Neon™ support", False), @@ -255,6 +256,11 @@ if 'x86' not in env['arch']: elif env['os'] == 'tizen': prefix = "aarch64-tizen-linux-gnu-" +if 'sve' in env['arch']: + env.Append(CXXFLAGS = ['-DENABLE_SVE']) +else: + env.Append(CXXFLAGS = ['-DENABLE_NEON']) + if env['build'] == 'native': prefix = "" @@ -298,6 +304,13 @@ if not GetOption("help"): if not version_at_least(compiler_ver, '7.0.0') and env['os'] == 'bare_metal': env.Append(LINKFLAGS = ['-fstack-protector-strong']) +if env['fat_binary']: + if env['arch'] != 'armv8.2-a': + print("Currently fat binary is only supported with armv8.2-a") + Exit(1) + env.Append(CXXFLAGS = ['-DENABLE_SVE']) + env.Append(CXXFLAGS = ['-DENABLE_NEON']) + if env['data_type_support']: if any(i in env['data_type_support'] for i in ['all', 'fp16']): env.Append(CXXFLAGS = ['-DENABLE_FP16_KERNELS']) diff --git a/filelist.json b/filelist.json new file mode 100644 index 0000000000..d84a350a82 --- /dev/null +++ b/filelist.json @@ -0,0 +1,288 @@ +{ + "common" : [ + "src/common/utils/LegacySupport.cpp", + "src/common/AllocatorWrapper.cpp", + "src/common/ITensorV2.cpp", + "src/common/TensorPack.cpp" + ], + "c_api" : + { + "cpu": [ + "src/c/AclContext.cpp", + "src/c/AclQueue.cpp", + "src/c/AclTensor.cpp", + "src/c/AclTensorPack.cpp", + "src/c/AclVersion.cpp" + ], + "gpu": [ + "src/c/cl/AclOpenClExt.cpp" + ] + }, + + "gpu" : + { + "core" : + { + "kernels" : + { + "high_priority" : [ + "src/core/gpu/cl/kernels/ClActivationKernel.cpp", + "src/core/gpu/cl/kernels/ClDirectConv2dKernel.cpp", + "src/core/gpu/cl/kernels/ClPermuteKernel.cpp", + "src/core/gpu/cl/kernels/ClPool2dKernel.cpp", + "src/core/gpu/cl/kernels/ClReshapeKernel.cpp" + ], + "all" : [ + "src/core/gpu/cl/kernels/ClBatchConcatenateKernel.cpp", + "src/core/gpu/cl/kernels/ClCastKernel.cpp", + "src/core/gpu/cl/kernels/ClConvertFullyConnectedWeightsKernel.cpp", + "src/core/gpu/cl/kernels/ClCopyKernel.cpp", + "src/core/gpu/cl/kernels/ClCropKernel.cpp", + "src/core/gpu/cl/kernels/ClDepthConcatenateKernel.cpp", + "src/core/gpu/cl/kernels/ClDequantizeKernel.cpp", + "src/core/gpu/cl/kernels/ClElementwiseKernel.cpp", + "src/core/gpu/cl/kernels/ClElementwiseUnaryKernel.cpp", + "src/core/gpu/cl/kernels/ClFillKernel.cpp", + "src/core/gpu/cl/kernels/ClFloorKernel.cpp", + "src/core/gpu/cl/kernels/ClGemmMatrixMultiplyNativeKernel.cpp", + "src/core/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedKernel.cpp", + "src/core/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedOnlyRhsKernel.cpp", + "src/core/gpu/cl/kernels/ClGemmMatrixMultiplyKernel.cpp", + "src/core/gpu/cl/kernels/ClGemmReshapeRhsMatrixKernel.cpp", + "src/core/gpu/cl/kernels/ClGemmReshapeLhsMatrixKernel.cpp", + "src/core/gpu/cl/kernels/ClHeightConcatenateKernel.cpp", + "src/core/gpu/cl/kernels/ClMulKernel.cpp", + "src/core/gpu/cl/kernels/ClQuantizeKernel.cpp", + "src/core/gpu/cl/kernels/ClScaleKernel.cpp", + "src/core/gpu/cl/kernels/ClSoftmaxKernel.cpp", + "src/core/gpu/cl/kernels/ClTransposeKernel.cpp", + "src/core/gpu/cl/kernels/ClWidthConcatenate2TensorsKernel.cpp", + "src/core/gpu/cl/kernels/ClWidthConcatenate4TensorsKernel.cpp", + "src/core/gpu/cl/kernels/ClWidthConcatenateKernel.cpp", + "src/core/CL/kernels/CLGenerateProposalsLayerKernel.cpp", + "src/core/CL/kernels/CLArgMinMaxLayerKernel.cpp", + 
"src/core/CL/kernels/CLBatchNormalizationLayerKernel.cpp", + "src/core/CL/kernels/CLBatchToSpaceLayerKernel.cpp", + "src/core/CL/kernels/CLBitwiseKernel.cpp", + "src/core/CL/kernels/CLBoundingBoxTransformKernel.cpp", + "src/core/CL/kernels/CLChannelShuffleLayerKernel.cpp", + "src/core/CL/kernels/CLCol2ImKernel.cpp", + "src/core/CL/kernels/CLComparisonKernel.cpp", + "src/core/CL/kernels/CLDeconvolutionLayerUpsampleKernel.cpp", + "src/core/CL/kernels/CLDeconvolutionReshapeOutputKernel.cpp", + "src/core/CL/kernels/CLDepthToSpaceLayerKernel.cpp", + "src/core/CL/kernels/CLDepthwiseConvolutionLayer3x3NCHWKernel.cpp", + "src/core/CL/kernels/CLDepthwiseConvolutionLayer3x3NHWCKernel.cpp", + "src/core/CL/kernels/CLDepthwiseConvolutionLayerNativeKernel.cpp", + "src/core/CL/kernels/CLFFTDigitReverseKernel.cpp", + "src/core/CL/kernels/CLFFTRadixStageKernel.cpp", + "src/core/CL/kernels/CLFFTScaleKernel.cpp", + "src/core/CL/kernels/CLFillBorderKernel.cpp", + "src/core/CL/kernels/CLFuseBatchNormalizationKernel.cpp", + "src/core/CL/kernels/CLGatherKernel.cpp", + "src/core/CL/kernels/CLGEMMLowpMatrixMultiplyNativeKernel.cpp", + "src/core/CL/kernels/CLGEMMLowpMatrixMultiplyReshapedKernel.cpp", + "src/core/CL/kernels/CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel.cpp", + "src/core/CL/kernels/CLGEMMLowpOffsetContributionKernel.cpp", + "src/core/CL/kernels/CLGEMMLowpOffsetContributionOutputStageKernel.cpp", + "src/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ScaleByFixedPointKernel.cpp", + "src/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ScaleByFloatKernel.cpp", + "src/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ScaleKernel.cpp", + "src/core/CL/kernels/CLGEMMLowpReductionKernel.cpp", + "src/core/CL/kernels/CLIm2ColKernel.cpp", + "src/core/CL/kernels/CLInstanceNormalizationLayerKernel.cpp", + "src/core/CL/kernels/CLL2NormalizeLayerKernel.cpp", + "src/core/CL/kernels/CLMaxUnpoolingLayerKernel.cpp", + "src/core/CL/kernels/CLMeanStdDevNormalizationKernel.cpp", + "src/core/CL/kernels/CLMinMaxLayerKernel.cpp", + "src/core/CL/kernels/CLNormalizationLayerKernel.cpp", + "src/core/CL/kernels/CLNormalizePlanarYUVLayerKernel.cpp", + "src/core/CL/kernels/CLPadLayerKernel.cpp", + "src/core/CL/kernels/CLPriorBoxLayerKernel.cpp", + "src/core/CL/kernels/CLQLSTMLayerNormalizationKernel.cpp", + "src/core/CL/kernels/CLRangeKernel.cpp", + "src/core/CL/kernels/CLReductionOperationKernel.cpp", + "src/core/CL/kernels/CLRemapKernel.cpp", + "src/core/CL/kernels/CLReorgLayerKernel.cpp", + "src/core/CL/kernels/CLReverseKernel.cpp", + "src/core/CL/kernels/CLROIAlignLayerKernel.cpp", + "src/core/CL/kernels/CLROIPoolingLayerKernel.cpp", + "src/core/CL/kernels/CLSelectKernel.cpp", + "src/core/CL/kernels/CLSpaceToBatchLayerKernel.cpp", + "src/core/CL/kernels/CLSpaceToDepthLayerKernel.cpp", + "src/core/CL/kernels/CLStackLayerKernel.cpp", + "src/core/CL/kernels/CLStridedSliceKernel.cpp", + "src/core/CL/kernels/CLTileKernel.cpp", + "src/core/CL/kernels/CLWeightsReshapeKernel.cpp", + "src/core/CL/kernels/CLWinogradFilterTransformKernel.cpp", + "src/core/CL/kernels/CLWinogradInputTransformKernel.cpp", + "src/core/CL/kernels/CLWinogradOutputTransformKernel.cpp" + ] + } + } + }, + "cpu" : + { + "runtime" : + { + "all" : [ + "src/cpu/CpuContext.cpp", + "src/cpu/CpuQueue.cpp", + "src/cpu/CpuTensor.cpp" + ], + "operators" : + { + "high_priority" : [ + "src/runtime/cpu/operators/CpuActivation.cpp", + "src/runtime/cpu/operators/CpuDepthwiseConv2d.cpp", + "src/runtime/cpu/operators/CpuDepthwiseConv2dAssemblyDispatch.cpp", + 
"src/runtime/cpu/operators/CpuDirectConv2d.cpp", + "src/runtime/cpu/operators/CpuPermute.cpp", + "src/runtime/cpu/operators/CpuPool2d.cpp" + ], + "internal" : [ + "src/runtime/cpu/operators/internal/CpuGemmAssemblyDispatch.cpp" + ], + "all" : [ + "src/runtime/cpu/operators/CpuAdd.cpp", + "src/runtime/cpu/operators/CpuCast.cpp", + "src/runtime/cpu/operators/CpuConcatenate.cpp", + "src/runtime/cpu/operators/CpuConvertFullyConnectedWeights.cpp", + "src/runtime/cpu/operators/CpuCopy.cpp", + "src/runtime/cpu/operators/CpuDequantize.cpp", + "src/runtime/cpu/operators/CpuElementwise.cpp", + "src/runtime/cpu/operators/CpuElementwiseUnary.cpp", + "src/runtime/cpu/operators/CpuFill.cpp", + "src/runtime/cpu/operators/CpuFlatten.cpp", + "src/runtime/cpu/operators/CpuFloor.cpp", + "src/runtime/cpu/operators/CpuGemmDirectConv2d.cpp", + "src/runtime/cpu/operators/CpuMul.cpp", + "src/runtime/cpu/operators/CpuQuantize.cpp", + "src/runtime/cpu/operators/CpuReshape.cpp", + "src/runtime/cpu/operators/CpuScale.cpp", + "src/runtime/cpu/operators/CpuSoftmax.cpp", + "src/runtime/cpu/operators/CpuSub.cpp", + "src/runtime/cpu/operators/CpuTranspose.cpp" + ] + } + }, + "core" : + { + "kernels" : + { + "high_priority" : [ + "src/core/cpu/kernels/CpuActivationKernel.cpp", + "src/core/cpu/kernels/CpuDepthwiseConv2dNativeKernel.cpp", + "src/core/cpu/kernels/CpuDirectConv2dKernel.cpp", + "src/core/cpu/kernels/CpuDirectConv2dOutputStageKernel.cpp", + "src/core/cpu/kernels/CpuPermuteKernel.cpp", + "src/core/cpu/kernels/CpuPool2dKernel.cpp", + "src/core/cpu/kernels/CpuReshapeKernel.cpp" + ], + "all" : [ + "src/core/cpu/kernels/CpuAddKernel.cpp", + "src/core/cpu/kernels/CpuCastKernel.cpp", + "src/core/cpu/kernels/CpuConcatenateBatchKernel.cpp", + "src/core/cpu/kernels/CpuConcatenateDepthKernel.cpp", + "src/core/cpu/kernels/CpuConcatenateHeightKernel.cpp", + "src/core/cpu/kernels/CpuConcatenateWidthKernel.cpp", + "src/core/cpu/kernels/CpuConvertFullyConnectedWeightsKernel.cpp", + "src/core/cpu/kernels/CpuCopyKernel.cpp", + "src/core/cpu/kernels/CpuDequantizeKernel.cpp", + "src/core/cpu/kernels/CpuElementwiseKernel.cpp", + "src/core/cpu/kernels/CpuElementwiseUnaryKernel.cpp", + "src/core/cpu/kernels/CpuFillKernel.cpp", + "src/core/cpu/kernels/CpuFloorKernel.cpp", + "src/core/cpu/kernels/CpuMulKernel.cpp", + "src/core/cpu/kernels/CpuQuantizeKernel.cpp", + "src/core/cpu/kernels/CpuScaleKernel.cpp", + "src/core/cpu/kernels/CpuSoftmaxKernel.cpp", + "src/core/cpu/kernels/CpuSubKernel.cpp", + "src/core/cpu/kernels/CpuTransposeKernel.cpp" + ] + }, + + "sve" : + { + "all" : [ + "src/core/cpu/kernels/add/sve/impl.cpp", + "src/core/cpu/kernels/softmax/impl/sve/impl.cpp", + "src/core/cpu/kernels/elementwise/sve/elementwise.cpp", + "src/core/cpu/kernels/elementwise/sve/elementwise_unary.cpp" + ], + "fp32" : [ + "src/core/cpu/kernels/activation/sve/fp32.cpp", + "src/core/cpu/kernels/scale/sve/fp32.cpp", + "src/core/NEON/kernels/batchnormalization/impl/SVE/fp32.cpp" + ], + "fp16" : [ + "src/core/cpu/kernels/activation/sve/fp16.cpp", + "src/core/cpu/kernels/scale/sve/fp16.cpp", + "src/core/NEON/kernels/batchnormalization/impl/SVE/fp16.cpp" + ], + "qsymm16" : [ + "src/core/cpu/kernels/activation/sve/qsymm16.cpp", + "src/core/cpu/kernels/add/sve/qsymm16.cpp" + ], + "qasymm8" : [ + "src/core/cpu/kernels/activation/sve/qasymm8.cpp", + "src/core/cpu/kernels/add/sve/qasymm8.cpp", + "src/core/cpu/kernels/scale/sve/qasymm8.cpp" + ], + "qasymm8_signed" : [ + "src/core/cpu/kernels/activation/sve/qasymm8_signed.cpp", + 
"src/core/cpu/kernels/add/sve/qasymm8_signed.cpp", + "src/core/cpu/kernels/scale/sve/qasymm8_signed.cpp" + ], + "integer" : [ + "src/core/cpu/kernels/add/sve/integer.cpp", + "src/core/cpu/kernels/scale/sve/integer.cpp" + ] + }, + + "neon": + { + "nchw" : [ + "src/core/cpu/kernels/pooling/neon/nchw/all.cpp" + ], + "fp32" : [ + "src/core/cpu/kernels/activation/neon/fp32.cpp", + "src/core/cpu/kernels/floor/neon/fp32.cpp", + "src/core/cpu/kernels/pooling/neon/fp32.cpp", + "src/core/NEON/kernels/batchnormalization/impl/NEON/fp32.cpp" + ], + "fp16" : [ + "src/core/cpu/kernels/activation/neon/fp16.cpp", + "src/core/cpu/kernels/floor/neon/fp16.cpp", + "src/core/cpu/kernels/pooling/neon/fp16.cpp", + "src/core/cpu/kernels/scale/neon/fp16.cpp", + "src/core/NEON/kernels/batchnormalization/impl/NEON/fp16.cpp" + ], + "qsymm16" : [ + "src/core/cpu/kernels/activation/neon/qsymm16.cpp", + "src/core/cpu/kernels/add/neon/qsymm16.cpp", + "src/core/cpu/kernels/sub/neon/qsymm16.cpp" + + ], + "qasymm8" : [ + "src/core/cpu/kernels/activation/neon/qasymm8.cpp", + "src/core/cpu/kernels/add/neon/qasymm8.cpp", + "src/core/cpu/kernels/pooling/neon/qasymm8.cpp", + "src/core/cpu/kernels/scale/neon/qasymm8.cpp", + "src/core/cpu/kernels/sub/neon/qasymm8.cpp" + ], + "qasymm8_signed" : [ + "src/core/cpu/kernels/activation/neon/qasymm8_signed.cpp", + "src/core/cpu/kernels/add/neon/qasymm8_signed.cpp", + "src/core/cpu/kernels/pooling/neon/qasymm8_signed.cpp", + "src/core/cpu/kernels/scale/neon/qasymm8_signed.cpp", + "src/core/cpu/kernels/sub/neon/qasymm8_signed.cpp" + ], + "integer" : [ + "src/core/cpu/kernels/sub/neon/integer.cpp", + "src/core/cpu/kernels/add/neon/integer.cpp" + ] + } + } + } +} \ No newline at end of file diff --git a/scripts/clang_tidy_rules.py b/scripts/clang_tidy_rules.py index 649d9343bb..8ab7c13a7c 100755 --- a/scripts/clang_tidy_rules.py +++ b/scripts/clang_tidy_rules.py @@ -63,6 +63,7 @@ def filter_clang_tidy_lines( lines ): ("Utils.h" in line and "no member named 'map' in 'arm_compute::Tensor'" in line) or ("CPUUtils.cpp" in line and "'asm/hwcap.h' file not found" in line) or ("CPUUtils.cpp" in line and "use of undeclared identifier 'HWCAP_SVE'" in line) or + ("sve" in line) or ("'arm_compute_version.embed' file not found" in line) ): print_context=False continue @@ -115,6 +116,7 @@ def filter_clang_tidy_lines( lines ): ("CPUUtils.cpp" in line and "parameter 'cpusv' is unused" in line) or ("CPUUtils.cpp" in line and "warning: uninitialized record type" in line) or ("Utils.h" in line and "warning: Use of zero-allocated memory" in line) or + ("sve" in line) or ("CpuDepthwiseConv2dNativeKernel.cpp" in line and "misc-non-private-member-variables-in-classes" in line)): # This is to prevent false positive, should be reassessed with the newer clang-tidy print_context=False continue diff --git a/src/core/NEON/SVEMath.h b/src/core/NEON/SVEMath.h index b73043a435..dde75e8088 100644 --- a/src/core/NEON/SVEMath.h +++ b/src/core/NEON/SVEMath.h @@ -24,7 +24,7 @@ #ifndef ARM_COMPUTE_SVEMATH_H #define ARM_COMPUTE_SVEMATH_H -#if defined(__ARM_FEATURE_SVE) +#if defined(ENABLE_SVE) #include "src/core/NEON/wrapper/intrinsics/svcvt.h" #include "src/core/NEON/wrapper/intrinsics/svdup_n.h" #include "src/core/NEON/wrapper/intrinsics/svreinterpret.h" @@ -185,5 +185,5 @@ int_vec_type convert_float_to_int(const svfloat32_t &in_0, const svfloat32_t &in } // namespace arm_compute #include "src/core/NEON/SVEMath.inl" -#endif /* defined(__ARM_FEATURE_SVE) */ +#endif /* defined(ENABLE_SVE) */ #endif /* ARM_COMPUTE_SVEMATH_H */ \ 
No newline at end of file diff --git a/src/core/NEON/SVEMath.inl b/src/core/NEON/SVEMath.inl index d909adfeb5..7625e5be34 100644 --- a/src/core/NEON/SVEMath.inl +++ b/src/core/NEON/SVEMath.inl @@ -24,7 +24,7 @@ #include #include -#if defined(__ARM_FEATURE_SVE) +#if defined(__ARM_FEATURE_SVE) && defined(ENABLE_SVE) #ifndef M_PI #define M_PI (3.14159265358979323846) @@ -388,4 +388,4 @@ inline svint8_t convert_float_to_int(const svfloat32_t &in_0, const sv #endif /* defined(__ARM_FEATURE_SVE2) */ } // namespace arm_compute -#endif /* defined(__ARM_FEATURE_SVE) */ +#endif /* defined(ENABLE_SVE) */ diff --git a/src/core/NEON/kernels/NEBatchNormalizationLayerKernel.cpp b/src/core/NEON/kernels/NEBatchNormalizationLayerKernel.cpp index 1691943b07..92000bb2f6 100644 --- a/src/core/NEON/kernels/NEBatchNormalizationLayerKernel.cpp +++ b/src/core/NEON/kernels/NEBatchNormalizationLayerKernel.cpp @@ -63,7 +63,7 @@ struct BatchNormalizationKernel static const BatchNormalizationKernel available_kernels[] = { -#if defined(__ARM_FEATURE_SVE) +#if defined(ENABLE_SVE) { "fp16_sve_batch_normalization", [](const BatchNormalizationSelectorData & data) { return data.dt == DataType::F16; }, @@ -74,7 +74,8 @@ static const BatchNormalizationKernel available_kernels[] = [](const BatchNormalizationSelectorData & data) { return data.dt == DataType::F32; }, REGISTER_FP32_SVE(arm_compute::cpu::fp32_sve_batch_normalization) }, -#else /* !defined(__ARM_FEATURE_SVE) */ +#endif /* !defined(ENABLE_SVE) */ +#if defined(ENABLE_NEON) #if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) { "fp16_neon_batch_normalization", @@ -87,7 +88,7 @@ static const BatchNormalizationKernel available_kernels[] = [](const BatchNormalizationSelectorData & data) { return data.dt == DataType::F32; }, REGISTER_FP32_NEON(arm_compute::cpu::fp32_neon_batch_normalization) }, -#endif /* !defined(__ARM_FEATURE_SVE) */ +#endif /* !defined(ENABLE_NEON) */ }; const BatchNormalizationKernel *get_implementation(const BatchNormalizationSelectorData &data) diff --git a/src/core/NEON/kernels/batchnormalization/impl/SVE/fp16.cpp b/src/core/NEON/kernels/batchnormalization/impl/SVE/fp16.cpp index 3e3e81d044..a715b9d3ee 100644 --- a/src/core/NEON/kernels/batchnormalization/impl/SVE/fp16.cpp +++ b/src/core/NEON/kernels/batchnormalization/impl/SVE/fp16.cpp @@ -29,7 +29,7 @@ #include #include -#if defined(__ARM_FEATURE_SVE) +#if defined(ENABLE_SVE) #include namespace arm_compute @@ -114,4 +114,4 @@ void fp16_sve_batch_normalization(ITensor *src, ITensor *dst, const ITensor *mea } } // namespace cpu } // namespace arm_compute -#endif // __ARM_FEATURE_SVE +#endif // ENABLE_SVE diff --git a/src/core/NEON/kernels/batchnormalization/impl/SVE/fp32.cpp b/src/core/NEON/kernels/batchnormalization/impl/SVE/fp32.cpp index b0d4cbb684..7cc570d8aa 100644 --- a/src/core/NEON/kernels/batchnormalization/impl/SVE/fp32.cpp +++ b/src/core/NEON/kernels/batchnormalization/impl/SVE/fp32.cpp @@ -29,7 +29,7 @@ #include #include -#if defined(__ARM_FEATURE_SVE) +#if defined(ENABLE_SVE) #include namespace arm_compute @@ -114,4 +114,4 @@ void fp32_sve_batch_normalization(ITensor *src, ITensor *dst, const ITensor *mea } } // namespace cpu } // namespace arm_compute -#endif // __ARM_FEATURE_SVE +#endif // ENABLE_SVE diff --git a/src/core/NEON/wrapper/intrinsics/svpow.h b/src/core/NEON/wrapper/intrinsics/svpow.h index e89a4ab8f6..0f58d758cb 100644 --- a/src/core/NEON/wrapper/intrinsics/svpow.h +++ b/src/core/NEON/wrapper/intrinsics/svpow.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020 Arm Limited. 
+ * Copyright (c) 2020-2021 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -35,8 +35,16 @@ namespace wrapper return svpow_##postfix##_z(pg, a, b); \ } +#define SVPOW_Z_IMPL_INT(type, postfix) \ + inline type svpow_z(svbool_t pg, const type &a, const type &b) \ + { \ + ARM_COMPUTE_UNUSED(pg, a, b); \ + ARM_COMPUTE_ERROR("Not supported"); \ + } + SVPOW_Z_IMPL(svfloat32_t, f32) SVPOW_Z_IMPL(svfloat16_t, f16) +SVPOW_Z_IMPL_INT(svint16_t, s16) #undef SVPOW_Z_IMPL diff --git a/src/core/NEON/wrapper/svtraits.h b/src/core/NEON/wrapper/svtraits.h index 465983d16f..8d2d660659 100644 --- a/src/core/NEON/wrapper/svtraits.h +++ b/src/core/NEON/wrapper/svtraits.h @@ -23,7 +23,7 @@ */ #ifndef SRC_CORE_NEON_WRAPPER_SVTRAITS_H #define SRC_CORE_NEON_WRAPPER_SVTRAITS_H -#if defined(__ARM_FEATURE_SVE) +#if defined(ENABLE_SVE) #include "src/core/NEON/SVEMath.h" #include @@ -66,5 +66,5 @@ DEFINE_TYPES(bfloat16_t) } // namespace wrapper } // namespace arm_compute -#endif /* defined(__ARM_FEATURE_SVE) */ +#endif /* defined(ENABLE_SVE) */ #endif /* #ifndef SRC_CORE_NEON_WRAPPER_SVTRAITS_H */ diff --git a/src/core/NEON/wrapper/traits.h b/src/core/NEON/wrapper/traits.h index 3452b76761..81685140f1 100644 --- a/src/core/NEON/wrapper/traits.h +++ b/src/core/NEON/wrapper/traits.h @@ -26,9 +26,9 @@ #include -#if defined(__ARM_FEATURE_SVE) +#if defined(ENABLE_SVE) #include -#endif /* defined(__ARM_FEATURE_SVE) */ +#endif /* defined(ENABLE_SVE) */ namespace arm_compute { @@ -116,13 +116,13 @@ template <> struct neon_bitvector{ using type = float #endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC -#if defined(__ARM_FEATURE_SVE) +#if defined(ENABLE_SVE) /** Create the appropriate SVE vector given its type */ template struct sve_vector; template <> struct sve_vector{ using scalar_type = uint8_t; using type = svuint8_t; }; template <> struct sve_vector{ using scalar_type = int8_t; using type = svint8_t; }; -#endif /* defined(__ARM_FEATURE_SVE) */ +#endif /* defined(ENABLE_SVE) */ #endif /* DOXYGEN_SKIP_THIS */ diff --git a/src/core/common/Registrars.h b/src/core/common/Registrars.h index 112c83ad94..44ddf9808d 100644 --- a/src/core/common/Registrars.h +++ b/src/core/common/Registrars.h @@ -26,17 +26,17 @@ #if defined(ENABLE_FP16_KERNELS) -#if defined(__ARM_FEATURE_SVE) +#if defined(ENABLE_SVE) #define REGISTER_FP16_SVE(func_name) &(func_name) -#else /* !defined(__ARM_FEATURE_SVE) */ +#else /* !defined(ENABLE_SVE) */ #define REGISTER_FP16_SVE(func_name) nullptr -#endif /* defined(__ARM_FEATURE_SVE) */ +#endif /* defined(ENABLE_SVE) */ -#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) +#if defined(ENABLE_NEON) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) #define REGISTER_FP16_NEON(func_name) &(func_name) -#else /* !defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) */ +#else /* !defined(ENABLE_NEON) */ #define REGISTER_FP16_NEON(func_name) nullptr -#endif /* defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) */ +#endif /* defined(ENABLE_NEON) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) */ #else /* !defined(ENABLE_FP16_KERNELS) */ #define REGISTER_FP16_NEON(func_name) nullptr @@ -44,50 +44,82 @@ #endif /* defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS) */ #if defined(ENABLE_FP32_KERNELS) -#if defined(__ARM_FEATURE_SVE) + +#if defined(ENABLE_SVE) #define REGISTER_FP32_SVE(func_name) &(func_name) -#endif /* defined(__ARM_FEATURE_SVE) */ +#else /* !defined(ENABLE_SVE) */ +#define REGISTER_FP32_SVE(func_name) nullptr +#endif /* defined(ENABLE_SVE) */ + +#if defined(ENABLE_NEON) #define 
REGISTER_FP32_NEON(func_name) &(func_name) +#else /* !defined(ENABLE_NEON) */ +#define REGISTER_FP32_NEON(func_name) nullptr +#endif /* defined(ENABLE_NEON) */ + #else /* defined(ENABLE_FP32_KERNELS) */ #define REGISTER_FP32_NEON(func_name) nullptr #define REGISTER_FP32_SVE(func_name) nullptr #endif /* defined(ENABLE_FP32_KERNELS) */ #if defined(ENABLE_QASYMM8_SIGNED_KERNELS) -#if defined(__ARM_FEATURE_SVE) -#define REGISTER_QASYMM8_SIGNED_SVE(func_name) &(func_name) -#endif /* defined(__ARM_FEATURE_SVE) */ + #define REGISTER_QASYMM8_SIGNED_NEON(func_name) &(func_name) + +#if defined(ENABLE_SVE) +#define REGISTER_QASYMM8_SIGNED_SVE(func_name) &(func_name) +#else /* !defined(ENABLE_SVE) */ +#define REGISTER_QASYMM8_SIGNED_SVE(func_name) nullptr +#endif /* defined(ENABLE_SVE) */ + #else /* defined(ENABLE_QASYMM8_SIGNED_KERNELS) */ #define REGISTER_QASYMM8_SIGNED_NEON(func_name) nullptr #define REGISTER_QASYMM8_SIGNED_SVE(func_name) nullptr #endif /* defined(ENABLE_QASYMM8_SIGNED_KERNELS) */ #if defined(ENABLE_QASYMM8_KERNELS) -#if defined(__ARM_FEATURE_SVE) -#define REGISTER_QASYMM8_SVE(func_name) &(func_name) -#endif /* defined(__ARM_FEATURE_SVE) */ #define REGISTER_QASYMM8_NEON(func_name) &(func_name) + +#if defined(ENABLE_SVE) +#define REGISTER_QASYMM8_SVE(func_name) &(func_name) +#else /* !defined(ENABLE_SVE) */ +#define REGISTER_QASYMM8_SVE(func_name) nullptr +#endif /* defined(ENABLE_SVE) */ + #else /* defined(ENABLE_QASYMM8_KERNELS) */ #define REGISTER_QASYMM8_NEON(func_name) nullptr #define REGISTER_QASYMM8_SVE(func_name) nullptr #endif /* defined(ENABLE_QASYMM8_KERNELS) */ #if defined(ENABLE_QSYMM16_KERNELS) -#if defined(__ARM_FEATURE_SVE) -#define REGISTER_QSYMM16_SVE(func_name) &(func_name) -#endif /* defined(__ARM_FEATURE_SVE) */ + #define REGISTER_QSYMM16_NEON(func_name) &(func_name) + +#if defined(ENABLE_SVE) +#define REGISTER_QSYMM16_SVE(func_name) &(func_name) +#else /* !defined(ENABLE_SVE) */ +#define REGISTER_QSYMM16_SVE(func_name) nullptr +#endif /* defined(ENABLE_SVE) */ + #else /* defined(ENABLE_QSYMM16_KERNELS) */ #define REGISTER_QSYMM16_NEON(func_name) nullptr #define REGISTER_QSYMM16_SVE(func_name) nullptr #endif /* defined(ENABLE_QSYMM16_KERNELS) */ #if defined(ENABLE_INTEGER_KERNELS) -#if defined(__ARM_FEATURE_SVE) + +#if defined(ENABLE_SVE) #define REGISTER_INTEGER_SVE(func_name) &(func_name) -#endif /* defined(__ARM_FEATURE_SVE) */ +#else /* !defined(ENABLE_SVE) */ +#define REGISTER_INTEGER_SVE(func_name) nullptr +#endif /* defined(ENABLE_SVE) */ + +#if defined(ENABLE_NEON) #define REGISTER_INTEGER_NEON(func_name) &(func_name) +#else /* !defined(ENABLE_NEON) */ +#define REGISTER_INTEGER_NEON(func_name) nullptr +#endif /* defined(ENABLE_NEON) */ + #else /* defined(ENABLE_INTEGER_KERNELS) */ #define REGISTER_INTEGER_NEON(func_name) nullptr #define REGISTER_INTEGER_SVE(func_name) nullptr diff --git a/src/core/cpu/kernels/CpuActivationKernel.cpp b/src/core/cpu/kernels/CpuActivationKernel.cpp index eb38c18cff..8a57a3b529 100644 --- a/src/core/cpu/kernels/CpuActivationKernel.cpp +++ b/src/core/cpu/kernels/CpuActivationKernel.cpp @@ -60,7 +60,7 @@ struct ActivationKernel static const ActivationKernel available_kernels[] = { -#if defined(__ARM_FEATURE_SVE) +#if defined(ENABLE_SVE) { "fp16_sve_activation", [](const ActivationSelectorData & data) { return data.dt == DataType::F16; }, @@ -71,7 +71,8 @@ static const ActivationKernel available_kernels[] = [](const ActivationSelectorData & data) { return data.dt == DataType::F32; }, 
REGISTER_FP32_SVE(arm_compute::cpu::fp32_sve_activation) }, -#else /* !defined(__ARM_FEATURE_SVE) */ +#endif /* defined(ENABLE_SVE) */ +#if defined(ENABLE_NEON) { "fp16_neon_activation", [](const ActivationSelectorData & data) { return data.dt == DataType::F16; }, @@ -82,9 +83,8 @@ static const ActivationKernel available_kernels[] = [](const ActivationSelectorData & data) { return data.dt == DataType::F32; }, REGISTER_FP32_NEON(arm_compute::cpu::fp32_neon_activation) }, -#endif /* defined(__ARM_FEATURE_SVE) */ - -#if defined(__ARM_FEATURE_SVE2) /* defined(__ARM_FEATURE_SVE2) */ +#endif /* defined(ENABLE_NEON) */ +#if defined(__ARM_FEATURE_SVE2) { "qasymm8_sve_activation", [](const ActivationSelectorData & data) { return data.dt == DataType::QASYMM8; }, @@ -116,7 +116,7 @@ static const ActivationKernel available_kernels[] = [](const ActivationSelectorData & data) { return data.dt == DataType::QSYMM16; }, REGISTER_QSYMM16_NEON(arm_compute::cpu::qsymm16_neon_activation) }, -#endif /* defined(__ARM_FEATURE_SVE2) */ +#endif /* defined(__ARM_FEATURE_SVE2) */ }; const ActivationKernel *get_implementation(const ActivationSelectorData &data) diff --git a/src/core/cpu/kernels/CpuAddKernel.cpp b/src/core/cpu/kernels/CpuAddKernel.cpp index fc88a7e22d..7afdceae38 100644 --- a/src/core/cpu/kernels/CpuAddKernel.cpp +++ b/src/core/cpu/kernels/CpuAddKernel.cpp @@ -61,7 +61,7 @@ struct AddKernel static const AddKernel available_kernels[] = { -#if defined(__ARM_FEATURE_SVE) +#if defined(ENABLE_SVE) { "add_same_sve", [](const AddSelectorData & data) { return ((data.dt1 == data.dt2) && (data.dt1 == DataType::F32)); }, @@ -102,7 +102,8 @@ static const AddKernel available_kernels[] = [](const AddSelectorData & data) { return ((data.dt1 == data.dt2) && (data.dt3 == DataType::S16)); }, REGISTER_INTEGER_SVE(arm_compute::cpu::add_u8_u8_s16_sve) }, -#else /* !defined(__ARM_FEATURE_SVE) */ +#endif /* defined(ENABLE_SVE) */ +#if defined(ENABLE_NEON) { "add_same_neon", [](const AddSelectorData & data) { return ((data.dt1 == data.dt2) && (data.dt1 == DataType::F32)); }, @@ -145,8 +146,7 @@ static const AddKernel available_kernels[] = [](const AddSelectorData & data) { return ((data.dt1 == data.dt2) && (data.dt3 == DataType::S16)); }, REGISTER_INTEGER_NEON(arm_compute::cpu::add_u8_u8_s16_neon) }, -#endif /* defined(__ARM_FEATURE_SVE) */ - +#endif /* defined(ENABLE_NEON) */ #if defined(__ARM_FEATURE_SVE2) { "add_qasymm8_sve", @@ -179,7 +179,7 @@ static const AddKernel available_kernels[] = [](const AddSelectorData & data) { return ((data.dt1 == data.dt2) && (data.dt1 == DataType::QSYMM16)); }, REGISTER_QSYMM16_NEON(arm_compute::cpu::add_qsymm16_neon) }, -#endif /* defined(__ARM_FEATURE_SVE2) */ +#endif /* defined(ENABLE_NEON) */ }; diff --git a/src/core/cpu/kernels/CpuElementwiseKernel.cpp b/src/core/cpu/kernels/CpuElementwiseKernel.cpp index ddbc48feb8..643a870540 100644 --- a/src/core/cpu/kernels/CpuElementwiseKernel.cpp +++ b/src/core/cpu/kernels/CpuElementwiseKernel.cpp @@ -76,28 +76,31 @@ configure_arithm_func(const ITensorInfo *src0, const ITensorInfo *src1, ITensorI ARM_COMPUTE_UNUSED(src1, dst); static ElementwiseKernel kernels[] = { -#if defined(__ARM_FEATURE_SVE) - generate_kernel(REGISTER_FP32_SVE((arm_compute::cpu::sve::elementwise_arithmetic_op))), - generate_kernel(REGISTER_INTEGER_SVE((arm_compute::cpu::sve::elementwise_arithmetic_op))), -#else /* defined(__ARM_FEATURE_SVE) */ +#if defined(ENABLE_SVE) + generate_kernel(REGISTER_FP32_SVE((arm_compute::cpu::elementwise_arithmetic_op))), + 
generate_kernel(REGISTER_INTEGER_SVE((arm_compute::cpu::elementwise_arithmetic_op))), + generate_kernel(REGISTER_INTEGER_SVE((arm_compute::cpu::elementwise_arithmetic_op))), +#endif /* defined(ENABLE_SVE) */ +#if defined(ENABLE_NEON) generate_kernel(REGISTER_FP32_NEON((arm_compute::cpu::elementwise_arithm_op>))), generate_kernel(REGISTER_INTEGER_NEON((arm_compute::cpu::elementwise_arithm_op>))), -#endif /* defined(__ARM_FEATURE_SVE) */ +#endif /* defined(ENABLE_NEON) */ #if defined(__ARM_FEATURE_SVE2) - generate_kernel(REGISTER_QASYMM8_SVE((arm_compute::cpu::sve::elementwise_arithmetic_quantized_op))), - generate_kernel(REGISTER_QASYMM8_SIGNED_SVE((arm_compute::cpu::sve::elementwise_arithmetic_quantized_op))), -#else /* defined(__ARM_FEATURE_SVE2) */ + generate_kernel(REGISTER_QASYMM8_SVE((arm_compute::cpu::elementwise_arithmetic_quantized_op))), + generate_kernel(REGISTER_QASYMM8_SIGNED_SVE((arm_compute::cpu::elementwise_arithmetic_quantized_op))), +#else /* !defined(__ARM_FEATURE_SVE2) */ generate_kernel(REGISTER_QASYMM8_NEON((arm_compute::cpu::elementwise_arithm_op_quantized))), generate_kernel(REGISTER_QASYMM8_SIGNED_NEON((arm_compute::cpu::elementwise_arithm_op_quantized_signed))), #endif /* defined(__ARM_FEATURE_SVE2) */ -#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC -#if defined(__ARM_FEATURE_SVE) - generate_kernel(REGISTER_FP16_SVE((arm_compute::cpu::sve::elementwise_arithmetic_op))), -#else /* defined(__ARM_FEATURE_SVE) */ +#if defined(ENABLE_SVE) + generate_kernel(REGISTER_FP16_SVE((arm_compute::cpu::elementwise_arithmetic_op))), +#endif /* defined(ENABLE_SVE) */ +#if defined(ENABLE_NEON) +#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) generate_kernel(REGISTER_FP16_NEON((arm_compute::cpu::elementwise_arithm_op>))), -#endif /* defined(__ARM_FEATURE_SVE) */ -#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */ +#endif /* defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) */ generate_kernel(REGISTER_INTEGER_NEON((arm_compute::cpu::elementwise_arithm_op>))), +#endif /* defined(ENABLE_NEON) */ }; for(const auto &uk : kernels) @@ -118,31 +121,31 @@ configure_comp_func(const ITensorInfo *src0, const ITensorInfo *src1, ITensorInf ARM_COMPUTE_UNUSED(src1, dst); static ElementwiseKernel kernels[] = { -#if defined(__ARM_FEATURE_SVE) - generate_kernel(REGISTER_INTEGER_SVE((arm_compute::cpu::sve::elementwise_comparison_op))), - generate_kernel(REGISTER_FP32_SVE((arm_compute::cpu::sve::elementwise_comparison_op))), - generate_kernel(REGISTER_INTEGER_SVE((arm_compute::cpu::sve::elementwise_comparison_op))), - generate_kernel(REGISTER_INTEGER_SVE((arm_compute::cpu::sve::elementwise_comparison_op))), -#else /* defined(__ARM_FEATURE_SVE) */ +#if defined(ENABLE_SVE) + generate_kernel(REGISTER_INTEGER_SVE((arm_compute::cpu::elementwise_comparison_op))), + generate_kernel(REGISTER_FP32_SVE((arm_compute::cpu::elementwise_comparison_op))), + generate_kernel(REGISTER_INTEGER_SVE((arm_compute::cpu::elementwise_comparison_op))), + generate_kernel(REGISTER_INTEGER_SVE((arm_compute::cpu::elementwise_comparison_op))), +#endif /* defined(ENABLE_SVE) */ +#if defined(ENABLE_NEON) generate_kernel(REGISTER_INTEGER_NEON((arm_compute::cpu::elementwise_comp_op_8))), generate_kernel(REGISTER_FP32_NEON((arm_compute::cpu::elementwise_comp_op_32))), generate_kernel(REGISTER_INTEGER_NEON((arm_compute::cpu::elementwise_comp_op_16))), generate_kernel(REGISTER_INTEGER_NEON((arm_compute::cpu::elementwise_comp_op_32))), -#endif /* defined(__ARM_FEATURE_SVE) */ +#endif /* defined(ENABLE_NEON) */ #if defined(__ARM_FEATURE_SVE2) - 
generate_kernel(REGISTER_QASYMM8_SIGNED_SVE((arm_compute::cpu::sve::elementwise_comparison_quantized_op))), - generate_kernel(REGISTER_QASYMM8_SVE((arm_compute::cpu::sve::elementwise_comparison_quantized_op))), -#else /* defined(__ARM_FEATURE_SVE2) */ + generate_kernel(REGISTER_QASYMM8_SIGNED_SVE((arm_compute::cpu::elementwise_comparison_quantized_op))), + generate_kernel(REGISTER_QASYMM8_SVE((arm_compute::cpu::elementwise_comparison_quantized_op))), +#else /* !defined(__ARM_FEATURE_SVE2) */ generate_kernel(REGISTER_QASYMM8_SIGNED_NEON((arm_compute::cpu::elementwise_comp_op_quantized_signed))), generate_kernel(REGISTER_QASYMM8_NEON((arm_compute::cpu::elementwise_comp_op_quantized))), #endif /* defined(__ARM_FEATURE_SVE2) */ -#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC -#if defined(__ARM_FEATURE_SVE) - generate_kernel(REGISTER_FP16_SVE((arm_compute::cpu::sve::elementwise_comparison_op))), -#else /* defined(__ARM_FEATURE_SVE) */ +#if defined(ENABLE_SVE) + generate_kernel(REGISTER_FP16_SVE((arm_compute::cpu::elementwise_comparison_op))), +#endif /* defined(ENABLE_SVE) */ +#if defined(ENABLE_NEON) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) generate_kernel(REGISTER_FP16_NEON((arm_compute::cpu::elementwise_comp_op_16))), -#endif /* defined(__ARM_FEATURE_SVE) */ -#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */ +#endif /* defined(ENABLE_NEON) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) */ }; for(const auto &uk : kernels) diff --git a/src/core/cpu/kernels/CpuElementwiseUnaryKernel.cpp b/src/core/cpu/kernels/CpuElementwiseUnaryKernel.cpp index 3a96d93c03..2600a49b70 100644 --- a/src/core/cpu/kernels/CpuElementwiseUnaryKernel.cpp +++ b/src/core/cpu/kernels/CpuElementwiseUnaryKernel.cpp @@ -54,7 +54,7 @@ struct ElementwiseUnaryKernel static const ElementwiseUnaryKernel available_kernels[] = { -#if defined(__ARM_FEATURE_SVE) +#if defined(ENABLE_SVE) { "fp32_sve_elementwise_unary", [](DataType dt) { return dt == DataType::F32; }, @@ -70,7 +70,8 @@ static const ElementwiseUnaryKernel available_kernels[] = [](DataType dt) { return dt == DataType::S32; }, REGISTER_INTEGER_SVE(arm_compute::cpu::elementwise_sve_op), }, -#endif // defined(__ARM_FEATURE_SVE) +#endif // defined(ENABLE_SVE) +#if defined(ENABLE_NEON) { "fp32_neon_elementwise_unary", [](DataType dt) { return dt == DataType::F32; }, @@ -88,6 +89,7 @@ static const ElementwiseUnaryKernel available_kernels[] = [](DataType dt) { return dt == DataType::S32; }, REGISTER_INTEGER_NEON(arm_compute::cpu::elementwise_op), }, +#endif // defined(ENABLE_NEON) }; const ElementwiseUnaryKernel *get_implementation(DataType dt) diff --git a/src/core/cpu/kernels/CpuScaleKernel.cpp b/src/core/cpu/kernels/CpuScaleKernel.cpp index ed7517111f..29475fa63f 100644 --- a/src/core/cpu/kernels/CpuScaleKernel.cpp +++ b/src/core/cpu/kernels/CpuScaleKernel.cpp @@ -64,38 +64,39 @@ struct ScaleKernel static const ScaleKernel available_kernels[] = { -#if defined(__ARM_FEATURE_SVE) +#if defined(ENABLE_SVE) { "fp16_sve_scale", [](const ScaleSelectorData & data) { return data.dt == DataType::F16; }, - REGISTER_FP16_NEON(arm_compute::cpu::fp16_sve_scale) + REGISTER_FP16_SVE(arm_compute::cpu::fp16_sve_scale) }, { "f32_sve_scale", [](const ScaleSelectorData & data) { return data.dt == DataType::F32; }, - REGISTER_FP32_NEON(arm_compute::cpu::fp32_sve_scale) + REGISTER_FP32_SVE(arm_compute::cpu::fp32_sve_scale) }, { "qasymm8_sve_scale", [](const ScaleSelectorData & data) { return data.dt == DataType::QASYMM8; }, - REGISTER_QASYMM8_NEON(arm_compute::cpu::qasymm8_sve_scale) + 
REGISTER_QASYMM8_SVE(arm_compute::cpu::qasymm8_sve_scale) }, { "qasymm8_signed_sve_scale", [](const ScaleSelectorData & data) { return data.dt == DataType::QASYMM8_SIGNED; }, - REGISTER_QASYMM8_SIGNED_NEON(arm_compute::cpu::qasymm8_signed_sve_scale) + REGISTER_QASYMM8_SIGNED_SVE(arm_compute::cpu::qasymm8_signed_sve_scale) }, { "u8_sve_scale", [](const ScaleSelectorData & data) { return data.dt == DataType::U8; }, - REGISTER_INTEGER_NEON(arm_compute::cpu::u8_sve_scale) + REGISTER_INTEGER_SVE(arm_compute::cpu::u8_sve_scale) }, { "s16_sve_scale", [](const ScaleSelectorData & data) { return data.dt == DataType::S16; }, - REGISTER_INTEGER_NEON(arm_compute::cpu::s16_sve_scale) + REGISTER_INTEGER_SVE(arm_compute::cpu::s16_sve_scale) }, -#else /* !defined(__ARM_FEATURE_SVE) */ +#endif /* defined(ENABLE_SVE) */ +#if defined(ENABLE_NEON) #if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) { "common_neon_scale", @@ -128,7 +129,7 @@ static const ScaleKernel available_kernels[] = [](const ScaleSelectorData & data) { return data.dt == DataType::S16; }, REGISTER_INTEGER_NEON(arm_compute::cpu::common_neon_scale) }, -#endif /* !defined(__ARM_FEATURE_SVE) */ +#endif /* defined(ENABLE_NEON) */ }; /** Micro-kernel selector diff --git a/src/core/cpu/kernels/CpuSoftmaxKernel.cpp b/src/core/cpu/kernels/CpuSoftmaxKernel.cpp index d2453ed21d..8ea186b16a 100644 --- a/src/core/cpu/kernels/CpuSoftmaxKernel.cpp +++ b/src/core/cpu/kernels/CpuSoftmaxKernel.cpp @@ -34,8 +34,8 @@ #include "src/core/helpers/WindowHelpers.h" #include "src/core/common/Registrars.h" -#include "src/core/cpu/kernels/softmax/impl/NEON/list.h" -#include "src/core/cpu/kernels/softmax/impl/SVE/list.h" +#include "src/core/cpu/kernels/softmax/impl/neon/list.h" +#include "src/core/cpu/kernels/softmax/impl/sve/list.h" namespace arm_compute { @@ -69,7 +69,7 @@ struct SoftmaxLogits1DMaxKernel static const SoftmaxLogits1DKernel available_logits_1d_kernels[] = { -#if defined(__ARM_FEATURE_SVE) +#if defined(ENABLE_SVE) { "sve_softmax_logits_1d_float", [](const SoftmaxSelectorData & data) { return (data.dt == DataType::F32); }, @@ -80,7 +80,9 @@ static const SoftmaxLogits1DKernel available_logits_1d_kernels[] = [](const SoftmaxSelectorData & data) { return (data.dt == DataType::F16); }, REGISTER_FP16_SVE(arm_compute::cpu::sve_softmax_logits_1d_float) }, -#else /* !defined(__ARM_FEATURE_SVE) */ +#endif /* defined(ENABLE_SVE) */ + +#if defined(ENABLE_NEON) { "neon_softmax_logits_1d_float", [](const SoftmaxSelectorData & data) { return (data.dt == DataType::F32); }, @@ -93,7 +95,7 @@ static const SoftmaxLogits1DKernel available_logits_1d_kernels[] = REGISTER_FP16_NEON(arm_compute::cpu::neon_softmax_logits_1d_float) }, #endif /* defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) */ -#endif /* defined(__ARM_FEATURE_SVE) */ +#endif /* !defined(ENABLE_NEON) */ #if defined(__ARM_FEATURE_SVE2) { @@ -123,7 +125,7 @@ static const SoftmaxLogits1DKernel available_logits_1d_kernels[] = static const SoftmaxLogits1DMaxKernel available_logits_1d_max_kernels[] = { -#if defined(__ARM_FEATURE_SVE) +#if defined(ENABLE_SVE) { "sve_logits_1d_max", [](const SoftmaxSelectorData & data) { return (data.dt == DataType::F32); }, @@ -144,7 +146,8 @@ static const SoftmaxLogits1DMaxKernel available_logits_1d_max_kernels[] = [](const SoftmaxSelectorData & data) { return (data.dt == DataType::QASYMM8_SIGNED); }, REGISTER_QASYMM8_SIGNED_SVE(arm_compute::cpu::sve_logits_1d_max) }, -#else /* !defined(__ARM_FEATURE_SVE) */ +#endif /* defined(ENABLE_SVE) */ +#if defined(ENABLE_NEON) { 
"neon_logits_1d_max", [](const SoftmaxSelectorData & data) { return (data.dt == DataType::F32); }, @@ -167,7 +170,7 @@ static const SoftmaxLogits1DMaxKernel available_logits_1d_max_kernels[] = [](const SoftmaxSelectorData & data) { return (data.dt == DataType::QASYMM8_SIGNED); }, REGISTER_QASYMM8_SIGNED_NEON(arm_compute::cpu::neon_logits_1d_max) }, -#endif /* defined(__ARM_FEATURE_SVE) */ +#endif /* defined(ENABLE_NEON) */ }; const SoftmaxLogits1DKernel *get_implementation_logits(const SoftmaxSelectorData &data) diff --git a/src/core/cpu/kernels/activation/sve/fp16.cpp b/src/core/cpu/kernels/activation/sve/fp16.cpp index bf31fd7d93..e4be1a4faa 100644 --- a/src/core/cpu/kernels/activation/sve/fp16.cpp +++ b/src/core/cpu/kernels/activation/sve/fp16.cpp @@ -28,7 +28,6 @@ #include #include -#if defined(__ARM_FEATURE_SVE) #include "src/core/NEON/SVEMath.h" #include @@ -126,5 +125,4 @@ void fp16_sve_activation(const ITensor *src, ITensor *dst, const ActivationLayer input, output); } } // namespace cpu -} // namespace arm_compute -#endif // __ARM_FEATURE_SVE \ No newline at end of file +} // namespace arm_compute \ No newline at end of file diff --git a/src/core/cpu/kernels/activation/sve/fp32.cpp b/src/core/cpu/kernels/activation/sve/fp32.cpp index 75f9f8a4c3..f797944435 100644 --- a/src/core/cpu/kernels/activation/sve/fp32.cpp +++ b/src/core/cpu/kernels/activation/sve/fp32.cpp @@ -29,7 +29,6 @@ #include #include -#if defined(__ARM_FEATURE_SVE) #include namespace arm_compute @@ -127,5 +126,4 @@ void fp32_sve_activation(const ITensor *src, ITensor *dst, const ActivationLayer input, output); } } // namespace cpu -} // namespace arm_compute -#endif // __ARM_FEATURE_SVE \ No newline at end of file +} // namespace arm_compute \ No newline at end of file diff --git a/src/core/cpu/kernels/add/sve/impl.cpp b/src/core/cpu/kernels/add/sve/impl.cpp new file mode 100644 index 0000000000..d1660fe19e --- /dev/null +++ b/src/core/cpu/kernels/add/sve/impl.cpp @@ -0,0 +1,137 @@ +/* + * Copyright (c) 2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/ITensor.h" +#include "arm_compute/core/Types.h" +#include "arm_compute/core/utils/misc/Traits.h" +#include "src/core/NEON/wrapper/intrinsics/intrinsics.h" + +#include "src/core/NEON/SVEMath.h" +#include "src/core/cpu/kernels/add/sve/impl.h" +#include + +namespace arm_compute +{ +namespace cpu +{ +template +void add_same_sve(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window) +{ + const auto all_true_pg = wrapper::svptrue(); + const auto window_start_x = static_cast(window.x().start()); + const auto window_end_x = static_cast(window.x().end()); + const bool is_broadcast_across_x = src0->info()->tensor_shape().x() != src1->info()->tensor_shape().x(); + const bool is_sat = (policy == ConvertPolicy::SATURATE); + + // Clear X Dimension on execution window as we handle manually + Window win = window; + win.set(Window::DimX, Window::Dimension(0, 1, 1)); + + // Create input windows + Window input1_win = window.broadcast_if_dimension_le_one(src0->info()->tensor_shape()); + Window input2_win = window.broadcast_if_dimension_le_one(src1->info()->tensor_shape()); + + Iterator input1(src0, window.broadcast_if_dimension_le_one(src0->info()->tensor_shape())); + Iterator input2(src1, window.broadcast_if_dimension_le_one(src1->info()->tensor_shape())); + Iterator output(dst, window); + + if(is_broadcast_across_x) + { + const bool is_broadcast_input_2 = input2_win.x().step() == 0; + Window broadcast_win = is_broadcast_input_2 ? input2_win : input1_win; + Window non_broadcast_win = !is_broadcast_input_2 ? input2_win : input1_win; + const ITensor *broadcast_tensor = is_broadcast_input_2 ? src1 : src0; + const ITensor *non_broadcast_tensor = !is_broadcast_input_2 ? src1 : src0; + + // Clear X Dimension on execution window as we handle manually + non_broadcast_win.set(Window::DimX, Window::Dimension(0, 1, 1)); + + Iterator broadcast_input(broadcast_tensor, broadcast_win); + Iterator non_broadcast_input(non_broadcast_tensor, non_broadcast_win); + Iterator output(dst, win); + + execute_window_loop(win, [&](const Coordinates &) + { + const auto non_broadcast_input_ptr = reinterpret_cast(non_broadcast_input.ptr()); + const auto output_ptr = reinterpret_cast(output.ptr()); + + const ScalarType broadcast_value = *reinterpret_cast(broadcast_input.ptr()); + const auto broadcast_value_vec = wrapper::svdup_n(broadcast_value); + + int x = window_start_x; + svbool_t pg = wrapper::svwhilelt(x, window_end_x); + do + { + const auto non_broadcast_v = svld1(pg, non_broadcast_input_ptr + x); + auto res = is_sat ? 
wrapper::svqadd(broadcast_value_vec, non_broadcast_v) : svadd_z(pg, broadcast_value_vec, non_broadcast_v); + svst1(pg, output_ptr + x, res); + + x += wrapper::svcnt(); + pg = wrapper::svwhilelt(x, window_end_x); + } + while(svptest_any(all_true_pg, pg)); + }, + broadcast_input, non_broadcast_input, output); + } + else + { + // Clear X Dimension on execution window as we handle manually + input1_win.set(Window::DimX, Window::Dimension(0, 1, 1)); + input2_win.set(Window::DimX, Window::Dimension(0, 1, 1)); + + Iterator input1(src0, input1_win); + Iterator input2(src1, input2_win); + Iterator output(dst, win); + + execute_window_loop(win, [&](const Coordinates &) + { + const auto input1_ptr = reinterpret_cast(input1.ptr()); + const auto input2_ptr = reinterpret_cast(input2.ptr()); + const auto output_ptr = reinterpret_cast(output.ptr()); + + int x = window_start_x; + svbool_t pg = wrapper::svwhilelt(x, window_end_x); + do + { + const auto val1 = svld1(pg, input1_ptr + x); + const auto val2 = svld1(pg, input2_ptr + x); + const auto res = is_sat ? wrapper::svqadd(val1, val2) : svadd_z(pg, val1, val2); + svst1(pg, output_ptr + x, res); + + x += wrapper::svcnt(); + pg = wrapper::svwhilelt(x, window_end_x); + } + while(svptest_any(all_true_pg, pg)); + }, + input1, input2, output); + } +} + +template void add_same_sve(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window); +template void add_same_sve(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window); +template void add_same_sve(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window); +template void add_same_sve(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window); +template void add_same_sve(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window); +} // namespace cpu +} // namespace arm_compute \ No newline at end of file diff --git a/src/core/cpu/kernels/add/sve/impl.h b/src/core/cpu/kernels/add/sve/impl.h new file mode 100644 index 0000000000..c38b1d47e0 --- /dev/null +++ b/src/core/cpu/kernels/add/sve/impl.h @@ -0,0 +1,40 @@ +/* + * Copyright (c) 2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
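/*
 * The loop structure in add_same_sve above is the canonical SVE tail-free
 * pattern: svwhilelt builds a partial predicate for the final pass, so no
 * scalar remainder loop is needed, and svptest_any terminates once no lane is
 * active. Below is a standalone float-only sketch using raw ACLE intrinsics
 * (the kernel itself goes through the type-generic wrapper:: layer);
 * add_f32_sve is a hypothetical name and an SVE-enabled toolchain is assumed.
 */
#include <arm_sve.h>

void add_f32_sve(const float *a, const float *b, float *dst, int len)
{
    int      x  = 0;
    svbool_t pg = svwhilelt_b32(x, len); // lanes still in range for this pass
    do
    {
        const svfloat32_t va = svld1_f32(pg, a + x);
        const svfloat32_t vb = svld1_f32(pg, b + x);
        svst1_f32(pg, dst + x, svadd_f32_z(pg, va, vb));

        x += static_cast<int>(svcntw()); // advance by the hardware vector length
        pg = svwhilelt_b32(x, len);      // partial predicate covers the tail
    }
    while(svptest_any(svptrue_b32(), pg));
}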
+ */ +#ifndef SRC_CORE_SVE_KERNELS_ADD_IMPL_H +#define SRC_CORE_SVE_KERNELS_ADD_IMPL_H + +#if defined(ENABLE_SVE) +#include "arm_compute/core/Types.h" +#include "arm_compute/core/utils/misc/Traits.h" + +namespace arm_compute +{ +namespace cpu +{ +template +void add_same_sve(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window); +} // namespace cpu +} // namespace arm_compute +#endif // defined(ENABLE_SVE) +#endif // SRC_CORE_SVE_KERNELS_ADD_IMPL_H \ No newline at end of file diff --git a/src/core/cpu/kernels/add/sve/integer.cpp b/src/core/cpu/kernels/add/sve/integer.cpp index ae74bfa3eb..6dec140499 100644 --- a/src/core/cpu/kernels/add/sve/integer.cpp +++ b/src/core/cpu/kernels/add/sve/integer.cpp @@ -25,9 +25,8 @@ #include "arm_compute/core/ITensor.h" #include "arm_compute/core/Types.h" #include "arm_compute/core/utils/misc/Traits.h" -#include "src/core/NEON/wrapper/intrinsics/intrinsics.h" -#if defined(__ARM_FEATURE_SVE) #include "src/core/NEON/SVEMath.h" +#include "src/core/NEON/wrapper/intrinsics/intrinsics.h" #include namespace arm_compute @@ -197,5 +196,4 @@ void add_u8_s16_s16_sve(const ITensor *src0, const ITensor *src1, ITensor *dst, add_s16_u8_s16_sve(src1, src0, dst, policy, window); } } // namespace cpu -} // namespace arm_compute -#endif /* defined(__ARM_FEATURE_SVE) */ \ No newline at end of file +} // namespace arm_compute \ No newline at end of file diff --git a/src/core/cpu/kernels/add/sve/list.h b/src/core/cpu/kernels/add/sve/list.h index 71dd875ad8..aebb43bb60 100644 --- a/src/core/cpu/kernels/add/sve/list.h +++ b/src/core/cpu/kernels/add/sve/list.h @@ -24,11 +24,12 @@ #ifndef SRC_CORE_SVE_KERNELS_ADD_LIST_H #define SRC_CORE_SVE_KERNELS_ADD_LIST_H -#if defined(__ARM_FEATURE_SVE) +#if defined(ENABLE_SVE) #include "arm_compute/core/Types.h" #include "arm_compute/core/utils/misc/Traits.h" #include "src/core/NEON/SVEMath.h" #include "src/core/NEON/wrapper/intrinsics/intrinsics.h" +#include "src/core/cpu/kernels/add/sve/impl.h" #include namespace arm_compute @@ -47,99 +48,7 @@ DECLARE_ADD_KERNEL(add_u8_u8_s16_sve); #undef DECLARE_ADD_KERNEL -template -void add_same_sve(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window) -{ - const auto all_true_pg = wrapper::svptrue(); - const auto window_start_x = static_cast(window.x().start()); - const auto window_end_x = static_cast(window.x().end()); - const bool is_broadcast_across_x = src0->info()->tensor_shape().x() != src1->info()->tensor_shape().x(); - const bool is_sat = (policy == ConvertPolicy::SATURATE); - - // Clear X Dimension on execution window as we handle manually - Window win = window; - win.set(Window::DimX, Window::Dimension(0, 1, 1)); - - // Create input windows - Window input1_win = window.broadcast_if_dimension_le_one(src0->info()->tensor_shape()); - Window input2_win = window.broadcast_if_dimension_le_one(src1->info()->tensor_shape()); - - Iterator input1(src0, window.broadcast_if_dimension_le_one(src0->info()->tensor_shape())); - Iterator input2(src1, window.broadcast_if_dimension_le_one(src1->info()->tensor_shape())); - Iterator output(dst, window); - - if(is_broadcast_across_x) - { - const bool is_broadcast_input_2 = input2_win.x().step() == 0; - Window broadcast_win = is_broadcast_input_2 ? input2_win : input1_win; - Window non_broadcast_win = !is_broadcast_input_2 ? input2_win : input1_win; - const ITensor *broadcast_tensor = is_broadcast_input_2 ? 
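/*
 * The new impl.h/impl.cpp pair above moves the add_same_sve template out of
 * the header: list.h and impl.h only declare it, while impl.cpp defines it and
 * provides explicit instantiations. That way only the SVE translation units
 * need SVE compile flags, and generic code in the fat binary can include the
 * headers without them. A toy version of the same split, with hypothetical
 * names, looks like this:
 */

// scale_impl.h -- safe to include from code built without SVE/Neon(TM) flags.
template <typename T>
void scale_by_two(const T *src, T *dst, int len);

// scale_impl.cpp -- the only translation unit that needs the special flags.
template <typename T>
void scale_by_two(const T *src, T *dst, int len)
{
    for(int i = 0; i < len; ++i)
    {
        dst[i] = src[i] * static_cast<T>(2);
    }
}

// Explicit instantiations give callers concrete symbols to link against.
template void scale_by_two<float>(const float *, float *, int);
template void scale_by_two<int>(const int *, int *, int);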
src1 : src0; - const ITensor *non_broadcast_tensor = !is_broadcast_input_2 ? src1 : src0; - - // Clear X Dimension on execution window as we handle manually - non_broadcast_win.set(Window::DimX, Window::Dimension(0, 1, 1)); - - Iterator broadcast_input(broadcast_tensor, broadcast_win); - Iterator non_broadcast_input(non_broadcast_tensor, non_broadcast_win); - Iterator output(dst, win); - - execute_window_loop(win, [&](const Coordinates &) - { - const auto non_broadcast_input_ptr = reinterpret_cast(non_broadcast_input.ptr()); - const auto output_ptr = reinterpret_cast(output.ptr()); - - const ScalarType broadcast_value = *reinterpret_cast(broadcast_input.ptr()); - const auto broadcast_value_vec = wrapper::svdup_n(broadcast_value); - - int x = window_start_x; - svbool_t pg = wrapper::svwhilelt(x, window_end_x); - do - { - const auto non_broadcast_v = svld1(pg, non_broadcast_input_ptr + x); - auto res = is_sat ? wrapper::svqadd(broadcast_value_vec, non_broadcast_v) : svadd_z(pg, broadcast_value_vec, non_broadcast_v); - svst1(pg, output_ptr + x, res); - - x += wrapper::svcnt(); - pg = wrapper::svwhilelt(x, window_end_x); - } - while(svptest_any(all_true_pg, pg)); - }, - broadcast_input, non_broadcast_input, output); - } - else - { - // Clear X Dimension on execution window as we handle manually - input1_win.set(Window::DimX, Window::Dimension(0, 1, 1)); - input2_win.set(Window::DimX, Window::Dimension(0, 1, 1)); - - Iterator input1(src0, input1_win); - Iterator input2(src1, input2_win); - Iterator output(dst, win); - - execute_window_loop(win, [&](const Coordinates &) - { - const auto input1_ptr = reinterpret_cast(input1.ptr()); - const auto input2_ptr = reinterpret_cast(input2.ptr()); - const auto output_ptr = reinterpret_cast(output.ptr()); - - int x = window_start_x; - svbool_t pg = wrapper::svwhilelt(x, window_end_x); - do - { - const auto val1 = svld1(pg, input1_ptr + x); - const auto val2 = svld1(pg, input2_ptr + x); - const auto res = is_sat ? wrapper::svqadd(val1, val2) : svadd_z(pg, val1, val2); - svst1(pg, output_ptr + x, res); - - x += wrapper::svcnt(); - pg = wrapper::svwhilelt(x, window_end_x); - } - while(svptest_any(all_true_pg, pg)); - }, - input1, input2, output); - } -} } // namespace cpu } // namespace arm_compute -#endif // defined(__ARM_FEATURE_SVE) +#endif // defined(ENABLE_SVE) #endif // SRC_CORE_SVE_KERNELS_ADD_LIST_H \ No newline at end of file diff --git a/src/core/cpu/kernels/elementwise/sve/elementwise.cpp b/src/core/cpu/kernels/elementwise/sve/elementwise.cpp new file mode 100644 index 0000000000..2c3bb0ff7c --- /dev/null +++ b/src/core/cpu/kernels/elementwise/sve/elementwise.cpp @@ -0,0 +1,309 @@ +/* + * Copyright (c) 2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/Types.h" +#include "src/core/cpu/kernels/elementwise/sve/elementwise_list.h" +#include + +namespace arm_compute +{ +namespace cpu +{ +using namespace arm_compute::wrapper; + +template +struct LoopArguments +{ + OperatorType op; + const InputScalarType *input1_ptr; + const InputScalarType *input2_ptr; + OutputScalarType *output_ptr; +}; + +template +struct BroadcastLoopArguments +{ + OperatorType op; + const InputScalarType *input1_ptr; + InputScalarType broadcast_value; + OutputScalarType *output_ptr; + bool reorder; +}; + +template +void arithmetic_op_loop(svbool_t pg, const LoopArguments &args) +{ + const auto in1 = svld1(pg, args.input1_ptr); + const auto in2 = svld1(pg, args.input2_ptr); + const auto res = elementwise_arithmetic_op::type>(pg, in1, in2, args.op); + svst1(pg, args.output_ptr, res); +} + +template +void arithmetic_op_broadcast_loop(svbool_t pg, const BroadcastLoopArguments &args) +{ + const auto non_broadcast_vector = svld1(pg, args.input1_ptr); + const auto broadcast_vector = svdup_n(args.broadcast_value); + const auto in1 = args.reorder ? broadcast_vector : non_broadcast_vector; + const auto in2 = args.reorder ? non_broadcast_vector : broadcast_vector; + const auto res = elementwise_arithmetic_op::type>(pg, in1, in2, args.op); + svst1(pg, args.output_ptr, res); +} + +template +void comparison_op_loop(svbool_t pg, const LoopArguments &args) +{ + const auto in1 = svld1(pg, args.input1_ptr); + const auto in2 = svld1(pg, args.input2_ptr); + const auto res = elementwise_comparison_op::type, typename sve_vector::type>(pg, in1, in2, args.op); + const svbool_t output_pg = narrow_to_byte_predicate(pg); + svst1(output_pg, args.output_ptr, res); +} + +template +void comparison_op_broadcast_loop(svbool_t pg, const BroadcastLoopArguments &args) +{ + const auto non_broadcast_vector = svld1(pg, args.input1_ptr); + const auto broadcast_vector = svdup_n(args.broadcast_value); + const auto in1 = args.reorder ? broadcast_vector : non_broadcast_vector; + const auto in2 = args.reorder ? 
non_broadcast_vector : broadcast_vector; + const auto res = elementwise_comparison_op::type, typename sve_vector::type>(pg, in1, in2, args.op); + const svbool_t output_pg = narrow_to_byte_predicate(pg); + svst1(output_pg, args.output_ptr, res); +} + +template +using LoopFuncType = void (*)(svbool_t, const LoopArguments &); + +template +using BroadcastLoopFuncType = void (*)(svbool_t, const BroadcastLoopArguments &); + +template ::type, + typename OutputScalarType = typename sve_scalar::type> +void elementwise_op(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window, + OperatorType op, + LoopFuncType func, + BroadcastLoopFuncType broadcast_func) +{ + const auto all_true_pg = svptrue(); + + // Create input windows + Window input1_win = window.broadcast_if_dimension_le_one(in1->info()->tensor_shape()); + Window input2_win = window.broadcast_if_dimension_le_one(in2->info()->tensor_shape()); + + // Clear X Dimension on execution window as we handle manually + Window win = window; + win.set(Window::DimX, Window::Dimension(0, 1, 1)); + + const auto window_start_x = static_cast(window.x().start()); + const auto window_end_x = static_cast(window.x().end()); + const bool is_broadcast_across_x = in1->info()->tensor_shape().x() != in2->info()->tensor_shape().x(); + + if(is_broadcast_across_x) + { + const bool is_broadcast_input_2 = input2_win.x().step() == 0; + Window broadcast_win = is_broadcast_input_2 ? input2_win : input1_win; + Window non_broadcast_win = !is_broadcast_input_2 ? input2_win : input1_win; + const ITensor *broadcast_tensor = is_broadcast_input_2 ? in2 : in1; + const ITensor *non_broadcast_tensor = !is_broadcast_input_2 ? in2 : in1; + + // Clear X Dimension on execution window as we handle manually + non_broadcast_win.set(Window::DimX, Window::Dimension(0, 1, 1)); + + Iterator broadcast_input(broadcast_tensor, broadcast_win); + Iterator non_broadcast_input(non_broadcast_tensor, non_broadcast_win); + Iterator output(out, win); + + execute_window_loop(win, [&](const Coordinates &) + { + auto output_ptr = reinterpret_cast(output.ptr()); + const auto non_broadcast_input_ptr = reinterpret_cast(non_broadcast_input.ptr()); + const InputScalarType broadcast_value = *reinterpret_cast(broadcast_input.ptr()); + + int x = window_start_x; + + svbool_t pg = svwhilelt(x, window_end_x); + do + { + broadcast_func(pg, + { + op, + non_broadcast_input_ptr + x, + broadcast_value, + output_ptr + x, + !is_broadcast_input_2 + }); + x += svcnt(); + pg = svwhilelt(x, window_end_x); + } + while(svptest_any(all_true_pg, pg)); + }, + broadcast_input, non_broadcast_input, output); + } + else + { + // Clear X Dimension on execution window as we handle manually + input1_win.set(Window::DimX, Window::Dimension(0, 1, 1)); + input2_win.set(Window::DimX, Window::Dimension(0, 1, 1)); + + Iterator input1(in1, input1_win); + Iterator input2(in2, input2_win); + Iterator output(out, win); + + execute_window_loop(win, [&](const Coordinates &) + { + auto output_ptr = reinterpret_cast(output.ptr()); + const auto input1_ptr = reinterpret_cast(input1.ptr()); + const auto input2_ptr = reinterpret_cast(input2.ptr()); + + int x = window_start_x; + + svbool_t pg = svwhilelt(x, window_end_x); + do + { + func(pg, + { + op, + input1_ptr + x, + input2_ptr + x, + output_ptr + x + }); + x += svcnt(); + pg = svwhilelt(x, window_end_x); + } + while(svptest_any(all_true_pg, pg)); + }, + input1, input2, output); + } +} + +template +void elementwise_arithmetic_op(const ITensor *in1, const ITensor *in2, ITensor *out, const 
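/*
 * elementwise_op above is a single driver specialised only through two
 * function pointers: a full-vector loop and a broadcast loop. Broadcast along
 * x is detected by a zero step on the broadcast input's window, and the
 * reorder flag tells the inner loop which operand was the scalar. A reduced
 * sketch of that plug-in shape, with hypothetical names and plain float
 * buffers instead of tensors and windows, is given below.
 */
#include <cstddef>

struct Args
{
    const float *in1;
    const float *in2;
    float       *out;
    size_t       n;
};

using LoopFn          = void (*)(const Args &);
using BroadcastLoopFn = void (*)(const Args &, float broadcast_value, bool reorder);

void run_elementwise(const Args &args, bool in2_is_scalar, float scalar,
                     LoopFn loop, BroadcastLoopFn broadcast_loop)
{
    // Stand-in for the "input2_win.x().step() == 0" check in the real kernel.
    if(in2_is_scalar)
    {
        // reorder == false: the non-broadcast input stays the left operand.
        broadcast_loop(args, scalar, false);
    }
    else
    {
        loop(args);
    }
}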
Window &window) +{ + using VectorType = typename sve_vector::type; + + elementwise_op(in1, in2, out, window, op, + &arithmetic_op_loop, + &arithmetic_op_broadcast_loop); +} + +template +void elementwise_comparison_op(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window) +{ + static_assert(sizeof(InputScalarType) >= sizeof(OutputScalarType), "input data type's width should be equal to or greater than output data type's width"); + using InputVectorType = typename sve_vector::type; + using OutputVectorType = typename sve_vector::type; + + elementwise_op(in1, in2, out, window, op, + &comparison_op_loop, + &comparison_op_broadcast_loop); +} + +template <> +svint32_t elementwise_pow(svbool_t &pg, const svint32_t &a, const svint32_t &b) +{ + return svcvt_s32_z(pg, svpow_z(pg, svcvt_f32_z(pg, a), svcvt_f32_z(pg, b))); +} + +template <> +svint32_t elementwise_div(svbool_t &pg, const svint32_t &a, const svint32_t &b) +{ + return svcvt_s32_z(pg, svdiv_z(pg, svcvt_f32_z(pg, a), svcvt_f32_z(pg, b))); +} + +template <> +svint16_t elementwise_div(svbool_t &pg, const svint16_t &a, const svint16_t &b) +{ + ARM_COMPUTE_UNUSED(pg, a, b); + ARM_COMPUTE_ERROR("Not supported"); +} + +template void elementwise_arithmetic_op(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void elementwise_arithmetic_op(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void elementwise_arithmetic_op(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void elementwise_arithmetic_op(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); + +template void elementwise_arithmetic_op(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void elementwise_arithmetic_op(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void elementwise_arithmetic_op(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void elementwise_arithmetic_op(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); + +template void elementwise_arithmetic_op(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void elementwise_arithmetic_op(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void elementwise_arithmetic_op(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void elementwise_arithmetic_op(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); + +template void elementwise_arithmetic_op(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void elementwise_arithmetic_op(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void elementwise_arithmetic_op(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void elementwise_arithmetic_op(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); + +template void elementwise_arithmetic_op(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void elementwise_arithmetic_op(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void elementwise_arithmetic_op(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void elementwise_arithmetic_op(const ITensor *in1, const ITensor *in2, ITensor *out, const 
Window &window); + +template void elementwise_arithmetic_op(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void elementwise_arithmetic_op(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void elementwise_arithmetic_op(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void elementwise_arithmetic_op(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); + +template void elementwise_comparison_op(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void elementwise_comparison_op(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void elementwise_comparison_op(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void elementwise_comparison_op(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void elementwise_comparison_op(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); + +template void elementwise_comparison_op(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void elementwise_comparison_op(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void elementwise_comparison_op(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void elementwise_comparison_op(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void elementwise_comparison_op(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); + +template void elementwise_comparison_op(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void elementwise_comparison_op(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void elementwise_comparison_op(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void elementwise_comparison_op(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void elementwise_comparison_op(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); + +template void elementwise_comparison_op(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void elementwise_comparison_op(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void elementwise_comparison_op(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void elementwise_comparison_op(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void elementwise_comparison_op(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); + +template void elementwise_comparison_op(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void elementwise_comparison_op(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void elementwise_comparison_op(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void elementwise_comparison_op(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void elementwise_comparison_op(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); + +template void elementwise_comparison_op(const ITensor *in1, const ITensor *in2, ITensor *out, const 
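/*
 * The long instantiation lists above and below exist because the arithmetic or
 * comparison operation is a template parameter: every (operation, element
 * type) combination a caller can request at run time needs a concrete symbol
 * in this translation unit now that the definitions no longer live in the
 * header. The toy example below shows the same idea with illustrative names.
 */
#include <cstddef>

enum class Op { ADD, SUB };

template <Op op, typename T>
void apply(const T *a, const T *b, T *dst, size_t n)
{
    for(size_t i = 0; i < n; ++i)
    {
        dst[i] = (op == Op::ADD) ? static_cast<T>(a[i] + b[i]) : static_cast<T>(a[i] - b[i]);
    }
}

// One explicit instantiation per selectable combination.
template void apply<Op::ADD, float>(const float *, const float *, float *, size_t);
template void apply<Op::SUB, float>(const float *, const float *, float *, size_t);
template void apply<Op::ADD, int>(const int *, const int *, int *, size_t);
template void apply<Op::SUB, int>(const int *, const int *, int *, size_t);

// A run-time dispatcher can then hand out pointers to those instantiations.
using FloatFn = void (*)(const float *, const float *, float *, size_t);
inline FloatFn resolve(Op op)
{
    return (op == Op::ADD) ? &apply<Op::ADD, float> : &apply<Op::SUB, float>;
}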
Window &window); +template void elementwise_comparison_op(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void elementwise_comparison_op(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void elementwise_comparison_op(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +template void elementwise_comparison_op(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); +} // namespace cpu +} // namespace arm_compute \ No newline at end of file diff --git a/src/core/cpu/kernels/elementwise/sve/elementwise_list.h b/src/core/cpu/kernels/elementwise/sve/elementwise_list.h index 83c3355de4..a92a8648a8 100644 --- a/src/core/cpu/kernels/elementwise/sve/elementwise_list.h +++ b/src/core/cpu/kernels/elementwise/sve/elementwise_list.h @@ -23,50 +23,62 @@ */ #ifndef SRC_CORE_SVE_KERNELS_ELEMENTWISE_LIST_H #define SRC_CORE_SVE_KERNELS_ELEMENTWISE_LIST_H -#if defined(__ARM_FEATURE_SVE) +#if defined(ENABLE_SVE) +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/ITensor.h" #include "arm_compute/core/Types.h" +#include "arm_compute/core/Window.h" #include "arm_compute/core/utils/misc/Traits.h" #include "src/core/NEON/SVEMath.h" #include "src/core/NEON/wrapper/intrinsics/intrinsics.h" #include "src/core/NEON/wrapper/svtraits.h" +#include "src/core/cpu/kernels/elementwise/sve/elementwise_list.h" #include namespace arm_compute { namespace cpu { -namespace sve -{ using namespace arm_compute::wrapper; template -inline VectorType elementwise_pow(svbool_t &pg, const VectorType &a, const VectorType &b) +VectorType elementwise_pow(svbool_t &pg, const VectorType &a, const VectorType &b) { return svpow_z(pg, a, b); } -template <> -inline svint32_t elementwise_pow(svbool_t &pg, const svint32_t &a, const svint32_t &b) -{ - return svcvt_s32_z(pg, svpow_z(pg, svcvt_f32_z(pg, a), svcvt_f32_z(pg, b))); -} - template -inline VectorType elementwise_div(svbool_t &pg, const VectorType &a, const VectorType &b) +VectorType elementwise_div(svbool_t &pg, const VectorType &a, const VectorType &b) { return svdiv_z(pg, a, b); } -template <> -inline svint32_t elementwise_div(svbool_t &pg, const svint32_t &a, const svint32_t &b) +template +svbool_t narrow_to_byte_predicate(svbool_t pg) { - return svcvt_s32_z(pg, svdiv_z(pg, svcvt_f32_z(pg, a), svcvt_f32_z(pg, b))); + const auto all_false = svpfalse(); + + switch(bytewidth) + { + case 8: + pg = svuzp1_b32(pg, all_false); + /* fall through */ + case 4: + pg = svuzp1_b16(pg, all_false); + /* fall through */ + case 2: + pg = svuzp1_b8(pg, all_false); + /* fall through */ + default: + break; + } + return pg; } template -inline VectorType elementwise_arithmetic_op(svbool_t &pg, const VectorType &a, const VectorType &b, ArithmeticOperation op) +VectorType elementwise_arithmetic_op(svbool_t &pg, const VectorType &a, const VectorType &b, ArithmeticOperation op) { - using ScalarType = typename sve_scalar::type; + using ScalarType = typename wrapper::sve_scalar::type; VectorType res{}; switch(op) @@ -108,30 +120,8 @@ inline VectorType elementwise_arithmetic_op(svbool_t &pg, const VectorType &a, c return res; } -template -inline svbool_t narrow_to_byte_predicate(svbool_t pg) -{ - const auto all_false = svpfalse(); - - switch(bytewidth) - { - case 8: - pg = svuzp1_b32(pg, all_false); - /* fall through */ - case 4: - pg = svuzp1_b16(pg, all_false); - /* fall through */ - case 2: - pg = svuzp1_b8(pg, all_false); - /* fall through */ - default: - break; - } - return pg; 
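/*
 * narrow_to_byte_predicate above compacts a governing predicate whose
 * granularity matches the input element width (8, 4 or 2 bytes) down to byte
 * granularity by repeatedly interleaving it with an all-false predicate via
 * svuzp1; the fall-through switch performs three, two or one steps
 * respectively. That is what lets a comparison on 32-bit lanes drive a u8
 * store. A float greater-than sketch with raw intrinsics follows
 * (hypothetical function name, SVE-enabled toolchain assumed).
 */
#include <arm_sve.h>
#include <cstdint>

void greater_f32_to_u8(const float *a, const float *b, uint8_t *dst, int len)
{
    int      x  = 0;
    svbool_t pg = svwhilelt_b32(x, len);
    do
    {
        const svbool_t cmp = svcmpgt_f32(pg, svld1_f32(pg, a + x), svld1_f32(pg, b + x));

        // Two svuzp1 steps for 4-byte inputs, mirroring narrow_to_byte_predicate<4>.
        svbool_t cmp_b = svuzp1_b16(cmp, svpfalse());
        cmp_b          = svuzp1_b8(cmp_b, svpfalse());
        svbool_t pg_b  = svuzp1_b16(pg, svpfalse());
        pg_b           = svuzp1_b8(pg_b, svpfalse());

        // 0xFF where the comparison held, 0 elsewhere, one byte per input lane.
        const svuint8_t res = svsel_u8(cmp_b, svdup_n_u8(0xFF), svdup_n_u8(0));
        svst1_u8(pg_b, dst + x, res);

        x += static_cast<int>(svcntw());
        pg = svwhilelt_b32(x, len);
    }
    while(svptest_any(svptrue_b32(), pg));
}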
-} - template -inline OutputVectorType elementwise_comparison_op(svbool_t &pg, const InputVectorType &a, const InputVectorType &b, ComparisonOperation op) +OutputVectorType elementwise_comparison_op(svbool_t &pg, const InputVectorType &a, const InputVectorType &b, ComparisonOperation op) { svbool_t selection_vector{}; @@ -159,10 +149,10 @@ inline OutputVectorType elementwise_comparison_op(svbool_t &pg, const InputVecto ARM_COMPUTE_ERROR("NOT_SUPPORTED!"); } - using InputScalarType = typename sve_scalar::type; + using InputScalarType = typename wrapper::sve_scalar::type; selection_vector = narrow_to_byte_predicate(selection_vector); - using OutputScalarType = typename sve_scalar::type; + using OutputScalarType = typename wrapper::sve_scalar::type; const auto false_vector = svdup_n(static_cast((uint32_t)0)); const auto true_vector = svdup_n(static_cast(~(uint32_t)0)); auto ret = svsel(selection_vector, true_vector, false_vector); @@ -170,197 +160,12 @@ inline OutputVectorType elementwise_comparison_op(svbool_t &pg, const InputVecto return ret; } -template -struct LoopArguments -{ - OperatorType op; - const InputScalarType *input1_ptr; - const InputScalarType *input2_ptr; - OutputScalarType *output_ptr; -}; - -template -struct BroadcastLoopArguments -{ - OperatorType op; - const InputScalarType *input1_ptr; - InputScalarType broadcast_value; - OutputScalarType *output_ptr; - bool reorder; -}; - -template -inline void arithmetic_op_loop(svbool_t pg, const LoopArguments &args) -{ - const auto in1 = svld1(pg, args.input1_ptr); - const auto in2 = svld1(pg, args.input2_ptr); - const auto res = elementwise_arithmetic_op::type>(pg, in1, in2, args.op); - svst1(pg, args.output_ptr, res); -} - -template -inline void arithmetic_op_broadcast_loop(svbool_t pg, const BroadcastLoopArguments &args) -{ - const auto non_broadcast_vector = svld1(pg, args.input1_ptr); - const auto broadcast_vector = svdup_n(args.broadcast_value); - const auto in1 = args.reorder ? broadcast_vector : non_broadcast_vector; - const auto in2 = args.reorder ? non_broadcast_vector : broadcast_vector; - const auto res = elementwise_arithmetic_op::type>(pg, in1, in2, args.op); - svst1(pg, args.output_ptr, res); -} - -template -inline void comparison_op_loop(svbool_t pg, const LoopArguments &args) -{ - const auto in1 = svld1(pg, args.input1_ptr); - const auto in2 = svld1(pg, args.input2_ptr); - const auto res = elementwise_comparison_op::type, typename sve_vector::type>(pg, in1, in2, args.op); - const svbool_t output_pg = narrow_to_byte_predicate(pg); - svst1(output_pg, args.output_ptr, res); -} - -template -inline void comparison_op_broadcast_loop(svbool_t pg, const BroadcastLoopArguments &args) -{ - const auto non_broadcast_vector = svld1(pg, args.input1_ptr); - const auto broadcast_vector = svdup_n(args.broadcast_value); - const auto in1 = args.reorder ? broadcast_vector : non_broadcast_vector; - const auto in2 = args.reorder ? 
non_broadcast_vector : broadcast_vector; - const auto res = elementwise_comparison_op::type, typename sve_vector::type>(pg, in1, in2, args.op); - const svbool_t output_pg = narrow_to_byte_predicate(pg); - svst1(output_pg, args.output_ptr, res); -} - -template -using LoopFuncType = void (*)(svbool_t, const LoopArguments &); - -template -using BroadcastLoopFuncType = void (*)(svbool_t, const BroadcastLoopArguments &); - -template ::type, - typename OutputScalarType = typename sve_scalar::type> -void elementwise_op(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window, - OperatorType op, - LoopFuncType func, - BroadcastLoopFuncType broadcast_func) -{ - const auto all_true_pg = svptrue(); - - // Create input windows - Window input1_win = window.broadcast_if_dimension_le_one(in1->info()->tensor_shape()); - Window input2_win = window.broadcast_if_dimension_le_one(in2->info()->tensor_shape()); - - // Clear X Dimension on execution window as we handle manually - Window win = window; - win.set(Window::DimX, Window::Dimension(0, 1, 1)); - - const auto window_start_x = static_cast(window.x().start()); - const auto window_end_x = static_cast(window.x().end()); - const bool is_broadcast_across_x = in1->info()->tensor_shape().x() != in2->info()->tensor_shape().x(); - - if(is_broadcast_across_x) - { - const bool is_broadcast_input_2 = input2_win.x().step() == 0; - Window broadcast_win = is_broadcast_input_2 ? input2_win : input1_win; - Window non_broadcast_win = !is_broadcast_input_2 ? input2_win : input1_win; - const ITensor *broadcast_tensor = is_broadcast_input_2 ? in2 : in1; - const ITensor *non_broadcast_tensor = !is_broadcast_input_2 ? in2 : in1; - - // Clear X Dimension on execution window as we handle manually - non_broadcast_win.set(Window::DimX, Window::Dimension(0, 1, 1)); - - Iterator broadcast_input(broadcast_tensor, broadcast_win); - Iterator non_broadcast_input(non_broadcast_tensor, non_broadcast_win); - Iterator output(out, win); - - execute_window_loop(win, [&](const Coordinates &) - { - auto output_ptr = reinterpret_cast(output.ptr()); - const auto non_broadcast_input_ptr = reinterpret_cast(non_broadcast_input.ptr()); - const InputScalarType broadcast_value = *reinterpret_cast(broadcast_input.ptr()); - - int x = window_start_x; - - svbool_t pg = svwhilelt(x, window_end_x); - do - { - broadcast_func(pg, - { - op, - non_broadcast_input_ptr + x, - broadcast_value, - output_ptr + x, - !is_broadcast_input_2 - }); - x += svcnt(); - pg = svwhilelt(x, window_end_x); - } - while(svptest_any(all_true_pg, pg)); - }, - broadcast_input, non_broadcast_input, output); - } - else - { - // Clear X Dimension on execution window as we handle manually - input1_win.set(Window::DimX, Window::Dimension(0, 1, 1)); - input2_win.set(Window::DimX, Window::Dimension(0, 1, 1)); - - Iterator input1(in1, input1_win); - Iterator input2(in2, input2_win); - Iterator output(out, win); - - execute_window_loop(win, [&](const Coordinates &) - { - auto output_ptr = reinterpret_cast(output.ptr()); - const auto input1_ptr = reinterpret_cast(input1.ptr()); - const auto input2_ptr = reinterpret_cast(input2.ptr()); - - int x = window_start_x; - - svbool_t pg = svwhilelt(x, window_end_x); - do - { - func(pg, - { - op, - input1_ptr + x, - input2_ptr + x, - output_ptr + x - }); - x += svcnt(); - pg = svwhilelt(x, window_end_x); - } - while(svptest_any(all_true_pg, pg)); - }, - input1, input2, output); - } -} - template -void elementwise_arithmetic_op(const ITensor *in1, const ITensor *in2, ITensor *out, const 
Window &window) -{ - using VectorType = typename sve_vector::type; - - elementwise_op(in1, in2, out, window, op, - &arithmetic_op_loop, - &arithmetic_op_broadcast_loop); -} - -template -void elementwise_comparison_op(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window) -{ - static_assert(sizeof(InputScalarType) >= sizeof(OutputScalarType), "input data type's width should be equal to or greater than output data type's width"); - using InputVectorType = typename sve_vector::type; - using OutputVectorType = typename sve_vector::type; - - elementwise_op(in1, in2, out, window, op, - &comparison_op_loop, - &comparison_op_broadcast_loop); -} +void elementwise_arithmetic_op(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); -} // namespace sve +template +void elementwise_comparison_op(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window); } // namespace cpu } // namespace arm_compute -#endif // defined(__ARM_FEATURE_SVE) +#endif // defined(ENABLE_SVE) #endif /* SRC_CORE_SVE_KERNELS_ELEMENTWISE_LIST_H */ diff --git a/src/core/cpu/kernels/elementwise/sve/elementwise_quantized_list.h b/src/core/cpu/kernels/elementwise/sve/elementwise_quantized_list.h index b6342c727c..6c5524e284 100644 --- a/src/core/cpu/kernels/elementwise/sve/elementwise_quantized_list.h +++ b/src/core/cpu/kernels/elementwise/sve/elementwise_quantized_list.h @@ -26,14 +26,13 @@ #if defined(__ARM_FEATURE_SVE2) +#include "src/core/NEON/wrapper/svtraits.h" #include "src/core/cpu/kernels/elementwise/sve/elementwise_list.h" namespace arm_compute { namespace cpu { -namespace sve -{ using namespace arm_compute::wrapper; template @@ -176,7 +175,7 @@ inline void comparison_op_quantized_loop(svbool_t pg, const QuantizedLoopArgumen const auto in1 = load_quantized(args.input1_ptr, pg, args.in1_offset, args.in1_scale); const auto in2 = load_quantized(args.input2_ptr, pg, args.in2_offset, args.in2_scale); - using OutputVectorType = typename sve_vector::type; + using OutputVectorType = typename wrapper::traits::sve_vector::type; const auto result = svcreate4( elementwise_comparison_op(pg, svget4(in1, 0), svget4(in2, 0), args.op), @@ -200,7 +199,7 @@ inline void comparison_op_broadcast_quantized_loop(svbool_t pg, const BroadcastQ const auto &af = args.reorder ? in2 : in1; const auto &bf = args.reorder ? 
in1 : in2; - using OutputVectorType = typename sve_vector::type; + using OutputVectorType = typename wrapper::traits::sve_vector::type; const auto result = svcreate4( elementwise_comparison_op(pg, svget4(af, 0), svget4(bf, 0), args.op), @@ -221,8 +220,8 @@ template &); template ::type, - typename OutputScalarType = typename sve_scalar::type> + typename InputScalarType = typename wrapper::sve_scalar::type, + typename OutputScalarType = typename wrapper::sve_scalar::type> void elementwise_quantized_op(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window, OperatorType op, LoopQuantizedFuncType func, @@ -344,7 +343,7 @@ void elementwise_quantized_op(const ITensor *in1, const ITensor *in2, ITensor *o template void elementwise_arithmetic_quantized_op(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window) { - using VectorType = typename sve_vector::type; + using VectorType = typename wrapper::traits::sve_vector::type; elementwise_quantized_op(in1, in2, out, window, op, &arithmetic_op_quantized_loop, &arithmetic_op_broadcast_quantized_loop); @@ -354,14 +353,12 @@ template = sizeof(OutputScalarType), "input data type's width should be equal to or greater than output data type's width"); - using InputVectorType = typename sve_vector::type; - using OutputVectorType = typename sve_vector::type; + using InputVectorType = typename wrapper::traits::sve_vector::type; + using OutputVectorType = typename wrapper::traits::sve_vector::type; elementwise_quantized_op(in1, in2, out, window, op, &comparison_op_quantized_loop, &comparison_op_broadcast_quantized_loop); } - -} // namespace sve } // namespace cpu } // namespace arm_compute diff --git a/src/core/cpu/kernels/elementwise/sve/elementwise_unary.cpp b/src/core/cpu/kernels/elementwise/sve/elementwise_unary.cpp new file mode 100644 index 0000000000..cb58548f0b --- /dev/null +++ b/src/core/cpu/kernels/elementwise/sve/elementwise_unary.cpp @@ -0,0 +1,111 @@ +/* + * Copyright (c) 2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/ITensor.h" +#include "arm_compute/core/Types.h" +#include "arm_compute/core/Window.h" +#include "arm_compute/core/utils/misc/Traits.h" +#include "src/core/NEON/SVEMath.h" +#include "src/core/NEON/wrapper/intrinsics/intrinsics.h" +#include + +namespace arm_compute +{ +namespace cpu +{ +template +inline typename std::enable_if::value, VectorType>::type elementwise_op_sve_imp(svbool_t pg, ElementWiseUnary op, const VectorType &a) +{ + switch(op) + { + case ElementWiseUnary::RSQRT: + return svinvsqrt(pg, a); + case ElementWiseUnary::EXP: + return wrapper::svexp_z(pg, a); + case ElementWiseUnary::NEG: + return svneg_z(pg, a); + case ElementWiseUnary::LOG: + return wrapper::svlog_z(pg, a); + case ElementWiseUnary::ABS: + return svabs_z(pg, a); + case ElementWiseUnary::ROUND: + return svrintn_z(pg, a); + case ElementWiseUnary::SIN: + return wrapper::svsin_z(pg, a); + default: + ARM_COMPUTE_ERROR("NOT_SUPPORTED"); + } +} + +template +inline typename std::enable_if::value, VectorType>::type elementwise_op_sve_imp(svbool_t pg, ElementWiseUnary op, const VectorType &a) +{ + switch(op) + { + case ElementWiseUnary::NEG: + return svneg_z(pg, a); + case ElementWiseUnary::ABS: + return svabs_z(pg, a); + default: + ARM_COMPUTE_ERROR("NOT_SUPPORTED"); + } +} + +template +void elementwise_sve_op(const ITensor *in, ITensor *out, const Window &window, ElementWiseUnary op) +{ + const auto all_true_pg = wrapper::svptrue(); + const auto window_start_x = static_cast(window.x().start()); + const auto window_end_x = static_cast(window.x().end()); + + Window win = window; + win.set(Window::DimX, Window::Dimension(0, 1, 1)); + + Iterator input(in, win); + Iterator output(out, win); + + execute_window_loop(win, [&](const Coordinates &) + { + auto output_ptr = reinterpret_cast(output.ptr()); + const auto input_ptr = reinterpret_cast(input.ptr()); + int x = window_start_x; + + svbool_t pg = wrapper::svwhilelt(x, window_end_x); + do + { + const auto vin = svld1(pg, input_ptr + x); + svst1(pg, output_ptr + x, elementwise_op_sve_imp(pg, op, vin)); + x += wrapper::svcnt(); + pg = wrapper::svwhilelt(x, window_end_x); + } + while(svptest_any(all_true_pg, pg)); + }, + input, output); +} + +template void elementwise_sve_op(const ITensor *in, ITensor *out, const Window &window, ElementWiseUnary op); +template void elementwise_sve_op(const ITensor *in, ITensor *out, const Window &window, ElementWiseUnary op); +template void elementwise_sve_op(const ITensor *in, ITensor *out, const Window &window, ElementWiseUnary op); +} // namespace cpu +} // namespace arm_compute \ No newline at end of file diff --git a/src/core/cpu/kernels/elementwise/sve/elementwise_unary_list.h b/src/core/cpu/kernels/elementwise/sve/elementwise_unary_list.h index 23502c71e5..63490421e9 100644 --- a/src/core/cpu/kernels/elementwise/sve/elementwise_unary_list.h +++ b/src/core/cpu/kernels/elementwise/sve/elementwise_unary_list.h @@ -25,87 +25,15 @@ #define SRC_CORE_SVE_KERNELS_ELEMENTWISE_UNARY_LIST_H #include "arm_compute/core/Types.h" -#include "arm_compute/core/utils/misc/Traits.h" -#include "src/core/NEON/wrapper/intrinsics/intrinsics.h" -#if defined(__ARM_FEATURE_SVE) -#include "src/core/NEON/SVEMath.h" -#include +#if defined(ENABLE_SVE) namespace arm_compute { namespace cpu { -template -inline typename std::enable_if::value, VectorType>::type elementwise_op_sve_imp(svbool_t pg, ElementWiseUnary op, const VectorType &a) -{ - switch(op) - { - case ElementWiseUnary::RSQRT: - return 
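/*
 * elementwise_op_sve_imp above is split with std::enable_if: the
 * floating-point overload supports RSQRT/EXP/NEG/LOG/ABS/ROUND/SIN, while the
 * integer overload only supports NEG and ABS, and elementwise_sve_op drives
 * either through the same predicated loop. The scalar toy below (illustrative
 * names, no SVE) shows just the SFINAE split.
 */
#include <cmath>
#include <stdexcept>
#include <type_traits>

enum class UnaryOp { RSQRT, EXP, NEG, ABS };

template <typename T>
typename std::enable_if<std::is_floating_point<T>::value, T>::type
apply_unary(UnaryOp op, T a)
{
    switch(op)
    {
        case UnaryOp::RSQRT: return static_cast<T>(1) / std::sqrt(a);
        case UnaryOp::EXP:   return std::exp(a);
        case UnaryOp::NEG:   return -a;
        case UnaryOp::ABS:   return std::abs(a);
    }
    throw std::invalid_argument("unsupported op");
}

template <typename T>
typename std::enable_if<std::is_integral<T>::value, T>::type
apply_unary(UnaryOp op, T a)
{
    switch(op)
    {
        case UnaryOp::NEG: return static_cast<T>(-a);
        case UnaryOp::ABS: return a < 0 ? static_cast<T>(-a) : a;
        default: throw std::invalid_argument("unsupported op for integer types");
    }
}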
svinvsqrt(pg, a); - case ElementWiseUnary::EXP: - return wrapper::svexp_z(pg, a); - case ElementWiseUnary::NEG: - return svneg_z(pg, a); - case ElementWiseUnary::LOG: - return wrapper::svlog_z(pg, a); - case ElementWiseUnary::ABS: - return svabs_z(pg, a); - case ElementWiseUnary::ROUND: - return svrintn_z(pg, a); - case ElementWiseUnary::SIN: - return wrapper::svsin_z(pg, a); - default: - ARM_COMPUTE_ERROR("NOT_SUPPORTED"); - } -} - -template -inline typename std::enable_if::value, VectorType>::type elementwise_op_sve_imp(svbool_t pg, ElementWiseUnary op, const VectorType &a) -{ - switch(op) - { - case ElementWiseUnary::NEG: - return svneg_z(pg, a); - case ElementWiseUnary::ABS: - return svabs_z(pg, a); - default: - ARM_COMPUTE_ERROR("NOT_SUPPORTED"); - } -} - template -void elementwise_sve_op(const ITensor *in, ITensor *out, const Window &window, ElementWiseUnary op) -{ - const auto all_true_pg = wrapper::svptrue(); - const auto window_start_x = static_cast(window.x().start()); - const auto window_end_x = static_cast(window.x().end()); - - Window win = window; - win.set(Window::DimX, Window::Dimension(0, 1, 1)); - - Iterator input(in, win); - Iterator output(out, win); - - execute_window_loop(win, [&](const Coordinates &) - { - auto output_ptr = reinterpret_cast(output.ptr()); - const auto input_ptr = reinterpret_cast(input.ptr()); - int x = window_start_x; - - svbool_t pg = wrapper::svwhilelt(x, window_end_x); - do - { - const auto vin = svld1(pg, input_ptr + x); - svst1(pg, output_ptr + x, elementwise_op_sve_imp(pg, op, vin)); - x += wrapper::svcnt(); - pg = wrapper::svwhilelt(x, window_end_x); - } - while(svptest_any(all_true_pg, pg)); - }, - input, output); -} - +void elementwise_sve_op(const ITensor *in, ITensor *out, const Window &window, ElementWiseUnary op); } // namespace cpu } // namespace arm_compute -#endif // defined(__ARM_FEATURE_SVE) +#endif // defined(ENABLE_SVE) #endif // SRC_CORE_NEON_KERNELS_ELEMENTWISE_UNARY_LIST_H \ No newline at end of file diff --git a/src/core/cpu/kernels/floor/NEON/fp16.cpp b/src/core/cpu/kernels/floor/NEON/fp16.cpp deleted file mode 100644 index f362676a36..0000000000 --- a/src/core/cpu/kernels/floor/NEON/fp16.cpp +++ /dev/null @@ -1,64 +0,0 @@ -/* - * Copyright (c) 2020-2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */ -#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS) - -#include "src/common/utils/Validate.h" -#include "src/core/NEON/NEMath.h" - -#include -#include -#include - -namespace arm_compute -{ -namespace cpu -{ -constexpr int step = 8; - -void fp16_neon_floor(const void *src, void *dst, int len) -{ - ARM_COMPUTE_ASSERT_NOT_NULLPTR(src); - ARM_COMPUTE_ASSERT_NOT_NULLPTR(dst); - ARM_COMPUTE_ASSERT(len >= 0); - - auto psrc = static_cast(src); - auto pdst = static_cast<__fp16 *>(dst); - - for(; len >= step; len -= step) - { - vst1q_f16(pdst, vfloorq_f16(vld1q_f16(psrc))); - psrc += step; - pdst += step; - } - - for(; len > 0; --len) - { - *pdst = std::floor(*psrc); - ++psrc; - ++pdst; - } -} -} // namespace cpu -} // namespace arm_compute -#endif /* defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS) */ diff --git a/src/core/cpu/kernels/floor/NEON/fp32.cpp b/src/core/cpu/kernels/floor/NEON/fp32.cpp deleted file mode 100644 index f5efb2e849..0000000000 --- a/src/core/cpu/kernels/floor/NEON/fp32.cpp +++ /dev/null @@ -1,61 +0,0 @@ -/* - * Copyright (c) 2020-2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "src/common/utils/Validate.h" -#include "src/core/NEON/NEMath.h" - -#include -#include -#include - -namespace arm_compute -{ -namespace cpu -{ -constexpr int step = 4; - -void fp32_neon_floor(const void *src, void *dst, int len) -{ - ARM_COMPUTE_ASSERT_NOT_NULLPTR(src); - ARM_COMPUTE_ASSERT_NOT_NULLPTR(dst); - ARM_COMPUTE_ASSERT(len >= 0); - - auto psrc = static_cast(src); - auto pdst = static_cast(dst); - - for(; len >= step; len -= step) - { - vst1q_f32(pdst, vfloorq_f32(vld1q_f32(psrc))); - psrc += step; - pdst += step; - } - - for(; len > 0; --len) - { - *pdst = std::floor(*psrc); - ++pdst; - ++psrc; - } -} -} // namespace cpu -} // namespace arm_compute diff --git a/src/core/cpu/kernels/floor/neon/fp16.cpp b/src/core/cpu/kernels/floor/neon/fp16.cpp new file mode 100644 index 0000000000..f362676a36 --- /dev/null +++ b/src/core/cpu/kernels/floor/neon/fp16.cpp @@ -0,0 +1,64 @@ +/* + * Copyright (c) 2020-2021 Arm Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS) + +#include "src/common/utils/Validate.h" +#include "src/core/NEON/NEMath.h" + +#include +#include +#include + +namespace arm_compute +{ +namespace cpu +{ +constexpr int step = 8; + +void fp16_neon_floor(const void *src, void *dst, int len) +{ + ARM_COMPUTE_ASSERT_NOT_NULLPTR(src); + ARM_COMPUTE_ASSERT_NOT_NULLPTR(dst); + ARM_COMPUTE_ASSERT(len >= 0); + + auto psrc = static_cast(src); + auto pdst = static_cast<__fp16 *>(dst); + + for(; len >= step; len -= step) + { + vst1q_f16(pdst, vfloorq_f16(vld1q_f16(psrc))); + psrc += step; + pdst += step; + } + + for(; len > 0; --len) + { + *pdst = std::floor(*psrc); + ++psrc; + ++pdst; + } +} +} // namespace cpu +} // namespace arm_compute +#endif /* defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS) */ diff --git a/src/core/cpu/kernels/floor/neon/fp32.cpp b/src/core/cpu/kernels/floor/neon/fp32.cpp new file mode 100644 index 0000000000..f5efb2e849 --- /dev/null +++ b/src/core/cpu/kernels/floor/neon/fp32.cpp @@ -0,0 +1,61 @@ +/* + * Copyright (c) 2020-2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#include "src/common/utils/Validate.h" +#include "src/core/NEON/NEMath.h" + +#include +#include +#include + +namespace arm_compute +{ +namespace cpu +{ +constexpr int step = 4; + +void fp32_neon_floor(const void *src, void *dst, int len) +{ + ARM_COMPUTE_ASSERT_NOT_NULLPTR(src); + ARM_COMPUTE_ASSERT_NOT_NULLPTR(dst); + ARM_COMPUTE_ASSERT(len >= 0); + + auto psrc = static_cast(src); + auto pdst = static_cast(dst); + + for(; len >= step; len -= step) + { + vst1q_f32(pdst, vfloorq_f32(vld1q_f32(psrc))); + psrc += step; + pdst += step; + } + + for(; len > 0; --len) + { + *pdst = std::floor(*psrc); + ++pdst; + ++psrc; + } +} +} // namespace cpu +} // namespace arm_compute diff --git a/src/core/cpu/kernels/scale/sve/fp16.cpp b/src/core/cpu/kernels/scale/sve/fp16.cpp index 99f08dbdf9..5b9377c6e6 100644 --- a/src/core/cpu/kernels/scale/sve/fp16.cpp +++ b/src/core/cpu/kernels/scale/sve/fp16.cpp @@ -21,6 +21,8 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ + +#if defined(ENABLE_SVE) #include "arm_compute/core/Helpers.h" #include "arm_compute/core/ITensorPack.h" #include "arm_compute/core/Window.h" @@ -30,12 +32,10 @@ #include "src/core/utils/ScaleUtils.h" #include "support/Rounding.h" +#include #include #include -#if defined(__ARM_FEATURE_SVE) -#include - namespace arm_compute { namespace @@ -173,4 +173,4 @@ void fp16_sve_scale(const ITensor *src, ITensor *dst, const ITensor *offsets, co } // namespace cpu } // namespace arm_compute -#endif // __ARM_FEATURE_SVE \ No newline at end of file +#endif // ENABLE_SVE \ No newline at end of file diff --git a/src/core/cpu/kernels/scale/sve/fp32.cpp b/src/core/cpu/kernels/scale/sve/fp32.cpp index 94055ae953..05fbedf20d 100644 --- a/src/core/cpu/kernels/scale/sve/fp32.cpp +++ b/src/core/cpu/kernels/scale/sve/fp32.cpp @@ -21,6 +21,7 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ +#if defined(ENABLE_SVE) #include "arm_compute/core/Helpers.h" #include "arm_compute/core/ITensorPack.h" #include "arm_compute/core/Window.h" @@ -33,7 +34,6 @@ #include #include -#if defined(__ARM_FEATURE_SVE) #include namespace arm_compute @@ -171,4 +171,4 @@ void fp32_sve_scale(const ITensor *src, ITensor *dst, const ITensor *offsets, co } // namespace cpu } // namespace arm_compute -#endif // __ARM_FEATURE_SVE \ No newline at end of file +#endif // ENABLE_SVE \ No newline at end of file diff --git a/src/core/cpu/kernels/scale/sve/integer.cpp b/src/core/cpu/kernels/scale/sve/integer.cpp index 2a724ece31..d7e270c661 100644 --- a/src/core/cpu/kernels/scale/sve/integer.cpp +++ b/src/core/cpu/kernels/scale/sve/integer.cpp @@ -21,6 +21,7 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. 
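/*
 * The floor kernels re-added above (now under the lower-case neon/ directory)
 * keep their original signatures, so they can be exercised directly with plain
 * buffers; the expected results follow std::floor. Below is an assumed
 * standalone driver -- inside the library the entry point is reached through
 * the registered function pointer instead.
 */
#include <vector>

namespace arm_compute { namespace cpu { void fp32_neon_floor(const void *src, void *dst, int len); } }

int main()
{
    std::vector<float> src = { 1.7f, -2.3f, 0.5f, -0.5f };
    std::vector<float> dst(src.size());

    arm_compute::cpu::fp32_neon_floor(src.data(), dst.data(), static_cast<int>(src.size()));
    // dst now holds { 1.0f, -3.0f, 0.0f, -1.0f }
    return 0;
}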
*/ +#if defined(ENABLE_SVE) #include "arm_compute/core/Helpers.h" #include "arm_compute/core/ITensorPack.h" #include "arm_compute/core/Window.h" @@ -30,12 +31,10 @@ #include "src/core/utils/ScaleUtils.h" #include "support/Rounding.h" +#include #include #include -#if defined(__ARM_FEATURE_SVE) -#include - namespace arm_compute { namespace @@ -298,4 +297,4 @@ void s16_sve_scale(const ITensor *src, ITensor *dst, const ITensor *offsets, con } // namespace cpu } // namespace arm_compute -#endif // __ARM_FEATURE_SVE \ No newline at end of file +#endif // ENABLE_SVE \ No newline at end of file diff --git a/src/core/cpu/kernels/scale/sve/qasymm8.cpp b/src/core/cpu/kernels/scale/sve/qasymm8.cpp index c041f14b22..f747037938 100644 --- a/src/core/cpu/kernels/scale/sve/qasymm8.cpp +++ b/src/core/cpu/kernels/scale/sve/qasymm8.cpp @@ -21,6 +21,7 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ +#if defined(ENABLE_SVE) #include "arm_compute/core/Helpers.h" #include "arm_compute/core/ITensorPack.h" #include "arm_compute/core/Window.h" @@ -31,12 +32,10 @@ #include "src/core/utils/ScaleUtils.h" #include "support/Rounding.h" +#include #include #include -#if defined(__ARM_FEATURE_SVE) -#include - namespace arm_compute { namespace @@ -90,8 +89,8 @@ void qasymm8_sve_scale_bilinear(const ITensor *src, ITensor *dst, const ITensor bool align_corners, const Window &window) { // Data layout is NHWC - const int idx_width = 1; - const int idx_height = 2; + const int idx_width = 1; + const int idx_height = 2; // Compute the ratio between source height and destination height const auto hr = scale_utils::calculate_resize_ratio(src->info()->dimension(idx_height), dst->info()->dimension(idx_height), align_corners); @@ -205,4 +204,4 @@ void qasymm8_sve_scale(const ITensor *src, ITensor *dst, const ITensor *offsets, } // namespace cpu } // namespace arm_compute -#endif // __ARM_FEATURE_SVE \ No newline at end of file +#endif // defined(ENABLE_SVE) \ No newline at end of file diff --git a/src/core/cpu/kernels/scale/sve/qasymm8_signed.cpp b/src/core/cpu/kernels/scale/sve/qasymm8_signed.cpp index 9df4301fe3..584ec7a0da 100644 --- a/src/core/cpu/kernels/scale/sve/qasymm8_signed.cpp +++ b/src/core/cpu/kernels/scale/sve/qasymm8_signed.cpp @@ -21,6 +21,7 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. 
*/ +#if defined(ENABLE_SVE) #include "arm_compute/core/Helpers.h" #include "arm_compute/core/ITensorPack.h" #include "arm_compute/core/Window.h" @@ -31,12 +32,10 @@ #include "src/core/utils/ScaleUtils.h" #include "support/Rounding.h" +#include #include #include -#if defined(__ARM_FEATURE_SVE) -#include - namespace arm_compute { namespace @@ -90,8 +89,8 @@ void qasymm8_signed_sve_scale_bilinear(const ITensor *src, ITensor *dst, const I bool align_corners, const Window &window) { // Data layout is NHWC - const int idx_width = 1; - const int idx_height = 2; + const int idx_width = 1; + const int idx_height = 2; // Compute the ratio between source height and destination height const auto hr = scale_utils::calculate_resize_ratio(src->info()->dimension(idx_height), dst->info()->dimension(idx_height), align_corners); @@ -205,4 +204,4 @@ void qasymm8_signed_sve_scale(const ITensor *src, ITensor *dst, const ITensor *o } // namespace cpu } // namespace arm_compute -#endif // __ARM_FEATURE_SVE \ No newline at end of file +#endif // ENABLE_SVE \ No newline at end of file diff --git a/src/core/cpu/kernels/softmax/impl/NEON/list.h b/src/core/cpu/kernels/softmax/impl/NEON/list.h deleted file mode 100644 index 5ebee31272..0000000000 --- a/src/core/cpu/kernels/softmax/impl/NEON/list.h +++ /dev/null @@ -1,388 +0,0 @@ -/* - * Copyright (c) 2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#ifndef SRC_CORE_NEON_KERNELS_SOFTMAX_LIST_H -#define SRC_CORE_NEON_KERNELS_SOFTMAX_LIST_H - -#include "src/core/NEON/NEFixedPoint.h" -#include "src/core/NEON/NEMath.h" -#include "src/core/NEON/wrapper/wrapper.h" -#include "support/SaturateCast.h" - -namespace arm_compute -{ -namespace cpu -{ -template -void neon_logits_1d_max(const ITensor *in, ITensor *out, const Window &window) -{ - /** SIMD vector tag type. 
*/ - using ExactTagType = typename wrapper::traits::neon_bitvector_tag_t; - - constexpr int window_step_x = 16 / sizeof(T); - const auto window_start_x = static_cast(window.x().start()); - const auto window_end_x = static_cast(window.x().end()); - - Window win{ window }; - win.set(Window::DimX, Window::Dimension(0, 1, 1)); - Iterator input(in, win); - Iterator output(out, win); - - const int sum_stages = log2(window_step_x / 2); - execute_window_loop(win, [&](const Coordinates &) - { - // Get pointers - const auto in_ptr = reinterpret_cast(input.ptr()); - const auto out_ptr = reinterpret_cast(output.ptr()); - - // Init max value - auto vec_max = wrapper::vdup_n(support::cpp11::lowest(), ExactTagType{}); - int x = window_start_x; - - for(; x <= (window_end_x - window_step_x); x += window_step_x) - { - const auto current_value = wrapper::vloadq(in_ptr + x); - vec_max = wrapper::vmax(vec_max, current_value); - } - auto carry_max = wrapper::vpmax(wrapper::vgethigh(vec_max), wrapper::vgetlow(vec_max)); - - for(int i = 0; i < sum_stages; ++i) - { - carry_max = wrapper::vpmax(carry_max, carry_max); - } - T max_val = wrapper::vgetlane(carry_max, 0); - - // Compute left-over elements - for(; x < window_end_x; ++x) - { - max_val = *(in_ptr + x) > max_val ? *(in_ptr + x) : max_val; - } - - *out_ptr = max_val; - }, - input, output); -} - -template -void neon_softmax_logits_1d_quantized(const ITensor *in, const ITensor *max, void *const tmp, - ITensor *out, float beta, bool is_log, const Window &window) -{ - static_assert(std::is_same::value - || std::is_same::value, - "quantized type should be either qasymm8_t or qasymm8_signed_t."); - - const int start_x = in->info()->valid_region().anchor.x(); - const int input_width = in->info()->valid_region().shape.x(); - - const float scale_beta = -beta * in->info()->quantization_info().uniform().scale; - const auto scale_beta_vec = vdupq_n_f32(scale_beta); - - Iterator in_it(in, window); - Iterator max_it(max, window); - Iterator out_it(out, window); - constexpr int vec_size = 16; - - execute_window_loop(window, [&](const Coordinates &) - { - /* Get pointers */ - const auto in_ptr = reinterpret_cast(in_it.ptr()) + start_x; - const auto out_ptr = reinterpret_cast(out_it.ptr()) + start_x; - const auto tmp_ptr = reinterpret_cast(tmp); - - float sum{}; - float sum_inversed{}; - - /* Compute exponentials and sum */ - { - /* Get max value */ - const auto max_val = *reinterpret_cast(max_it.ptr()); - const auto vec_max = wrapper::vdup_n(max_val, wrapper::traits::vector_128_tag{}); - - /* Init sum to zero */ - float32x4x4_t vec_sum = - { - vdupq_n_f32(0.f), - vdupq_n_f32(0.f), - vdupq_n_f32(0.f), - vdupq_n_f32(0.f), - }; - - /* Loop over row and compute exponentials and sum */ - int x = 0; - for(; x <= (input_width - vec_size); x += vec_size) - { - auto vec_elements = wrapper::vloadq(in_ptr + x); - vec_elements = wrapper::vqsub(vec_max, vec_elements); - auto vec_elements_flt = convert_int_to_float(vec_elements); - - if(is_log) - { - vec_elements_flt.val[0] = vmulq_f32(vec_elements_flt.val[0], scale_beta_vec); - vec_elements_flt.val[1] = vmulq_f32(vec_elements_flt.val[1], scale_beta_vec); - vec_elements_flt.val[2] = vmulq_f32(vec_elements_flt.val[2], scale_beta_vec); - vec_elements_flt.val[3] = vmulq_f32(vec_elements_flt.val[3], scale_beta_vec); - vec_sum.val[0] = vaddq_f32(vec_sum.val[0], vexpq_f32(vec_elements_flt.val[0])); - vec_sum.val[1] = vaddq_f32(vec_sum.val[1], vexpq_f32(vec_elements_flt.val[1])); - vec_sum.val[2] = vaddq_f32(vec_sum.val[2], 
vexpq_f32(vec_elements_flt.val[2])); - vec_sum.val[3] = vaddq_f32(vec_sum.val[3], vexpq_f32(vec_elements_flt.val[3])); - } - else - { - vec_elements_flt.val[0] = vexpq_f32(vmulq_f32(vec_elements_flt.val[0], scale_beta_vec)); - vec_elements_flt.val[1] = vexpq_f32(vmulq_f32(vec_elements_flt.val[1], scale_beta_vec)); - vec_elements_flt.val[2] = vexpq_f32(vmulq_f32(vec_elements_flt.val[2], scale_beta_vec)); - vec_elements_flt.val[3] = vexpq_f32(vmulq_f32(vec_elements_flt.val[3], scale_beta_vec)); - vec_sum.val[0] = vaddq_f32(vec_sum.val[0], vec_elements_flt.val[0]); - vec_sum.val[1] = vaddq_f32(vec_sum.val[1], vec_elements_flt.val[1]); - vec_sum.val[2] = vaddq_f32(vec_sum.val[2], vec_elements_flt.val[2]); - vec_sum.val[3] = vaddq_f32(vec_sum.val[3], vec_elements_flt.val[3]); - } - - vst4q_f32(tmp_ptr + x, vec_elements_flt); - } - - /* Reduce sum */ - const auto sum_16_byte = vaddq_f32(vaddq_f32(vec_sum.val[0], vec_sum.val[1]), vaddq_f32(vec_sum.val[2], vec_sum.val[3])); - auto sum_res = vpadd_f32(vget_high_f32(sum_16_byte), vget_low_f32(sum_16_byte)); - sum_res = vpadd_f32(sum_res, sum_res); - sum = wrapper::vgetlane(sum_res, 0); - - /* Run remaining elements */ - for(; x < input_width; ++x) - { - float element{}; - if(is_log) - { - element = (max_val - in_ptr[x]) * scale_beta; - sum += std::exp(element); - } - else - { - element = std::exp((max_val - in_ptr[x]) * scale_beta); - sum += element; - } - - tmp_ptr[x] = element; - } - - if(!is_log) - { - sum_inversed = 256.f / sum; - } - else - { - sum = std::log(sum); - } - } - - /* Normalize exponentials */ - { - constexpr bool is_qasymm8_signed = std::is_same::value; - /* Loop over row and compute softmax */ - int x = 0; - for(; x <= (input_width - vec_size); x += vec_size) - { - using int_vec_type = wrapper::traits::neon_vector_t; - float32x4x4_t vec_in = vld4q_f32(tmp_ptr + x); - int_vec_type normalized_value{}; - if(is_log) - { - const float32x4x4_t sub = - { - vsubq_f32(vec_in.val[0], vdupq_n_f32(sum)), - vsubq_f32(vec_in.val[1], vdupq_n_f32(sum)), - vsubq_f32(vec_in.val[2], vdupq_n_f32(sum)), - vsubq_f32(vec_in.val[3], vdupq_n_f32(sum)), - }; - normalized_value = convert_float_to_int(sub); - } - else - { - float32x4x4_t mul = - { - vmulq_f32(vec_in.val[0], vdupq_n_f32(sum_inversed)), - vmulq_f32(vec_in.val[1], vdupq_n_f32(sum_inversed)), - vmulq_f32(vec_in.val[2], vdupq_n_f32(sum_inversed)), - vmulq_f32(vec_in.val[3], vdupq_n_f32(sum_inversed)), - }; - - if(is_qasymm8_signed) - { - const auto offset_vec = wrapper::vdup_n(128.f, wrapper::traits::vector_128_tag{}); - mul.val[0] = wrapper::vsub(mul.val[0], offset_vec); - mul.val[1] = wrapper::vsub(mul.val[1], offset_vec); - mul.val[2] = wrapper::vsub(mul.val[2], offset_vec); - mul.val[3] = wrapper::vsub(mul.val[3], offset_vec); - } - - normalized_value = convert_float_to_int(mul); - } - wrapper::vstore(out_ptr + x, normalized_value); - } - /* Run remaining elements */ - for(; x < input_width; ++x) - { - if(is_log) - { - out_ptr[x] = utils::cast::saturate_cast(tmp_ptr[x] - sum); - } - else - { - out_ptr[x] = utils::cast::saturate_cast((tmp_ptr[x] * sum_inversed) - (is_qasymm8_signed ? 
128.f : 0)); - } - } - } - }, - in_it, max_it, out_it); -} - -template -void neon_softmax_logits_1d_float(const ITensor *in, const ITensor *max, void *const tmp, - ITensor *out, const float beta, bool is_log, const Window &window) -{ - const int start_x = in->info()->valid_region().anchor.x(); - const int input_width = in->info()->valid_region().shape.x(); - - Iterator in_it(in, window); - Iterator max_it(max, window); - Iterator out_it(out, window); - - /** SIMD vector tag type. */ - using ExactTagType = typename wrapper::traits::neon_bitvector_tag_t; - - constexpr int vec_size = 16 / sizeof(T); - const int sum_stages = log2(vec_size / 2); - - execute_window_loop(window, [&](const Coordinates &) - { - /* Get pointers */ - const auto in_ptr = reinterpret_cast(in_it.ptr()) + start_x; - const auto out_ptr = reinterpret_cast(out_it.ptr()) + start_x; - const auto tmp_ptr = reinterpret_cast(tmp); - - T sum{}; - T sum_inversed{}; - - /* Compute exponentials and sum */ - { - /* Get max value */ - const auto max_val = *reinterpret_cast(max_it.ptr()); - const auto vec_max = wrapper::vdup_n(max_val, ExactTagType{}); - - /* Init sum to zero */ - auto vec_sum = wrapper::vdup_n(static_cast(0), ExactTagType{}); - - /* Loop over row and compute exponentials and sum */ - int x = 0; - for(; x <= (input_width - vec_size); x += vec_size) - { - auto vec_elements = wrapper::vloadq(in_ptr + x); - vec_elements = wrapper::vsub(vec_elements, vec_max); - if(is_log) - { - vec_elements = wrapper::vmul(vec_elements, wrapper::vdup_n(static_cast(beta), ExactTagType{})); - vec_sum = wrapper::vadd(vec_sum, wrapper::vexpq(vec_elements)); - } - else - { - vec_elements = wrapper::vexpq(wrapper::vmul(vec_elements, wrapper::vdup_n(static_cast(beta), ExactTagType{}))); - vec_sum = wrapper::vadd(vec_sum, vec_elements); - } - wrapper::vstore(tmp_ptr + x, vec_elements); - } - - /* Reduce sum */ - auto sum_res = wrapper::vpadd(wrapper::vgethigh(vec_sum), wrapper::vgetlow(vec_sum)); - for(int i = 0; i < sum_stages; ++i) - { - sum_res = wrapper::vpadd(sum_res, sum_res); - } - sum = wrapper::vgetlane(sum_res, 0); - - /* Run remaining elements */ - for(; x < input_width; ++x) - { - T element{}; - - if(is_log) - { - element = (in_ptr[x] - max_val) * beta; - sum += std::exp(element); - } - else - { - element = std::exp((in_ptr[x] - max_val) * beta); - sum += element; - } - tmp_ptr[x] = element; - } - - if(!is_log) - { - sum_inversed = T(1) / sum; - } - else - { - sum = static_cast(std::log(sum)); - } - } - - /* Normalize exponentials */ - { - /* Loop over row and compute softmax */ - int x = 0; - for(; x <= (input_width - vec_size); x += vec_size) - { - auto vec_in = wrapper::vloadq(tmp_ptr + x); - auto normalized_value = wrapper::vdup_n(static_cast(0), ExactTagType{}); - if(is_log) - { - normalized_value = wrapper::vsub(vec_in, wrapper::vdup_n(static_cast(sum), ExactTagType{})); - } - else - { - normalized_value = wrapper::vmul(vec_in, wrapper::vdup_n(static_cast(sum_inversed), ExactTagType{})); - } - wrapper::vstore(out_ptr + x, normalized_value); - } - /* Run remaining elements */ - for(; x < input_width; ++x) - { - if(is_log) - { - out_ptr[x] = tmp_ptr[x] - sum; - } - else - { - out_ptr[x] = tmp_ptr[x] * sum_inversed; - } - } - } - }, - in_it, max_it, out_it); -} - -} // namespace cpu -} // namespace arm_compute - -#endif /* SRC_CORE_NEON_KERNELS_SOFTMAX_LIST_H */ diff --git a/src/core/cpu/kernels/softmax/impl/SVE/list.h b/src/core/cpu/kernels/softmax/impl/SVE/list.h deleted file mode 100644 index d558d7d193..0000000000 --- 
a/src/core/cpu/kernels/softmax/impl/SVE/list.h +++ /dev/null @@ -1,353 +0,0 @@ -/* - * Copyright (c) 2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#ifndef SRC_CORE_SVE_KERNELS_SOFTMAX_LIST_H -#define SRC_CORE_SVE_KERNELS_SOFTMAX_LIST_H - -#if defined(__ARM_FEATURE_SVE) -#include "arm_compute/core/Types.h" -#include "arm_compute/core/utils/misc/Traits.h" -#include "src/core/NEON/SVEMath.h" -#include "src/core/NEON/wrapper/intrinsics/intrinsics.h" -#include - -namespace arm_compute -{ -namespace cpu -{ -template -void sve_logits_1d_max(const ITensor *in, ITensor *out, const Window &window) -{ - const auto all_true_pg = wrapper::svptrue(); - const auto window_start_x = static_cast(window.x().start()); - const auto window_end_x = static_cast(window.x().end()); - - Window win{ window }; - win.set(Window::DimX, Window::Dimension(0, 1, 1)); - Iterator input(in, win); - Iterator output(out, win); - - execute_window_loop(win, [&](const Coordinates &) - { - // Get pointers - const auto in_ptr = reinterpret_cast(input.ptr()); - const auto out_ptr = reinterpret_cast(output.ptr()); - - // Init max value - auto vec_max = wrapper::svdup_n(support::cpp11::lowest()); - - int x = window_start_x; - svbool_t pg = wrapper::svwhilelt(x, window_end_x); - do - { - const auto current_value = svld1(pg, in_ptr + x); - vec_max = svmax_m(pg, vec_max, current_value); - - x += wrapper::svcnt(); - pg = wrapper::svwhilelt(x, window_end_x); - } - while(svptest_any(all_true_pg, pg)); - - auto max_val = svmaxv(all_true_pg, vec_max); - - *out_ptr = max_val; - }, - input, output); -} - -#if defined(__ARM_FEATURE_SVE2) -template -void sve_softmax_logits_1d_quantized(const ITensor *in, const ITensor *max, void *const tmp, - ITensor *out, float beta, bool is_log, const Window &window) -{ - const int start_x = in->info()->valid_region().anchor.x(); - const int input_width = in->info()->valid_region().shape.x(); - - const float scale_beta = -beta * in->info()->quantization_info().uniform().scale; - const auto scale_beta_vec = svdup_n_f32(scale_beta); - - Iterator in_it(in, window); - Iterator max_it(max, window); - Iterator out_it(out, window); - const auto all_true_pg = wrapper::svptrue(); - using SVEType = typename wrapper::traits::sve_vector::type; - - const int inc_1 = static_cast(svcntw()); - const int inc_2 = static_cast(2 * svcntw()); - const int inc_3 = static_cast(3 * svcntw()); - - execute_window_loop(window, [&](const Coordinates &) - 
{ - /* Get pointers */ - const auto in_ptr = reinterpret_cast(in_it.ptr()) + start_x; - const auto out_ptr = reinterpret_cast(out_it.ptr()) + start_x; - const auto tmp_ptr = reinterpret_cast(tmp); - - float sum{}; - - /* Compute exponentials and sum */ - { - /* Get max value */ - const auto max_val = *reinterpret_cast(max_it.ptr()); - const auto vec_max = wrapper::svdup_n(max_val); - - /* Init sum to zero */ - auto vec_sum_0 = svdup_n_f32(0.f); - auto vec_sum_1 = svdup_n_f32(0.f); - auto vec_sum_2 = svdup_n_f32(0.f); - auto vec_sum_3 = svdup_n_f32(0.f); - - /* Loop over row and compute exponentials and sum */ - int x = 0; - svbool_t pg = wrapper::svwhilelt(x, input_width); - svbool_t pg_0 = svunpklo(svunpklo(pg)); - svbool_t pg_1 = svunpkhi(svunpklo(pg)); - svbool_t pg_2 = svunpklo(svunpkhi(pg)); - svbool_t pg_3 = svunpkhi(svunpkhi(pg)); - do - { - auto vec_elements = svld1(pg, in_ptr + x); - vec_elements = svsub_z(pg, vec_max, vec_elements); - - auto vec_elements_flt_0 = svcvt_f32_z(pg_0, svunpklo(svunpklo(vec_elements))); - auto vec_elements_flt_1 = svcvt_f32_z(pg_1, svunpkhi(svunpklo(vec_elements))); - auto vec_elements_flt_2 = svcvt_f32_z(pg_2, svunpklo(svunpkhi(vec_elements))); - auto vec_elements_flt_3 = svcvt_f32_z(pg_3, svunpkhi(svunpkhi(vec_elements))); - - if(is_log) - { - vec_elements_flt_0 = svmul_f32_z(pg_0, vec_elements_flt_0, scale_beta_vec); - vec_elements_flt_1 = svmul_f32_z(pg_1, vec_elements_flt_1, scale_beta_vec); - vec_elements_flt_2 = svmul_f32_z(pg_2, vec_elements_flt_2, scale_beta_vec); - vec_elements_flt_3 = svmul_f32_z(pg_3, vec_elements_flt_3, scale_beta_vec); - vec_sum_0 = svadd_f32_m(pg_0, vec_sum_0, svexp_f32_z(pg_0, vec_elements_flt_0)); - vec_sum_1 = svadd_f32_m(pg_1, vec_sum_1, svexp_f32_z(pg_1, vec_elements_flt_1)); - vec_sum_2 = svadd_f32_m(pg_2, vec_sum_2, svexp_f32_z(pg_2, vec_elements_flt_2)); - vec_sum_3 = svadd_f32_m(pg_3, vec_sum_3, svexp_f32_z(pg_3, vec_elements_flt_3)); - } - else - { - vec_elements_flt_0 = svexp_f32_z(pg_0, svmul_f32_z(pg_0, vec_elements_flt_0, scale_beta_vec)); - vec_elements_flt_1 = svexp_f32_z(pg_1, svmul_f32_z(pg_1, vec_elements_flt_1, scale_beta_vec)); - vec_elements_flt_2 = svexp_f32_z(pg_2, svmul_f32_z(pg_2, vec_elements_flt_2, scale_beta_vec)); - vec_elements_flt_3 = svexp_f32_z(pg_3, svmul_f32_z(pg_3, vec_elements_flt_3, scale_beta_vec)); - vec_sum_0 = svadd_f32_m(pg_0, vec_sum_0, vec_elements_flt_0); - vec_sum_1 = svadd_f32_m(pg_1, vec_sum_1, vec_elements_flt_1); - vec_sum_2 = svadd_f32_m(pg_2, vec_sum_2, vec_elements_flt_2); - vec_sum_3 = svadd_f32_m(pg_3, vec_sum_3, vec_elements_flt_3); - } - - svst1_f32(pg_0, tmp_ptr + x, vec_elements_flt_0); - svst1_f32(pg_1, tmp_ptr + x + inc_1, vec_elements_flt_1); - svst1_f32(pg_2, tmp_ptr + x + inc_2, vec_elements_flt_2); - svst1_f32(pg_3, tmp_ptr + x + inc_3, vec_elements_flt_3); - - x += wrapper::svcnt(); - pg = wrapper::svwhilelt(x, input_width); - pg_0 = svunpklo(svunpklo(pg)); - pg_1 = svunpkhi(svunpklo(pg)); - pg_2 = svunpklo(svunpkhi(pg)); - pg_3 = svunpkhi(svunpkhi(pg)); - } - while(svptest_any(all_true_pg, pg)); - - /* Reduce sum */ - const auto vec_sum = svadd_f32_z(all_true_pg, svadd_f32_z(all_true_pg, vec_sum_0, vec_sum_1), svadd_f32_z(all_true_pg, vec_sum_2, vec_sum_3)); - sum = svaddv_f32(all_true_pg, vec_sum); - - /* Run remaining elements */ - x = 0; - if(is_log) - { - sum = std::log(sum); - } - else - { - sum = 256.f / sum; - } - } - - /* Normalize exponentials */ - { - constexpr bool is_qasymm8_signed = std::is_same::value; - /* Loop over row and compute 
softmax */ - int x = 0; - svbool_t pg = wrapper::svwhilelt(x, input_width); - svbool_t pg_0 = svunpklo(svunpklo(pg)); - svbool_t pg_1 = svunpkhi(svunpklo(pg)); - svbool_t pg_2 = svunpklo(svunpkhi(pg)); - svbool_t pg_3 = svunpkhi(svunpkhi(pg)); - do - { - auto vec_in_0 = svld1_f32(pg_0, tmp_ptr + x); - auto vec_in_1 = svld1_f32(pg_1, tmp_ptr + x + inc_1); - auto vec_in_2 = svld1_f32(pg_2, tmp_ptr + x + inc_2); - auto vec_in_3 = svld1_f32(pg_3, tmp_ptr + x + inc_3); - - svfloat32_t res_0{}; - svfloat32_t res_1{}; - svfloat32_t res_2{}; - svfloat32_t res_3{}; - - if(is_log) - { - res_0 = svsub_f32_z(pg_0, vec_in_0, svdup_n_f32(sum)); - res_1 = svsub_f32_z(pg_1, vec_in_1, svdup_n_f32(sum)); - res_2 = svsub_f32_z(pg_2, vec_in_2, svdup_n_f32(sum)); - res_3 = svsub_f32_z(pg_3, vec_in_3, svdup_n_f32(sum)); - } - else - { - res_0 = svmul_f32_z(pg_0, vec_in_0, svdup_n_f32(sum)); - res_1 = svmul_f32_z(pg_1, vec_in_1, svdup_n_f32(sum)); - res_2 = svmul_f32_z(pg_2, vec_in_2, svdup_n_f32(sum)); - res_3 = svmul_f32_z(pg_3, vec_in_3, svdup_n_f32(sum)); - - if(is_qasymm8_signed) - { - const auto offset_vec = svdup_n_f32(128.f); - res_0 = svsub_z(pg_0, vec_in_0, offset_vec); - res_1 = svsub_z(pg_1, vec_in_1, offset_vec); - res_2 = svsub_z(pg_2, vec_in_2, offset_vec); - res_3 = svsub_z(pg_3, vec_in_3, offset_vec); - } - } - - // Store value - const auto out = convert_float_to_int(res_0, res_1, res_2, res_3); - svst1(pg, out_ptr + x, out); - x += wrapper::svcnt(); - pg = wrapper::svwhilelt(x, input_width); - pg_0 = svunpklo(svunpklo(pg)); - pg_1 = svunpkhi(svunpklo(pg)); - pg_2 = svunpklo(svunpkhi(pg)); - pg_3 = svunpkhi(svunpkhi(pg)); - } - while(svptest_any(all_true_pg, pg)); - } - }, - in_it, max_it, out_it); -} -#endif /* defined(__ARM_FEATURE_SVE2) */ - -template -void sve_softmax_logits_1d_float(const ITensor *in, const ITensor *max, void *const tmp, - ITensor *out, const float beta, bool is_log, const Window &window) -{ - const int start_x = in->info()->valid_region().anchor.x(); - const int input_width = in->info()->valid_region().shape.x(); - - Iterator in_it(in, window); - Iterator max_it(max, window); - Iterator out_it(out, window); - - const auto all_true_pg = wrapper::svptrue(); - - execute_window_loop(window, [&](const Coordinates &) - { - /* Get pointers */ - const auto in_ptr = reinterpret_cast(in_it.ptr()) + start_x; - const auto out_ptr = reinterpret_cast(out_it.ptr()) + start_x; - const auto tmp_ptr = reinterpret_cast(tmp); - - ScalarType sum{ 0 }; - - /* Compute exponentials and sum */ - { - /* Get max value */ - const auto max_val = *reinterpret_cast(max_it.ptr()); - const auto vec_max = wrapper::svdup_n(max_val); - - /* Init sum to zero */ - auto vec_sum = wrapper::svdup_n(static_cast(0)); - - /* Loop over row and compute exponentials and sum */ - int x = 0; - svbool_t pg = wrapper::svwhilelt(x, input_width); - do - { - auto vec_elements = svld1(pg, in_ptr + x); - vec_elements = svsub_z(pg, vec_elements, vec_max); - if(is_log) - { - vec_elements = svmul_z(pg, vec_elements, wrapper::svdup_n(static_cast(beta))); - vec_sum = svadd_m(pg, vec_sum, wrapper::svexp_z(pg, vec_elements)); - } - else - { - vec_elements = wrapper::svexp_z(pg, svmul_z(pg, vec_elements, wrapper::svdup_n(static_cast(beta)))); - vec_sum = svadd_m(pg, vec_sum, vec_elements); - } - svst1(pg, tmp_ptr + x, vec_elements); - - x += wrapper::svcnt(); - pg = wrapper::svwhilelt(x, input_width); - } - while(svptest_any(all_true_pg, pg)); - - /* Reduce sum */ - sum = svaddv(all_true_pg, vec_sum); - - if(is_log) - { - sum = 
static_cast(std::log(sum)); - } - else - { - sum = ScalarType(1) / sum; - } - } - - /* Normalize exponentials */ - { - /* Loop over row and compute softmax */ - int x = 0; - svbool_t pg = wrapper::svwhilelt(x, input_width); - do - { - auto vec_in = svld1(pg, tmp_ptr + x); - auto normalized_value = wrapper::svdup_n(static_cast(0)); - if(is_log) - { - normalized_value = svsub_z(pg, vec_in, wrapper::svdup_n(static_cast(sum))); - } - else - { - normalized_value = svmul_z(pg, vec_in, wrapper::svdup_n(static_cast(sum))); - } - svst1(pg, out_ptr + x, normalized_value); - - x += wrapper::svcnt(); - pg = wrapper::svwhilelt(x, input_width); - } - while(svptest_any(all_true_pg, pg)); - } - }, - in_it, max_it, out_it); -} - -} // namespace cpu -} // namespace arm_compute -#endif /* defined(__ARM_FEATURE_SVE) */ - -#endif /* SRC_CORE_SVE_KERNELS_SOFTMAX_LIST_H */ diff --git a/src/core/cpu/kernels/softmax/impl/neon/list.h b/src/core/cpu/kernels/softmax/impl/neon/list.h new file mode 100644 index 0000000000..5ebee31272 --- /dev/null +++ b/src/core/cpu/kernels/softmax/impl/neon/list.h @@ -0,0 +1,388 @@ +/* + * Copyright (c) 2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef SRC_CORE_NEON_KERNELS_SOFTMAX_LIST_H +#define SRC_CORE_NEON_KERNELS_SOFTMAX_LIST_H + +#include "src/core/NEON/NEFixedPoint.h" +#include "src/core/NEON/NEMath.h" +#include "src/core/NEON/wrapper/wrapper.h" +#include "support/SaturateCast.h" + +namespace arm_compute +{ +namespace cpu +{ +template +void neon_logits_1d_max(const ITensor *in, ITensor *out, const Window &window) +{ + /** SIMD vector tag type. 
*/ + using ExactTagType = typename wrapper::traits::neon_bitvector_tag_t; + + constexpr int window_step_x = 16 / sizeof(T); + const auto window_start_x = static_cast(window.x().start()); + const auto window_end_x = static_cast(window.x().end()); + + Window win{ window }; + win.set(Window::DimX, Window::Dimension(0, 1, 1)); + Iterator input(in, win); + Iterator output(out, win); + + const int sum_stages = log2(window_step_x / 2); + execute_window_loop(win, [&](const Coordinates &) + { + // Get pointers + const auto in_ptr = reinterpret_cast(input.ptr()); + const auto out_ptr = reinterpret_cast(output.ptr()); + + // Init max value + auto vec_max = wrapper::vdup_n(support::cpp11::lowest(), ExactTagType{}); + int x = window_start_x; + + for(; x <= (window_end_x - window_step_x); x += window_step_x) + { + const auto current_value = wrapper::vloadq(in_ptr + x); + vec_max = wrapper::vmax(vec_max, current_value); + } + auto carry_max = wrapper::vpmax(wrapper::vgethigh(vec_max), wrapper::vgetlow(vec_max)); + + for(int i = 0; i < sum_stages; ++i) + { + carry_max = wrapper::vpmax(carry_max, carry_max); + } + T max_val = wrapper::vgetlane(carry_max, 0); + + // Compute left-over elements + for(; x < window_end_x; ++x) + { + max_val = *(in_ptr + x) > max_val ? *(in_ptr + x) : max_val; + } + + *out_ptr = max_val; + }, + input, output); +} + +template +void neon_softmax_logits_1d_quantized(const ITensor *in, const ITensor *max, void *const tmp, + ITensor *out, float beta, bool is_log, const Window &window) +{ + static_assert(std::is_same::value + || std::is_same::value, + "quantized type should be either qasymm8_t or qasymm8_signed_t."); + + const int start_x = in->info()->valid_region().anchor.x(); + const int input_width = in->info()->valid_region().shape.x(); + + const float scale_beta = -beta * in->info()->quantization_info().uniform().scale; + const auto scale_beta_vec = vdupq_n_f32(scale_beta); + + Iterator in_it(in, window); + Iterator max_it(max, window); + Iterator out_it(out, window); + constexpr int vec_size = 16; + + execute_window_loop(window, [&](const Coordinates &) + { + /* Get pointers */ + const auto in_ptr = reinterpret_cast(in_it.ptr()) + start_x; + const auto out_ptr = reinterpret_cast(out_it.ptr()) + start_x; + const auto tmp_ptr = reinterpret_cast(tmp); + + float sum{}; + float sum_inversed{}; + + /* Compute exponentials and sum */ + { + /* Get max value */ + const auto max_val = *reinterpret_cast(max_it.ptr()); + const auto vec_max = wrapper::vdup_n(max_val, wrapper::traits::vector_128_tag{}); + + /* Init sum to zero */ + float32x4x4_t vec_sum = + { + vdupq_n_f32(0.f), + vdupq_n_f32(0.f), + vdupq_n_f32(0.f), + vdupq_n_f32(0.f), + }; + + /* Loop over row and compute exponentials and sum */ + int x = 0; + for(; x <= (input_width - vec_size); x += vec_size) + { + auto vec_elements = wrapper::vloadq(in_ptr + x); + vec_elements = wrapper::vqsub(vec_max, vec_elements); + auto vec_elements_flt = convert_int_to_float(vec_elements); + + if(is_log) + { + vec_elements_flt.val[0] = vmulq_f32(vec_elements_flt.val[0], scale_beta_vec); + vec_elements_flt.val[1] = vmulq_f32(vec_elements_flt.val[1], scale_beta_vec); + vec_elements_flt.val[2] = vmulq_f32(vec_elements_flt.val[2], scale_beta_vec); + vec_elements_flt.val[3] = vmulq_f32(vec_elements_flt.val[3], scale_beta_vec); + vec_sum.val[0] = vaddq_f32(vec_sum.val[0], vexpq_f32(vec_elements_flt.val[0])); + vec_sum.val[1] = vaddq_f32(vec_sum.val[1], vexpq_f32(vec_elements_flt.val[1])); + vec_sum.val[2] = vaddq_f32(vec_sum.val[2], 
vexpq_f32(vec_elements_flt.val[2])); + vec_sum.val[3] = vaddq_f32(vec_sum.val[3], vexpq_f32(vec_elements_flt.val[3])); + } + else + { + vec_elements_flt.val[0] = vexpq_f32(vmulq_f32(vec_elements_flt.val[0], scale_beta_vec)); + vec_elements_flt.val[1] = vexpq_f32(vmulq_f32(vec_elements_flt.val[1], scale_beta_vec)); + vec_elements_flt.val[2] = vexpq_f32(vmulq_f32(vec_elements_flt.val[2], scale_beta_vec)); + vec_elements_flt.val[3] = vexpq_f32(vmulq_f32(vec_elements_flt.val[3], scale_beta_vec)); + vec_sum.val[0] = vaddq_f32(vec_sum.val[0], vec_elements_flt.val[0]); + vec_sum.val[1] = vaddq_f32(vec_sum.val[1], vec_elements_flt.val[1]); + vec_sum.val[2] = vaddq_f32(vec_sum.val[2], vec_elements_flt.val[2]); + vec_sum.val[3] = vaddq_f32(vec_sum.val[3], vec_elements_flt.val[3]); + } + + vst4q_f32(tmp_ptr + x, vec_elements_flt); + } + + /* Reduce sum */ + const auto sum_16_byte = vaddq_f32(vaddq_f32(vec_sum.val[0], vec_sum.val[1]), vaddq_f32(vec_sum.val[2], vec_sum.val[3])); + auto sum_res = vpadd_f32(vget_high_f32(sum_16_byte), vget_low_f32(sum_16_byte)); + sum_res = vpadd_f32(sum_res, sum_res); + sum = wrapper::vgetlane(sum_res, 0); + + /* Run remaining elements */ + for(; x < input_width; ++x) + { + float element{}; + if(is_log) + { + element = (max_val - in_ptr[x]) * scale_beta; + sum += std::exp(element); + } + else + { + element = std::exp((max_val - in_ptr[x]) * scale_beta); + sum += element; + } + + tmp_ptr[x] = element; + } + + if(!is_log) + { + sum_inversed = 256.f / sum; + } + else + { + sum = std::log(sum); + } + } + + /* Normalize exponentials */ + { + constexpr bool is_qasymm8_signed = std::is_same::value; + /* Loop over row and compute softmax */ + int x = 0; + for(; x <= (input_width - vec_size); x += vec_size) + { + using int_vec_type = wrapper::traits::neon_vector_t; + float32x4x4_t vec_in = vld4q_f32(tmp_ptr + x); + int_vec_type normalized_value{}; + if(is_log) + { + const float32x4x4_t sub = + { + vsubq_f32(vec_in.val[0], vdupq_n_f32(sum)), + vsubq_f32(vec_in.val[1], vdupq_n_f32(sum)), + vsubq_f32(vec_in.val[2], vdupq_n_f32(sum)), + vsubq_f32(vec_in.val[3], vdupq_n_f32(sum)), + }; + normalized_value = convert_float_to_int(sub); + } + else + { + float32x4x4_t mul = + { + vmulq_f32(vec_in.val[0], vdupq_n_f32(sum_inversed)), + vmulq_f32(vec_in.val[1], vdupq_n_f32(sum_inversed)), + vmulq_f32(vec_in.val[2], vdupq_n_f32(sum_inversed)), + vmulq_f32(vec_in.val[3], vdupq_n_f32(sum_inversed)), + }; + + if(is_qasymm8_signed) + { + const auto offset_vec = wrapper::vdup_n(128.f, wrapper::traits::vector_128_tag{}); + mul.val[0] = wrapper::vsub(mul.val[0], offset_vec); + mul.val[1] = wrapper::vsub(mul.val[1], offset_vec); + mul.val[2] = wrapper::vsub(mul.val[2], offset_vec); + mul.val[3] = wrapper::vsub(mul.val[3], offset_vec); + } + + normalized_value = convert_float_to_int(mul); + } + wrapper::vstore(out_ptr + x, normalized_value); + } + /* Run remaining elements */ + for(; x < input_width; ++x) + { + if(is_log) + { + out_ptr[x] = utils::cast::saturate_cast(tmp_ptr[x] - sum); + } + else + { + out_ptr[x] = utils::cast::saturate_cast((tmp_ptr[x] * sum_inversed) - (is_qasymm8_signed ? 
128.f : 0)); + } + } + } + }, + in_it, max_it, out_it); +} + +template +void neon_softmax_logits_1d_float(const ITensor *in, const ITensor *max, void *const tmp, + ITensor *out, const float beta, bool is_log, const Window &window) +{ + const int start_x = in->info()->valid_region().anchor.x(); + const int input_width = in->info()->valid_region().shape.x(); + + Iterator in_it(in, window); + Iterator max_it(max, window); + Iterator out_it(out, window); + + /** SIMD vector tag type. */ + using ExactTagType = typename wrapper::traits::neon_bitvector_tag_t; + + constexpr int vec_size = 16 / sizeof(T); + const int sum_stages = log2(vec_size / 2); + + execute_window_loop(window, [&](const Coordinates &) + { + /* Get pointers */ + const auto in_ptr = reinterpret_cast(in_it.ptr()) + start_x; + const auto out_ptr = reinterpret_cast(out_it.ptr()) + start_x; + const auto tmp_ptr = reinterpret_cast(tmp); + + T sum{}; + T sum_inversed{}; + + /* Compute exponentials and sum */ + { + /* Get max value */ + const auto max_val = *reinterpret_cast(max_it.ptr()); + const auto vec_max = wrapper::vdup_n(max_val, ExactTagType{}); + + /* Init sum to zero */ + auto vec_sum = wrapper::vdup_n(static_cast(0), ExactTagType{}); + + /* Loop over row and compute exponentials and sum */ + int x = 0; + for(; x <= (input_width - vec_size); x += vec_size) + { + auto vec_elements = wrapper::vloadq(in_ptr + x); + vec_elements = wrapper::vsub(vec_elements, vec_max); + if(is_log) + { + vec_elements = wrapper::vmul(vec_elements, wrapper::vdup_n(static_cast(beta), ExactTagType{})); + vec_sum = wrapper::vadd(vec_sum, wrapper::vexpq(vec_elements)); + } + else + { + vec_elements = wrapper::vexpq(wrapper::vmul(vec_elements, wrapper::vdup_n(static_cast(beta), ExactTagType{}))); + vec_sum = wrapper::vadd(vec_sum, vec_elements); + } + wrapper::vstore(tmp_ptr + x, vec_elements); + } + + /* Reduce sum */ + auto sum_res = wrapper::vpadd(wrapper::vgethigh(vec_sum), wrapper::vgetlow(vec_sum)); + for(int i = 0; i < sum_stages; ++i) + { + sum_res = wrapper::vpadd(sum_res, sum_res); + } + sum = wrapper::vgetlane(sum_res, 0); + + /* Run remaining elements */ + for(; x < input_width; ++x) + { + T element{}; + + if(is_log) + { + element = (in_ptr[x] - max_val) * beta; + sum += std::exp(element); + } + else + { + element = std::exp((in_ptr[x] - max_val) * beta); + sum += element; + } + tmp_ptr[x] = element; + } + + if(!is_log) + { + sum_inversed = T(1) / sum; + } + else + { + sum = static_cast(std::log(sum)); + } + } + + /* Normalize exponentials */ + { + /* Loop over row and compute softmax */ + int x = 0; + for(; x <= (input_width - vec_size); x += vec_size) + { + auto vec_in = wrapper::vloadq(tmp_ptr + x); + auto normalized_value = wrapper::vdup_n(static_cast(0), ExactTagType{}); + if(is_log) + { + normalized_value = wrapper::vsub(vec_in, wrapper::vdup_n(static_cast(sum), ExactTagType{})); + } + else + { + normalized_value = wrapper::vmul(vec_in, wrapper::vdup_n(static_cast(sum_inversed), ExactTagType{})); + } + wrapper::vstore(out_ptr + x, normalized_value); + } + /* Run remaining elements */ + for(; x < input_width; ++x) + { + if(is_log) + { + out_ptr[x] = tmp_ptr[x] - sum; + } + else + { + out_ptr[x] = tmp_ptr[x] * sum_inversed; + } + } + } + }, + in_it, max_it, out_it); +} + +} // namespace cpu +} // namespace arm_compute + +#endif /* SRC_CORE_NEON_KERNELS_SOFTMAX_LIST_H */ diff --git a/src/core/cpu/kernels/softmax/impl/sve/impl.cpp b/src/core/cpu/kernels/softmax/impl/sve/impl.cpp new file mode 100644 index 0000000000..4ed5a4fbea --- /dev/null 
+++ b/src/core/cpu/kernels/softmax/impl/sve/impl.cpp @@ -0,0 +1,185 @@ +/* + * Copyright (c) 2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#if defined(ENABLE_SVE) +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/ITensor.h" +#include "arm_compute/core/Types.h" +#include "arm_compute/core/utils/misc/Traits.h" +#include "arm_compute/core/utils/misc/Traits.h" +#include "src/core/NEON/SVEMath.h" +#include "src/core/NEON/wrapper/intrinsics/intrinsics.h" +#include + +namespace arm_compute +{ +namespace cpu +{ +template +void sve_logits_1d_max(const ITensor *in, ITensor *out, const Window &window) +{ + const auto all_true_pg = wrapper::svptrue(); + const auto window_start_x = static_cast(window.x().start()); + const auto window_end_x = static_cast(window.x().end()); + + Window win{ window }; + win.set(Window::DimX, Window::Dimension(0, 1, 1)); + Iterator input(in, win); + Iterator output(out, win); + + execute_window_loop(win, [&](const Coordinates &) + { + // Get pointers + const auto in_ptr = reinterpret_cast(input.ptr()); + const auto out_ptr = reinterpret_cast(output.ptr()); + + // Init max value + auto vec_max = wrapper::svdup_n(support::cpp11::lowest()); + + int x = window_start_x; + svbool_t pg = wrapper::svwhilelt(x, window_end_x); + do + { + const auto current_value = svld1(pg, in_ptr + x); + vec_max = svmax_m(pg, vec_max, current_value); + + x += wrapper::svcnt(); + pg = wrapper::svwhilelt(x, window_end_x); + } + while(svptest_any(all_true_pg, pg)); + + auto max_val = svmaxv(all_true_pg, vec_max); + + *out_ptr = max_val; + }, + input, output); +} + +template +void sve_softmax_logits_1d_float(const ITensor *in, const ITensor *max, void *const tmp, + ITensor *out, const float beta, bool is_log, const Window &window) +{ + const int start_x = in->info()->valid_region().anchor.x(); + const int input_width = in->info()->valid_region().shape.x(); + + Iterator in_it(in, window); + Iterator max_it(max, window); + Iterator out_it(out, window); + + const auto all_true_pg = wrapper::svptrue(); + + execute_window_loop(window, [&](const Coordinates &) + { + /* Get pointers */ + const auto in_ptr = reinterpret_cast(in_it.ptr()) + start_x; + const auto out_ptr = reinterpret_cast(out_it.ptr()) + start_x; + const auto tmp_ptr = reinterpret_cast(tmp); + + ScalarType sum{ 0 }; + + /* Compute exponentials and sum */ + { + /* Get max value */ + const auto max_val = *reinterpret_cast(max_it.ptr()); + const 
auto vec_max = wrapper::svdup_n(max_val); + + /* Init sum to zero */ + auto vec_sum = wrapper::svdup_n(static_cast(0)); + + /* Loop over row and compute exponentials and sum */ + int x = 0; + svbool_t pg = wrapper::svwhilelt(x, input_width); + do + { + auto vec_elements = svld1(pg, in_ptr + x); + vec_elements = svsub_z(pg, vec_elements, vec_max); + if(is_log) + { + vec_elements = svmul_z(pg, vec_elements, wrapper::svdup_n(static_cast(beta))); + vec_sum = svadd_m(pg, vec_sum, wrapper::svexp_z(pg, vec_elements)); + } + else + { + vec_elements = wrapper::svexp_z(pg, svmul_z(pg, vec_elements, wrapper::svdup_n(static_cast(beta)))); + vec_sum = svadd_m(pg, vec_sum, vec_elements); + } + svst1(pg, tmp_ptr + x, vec_elements); + + x += wrapper::svcnt(); + pg = wrapper::svwhilelt(x, input_width); + } + while(svptest_any(all_true_pg, pg)); + + /* Reduce sum */ + sum = svaddv(all_true_pg, vec_sum); + + if(is_log) + { + sum = static_cast(std::log(sum)); + } + else + { + sum = ScalarType(1) / sum; + } + } + + /* Normalize exponentials */ + { + /* Loop over row and compute softmax */ + int x = 0; + svbool_t pg = wrapper::svwhilelt(x, input_width); + do + { + auto vec_in = svld1(pg, tmp_ptr + x); + auto normalized_value = wrapper::svdup_n(static_cast(0)); + if(is_log) + { + normalized_value = svsub_z(pg, vec_in, wrapper::svdup_n(static_cast(sum))); + } + else + { + normalized_value = svmul_z(pg, vec_in, wrapper::svdup_n(static_cast(sum))); + } + svst1(pg, out_ptr + x, normalized_value); + + x += wrapper::svcnt(); + pg = wrapper::svwhilelt(x, input_width); + } + while(svptest_any(all_true_pg, pg)); + } + }, + in_it, max_it, out_it); +} + +template void sve_logits_1d_max(const ITensor *in, ITensor *out, const Window &window); +template void sve_logits_1d_max(const ITensor *in, ITensor *out, const Window &window); +template void sve_logits_1d_max(const ITensor *in, ITensor *out, const Window &window); +template void sve_logits_1d_max(const ITensor *in, ITensor *out, const Window &window); + +template void sve_softmax_logits_1d_float(const ITensor *in, const ITensor *max, void *const tmp, + ITensor *out, const float beta, bool is_log, const Window &window); +template void sve_softmax_logits_1d_float(const ITensor *in, const ITensor *max, void *const tmp, + ITensor *out, const float beta, bool is_log, const Window &window); +} // namespace cpu +} // namespace arm_compute +#endif /* defined(ENABLE_SVE) */ diff --git a/src/core/cpu/kernels/softmax/impl/sve/list.h b/src/core/cpu/kernels/softmax/impl/sve/list.h new file mode 100644 index 0000000000..7ddb358b8e --- /dev/null +++ b/src/core/cpu/kernels/softmax/impl/sve/list.h @@ -0,0 +1,223 @@ +/* + * Copyright (c) 2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef SRC_CORE_SVE_KERNELS_SOFTMAX_LIST_H +#define SRC_CORE_SVE_KERNELS_SOFTMAX_LIST_H + +#if defined(ENABLE_SVE) +#include "arm_compute/core/Types.h" +#include "arm_compute/core/utils/misc/Traits.h" +#include "src/core/NEON/SVEMath.h" +#include "src/core/NEON/wrapper/intrinsics/intrinsics.h" +#include + +namespace arm_compute +{ +namespace cpu +{ +template +void sve_logits_1d_max(const ITensor *in, ITensor *out, const Window &window); + +template +void sve_softmax_logits_1d_float(const ITensor *in, const ITensor *max, void *const tmp, + ITensor *out, const float beta, bool is_log, const Window &window); + +#if defined(__ARM_FEATURE_SVE2) +template +void sve_softmax_logits_1d_quantized(const ITensor *in, const ITensor *max, void *const tmp, + ITensor *out, float beta, bool is_log, const Window &window) +{ + const int start_x = in->info()->valid_region().anchor.x(); + const int input_width = in->info()->valid_region().shape.x(); + + const float scale_beta = -beta * in->info()->quantization_info().uniform().scale; + const auto scale_beta_vec = svdup_n_f32(scale_beta); + + Iterator in_it(in, window); + Iterator max_it(max, window); + Iterator out_it(out, window); + const auto all_true_pg = wrapper::svptrue(); + using SVEType = typename wrapper::traits::sve_vector::type; + + const int inc_1 = static_cast(svcntw()); + const int inc_2 = static_cast(2 * svcntw()); + const int inc_3 = static_cast(3 * svcntw()); + + execute_window_loop(window, [&](const Coordinates &) + { + /* Get pointers */ + const auto in_ptr = reinterpret_cast(in_it.ptr()) + start_x; + const auto out_ptr = reinterpret_cast(out_it.ptr()) + start_x; + const auto tmp_ptr = reinterpret_cast(tmp); + + float sum{}; + + /* Compute exponentials and sum */ + { + /* Get max value */ + const auto max_val = *reinterpret_cast(max_it.ptr()); + const auto vec_max = wrapper::svdup_n(max_val); + + /* Init sum to zero */ + auto vec_sum_0 = svdup_n_f32(0.f); + auto vec_sum_1 = svdup_n_f32(0.f); + auto vec_sum_2 = svdup_n_f32(0.f); + auto vec_sum_3 = svdup_n_f32(0.f); + + /* Loop over row and compute exponentials and sum */ + int x = 0; + svbool_t pg = wrapper::svwhilelt(x, input_width); + svbool_t pg_0 = svunpklo(svunpklo(pg)); + svbool_t pg_1 = svunpkhi(svunpklo(pg)); + svbool_t pg_2 = svunpklo(svunpkhi(pg)); + svbool_t pg_3 = svunpkhi(svunpkhi(pg)); + do + { + auto vec_elements = svld1(pg, in_ptr + x); + vec_elements = svsub_z(pg, vec_max, vec_elements); + + auto vec_elements_flt_0 = svcvt_f32_z(pg_0, svunpklo(svunpklo(vec_elements))); + auto vec_elements_flt_1 = svcvt_f32_z(pg_1, svunpkhi(svunpklo(vec_elements))); + auto vec_elements_flt_2 = svcvt_f32_z(pg_2, svunpklo(svunpkhi(vec_elements))); + auto vec_elements_flt_3 = svcvt_f32_z(pg_3, svunpkhi(svunpkhi(vec_elements))); + + if(is_log) + { + vec_elements_flt_0 = svmul_f32_z(pg_0, vec_elements_flt_0, scale_beta_vec); + vec_elements_flt_1 = svmul_f32_z(pg_1, vec_elements_flt_1, scale_beta_vec); + vec_elements_flt_2 = svmul_f32_z(pg_2, vec_elements_flt_2, scale_beta_vec); + vec_elements_flt_3 = 
svmul_f32_z(pg_3, vec_elements_flt_3, scale_beta_vec); + vec_sum_0 = svadd_f32_m(pg_0, vec_sum_0, svexp_f32_z(pg_0, vec_elements_flt_0)); + vec_sum_1 = svadd_f32_m(pg_1, vec_sum_1, svexp_f32_z(pg_1, vec_elements_flt_1)); + vec_sum_2 = svadd_f32_m(pg_2, vec_sum_2, svexp_f32_z(pg_2, vec_elements_flt_2)); + vec_sum_3 = svadd_f32_m(pg_3, vec_sum_3, svexp_f32_z(pg_3, vec_elements_flt_3)); + } + else + { + vec_elements_flt_0 = svexp_f32_z(pg_0, svmul_f32_z(pg_0, vec_elements_flt_0, scale_beta_vec)); + vec_elements_flt_1 = svexp_f32_z(pg_1, svmul_f32_z(pg_1, vec_elements_flt_1, scale_beta_vec)); + vec_elements_flt_2 = svexp_f32_z(pg_2, svmul_f32_z(pg_2, vec_elements_flt_2, scale_beta_vec)); + vec_elements_flt_3 = svexp_f32_z(pg_3, svmul_f32_z(pg_3, vec_elements_flt_3, scale_beta_vec)); + vec_sum_0 = svadd_f32_m(pg_0, vec_sum_0, vec_elements_flt_0); + vec_sum_1 = svadd_f32_m(pg_1, vec_sum_1, vec_elements_flt_1); + vec_sum_2 = svadd_f32_m(pg_2, vec_sum_2, vec_elements_flt_2); + vec_sum_3 = svadd_f32_m(pg_3, vec_sum_3, vec_elements_flt_3); + } + + svst1_f32(pg_0, tmp_ptr + x, vec_elements_flt_0); + svst1_f32(pg_1, tmp_ptr + x + inc_1, vec_elements_flt_1); + svst1_f32(pg_2, tmp_ptr + x + inc_2, vec_elements_flt_2); + svst1_f32(pg_3, tmp_ptr + x + inc_3, vec_elements_flt_3); + + x += wrapper::svcnt(); + pg = wrapper::svwhilelt(x, input_width); + pg_0 = svunpklo(svunpklo(pg)); + pg_1 = svunpkhi(svunpklo(pg)); + pg_2 = svunpklo(svunpkhi(pg)); + pg_3 = svunpkhi(svunpkhi(pg)); + } + while(svptest_any(all_true_pg, pg)); + + /* Reduce sum */ + const auto vec_sum = svadd_f32_z(all_true_pg, svadd_f32_z(all_true_pg, vec_sum_0, vec_sum_1), svadd_f32_z(all_true_pg, vec_sum_2, vec_sum_3)); + sum = svaddv_f32(all_true_pg, vec_sum); + + /* Run remaining elements */ + x = 0; + if(is_log) + { + sum = std::log(sum); + } + else + { + sum = 256.f / sum; + } + } + + /* Normalize exponentials */ + { + constexpr bool is_qasymm8_signed = std::is_same::value; + /* Loop over row and compute softmax */ + int x = 0; + svbool_t pg = wrapper::svwhilelt(x, input_width); + svbool_t pg_0 = svunpklo(svunpklo(pg)); + svbool_t pg_1 = svunpkhi(svunpklo(pg)); + svbool_t pg_2 = svunpklo(svunpkhi(pg)); + svbool_t pg_3 = svunpkhi(svunpkhi(pg)); + do + { + auto vec_in_0 = svld1_f32(pg_0, tmp_ptr + x); + auto vec_in_1 = svld1_f32(pg_1, tmp_ptr + x + inc_1); + auto vec_in_2 = svld1_f32(pg_2, tmp_ptr + x + inc_2); + auto vec_in_3 = svld1_f32(pg_3, tmp_ptr + x + inc_3); + + svfloat32_t res_0{}; + svfloat32_t res_1{}; + svfloat32_t res_2{}; + svfloat32_t res_3{}; + + if(is_log) + { + res_0 = svsub_f32_z(pg_0, vec_in_0, svdup_n_f32(sum)); + res_1 = svsub_f32_z(pg_1, vec_in_1, svdup_n_f32(sum)); + res_2 = svsub_f32_z(pg_2, vec_in_2, svdup_n_f32(sum)); + res_3 = svsub_f32_z(pg_3, vec_in_3, svdup_n_f32(sum)); + } + else + { + res_0 = svmul_f32_z(pg_0, vec_in_0, svdup_n_f32(sum)); + res_1 = svmul_f32_z(pg_1, vec_in_1, svdup_n_f32(sum)); + res_2 = svmul_f32_z(pg_2, vec_in_2, svdup_n_f32(sum)); + res_3 = svmul_f32_z(pg_3, vec_in_3, svdup_n_f32(sum)); + + if(is_qasymm8_signed) + { + const auto offset_vec = svdup_n_f32(128.f); + res_0 = svsub_z(pg_0, vec_in_0, offset_vec); + res_1 = svsub_z(pg_1, vec_in_1, offset_vec); + res_2 = svsub_z(pg_2, vec_in_2, offset_vec); + res_3 = svsub_z(pg_3, vec_in_3, offset_vec); + } + } + + // Store value + const auto out = convert_float_to_int(res_0, res_1, res_2, res_3); + svst1(pg, out_ptr + x, out); + x += wrapper::svcnt(); + pg = wrapper::svwhilelt(x, input_width); + pg_0 = svunpklo(svunpklo(pg)); + pg_1 = 
svunpkhi(svunpklo(pg)); + pg_2 = svunpklo(svunpkhi(pg)); + pg_3 = svunpkhi(svunpkhi(pg)); + } + while(svptest_any(all_true_pg, pg)); + } + }, + in_it, max_it, out_it); +} +#endif /* defined(__ARM_FEATURE_SVE2) */ +} // namespace cpu +} // namespace arm_compute +#endif /* defined(ENABLE_SVE) */ + +#endif /* SRC_CORE_SVE_KERNELS_SOFTMAX_LIST_H */ diff --git a/tests/validation/NEON/ActivationLayer.cpp b/tests/validation/NEON/ActivationLayer.cpp index 577603d07d..111e969bae 100644 --- a/tests/validation/NEON/ActivationLayer.cpp +++ b/tests/validation/NEON/ActivationLayer.cpp @@ -68,11 +68,11 @@ RelativeTolerance relative_tolerance(DataType data_type, ActivationLayerI switch(data_type) { case DataType::F16: -#if defined(__ARM_FEATURE_SVE) +#if defined(ENABLE_SVE) return RelativeTolerance(0.25f); -#else // !defined(__ARM_FEATURE_SVE) +#else // !defined(ENABLE_SVE) return RelativeTolerance(0.1f); -#endif // defined(__ARM_FEATURE_SVE) +#endif // defined(ENABLE_SVE) default: return RelativeTolerance(0.05f); } @@ -80,11 +80,11 @@ RelativeTolerance relative_tolerance(DataType data_type, ActivationLayerI switch(data_type) { case DataType::F16: -#if defined(__ARM_FEATURE_SVE) +#if defined(ENABLE_SVE) return RelativeTolerance(0.9f); -#else // !defined(__ARM_FEATURE_SVE) +#else // !defined(ENABLE_SVE) return RelativeTolerance(0.01f); -#endif // defined(__ARM_FEATURE_SVE) +#endif // defined(ENABLE_SVE) default: return RelativeTolerance(0.00001f); } @@ -111,11 +111,11 @@ AbsoluteTolerance absolute_tolerance(DataType data_type, ActivationLayerI switch(data_type) { case DataType::F16: -#if defined(__ARM_FEATURE_SVE) +#if defined(ENABLE_SVE) return AbsoluteTolerance(0.25f); -#else // !defined(__ARM_FEATURE_SVE) +#else // !defined(ENABLE_SVE) return AbsoluteTolerance(0.01f); -#endif // defined(__ARM_FEATURE_SVE) +#endif // defined(ENABLE_SVE) default: return AbsoluteTolerance(0.00001f); } @@ -123,11 +123,11 @@ AbsoluteTolerance absolute_tolerance(DataType data_type, ActivationLayerI switch(data_type) { case DataType::F16: -#if defined(__ARM_FEATURE_SVE) +#if defined(ENABLE_SVE) return AbsoluteTolerance(0.9f); -#else // !defined(__ARM_FEATURE_SVE) +#else // !defined(ENABLE_SVE) return AbsoluteTolerance(0.01f); -#endif // defined(__ARM_FEATURE_SVE) +#endif // defined(ENABLE_SVE) default: return AbsoluteTolerance(0.00001f); } diff --git a/tests/validation/NEON/ArithmeticAddition.cpp b/tests/validation/NEON/ArithmeticAddition.cpp index 98341805ed..ea6656eefe 100644 --- a/tests/validation/NEON/ArithmeticAddition.cpp +++ b/tests/validation/NEON/ArithmeticAddition.cpp @@ -43,11 +43,11 @@ namespace validation { namespace { -#if !defined(__aarch64__) || defined(__ARM_FEATURE_SVE) +#if !defined(__aarch64__) || defined(ENABLE_SVE) constexpr AbsoluteTolerance tolerance_quant(1); /**< Tolerance value for comparing reference's output against implementation's output for quantized data types */ -#else // !defined(__aarch64__) || defined(__ARM_FEATURE_SVE) +#else // !defined(__aarch64__) || defined(ENABLE_SVE) constexpr AbsoluteTolerance tolerance_quant(0); -#endif // !defined(__aarch64__) || defined(__ARM_FEATURE_SVE) +#endif // !defined(__aarch64__) || defined(ENABLE_SVE) /** Input data sets **/ const auto ArithmeticAdditionU8Dataset = combine(combine(framework::dataset::make("DataType", DataType::U8), framework::dataset::make("DataType", DataType::U8)), framework::dataset::make("DataType", -- cgit v1.2.1
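
The relocated floor/neon kernels in this patch all follow the usual Neon shape: a full-register loop over `step` elements, then a scalar tail with std::floor for whatever is left. Below is a minimal self-contained sketch of that structure; it is not code from the patch. It uses the plain ACLE intrinsic vrndmq_f32 (round towards minus infinity, available on AArch64) rather than the library's vfloorq_f32 helper, and the function name is illustrative only.

#include <arm_neon.h>
#include <cmath>

// Same main-loop-plus-tail structure as fp32_neon_floor in the patch.
void floor_f32(const float *src, float *dst, int len)
{
    constexpr int step = 4; // a 128-bit Neon register holds four floats
    for(; len >= step; len -= step)
    {
        vst1q_f32(dst, vrndmq_f32(vld1q_f32(src))); // floor = round towards -inf
        src += step;
        dst += step;
    }
    for(; len > 0; --len)
    {
        *dst++ = std::floor(*src++); // scalar tail for the leftover elements
    }
}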
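The SVE scale and softmax kernels above never need such a scalar tail: they iterate with a predicate produced by svwhilelt, advance by the hardware vector length (svcntw), and stop when svptest_any reports that no lane is active. A minimal sketch of that loop shape using plain ACLE intrinsics, with an illustrative function that only scales a buffer; it assumes the same ENABLE_SVE guard the patch introduces.

#if defined(ENABLE_SVE)
#include <arm_sve.h>

// Multiply every element by `scale`; the predicate handles the tail,
// exactly as the SVE kernels in this patch do.
void scale_f32_sve(const float *src, float *dst, int len, float scale)
{
    const svfloat32_t vscale = svdup_n_f32(scale);
    int      x  = 0;
    svbool_t pg = svwhilelt_b32(x, len);
    do
    {
        const svfloat32_t v = svld1_f32(pg, src + x);
        svst1_f32(pg, dst + x, svmul_f32_z(pg, v, vscale));
        x += static_cast<int>(svcntw());     // advance by the runtime vector length
        pg = svwhilelt_b32(x, len);
    } while(svptest_any(svptrue_b32(), pg)); // loop while any lane is still active
}
#endif // ENABLE_SVE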
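Both the Neon and SVE softmax kernels moved by this patch implement the same numerically stable recipe per row: subtract the row maximum, scale by beta, exponentiate, accumulate the sum, then divide by the sum, or subtract log(sum) when is_log is set. A plain scalar reference of that recipe can make the vectorized versions easier to read; this is a sketch, not library code.

#include <algorithm>
#include <cmath>
#include <cstddef>
#include <vector>

// Scalar reference for one row; the kernels above vectorize exactly this.
std::vector<float> softmax_row(const std::vector<float> &in, float beta, bool is_log)
{
    float max_val = in[0];
    for(float v : in) { max_val = std::max(max_val, v); }

    std::vector<float> out(in.size());
    float sum = 0.f;
    for(std::size_t i = 0; i < in.size(); ++i)
    {
        const float shifted = (in[i] - max_val) * beta;
        out[i] = is_log ? shifted : std::exp(shifted); // kernels keep this in the tmp buffer
        sum += std::exp(shifted);
    }

    for(float &v : out)
    {
        v = is_log ? v - std::log(sum) : v / sum;      // normalize (or subtract log-sum)
    }
    return out;
}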
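The new softmax/impl/sve/impl.cpp also shows the structural pattern used to make the SVE code packageable: sve/list.h now only declares the templates, while impl.cpp defines them and explicitly instantiates the supported data types, so the header can be included from translation units that are not built with SVE flags. Stripped to its essentials (the names and types below are placeholders, not the library's):

#include <cstdint>

// header: declaration only
template <typename T>
void logits_1d_max(const T *in, T *out, int len);

// source file compiled with the architecture-specific flags: the definition...
template <typename T>
void logits_1d_max(const T *in, T *out, int len)
{
    T m = in[0]; // assumes len > 0
    for(int i = 1; i < len; ++i)
    {
        m = in[i] > m ? in[i] : m;
    }
    *out = m;
}

// ...plus explicit instantiations for every type the runtime may dispatch to.
template void logits_1d_max<float>(const float *, float *, int);
template void logits_1d_max<std::uint8_t>(const std::uint8_t *, std::uint8_t *, int);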
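Finally, the reason every __ARM_FEATURE_SVE guard above becomes ENABLE_SVE: the SVE files can now be compiled, with their own per-file flags, into the same binary as the Neon files, and an implementation is picked at run time. The actual selection goes through Registrars.h and the kernels' dispatch tables, which are outside this excerpt; the sketch below only illustrates the idea, and cpu_has_sve() is a stand-in for a real CPU-feature query (for example HWCAP on Linux).

#include <cmath>

// Stand-ins for the real kernels; the actual ones live in the per-arch .cpp files.
static void floor_f32_neon(const float *src, float *dst, int len)
{
    for(int i = 0; i < len; ++i) { dst[i] = std::floor(src[i]); }
}
#if defined(ENABLE_SVE)
static void floor_f32_sve(const float *src, float *dst, int len)
{
    for(int i = 0; i < len; ++i) { dst[i] = std::floor(src[i]); }
}
#endif

// Assumed stand-in for a runtime CPU-feature query.
static bool cpu_has_sve() { return false; }

using FloorFn = void (*)(const float *, float *, int);

// Pick the best implementation available in this binary and on this CPU.
static FloorFn select_floor_kernel()
{
#if defined(ENABLE_SVE)
    if(cpu_has_sve())
    {
        return &floor_f32_sve;
    }
#endif
    return &floor_f32_neon;
}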