From da816752cad76c8e1b367e8e9c648994a1af599a Mon Sep 17 00:00:00 2001
From: Georgios Pinitas
Date: Fri, 2 Jul 2021 09:22:14 +0100
Subject: Remove redundant implementations of Add/Sub operators

Allows only implementations where inputs/output are of the same
data type and removes legacy Computer Vision ones.

Signed-off-by: Georgios Pinitas
Change-Id: Ia2b3d23a04236aab682f0c36a1110a30f7c06d1c
Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/5900
Tested-by: Arm Jenkins
Reviewed-by: Michele Di Giorgio
Comments-Addressed: Arm Jenkins
---
 src/core/cpu/kernels/CpuAddKernel.cpp     | 141 ++++-----------------
 src/core/cpu/kernels/CpuAddKernel.h       |   3 -
 src/core/cpu/kernels/CpuSubKernel.cpp     |  87 +++----------
 src/core/cpu/kernels/CpuSubKernel.h       |   3 -
 src/core/cpu/kernels/add/neon/integer.cpp | 170 ------------------------
 src/core/cpu/kernels/add/neon/list.h      |   3 -
 src/core/cpu/kernels/add/sve/integer.cpp  | 201 -----------------------------
 src/core/cpu/kernels/add/sve/list.h       |   3 -
 src/core/cpu/kernels/sub/neon/integer.cpp | 183 --------------------------
 src/core/cpu/kernels/sub/neon/list.h      |   3 -
 10 files changed, 44 insertions(+), 753 deletions(-)
 delete mode 100644 src/core/cpu/kernels/add/neon/integer.cpp
 delete mode 100644 src/core/cpu/kernels/add/sve/integer.cpp
 delete mode 100644 src/core/cpu/kernels/sub/neon/integer.cpp
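Before the per-file diffs, a minimal self-contained sketch of the dispatch pattern being simplified may help. Each entry in the kernel table pairs a name, a selector predicate, and a micro-kernel function pointer; get_implementation() returns the first entry whose predicate accepts the query. With inputs and output forced to one common data type, the selector data carries a single DataType instead of three. The enum, toy kernels and main() below are illustrative scaffolding only; just the overall shape mirrors available_kernels/get_implementation in CpuAddKernel.cpp.

#include <cstdio>

enum class DataType
{
    U8,
    S16,
    F32
};

struct AddSelectorData
{
    DataType dt; // single common type for src0, src1 and dst
};

struct AddKernel
{
    const char *name;
    bool (*is_selected)(const AddSelectorData &);
    void (*ukernel)();
};

static void add_same_f32() { std::puts("f32 add kernel"); }
static void add_same_s16() { std::puts("s16 add kernel"); }

static const AddKernel available_kernels[] =
{
    { "neon_fp32_add", [](const AddSelectorData &d) { return d.dt == DataType::F32; }, add_same_f32 },
    { "neon_s16_add", [](const AddSelectorData &d) { return d.dt == DataType::S16; }, add_same_s16 },
};

static const AddKernel *get_implementation(DataType dt)
{
    // First matching predicate wins, as in the library's selector tables
    for(const auto &uk : available_kernels)
    {
        if(uk.is_selected({ dt }))
        {
            return &uk;
        }
    }
    return nullptr;
}

int main()
{
    if(const AddKernel *uk = get_implementation(DataType::F32))
    {
        uk->ukernel(); // prints "f32 add kernel"
    }
}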
diff --git a/src/core/cpu/kernels/CpuAddKernel.cpp b/src/core/cpu/kernels/CpuAddKernel.cpp
index 12766037a7..61b7b19443 100644
--- a/src/core/cpu/kernels/CpuAddKernel.cpp
+++ b/src/core/cpu/kernels/CpuAddKernel.cpp
@@ -45,14 +45,7 @@ namespace
 {
 struct AddSelectorData
 {
-    /* Data types for all ITensorInfos:
-       dt1 -> src0
-       dt2 -> src1
-       dt3 -> dst
-    */
-    DataType dt1;
-    DataType dt2;
-    DataType dt3;
+    DataType       dt;
     const CPUInfo &ci;
 };
 
@@ -72,7 +65,7 @@ static const AddKernel available_kernels[] =
         "sve2_qu8_add",
         [](const AddSelectorData & data)
         {
-            return ((data.dt1 == data.dt2) && (data.dt1 == DataType::QASYMM8)) && data.ci.has_sve();
+            return (data.dt == DataType::QASYMM8) && data.ci.has_sve();
         },
         REGISTER_QASYMM8_SVE(arm_compute::cpu::add_qasymm8_sve)
     },
@@ -80,7 +73,7 @@ static const AddKernel available_kernels[] =
         "sve2_qs8_add",
         [](const AddSelectorData & data)
         {
-            return ((data.dt1 == data.dt2) && (data.dt1 == DataType::QASYMM8_SIGNED)) && data.ci.has_sve();
+            return (data.dt == DataType::QASYMM8_SIGNED) && data.ci.has_sve();
         },
         REGISTER_QASYMM8_SIGNED_SVE(arm_compute::cpu::add_qasymm8_signed_sve)
     },
@@ -88,7 +81,7 @@ static const AddKernel available_kernels[] =
         "sve2_qs16_add",
         [](const AddSelectorData & data)
         {
-            return ((data.dt1 == data.dt2) && (data.dt1 == DataType::QSYMM16)) && data.ci.has_sve();
+            return (data.dt == DataType::QSYMM16) && data.ci.has_sve();
         },
         REGISTER_QSYMM16_SVE(arm_compute::cpu::add_qsymm16_sve)
     },
@@ -98,7 +91,7 @@ static const AddKernel available_kernels[] =
         "sve_fp32_add",
         [](const AddSelectorData & data)
         {
-            return ((data.dt1 == data.dt2) && (data.dt1 == DataType::F32)) && data.ci.has_sve();
+            return (data.dt == DataType::F32) && data.ci.has_sve();
         },
         REGISTER_FP32_SVE(arm_compute::cpu::add_same_sve<float>)
     },
@@ -106,7 +99,7 @@ static const AddKernel available_kernels[] =
         "sve_fp16_add",
         [](const AddSelectorData & data)
         {
-            return ((data.dt1 == data.dt2) && (data.dt1 == DataType::F16)) && data.ci.has_sve();
+            return (data.dt == DataType::F16) && data.ci.has_sve();
         },
         REGISTER_FP16_SVE(arm_compute::cpu::add_same_sve<float16_t>)
     },
@@ -114,7 +107,7 @@ static const AddKernel available_kernels[] =
         "sve_u8_add",
         [](const AddSelectorData & data)
         {
-            return ((data.dt1 == data.dt2) && (data.dt1 == data.dt3) && (data.dt1 == DataType::U8)) && data.ci.has_sve();
+            return (data.dt == DataType::U8) && data.ci.has_sve();
         },
         REGISTER_INTEGER_SVE(arm_compute::cpu::add_same_sve<uint8_t>)
     },
@@ -122,7 +115,7 @@ static const AddKernel available_kernels[] =
         "sve_s16_add",
         [](const AddSelectorData & data)
         {
-            return ((data.dt1 == data.dt2) && (data.dt1 == data.dt3) && (data.dt1 == DataType::S16)) && data.ci.has_sve();
+            return (data.dt == DataType::S16) && data.ci.has_sve();
         },
         REGISTER_INTEGER_SVE(arm_compute::cpu::add_same_sve<int16_t>)
     },
@@ -130,39 +123,15 @@ static const AddKernel available_kernels[] =
         "sve_s32_add",
         [](const AddSelectorData & data)
         {
-            return ((data.dt1 == data.dt2) && (data.dt1 == data.dt3) && (data.dt1 == DataType::S32)) && data.ci.has_sve();
+            return (data.dt == DataType::S32) && data.ci.has_sve();
         },
         REGISTER_INTEGER_SVE(arm_compute::cpu::add_same_sve<int32_t>)
     },
-    {
-        "sve_u8_s16_s16_add",
-        [](const AddSelectorData & data)
-        {
-            return ((data.dt1 == DataType::U8) && (data.dt2 == DataType::S16)) && data.ci.has_sve();
-        },
-        REGISTER_INTEGER_SVE(arm_compute::cpu::add_u8_s16_s16_sve)
-    },
-    {
-        "sve_s16_u8_s16_add",
-        [](const AddSelectorData & data)
-        {
-            return ((data.dt1 == DataType::S16) && (data.dt2 == DataType::U8)) && data.ci.has_sve();
-        },
-        REGISTER_INTEGER_SVE(arm_compute::cpu::add_s16_u8_s16_sve)
-    },
-    {
-        "sve_u8_u8_s16_add",
-        [](const AddSelectorData & data)
-        {
-            return ((data.dt1 == data.dt2) && (data.dt3 == DataType::S16)) && data.ci.has_sve();
-        },
-        REGISTER_INTEGER_SVE(arm_compute::cpu::add_u8_u8_s16_sve)
-    },
 #endif /* defined(ARM_COMPUTE_ENABLE_SVE) */
 #if defined(ARM_COMPUTE_ENABLE_NEON)
     {
         "neon_fp32_add",
-        [](const AddSelectorData & data) { return ((data.dt1 == data.dt2) && (data.dt1 == DataType::F32)); },
+        [](const AddSelectorData & data) { return (data.dt == DataType::F32); },
         REGISTER_FP32_NEON(arm_compute::cpu::add_same_neon<float>)
     },
 #if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
@@ -170,56 +139,41 @@ static const AddKernel available_kernels[] =
         "neon_fp16_add",
         [](const AddSelectorData & data)
         {
-            return ((data.dt1 == data.dt2) && (data.dt1 == DataType::F16)) && data.ci.has_fp16();
+            return (data.dt == DataType::F16) && data.ci.has_fp16();
         },
         REGISTER_FP16_NEON(arm_compute::cpu::add_same_neon<float16_t>)
     },
 #endif /* defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) */
     {
         "neon_u8_add",
-        [](const AddSelectorData & data) { return ((data.dt1 == data.dt2) && (data.dt1 == data.dt3) && (data.dt1 == DataType::U8)); },
+        [](const AddSelectorData & data) { return (data.dt == DataType::U8); },
         REGISTER_INTEGER_NEON(arm_compute::cpu::add_same_neon<uint8_t>)
     },
     {
         "neon_s16_add",
-        [](const AddSelectorData & data) { return ((data.dt1 == data.dt2) && (data.dt1 == data.dt3) && (data.dt1 == DataType::S16)); },
+        [](const AddSelectorData & data) { return (data.dt == DataType::S16); },
         REGISTER_INTEGER_NEON(arm_compute::cpu::add_same_neon<int16_t>)
     },
     {
         "neon_s32_add",
-        [](const AddSelectorData & data) { return ((data.dt1 == data.dt2) && (data.dt1 == data.dt3) && (data.dt1 == DataType::S32)); },
+        [](const AddSelectorData & data) { return (data.dt == DataType::S32); },
        REGISTER_INTEGER_NEON(arm_compute::cpu::add_same_neon<int32_t>)
     },
-    {
-        "neon_u8_s16_s16_add",
-        [](const AddSelectorData & data) { return ((data.dt1 == DataType::U8) && (data.dt2 == DataType::S16)); },
-        REGISTER_INTEGER_NEON(arm_compute::cpu::add_u8_s16_s16_neon)
-    },
-    {
-        "neon_s16_u8_s16_add",
-        [](const AddSelectorData & data) { return ((data.dt1 == DataType::S16) && (data.dt2 == DataType::U8)); },
-        REGISTER_INTEGER_NEON(arm_compute::cpu::add_s16_u8_s16_neon)
-    },
-    {
-        "neon_u8_u8_s16_add",
-        [](const AddSelectorData & data) { return ((data.dt1 == data.dt2) && (data.dt3 == DataType::S16)); },
-        REGISTER_INTEGER_NEON(arm_compute::cpu::add_u8_u8_s16_neon)
-    },
 #endif /* defined(ARM_COMPUTE_ENABLE_NEON) */
 #if defined(ARM_COMPUTE_ENABLE_NEON) || defined(ARM_COMPUTE_ENABLE_SVE)
     {
         "neon_qu8_add",
-        [](const AddSelectorData & data) { return ((data.dt1 == data.dt2) && (data.dt1 == DataType::QASYMM8)); },
+        [](const AddSelectorData & data) { return (data.dt == DataType::QASYMM8); },
         REGISTER_QASYMM8_NEON(arm_compute::cpu::add_qasymm8_neon)
     },
     {
         "neon_qs8_add",
-        [](const AddSelectorData & data) { return ((data.dt1 == data.dt2) && (data.dt1 == DataType::QASYMM8_SIGNED)); },
+        [](const AddSelectorData & data) { return (data.dt == DataType::QASYMM8_SIGNED); },
         REGISTER_QASYMM8_SIGNED_NEON(arm_compute::cpu::add_qasymm8_signed_neon)
     },
     {
         "neon_qs16_add",
-        [](const AddSelectorData & data) { return ((data.dt1 == data.dt2) && (data.dt1 == DataType::QSYMM16)); },
+        [](const AddSelectorData & data) { return (data.dt == DataType::QSYMM16); },
         REGISTER_QSYMM16_NEON(arm_compute::cpu::add_qsymm16_neon)
     },
 #endif /* defined(ARM_COMPUTE_ENABLE_NEON) || defined(ARM_COMPUTE_ENABLE_SVE) */
@@ -231,11 +185,11 @@ static const AddKernel available_kernels[] =
  *
  * @return A matching micro-kernel else nullptr
  */
-const AddKernel *get_implementation(const CPUInfo &cpuinfo, DataType dt1, DataType dt2, DataType dt3)
+const AddKernel *get_implementation(const CPUInfo &cpuinfo, DataType dt)
 {
     for(const auto &uk : available_kernels)
     {
-        if(uk.is_selected({ dt1, dt2, dt3, cpuinfo }))
+        if(uk.is_selected({ dt, cpuinfo }))
         {
             return &uk;
         }
@@ -251,9 +205,7 @@ Status validate_arguments(const ITensorInfo &src0, const ITensorInfo &src1, const ITensorInfo &dst, ConvertPolicy policy)
     ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&src0, 1, DataType::U8, DataType::QASYMM8, DataType::QASYMM8_SIGNED,
                                                          DataType::S16, DataType::QSYMM16, DataType::F16,
                                                          DataType::S32, DataType::F32);
-    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&src1, 1, DataType::U8, DataType::QASYMM8, DataType::QASYMM8_SIGNED,
-                                                         DataType::S16, DataType::QSYMM16, DataType::F16,
-                                                         DataType::S32, DataType::F32);
+    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&src0, &src1);
 
     const TensorShape out_shape = TensorShape::broadcast_shape(src0.tensor_shape(), src1.tensor_shape());
 
@@ -265,25 +217,12 @@ Status validate_arguments(const ITensorInfo &src0, const ITensorInfo &src1, const ITensorInfo &dst, ConvertPolicy policy)
     // Validate in case of configured dst
     if(dst.total_size() > 0)
     {
-        ARM_COMPUTE_RETURN_ERROR_ON_MSG(
-            !(src0.data_type() == DataType::U8 && src1.data_type() == DataType::U8 && dst.data_type() == DataType::U8)
-            && !(src0.data_type() == DataType::U8 && src1.data_type() == DataType::U8 && dst.data_type() == DataType::S16)
-            && !(src0.data_type() == DataType::U8 && src1.data_type() == DataType::S16 && dst.data_type() == DataType::S16)
-            && !(src0.data_type() == DataType::S16 && src1.data_type() == DataType::U8 && dst.data_type() == DataType::S16)
-            && !(src0.data_type() == DataType::S16 && src1.data_type() == DataType::S16 && dst.data_type() == DataType::S16)
-            && !(src0.data_type() == DataType::S32 && src1.data_type() == DataType::S32 && dst.data_type() == DataType::S32)
-            && !(src0.data_type() == DataType::F32 && src1.data_type() == DataType::F32 && dst.data_type() == DataType::F32)
-            && !(src0.data_type() == DataType::F16 && src1.data_type() == DataType::F16 && dst.data_type() == DataType::F16)
-            && !(src0.data_type() == DataType::QASYMM8 && src1.data_type() == DataType::QASYMM8 && dst.data_type() == DataType::QASYMM8)
-            && !(src0.data_type() == DataType::QASYMM8_SIGNED && src1.data_type() == DataType::QASYMM8_SIGNED && dst.data_type() == DataType::QASYMM8_SIGNED)
-            && !(src0.data_type() == DataType::QSYMM16 && src1.data_type() == DataType::QSYMM16 && dst.data_type() == DataType::QSYMM16),
-            "You called addition with the wrong image formats");
-
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&src0, &dst);
         ARM_COMPUTE_RETURN_ERROR_ON_MSG(detail::have_different_dimensions(out_shape, dst.tensor_shape(), 0),
                                         "Wrong shape for dst");
     }
 
-    const auto *uk = get_implementation(CPUInfo::get(), src0.data_type(), src1.data_type(), dst.data_type());
+    const auto *uk = get_implementation(CPUInfo::get(), src0.data_type());
     ARM_COMPUTE_RETURN_ERROR_ON(uk == nullptr || uk->ukernel == nullptr);
 
     return Status{};
@@ -294,38 +233,8 @@ std::pair<Status, Window> validate_and_configure_window(const ITensorInfo &src0,
     const TensorShape &out_shape = TensorShape::broadcast_shape(src0.tensor_shape(), src1.tensor_shape());
 
     // Auto initialize dst if not initialized
-    {
-        set_shape_if_empty(dst, out_shape);
-
-        if(src0.data_type() == DataType::S16 || src1.data_type() == DataType::S16)
-        {
-            set_format_if_unknown(dst, Format::S16);
-        }
-        if(src0.data_type() == DataType::S32 || src1.data_type() == DataType::S32)
-        {
-            set_format_if_unknown(dst, Format::S32);
-        }
-        else if(src0.data_type() == DataType::F16 || src1.data_type() == DataType::F16)
-        {
-            set_format_if_unknown(dst, Format::F16);
-        }
-        else if(src0.data_type() == DataType::F32 || src1.data_type() == DataType::F32)
-        {
-            set_format_if_unknown(dst, Format::F32);
-        }
-        else if(src0.data_type() == DataType::QASYMM8 || src1.data_type() == DataType::QASYMM8)
-        {
-            set_data_type_if_unknown(dst, DataType::QASYMM8);
-        }
-        else if(src0.data_type() == DataType::QASYMM8_SIGNED || src1.data_type() == DataType::QASYMM8_SIGNED)
-        {
-            set_data_type_if_unknown(dst, DataType::QASYMM8_SIGNED);
-        }
-        else if(src0.data_type() == DataType::QSYMM16 || src1.data_type() == DataType::QSYMM16)
-        {
-            set_data_type_if_unknown(dst, DataType::QSYMM16);
-        }
-    }
+    set_shape_if_empty(dst, out_shape);
+    set_data_type_if_unknown(dst, src0.data_type());
 
     Window win = calculate_max_window(out_shape, Steps());
 
@@ -339,7 +248,7 @@ void CpuAddKernel::configure(const ITensorInfo *src0, const ITensorInfo *src1, ITensorInfo *dst, ConvertPolicy policy)
     ARM_COMPUTE_ERROR_ON_NULLPTR(src0, src1, dst);
     ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(*src0, *src1, *dst, policy));
 
-    const auto uk = get_implementation(CPUInfo::get(), src0->data_type(), src1->data_type(), dst->data_type());
+    const auto uk = get_implementation(CPUInfo::get(), src0->data_type());
     ARM_COMPUTE_ERROR_ON_NULLPTR(uk);
 
     _policy = policy;
diff --git a/src/core/cpu/kernels/CpuAddKernel.h b/src/core/cpu/kernels/CpuAddKernel.h
index 717d0132c6..1205b45dfb 100644
--- a/src/core/cpu/kernels/CpuAddKernel.h
+++ b/src/core/cpu/kernels/CpuAddKernel.h
@@ -44,9 +44,6 @@ public:
      * Valid configurations (src0,src1) -> dst :
      *
      * - (U8,U8) -> U8
-     * - (U8,U8) -> S16
-     * - (S16,U8) -> S16
-     * - (U8,S16) -> S16
      * - (S16,S16) -> S16
      * - (S32,S32) -> S32
      * - (F16,F16) -> F16
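The header change above narrows the public contract: the widening configurations such as (U8,S16) -> S16 are gone and only same-type configurations validate. A hedged sketch of how a caller could probe this through the kernel's static validate(); the include paths, the TensorInfo constructor and the ErrorCode comparison are assumptions based on this release's layout, not something the patch itself defines.

#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Types.h"
#include "src/core/cpu/kernels/CpuAddKernel.h"

using namespace arm_compute;

static bool can_add(DataType a, DataType b, DataType out)
{
    const TensorInfo src0(TensorShape(16U, 16U), 1, a);
    const TensorInfo src1(TensorShape(16U, 16U), 1, b);
    const TensorInfo dst(TensorShape(16U, 16U), 1, out);
    // validate() returns an arm_compute::Status; ErrorCode::OK means the configuration is accepted
    return cpu::kernels::CpuAddKernel::validate(&src0, &src1, &dst, ConvertPolicy::SATURATE).error_code() == ErrorCode::OK;
}

// can_add(DataType::S16, DataType::S16, DataType::S16) -> true (still supported)
// can_add(DataType::U8,  DataType::S16, DataType::S16) -> false after this patch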
diff --git a/src/core/cpu/kernels/CpuSubKernel.cpp b/src/core/cpu/kernels/CpuSubKernel.cpp
index 098a324377..fa7a55805e 100644
--- a/src/core/cpu/kernels/CpuSubKernel.cpp
+++ b/src/core/cpu/kernels/CpuSubKernel.cpp
@@ -41,9 +41,7 @@ namespace
 {
 struct SubSelectorData
 {
-    DataType dt1;
-    DataType dt2;
-    DataType dt3;
+    DataType dt;
 };
 
 using SubSelectorPtr = std::add_pointer<bool(const SubSelectorData &)>::type;
@@ -60,59 +58,44 @@ static const SubKernel available_kernels[] =
 {
     {
         "neon_fp32_sub",
-        [](const SubSelectorData & data) { return ((data.dt1 == data.dt2) && (data.dt1 == DataType::F32)); },
+        [](const SubSelectorData & data) { return (data.dt == DataType::F32); },
         REGISTER_FP32_NEON(arm_compute::cpu::sub_same_neon<float>)
     },
 #if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
     {
         "neon_fp16_sub",
-        [](const SubSelectorData & data) { return ((data.dt1 == data.dt2) && (data.dt1 == DataType::F16)); },
+        [](const SubSelectorData & data) { return (data.dt == DataType::F16); },
         REGISTER_FP16_NEON(arm_compute::cpu::sub_same_neon<float16_t>)
     },
 #endif /* defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) */
     {
         "neon_u8_sub",
-        [](const SubSelectorData & data) { return ((data.dt1 == data.dt2) && (data.dt1 == data.dt3) && (data.dt1 == DataType::U8)); },
+        [](const SubSelectorData & data) { return (data.dt == DataType::U8); },
         REGISTER_INTEGER_NEON(arm_compute::cpu::sub_same_neon<uint8_t>)
     },
     {
         "neon_s16_sub",
-        [](const SubSelectorData & data) { return ((data.dt1 == data.dt2) && (data.dt1 == data.dt3) && (data.dt1 == DataType::S16)); },
+        [](const SubSelectorData & data) { return (data.dt == DataType::S16); },
         REGISTER_INTEGER_NEON(arm_compute::cpu::sub_same_neon<int16_t>)
     },
     {
         "neon_s32_sub",
-        [](const SubSelectorData & data) { return ((data.dt1 == data.dt2) && (data.dt1 == data.dt3) && (data.dt1 == DataType::S32)); },
+        [](const SubSelectorData & data) { return (data.dt == DataType::S32); },
         REGISTER_INTEGER_NEON(arm_compute::cpu::sub_same_neon<int32_t>)
     },
-    {
-        "neon_u8_s16_s16_sub",
-        [](const SubSelectorData & data) { return ((data.dt1 == DataType::U8) && (data.dt2 == DataType::S16)); },
-        REGISTER_INTEGER_NEON(arm_compute::cpu::sub_u8_s16_s16_neon)
-    },
-    {
-        "neon_s16_u8_s16_sub",
-        [](const SubSelectorData & data) { return ((data.dt1 == DataType::S16) && (data.dt2 == DataType::U8)); },
-        REGISTER_INTEGER_NEON(arm_compute::cpu::sub_s16_u8_s16_neon)
-    },
-    {
-        "neon_u8_u8_s16_sub",
-        [](const SubSelectorData & data) { return ((data.dt1 == data.dt2) && (data.dt3 == DataType::S16)); },
-        REGISTER_INTEGER_NEON(arm_compute::cpu::sub_u8_u8_s16_neon)
-    },
     {
         "neon_qu8_sub",
-        [](const SubSelectorData & data) { return ((data.dt1 == data.dt2) && (data.dt1 == DataType::QASYMM8)); },
+        [](const SubSelectorData & data) { return (data.dt == DataType::QASYMM8); },
         REGISTER_QASYMM8_NEON(arm_compute::cpu::sub_qasymm8_neon)
     },
     {
         "neon_qs8_sub",
-        [](const SubSelectorData & data) { return ((data.dt1 == data.dt2) && (data.dt1 == DataType::QASYMM8_SIGNED)); },
+        [](const SubSelectorData & data) { return (data.dt == DataType::QASYMM8_SIGNED); },
         REGISTER_QASYMM8_SIGNED_NEON(arm_compute::cpu::sub_qasymm8_signed_neon)
     },
     {
-        "neon_s16_sub",
-        [](const SubSelectorData & data) { return ((data.dt1 == data.dt2) && (data.dt1 == DataType::QSYMM16)); },
+        "neon_qs16_sub",
+        [](const SubSelectorData & data) { return (data.dt == DataType::QSYMM16); },
         REGISTER_QSYMM16_NEON(arm_compute::cpu::sub_qsymm16_neon)
     },
 };
@@ -123,11 +106,11 @@ static const SubKernel available_kernels[] =
  *
  * @return A matching micro-kernel else nullptr
  */
-const SubKernel *get_implementation(DataType dt1, DataType dt2, DataType dt3)
+const SubKernel *get_implementation(DataType dt)
 {
     for(const auto &uk : available_kernels)
     {
-        if(uk.is_selected({ dt1, dt2, dt3 }))
+        if(uk.is_selected({ dt }))
         {
             return &uk;
         }
@@ -141,54 +124,21 @@ inline Status validate_arguments(const ITensorInfo &src0, const ITensorInfo &src1, const ITensorInfo &dst, ConvertPolicy policy)
     ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(&src0);
     ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&src0, 1, DataType::U8, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::QSYMM16, DataType::S16, DataType::S32, DataType::F16,
                                                          DataType::F32);
-    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&src1, 1, DataType::U8, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::QSYMM16, DataType::S16, DataType::S32, DataType::F16,
-                                                         DataType::F32);
-    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&dst, 1, DataType::U8, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::QSYMM16, DataType::S16, DataType::S32, DataType::F16,
-                                                         DataType::F32);
+    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&src0, &src1);
 
-    const auto *uk = get_implementation(src0.data_type(), src1.data_type(), dst.data_type());
+    const auto *uk = get_implementation(src0.data_type());
     ARM_COMPUTE_RETURN_ERROR_ON(uk == nullptr || uk->ukernel == nullptr);
 
     const TensorShape out_shape = TensorShape::broadcast_shape(src0.tensor_shape(), src1.tensor_shape());
     ARM_COMPUTE_RETURN_ERROR_ON_MSG(out_shape.total_size() == 0, "Inputs are not broadcast compatible");
 
-    ARM_COMPUTE_RETURN_ERROR_ON_MSG(
-        !(src0.data_type() == DataType::U8 && src1.data_type() == DataType::U8)
-        && !(src0.data_type() == DataType::QASYMM8 && src1.data_type() == DataType::QASYMM8)
-        && !(src0.data_type() == DataType::QASYMM8_SIGNED && src1.data_type() == DataType::QASYMM8_SIGNED)
-        && !(src0.data_type() == DataType::QSYMM16 && src1.data_type() == DataType::QSYMM16)
-        && !(src0.data_type() == DataType::U8 && src1.data_type() == DataType::U8)
-        && !(src0.data_type() == DataType::U8 && src1.data_type() == DataType::S16)
-        && !(src0.data_type() == DataType::S16 && src1.data_type() == DataType::U8)
-        && !(src0.data_type() == DataType::S16 && src1.data_type() == DataType::S16)
-        && !(src0.data_type() == DataType::S32 && src1.data_type() == DataType::S32)
-        && !(src0.data_type() == DataType::F32 && src1.data_type() == DataType::F32)
-        && !(src0.data_type() == DataType::F16 && src1.data_type() == DataType::F16),
-        "You called subtract with the wrong image formats");
-
-    ARM_COMPUTE_RETURN_ERROR_ON_MSG(
-        (src0.data_type() == DataType::QASYMM8_SIGNED && src1.data_type() == DataType::QASYMM8_SIGNED && policy == ConvertPolicy::WRAP)
-        || (src0.data_type() == DataType::QASYMM8 && src1.data_type() == DataType::QASYMM8 && policy == ConvertPolicy::WRAP)
-        || (src0.data_type() == DataType::QSYMM16 && src1.data_type() == DataType::QSYMM16 && policy == ConvertPolicy::WRAP),
-        "Convert policy cannot be WRAP if datatype is quantized");
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(is_data_type_quantized(src0.data_type()) && (policy == ConvertPolicy::WRAP),
+                                    "Convert policy cannot be WRAP if datatype is quantized");
 
     // Validate in case of configured dst
     if(dst.total_size() > 0)
     {
-        ARM_COMPUTE_RETURN_ERROR_ON_MSG(
-            !(src0.data_type() == DataType::U8 && src1.data_type() == DataType::U8 && dst.data_type() == DataType::U8)
-            && !(src0.data_type() == DataType::QASYMM8 && src1.data_type() == DataType::QASYMM8 && dst.data_type() == DataType::QASYMM8)
-            && !(src0.data_type() == DataType::QASYMM8_SIGNED && src1.data_type() == DataType::QASYMM8_SIGNED && dst.data_type() == DataType::QASYMM8_SIGNED)
-            && !(src0.data_type() == DataType::QSYMM16 && src1.data_type() == DataType::QSYMM16 && dst.data_type() == DataType::QSYMM16)
-            && !(src0.data_type() == DataType::U8 && src1.data_type() == DataType::U8 && dst.data_type() == DataType::S16)
-            && !(src0.data_type() == DataType::U8 && src1.data_type() == DataType::S16 && dst.data_type() == DataType::S16)
-            && !(src0.data_type() == DataType::S16 && src1.data_type() == DataType::U8 && dst.data_type() == DataType::S16)
-            && !(src0.data_type() == DataType::S16 && src1.data_type() == DataType::S16 && dst.data_type() == DataType::S16)
-            && !(src0.data_type() == DataType::S32 && src1.data_type() == DataType::S32 && dst.data_type() == DataType::S32)
-            && !(src0.data_type() == DataType::F32 && src1.data_type() == DataType::F32 && dst.data_type() == DataType::F32)
-            && !(src0.data_type() == DataType::F16 && src1.data_type() == DataType::F16 && dst.data_type() == DataType::F16),
-            "You called subtract with the wrong image formats");
-
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&src0, &dst);
         ARM_COMPUTE_RETURN_ERROR_ON_MSG(detail::have_different_dimensions(out_shape, dst.tensor_shape(), 0),
                                         "Wrong shape for dst");
     }
@@ -205,8 +155,9 @@ void CpuSubKernel::configure(const ITensorInfo *src0, const ITensorInfo *src1, ITensorInfo *dst, ConvertPolicy policy)
 
     // Auto initialize dst if not initialized
     set_shape_if_empty(*dst, out_shape);
+    set_data_type_if_unknown(*dst, src0->data_type());
 
-    const auto *uk = get_implementation(src0->data_type(), src1->data_type(), dst->data_type());
+    const auto *uk = get_implementation(src0->data_type());
     ARM_COMPUTE_ERROR_ON_NULLPTR(uk);
 
     _policy = policy;
diff --git a/src/core/cpu/kernels/CpuSubKernel.h b/src/core/cpu/kernels/CpuSubKernel.h
index b9160bd150..cb64e64cfa 100644
--- a/src/core/cpu/kernels/CpuSubKernel.h
+++ b/src/core/cpu/kernels/CpuSubKernel.h
@@ -45,11 +45,8 @@ public:
      * Valid configurations (src0,src1) -> dst :
      *
      * - (U8,U8) -> U8
-     * - (U8,U8) -> S16
      * - (QASYMM8, QASYMM8) -> QASYMM8
     * - (QASYMM8_SIGNED, QASYMM8_SIGNED) -> QASYMM8_SIGNED
-     * - (S16,U8) -> S16
-     * - (U8,S16) -> S16
      * - (S16,S16) -> S16
      * - (S32,S32) -> S32
      * - (F16,F16) -> F16
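Both Add and Sub branch on ConvertPolicy, and the consolidated check above forbids WRAP for quantized types. For reference, a standalone scalar illustration of the two policies on S16 (plain C++, not library code): WRAP reduces the result modulo 2^16, while SATURATE clamps it to the representable range.

#include <cstdint>
#include <limits>

static int16_t sub_wrap_s16(int16_t a, int16_t b)
{
    // Unsigned arithmetic wraps modulo 2^16 by definition
    return static_cast<int16_t>(static_cast<uint16_t>(a) - static_cast<uint16_t>(b));
}

static int16_t sub_sat_s16(int16_t a, int16_t b)
{
    // Compute in a wider type, then clamp to the S16 range
    const int32_t r = static_cast<int32_t>(a) - static_cast<int32_t>(b);
    if(r > std::numeric_limits<int16_t>::max()) { return std::numeric_limits<int16_t>::max(); }
    if(r < std::numeric_limits<int16_t>::min()) { return std::numeric_limits<int16_t>::min(); }
    return static_cast<int16_t>(r);
}

// sub_wrap_s16(-32768, 1) == 32767  (wraps around)
// sub_sat_s16(-32768, 1)  == -32768 (clamped)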
diff --git a/src/core/cpu/kernels/add/neon/integer.cpp b/src/core/cpu/kernels/add/neon/integer.cpp
deleted file mode 100644
index 24a0ac3b7c..0000000000
--- a/src/core/cpu/kernels/add/neon/integer.cpp
+++ /dev/null
@@ -1,170 +0,0 @@
-/*
- * Copyright (c) 2020-2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/ITensor.h"
-#include "arm_compute/core/Types.h"
-#include "arm_compute/core/utils/misc/Traits.h"
-#include "src/core/NEON/wrapper/wrapper.h"
-#include "src/core/helpers/WindowHelpers.h"
-
-namespace arm_compute
-{
-namespace cpu
-{
-void add_u8_u8_s16_neon(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window)
-{
-    // Create input windows
-    Window win        = window;
-    Window input1_win = window.broadcast_if_dimension_le_one(src0->info()->tensor_shape());
-    Window input2_win = window.broadcast_if_dimension_le_one(src1->info()->tensor_shape());
-
-    // Clear X Dimension on execution window as we handle manually
-    win.set(Window::DimX, Window::Dimension(0, 1, 1));
-    input1_win.set(Window::DimX, Window::Dimension(0, 1, 1));
-    input2_win.set(Window::DimX, Window::Dimension(0, 1, 1));
-
-    Iterator input1(src0, input1_win);
-    Iterator input2(src1, input2_win);
-    Iterator output(dst, win);
-
-    const int  window_step_x  = 8;
-    const auto window_start_x = static_cast<int>(window.x().start());
-    const auto window_end_x   = static_cast<int>(window.x().end());
-
-    execute_window_loop(win, [&](const Coordinates &)
-    {
-        const auto input1_ptr = reinterpret_cast<const uint8_t *>(input1.ptr());
-        const auto input2_ptr = reinterpret_cast<const uint8_t *>(input2.ptr());
-        const auto output_ptr = reinterpret_cast<int16_t *>(output.ptr());
-
-        if(policy == ConvertPolicy::WRAP)
-        {
-            // Compute S elements per iteration
-            int x = window_start_x;
-            for(; x <= (window_end_x - window_step_x); x += window_step_x)
-            {
-                const auto vin1 = vreinterpretq_s16_u16(wrapper::vmovl(wrapper::vload(input1_ptr + x)));
-                const auto vin2 = vreinterpretq_s16_u16(wrapper::vmovl(wrapper::vload(input2_ptr + x)));
-                wrapper::vstore(output_ptr + x, wrapper::vadd(vin1, vin2));
-            }
-
-            // Compute left-over elements
-            for(; x < window_end_x; ++x)
-            {
-                *(output_ptr + x) = static_cast<int16_t>(*(input1_ptr + x)) + static_cast<int16_t>(*(input2_ptr + x));
-            }
-        }
-        else
-        {
-            // Compute S elements per iteration
-            int x = window_start_x;
-            for(; x <= (window_end_x - window_step_x); x += window_step_x)
-            {
-                const auto vin1 = vreinterpretq_s16_u16(wrapper::vmovl(wrapper::vload(input1_ptr + x)));
-                const auto vin2 = vreinterpretq_s16_u16(wrapper::vmovl(wrapper::vload(input2_ptr + x)));
-                wrapper::vstore(output_ptr + x, wrapper::vqadd(vin1, vin2));
-            }
-
-            // Compute left-over elements
-            for(; x < window_end_x; ++x)
-            {
-                *(output_ptr + x) = wrapper::add_sat(static_cast<int16_t>(*(input1_ptr + x)),
-                                                     static_cast<int16_t>(*(input2_ptr + x)));
-            }
-        }
-    },
-    input1, input2, output);
-}
-
-void add_s16_u8_s16_neon(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window)
-{
-    // Create input windows
-    Window win        = window;
-    Window input1_win = window.broadcast_if_dimension_le_one(src0->info()->tensor_shape());
-    Window input2_win = window.broadcast_if_dimension_le_one(src1->info()->tensor_shape());
-
-    // Clear X Dimension on execution window as we handle manually
-    win.set(Window::DimX, Window::Dimension(0, 1, 1));
-    input1_win.set(Window::DimX, Window::Dimension(0, 1, 1));
-    input2_win.set(Window::DimX, Window::Dimension(0, 1, 1));
-
-    Iterator input1(src0, input1_win);
-    Iterator input2(src1, input2_win);
-    Iterator output(dst, win);
-
-    const int  window_step_x  = 8;
-    const auto window_start_x = static_cast<int>(window.x().start());
-    const auto window_end_x   = static_cast<int>(window.x().end());
-
-    execute_window_loop(win, [&](const Coordinates &)
-    {
-        const auto input1_ptr = reinterpret_cast<const int16_t *>(input1.ptr());
-        const auto input2_ptr = reinterpret_cast<const uint8_t *>(input2.ptr());
-        const auto output_ptr = reinterpret_cast<int16_t *>(output.ptr());
-
-        if(policy == ConvertPolicy::WRAP)
-        {
-            // Compute S elements per iteration
-            int x = window_start_x;
-            for(; x <= (window_end_x - window_step_x); x += window_step_x)
-            {
-                const auto vin1 = wrapper::vloadq(input1_ptr + x);
-                const auto vin2 = vreinterpretq_s16_u16(wrapper::vmovl(wrapper::vload(input2_ptr + x)));
-                wrapper::vstore(output_ptr + x, wrapper::vadd(vin1, vin2));
-            }
-
-            // Compute left-over elements
-            for(; x < window_end_x; ++x)
-            {
-                *(output_ptr + x) = *(input1_ptr + x) + static_cast<int16_t>(*(input2_ptr + x));
-            }
-        }
-        else
-        {
-            // Compute S elements per iteration
-            int x = window_start_x;
-            for(; x <= (window_end_x - window_step_x); x += window_step_x)
-            {
-                const auto vin1 = wrapper::vloadq(input1_ptr + x);
-                const auto vin2 = vreinterpretq_s16_u16(wrapper::vmovl(wrapper::vload(input2_ptr + x)));
-                wrapper::vstore(output_ptr + x, wrapper::vqadd(vin1, vin2));
-            }
-
-            // Compute left-over elements
-            for(; x < window_end_x; ++x)
-            {
-                *(output_ptr + x) = wrapper::add_sat(*(input1_ptr + x), static_cast<int16_t>(*(input2_ptr + x)));
-            }
-        }
-    },
-    input1, input2, output);
-}
-
-void add_u8_s16_s16_neon(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window)
-{
-    // Simply swap the two input buffers:
-    add_s16_u8_s16_neon(src1, src0, dst, policy, window);
-}
-} // namespace cpu
-} // namespace arm_compute
\ No newline at end of file
diff --git a/src/core/cpu/kernels/add/neon/list.h b/src/core/cpu/kernels/add/neon/list.h
index 3ab03dd40e..379bd32fb1 100644
--- a/src/core/cpu/kernels/add/neon/list.h
+++ b/src/core/cpu/kernels/add/neon/list.h
@@ -38,9 +38,6 @@ namespace cpu
 DECLARE_ADD_KERNEL(add_qasymm8_neon);
 DECLARE_ADD_KERNEL(add_qasymm8_signed_neon);
 DECLARE_ADD_KERNEL(add_qsymm16_neon);
-DECLARE_ADD_KERNEL(add_s16_u8_s16_neon);
-DECLARE_ADD_KERNEL(add_u8_s16_s16_neon);
-DECLARE_ADD_KERNEL(add_u8_u8_s16_neon);
 
 #undef DECLARE_ADD_KERNEL
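The file deleted above implemented the mixed-precision adds by widening U8 lanes to S16 before the arithmetic. A standalone illustration of that idiom in raw NEON intrinsics (AArch64), independent of the library's wrapper layer; the function name is hypothetical:

#include <arm_neon.h>

// Adds 8 unsigned bytes from a and b into 8 saturated signed 16-bit results.
static void add_u8_u8_s16_block(const uint8_t *a, const uint8_t *b, int16_t *out)
{
    const uint8x8_t va    = vld1_u8(a);                          // load 8 x u8
    const uint8x8_t vb    = vld1_u8(b);
    const int16x8_t va_16 = vreinterpretq_s16_u16(vmovl_u8(va)); // widen u8 -> u16, reinterpret as s16
    const int16x8_t vb_16 = vreinterpretq_s16_u16(vmovl_u8(vb));
    vst1q_s16(out, vqaddq_s16(va_16, vb_16));                    // saturating add, store 8 x s16
}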
diff --git a/src/core/cpu/kernels/add/sve/integer.cpp b/src/core/cpu/kernels/add/sve/integer.cpp
deleted file mode 100644
index bd8179205b..0000000000
--- a/src/core/cpu/kernels/add/sve/integer.cpp
+++ /dev/null
@@ -1,201 +0,0 @@
-/*
- * Copyright (c) 2020-2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#if defined(__ARM_FEATURE_SVE)
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/ITensor.h"
-#include "arm_compute/core/Types.h"
-#include "arm_compute/core/utils/misc/Traits.h"
-#include "src/core/NEON/SVEMath.h"
-#include "src/core/NEON/wrapper/intrinsics/intrinsics.h"
-#include <arm_sve.h>
-
-namespace arm_compute
-{
-namespace cpu
-{
-void add_u8_u8_s16_sve(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window)
-{
-    // Create input windows
-    Window win        = window;
-    Window input1_win = window.broadcast_if_dimension_le_one(src0->info()->tensor_shape());
-    Window input2_win = window.broadcast_if_dimension_le_one(src1->info()->tensor_shape());
-
-    // Clear X Dimension on execution window as we handle manually
-    win.set(Window::DimX, Window::Dimension(0, 1, 1));
-    input1_win.set(Window::DimX, Window::Dimension(0, 1, 1));
-    input2_win.set(Window::DimX, Window::Dimension(0, 1, 1));
-
-    Iterator input1(src0, input1_win);
-    Iterator input2(src1, input2_win);
-    Iterator output(dst, win);
-
-    const auto window_start_x = static_cast<int>(window.x().start());
-    const auto window_end_x   = static_cast<int>(window.x().end());
-    const auto all_true_pg    = svptrue_b8();
-
-    execute_window_loop(win, [&](const Coordinates &)
-    {
-        const auto input1_ptr = reinterpret_cast<const uint8_t *>(input1.ptr());
-        const auto input2_ptr = reinterpret_cast<const uint8_t *>(input2.ptr());
-        const auto output_ptr = reinterpret_cast<int16_t *>(output.ptr());
-
-        if(policy == ConvertPolicy::WRAP)
-        {
-            int      x    = window_start_x;
-            svbool_t pg_u = svwhilelt_b8(x, window_end_x);
-            svbool_t pg_0 = svwhilelt_b16(x, window_end_x);
-            svbool_t pg_1 = svwhilelt_b16(x, static_cast<int>(window_end_x + svcnth()));
-            do
-            {
-                const auto vsrc0 = svld1(pg_u, input1_ptr + x);
-                const auto vsrc1 = svld1(pg_u, input2_ptr + x);
-
-                const auto vsrc0_lo = svreinterpret_s16_u16(svunpklo(vsrc0));
-                const auto vsrc0_hi = svreinterpret_s16_u16(svunpkhi(vsrc0));
-                const auto vsrc1_lo = svreinterpret_s16_u16(svunpklo(vsrc1));
-                const auto vsrc1_hi = svreinterpret_s16_u16(svunpkhi(vsrc1));
-                svst1(pg_0, output_ptr + x, svqadd(vsrc0_lo, vsrc1_lo));
-                svst1(pg_1, output_ptr + x + svcnth(), svqadd(vsrc0_hi, vsrc1_hi));
-
-                x += svcntb();
-                pg_u = svwhilelt_b8(x, window_end_x);
-                pg_0 = svwhilelt_b16(x, window_end_x);
-                pg_1 = svwhilelt_b16(x, static_cast<int>(window_end_x + svcnth()));
-            }
-            while(svptest_any(all_true_pg, pg_u));
-        }
-        else
-        {
-            int      x    = window_start_x;
-            svbool_t pg_u = svwhilelt_b8(x, window_end_x);
-            svbool_t pg_0 = svwhilelt_b16(x, window_end_x);
-            svbool_t pg_1 = svwhilelt_b16(x, static_cast<int>(window_end_x + svcnth()));
-            do
-            {
-                const auto vsrc0 = svld1(pg_u, input1_ptr + x);
-                const auto vsrc1 = svld1(pg_u, input2_ptr + x);
-
-                const auto vsrc0_lo = svreinterpret_s16_u16(svunpklo(vsrc0));
-                const auto vsrc0_hi = svreinterpret_s16_u16(svunpkhi(vsrc0));
-                const auto vsrc1_lo = svreinterpret_s16_u16(svunpklo(vsrc1));
-                const auto vsrc1_hi = svreinterpret_s16_u16(svunpkhi(vsrc1));
-                svst1(pg_0, output_ptr + x, svqadd(vsrc0_lo, vsrc1_lo));
-                svst1(pg_1, output_ptr + x + svcnth(), svqadd(vsrc0_hi, vsrc1_hi));
-
-                x += svcntb();
-                pg_u = svwhilelt_b8(x, window_end_x);
-                pg_0 = svwhilelt_b16(x, window_end_x);
-                pg_1 = svwhilelt_b16(x, static_cast<int>(window_end_x + svcnth()));
-            }
-            while(svptest_any(all_true_pg, pg_u));
-        }
-    },
-    input1, input2, output);
-}
-
-void add_s16_u8_s16_sve(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window)
-{
-    // Create input windows
-    Window win        = window;
-    Window input1_win = window.broadcast_if_dimension_le_one(src0->info()->tensor_shape());
-    Window input2_win = window.broadcast_if_dimension_le_one(src1->info()->tensor_shape());
-
-    // Clear X Dimension on execution window as we handle manually
-    win.set(Window::DimX, Window::Dimension(0, 1, 1));
-    input1_win.set(Window::DimX, Window::Dimension(0, 1, 1));
-    input2_win.set(Window::DimX, Window::Dimension(0, 1, 1));
-
-    Iterator input1(src0, input1_win);
-    Iterator input2(src1, input2_win);
-    Iterator output(dst, win);
-
-    const auto window_start_x = static_cast<int>(window.x().start());
-    const auto window_end_x   = static_cast<int>(window.x().end());
-    const auto all_true_pg    = svptrue_b8();
-
-    execute_window_loop(win, [&](const Coordinates &)
-    {
-        const auto input1_ptr = reinterpret_cast<const int16_t *>(input1.ptr());
-        const auto input2_ptr = reinterpret_cast<const uint8_t *>(input2.ptr());
-        const auto output_ptr = reinterpret_cast<int16_t *>(output.ptr());
-
-        if(policy == ConvertPolicy::WRAP)
-        {
-            int      x    = window_start_x;
-            svbool_t pg_u = svwhilelt_b8(x, window_end_x);
-            svbool_t pg_0 = svwhilelt_b16(x, window_end_x);
-            svbool_t pg_1 = svwhilelt_b16(x + static_cast<int>(svcnth()), window_end_x);
-            do
-            {
-                const auto vsrc0_0  = svld1_s16(pg_0, input1_ptr + x);
-                const auto vsrc0_1  = svld1_s16(pg_1, input1_ptr + x + svcnth());
-                const auto vsrc1_u8 = svld1_u8(pg_u, input2_ptr + x);
-                const auto vsrc1_0  = svreinterpret_s16_u16(svunpklo(vsrc1_u8));
-                const auto vsrc1_1  = svreinterpret_s16_u16(svunpkhi(vsrc1_u8));
-                svst1_s16(pg_0, output_ptr + x, svadd_s16_z(pg_0, vsrc0_0, vsrc1_0));
-                svst1_s16(pg_1, output_ptr + x + svcnth(), svadd_s16_z(pg_1, vsrc0_1, vsrc1_1));
-
-                x += svcntb();
-                pg_u = svwhilelt_b8(x, window_end_x);
-                pg_0 = svwhilelt_b16(x, window_end_x);
-                pg_1 = svwhilelt_b16(x + static_cast<int>(svcnth()), window_end_x);
-            }
-            while(svptest_any(all_true_pg, pg_u));
-        }
-        else
-        {
-            int      x    = window_start_x;
-            svbool_t pg_u = svwhilelt_b8(x, window_end_x);
-            svbool_t pg_0 = svwhilelt_b16(x, window_end_x);
-            svbool_t pg_1 = svwhilelt_b16(x + static_cast<int>(svcnth()), window_end_x);
-            do
-            {
-                const auto vsrc0_0  = svld1_s16(pg_0, input1_ptr + x);
-                const auto vsrc0_1  = svld1_s16(pg_1, input1_ptr + x + svcnth());
-                const auto vsrc1_u8 = svld1_u8(pg_u, input2_ptr + x);
-                const auto vsrc1_0  = svreinterpret_s16_u16(svunpklo(vsrc1_u8));
-                const auto vsrc1_1  = svreinterpret_s16_u16(svunpkhi(vsrc1_u8));
-
-                svst1_s16(pg_0, output_ptr + x, svqadd(vsrc0_0, vsrc1_0));
-                svst1_s16(pg_1, output_ptr + x + svcnth(), svqadd(vsrc0_1, vsrc1_1));
-
-                x += svcntb();
-                pg_u = svwhilelt_b8(x, window_end_x);
-                pg_0 = svwhilelt_b16(x, window_end_x);
-                pg_1 = svwhilelt_b16(x + static_cast<int>(svcnth()), window_end_x);
-            }
-            while(svptest_any(all_true_pg, pg_u));
-        }
-    },
-    input1, input2, output);
-}
-
-void add_u8_s16_s16_sve(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window)
-{
-    // Simply swap the two input buffers:
-    add_s16_u8_s16_sve(src1, src0, dst, policy, window);
-}
-} // namespace cpu
-} // namespace arm_compute
-#endif /* defined(__ARM_FEATURE_SVE) */
\ No newline at end of file
diff --git a/src/core/cpu/kernels/add/sve/list.h b/src/core/cpu/kernels/add/sve/list.h
index 9e439497c9..4d29c2a8f1 100644
--- a/src/core/cpu/kernels/add/sve/list.h
+++ b/src/core/cpu/kernels/add/sve/list.h
@@ -42,9 +42,6 @@ namespace cpu
 DECLARE_ADD_KERNEL(add_qasymm8_sve);
 DECLARE_ADD_KERNEL(add_qasymm8_signed_sve);
 DECLARE_ADD_KERNEL(add_qsymm16_sve);
-DECLARE_ADD_KERNEL(add_s16_u8_s16_sve);
-DECLARE_ADD_KERNEL(add_u8_s16_s16_sve);
-DECLARE_ADD_KERNEL(add_u8_u8_s16_sve);
 
 #undef DECLARE_ADD_KERNEL
diff --git a/src/core/cpu/kernels/sub/neon/integer.cpp b/src/core/cpu/kernels/sub/neon/integer.cpp
deleted file mode 100644
index bba73df1e8..0000000000
--- a/src/core/cpu/kernels/sub/neon/integer.cpp
+++ /dev/null
@@ -1,183 +0,0 @@
-/*
- * Copyright (c) 2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/ITensor.h"
-#include "arm_compute/core/Types.h"
-#include "arm_compute/core/utils/misc/Traits.h"
-#include "src/core/NEON/wrapper/wrapper.h"
-#include "src/core/helpers/WindowHelpers.h"
-
-namespace arm_compute
-{
-namespace cpu
-{
-namespace
-{
-void sub_s16_u8_s16_impl(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window, bool is_swapped)
-{
-    // Create input windows
-    Window win        = window;
-    Window input1_win = window.broadcast_if_dimension_le_one(src0->info()->tensor_shape());
-    Window input2_win = window.broadcast_if_dimension_le_one(src1->info()->tensor_shape());
-
-    // Clear X Dimension on execution window as we handle manually
-    win.set(Window::DimX, Window::Dimension(0, 1, 1));
-    input1_win.set(Window::DimX, Window::Dimension(0, 1, 1));
-    input2_win.set(Window::DimX, Window::Dimension(0, 1, 1));
-
-    Iterator input1(src0, input1_win);
-    Iterator input2(src1, input2_win);
-    Iterator output(dst, win);
-
-    const int  window_step_x  = 8;
-    const auto window_start_x = static_cast<int>(window.x().start());
-    const auto window_end_x   = static_cast<int>(window.x().end());
-
-    execute_window_loop(win, [&](const Coordinates &)
-    {
-        const auto input1_ptr = reinterpret_cast<const int16_t *>(input1.ptr());
-        const auto input2_ptr = reinterpret_cast<const uint8_t *>(input2.ptr());
-        const auto output_ptr = reinterpret_cast<int16_t *>(output.ptr());
-
-        if(policy == ConvertPolicy::WRAP)
-        {
-            // Compute S elements per iteration
-            int x = window_start_x;
-            for(; x <= (window_end_x - window_step_x); x += window_step_x)
-            {
-                const auto vin1 = wrapper::vloadq(input1_ptr + x);
-                const auto vin2 = vreinterpretq_s16_u16(wrapper::vmovl(wrapper::vload(input2_ptr + x)));
-                const auto res  = is_swapped ? wrapper::vsub(vin2, vin1) : wrapper::vsub(vin1, vin2);
-                wrapper::vstore(output_ptr + x, res);
-            }
-
-            // Compute left-over elements
-            for(; x < window_end_x; ++x)
-            {
-                const auto res = is_swapped ? static_cast<int16_t>(*(input2_ptr + x)) - *(input1_ptr + x) : *(input1_ptr + x) - static_cast<int16_t>(*(input2_ptr + x));
-                *(output_ptr + x) = res;
-            }
-        }
-        else
-        {
-            // Compute S elements per iteration
-            int x = window_start_x;
-            for(; x <= (window_end_x - window_step_x); x += window_step_x)
-            {
-                const auto vin1 = wrapper::vloadq(input1_ptr + x);
-                const auto vin2 = vreinterpretq_s16_u16(wrapper::vmovl(wrapper::vload(input2_ptr + x)));
-                const auto res  = is_swapped ? wrapper::vqsub(vin2, vin1) : wrapper::vqsub(vin1, vin2);
-                wrapper::vstore(output_ptr + x, res);
-            }
-
-            // Compute left-over elements
-            for(; x < window_end_x; ++x)
-            {
-                const auto res = is_swapped ? wrapper::sub_sat(static_cast<int16_t>(*(input2_ptr + x)), *(input1_ptr + x)) : wrapper::sub_sat(*(input1_ptr + x), static_cast<int16_t>(*(input2_ptr + x)));
-                *(output_ptr + x) = res;
-            }
-        }
-    },
-    input1, input2, output);
-}
-}
-
-void sub_s16_u8_s16_neon(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window)
-{
-    sub_s16_u8_s16_impl(src1, src0, dst, policy, window, false);
-}
-
-void sub_u8_s16_s16_neon(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window)
-{
-    // Swap arguments
-    sub_s16_u8_s16_impl(src1, src0, dst, policy, window, true);
-}
-
-void sub_u8_u8_s16_neon(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window)
-{
-    // Create input windows
-    Window win        = window;
-    Window input1_win = window.broadcast_if_dimension_le_one(src0->info()->tensor_shape());
-    Window input2_win = window.broadcast_if_dimension_le_one(src1->info()->tensor_shape());
-
-    // Clear X Dimension on execution window as we handle manually
-    win.set(Window::DimX, Window::Dimension(0, 1, 1));
-    input1_win.set(Window::DimX, Window::Dimension(0, 1, 1));
-    input2_win.set(Window::DimX, Window::Dimension(0, 1, 1));
-
-    Iterator input1(src0, input1_win);
-    Iterator input2(src1, input2_win);
-    Iterator output(dst, win);
-
-    const int  window_step_x  = 8;
-    const auto window_start_x = static_cast<int>(window.x().start());
-    const auto window_end_x   = static_cast<int>(window.x().end());
-
-    execute_window_loop(win, [&](const Coordinates &)
-    {
-        const auto input1_ptr = reinterpret_cast<const uint8_t *>(input1.ptr());
-        const auto input2_ptr = reinterpret_cast<const uint8_t *>(input2.ptr());
-        const auto output_ptr = reinterpret_cast<int16_t *>(output.ptr());
-
-        if(policy == ConvertPolicy::WRAP)
-        {
-            // Compute S elements per iteration
-            int x = window_start_x;
-            for(; x <= (window_end_x - window_step_x); x += window_step_x)
-            {
-                const auto vin1 = vreinterpretq_s16_u16(wrapper::vmovl(wrapper::vload(input1_ptr + x)));
-                const auto vin2 = vreinterpretq_s16_u16(wrapper::vmovl(wrapper::vload(input2_ptr + x)));
-                wrapper::vstore(output_ptr + x, wrapper::vsub(vin1, vin2));
-            }
-
-            // Compute left-over elements
-            for(; x < window_end_x; ++x)
-            {
-                *(output_ptr + x) = static_cast<int16_t>(*(input1_ptr + x)) - static_cast<int16_t>(*(input2_ptr + x));
-            }
-        }
-        else
-        {
-            // Compute S elements per iteration
-            int x = window_start_x;
-            for(; x <= (window_end_x - window_step_x); x += window_step_x)
-            {
-                const auto vin1 = vreinterpretq_s16_u16(wrapper::vmovl(wrapper::vload(input1_ptr + x)));
-                const auto vin2 = vreinterpretq_s16_u16(wrapper::vmovl(wrapper::vload(input2_ptr + x)));
-                wrapper::vstore(output_ptr + x, wrapper::vqsub(vin1, vin2));
-            }
-
-            // Compute left-over elements
-            for(; x < window_end_x; ++x)
-            {
-                *(output_ptr + x) = wrapper::sub_sat(static_cast<int16_t>(*(input1_ptr + x)),
-                                                     static_cast<int16_t>(*(input2_ptr + x)));
-            }
-        }
-    },
-    input1, input2, output);
-}
-
-} // namespace cpu
-} // namespace arm_compute
\ No newline at end of file
diff --git a/src/core/cpu/kernels/sub/neon/list.h b/src/core/cpu/kernels/sub/neon/list.h
index 1ab4e6367b..ac1346001a 100644
--- a/src/core/cpu/kernels/sub/neon/list.h
+++ b/src/core/cpu/kernels/sub/neon/list.h
@@ -38,9 +38,6 @@ namespace cpu
 DECLARE_SUB_KERNEL(sub_qasymm8_neon);
 DECLARE_SUB_KERNEL(sub_qasymm8_signed_neon);
 DECLARE_SUB_KERNEL(sub_qsymm16_neon);
-DECLARE_SUB_KERNEL(sub_s16_u8_s16_neon);
-DECLARE_SUB_KERNEL(sub_u8_s16_s16_neon);
-DECLARE_SUB_KERNEL(sub_u8_u8_s16_neon);
 
 #undef DECLARE_SUB_KERNEL
-- 
cgit v1.2.1
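With the widening configurations removed, a caller that relied on (U8,U8) -> S16 can cast explicitly and add in S16. A hedged migration sketch using the public runtime functions of this release (NECast, NEArithmeticAddition); the signatures should be checked against the installed headers, and dst_s16 is assumed to be initialised and allocated by the caller.

#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/runtime/NEON/functions/NEArithmeticAddition.h"
#include "arm_compute/runtime/NEON/functions/NECast.h"
#include "arm_compute/runtime/Tensor.h"

using namespace arm_compute;

void add_u8_inputs_into_s16(Tensor &a_u8, Tensor &b_u8, Tensor &dst_s16)
{
    // Intermediate S16 views of the U8 inputs
    Tensor a_s16, b_s16;
    a_s16.allocator()->init(TensorInfo(a_u8.info()->tensor_shape(), 1, DataType::S16));
    b_s16.allocator()->init(TensorInfo(b_u8.info()->tensor_shape(), 1, DataType::S16));

    NECast               cast_a, cast_b;
    NEArithmeticAddition add;
    cast_a.configure(&a_u8, &a_s16, ConvertPolicy::WRAP); // U8 -> S16 is lossless
    cast_b.configure(&b_u8, &b_s16, ConvertPolicy::WRAP);
    add.configure(&a_s16, &b_s16, &dst_s16, ConvertPolicy::SATURATE);

    a_s16.allocator()->allocate();
    b_s16.allocator()->allocate();

    cast_a.run();
    cast_b.run();
    add.run();
}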