From 27e67f0b2047cfa2f011f9e242e3068d9e106b39 Mon Sep 17 00:00:00 2001
From: Michalis Spyrou <michalis.spyrou@arm.com>
Date: Tue, 16 Feb 2021 11:34:39 +0000
Subject: Remove Computer Vision Neon support

Resolves COMPMID-4150

Change-Id: I316e8ab97de796666c71eadfde894715fcf4a1aa
Signed-off-by: Michalis Spyrou <michalis.spyrou@arm.com>
Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/5141
Tested-by: Arm Jenkins <bsgcomp@arm.com>
Reviewed-by: Michele Di Giorgio <michele.digiorgio@arm.com>
Comments-Addressed: Arm Jenkins <bsgcomp@arm.com>
---
 src/core/NEON/NEKernels.h                          |   33 -
 .../NEON/kernels/NEAbsoluteDifferenceKernel.cpp    |  209 ----
 src/core/NEON/kernels/NEAbsoluteDifferenceKernel.h |   86 --
 src/core/NEON/kernels/NEAccumulateKernel.cpp       |  359 -------
 src/core/NEON/kernels/NEAccumulateKernel.h         |  183 ----
 src/core/NEON/kernels/NEBox3x3Kernel.cpp           |  194 ----
 src/core/NEON/kernels/NEBox3x3Kernel.h             |   95 --
 src/core/NEON/kernels/NECannyEdgeKernel.cpp        | 1122 --------------------
 src/core/NEON/kernels/NECannyEdgeKernel.h          |  189 ----
 src/core/NEON/kernels/NEChannelCombineKernel.cpp   |  456 --------
 src/core/NEON/kernels/NEChannelCombineKernel.h     |  129 ---
 src/core/NEON/kernels/NEChannelExtractKernel.cpp   |  269 -----
 src/core/NEON/kernels/NEChannelExtractKernel.h     |  111 --
 src/core/NEON/kernels/NEColorConvertKernel.cpp     |  590 ----------
 src/core/NEON/kernels/NEColorConvertKernel.h       |   93 --
 src/core/NEON/kernels/NEDerivativeKernel.cpp       |  231 ----
 src/core/NEON/kernels/NEDerivativeKernel.h         |  100 --
 src/core/NEON/kernels/NEDilateKernel.cpp           |  128 ---
 src/core/NEON/kernels/NEDilateKernel.h             |   65 --
 src/core/NEON/kernels/NEErodeKernel.cpp            |  128 ---
 src/core/NEON/kernels/NEErodeKernel.h              |   65 --
 src/core/NEON/kernels/NEFastCornersKernel.cpp      |  475 ---------
 src/core/NEON/kernels/NEFastCornersKernel.h        |   78 --
 src/core/NEON/kernels/NEGaussian3x3Kernel.cpp      |  135 ---
 src/core/NEON/kernels/NEGaussian3x3Kernel.h        |   66 --
 src/core/NEON/kernels/NEGaussian5x5Kernel.cpp      |  211 ----
 src/core/NEON/kernels/NEGaussian5x5Kernel.h        |  103 --
 src/core/NEON/kernels/NEGaussianPyramidKernel.cpp  |  272 -----
 src/core/NEON/kernels/NEGaussianPyramidKernel.h    |  105 --
 src/core/NEON/kernels/NEHOGDescriptorKernel.cpp    |  806 --------------
 src/core/NEON/kernels/NEHOGDescriptorKernel.h      |  149 ---
 src/core/NEON/kernels/NEHOGDetectorKernel.cpp      |  189 ----
 src/core/NEON/kernels/NEHOGDetectorKernel.h        |   89 --
 src/core/NEON/kernels/NEHarrisCornersKernel.cpp    |  817 --------------
 src/core/NEON/kernels/NEHarrisCornersKernel.h      |  105 --
 src/core/NEON/kernels/NEHistogramKernel.cpp        |  249 -----
 src/core/NEON/kernels/NEHistogramKernel.h          |  135 ---
 src/core/NEON/kernels/NEIntegralImageKernel.cpp    |  144 ---
 src/core/NEON/kernels/NEIntegralImageKernel.h      |   66 --
 src/core/NEON/kernels/NEMagnitudePhaseKernel.cpp   |  490 ---------
 src/core/NEON/kernels/NEMagnitudePhaseKernel.h     |  101 --
 src/core/NEON/kernels/NEMeanStdDevKernel.cpp       |  162 ---
 src/core/NEON/kernels/NEMeanStdDevKernel.h         |   83 --
 src/core/NEON/kernels/NEMedian3x3Kernel.cpp        |  137 ---
 src/core/NEON/kernels/NEMedian3x3Kernel.h          |   66 --
 src/core/NEON/kernels/NEMinMaxLocationKernel.cpp   |  478 ---------
 src/core/NEON/kernels/NEMinMaxLocationKernel.h     |  171 ---
 src/core/NEON/kernels/NENonLinearFilterKernel.cpp  | 1018 ------------------
 src/core/NEON/kernels/NENonLinearFilterKernel.h    |  153 ---
 .../NEON/kernels/NENonMaximaSuppression3x3Kernel.h |    1 -
 src/core/NEON/kernels/NERemapKernel.cpp            |  237 -----
 src/core/NEON/kernels/NERemapKernel.h              |   83 --
 src/core/NEON/kernels/NEScharr3x3Kernel.cpp        |  262 -----
 src/core/NEON/kernels/NEScharr3x3Kernel.h          |   86 --
 src/core/NEON/kernels/NESobel3x3Kernel.cpp         |  272 -----
 src/core/NEON/kernels/NESobel3x3Kernel.h           |   86 --
 src/core/NEON/kernels/NESobel5x5Kernel.cpp         |  406 -------
 src/core/NEON/kernels/NESobel5x5Kernel.h           |  126 ---
 src/core/NEON/kernels/NESobel7x7Kernel.cpp         |  524 ---------
 src/core/NEON/kernels/NESobel7x7Kernel.h           |  130 ---
 src/core/NEON/kernels/NETableLookupKernel.cpp      |  143 ---
 src/core/NEON/kernels/NETableLookupKernel.h        |   82 --
 src/core/NEON/kernels/NEThresholdKernel.cpp        |  217 ----
 src/core/NEON/kernels/NEThresholdKernel.h          |   88 --
 src/core/NEON/kernels/NEWarpKernel.cpp             |  807 --------------
 src/core/NEON/kernels/NEWarpKernel.h               |  131 ---
 66 files changed, 15569 deletions(-)
 delete mode 100644 src/core/NEON/kernels/NEAbsoluteDifferenceKernel.cpp
 delete mode 100644 src/core/NEON/kernels/NEAbsoluteDifferenceKernel.h
 delete mode 100644 src/core/NEON/kernels/NEAccumulateKernel.cpp
 delete mode 100644 src/core/NEON/kernels/NEAccumulateKernel.h
 delete mode 100644 src/core/NEON/kernels/NEBox3x3Kernel.cpp
 delete mode 100644 src/core/NEON/kernels/NEBox3x3Kernel.h
 delete mode 100644 src/core/NEON/kernels/NECannyEdgeKernel.cpp
 delete mode 100644 src/core/NEON/kernels/NECannyEdgeKernel.h
 delete mode 100644 src/core/NEON/kernels/NEChannelCombineKernel.cpp
 delete mode 100644 src/core/NEON/kernels/NEChannelCombineKernel.h
 delete mode 100644 src/core/NEON/kernels/NEChannelExtractKernel.cpp
 delete mode 100644 src/core/NEON/kernels/NEChannelExtractKernel.h
 delete mode 100644 src/core/NEON/kernels/NEColorConvertKernel.cpp
 delete mode 100644 src/core/NEON/kernels/NEColorConvertKernel.h
 delete mode 100644 src/core/NEON/kernels/NEDerivativeKernel.cpp
 delete mode 100644 src/core/NEON/kernels/NEDerivativeKernel.h
 delete mode 100644 src/core/NEON/kernels/NEDilateKernel.cpp
 delete mode 100644 src/core/NEON/kernels/NEDilateKernel.h
 delete mode 100644 src/core/NEON/kernels/NEErodeKernel.cpp
 delete mode 100644 src/core/NEON/kernels/NEErodeKernel.h
 delete mode 100644 src/core/NEON/kernels/NEFastCornersKernel.cpp
 delete mode 100644 src/core/NEON/kernels/NEFastCornersKernel.h
 delete mode 100644 src/core/NEON/kernels/NEGaussian3x3Kernel.cpp
 delete mode 100644 src/core/NEON/kernels/NEGaussian3x3Kernel.h
 delete mode 100644 src/core/NEON/kernels/NEGaussian5x5Kernel.cpp
 delete mode 100644 src/core/NEON/kernels/NEGaussian5x5Kernel.h
 delete mode 100644 src/core/NEON/kernels/NEGaussianPyramidKernel.cpp
 delete mode 100644 src/core/NEON/kernels/NEGaussianPyramidKernel.h
 delete mode 100644 src/core/NEON/kernels/NEHOGDescriptorKernel.cpp
 delete mode 100644 src/core/NEON/kernels/NEHOGDescriptorKernel.h
 delete mode 100644 src/core/NEON/kernels/NEHOGDetectorKernel.cpp
 delete mode 100644 src/core/NEON/kernels/NEHOGDetectorKernel.h
 delete mode 100644 src/core/NEON/kernels/NEHarrisCornersKernel.cpp
 delete mode 100644 src/core/NEON/kernels/NEHarrisCornersKernel.h
 delete mode 100644 src/core/NEON/kernels/NEHistogramKernel.cpp
 delete mode 100644 src/core/NEON/kernels/NEHistogramKernel.h
 delete mode 100644 src/core/NEON/kernels/NEIntegralImageKernel.cpp
 delete mode 100644 src/core/NEON/kernels/NEIntegralImageKernel.h
 delete mode 100644 src/core/NEON/kernels/NEMagnitudePhaseKernel.cpp
 delete mode 100644 src/core/NEON/kernels/NEMagnitudePhaseKernel.h
 delete mode 100644 src/core/NEON/kernels/NEMeanStdDevKernel.cpp
 delete mode 100644 src/core/NEON/kernels/NEMeanStdDevKernel.h
 delete mode 100644 src/core/NEON/kernels/NEMedian3x3Kernel.cpp
 delete mode 100644 src/core/NEON/kernels/NEMedian3x3Kernel.h
 delete mode 100644 src/core/NEON/kernels/NEMinMaxLocationKernel.cpp
 delete mode 100644 src/core/NEON/kernels/NEMinMaxLocationKernel.h
 delete mode 100644 src/core/NEON/kernels/NENonLinearFilterKernel.cpp
 delete mode 100644 src/core/NEON/kernels/NENonLinearFilterKernel.h
 delete mode 100644 src/core/NEON/kernels/NERemapKernel.cpp
 delete mode 100644 src/core/NEON/kernels/NERemapKernel.h
 delete mode 100644 src/core/NEON/kernels/NEScharr3x3Kernel.cpp
 delete mode 100644 src/core/NEON/kernels/NEScharr3x3Kernel.h
 delete mode 100644 src/core/NEON/kernels/NESobel3x3Kernel.cpp
 delete mode 100644 src/core/NEON/kernels/NESobel3x3Kernel.h
 delete mode 100644 src/core/NEON/kernels/NESobel5x5Kernel.cpp
 delete mode 100644 src/core/NEON/kernels/NESobel5x5Kernel.h
 delete mode 100644 src/core/NEON/kernels/NESobel7x7Kernel.cpp
 delete mode 100644 src/core/NEON/kernels/NESobel7x7Kernel.h
 delete mode 100644 src/core/NEON/kernels/NETableLookupKernel.cpp
 delete mode 100644 src/core/NEON/kernels/NETableLookupKernel.h
 delete mode 100644 src/core/NEON/kernels/NEThresholdKernel.cpp
 delete mode 100644 src/core/NEON/kernels/NEThresholdKernel.h
 delete mode 100644 src/core/NEON/kernels/NEWarpKernel.cpp
 delete mode 100644 src/core/NEON/kernels/NEWarpKernel.h

(limited to 'src/core/NEON')

diff --git a/src/core/NEON/NEKernels.h b/src/core/NEON/NEKernels.h
index b962c9eeee..aea245c6fb 100644
--- a/src/core/NEON/NEKernels.h
+++ b/src/core/NEON/NEKernels.h
@@ -24,9 +24,6 @@
 #ifndef ARM_COMPUTE_NEKERNELS_H
 #define ARM_COMPUTE_NEKERNELS_H
 
-/* Header regrouping all the Neon kernels */
-#include "src/core/NEON/kernels/NEAbsoluteDifferenceKernel.h"
-#include "src/core/NEON/kernels/NEAccumulateKernel.h"
 #include "src/core/NEON/kernels/NEBatchNormalizationLayerKernel.h"
 #include "src/core/NEON/kernels/NEBatchToSpaceLayerKernel.h"
 #include "src/core/NEON/kernels/NEBitwiseAndKernel.h"
@@ -34,13 +31,8 @@
 #include "src/core/NEON/kernels/NEBitwiseOrKernel.h"
 #include "src/core/NEON/kernels/NEBitwiseXorKernel.h"
 #include "src/core/NEON/kernels/NEBoundingBoxTransformKernel.h"
-#include "src/core/NEON/kernels/NEBox3x3Kernel.h"
-#include "src/core/NEON/kernels/NECannyEdgeKernel.h"
-#include "src/core/NEON/kernels/NEChannelCombineKernel.h"
-#include "src/core/NEON/kernels/NEChannelExtractKernel.h"
 #include "src/core/NEON/kernels/NEChannelShuffleLayerKernel.h"
 #include "src/core/NEON/kernels/NECol2ImKernel.h"
-#include "src/core/NEON/kernels/NEColorConvertKernel.h"
 #include "src/core/NEON/kernels/NEConvertFullyConnectedWeightsKernel.h"
 #include "src/core/NEON/kernels/NEConvertQuantizedSignednessKernel.h"
 #include "src/core/NEON/kernels/NEConvolutionKernel.h"
@@ -50,15 +42,11 @@
 #include "src/core/NEON/kernels/NEDepthToSpaceLayerKernel.h"
 #include "src/core/NEON/kernels/NEDepthwiseConvolutionLayerNativeKernel.h"
 #include "src/core/NEON/kernels/NEDequantizationLayerKernel.h"
-#include "src/core/NEON/kernels/NEDerivativeKernel.h"
-#include "src/core/NEON/kernels/NEDilateKernel.h"
 #include "src/core/NEON/kernels/NEDirectConvolutionLayerKernel.h"
 #include "src/core/NEON/kernels/NEDirectConvolutionLayerOutputStageKernel.h"
-#include "src/core/NEON/kernels/NEErodeKernel.h"
 #include "src/core/NEON/kernels/NEFFTDigitReverseKernel.h"
 #include "src/core/NEON/kernels/NEFFTRadixStageKernel.h"
 #include "src/core/NEON/kernels/NEFFTScaleKernel.h"
-#include "src/core/NEON/kernels/NEFastCornersKernel.h"
 #include "src/core/NEON/kernels/NEFillArrayKernel.h"
 #include "src/core/NEON/kernels/NEFillBorderKernel.h"
 #include "src/core/NEON/kernels/NEFuseBatchNormalizationKernel.h"
@@ -75,28 +63,15 @@
 #include "src/core/NEON/kernels/NEGEMMMatrixMultiplyKernel.h"
 #include "src/core/NEON/kernels/NEGEMMTranspose1xWKernel.h"
 #include "src/core/NEON/kernels/NEGatherKernel.h"
-#include "src/core/NEON/kernels/NEGaussian3x3Kernel.h"
-#include "src/core/NEON/kernels/NEGaussian5x5Kernel.h"
-#include "src/core/NEON/kernels/NEGaussianPyramidKernel.h"
 #include "src/core/NEON/kernels/NEGenerateProposalsLayerKernel.h"
-#include "src/core/NEON/kernels/NEHOGDescriptorKernel.h"
-#include "src/core/NEON/kernels/NEHOGDetectorKernel.h"
-#include "src/core/NEON/kernels/NEHarrisCornersKernel.h"
-#include "src/core/NEON/kernels/NEHistogramKernel.h"
 #include "src/core/NEON/kernels/NEIm2ColKernel.h"
 #include "src/core/NEON/kernels/NEInstanceNormalizationLayerKernel.h"
-#include "src/core/NEON/kernels/NEIntegralImageKernel.h"
 #include "src/core/NEON/kernels/NEL2NormalizeLayerKernel.h"
 #include "src/core/NEON/kernels/NELKTrackerKernel.h"
 #include "src/core/NEON/kernels/NELogicalKernel.h"
-#include "src/core/NEON/kernels/NEMagnitudePhaseKernel.h"
 #include "src/core/NEON/kernels/NEMaxUnpoolingLayerKernel.h"
-#include "src/core/NEON/kernels/NEMeanStdDevKernel.h"
 #include "src/core/NEON/kernels/NEMeanStdDevNormalizationKernel.h"
-#include "src/core/NEON/kernels/NEMedian3x3Kernel.h"
 #include "src/core/NEON/kernels/NEMinMaxLayerKernel.h"
-#include "src/core/NEON/kernels/NEMinMaxLocationKernel.h"
-#include "src/core/NEON/kernels/NENonLinearFilterKernel.h"
 #include "src/core/NEON/kernels/NENonMaximaSuppression3x3Kernel.h"
 #include "src/core/NEON/kernels/NENormalizationLayerKernel.h"
 #include "src/core/NEON/kernels/NEPadLayerKernel.h"
@@ -108,24 +83,16 @@
 #include "src/core/NEON/kernels/NEROIPoolingLayerKernel.h"
 #include "src/core/NEON/kernels/NERangeKernel.h"
 #include "src/core/NEON/kernels/NEReductionOperationKernel.h"
-#include "src/core/NEON/kernels/NERemapKernel.h"
 #include "src/core/NEON/kernels/NEReorgLayerKernel.h"
 #include "src/core/NEON/kernels/NEReverseKernel.h"
 #include "src/core/NEON/kernels/NEScaleKernel.h"
-#include "src/core/NEON/kernels/NEScharr3x3Kernel.h"
 #include "src/core/NEON/kernels/NESelectKernel.h"
-#include "src/core/NEON/kernels/NESobel3x3Kernel.h"
-#include "src/core/NEON/kernels/NESobel5x5Kernel.h"
-#include "src/core/NEON/kernels/NESobel7x7Kernel.h"
 #include "src/core/NEON/kernels/NESpaceToBatchLayerKernel.h"
 #include "src/core/NEON/kernels/NESpaceToDepthLayerKernel.h"
 #include "src/core/NEON/kernels/NEStackLayerKernel.h"
 #include "src/core/NEON/kernels/NEStridedSliceKernel.h"
-#include "src/core/NEON/kernels/NETableLookupKernel.h"
-#include "src/core/NEON/kernels/NEThresholdKernel.h"
 #include "src/core/NEON/kernels/NETileKernel.h"
 #include "src/core/NEON/kernels/NETransposeKernel.h"
-#include "src/core/NEON/kernels/NEWarpKernel.h"
 #include "src/core/NEON/kernels/NEWeightsReshapeKernel.h"
 #include "src/core/NEON/kernels/NEWinogradConvolutionLayerKernel.h"
 
diff --git a/src/core/NEON/kernels/NEAbsoluteDifferenceKernel.cpp b/src/core/NEON/kernels/NEAbsoluteDifferenceKernel.cpp
deleted file mode 100644
index a6a41b8af9..0000000000
--- a/src/core/NEON/kernels/NEAbsoluteDifferenceKernel.cpp
+++ /dev/null
@@ -1,209 +0,0 @@
-/*
- * Copyright (c) 2016-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "src/core/NEON/kernels/NEAbsoluteDifferenceKernel.h"
-
-#include "arm_compute/core/Error.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/IAccessWindow.h"
-#include "arm_compute/core/ITensor.h"
-#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Types.h"
-#include "arm_compute/core/Validate.h"
-#include "src/core/helpers/AutoConfiguration.h"
-#include "src/core/helpers/WindowHelpers.h"
-
-#include <arm_neon.h>
-
-namespace arm_compute
-{
-namespace
-{
-void abs_diff_U8_U8_U8(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
-{
-    Iterator input1(in1, window);
-    Iterator input2(in2, window);
-    Iterator output(out, window);
-
-    execute_window_loop(window, [&](const Coordinates &)
-    {
-        const uint8x16_t input1_val = vld1q_u8(input1.ptr());
-        const uint8x16_t input2_val = vld1q_u8(input2.ptr());
-
-        vst1q_u8(output.ptr(), vabdq_u8(input1_val, input2_val));
-    },
-    input1, input2, output);
-}
-
-inline int16x8x2_t vqabd2q_s16(const int16x8x2_t &v1, const int16x8x2_t &v2)
-{
-    const int16x8x2_t res =
-    {
-        {
-            vqabsq_s16(vqsubq_s16(v1.val[0], v2.val[0])),
-            vqabsq_s16(vqsubq_s16(v1.val[1], v2.val[1]))
-        }
-    };
-
-    return res;
-}
-
-void abs_diff_S16_S16_S16(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
-{
-    Iterator input1(in1, window);
-    Iterator input2(in2, window);
-    Iterator output(out, window);
-
-    execute_window_loop(window, [&](const Coordinates &)
-    {
-        int16x8x2_t input1_val = vld2q_s16(reinterpret_cast<const int16_t *>(input1.ptr()));
-        int16x8x2_t input2_val = vld2q_s16(reinterpret_cast<const int16_t *>(input2.ptr()));
-        vst2q_s16(reinterpret_cast<int16_t *>(output.ptr()), vqabd2q_s16(input1_val, input2_val));
-    },
-    input1, input2, output);
-}
-
-void abs_diff_U8_S16_S16(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
-{
-    Iterator input1(in1, window);
-    Iterator input2(in2, window);
-    Iterator output(out, window);
-
-    execute_window_loop(window, [&](const Coordinates &)
-    {
-        const uint8x16_t  input1_val = vld1q_u8(input1.ptr());
-        const int16x8x2_t input2_val =
-        {
-            {
-                vld1q_s16(reinterpret_cast<const int16_t *>(input2.ptr())),
-                vld1q_s16(reinterpret_cast<const int16_t *>(input2.ptr()) + 8)
-            }
-        };
-
-        const int16x8x2_t out_val =
-        {
-            {
-                vqabsq_s16(vqsubq_s16(vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(input1_val))), input2_val.val[0])),
-                vqabsq_s16(vqsubq_s16(vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(input1_val))), input2_val.val[1]))
-            }
-        };
-
-        vst1q_s16(reinterpret_cast<int16_t *>(output.ptr()), out_val.val[0]);
-        vst1q_s16(reinterpret_cast<int16_t *>(output.ptr()) + 8, out_val.val[1]);
-
-    },
-    input1, input2, output);
-}
-
-void abs_diff_S16_U8_S16(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
-{
-    abs_diff_U8_S16_S16(in2, in1, out, window);
-}
-} // namespace
-
-NEAbsoluteDifferenceKernel::NEAbsoluteDifferenceKernel()
-    : _func(nullptr), _input1(nullptr), _input2(nullptr), _output(nullptr)
-{
-}
-
-void NEAbsoluteDifferenceKernel::configure(const ITensor *input1, const ITensor *input2, ITensor *output)
-{
-    ARM_COMPUTE_ERROR_ON_NULLPTR(input1, input2, output);
-
-    set_shape_if_empty(*output->info(), input1->info()->tensor_shape());
-
-    if(input1->info()->data_type() == DataType::S16 || input2->info()->data_type() == DataType::S16)
-    {
-        set_format_if_unknown(*output->info(), Format::S16);
-    }
-    else if(input1->info()->data_type() == DataType::U8 || input2->info()->data_type() == DataType::U8)
-    {
-        set_format_if_unknown(*output->info(), Format::U8);
-    }
-
-    ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(input1, input2, output);
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input1, 1, DataType::U8, DataType::S16);
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input2, 1, DataType::U8, DataType::S16);
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::S16);
-    ARM_COMPUTE_ERROR_ON_MSG(output->info()->data_type() == DataType::U8 && (input1->info()->data_type() != DataType::U8 || input2->info()->data_type() != DataType::U8),
-                             "The output image can only be U8 if both input images are U8");
-
-    _input1 = input1;
-    _input2 = input2;
-    _output = output;
-
-    const DataType input1_data_type = input1->info()->data_type();
-    const DataType input2_data_type = input2->info()->data_type();
-
-    if(input1_data_type == input2_data_type)
-    {
-        if(input1_data_type == DataType::U8)
-        {
-            _func = &abs_diff_U8_U8_U8;
-        }
-        else
-        {
-            _func = &abs_diff_S16_S16_S16;
-        }
-    }
-    else
-    {
-        if(input1_data_type == DataType::U8)
-        {
-            _func = &abs_diff_U8_S16_S16;
-        }
-        else
-        {
-            _func = &abs_diff_S16_U8_S16;
-        }
-    }
-
-    constexpr unsigned int num_elems_processed_per_iteration = 16;
-
-    // Configure kernel window
-    Window                 win = calculate_max_window(*input1->info(), Steps(num_elems_processed_per_iteration));
-    AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration);
-
-    update_window_and_padding(win,
-                              AccessWindowHorizontal(input1->info(), 0, num_elems_processed_per_iteration),
-                              AccessWindowHorizontal(input2->info(), 0, num_elems_processed_per_iteration),
-                              output_access);
-
-    ValidRegion valid_region = intersect_valid_regions(input1->info()->valid_region(),
-                                                       input2->info()->valid_region());
-
-    output_access.set_valid_region(win, valid_region);
-
-    INEKernel::configure(win);
-}
-
-void NEAbsoluteDifferenceKernel::run(const Window &window, const ThreadInfo &info)
-{
-    ARM_COMPUTE_UNUSED(info);
-    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
-    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
-    ARM_COMPUTE_ERROR_ON(_func == nullptr);
-
-    _func(_input1, _input2, _output, window);
-}
-} // namespace arm_compute
diff --git a/src/core/NEON/kernels/NEAbsoluteDifferenceKernel.h b/src/core/NEON/kernels/NEAbsoluteDifferenceKernel.h
deleted file mode 100644
index cc95172f35..0000000000
--- a/src/core/NEON/kernels/NEAbsoluteDifferenceKernel.h
+++ /dev/null
@@ -1,86 +0,0 @@
-/*
- * Copyright (c) 2016-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_NEABSOLUTEDIFFERENCEKERNEL_H
-#define ARM_COMPUTE_NEABSOLUTEDIFFERENCEKERNEL_H
-
-#include "src/core/NEON/INEKernel.h"
-
-namespace arm_compute
-{
-class ITensor;
-
-/** Interface for the absolute difference kernel
- *
- * Absolute difference is computed by:
- * @f[ output(x,y) = | input1(x,y) - input2(x,y) | @f]
- */
-class NEAbsoluteDifferenceKernel : public INEKernel
-{
-public:
-    const char *name() const override
-    {
-        return "NEAbsoluteDifferenceKernel";
-    }
-    /** Default constructor */
-    NEAbsoluteDifferenceKernel();
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    NEAbsoluteDifferenceKernel(const NEAbsoluteDifferenceKernel &) = delete;
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    NEAbsoluteDifferenceKernel &operator=(const NEAbsoluteDifferenceKernel &) = delete;
-    /** Allow instances of this class to be moved */
-    NEAbsoluteDifferenceKernel(NEAbsoluteDifferenceKernel &&) = default;
-    /** Allow instances of this class to be moved */
-    NEAbsoluteDifferenceKernel &operator=(NEAbsoluteDifferenceKernel &&) = default;
-    /** Default destructor */
-    ~NEAbsoluteDifferenceKernel() = default;
-
-    /** Set the inputs and output tensors
-     *
-     * @param[in]  input1 Source tensor. Data types supported: U8/S16
-     * @param[in]  input2 Source tensor. Data types supported: U8/S16
-     * @param[out] output Destination tensor, Data types supported: U8/S16
-     */
-    void configure(const ITensor *input1, const ITensor *input2, ITensor *output);
-
-    // Inherited methods overridden:
-    void run(const Window &window, const ThreadInfo &info) override;
-
-private:
-    /** Common signature for all the specialised absolute difference functions
-     *
-     * @param[in]  input1 An input tensor. Data types supported: U8/S16.
-     * @param[in]  input2 An input tensor. Data types supported: U8/S16.
-     * @param[out] output The output tensor, Data types supported: U8 (Only if both inputs are U8), S16.
-     * @param[in]  window Region on which to execute the kernel.
-     */
-    using AbsDiffFunction = void(const ITensor *input1, const ITensor *input2, ITensor *output, const Window &window);
-
-    /** Absolute difference function to use for the particular tensor formats passed to configure() */
-    AbsDiffFunction *_func;
-    const ITensor   *_input1;
-    const ITensor   *_input2;
-    ITensor         *_output;
-};
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_NEABSOLUTEDIFFERENCEKERNEL_H */
diff --git a/src/core/NEON/kernels/NEAccumulateKernel.cpp b/src/core/NEON/kernels/NEAccumulateKernel.cpp
deleted file mode 100644
index 46179cadcb..0000000000
--- a/src/core/NEON/kernels/NEAccumulateKernel.cpp
+++ /dev/null
@@ -1,359 +0,0 @@
-/*
- * Copyright (c) 2016-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "src/core/NEON/kernels/NEAccumulateKernel.h"
-
-#include "arm_compute/core/Error.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/IAccessWindow.h"
-#include "arm_compute/core/Types.h"
-#include "arm_compute/core/Validate.h"
-#include "src/core/helpers/AutoConfiguration.h"
-#include "src/core/helpers/WindowHelpers.h"
-
-#include <arm_neon.h>
-
-namespace arm_compute
-{
-/* Max S16 value used for saturation purposes. */
-const static uint16x8_t max_int_u16 = vdupq_n_u16(static_cast<uint16_t>(INT16_MAX));
-
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-namespace fp16
-{
-inline float16x8x2_t convert_u8x16_to_f16x8x2(uint8x16_t input)
-{
-    const float16x8x2_t out =
-    {
-        {
-            vcvtq_f16_u16(vmovl_u8(vget_low_u8(input))),
-            vcvtq_f16_u16(vmovl_u8(vget_high_u8(input)))
-        }
-    };
-
-    return out;
-}
-
-inline uint8x16_t convert_f16x8x2_to_u8x16(const float16x8x2_t &input)
-{
-    return vcombine_u8(vmovn_u16(vcvtq_u16_f16(input.val[0])),
-                       vmovn_u16(vcvtq_u16_f16(input.val[1])));
-}
-
-inline float16x8x2_t vector_accumulate_weighted(const float16x8x2_t &vec0, const float16x8x2_t &vec1, float16x8_t scale_val, float16x8_t scale_val2)
-{
-    const float16x8x2_t res =
-    {
-        {
-            vfmaq_f16(vmulq_f16(vec1.val[0], scale_val), vec0.val[0], scale_val2),
-            vfmaq_f16(vmulq_f16(vec1.val[1], scale_val), vec0.val[1], scale_val2)
-        }
-    };
-
-    return res;
-}
-
-void acc_we_v16_u8(const void *__restrict input, void *__restrict accum, float16x8_t scale_val, float16x8_t scale_val2)
-{
-    ARM_COMPUTE_ERROR_ON(nullptr == input);
-    ARM_COMPUTE_ERROR_ON(nullptr == accum);
-
-    const auto input_ptr = static_cast<const uint8_t *__restrict>(input);
-    const auto accum_ptr = static_cast<uint8_t *__restrict>(accum);
-
-    const uint8x16x4_t input_buffer = vld4q_u8(input_ptr);
-    uint8x16x4_t       accum_buffer = vld4q_u8(accum_ptr);
-
-    const float16x8x2_t f16_input_0 = convert_u8x16_to_f16x8x2(input_buffer.val[0]);
-    const float16x8x2_t f16_input_1 = convert_u8x16_to_f16x8x2(input_buffer.val[1]);
-    const float16x8x2_t f16_input_2 = convert_u8x16_to_f16x8x2(input_buffer.val[2]);
-    const float16x8x2_t f16_input_3 = convert_u8x16_to_f16x8x2(input_buffer.val[3]);
-
-    float16x8x2_t f16_accum_0 = convert_u8x16_to_f16x8x2(accum_buffer.val[0]);
-    float16x8x2_t f16_accum_1 = convert_u8x16_to_f16x8x2(accum_buffer.val[1]);
-    float16x8x2_t f16_accum_2 = convert_u8x16_to_f16x8x2(accum_buffer.val[2]);
-    float16x8x2_t f16_accum_3 = convert_u8x16_to_f16x8x2(accum_buffer.val[3]);
-
-    f16_accum_0 = vector_accumulate_weighted(f16_input_0, f16_accum_0, scale_val, scale_val2);
-    f16_accum_1 = vector_accumulate_weighted(f16_input_1, f16_accum_1, scale_val, scale_val2);
-    f16_accum_2 = vector_accumulate_weighted(f16_input_2, f16_accum_2, scale_val, scale_val2);
-    f16_accum_3 = vector_accumulate_weighted(f16_input_3, f16_accum_3, scale_val, scale_val2);
-
-    accum_buffer = { {
-            convert_f16x8x2_to_u8x16(f16_accum_0),
-            convert_f16x8x2_to_u8x16(f16_accum_1),
-            convert_f16x8x2_to_u8x16(f16_accum_2),
-            convert_f16x8x2_to_u8x16(f16_accum_3)
-        }
-    };
-
-    vst4q_u8(accum_ptr, accum_buffer);
-}
-} // namespace fp16
-
-void NEAccumulateWeightedFP16Kernel::run(const Window &window, const ThreadInfo &info)
-{
-    ARM_COMPUTE_UNUSED(info);
-    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
-    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INESimpleKernel::window(), window);
-
-    Iterator input(_input, window);
-    Iterator accum(_output, window);
-
-    const float16x8_t scale_val  = vdupq_n_f16(1.f - _alpha);
-    const float16x8_t scale_val2 = vdupq_n_f16(_alpha);
-
-    execute_window_loop(window, [&](const Coordinates &)
-    {
-        fp16::acc_we_v16_u8(input.ptr(), accum.ptr(), scale_val, scale_val2);
-    },
-    input, accum);
-}
-#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
-
-namespace
-{
-inline void acc_v16_u8(const void *__restrict input, void *__restrict accum)
-{
-    ARM_COMPUTE_ERROR_ON(nullptr == input);
-    ARM_COMPUTE_ERROR_ON(nullptr == accum);
-
-    const auto in  = static_cast<const uint8_t *__restrict>(input);
-    const auto out = static_cast<int16_t *__restrict>(accum);
-
-    uint8x16_t ta1 = vld1q_u8(in);
-    int16x8_t  ta2 = vld1q_s16(out);
-    int16x8_t  ta3 = vld1q_s16(out + 8);
-
-    ta2 = vqaddq_s16(ta2, vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(ta1))));
-    ta3 = vqaddq_s16(ta3, vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(ta1))));
-
-    vst1q_s16(out, ta2);
-    vst1q_s16(out + 8, ta3);
-}
-
-inline float32x4x4_t convert_u8x16_to_f32x4x4(uint8x16_t input)
-{
-    const uint16x8_t u16_output_low = vmovl_u8(vget_low_u8(input));
-    const uint16x8_t u16_output_hi  = vmovl_u8(vget_high_u8(input));
-
-    const float32x4x4_t res =
-    {
-        {
-            vcvtq_f32_u32(vmovl_u16(vget_low_u16(u16_output_low))),
-            vcvtq_f32_u32(vmovl_u16(vget_high_u16(u16_output_low))),
-            vcvtq_f32_u32(vmovl_u16(vget_low_u16(u16_output_hi))),
-            vcvtq_f32_u32(vmovl_u16(vget_high_u16(u16_output_hi)))
-        }
-    };
-
-    return res;
-}
-
-inline uint8x16_t convert_f32x4x4_to_u8x16(const float32x4x4_t &input)
-{
-    return vcombine_u8(vmovn_u16(vcombine_u16(vmovn_u32(vcvtq_u32_f32(input.val[0])),
-                                              vmovn_u32(vcvtq_u32_f32(input.val[1])))),
-                       vmovn_u16(vcombine_u16(vmovn_u32(vcvtq_u32_f32(input.val[2])),
-                                              vmovn_u32(vcvtq_u32_f32(input.val[3])))));
-}
-
-inline float32x4x4_t vector_accumulate_weighted(const float32x4x4_t &vector_input, float32x4x4_t vector_output, float32x4_t scale_val, float32x4_t scale_val2)
-{
-    vector_output.val[0] = vmulq_f32(vector_output.val[0], scale_val);
-    vector_output.val[1] = vmulq_f32(vector_output.val[1], scale_val);
-    vector_output.val[2] = vmulq_f32(vector_output.val[2], scale_val);
-    vector_output.val[3] = vmulq_f32(vector_output.val[3], scale_val);
-
-    vector_output.val[0] = vmlaq_f32(vector_output.val[0], vector_input.val[0], scale_val2);
-    vector_output.val[1] = vmlaq_f32(vector_output.val[1], vector_input.val[1], scale_val2);
-    vector_output.val[2] = vmlaq_f32(vector_output.val[2], vector_input.val[2], scale_val2);
-    vector_output.val[3] = vmlaq_f32(vector_output.val[3], vector_input.val[3], scale_val2);
-
-    return vector_output;
-}
-
-inline void acc_we_v16_u8(const void *__restrict input, void *__restrict accum, const float32x4_t scale_val, const float32x4_t scale_val2)
-{
-    ARM_COMPUTE_ERROR_ON(nullptr == input);
-    ARM_COMPUTE_ERROR_ON(nullptr == accum);
-
-    const auto input_ptr = static_cast<const uint8_t *__restrict>(input);
-    const auto accum_ptr = static_cast<uint8_t *__restrict>(accum);
-
-    const uint8x16_t input_buffer = vld1q_u8(input_ptr);
-    const uint8x16_t accum_buffer = vld1q_u8(accum_ptr);
-
-    const float32x4x4_t f32_input_0  = convert_u8x16_to_f32x4x4(input_buffer);
-    const float32x4x4_t f32_output_0 = convert_u8x16_to_f32x4x4(accum_buffer);
-
-    const float32x4x4_t f32_res_0 = vector_accumulate_weighted(f32_input_0, f32_output_0, scale_val, scale_val2);
-
-    vst1q_u8(accum_ptr, convert_f32x4x4_to_u8x16(f32_res_0));
-}
-
-void acc_sq_v16_u8(const void *__restrict input, uint32_t shift, void *__restrict accum)
-{
-    ARM_COMPUTE_ERROR_ON(nullptr == input);
-    ARM_COMPUTE_ERROR_ON(nullptr == accum);
-    ARM_COMPUTE_ERROR_ON(shift > 15);
-
-    const auto input_buffer = static_cast<const uint8_t *__restrict>(input);
-    const auto accum_buffer = static_cast<int16_t *__restrict>(accum);
-
-    const uint8x16_t ta1 = vld1q_u8(input_buffer);
-    uint16x8_t       ta2 = vreinterpretq_u16_s16(vld1q_s16(accum_buffer));
-    uint16x8_t       ta3 = vreinterpretq_u16_s16(vld1q_s16(accum_buffer + 8));
-
-    const int16x8_t vector_shift = vdupq_n_s16(-static_cast<int16_t>(shift));
-
-    uint16x8_t linput = vmovl_u8(vget_low_u8(ta1));
-    uint16x8_t hinput = vmovl_u8(vget_high_u8(ta1));
-
-    linput = vmulq_u16(linput, linput);
-    hinput = vmulq_u16(hinput, hinput);
-
-    linput = vqshlq_u16(linput, vector_shift);
-    hinput = vqshlq_u16(hinput, vector_shift);
-
-    ta2 = vqaddq_u16(ta2, linput);
-    ta3 = vqaddq_u16(ta3, hinput);
-
-    vst1q_s16(accum_buffer, vreinterpretq_s16_u16(vminq_u16(max_int_u16, ta2)));
-    vst1q_s16(accum_buffer + 8, vreinterpretq_s16_u16(vminq_u16(max_int_u16, ta3)));
-}
-} // namespace
-
-void NEAccumulateKernel::configure(const ITensor *input, ITensor *accum)
-{
-    ARM_COMPUTE_ERROR_ON_NULLPTR(input, accum);
-
-    set_shape_if_empty(*accum->info(), input->info()->tensor_shape());
-
-    set_format_if_unknown(*accum->info(), Format::S16);
-
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(accum, 1, DataType::S16);
-    ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(input, accum);
-
-    constexpr unsigned int num_elems_processed_per_iteration = 16;
-    INESimpleKernel::configure(input, accum, num_elems_processed_per_iteration);
-}
-
-void NEAccumulateKernel::run(const Window &window, const ThreadInfo &info)
-{
-    ARM_COMPUTE_UNUSED(info);
-    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
-    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INESimpleKernel::window(), window);
-    Iterator input(_input, window);
-    Iterator accum(_output, window);
-
-    execute_window_loop(window, [&](const Coordinates &)
-    {
-        acc_v16_u8(input.ptr(), accum.ptr());
-    },
-    input, accum);
-}
-
-NEAccumulateWeightedKernel::NEAccumulateWeightedKernel()
-    : _alpha(0.0f)
-{
-}
-
-void NEAccumulateWeightedKernel::configure(const ITensor *input, float alpha, ITensor *accum)
-{
-    ARM_COMPUTE_ERROR_ON_NULLPTR(input, accum);
-
-    set_shape_if_empty(*accum->info(), input->info()->tensor_shape());
-
-    set_format_if_unknown(*accum->info(), Format::U8);
-
-    ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(input, accum);
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(accum, 1, DataType::U8);
-    ARM_COMPUTE_ERROR_ON(alpha < 0.0 || alpha > 1.0);
-
-    _alpha = alpha;
-
-    constexpr unsigned int num_elems_processed_per_iteration = 16;
-    INESimpleKernel::configure(input, accum, num_elems_processed_per_iteration);
-}
-
-void NEAccumulateWeightedKernel::run(const Window &window, const ThreadInfo &info)
-{
-    ARM_COMPUTE_UNUSED(info);
-    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
-    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INESimpleKernel::window(), window);
-
-    Iterator input(_input, window);
-    Iterator accum(_output, window);
-
-    const float32x4_t scale_val  = vdupq_n_f32(1.f - _alpha);
-    const float32x4_t scale_val2 = vdupq_n_f32(_alpha);
-
-    execute_window_loop(window, [&](const Coordinates &)
-    {
-        acc_we_v16_u8(input.ptr(), accum.ptr(), scale_val, scale_val2);
-    },
-    input, accum);
-}
-
-NEAccumulateSquaredKernel::NEAccumulateSquaredKernel()
-    : _shift(0)
-{
-}
-
-void NEAccumulateSquaredKernel::configure(const ITensor *input, uint32_t shift, ITensor *accum)
-{
-    ARM_COMPUTE_ERROR_ON_NULLPTR(input, accum);
-
-    set_shape_if_empty(*accum->info(), input->info()->tensor_shape());
-
-    set_format_if_unknown(*accum->info(), Format::S16);
-
-    ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(input, accum);
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(accum, 1, DataType::S16);
-    ARM_COMPUTE_ERROR_ON(shift > 15);
-
-    _shift = shift;
-
-    constexpr unsigned int num_elems_processed_per_iteration = 16;
-    INESimpleKernel::configure(input, accum, num_elems_processed_per_iteration);
-}
-
-void NEAccumulateSquaredKernel::run(const Window &window, const ThreadInfo &info)
-{
-    ARM_COMPUTE_UNUSED(info);
-    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
-    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INESimpleKernel::window(), window);
-    Iterator input(_input, window);
-    Iterator accum(_output, window);
-
-    execute_window_loop(window, [&](const Coordinates &)
-    {
-        acc_sq_v16_u8(input.ptr(), _shift, accum.ptr());
-    },
-    input, accum);
-}
-} // namespace arm_compute
\ No newline at end of file
diff --git a/src/core/NEON/kernels/NEAccumulateKernel.h b/src/core/NEON/kernels/NEAccumulateKernel.h
deleted file mode 100644
index af1298f53f..0000000000
--- a/src/core/NEON/kernels/NEAccumulateKernel.h
+++ /dev/null
@@ -1,183 +0,0 @@
-/*
- * Copyright (c) 2016-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_NEACCUMULATEKERNEL_H
-#define ARM_COMPUTE_NEACCUMULATEKERNEL_H
-
-#include "src/core/NEON/INESimpleKernel.h"
-
-#include <cstdint>
-
-namespace arm_compute
-{
-class ITensor;
-
-/** Interface for the accumulate kernel
- *
- * Accumulation is computed by:
- * @f[ accum(x,y) = accum(x,y) + input(x,y) @f]
- */
-class NEAccumulateKernel : public INESimpleKernel
-{
-public:
-    const char *name() const override
-    {
-        return "NEAccumulateKernel";
-    }
-    /** Default constructor */
-    NEAccumulateKernel() = default;
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    NEAccumulateKernel(const NEAccumulateKernel &) = delete;
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    NEAccumulateKernel &operator=(const NEAccumulateKernel &) = delete;
-    /** Allow instances of this class to be moved */
-    NEAccumulateKernel(NEAccumulateKernel &&) = default;
-    /** Allow instances of this class to be moved */
-    NEAccumulateKernel &operator=(NEAccumulateKernel &&) = default;
-    /** Default destructor */
-    ~NEAccumulateKernel() = default;
-    /** Set the input and accumulation tensors
-     *
-     * @param[in]  input Source tensor. Data type supported: U8.
-     * @param[out] accum Destination tensor. Data type supported: S16.
-     */
-    void configure(const ITensor *input, ITensor *accum);
-
-    // Inherited methods overridden:
-    void run(const Window &window, const ThreadInfo &info) override;
-};
-
-/** Interface for the accumulate weighted kernel
- *
- * Weighted accumulation is computed:
- * @f[ accum(x,y) = (1 - \alpha)*accum(x,y) + \alpha*input(x,y) @f]
- *
- * Where @f$ 0 \le \alpha \le 1 @f$
- * Conceptually, the rounding for this is defined as:
- * @f[ output(x,y)= uint8( (1 - \alpha) * float32( int32( output(x,y) ) ) + \alpha * float32( int32( input(x,y) ) ) ) @f]
-*/
-class NEAccumulateWeightedKernel : public INESimpleKernel
-{
-public:
-    const char *name() const override
-    {
-        return "NEAccumulateWeightedKernel";
-    }
-    /** Default constructor */
-    NEAccumulateWeightedKernel();
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    NEAccumulateWeightedKernel(const NEAccumulateWeightedKernel &) = delete;
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    NEAccumulateWeightedKernel &operator=(const NEAccumulateWeightedKernel &) = delete;
-    /** Allow instances of this class to be moved */
-    NEAccumulateWeightedKernel(NEAccumulateWeightedKernel &&) = default;
-    /** Allow instances of this class to be moved */
-    NEAccumulateWeightedKernel &operator=(NEAccumulateWeightedKernel &&) = default;
-    /** Default destructor */
-    ~NEAccumulateWeightedKernel() = default;
-    /** Set the input and accumulation tensors, and the scale value
-     *
-     * @param[in]     input Source tensor. Data type supported: U8.
-     * @param[in]     alpha Scalar value in the range [0.0f, 1.0f]
-     * @param[in,out] accum Accumulated tensor. Data type supported: U8.
-     */
-    void configure(const ITensor *input, float alpha, ITensor *accum);
-
-    // Inherited methods overridden:
-    void run(const Window &window, const ThreadInfo &info) override;
-
-protected:
-    float _alpha;
-};
-
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-/** Interface for the accumulate weighted kernel using F16 */
-class NEAccumulateWeightedFP16Kernel : public NEAccumulateWeightedKernel
-{
-public:
-    const char *name() const override
-    {
-        return "NEAccumulateWeightedFP16Kernel";
-    }
-    /** Default constructor */
-    NEAccumulateWeightedFP16Kernel() = default;
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    NEAccumulateWeightedFP16Kernel(const NEAccumulateWeightedFP16Kernel &) = delete;
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    NEAccumulateWeightedFP16Kernel &operator=(const NEAccumulateWeightedFP16Kernel &) = delete;
-    /** Allow instances of this class to be moved */
-    NEAccumulateWeightedFP16Kernel(NEAccumulateWeightedFP16Kernel &&) = default;
-    /** Allow instances of this class to be moved */
-    NEAccumulateWeightedFP16Kernel &operator=(NEAccumulateWeightedFP16Kernel &&) = default;
-    /** Default destructor */
-    ~NEAccumulateWeightedFP16Kernel() = default;
-    // Inherited methods overridden:
-    void run(const Window &window, const ThreadInfo &info) override;
-};
-#else  /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
-/** Interface for the accumulate weighted kernel using F16 */
-using NEAccumulateWeightedFP16Kernel = NEAccumulateWeightedKernel;
-#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
-
-/** Interface for the accumulate squared kernel
- *
- * The accumulation of squares is computed:
- * @f[ accum(x,y) = saturate_{int16} ( (uint16) accum(x,y) + (((uint16)(input(x,y)^2)) >> (shift)) ) @f]
- *
- * Where @f$ 0 \le shift \le 15 @f$
-*/
-class NEAccumulateSquaredKernel : public INESimpleKernel
-{
-public:
-    const char *name() const override
-    {
-        return "NEAccumulateSquaredKernel";
-    }
-    /** Default constructor */
-    NEAccumulateSquaredKernel();
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    NEAccumulateSquaredKernel(const NEAccumulateSquaredKernel &) = delete;
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    NEAccumulateSquaredKernel &operator=(const NEAccumulateSquaredKernel &) = delete;
-    /** Allow instances of this class to be moved */
-    NEAccumulateSquaredKernel(NEAccumulateSquaredKernel &&) = default;
-    /** Allow instances of this class to be moved */
-    NEAccumulateSquaredKernel &operator=(NEAccumulateSquaredKernel &&) = default;
-    /** Default destructor */
-    ~NEAccumulateSquaredKernel() = default;
-    /** Set the input and accumulation tensors and the shift value.
-     *
-     * @param[in]     input Source tensor. Data type supported: U8.
-     * @param[in]     shift Shift value in the range of [0, 15]
-     * @param[in,out] accum Accumulated tensor. Data type supported: S16.
-     */
-    void configure(const ITensor *input, uint32_t shift, ITensor *accum);
-
-    // Inherited methods overridden:
-    void run(const Window &window, const ThreadInfo &info) override;
-
-private:
-    uint32_t _shift;
-};
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_NEACCUMULATEKERNEL_H */
diff --git a/src/core/NEON/kernels/NEBox3x3Kernel.cpp b/src/core/NEON/kernels/NEBox3x3Kernel.cpp
deleted file mode 100644
index 2aa8aa8e99..0000000000
--- a/src/core/NEON/kernels/NEBox3x3Kernel.cpp
+++ /dev/null
@@ -1,194 +0,0 @@
-/*
- * Copyright (c) 2016-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "src/core/NEON/kernels/NEBox3x3Kernel.h"
-
-#include "arm_compute/core/Coordinates.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/IAccessWindow.h"
-#include "arm_compute/core/ITensor.h"
-#include "arm_compute/core/Validate.h"
-#include "src/core/NEON/INEKernel.h"
-#include "src/core/helpers/AutoConfiguration.h"
-#include "src/core/helpers/WindowHelpers.h"
-
-#include <arm_neon.h>
-
-using namespace arm_compute;
-
-int16x8_t calculate_kernel(const uint8x16_t &top_data, const uint8x16_t &mid_data, const uint8x16_t &bot_data)
-{
-    const int16x8x2_t top_s16 =
-    {
-        {
-            vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(top_data))),
-            vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(top_data)))
-        }
-    };
-    const int16x8x2_t mid_s16 =
-    {
-        {
-            vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(mid_data))),
-            vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(mid_data)))
-        }
-    };
-    const int16x8x2_t bot_s16 =
-    {
-        {
-            vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(bot_data))),
-            vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(bot_data)))
-        }
-    };
-
-    //top left
-    int16x8_t out = top_s16.val[0];
-    //top mid
-    out = vaddq_s16(out, vextq_s16(top_s16.val[0], top_s16.val[1], 1));
-    //top right
-    out = vaddq_s16(out, vextq_s16(top_s16.val[0], top_s16.val[1], 2));
-    //mid left
-    out = vaddq_s16(out, mid_s16.val[0]);
-    //mid mid
-    out = vaddq_s16(out, vextq_s16(mid_s16.val[0], mid_s16.val[1], 1));
-    //mid right
-    out = vaddq_s16(out, vextq_s16(mid_s16.val[0], mid_s16.val[1], 2));
-    //bot left
-    out = vaddq_s16(out, bot_s16.val[0]);
-    //bot mid
-    out = vaddq_s16(out, vextq_s16(bot_s16.val[0], bot_s16.val[1], 1));
-    //bot right
-    out = vaddq_s16(out, vextq_s16(bot_s16.val[0], bot_s16.val[1], 2));
-    return out;
-}
-
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-void NEBox3x3FP16Kernel::run(const Window &window, const ThreadInfo &info)
-{
-    ARM_COMPUTE_UNUSED(info);
-    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
-    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INESimpleKernel::window(), window);
-
-    Iterator input(_input, window);
-    Iterator output(_output, window);
-
-    unsigned char *const input_top_ptr = _input->ptr_to_element(Coordinates(-1, -1));
-    unsigned char *const input_mid_ptr = _input->ptr_to_element(Coordinates(-1, 0));
-    unsigned char *const input_bot_ptr = _input->ptr_to_element(Coordinates(-1, +1));
-
-    const float16x8_t oneovernine = vdupq_n_f16(1.0f / 9.0f);
-
-    execute_window_loop(window, [&](const Coordinates &)
-    {
-        const uint8x16_t top_data = vld1q_u8(input_top_ptr + input.offset());
-        const uint8x16_t mid_data = vld1q_u8(input_mid_ptr + input.offset());
-        const uint8x16_t bot_data = vld1q_u8(input_bot_ptr + input.offset());
-
-        int16x8_t out = calculate_kernel(top_data, mid_data, bot_data);
-
-        float16x8_t outfloat = vcvtq_f16_s16(out);
-        outfloat             = vmulq_f16(outfloat, oneovernine);
-
-        vst1_u8(output.ptr(), vqmovun_s16(vcvtq_s16_f16(outfloat)));
-    },
-    input, output);
-}
-#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
-
-BorderSize NEBox3x3Kernel::border_size() const
-{
-    return BorderSize(1);
-}
-
-void NEBox3x3Kernel::configure(const ITensor *input, ITensor *output, bool border_undefined)
-{
-    ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
-
-    set_shape_if_empty(*output->info(), input->info()->tensor_shape());
-
-    set_format_if_unknown(*input->info(), Format::U8);
-    set_format_if_unknown(*output->info(), Format::U8);
-
-    ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(input, output);
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8);
-    ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
-
-    _input  = input;
-    _output = output;
-
-    constexpr unsigned int num_elems_processed_per_iteration = 8;
-    constexpr unsigned int num_elems_read_per_iteration      = 16;
-    constexpr unsigned int num_elems_written_per_iteration   = 8;
-    constexpr unsigned int num_rows_read_per_iteration       = 3;
-    constexpr int          rect_offset_xy                    = -1;
-
-    // Configure kernel window
-    Window                 win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration), border_undefined, border_size());
-    AccessWindowHorizontal output_access(output->info(), 0, num_elems_written_per_iteration);
-
-    update_window_and_padding(win, AccessWindowRectangle(input->info(), rect_offset_xy, rect_offset_xy, num_elems_read_per_iteration, num_rows_read_per_iteration), output_access);
-
-    output_access.set_valid_region(win, input->info()->valid_region(), border_undefined, border_size());
-
-    INEKernel::configure(win);
-}
-
-void NEBox3x3Kernel::run(const Window &window, const ThreadInfo &info)
-{
-    ARM_COMPUTE_UNUSED(info);
-    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
-    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INESimpleKernel::window(), window);
-
-    Iterator input(_input, window);
-    Iterator output(_output, window);
-
-    unsigned char *const input_top_ptr = _input->ptr_to_element(Coordinates(-1, -1));
-    unsigned char *const input_mid_ptr = _input->ptr_to_element(Coordinates(-1, 0));
-    unsigned char *const input_bot_ptr = _input->ptr_to_element(Coordinates(-1, +1));
-
-    const int       shift       = 19;
-    int             value       = (1 << shift) / 9 + 1; //58255 / (2^19) ~= 1/9
-    const int32x4_t oneovernine = vdupq_n_s32(value);
-
-    execute_window_loop(window, [&](const Coordinates &)
-    {
-        const uint8x16_t top_data = vld1q_u8(input_top_ptr + input.offset());
-        const uint8x16_t mid_data = vld1q_u8(input_mid_ptr + input.offset());
-        const uint8x16_t bot_data = vld1q_u8(input_bot_ptr + input.offset());
-
-        int16x8_t out = calculate_kernel(top_data, mid_data, bot_data);
-
-        int32x4_t outfloathigh = vmovl_s16(vget_high_s16(out));
-        int32x4_t outfloatlow  = vmovl_s16(vget_low_s16(out));
-
-        outfloathigh = vmulq_s32(outfloathigh, oneovernine);
-        outfloatlow  = vmulq_s32(outfloatlow, oneovernine);
-        outfloathigh = vshrq_n_s32(outfloathigh, shift);
-        outfloatlow  = vshrq_n_s32(outfloatlow, shift);
-        out          = vcombine_s16(vqmovn_s32((outfloatlow)),
-                                    vqmovn_s32((outfloathigh)));
-
-        vst1_u8(output.ptr(), vqmovun_s16(out));
-    },
-    input, output);
-}
diff --git a/src/core/NEON/kernels/NEBox3x3Kernel.h b/src/core/NEON/kernels/NEBox3x3Kernel.h
deleted file mode 100644
index 4f9ac18219..0000000000
--- a/src/core/NEON/kernels/NEBox3x3Kernel.h
+++ /dev/null
@@ -1,95 +0,0 @@
-/*
- * Copyright (c) 2016-2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_NEBOX3x3KERNEL_H
-#define ARM_COMPUTE_NEBOX3x3KERNEL_H
-
-#include "src/core/NEON/INESimpleKernel.h"
-
-namespace arm_compute
-{
-class ITensor;
-
-/** Neon kernel to perform a Box 3x3 filter */
-class NEBox3x3Kernel : public INESimpleKernel
-{
-public:
-    const char *name() const override
-    {
-        return "NEBox3x3Kernel";
-    }
-    /** Default constructor */
-    NEBox3x3Kernel() = default;
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    NEBox3x3Kernel(const NEBox3x3Kernel &) = delete;
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    NEBox3x3Kernel &operator=(const NEBox3x3Kernel &) = delete;
-    /** Allow instances of this class to be moved */
-    NEBox3x3Kernel(NEBox3x3Kernel &&) = default;
-    /** Allow instances of this class to be moved */
-    NEBox3x3Kernel &operator=(NEBox3x3Kernel &&) = default;
-    /** Default destructor */
-    ~NEBox3x3Kernel() = default;
-    /** Set the source, destination and border mode of the kernel
-     *
-     * @param[in]  input            Source tensor. Data type supported: U8.
-     * @param[out] output           Destination tensor. Data type supported: U8.
-     * @param[in]  border_undefined True if the border mode is undefined. False if it's replicate or constant.
-     */
-    void configure(const ITensor *input, ITensor *output, bool border_undefined);
-    // Inherited methods overridden:
-    void run(const Window &window, const ThreadInfo &info) override;
-    BorderSize border_size() const override;
-};
-
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-/** Neon kernel to perform a Box 3x3 filter for FP16 datatype
- */
-class NEBox3x3FP16Kernel : public NEBox3x3Kernel
-{
-public:
-    const char *name() const override
-    {
-        return "NEBox3x3FP16Kernel";
-    }
-    /** Default constructor */
-    NEBox3x3FP16Kernel() = default;
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    NEBox3x3FP16Kernel(const NEBox3x3FP16Kernel &) = delete;
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    NEBox3x3FP16Kernel &operator=(const NEBox3x3FP16Kernel &) = delete;
-    /** Allow instances of this class to be moved */
-    NEBox3x3FP16Kernel(NEBox3x3FP16Kernel &&) = default;
-    /** Allow instances of this class to be moved */
-    NEBox3x3FP16Kernel &operator=(NEBox3x3FP16Kernel &&) = default;
-    /** Default destructor */
-    ~NEBox3x3FP16Kernel() = default;
-    // Inherited methods overridden:
-    void run(const Window &window, const ThreadInfo &info) override;
-};
-#else  /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
-/** Neon kernel to perform a Box 3x3 filter for FP16 datatype */
-using NEBox3x3FP16Kernel = NEBox3x3Kernel;
-#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_NEBOX3x3KERNEL_H */
diff --git a/src/core/NEON/kernels/NECannyEdgeKernel.cpp b/src/core/NEON/kernels/NECannyEdgeKernel.cpp
deleted file mode 100644
index 7a2bf20c04..0000000000
--- a/src/core/NEON/kernels/NECannyEdgeKernel.cpp
+++ /dev/null
@@ -1,1122 +0,0 @@
-/*
- * Copyright (c) 2016-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "src/core/NEON/kernels/NECannyEdgeKernel.h"
-
-#include "arm_compute/core/Error.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/ITensor.h"
-#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Types.h"
-#include "arm_compute/core/Utils.h"
-#include "arm_compute/core/Validate.h"
-#include "src/core/AccessWindowStatic.h"
-#include "src/core/helpers/AutoConfiguration.h"
-#include "src/core/helpers/AutoConfiguration.h"
-#include "src/core/helpers/WindowHelpers.h"
-#include "src/core/helpers/WindowHelpers.h"
-
-#include <arm_neon.h>
-#include <cstddef>
-#include <cstdint>
-#include <tuple>
-
-namespace arm_compute
-{
-namespace
-{
-constexpr int NO_EDGE = 0;
-constexpr int EDGE    = 255;
-constexpr int MAYBE   = 127;
-
-inline uint8x8_t phase_quantization(const float32x4x2_t &gx, const float32x4x2_t &gy)
-{
-    // Constant use for evaluating score1 and score3
-    static const float32x4_t const45 = vdupq_n_f32(0.70710678118655f);
-    static const float32x4_t zero    = vdupq_n_f32(0.0f);
-    static const float32x4_t one     = vdupq_n_f32(1.0f);
-    static const float32x4_t two     = vdupq_n_f32(2.0f);
-    static const float32x4_t three   = vdupq_n_f32(3.0f);
-
-    // Score0: (1, 0)
-    const float32x4x2_t score0 =
-    {
-        {
-            vabsq_f32(gx.val[0]),
-            vabsq_f32(gx.val[1])
-        }
-    };
-
-    // Score2: ( 0, 1 )
-    const float32x4x2_t score2 =
-    {
-        {
-            vabsq_f32(gy.val[0]),
-            vabsq_f32(gy.val[1])
-        }
-    };
-
-    // Score1 and Score3: ( sqrt(2) / 2, sqrt(2) / 2 ) - ( -sqrt(2) / 2, sqrt(2) / 2 )
-    float32x4x2_t score1 =
-    {
-        {
-            vmulq_f32(gy.val[0], const45),
-            vmulq_f32(gy.val[1], const45)
-        }
-    };
-
-    float32x4x2_t score3 = score1;
-
-    score1.val[0] = vmlaq_f32(score1.val[0], gx.val[0], const45);
-    score1.val[1] = vmlaq_f32(score1.val[1], gx.val[1], const45);
-    score3.val[0] = vmlsq_f32(score3.val[0], gx.val[0], const45);
-    score3.val[1] = vmlsq_f32(score3.val[1], gx.val[1], const45);
-
-    score1.val[0] = vabsq_f32(score1.val[0]);
-    score1.val[1] = vabsq_f32(score1.val[1]);
-    score3.val[0] = vabsq_f32(score3.val[0]);
-    score3.val[1] = vabsq_f32(score3.val[1]);
-
-    float32x4x2_t phase =
-    {
-        {
-            zero,
-            zero
-        }
-    };
-
-    float32x4x2_t old_score = score0;
-
-    // score1 > old_score?
-    uint32x4x2_t mask =
-    {
-        {
-            vcgtq_f32(score1.val[0], old_score.val[0]),
-            vcgtq_f32(score1.val[1], old_score.val[1])
-        }
-    };
-
-    phase.val[0]     = vbslq_f32(mask.val[0], one, phase.val[0]);
-    phase.val[1]     = vbslq_f32(mask.val[1], one, phase.val[1]);
-    old_score.val[0] = vbslq_f32(mask.val[0], score1.val[0], old_score.val[0]);
-    old_score.val[1] = vbslq_f32(mask.val[1], score1.val[1], old_score.val[1]);
-
-    // score2 > old_score?
-    mask.val[0] = vcgtq_f32(score2.val[0], old_score.val[0]);
-    mask.val[1] = vcgtq_f32(score2.val[1], old_score.val[1]);
-
-    phase.val[0]     = vbslq_f32(mask.val[0], two, phase.val[0]);
-    phase.val[1]     = vbslq_f32(mask.val[1], two, phase.val[1]);
-    old_score.val[0] = vbslq_f32(mask.val[0], score2.val[0], old_score.val[0]);
-    old_score.val[1] = vbslq_f32(mask.val[1], score2.val[1], old_score.val[1]);
-
-    // score3 > old_score?
-    mask.val[0] = vcgtq_f32(score3.val[0], old_score.val[0]);
-    mask.val[1] = vcgtq_f32(score3.val[1], old_score.val[1]);
-
-    phase.val[0]     = vbslq_f32(mask.val[0], three, phase.val[0]);
-    phase.val[1]     = vbslq_f32(mask.val[1], three, phase.val[1]);
-    old_score.val[0] = vbslq_f32(mask.val[0], score3.val[0], old_score.val[0]);
-    old_score.val[1] = vbslq_f32(mask.val[1], score3.val[1], old_score.val[1]);
-
-    // Convert from float32x4_t to uint8x8_t
-    return vmovn_u16(vcombine_u16(vmovn_u32(vcvtq_u32_f32(phase.val[0])),
-                                  vmovn_u32(vcvtq_u32_f32(phase.val[1]))));
-}
-
-/* Computes the gradient phase if gradient_size = 3 or 5. The output is quantized.
- * 0 = 0°, 1 = 45°, 2 = 90°, 3 = 135°
- *
- * @param[in] gx Gx component
- * @param[in] gy Gy component
- *
- * @return quantized phase for 8 pixels
- */
-inline uint8x8_t phase_quantization_S16_S16(int16x8_t gx, int16x8_t gy)
-{
-    // Convert to float
-    const float32x4x2_t gx_f32 =
-    {
-        {
-            vcvtq_f32_s32(vmovl_s16(vget_low_s16(gx))),
-            vcvtq_f32_s32(vmovl_s16(vget_high_s16(gx)))
-        }
-    };
-
-    const float32x4x2_t gy_f32 =
-    {
-        {
-            vcvtq_f32_s32(vmovl_s16(vget_low_s16(gy))),
-            vcvtq_f32_s32(vmovl_s16(vget_high_s16(gy)))
-        }
-    };
-
-    return phase_quantization(gx_f32, gy_f32);
-}
-
-/* Computes the gradient phase if gradient_size = 7. The output is quantized.
- * 0 = 0°, 1 = 45°, 2 = 90°, 3 = 135°
- *
- * @param[in] gx Gx component
- * @param[in] gy Gy component
- *
- * @return quantized phase for 8 pixels
- */
-inline uint8x8_t phase_quantization_S32_S32(const int32x4x2_t &gx, const int32x4x2_t &gy)
-{
-    // Convert to float
-    const float32x4x2_t gx_f32 =
-    {
-        {
-            vcvtq_f32_s32(gx.val[0]),
-            vcvtq_f32_s32(gx.val[1])
-        }
-    };
-
-    const float32x4x2_t gy_f32 =
-    {
-        {
-            vcvtq_f32_s32(gy.val[0]),
-            vcvtq_f32_s32(gy.val[1])
-        }
-    };
-
-    return phase_quantization(gx_f32, gy_f32);
-}
-
-/* Computes the magnitude using the L1-norm type if gradient_size = 3 or 5
- *
- * @param[in] gx Gx component
- * @param[in] gy Gy component
- *
- * @return magnitude for 8 pixels
- */
-inline uint16x8_t mag_l1_S16_S16(int16x8_t gx, int16x8_t gy)
-{
-    return vaddq_u16(vreinterpretq_u16_s16(vabsq_s16(gx)),
-                     vreinterpretq_u16_s16(vabsq_s16(gy)));
-}
-
-/* Computes the magnitude using the L1-norm type if gradient_size = 7
- *
- * @param[in] gx Gx component
- * @param[in] gy Gy component
- *
- * @return magnitude for 8 pixels
- */
-inline uint32x4x2_t mag_l1_S32_S32(const int32x4x2_t &gx, const int32x4x2_t &gy)
-{
-    const uint32x4x2_t gx_abs =
-    {
-        {
-            vreinterpretq_u32_s32(vabsq_s32(gx.val[0])),
-            vreinterpretq_u32_s32(vabsq_s32(gx.val[1]))
-        }
-    };
-
-    const uint32x4x2_t gy_abs =
-    {
-        {
-            vreinterpretq_u32_s32(vabsq_s32(gy.val[0])),
-            vreinterpretq_u32_s32(vabsq_s32(gy.val[1]))
-        }
-    };
-
-    const uint32x4x2_t output =
-    {
-        {
-            vaddq_u32(gx_abs.val[0], gy_abs.val[0]),
-            vaddq_u32(gx_abs.val[1], gy_abs.val[1])
-        }
-    };
-
-    return output;
-}
-
-inline float32x4x2_t mag_l2(const float32x4x2_t &gx, const float32x4x2_t &gy)
-{
-    // x^2 ...
-    float32x4x2_t magnitude =
-    {
-        {
-            vmulq_f32(gx.val[0], gx.val[0]),
-            vmulq_f32(gx.val[1], gx.val[1])
-        }
-    };
-
-    // ... + y^2
-    magnitude.val[0] = vmlaq_f32(magnitude.val[0], gy.val[0], gy.val[0]);
-    magnitude.val[1] = vmlaq_f32(magnitude.val[1], gy.val[1], gy.val[1]);
-
-    // sqrt(...)
-    magnitude.val[0] = vmulq_f32(vrsqrteq_f32(magnitude.val[0]), magnitude.val[0]);
-    magnitude.val[1] = vmulq_f32(vrsqrteq_f32(magnitude.val[1]), magnitude.val[1]);
-
-    return magnitude;
-}
-
-/* Computes the magnitude using L2-norm if gradient_size = 3 or 5
- *
- * @param[in] gx Gx component
- * @param[in] gy Gy component
- *
- * @return magnitude for 8 pixels
- */
-inline uint16x8_t mag_l2_S16_S16(int16x8_t gx, int16x8_t gy)
-{
-    // Compute magnitude using L2 normalization
-    const float32x4x2_t gx2 =
-    {
-        {
-            vcvtq_f32_s32(vmovl_s16(vget_low_s16(gx))),
-            vcvtq_f32_s32(vmovl_s16(vget_high_s16(gx)))
-        }
-    };
-
-    const float32x4x2_t gy2 =
-    {
-        {
-            vcvtq_f32_s32(vmovl_s16(vget_low_s16(gy))),
-            vcvtq_f32_s32(vmovl_s16(vget_high_s16(gy)))
-        }
-    };
-
-    const float32x4x2_t magnitude = mag_l2(gx2, gy2);
-
-    // Store magnitude - Convert to uint16x8
-    return vcombine_u16(vmovn_u32(vcvtq_u32_f32(magnitude.val[0])),
-                        vmovn_u32(vcvtq_u32_f32(magnitude.val[1])));
-}
-
-/* Computes the magnitude using L2-norm if gradient_size = 7
- *
- * @param[in] gx Gx component
- * @param[in] gy Gy component
- *
- * @return magnitude for 8 pixels
- */
-inline uint32x4x2_t mag_l2_S32_S32(const int32x4x2_t &gx, const int32x4x2_t &gy)
-{
-    // Compute magnitude using L2 normalization
-    float32x4x2_t gx2 =
-    {
-        {
-            vcvtq_f32_s32(gx.val[0]),
-            vcvtq_f32_s32(gx.val[1])
-        }
-    };
-
-    float32x4x2_t gy2 =
-    {
-        {
-            vcvtq_f32_s32(gy.val[0]),
-            vcvtq_f32_s32(gy.val[1])
-        }
-    };
-
-    const float32x4x2_t magnitude = mag_l2(gx2, gy2);
-    const uint32x4x2_t  mag32 =
-    {
-        {
-            vcvtq_u32_f32(magnitude.val[0]),
-            vcvtq_u32_f32(magnitude.val[1])
-        }
-    };
-
-    return mag32;
-}
-
-/* Gradient function used when the gradient size = 3 or 5 and when the norm_type = L1-norm
- *
- * @param[in]  gx_ptr        Pointer to source image. Gx image. Data type supported S16
- * @param[in]  gy_ptr        Pointer to source image. Gy image. Data type supported S16
- * @param[out] magnitude_ptr Pointer to destination image. Magnitude. Data type supported U16
- * @param[out] phase_ptr     Pointer to destination image. Quantized phase. Data type supported U8
- */
-void mag_phase_l1norm_S16_S16_U16_U8(const void *__restrict gx_ptr, const void *__restrict gy_ptr, void *__restrict magnitude_ptr, void *__restrict phase_ptr)
-{
-    const auto gx        = static_cast<const int16_t *__restrict>(gx_ptr);
-    const auto gy        = static_cast<const int16_t *__restrict>(gy_ptr);
-    const auto magnitude = static_cast<uint16_t *__restrict>(magnitude_ptr);
-    const auto phase     = static_cast<uint8_t *__restrict>(phase_ptr);
-
-    const int16x8x4_t gx_val =
-    {
-        {
-            vld1q_s16(gx),
-            vld1q_s16(gx + 8),
-            vld1q_s16(gx + 16),
-            vld1q_s16(gx + 24)
-        }
-    };
-
-    const int16x8x4_t gy_val =
-    {
-        {
-            vld1q_s16(gy),
-            vld1q_s16(gy + 8),
-            vld1q_s16(gy + 16),
-            vld1q_s16(gy + 24)
-        }
-    };
-
-    // Compute and store phase
-    vst1_u8(phase + 0, phase_quantization_S16_S16(gx_val.val[0], gy_val.val[0]));
-    vst1_u8(phase + 8, phase_quantization_S16_S16(gx_val.val[1], gy_val.val[1]));
-    vst1_u8(phase + 16, phase_quantization_S16_S16(gx_val.val[2], gy_val.val[2]));
-    vst1_u8(phase + 24, phase_quantization_S16_S16(gx_val.val[3], gy_val.val[3]));
-
-    // Compute ans store magnitude using L1 normalization
-    vst1q_u16(magnitude + 0, mag_l1_S16_S16(gx_val.val[0], gy_val.val[0]));
-    vst1q_u16(magnitude + 8, mag_l1_S16_S16(gx_val.val[1], gy_val.val[1]));
-    vst1q_u16(magnitude + 16, mag_l1_S16_S16(gx_val.val[2], gy_val.val[2]));
-    vst1q_u16(magnitude + 24, mag_l1_S16_S16(gx_val.val[3], gy_val.val[3]));
-}
-
-/* Gradient function used when the gradient size = 3 or 5 and when the norm_type = L2-norm
- *
- * @param[in]  gx_ptr        Pointer to source image. Gx image. Data type supported S16
- * @param[in]  gy_ptr        Pointer to source image. Gy image. Data type supported S16
- * @param[out] magnitude_ptr Pointer to destination image. Magnitude. Data type supported U16
- * @param[out] phase_ptr     Pointer to destination image. Quantized phase. Data type supported U8
- */
-void mag_phase_l2norm_S16_S16_U16_U8(const void *__restrict gx_ptr, const void *__restrict gy_ptr, void *__restrict magnitude_ptr, void *__restrict phase_ptr)
-{
-    const auto gx        = static_cast<const int16_t *__restrict>(gx_ptr);
-    const auto gy        = static_cast<const int16_t *__restrict>(gy_ptr);
-    const auto magnitude = static_cast<uint16_t *__restrict>(magnitude_ptr);
-    const auto phase     = static_cast<uint8_t *__restrict>(phase_ptr);
-
-    const int16x8x4_t gx_val =
-    {
-        {
-            vld1q_s16(gx),
-            vld1q_s16(gx + 8),
-            vld1q_s16(gx + 16),
-            vld1q_s16(gx + 24)
-        }
-    };
-
-    const int16x8x4_t gy_val =
-    {
-        {
-            vld1q_s16(gy),
-            vld1q_s16(gy + 8),
-            vld1q_s16(gy + 16),
-            vld1q_s16(gy + 24)
-        }
-    };
-
-    // Compute and store phase
-    vst1_u8(phase + 0, phase_quantization_S16_S16(gx_val.val[0], gy_val.val[0]));
-    vst1_u8(phase + 8, phase_quantization_S16_S16(gx_val.val[1], gy_val.val[1]));
-    vst1_u8(phase + 16, phase_quantization_S16_S16(gx_val.val[2], gy_val.val[2]));
-    vst1_u8(phase + 24, phase_quantization_S16_S16(gx_val.val[3], gy_val.val[3]));
-
-    // Compute and store magnitude using L2 normalization
-    vst1q_u16(magnitude + 0, mag_l2_S16_S16(gx_val.val[0], gy_val.val[0]));
-    vst1q_u16(magnitude + 8, mag_l2_S16_S16(gx_val.val[1], gy_val.val[1]));
-    vst1q_u16(magnitude + 16, mag_l2_S16_S16(gx_val.val[2], gy_val.val[2]));
-    vst1q_u16(magnitude + 24, mag_l2_S16_S16(gx_val.val[3], gy_val.val[3]));
-}
-
-/* Gradient function used when the gradient size = 7 and when the norm_type = L1-norm
- *
- * @param[in]  gx_ptr        Pointer to source image. Gx image. Data type supported S32
- * @param[in]  gy_ptr        Pointer to source image. Gy image. Data type supported S32
- * @param[out] magnitude_ptr Pointer to destination image. Magnitude. Data type supported U32
- * @param[out] phase_ptr     Pointer to destination image. Quantized phase. Data type support U8
- */
-void mag_phase_l1norm_S32_S32_U32_U8(const void *__restrict gx_ptr, const void *__restrict gy_ptr, void *__restrict magnitude_ptr, void *__restrict phase_ptr)
-{
-    auto gx        = static_cast<const int32_t *__restrict>(gx_ptr);
-    auto gy        = static_cast<const int32_t *__restrict>(gy_ptr);
-    auto magnitude = static_cast<uint32_t *__restrict>(magnitude_ptr);
-    auto phase     = static_cast<uint8_t *__restrict>(phase_ptr);
-
-    // Process low and high part
-    for(size_t i = 0; i < 2; ++i, gx += 16, gy += 16, magnitude += 16, phase += 16)
-    {
-        const int32x4x2_t gx0 =
-        {
-            {
-                vld1q_s32(gx + 0),
-                vld1q_s32(gx + 4)
-            }
-        };
-
-        const int32x4x2_t gx1 =
-        {
-            {
-                vld1q_s32(gx + 8),
-                vld1q_s32(gx + 12)
-            }
-        };
-
-        const int32x4x2_t gy0 =
-        {
-            {
-                vld1q_s32(gy + 0),
-                vld1q_s32(gy + 4)
-            }
-        };
-
-        const int32x4x2_t gy1 =
-        {
-            {
-                vld1q_s32(gy + 8),
-                vld1q_s32(gy + 12)
-            }
-        };
-
-        // Compute and store phase
-        vst1_u8(phase + 0, phase_quantization_S32_S32(gx0, gy0));
-        vst1_u8(phase + 8, phase_quantization_S32_S32(gx1, gy1));
-
-        // Compute magnitude using L1 normalization
-        const uint32x4x2_t mag0 = mag_l1_S32_S32(gx0, gy0);
-        const uint32x4x2_t mag1 = mag_l1_S32_S32(gx1, gy1);
-
-        // Store magnitude
-        vst1q_u32(magnitude + 0, mag0.val[0]);
-        vst1q_u32(magnitude + 4, mag0.val[1]);
-        vst1q_u32(magnitude + 8, mag1.val[0]);
-        vst1q_u32(magnitude + 12, mag1.val[1]);
-    }
-}
-
-/* Gradient function used when the gradient size = 7 and when the norm_type = L2-norm
- *
- * @param[in]  gx_ptr        Pointer to source image. Gx image. Data type supported S32
- * @param[in]  gy_ptr        Pointer to source image. Gy image. Data type supported S32
- * @param[out] magnitude_ptr Pointer to destination image. Magnitude. Data type supported U32
- * @param[out] phase_ptr     Pointer to destination image. Quantized phase. Data type supported U8
- */
-void mag_phase_l2norm_S32_S32_U32_U8(const void *__restrict gx_ptr, const void *__restrict gy_ptr, void *__restrict magnitude_ptr, void *__restrict phase_ptr)
-{
-    auto gx        = static_cast<const int32_t *__restrict>(gx_ptr);
-    auto gy        = static_cast<const int32_t *__restrict>(gy_ptr);
-    auto magnitude = static_cast<uint32_t *__restrict>(magnitude_ptr);
-    auto phase     = static_cast<uint8_t *__restrict>(phase_ptr);
-
-    // Process low and high part
-    for(size_t i = 0; i < 2; ++i, gx += 16, gy += 16, magnitude += 16, phase += 16)
-    {
-        const int32x4x2_t gx0 =
-        {
-            {
-                vld1q_s32(gx + 0),
-                vld1q_s32(gx + 4)
-            }
-        };
-
-        const int32x4x2_t gx1 =
-        {
-            {
-                vld1q_s32(gx + 8),
-                vld1q_s32(gx + 12)
-            }
-        };
-
-        const int32x4x2_t gy0 =
-        {
-            {
-                vld1q_s32(gy + 0),
-                vld1q_s32(gy + 4)
-            }
-        };
-
-        const int32x4x2_t gy1 =
-        {
-            {
-                vld1q_s32(gy + 8),
-                vld1q_s32(gy + 12)
-            }
-        };
-
-        // Compute and store phase
-        vst1_u8(phase + 0, phase_quantization_S32_S32(gx0, gy0));
-        vst1_u8(phase + 8, phase_quantization_S32_S32(gx1, gy1));
-
-        // Compute magnitude using L2 normalization
-        const uint32x4x2_t mag0 = mag_l2_S32_S32(gx0, gy0);
-        const uint32x4x2_t mag1 = mag_l2_S32_S32(gx1, gy1);
-
-        // Store magnitude
-        vst1q_u32(magnitude + 0, mag0.val[0]);
-        vst1q_u32(magnitude + 4, mag0.val[1]);
-        vst1q_u32(magnitude + 8, mag1.val[0]);
-        vst1q_u32(magnitude + 12, mag1.val[1]);
-    }
-}
-
-/* Computes non-maxima suppression and hysteresis when the gradient size = 3 or 5
- *
- * @param[in]  magnitude_ptr Pointer to source image. Magnitude. Data type supported U16
- * @param[in]  phase_ptr     Pointer to source image. Quantized phase. Data type supported U8
- * @param[out] output_ptr    Pointer to output image. Data type supported U8
- * @param[in]  stride_mag    Stride of magnitude image
- * @param[in]  lower_thr     Lower threshold used for the hysteresis
- * @param[in]  upper_thr     Upper threshold used for the hysteresis
- */
-void non_max_suppression_U16_U8_U8(const void *__restrict magnitude_ptr, const void *__restrict phase_ptr, void *__restrict output_ptr, const uint32_t stride_mag, const int32_t lower_thr,
-                                   const int32_t upper_thr)
-{
-    const auto magnitude = static_cast<const uint16_t *__restrict>(magnitude_ptr);
-    const auto phase     = static_cast<const uint8_t *__restrict>(phase_ptr);
-    const auto output    = static_cast<uint8_t *__restrict>(output_ptr);
-
-    // Get magnitude and phase of the centre pixels
-    uint16x8_t mc = vld1q_u16(magnitude);
-
-    // Angle_quantized: 0 = 0°, 1 = 45°, 2 = 90°, 3 = 135°
-    const uint16x8_t pc16 = vmovl_u8(vld1_u8(phase));
-
-    // 0 degree
-    const uint16x8_t mk0_0 = vld1q_u16(magnitude - 1);
-    const uint16x8_t mk0_1 = vld1q_u16(magnitude + 1);
-    uint16x8_t       mask0 = vceqq_u16(pc16, vdupq_n_u16(0));
-    mask0                  = vandq_u16(mask0, vcgtq_u16(mc, mk0_0));
-    mask0                  = vandq_u16(mask0, vcgtq_u16(mc, mk0_1));
-
-    // 45 degree
-    const uint16x8_t mk45_0 = vld1q_u16(magnitude - stride_mag - 1);
-    const uint16x8_t mk45_1 = vld1q_u16(magnitude + stride_mag + 1);
-    uint16x8_t       mask1  = vceqq_u16(pc16, vdupq_n_u16(1));
-    mask1                   = vandq_u16(mask1, vcgtq_u16(mc, mk45_0));
-    mask1                   = vandq_u16(mask1, vcgtq_u16(mc, mk45_1));
-
-    // 90 degree
-    const uint16x8_t mk90_0 = vld1q_u16(magnitude - stride_mag);
-    const uint16x8_t mk90_1 = vld1q_u16(magnitude + stride_mag);
-    uint16x8_t       mask2  = vceqq_u16(pc16, vdupq_n_u16(2));
-    mask2                   = vandq_u16(mask2, vcgtq_u16(mc, mk90_0));
-    mask2                   = vandq_u16(mask2, vcgtq_u16(mc, mk90_1));
-
-    // 135 degree
-    const uint16x8_t mk135_0 = vld1q_u16(magnitude - stride_mag + 1);
-    const uint16x8_t mk135_1 = vld1q_u16(magnitude + stride_mag - 1);
-    uint16x8_t       mask3   = vceqq_u16(pc16, vdupq_n_u16(3));
-    mask3                    = vandq_u16(mask3, vcgtq_u16(mc, mk135_0));
-    mask3                    = vandq_u16(mask3, vcgtq_u16(mc, mk135_1));
-
-    // Merge masks
-    mask0 = vorrq_u16(mask0, mask1);
-    mask2 = vorrq_u16(mask2, mask3);
-    mask0 = vorrq_u16(mask0, mask2);
-
-    mc = vbslq_u16(mask0, mc, vdupq_n_u16(0));
-
-    // mc > upper_thr
-    mask0 = vcgtq_u16(mc, vdupq_n_u16(upper_thr));
-
-    // mc <= lower_thr
-    mask1 = vcleq_u16(mc, vdupq_n_u16(lower_thr));
-
-    // mc <= upper_thr && mc > lower_thr
-    mask2 = vcleq_u16(mc, vdupq_n_u16(upper_thr));
-    mask2 = vandq_u16(mask2, vcgtq_u16(mc, vdupq_n_u16(lower_thr)));
-
-    mc = vbslq_u16(mask0, vdupq_n_u16(EDGE), mc);
-    mc = vbslq_u16(mask1, vdupq_n_u16(NO_EDGE), mc);
-    mc = vbslq_u16(mask2, vdupq_n_u16(MAYBE), mc);
-
-    vst1_u8(output, vmovn_u16(mc));
-}
-
-inline uint16x4_t non_max_U32_helper(const uint32_t *input, const uint16x4_t pc, const uint32_t stride_mag, const int32_t lower_thr, const int32_t upper_thr)
-{
-    // Phase for 4 pixel
-    const uint32x4_t pc32 = vmovl_u16(pc);
-
-    // Get magnitude for 4 pixel
-    uint32x4_t mc = vld1q_u32(input);
-
-    // Angle_quantized: 0 = 0°, 1 = 45°, 2 = 90°, 3 = 135°
-    // 0 degree
-    const uint32x4_t mk0_0 = vld1q_u32(input - 1);
-    const uint32x4_t mk0_1 = vld1q_u32(input + 1);
-    uint32x4_t       mask0 = vceqq_u32(pc32, vdupq_n_u32(0));
-    mask0                  = vandq_u32(mask0, vcgtq_u32(mc, mk0_0));
-    mask0                  = vandq_u32(mask0, vcgtq_u32(mc, mk0_1));
-
-    // 45 degree
-    const uint32x4_t mk45_0 = vld1q_u32(input - stride_mag - 1);
-    const uint32x4_t mk45_1 = vld1q_u32(input + stride_mag + 1);
-    uint32x4_t       mask1  = vceqq_u32(pc32, vdupq_n_u32(1));
-    mask1                   = vandq_u32(mask1, vcgtq_u32(mc, mk45_0));
-    mask1                   = vandq_u32(mask1, vcgtq_u32(mc, mk45_1));
-
-    // 90 degree
-    const uint32x4_t mk90_0 = vld1q_u32(input - stride_mag);
-    const uint32x4_t mk90_1 = vld1q_u32(input + stride_mag);
-    uint32x4_t       mask2  = vceqq_u32(pc32, vdupq_n_u32(2));
-    mask2                   = vandq_u32(mask2, vcgtq_u32(mc, mk90_0));
-    mask2                   = vandq_u32(mask2, vcgtq_u32(mc, mk90_1));
-
-    // 135 degree
-    const uint32x4_t mk135_0 = vld1q_u32(input - stride_mag + 1);
-    const uint32x4_t mk135_1 = vld1q_u32(input + stride_mag - 1);
-    uint32x4_t       mask3   = vceqq_u32(pc32, vdupq_n_u32(3));
-    mask3                    = vandq_u32(mask3, vcgtq_u32(mc, mk135_0));
-    mask3                    = vandq_u32(mask3, vcgtq_u32(mc, mk135_1));
-
-    // Merge masks
-    mask0 = vorrq_u32(mask0, mask1);
-    mask2 = vorrq_u32(mask2, mask3);
-    mask0 = vorrq_u32(mask0, mask2);
-
-    mc = vbslq_u32(mask0, mc, vdupq_n_u32(0));
-
-    // mc > upper_thr
-    mask0 = vcgtq_u32(mc, vdupq_n_u32(upper_thr));
-
-    // mc <= lower_thr
-    mask1 = vcleq_u32(mc, vdupq_n_u32(lower_thr));
-
-    // mc <= upper_thr && mc > lower_thr
-    mask2 = vcleq_u32(mc, vdupq_n_u32(upper_thr));
-    mask2 = vandq_u32(mask2, vcgtq_u32(mc, vdupq_n_u32(lower_thr)));
-
-    mc = vbslq_u32(mask0, vdupq_n_u32(EDGE), mc);
-    mc = vbslq_u32(mask1, vdupq_n_u32(NO_EDGE), mc);
-    mc = vbslq_u32(mask2, vdupq_n_u32(MAYBE), mc);
-
-    return vmovn_u32(mc);
-}
-
-/* Computes non-maxima suppression and hysteresis when the gradient_size = 7
- *
- * @param[in]  magnitude_ptr Pointer to source image. Magnitude. Data type supported U32
- * @param[in]  phase_ptr     Pointer to source image. Quantized phase. Data type supported U8
- * @param[out] output_ptr    Pointer to destination image. Data type supported U8
- * @param[in]  stride_mag    Stride of magnitude image
- * @param[in]  lower_thr     Lower threshold used for the hysteresis
- * @param[in]  upper_thr     Upper threshold used for the hysteresis
- */
-void non_max_suppression_U32_U8_U8(const void *__restrict magnitude_ptr, const void *__restrict phase_ptr, void *__restrict output_ptr, const uint32_t stride_mag, const int32_t lower_thr,
-                                   const int32_t upper_thr)
-{
-    const auto magnitude = static_cast<const uint32_t *__restrict>(magnitude_ptr);
-    const auto phase     = static_cast<const uint8_t *__restrict>(phase_ptr);
-    const auto output    = static_cast<uint8_t *__restrict>(output_ptr);
-
-    // Get phase for 8 pixel
-    const uint16x8_t pc16 = vmovl_u8(vld1_u8(phase));
-
-    // Compute non maxima suppression
-    const uint16x4x2_t res =
-    {
-        {
-            non_max_U32_helper(magnitude, vget_low_u16(pc16), stride_mag, lower_thr, upper_thr),
-            non_max_U32_helper(magnitude + 4, vget_high_u16(pc16), stride_mag, lower_thr, upper_thr)
-        }
-    };
-
-    // Store result
-    vst1_u8(output, vmovn_u16(vcombine_u16(res.val[0], res.val[1])));
-}
-
-/* Follows an edge through its MAYBE neighbours; called by edge_trace_U8_U8 and by itself
- *
- * The pixel at @p output is set to EDGE, then the 8 pixels surrounding @p input are inspected.
- * Every neighbour that still holds MAYBE is overwritten with EDGE in the input and traced in turn.
- *
- * @note One nested call is made for each promoted pixel, so the call depth grows with the length of the MAYBE chain.
- *
- * @param[in]  input         Pointer to source image. Data type supported U8
- * @param[out] output        Pointer to destination image. Data type supported U8
- * @param[in]  input_stride  Stride of the input image
- * @param[in]  output_stride Stride of the output image
- */
-void edge_trace_recursive_U8_U8(uint8_t *__restrict input, uint8_t *__restrict output, const int32_t input_stride, const int32_t output_stride)
-{
-    // Neighbour offsets as { dx, dy }, listed in the order in which they are visited
-    static constexpr int neighbours[8][2] =
-    {
-        { -1, 0 }, { 1, 0 },
-        { -1, -1 }, { 0, -1 }, { 1, -1 },
-        { -1, 1 }, { 0, 1 }, { 1, 1 }
-    };
-
-    // The current pixel is a confirmed edge
-    *output = EDGE;
-
-    for(const auto &offset : neighbours)
-    {
-        const int dx = offset[0];
-        const int dy = offset[1];
-
-        uint8_t *const neighbour_in = input + dy * input_stride + dx;
-
-        // Only pixels that are still undecided are promoted and followed
-        if(*neighbour_in == MAYBE)
-        {
-            // Touched a MAYBE point. MAYBE becomes EDGE
-            *neighbour_in = EDGE;
-
-            uint8_t *const neighbour_out = output + dy * output_stride + dx;
-            edge_trace_recursive_U8_U8(neighbour_in, neighbour_out, input_stride, output_stride);
-        }
-    }
-}
-
-/* Entry point of the edge tracing for a single pixel
- *
- * A NO_EDGE input pixel is copied to the output. An EDGE input pixel whose output is still
- * NO_EDGE starts a trace through edge_trace_recursive_U8_U8. Any other combination is left untouched.
- *
- * @param[in]  input         Pointer to source image. Data type supported U8
- * @param[out] output        Pointer to destination image. Data type supported U8
- * @param[in]  input_stride  Stride of the input image
- * @param[in]  output_stride Stride of the output image
- */
-void edge_trace_U8_U8(uint8_t *__restrict input, uint8_t *__restrict output, const int32_t input_stride, const int32_t output_stride)
-{
-    const uint8_t in_pixel = *input;
-
-    if(in_pixel == NO_EDGE)
-    {
-        *output = NO_EDGE;
-        return;
-    }
-
-    // Nothing to do unless this is an EDGE that has not been traced yet
-    if((in_pixel != EDGE) || (*output != NO_EDGE))
-    {
-        return;
-    }
-
-    edge_trace_recursive_U8_U8(input, output, input_stride, output_stride);
-}
-} // namespace
-
-NEGradientKernel::~NEGradientKernel() = default;
-
-NEGradientKernel::NEGradientKernel()
-    : _func(nullptr), _gx(nullptr), _gy(nullptr), _magnitude(nullptr), _phase(nullptr)
-{
-}
-
-// Validates the tensors, selects the magnitude/phase routine and sets up the execution window
-void NEGradientKernel::configure(const ITensor *gx, const ITensor *gy, ITensor *magnitude, ITensor *phase, int32_t norm_type)
-{
-    ARM_COMPUTE_ERROR_ON_NULLPTR(gx, gy, magnitude, phase);
-
-    // Outputs that have no shape yet take the shape of Gx
-    set_shape_if_empty(*magnitude->info(), gx->info()->tensor_shape());
-    set_shape_if_empty(*phase->info(), gx->info()->tensor_shape());
-
-    // Outputs that have no format yet: magnitude is U16 for S16 gradients and U32 otherwise, phase is U8
-    set_format_if_unknown(*magnitude->info(), (gx->info()->data_type() == DataType::S16) ? Format::U16 : Format::U32);
-    set_format_if_unknown(*phase->info(), Format::U8);
-
-    ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(gx, gy, magnitude, phase);
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(gx, 1, DataType::S16, DataType::S32);
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(gy, 1, DataType::S16, DataType::S32);
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(magnitude, 1, DataType::U16, DataType::U32);
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(phase, 1, DataType::U8);
-    ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(gx, gy);
-    ARM_COMPUTE_ERROR_ON_MSG(element_size_from_data_type(gx->info()->data_type()) != element_size_from_data_type(magnitude->info()->data_type()), "Magnitude must have the same element size as Gx and Gy");
-
-    _gx        = gx;
-    _gy        = gy;
-    _magnitude = magnitude;
-    _phase     = phase;
-
-    // norm_type == 1 selects the L1 norm, any other value the L2 norm
-    const bool use_l1_norm = (norm_type == 1);
-
-    if(_gx->info()->data_type() == DataType::S16)
-    {
-        _func = use_l1_norm ? &mag_phase_l1norm_S16_S16_U16_U8 : &mag_phase_l2norm_S16_S16_U16_U8;
-    }
-    else
-    {
-        _func = use_l1_norm ? &mag_phase_l1norm_S32_S32_U32_U8 : &mag_phase_l2norm_S32_S32_U32_U8;
-    }
-
-    constexpr unsigned int elems_per_step = 32;
-
-    // Configure kernel window
-    Window win = calculate_max_window(*_gx->info(), Steps(elems_per_step));
-
-    AccessWindowHorizontal gx_access(_gx->info(), 0, elems_per_step);
-    AccessWindowHorizontal gy_access(_gy->info(), 0, elems_per_step);
-    AccessWindowHorizontal mag_access(_magnitude->info(), 0, elems_per_step);
-    AccessWindowHorizontal phase_access(_phase->info(), 0, elems_per_step);
-
-    ARM_COMPUTE_UNUSED(update_window_and_padding(win, gx_access, gy_access, mag_access, phase_access));
-
-    // Both outputs take their valid region from Gx
-    mag_access.set_valid_region(win, _gx->info()->valid_region());
-    phase_access.set_valid_region(win, _gx->info()->valid_region());
-
-    INEKernel::configure(win);
-}
-
-// Applies the routine chosen in configure() at every step of the window
-void NEGradientKernel::run(const Window &window, const ThreadInfo &info)
-{
-    ARM_COMPUTE_UNUSED(info);
-    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
-    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
-    ARM_COMPUTE_ERROR_ON(_func == nullptr);
-
-    // One iterator per tensor, all advanced together by execute_window_loop
-    Iterator gx_it(_gx, window);
-    Iterator gy_it(_gy, window);
-    Iterator mag_it(_magnitude, window);
-    Iterator phase_it(_phase, window);
-
-    execute_window_loop(window, [&](const Coordinates &)
-    {
-        (*_func)(gx_it.ptr(), gy_it.ptr(), mag_it.ptr(), phase_it.ptr());
-    },
-    gx_it, gy_it, mag_it, phase_it);
-}
-
-NEEdgeNonMaxSuppressionKernel::~NEEdgeNonMaxSuppressionKernel() = default;
-NEEdgeNonMaxSuppressionKernel::NEEdgeNonMaxSuppressionKernel()
-    : _func(nullptr), _magnitude(nullptr), _phase(nullptr), _output(nullptr), _lower_thr(0), _upper_thr(0)
-{
-}
-
-// The kernel uses a border of one element
-BorderSize NEEdgeNonMaxSuppressionKernel::border_size() const
-{
-    constexpr unsigned int border_width = 1;
-    return BorderSize(border_width);
-}
-
-// Validates the tensors, selects the suppression routine for the magnitude type, stores the
-// thresholds and sets up the execution window
-void NEEdgeNonMaxSuppressionKernel::configure(const ITensor *magnitude, const ITensor *phase, ITensor *output,
-                                              int32_t upper_thr, int32_t lower_thr, bool border_undefined)
-{
-    ARM_COMPUTE_ERROR_ON_NULLPTR(magnitude, phase, output);
-
-    // An output without a shape takes the shape of the magnitude
-    set_shape_if_empty(*output->info(), magnitude->info()->tensor_shape());
-
-    // Phase and output default to U8 when no format has been set
-    set_format_if_unknown(*phase->info(), Format::U8);
-    set_format_if_unknown(*output->info(), Format::U8);
-
-    ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(magnitude, phase, output);
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(magnitude, 1, DataType::U16, DataType::U32);
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(phase, 1, DataType::U8);
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8);
-    ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(phase, output);
-
-    _magnitude = magnitude;
-    _phase     = phase;
-    _output    = output;
-
-    // Pick the routine that matches the magnitude element type
-    const DataType magnitude_type = _magnitude->info()->data_type();
-
-    if(magnitude_type == DataType::U16)
-    {
-        _func = &non_max_suppression_U16_U8_U8;
-    }
-    else if(magnitude_type == DataType::U32)
-    {
-        _func = &non_max_suppression_U32_U8_U8;
-    }
-    else
-    {
-        ARM_COMPUTE_ERROR("Unsupported data type!");
-    }
-
-    // Set thresholds
-    _lower_thr = lower_thr;
-    _upper_thr = upper_thr;
-
-    constexpr unsigned int elems_per_step = 8;
-    constexpr unsigned int elems_read     = 10;
-    constexpr unsigned int rows_read      = 3;
-
-    const BorderSize border = border_size();
-
-    // Configure kernel window
-    Window win = calculate_max_window(*_magnitude->info(), Steps(elems_per_step), border_undefined, border);
-
-    // The magnitude access starts one border to the left of and above the current position
-    AccessWindowRectangle  mag_access(_magnitude->info(), -border.left, -border.top, elems_read, rows_read);
-    AccessWindowHorizontal phase_access(_phase->info(), 0, elems_per_step);
-    AccessWindowHorizontal output_access(_output->info(), 0, elems_per_step);
-
-    update_window_and_padding(win, mag_access, phase_access, output_access);
-
-    output_access.set_valid_region(win, _magnitude->info()->valid_region(), border_undefined, border);
-
-    INEKernel::configure(win);
-}
-
-// Applies the routine chosen in configure() at every step of the window
-void NEEdgeNonMaxSuppressionKernel::run(const Window &window, const ThreadInfo &info)
-{
-    ARM_COMPUTE_UNUSED(info);
-    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
-    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
-    ARM_COMPUTE_ERROR_ON(_func == nullptr);
-
-    Iterator mag_it(_magnitude, window);
-    Iterator phase_it(_phase, window);
-    Iterator out_it(_output, window);
-
-    // Row stride of the magnitude tensor, converted from bytes to elements
-    const size_t mag_stride_bytes = _magnitude->info()->strides_in_bytes()[1];
-    const size_t mag_stride_elems = mag_stride_bytes / data_size_from_type(_magnitude->info()->data_type());
-
-    execute_window_loop(window, [&](const Coordinates &)
-    {
-        // Thresholds are passed lower first, then upper
-        (*_func)(mag_it.ptr(), phase_it.ptr(), out_it.ptr(), mag_stride_elems, _lower_thr, _upper_thr);
-    },
-    mag_it, phase_it, out_it);
-}
-
-NEEdgeTraceKernel::~NEEdgeTraceKernel() = default;
-NEEdgeTraceKernel::NEEdgeTraceKernel()
-    : _input(nullptr), _output(nullptr)
-{
-}
-
-// The kernel uses a border of one element
-BorderSize NEEdgeTraceKernel::border_size() const
-{
-    constexpr unsigned int border_width = 1;
-    return BorderSize(border_width);
-}
-
-// Reports this kernel as not parallelisable
-bool NEEdgeTraceKernel::is_parallelisable() const
-{
-    constexpr bool can_run_in_parallel = false;
-    return can_run_in_parallel;
-}
-
-// Validates the tensors and sets up a one-element-per-step window with static access regions
-void NEEdgeTraceKernel::configure(ITensor *input, ITensor *output)
-{
-    ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
-
-    // An output without a shape takes the shape of the input
-    set_shape_if_empty(*output->info(), input->info()->tensor_shape());
-
-    // Both tensors default to U8 when no format has been set
-    set_format_if_unknown(*input->info(), Format::U8);
-    set_format_if_unknown(*output->info(), Format::U8);
-
-    ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(input, output);
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8);
-    ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
-
-    _input  = input;
-    _output = output;
-
-    constexpr unsigned int elems_per_step = 1;
-
-    // Configure kernel window
-    Window win = calculate_max_window(*_input->info(), Steps(elems_per_step));
-
-    const ValidRegion &in_region  = input->info()->valid_region();
-    const ValidRegion &out_region = output->info()->valid_region();
-    const BorderSize   border     = border_size();
-
-    // Reads can occur within the valid region of the input + border
-    const auto in_start_x = in_region.anchor[0] - border.left;
-    const auto in_start_y = in_region.anchor[1] - border.top;
-    const auto in_end_x   = in_region.anchor[0] + in_region.shape[0] + border.right;
-    const auto in_end_y   = in_region.anchor[1] + in_region.shape[1] + border.bottom;
-
-    AccessWindowStatic input_access(input->info(), in_start_x, in_start_y, in_end_x, in_end_y);
-
-    // Writes can occur within the valid region of the output + border
-    const auto out_start_x = out_region.anchor[0] - border.left;
-    const auto out_start_y = out_region.anchor[1] - border.top;
-    const auto out_end_x   = out_region.anchor[0] + out_region.shape[0] + border.right;
-    const auto out_end_y   = out_region.anchor[1] + out_region.shape[1] + border.bottom;
-
-    AccessWindowStatic output_access(output->info(), out_start_x, out_start_y, out_end_x, out_end_y);
-
-    update_window_and_padding(win, input_access, output_access);
-
-    // The output takes its valid region from the input
-    output_access.set_valid_region(win, _input->info()->valid_region());
-
-    INEKernel::configure(win);
-}
-
-// Calls edge_trace_U8_U8 at every position of the window
-void NEEdgeTraceKernel::run(const Window &window, const ThreadInfo &info)
-{
-    ARM_COMPUTE_UNUSED(info);
-    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
-    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
-
-    Iterator in_it(_input, window);
-    Iterator out_it(_output, window);
-
-    // Row strides in bytes, taken from dimension 1 of each tensor
-    const size_t in_row_stride  = _input->info()->strides_in_bytes()[1];
-    const size_t out_row_stride = _output->info()->strides_in_bytes()[1];
-
-    execute_window_loop(window, [&](const Coordinates &)
-    {
-        edge_trace_U8_U8(in_it.ptr(), out_it.ptr(), in_row_stride, out_row_stride);
-    },
-    in_it, out_it);
-}
-} // namespace arm_compute
\ No newline at end of file
diff --git a/src/core/NEON/kernels/NECannyEdgeKernel.h b/src/core/NEON/kernels/NECannyEdgeKernel.h
deleted file mode 100644
index f1d24410f7..0000000000
--- a/src/core/NEON/kernels/NECannyEdgeKernel.h
+++ /dev/null
@@ -1,189 +0,0 @@
-/*
- * Copyright (c) 2016-2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_NECANNYEDGEKERNEL_H
-#define ARM_COMPUTE_NECANNYEDGEKERNEL_H
-
-#include "src/core/NEON/INEKernel.h"
-
-#include <cstdint>
-
-namespace arm_compute
-{
-class ITensor;
-
-/** Computes magnitude and quantised phase from input gradients.
- *
- * The routine that is run is selected in configure() from the data type of Gx and the requested norm.
- */
-class NEGradientKernel : public INEKernel
-{
-public:
-    /** Name of the kernel
-     *
-     * @return The string "NEGradientKernel"
-     */
-    const char *name() const override
-    {
-        return "NEGradientKernel";
-    }
-    /** Default constructor */
-    NEGradientKernel();
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    NEGradientKernel(const NEGradientKernel &) = delete;
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    NEGradientKernel &operator=(const NEGradientKernel &) = delete;
-    /** Allow instances of this class to be moved */
-    NEGradientKernel(NEGradientKernel &&) = default;
-    /** Allow instances of this class to be moved */
-    NEGradientKernel &operator=(NEGradientKernel &&) = default;
-    /** Default destructor */
-    ~NEGradientKernel();
-
-    /** Initialise the kernel's sources and destinations.
-     *
-     * @note gx, gy and magnitude must all have the same element size (either 16 or 32 bits)
-     *
-     * @param[in]  gx        Source tensor - Gx component. Data type supported: S16/S32.
-     * @param[in]  gy        Source tensor - Gy component. Data type supported: same as @p gx.
-     * @param[out] magnitude Destination tensor - Magnitude. Data type supported: U16 (if the data type of @p gx is S16) / U32 (if the data type of @p gx is S32).
-     * @param[out] phase     Destination tensor - Quantized phase. Data type supported: U8.
-     * @param[in]  norm_type Normalization type. If 1, L1-Norm otherwise L2-Norm
-     */
-    virtual void configure(const ITensor *gx, const ITensor *gy, ITensor *magnitude, ITensor *phase, int32_t norm_type);
-
-    // Inherited methods overridden:
-    void run(const Window &window, const ThreadInfo &info) override;
-
-protected:
-    /** Common signature for all the specialised gradient functions
-     *
-     * @param[in]  gx_ptr        Pointer to the first input tensor.
-     * @param[in]  gy_ptr        Pointer to the second input tensor.
-     * @param[out] magnitude_ptr Pointer to the first output tensor
-     * @param[out] phase_ptr     Pointer to the second output tensor
-     */
-    using GradientFunction = void(const void *__restrict gx_ptr, const void *__restrict gy_ptr, void *__restrict magnitude_ptr, void *__restrict phase_ptr);
-
-    GradientFunction *_func;      /**< Gradient function to use for the particular tensor types passed to configure() */
-    const ITensor    *_gx;        /**< Source tensor - Gx component */
-    const ITensor    *_gy;        /**< Source tensor - Gy component */
-    ITensor          *_magnitude; /**< Destination tensor - Magnitude */
-    ITensor          *_phase;     /**< Destination tensor - Quantized phase */
-};
-
-/** Neon kernel to perform Non-Maxima suppression for Canny Edge.
- *
- * @note This kernel is meant to be used alongside CannyEdge and performs a non-maxima suppression using magnitude and phase of input
- *       to characterize points as possible edges. Thus, at the end, each point will be set to EDGE, NO_EDGE or MAYBE.
- *
- * @note Hysteresis is computed in @ref NEEdgeTraceKernel
- */
-class NEEdgeNonMaxSuppressionKernel : public INEKernel
-{
-public:
-    /** Name of the kernel
-     *
-     * @return The string "NEEdgeNonMaxSuppressionKernel"
-     */
-    const char *name() const override
-    {
-        return "NEEdgeNonMaxSuppressionKernel";
-    }
-    /** Default constructor */
-    NEEdgeNonMaxSuppressionKernel();
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    NEEdgeNonMaxSuppressionKernel(const NEEdgeNonMaxSuppressionKernel &) = delete;
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    NEEdgeNonMaxSuppressionKernel &operator=(const NEEdgeNonMaxSuppressionKernel &) = delete;
-    /** Allow instances of this class to be moved */
-    NEEdgeNonMaxSuppressionKernel(NEEdgeNonMaxSuppressionKernel &&) = default;
-    /** Allow instances of this class to be moved */
-    NEEdgeNonMaxSuppressionKernel &operator=(NEEdgeNonMaxSuppressionKernel &&) = default;
-    /** Default destructor */
-    ~NEEdgeNonMaxSuppressionKernel();
-
-    /** Initialise the kernel's sources, destination and border mode.
-     *
-     * @note The upper threshold comes before the lower threshold in this signature.
-     *
-     * @param[in]  magnitude        Source tensor - Magnitude. Data type supported: U16/U32.
-     * @param[in]  phase            Source tensor - Quantized phase. Data type supported: U8.
-     * @param[out] output           Output tensor. Data type supported: U8. It will be filled with 0 for "no edge", 127 for "maybe", 255 for "edge"
-     * @param[in]  upper_thr        Upper threshold used for the hysteresis
-     * @param[in]  lower_thr        Lower threshold used for the hysteresis
-     * @param[in]  border_undefined True if the border mode is undefined. False if it's replicate or constant.
-     */
-    void configure(const ITensor *magnitude, const ITensor *phase, ITensor *output, int32_t upper_thr, int32_t lower_thr, bool border_undefined);
-
-    // Inherited methods overridden:
-    void run(const Window &window, const ThreadInfo &info) override;
-    BorderSize border_size() const override;
-
-private:
-    /** Common signature for all the specialised non-maxima suppression functions
-     *
-     * @note The thresholds are passed lower first, then upper, which is the order used by run()
-     *       and by the specialised functions. The parameter names below follow that order.
-     *
-     * @param[in]  magnitude_ptr Pointer to the first input tensor.
-     * @param[in]  phase_ptr     Pointer to the second input tensor.
-     * @param[out] output_ptr    Pointer to the output tensor
-     * @param[in]  stride_mag    Stride of the magnitude tensor
-     * @param[in]  lower_thr     Lower threshold used for the hysteresis
-     * @param[in]  upper_thr     Upper threshold used for the hysteresis
-     */
-    using EdgeNonMaxSupprFunction = void(const void *__restrict magnitude_ptr, const void *__restrict phase_ptr, void *__restrict output_ptr, const uint32_t stride_mag, const int32_t lower_thr,
-                                         const int32_t upper_thr);
-
-    EdgeNonMaxSupprFunction *_func;      /**< Non-Maxima suppression function to use for the particular tensor types passed to configure() */
-    const ITensor           *_magnitude; /**< Source tensor - Magnitude */
-    const ITensor           *_phase;     /**< Source tensor - Quantized phase */
-    ITensor                 *_output;    /**< Destination tensor */
-    int32_t                  _lower_thr; /**< Lower threshold used for the hysteresis */
-    int32_t                  _upper_thr; /**< Upper threshold used for the hysteresis */
-};
-
-/** Neon kernel to perform Edge tracing
- *
- * @note The kernel reports itself as not parallelisable and uses a border of one element.
- */
-class NEEdgeTraceKernel : public INEKernel
-{
-public:
-    /** Name of the kernel
-     *
-     * @return The string "NEEdgeTraceKernel"
-     */
-    const char *name() const override
-    {
-        return "NEEdgeTraceKernel";
-    }
-    /** Default constructor */
-    NEEdgeTraceKernel();
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    NEEdgeTraceKernel(const NEEdgeTraceKernel &) = delete;
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    NEEdgeTraceKernel &operator=(const NEEdgeTraceKernel &) = delete;
-    /** Allow instances of this class to be moved */
-    NEEdgeTraceKernel(NEEdgeTraceKernel &&) = default;
-    /** Allow instances of this class to be moved */
-    NEEdgeTraceKernel &operator=(NEEdgeTraceKernel &&) = default;
-    /** Default destructor */
-    ~NEEdgeTraceKernel();
-
-    /** Initialise the kernel's source and destination.
-     *
-     * @note The input is modified while tracing: "maybe" pixels that are reached from an edge are overwritten with "edge".
-     *
-     * @param[in,out] input  Source tensor. Data type supported: U8. Must contain 0 for "no edge", 127 for "maybe", 255 for "edge"
-     * @param[in,out] output Destination tensor. Data type supported: U8. Must be initialized to 0 (No edge).
-     */
-    void configure(ITensor *input, ITensor *output);
-
-    // Inherited methods overridden:
-    void run(const Window &window, const ThreadInfo &info) override;
-    BorderSize border_size() const override;
-    bool       is_parallelisable() const override;
-
-private:
-    ITensor *_input;  /**< Source tensor */
-    ITensor *_output; /**< Destination tensor */
-};
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_NECANNYEDGEKERNEL_H */
diff --git a/src/core/NEON/kernels/NEChannelCombineKernel.cpp b/src/core/NEON/kernels/NEChannelCombineKernel.cpp
deleted file mode 100644
index 6bfd4c5bda..0000000000
--- a/src/core/NEON/kernels/NEChannelCombineKernel.cpp
+++ /dev/null
@@ -1,456 +0,0 @@
-/*
- * Copyright (c) 2016-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "src/core/NEON/kernels/NEChannelCombineKernel.h"
-
-#include "arm_compute/core/Error.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/IAccessWindow.h"
-#include "arm_compute/core/IMultiImage.h"
-#include "arm_compute/core/ITensor.h"
-#include "arm_compute/core/MultiImageInfo.h"
-#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Types.h"
-#include "arm_compute/core/Validate.h"
-#include "arm_compute/core/Window.h"
-#include "src/core/helpers/AutoConfiguration.h"
-#include "src/core/helpers/WindowHelpers.h"
-
-#include <arm_neon.h>
-
-using namespace arm_compute;
-
-namespace arm_compute
-{
-class Coordinates;
-} // namespace arm_compute
-
-// Starts with no function and no tensors, unit sub-sampling on every plane,
-// 8 elements per iteration and the parallelisable flag set
-NEChannelCombineKernel::NEChannelCombineKernel()
-    : _func(nullptr),
-      _planes{ { nullptr } },
-      _output(nullptr),
-      _output_multi(nullptr),
-      _x_subsampling{ { 1, 1, 1 } },
-      _y_subsampling{ { 1, 1, 1 } },
-      _num_elems_processed_per_iteration(8),
-      _is_parallelizable(true)
-{
-}
-
-// Configures the kernel for a single-tensor output (RGB888, RGBA8888, UYVY422 or YUYV422):
-// validates the planes, selects the combine routine and sets up the execution window
-void NEChannelCombineKernel::configure(const ITensor *plane0, const ITensor *plane1, const ITensor *plane2, const ITensor *plane3, ITensor *output)
-{
-    // plane3 is not part of this check; it is validated below for RGBA8888 only
-    ARM_COMPUTE_ERROR_ON_NULLPTR(plane0, plane1, plane2, output);
-    ARM_COMPUTE_ERROR_ON(plane0 == output);
-    ARM_COMPUTE_ERROR_ON(plane1 == output);
-    ARM_COMPUTE_ERROR_ON(plane2 == output);
-
-    ARM_COMPUTE_ERROR_ON_FORMAT_NOT_IN(plane0, Format::U8);
-    ARM_COMPUTE_ERROR_ON_FORMAT_NOT_IN(plane1, Format::U8);
-    ARM_COMPUTE_ERROR_ON_FORMAT_NOT_IN(plane2, Format::U8);
-    ARM_COMPUTE_ERROR_ON_FORMAT_NOT_IN(output, Format::RGB888, Format::RGBA8888, Format::UYVY422, Format::YUYV422);
-
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(plane0, 1, DataType::U8);
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(plane1, 1, DataType::U8);
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(plane2, 1, DataType::U8);
-
-    const Format output_format = output->info()->format();
-    const bool   is_yuv422     = (Format::YUYV422 == output_format) || (Format::UYVY422 == output_format);
-    const bool   is_rgba       = (Format::RGBA8888 == output_format);
-
-    // Check if horizontal dimension of Y plane is even and validate horizontal sub-sampling dimensions for U and V planes
-    if(is_yuv422)
-    {
-        // Validate Y plane of input and output
-        ARM_COMPUTE_ERROR_ON_TENSORS_NOT_EVEN(output_format, plane0, output);
-
-        // Validate U and V plane of the input
-        ARM_COMPUTE_ERROR_ON_TENSORS_NOT_SUBSAMPLED(output_format, plane0->info()->tensor_shape(), plane1, plane2);
-    }
-
-    _planes[0] = plane0;
-    _planes[1] = plane1;
-    _planes[2] = plane2;
-    _planes[3] = nullptr;
-
-    // Validate the last input tensor only for RGBA format
-    if(is_rgba)
-    {
-        ARM_COMPUTE_ERROR_ON_NULLPTR(plane3);
-        ARM_COMPUTE_ERROR_ON_TENSOR_NOT_2D(plane3);
-
-        ARM_COMPUTE_ERROR_ON_FORMAT_NOT_IN(plane3, Format::U8);
-        ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(plane3, 1, DataType::U8);
-
-        _planes[3] = plane3;
-    }
-
-    _output       = output;
-    _output_multi = nullptr;
-
-    // Half the processed elements for U and V channels due to horizontal sub-sampling of 2
-    if(is_yuv422)
-    {
-        _x_subsampling[1] = 2;
-        _x_subsampling[2] = 2;
-    }
-
-    // Defaults; the 4:2:2 formats below raise the element count to 16
-    _num_elems_processed_per_iteration = 8;
-    _is_parallelizable                 = true;
-
-    // Select function and number of elements to process given the output format
-    switch(output_format)
-    {
-        case Format::RGB888:
-            _func = &NEChannelCombineKernel::combine_3C;
-            break;
-        case Format::RGBA8888:
-            _func = &NEChannelCombineKernel::combine_4C;
-            break;
-        case Format::UYVY422:
-            _num_elems_processed_per_iteration = 16;
-            _func                              = &NEChannelCombineKernel::combine_YUV_1p<true>;
-            break;
-        case Format::YUYV422:
-            _num_elems_processed_per_iteration = 16;
-            _func                              = &NEChannelCombineKernel::combine_YUV_1p<false>;
-            break;
-        default:
-            ARM_COMPUTE_ERROR("Not supported format.");
-            break;
-    }
-
-    Window win = calculate_max_window(*plane0->info(), Steps(_num_elems_processed_per_iteration));
-
-    // Element count used for the three mandatory planes: the step divided by the sub-sampling factor of plane 1
-    const auto plane_elems = _num_elems_processed_per_iteration / _x_subsampling[1];
-
-    AccessWindowHorizontal output_access(output->info(), 0, _num_elems_processed_per_iteration);
-    AccessWindowHorizontal plane0_access(plane0->info(), 0, plane_elems, 1.f / _x_subsampling[0]);
-    AccessWindowHorizontal plane1_access(plane1->info(), 0, plane_elems, 1.f / _x_subsampling[1]);
-    AccessWindowHorizontal plane2_access(plane2->info(), 0, plane_elems, 1.f / _x_subsampling[2]);
-    // plane3 may be absent, in which case its access window is built on a null info pointer
-    AccessWindowHorizontal plane3_access(plane3 == nullptr ? nullptr : plane3->info(), 0, _num_elems_processed_per_iteration);
-
-    update_window_and_padding(
-        win,
-        plane0_access,
-        plane1_access,
-        plane2_access,
-        plane3_access,
-        output_access);
-
-    // The output valid region is anchored at the intersection of the valid regions of all planes in use
-    ValidRegion valid_region = intersect_valid_regions(plane0->info()->valid_region(),
-                                                       plane1->info()->valid_region(),
-                                                       plane2->info()->valid_region());
-
-    if(plane3 != nullptr)
-    {
-        valid_region = intersect_valid_regions(plane3->info()->valid_region(), valid_region);
-    }
-
-    output_access.set_valid_region(win, ValidRegion(valid_region.anchor, output->info()->tensor_shape()));
-
-    INEKernel::configure(win);
-}
-
-void NEChannelCombineKernel::configure(const IImage *plane0, const IImage *plane1, const IImage *plane2, IMultiImage *output)
-{
-    ARM_COMPUTE_ERROR_ON_NULLPTR(plane0, plane1, plane2, output);
-    ARM_COMPUTE_ERROR_ON_TENSOR_NOT_2D(plane0);
-    ARM_COMPUTE_ERROR_ON_TENSOR_NOT_2D(plane1);
-    ARM_COMPUTE_ERROR_ON_TENSOR_NOT_2D(plane2);
-
-    ARM_COMPUTE_ERROR_ON_FORMAT_NOT_IN(plane0, Format::U8);
-    ARM_COMPUTE_ERROR_ON_FORMAT_NOT_IN(plane1, Format::U8);
-    ARM_COMPUTE_ERROR_ON_FORMAT_NOT_IN(plane2, Format::U8);
-    ARM_COMPUTE_ERROR_ON_FORMAT_NOT_IN(output, Format::NV12, Format::NV21, Format::IYUV, Format::YUV444);
-
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(plane0, 1, DataType::U8);
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(plane1, 1, DataType::U8);
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(plane2, 1, DataType::U8);
-
-    const Format output_format = output->info()->format();
-
-    // Validate shape of Y plane to be even and shape of sub-sampling dimensions for U and V planes
-    // Perform validation only for formats which require sub-sampling.
-    if(Format::YUV444 != output_format)
-    {
-        // Validate Y plane of input and output
-        ARM_COMPUTE_ERROR_ON_TENSORS_NOT_EVEN(output_format, plane0, output->plane(0));
-
-        // Validate U and V plane of the input
-        ARM_COMPUTE_ERROR_ON_TENSORS_NOT_SUBSAMPLED(output_format, plane0->info()->tensor_shape(), plane1, plane2);
-
-        // Validate second plane U (NV12 and NV21 have a UV88 combined plane while IYUV has only the U plane)
-        // MultiImage generates the correct tensor shape but also check in case the tensor shape of planes was changed to a wrong size
-        ARM_COMPUTE_ERROR_ON_TENSORS_NOT_SUBSAMPLED(output_format, plane0->info()->tensor_shape(), output->plane(1));
-
-        // Validate the last plane V of format IYUV
-        if(Format::IYUV == output_format)
-        {
-            // Validate Y plane of the output
-            ARM_COMPUTE_ERROR_ON_TENSORS_NOT_SUBSAMPLED(output_format, plane0->info()->tensor_shape(), output->plane(2));
-        }
-    }
-
-    _planes[0]    = plane0;
-    _planes[1]    = plane1;
-    _planes[2]    = plane2;
-    _planes[3]    = nullptr;
-    _output       = nullptr;
-    _output_multi = output;
-
-    bool         has_two_planes           = false;
-    unsigned int num_elems_written_plane1 = 8;
-
-    _num_elems_processed_per_iteration = 8;
-    _is_parallelizable                 = true;
-
-    switch(output_format)
-    {
-        case Format::NV12:
-        case Format::NV21:
-            _x_subsampling           = { { 1, 2, 2 } };
-            _y_subsampling           = { { 1, 2, 2 } };
-            _func                    = &NEChannelCombineKernel::combine_YUV_2p;
-            has_two_planes           = true;
-            num_elems_written_plane1 = 16;
-            break;
-        case Format::IYUV:
-            _is_parallelizable = false;
-            _x_subsampling     = { { 1, 2, 2 } };
-            _y_subsampling     = { { 1, 2, 2 } };
-            _func              = &NEChannelCombineKernel::combine_YUV_3p;
-            break;
-        case Format::YUV444:
-            _is_parallelizable = false;
-            _x_subsampling     = { { 1, 1, 1 } };
-            _y_subsampling     = { { 1, 1, 1 } };
-            _func              = &NEChannelCombineKernel::combine_YUV_3p;
-            break;
-        default:
-            ARM_COMPUTE_ERROR("Not supported format.");
-            break;
-    }
-
-    const unsigned int y_step = *std::max_element(_y_subsampling.begin(), _y_subsampling.end());
-
-    Window                win = calculate_max_window(*plane0->info(), Steps(_num_elems_processed_per_iteration, y_step));
-    AccessWindowRectangle output_plane0_access(output->plane(0)->info(), 0, 0, _num_elems_processed_per_iteration, 1, 1.f, 1.f / _y_subsampling[0]);
-    AccessWindowRectangle output_plane1_access(output->plane(1)->info(), 0, 0, num_elems_written_plane1, 1, 1.f / _x_subsampling[1], 1.f / _y_subsampling[1]);
-    AccessWindowRectangle output_plane2_access(has_two_planes ? nullptr : output->plane(2)->info(), 0, 0, _num_elems_processed_per_iteration, 1, 1.f / _x_subsampling[2], 1.f / _y_subsampling[2]);
-
-    update_window_and_padding(win,
-                              AccessWindowHorizontal(plane0->info(), 0, _num_elems_processed_per_iteration),
-                              AccessWindowRectangle(plane1->info(), 0, 0, _num_elems_processed_per_iteration, 1, 1.f / _x_subsampling[1], 1.f / _y_subsampling[1]),
-                              AccessWindowRectangle(plane2->info(), 0, 0, _num_elems_processed_per_iteration, 1, 1.f / _x_subsampling[2], 1.f / _y_subsampling[2]),
-                              output_plane0_access,
-                              output_plane1_access,
-                              output_plane2_access);
-
-    ValidRegion plane0_valid_region  = plane0->info()->valid_region();
-    ValidRegion output_plane1_region = has_two_planes ? intersect_valid_regions(plane1->info()->valid_region(), plane2->info()->valid_region()) : plane2->info()->valid_region();
-
-    output_plane0_access.set_valid_region(win, ValidRegion(plane0_valid_region.anchor, output->plane(0)->info()->tensor_shape()));
-    output_plane1_access.set_valid_region(win, ValidRegion(output_plane1_region.anchor, output->plane(1)->info()->tensor_shape()));
-    output_plane2_access.set_valid_region(win, ValidRegion(plane2->info()->valid_region().anchor, output->plane(2)->info()->tensor_shape()));
-
-    INEKernel::configure(win);
-}
-
-bool NEChannelCombineKernel::is_parallelisable() const
-{
-    return _is_parallelizable;
-}
-
-void NEChannelCombineKernel::run(const Window &window, const ThreadInfo &info)
-{
-    ARM_COMPUTE_UNUSED(info);
-    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
-    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
-    ARM_COMPUTE_ERROR_ON(_func == nullptr);
-
-    (this->*_func)(window);
-}
-
-void NEChannelCombineKernel::combine_3C(const Window &win)
-{
-    Iterator p0(_planes[0], win);
-    Iterator p1(_planes[1], win);
-    Iterator p2(_planes[2], win);
-    Iterator out(_output, win);
-
-    execute_window_loop(win, [&](const Coordinates &)
-    {
-        const auto p0_ptr  = static_cast<uint8_t *>(p0.ptr());
-        const auto p1_ptr  = static_cast<uint8_t *>(p1.ptr());
-        const auto p2_ptr  = static_cast<uint8_t *>(p2.ptr());
-        const auto out_ptr = static_cast<uint8_t *>(out.ptr());
-
-        const uint8x8x3_t pixels =
-        {
-            {
-                vld1_u8(p0_ptr),
-                vld1_u8(p1_ptr),
-                vld1_u8(p2_ptr)
-            }
-        };
-
-        vst3_u8(out_ptr, pixels);
-    },
-    p0, p1, p2, out);
-}
-
-void NEChannelCombineKernel::combine_4C(const Window &win)
-{
-    Iterator p0(_planes[0], win);
-    Iterator p1(_planes[1], win);
-    Iterator p2(_planes[2], win);
-    Iterator p3(_planes[3], win);
-    Iterator out(_output, win);
-
-    execute_window_loop(win, [&](const Coordinates &)
-    {
-        const auto p0_ptr  = static_cast<uint8_t *>(p0.ptr());
-        const auto p1_ptr  = static_cast<uint8_t *>(p1.ptr());
-        const auto p2_ptr  = static_cast<uint8_t *>(p2.ptr());
-        const auto p3_ptr  = static_cast<uint8_t *>(p3.ptr());
-        const auto out_ptr = static_cast<uint8_t *>(out.ptr());
-
-        const uint8x8x4_t pixels =
-        {
-            {
-                vld1_u8(p0_ptr),
-                vld1_u8(p1_ptr),
-                vld1_u8(p2_ptr),
-                vld1_u8(p3_ptr)
-            }
-        };
-
-        vst4_u8(out_ptr, pixels);
-    },
-    p0, p1, p2, p3, out);
-}
-
-template <bool is_uyvy>
-void NEChannelCombineKernel::combine_YUV_1p(const Window &win)
-{
-    // Create sub-sampled uv window and init uv planes
-    Window win_uv(win);
-    win_uv.set_dimension_step(Window::DimX, win.x().step() / _x_subsampling[1]);
-    win_uv.validate();
-
-    Iterator p0(_planes[0], win);
-    Iterator p1(_planes[1], win_uv);
-    Iterator p2(_planes[2], win_uv);
-    Iterator out(_output, win);
-
-    constexpr auto shift = is_uyvy ? 1 : 0;
-
-    execute_window_loop(win, [&](const Coordinates &)
-    {
-        const auto p0_ptr  = static_cast<uint8_t *>(p0.ptr());
-        const auto p1_ptr  = static_cast<uint8_t *>(p1.ptr());
-        const auto p2_ptr  = static_cast<uint8_t *>(p2.ptr());
-        const auto out_ptr = static_cast<uint8_t *>(out.ptr());
-
-        const uint8x8x2_t pixels_y = vld2_u8(p0_ptr);
-        const uint8x8x2_t pixels_uv =
-        {
-            {
-                vld1_u8(p1_ptr),
-                vld1_u8(p2_ptr)
-            }
-        };
-
-        uint8x8x4_t pixels{ {} };
-        pixels.val[0 + shift] = pixels_y.val[0];
-        pixels.val[1 - shift] = pixels_uv.val[0];
-        pixels.val[2 + shift] = pixels_y.val[1];
-        pixels.val[3 - shift] = pixels_uv.val[1];
-
-        vst4_u8(out_ptr, pixels);
-    },
-    p0, p1, p2, out);
-}
-
-void NEChannelCombineKernel::combine_YUV_2p(const Window &win)
-{
-    ARM_COMPUTE_ERROR_ON(win.x().start() % _x_subsampling[1]);
-    ARM_COMPUTE_ERROR_ON(win.y().start() % _y_subsampling[1]);
-
-    // Copy first plane
-    copy_plane(win, 0);
-
-    // Update UV window
-    Window uv_win(win);
-    uv_win.set(Window::DimX, Window::Dimension(uv_win.x().start() / _x_subsampling[1], uv_win.x().end() / _x_subsampling[1], uv_win.x().step() / _x_subsampling[1]));
-    uv_win.set(Window::DimY, Window::Dimension(uv_win.y().start() / _y_subsampling[1], uv_win.y().end() / _y_subsampling[1], 1));
-    uv_win.validate();
-
-    // Update output win
-    Window out_win(win);
-    out_win.set(Window::DimX, Window::Dimension(out_win.x().start(), out_win.x().end(), out_win.x().step() / _x_subsampling[1]));
-    out_win.set(Window::DimY, Window::Dimension(out_win.y().start() / _y_subsampling[1], out_win.y().end() / _y_subsampling[1], 1));
-    out_win.validate();
-
-    // Construct second plane
-    const int shift = (Format::NV12 == _output_multi->info()->format()) ? 0 : 1;
-    Iterator  p1(_planes[1 + shift], uv_win);
-    Iterator  p2(_planes[2 - shift], uv_win);
-    Iterator  out(_output_multi->plane(1), out_win);
-
-    // Increase step size after iterator is created to calculate stride correctly for multi channel format
-    out_win.set_dimension_step(Window::DimX, out_win.x().step() * _x_subsampling[1]);
-
-    execute_window_loop(out_win, [&](const Coordinates &)
-    {
-        const uint8x8x2_t pixels =
-        {
-            {
-                vld1_u8(p1.ptr()),
-                vld1_u8(p2.ptr())
-            }
-        };
-
-        vst2_u8(out.ptr(), pixels);
-    },
-    p1, p2, out);
-}
-
-void NEChannelCombineKernel::combine_YUV_3p(const Window &win)
-{
-    copy_plane(win, 0);
-    copy_plane(win, 1);
-    copy_plane(win, 2);
-}
-
-void NEChannelCombineKernel::copy_plane(const Window &win, uint32_t plane_id)
-{
-    ARM_COMPUTE_ERROR_ON(win.x().start() % _x_subsampling[plane_id]);
-    ARM_COMPUTE_ERROR_ON(win.y().start() % _y_subsampling[plane_id]);
-
-    // Update window
-    Window tmp_win(win);
-    tmp_win.set(Window::DimX, Window::Dimension(tmp_win.x().start() / _x_subsampling[plane_id], tmp_win.x().end() / _x_subsampling[plane_id], tmp_win.x().step() / _x_subsampling[plane_id]));
-    tmp_win.set(Window::DimY, Window::Dimension(tmp_win.y().start() / _y_subsampling[plane_id], tmp_win.y().end() / _y_subsampling[plane_id], 1));
-
-    Iterator in(_planes[plane_id], tmp_win);
-    Iterator out(_output_multi->plane(plane_id), tmp_win);
-
-    execute_window_loop(tmp_win, [&](const Coordinates &)
-    {
-        const uint8x8_t pixels = vld1_u8(in.ptr());
-
-        vst1_u8(out.ptr(), pixels);
-    },
-    in, out);
-}
diff --git a/src/core/NEON/kernels/NEChannelCombineKernel.h b/src/core/NEON/kernels/NEChannelCombineKernel.h
deleted file mode 100644
index a3372be4d2..0000000000
--- a/src/core/NEON/kernels/NEChannelCombineKernel.h
+++ /dev/null
@@ -1,129 +0,0 @@
-/*
- * Copyright (c) 2016-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_NECHANNELCOMBINEKERNEL_H
-#define ARM_COMPUTE_NECHANNELCOMBINEKERNEL_H
-
-#include "src/core/NEON/INEKernel.h"
-
-#include <array>
-#include <cstdint>
-
-namespace arm_compute
-{
-class IMultiImage;
-class ITensor;
-using IImage = ITensor;
-
-/** Interface for the channel combine kernel */
-class NEChannelCombineKernel : public INEKernel
-{
-public:
-    const char *name() const override
-    {
-        return "NEChannelCombineKernel";
-    }
-    /** Default constructor */
-    NEChannelCombineKernel();
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    NEChannelCombineKernel(const NEChannelCombineKernel &) = delete;
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    NEChannelCombineKernel &operator=(const NEChannelCombineKernel &) = delete;
-    /** Allow instances of this class to be moved */
-    NEChannelCombineKernel(NEChannelCombineKernel &&) = default;
-    /** Allow instances of this class to be moved */
-    NEChannelCombineKernel &operator=(NEChannelCombineKernel &&) = default;
-    /** Default destructor */
-    ~NEChannelCombineKernel() = default;
-
-    /** Configure function's inputs and outputs.
-     *
-     * @param[in]  plane0 The 2D plane that forms channel 0. Data type supported: U8
-     * @param[in]  plane1 The 2D plane that forms channel 1. Data type supported: U8
-     * @param[in]  plane2 The 2D plane that forms channel 2. Data type supported: U8
-     * @param[in]  plane3 The 2D plane that forms channel 3. Data type supported: U8
-     * @param[out] output The single planar output tensor. Formats supported: RGB888/RGBA8888/UYVY422/YUYV422
-     */
-    void configure(const ITensor *plane0, const ITensor *plane1, const ITensor *plane2, const ITensor *plane3, ITensor *output);
-    /** Configure function's inputs and outputs.
-     *
-     * @param[in]  plane0 The 2D plane that forms channel 0. Data type supported: U8
-     * @param[in]  plane1 The 2D plane that forms channel 1. Data type supported: U8
-     * @param[in]  plane2 The 2D plane that forms channel 2. Data type supported: U8
-     * @param[out] output The multi planar output tensor. Formats supported: NV12/NV21/IYUV/YUV444
-     */
-    void configure(const IImage *plane0, const IImage *plane1, const IImage *plane2, IMultiImage *output);
-
-    // Inherited methods overridden:
-    void run(const Window &window, const ThreadInfo &info) override;
-    bool is_parallelisable() const override;
-
-private:
-    /** Combine 3 planes to form a three channel single plane tensor.
-     *
-     * @param[in] win Region on which to execute the kernel.
-     */
-    void combine_3C(const Window &win);
-    /** Combine 4 planes to form a four channel single plane tensor.
-     *
-     * @param[in] win Region on which to execute the kernel.
-     */
-    void combine_4C(const Window &win);
-    /** Combine 3 planes to form a single plane YUV tensor.
-     *
-     * @param[in] win Region on which to execute the kernel.
-     */
-    template <bool is_yuyv>
-    void combine_YUV_1p(const Window &win);
-    /** Combine 3 planes to form a two plane YUV tensor.
-     *
-     * @param[in] win Region on which to execute the kernel.
-     */
-    void combine_YUV_2p(const Window &win);
-    /** Combine 3 planes to form a three plane YUV tensor.
-     *
-     * @param[in] win Region on which to execute the kernel.
-     */
-    void combine_YUV_3p(const Window &win);
-    /** Copies a full plane to the output tensor.
-     *
-     * @param[in] win Region on which to execute the kernel.
-     */
-    void copy_plane(const Window &win, uint32_t plane_id);
-    /** Common signature for all the specialised ChannelCombine functions
-     *
-     * @param[in] window Region on which to execute the kernel.
-     */
-    using ChannelCombineFunction = void (NEChannelCombineKernel::*)(const Window &window);
-    /** ChannelCombine function to use for the particular tensor types passed to configure() */
-    ChannelCombineFunction _func;
-    std::array<const ITensor *, 4> _planes;
-    ITensor     *_output;
-    IMultiImage *_output_multi;
-    std::array<uint32_t, 3> _x_subsampling;
-    std::array<uint32_t, 3> _y_subsampling;
-    unsigned int _num_elems_processed_per_iteration;
-    bool         _is_parallelizable;
-};
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_NECHANNELCOMBINEKERNEL_H */
diff --git a/src/core/NEON/kernels/NEChannelExtractKernel.cpp b/src/core/NEON/kernels/NEChannelExtractKernel.cpp
deleted file mode 100644
index d0d1c6852f..0000000000
--- a/src/core/NEON/kernels/NEChannelExtractKernel.cpp
+++ /dev/null
@@ -1,269 +0,0 @@
-/*
- * Copyright (c) 2016-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "src/core/NEON/kernels/NEChannelExtractKernel.h"
-
-#include "arm_compute/core/Error.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/IAccessWindow.h"
-#include "arm_compute/core/IMultiImage.h"
-#include "arm_compute/core/ITensor.h"
-#include "arm_compute/core/MultiImageInfo.h"
-#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Types.h"
-#include "arm_compute/core/Validate.h"
-#include "arm_compute/core/Window.h"
-#include "src/core/NEON/INEKernel.h"
-#include "src/core/helpers/AutoConfiguration.h"
-#include "src/core/helpers/WindowHelpers.h"
-
-#include <arm_neon.h>
-
-using namespace arm_compute;
-
-namespace arm_compute
-{
-class Coordinates;
-} // namespace arm_compute
-
-NEChannelExtractKernel::NEChannelExtractKernel()
-    : _func(nullptr), _lut_index(0)
-{
-}
-
-void NEChannelExtractKernel::configure(const ITensor *input, Channel channel, ITensor *output)
-{
-    ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
-    ARM_COMPUTE_ERROR_ON(input == output);
-
-    set_format_if_unknown(*output->info(), Format::U8);
-
-    // Check if input tensor has a valid format
-    ARM_COMPUTE_ERROR_ON_FORMAT_NOT_IN(input, Format::RGB888, Format::RGBA8888, Format::UYVY422, Format::YUYV422);
-    ARM_COMPUTE_ERROR_ON_FORMAT_NOT_IN(output, Format::U8);
-
-    ARM_COMPUTE_ERROR_ON_TENSOR_NOT_2D(input);
-    ARM_COMPUTE_ERROR_ON_TENSOR_NOT_2D(output);
-
-    // Check if channel is valid for given format
-    const Format format = input->info()->format();
-    ARM_COMPUTE_ERROR_ON_CHANNEL_NOT_IN_KNOWN_FORMAT(format, channel);
-
-    unsigned int subsampling = 1;
-
-    if(format == Format::YUYV422 || format == Format::UYVY422)
-    {
-        // Check if the width of the tensor shape is even for formats with subsampled channels (UYVY422 and YUYV422)
-        ARM_COMPUTE_ERROR_ON_TENSORS_NOT_EVEN(format, input);
-
-        if(channel != Channel::Y)
-        {
-            subsampling = 2;
-        }
-    }
-
-    TensorShape output_shape = calculate_subsampled_shape(input->info()->tensor_shape(), format, channel);
-    set_shape_if_empty(*output->info(), output_shape);
-
-    ARM_COMPUTE_ERROR_ON_MISMATCHING_DIMENSIONS(output_shape, output->info()->tensor_shape());
-
-    _input     = input;
-    _output    = output;
-    _lut_index = channel_idx_from_format(format, channel);
-
-    unsigned int num_elems_processed_per_iteration = 16;
-
-    if(format == Format::YUYV422 || format == Format::UYVY422)
-    {
-        _func = &NEChannelExtractKernel::extract_1C_from_2C_img;
-
-        if(channel != Channel::Y) // Channel::U or Channel::V
-        {
-            num_elems_processed_per_iteration = 32;
-            _func                             = &NEChannelExtractKernel::extract_YUYV_uv;
-        }
-    }
-    else // Format::RGB888 or Format::RGBA8888
-    {
-        _func = &NEChannelExtractKernel::extract_1C_from_3C_img;
-
-        if(format == Format::RGBA8888)
-        {
-            _func = &NEChannelExtractKernel::extract_1C_from_4C_img;
-        }
-    }
-
-    Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration));
-
-    AccessWindowHorizontal input_access(input->info(), 0, num_elems_processed_per_iteration);
-    AccessWindowRectangle  output_access(output->info(), 0, 0, num_elems_processed_per_iteration, 1, 1.f / subsampling, 1.f / subsampling);
-    update_window_and_padding(win, input_access, output_access);
-
-    ValidRegion input_valid_region = input->info()->valid_region();
-    output_access.set_valid_region(win, ValidRegion(input_valid_region.anchor, output->info()->tensor_shape()));
-
-    INEKernel::configure(win);
-}
-
-void NEChannelExtractKernel::configure(const IMultiImage *input, Channel channel, IImage *output)
-{
-    ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
-    ARM_COMPUTE_ERROR_ON_TENSOR_NOT_2D(output);
-
-    set_format_if_unknown(*output->info(), Format::U8);
-
-    const Format format = input->info()->format();
-    ARM_COMPUTE_ERROR_ON_CHANNEL_NOT_IN_KNOWN_FORMAT(format, channel);
-
-    // Get input plane
-    const IImage *input_plane = input->plane(plane_idx_from_channel(format, channel));
-    ARM_COMPUTE_ERROR_ON_NULLPTR(input_plane);
-
-    if(Channel::Y == channel && format != Format::YUV444)
-    {
-        // Check if the width of the tensor shape is even for formats with subsampled channels (UYVY422 and YUYV422)
-        ARM_COMPUTE_ERROR_ON_TENSORS_NOT_EVEN(format, input_plane);
-    }
-
-    // Calculate 2x2 subsampled tensor shape
-    TensorShape output_shape = calculate_subsampled_shape(input->plane(0)->info()->tensor_shape(), format, channel);
-    set_shape_if_empty(*output->info(), output_shape);
-
-    ARM_COMPUTE_ERROR_ON_MISMATCHING_DIMENSIONS(output_shape, output->info()->tensor_shape());
-
-    // Check if input tensor has a valid format
-    ARM_COMPUTE_ERROR_ON_FORMAT_NOT_IN(input, Format::NV12, Format::NV21, Format::IYUV, Format::YUV444);
-    ARM_COMPUTE_ERROR_ON_FORMAT_NOT_IN(output, Format::U8);
-
-    _input     = input_plane;
-    _output    = output;
-    _lut_index = channel_idx_from_format(format, channel);
-
-    unsigned int num_elems_processed_per_iteration = 32;
-
-    _func = &NEChannelExtractKernel::copy_plane;
-
-    if((format == Format::NV12 || format == Format::NV21) && channel != Channel::Y)
-    {
-        num_elems_processed_per_iteration = 16;
-        _func                             = &NEChannelExtractKernel::extract_1C_from_2C_img;
-    }
-
-    Window win = calculate_max_window(*_input->info(), Steps(num_elems_processed_per_iteration));
-
-    AccessWindowHorizontal input_access(_input->info(), 0, num_elems_processed_per_iteration);
-    AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration);
-    update_window_and_padding(win, input_access, output_access);
-    output_access.set_valid_region(win, _input->info()->valid_region());
-
-    INEKernel::configure(win);
-}
-
-void NEChannelExtractKernel::run(const Window &window, const ThreadInfo &info)
-{
-    ARM_COMPUTE_UNUSED(info);
-    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
-    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INESimpleKernel::window(), window);
-    ARM_COMPUTE_ERROR_ON(_func == nullptr);
-
-    (this->*_func)(window);
-}
-
-void NEChannelExtractKernel::extract_1C_from_2C_img(const Window &win)
-{
-    Iterator in(_input, win);
-    Iterator out(_output, win);
-
-    execute_window_loop(win, [&](const Coordinates &)
-    {
-        const auto in_ptr  = static_cast<uint8_t *>(in.ptr());
-        const auto out_ptr = static_cast<uint8_t *>(out.ptr());
-        const auto pixels  = vld2q_u8(in_ptr);
-        vst1q_u8(out_ptr, pixels.val[_lut_index]);
-    },
-    in, out);
-}
-
-void NEChannelExtractKernel::extract_1C_from_3C_img(const Window &win)
-{
-    Iterator in(_input, win);
-    Iterator out(_output, win);
-
-    execute_window_loop(win, [&](const Coordinates &)
-    {
-        const auto in_ptr  = static_cast<uint8_t *>(in.ptr());
-        const auto out_ptr = static_cast<uint8_t *>(out.ptr());
-        const auto pixels  = vld3q_u8(in_ptr);
-        vst1q_u8(out_ptr, pixels.val[_lut_index]);
-    },
-    in, out);
-}
-
-void NEChannelExtractKernel::extract_1C_from_4C_img(const Window &win)
-{
-    Iterator in(_input, win);
-    Iterator out(_output, win);
-
-    execute_window_loop(win, [&](const Coordinates &)
-    {
-        const auto in_ptr  = static_cast<uint8_t *>(in.ptr());
-        const auto out_ptr = static_cast<uint8_t *>(out.ptr());
-        const auto pixels  = vld4q_u8(in_ptr);
-        vst1q_u8(out_ptr, pixels.val[_lut_index]);
-    },
-    in, out);
-}
-
-void NEChannelExtractKernel::extract_YUYV_uv(const Window &win)
-{
-    ARM_COMPUTE_ERROR_ON(win.x().step() % 2);
-
-    Window win_out(win);
-    win_out.set_dimension_step(Window::DimX, win.x().step() / 2);
-
-    Iterator in(_input, win);
-    Iterator out(_output, win_out);
-
-    execute_window_loop(win, [&](const Coordinates &)
-    {
-        const auto in_ptr  = static_cast<uint8_t *>(in.ptr());
-        const auto out_ptr = static_cast<uint8_t *>(out.ptr());
-        const auto pixels  = vld4q_u8(in_ptr);
-        vst1q_u8(out_ptr, pixels.val[_lut_index]);
-    },
-    in, out);
-}
-
-void NEChannelExtractKernel::copy_plane(const Window &win)
-{
-    Iterator in(_input, win);
-    Iterator out(_output, win);
-
-    execute_window_loop(win, [&](const Coordinates &)
-    {
-        const auto in_ptr  = static_cast<uint8_t *>(in.ptr());
-        const auto out_ptr = static_cast<uint8_t *>(out.ptr());
-        vst4_u8(out_ptr, vld4_u8(in_ptr));
-    },
-    in, out);
-}
diff --git a/src/core/NEON/kernels/NEChannelExtractKernel.h b/src/core/NEON/kernels/NEChannelExtractKernel.h
deleted file mode 100644
index 0b2847d79c..0000000000
--- a/src/core/NEON/kernels/NEChannelExtractKernel.h
+++ /dev/null
@@ -1,111 +0,0 @@
-/*
- * Copyright (c) 2016-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_NECHANNELEXTRACTKERNEL_H
-#define ARM_COMPUTE_NECHANNELEXTRACTKERNEL_H
-
-#include "arm_compute/core/Types.h"
-#include "src/core/NEON/INESimpleKernel.h"
-
-namespace arm_compute
-{
-class IMultiImage;
-class ITensor;
-using IImage = ITensor;
-
-/** Interface for the channel extract kernel */
-class NEChannelExtractKernel : public INESimpleKernel
-{
-public:
-    const char *name() const override
-    {
-        return "NEChannelExtractKernel";
-    }
-    /** Default constructor */
-    NEChannelExtractKernel();
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    NEChannelExtractKernel(const NEChannelExtractKernel &) = delete;
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    NEChannelExtractKernel &operator=(const NEChannelExtractKernel &) = delete;
-    /** Allow instances of this class to be moved */
-    NEChannelExtractKernel(NEChannelExtractKernel &&) = default;
-    /** Allow instances of this class to be moved */
-    NEChannelExtractKernel &operator=(NEChannelExtractKernel &&) = default;
-    /** Default destructor */
-    ~NEChannelExtractKernel() = default;
-
-    /** Set the input and output of the kernel
-     *
-     * @param[in]  input   Source tensor. Formats supported: RGB888/RGBA8888/YUYV422/UYVY422
-     * @param[in]  channel Channel to extract.
-     * @param[out] output  Destination tensor. Format supported: U8
-     */
-    void configure(const ITensor *input, Channel channel, ITensor *output);
-    /** Set the input and output of the kernel
-     *
-     * @param[in]  input   Multi-planar source image. Formats supported: NV12/NV21/IYUV/YUV444
-     * @param[in]  channel Channel to extract.
-     * @param[out] output  Single-planar destination image. Format supported: U8
-     */
-    void configure(const IMultiImage *input, Channel channel, IImage *output);
-
-    // Inherited methods overridden:
-    void run(const Window &window, const ThreadInfo &info) override;
-
-private:
-    /** Extract one channel from a two channel planar tensor.
-     *
-     * @param[in] win Region on which to execute the kernel.
-     */
-    void extract_1C_from_2C_img(const Window &win);
-    /** Extract one channel from a three channel planar tensor.
-     *
-     * @param[in] win Region on which to execute the kernel.
-     */
-    void extract_1C_from_3C_img(const Window &win);
-    /** Extract one channel from a four channel planar tensor.
-     *
-     * @param[in] win Region on which to execute the kernel.
-     */
-    void extract_1C_from_4C_img(const Window &win);
-    /** Extract U/V channel from a single planar YUVY/UYVY tensor.
-     *
-     * @param[in] win Region on which to execute the kernel.
-     */
-    void extract_YUYV_uv(const Window &win);
-    /** Copies a full plane to the output tensor.
-     *
-     * @param[in] win Region on which to execute the kernel.
-     */
-    void copy_plane(const Window &win);
-    /** Common signature for all the specialised ChannelExtract functions
-     *
-     * @param[in] window Region on which to execute the kernel.
-     */
-    using ChannelExtractFunction = void (NEChannelExtractKernel::*)(const Window &window);
-    /** ChannelExtract function to use for the particular tensor types passed to configure() */
-    ChannelExtractFunction _func;
-    unsigned int           _lut_index;
-};
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_NECHANNELEXTRACTKERNEL_H */
diff --git a/src/core/NEON/kernels/NEColorConvertKernel.cpp b/src/core/NEON/kernels/NEColorConvertKernel.cpp
deleted file mode 100644
index 23270d42d1..0000000000
--- a/src/core/NEON/kernels/NEColorConvertKernel.cpp
+++ /dev/null
@@ -1,590 +0,0 @@
-/*
- * Copyright (c) 2016-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "src/core/NEON/kernels/NEColorConvertKernel.h"
-
-#include "arm_compute/core/Error.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/IAccessWindow.h"
-#include "arm_compute/core/IMultiImage.h"
-#include "arm_compute/core/ITensor.h"
-#include "arm_compute/core/MultiImageInfo.h"
-#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Types.h"
-#include "arm_compute/core/Validate.h"
-#include "arm_compute/core/Window.h"
-#include "src/core/helpers/AutoConfiguration.h"
-#include "src/core/helpers/WindowHelpers.h"
-
-#include "src/core/NEON/kernels/detail/NEColorConvertHelper.inl"
-
-using namespace arm_compute;
-
-NEColorConvertKernel::NEColorConvertKernel()
-    : _input(nullptr), _output(nullptr), _func(nullptr)
-{
-}
-
-void NEColorConvertKernel::configure(const ITensor *input, ITensor *output)
-{
-    ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
-
-    set_shape_if_empty(*output->info(), input->info()->tensor_shape());
-
-    ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(input, output);
-
-    unsigned int num_elems_processed_per_iteration = 0;
-
-    switch(input->info()->format())
-    {
-        case Format::RGBA8888:
-        {
-            switch(output->info()->format())
-            {
-                case Format::RGB888:
-                    _func                             = colorconvert_rgbx_to_rgb;
-                    num_elems_processed_per_iteration = 16;
-                    break;
-                default:
-                    ARM_COMPUTE_ERROR("Not supported");
-                    break;
-            }
-            break;
-        }
-        case Format::UYVY422:
-        {
-            switch(output->info()->format())
-            {
-                case Format::RGB888:
-                    _func                             = colorconvert_yuyv_to_rgb<false, false>;
-                    num_elems_processed_per_iteration = 32;
-                    break;
-                case Format::RGBA8888:
-                    _func                             = colorconvert_yuyv_to_rgb<false, true>;
-                    num_elems_processed_per_iteration = 32;
-                    break;
-                default:
-                    ARM_COMPUTE_ERROR("Not supported");
-                    break;
-            }
-            break;
-        }
-        case Format::YUYV422:
-        {
-            switch(output->info()->format())
-            {
-                case Format::RGB888:
-                    _func                             = colorconvert_yuyv_to_rgb<true, false>;
-                    num_elems_processed_per_iteration = 32;
-                    break;
-                case Format::RGBA8888:
-                    _func                             = colorconvert_yuyv_to_rgb<true, true>;
-                    num_elems_processed_per_iteration = 32;
-                    break;
-                default:
-                    ARM_COMPUTE_ERROR("Not supported");
-                    break;
-            }
-            break;
-        }
-        case Format::RGB888:
-        {
-            switch(output->info()->format())
-            {
-                case Format::RGBA8888:
-                    _func                             = colorconvert_rgb_to_rgbx;
-                    num_elems_processed_per_iteration = 16;
-                    break;
-                case Format::U8:
-                    _func                             = colorconvert_rgb_to_u8;
-                    num_elems_processed_per_iteration = 16;
-                    break;
-                default:
-                    ARM_COMPUTE_ERROR("Not supported");
-                    break;
-            }
-            break;
-        }
-        default:
-            ARM_COMPUTE_ERROR("Not supported");
-            break;
-    }
-
-    _input  = input;
-    _output = output;
-
-    // Configure kernel window
-    Window                 win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration));
-    AccessWindowHorizontal input_access(input->info(), 0, num_elems_processed_per_iteration);
-    AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration);
-
-    update_window_and_padding(win, input_access, output_access);
-
-    output_access.set_valid_region(win, input->info()->valid_region());
-
-    INEKernel::configure(win);
-}
-
-void NEColorConvertKernel::configure(const IMultiImage *input, IImage *output)
-{
-    ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
-    ARM_COMPUTE_ERROR_ON_TENSOR_NOT_2D(output);
-
-    set_shape_if_empty(*output->info(), input->plane(0)->info()->tensor_shape());
-
-    ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(input->plane(0), output);
-
-    unsigned int num_elems_processed_per_iteration = 0;
-
-    switch(input->info()->format())
-    {
-        case Format::NV12:
-        {
-            switch(output->info()->format())
-            {
-                case Format::RGB888:
-                    _func                             = colorconvert_nv12_to_rgb<true, false>;
-                    num_elems_processed_per_iteration = 32;
-                    break;
-                case Format::RGBA8888:
-                    _func                             = colorconvert_nv12_to_rgb<true, true>;
-                    num_elems_processed_per_iteration = 32;
-                    break;
-                default:
-                    ARM_COMPUTE_ERROR("Not supported");
-                    break;
-            }
-            break;
-        }
-        case Format::NV21:
-        {
-            switch(output->info()->format())
-            {
-                case Format::RGB888:
-                    _func                             = colorconvert_nv12_to_rgb<false, false>;
-                    num_elems_processed_per_iteration = 32;
-                    break;
-                case Format::RGBA8888:
-                    _func                             = colorconvert_nv12_to_rgb<false, true>;
-                    num_elems_processed_per_iteration = 32;
-                    break;
-                default:
-                    ARM_COMPUTE_ERROR("Not supported");
-                    break;
-            }
-            break;
-        }
-        case Format::IYUV:
-        {
-            switch(output->info()->format())
-            {
-                case Format::RGB888:
-                    _func                             = colorconvert_iyuv_to_rgb<false>;
-                    num_elems_processed_per_iteration = 32;
-                    break;
-                case Format::RGBA8888:
-                    _func                             = colorconvert_iyuv_to_rgb<true>;
-                    num_elems_processed_per_iteration = 32;
-                    break;
-                default:
-                    ARM_COMPUTE_ERROR("Not supported");
-                    break;
-            }
-            break;
-        }
-        default:
-            ARM_COMPUTE_ERROR("Not supported");
-            break;
-    }
-
-    _input  = input;
-    _output = output;
-
-    // Configure kernel window
-    Window win = calculate_max_window(*output->info(), Steps(num_elems_processed_per_iteration));
-    win.set_dimension_step(Window::DimY, 2);
-
-    unsigned int input_plane_count = 3;
-
-    if(input->info()->format() == Format::NV12 || input->info()->format() == Format::NV21)
-    {
-        input_plane_count = 2;
-    }
-
-    AccessWindowHorizontal input0_access(input->plane(0)->info(), 0, num_elems_processed_per_iteration);
-    AccessWindowRectangle  input1_access(input->plane(1)->info(), 0, 0, num_elems_processed_per_iteration, 1, 0.5f, 0.5f);
-    AccessWindowRectangle  input2_access(input_plane_count == 2 ? nullptr : input->plane(2)->info(), 0, 0, num_elems_processed_per_iteration, 1, 0.5f, 0.5f);
-    AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration);
-
-    update_window_and_padding(win,
-                              input0_access, input1_access, input2_access,
-                              output_access);
-
-    ValidRegion intersect_region = intersect_valid_regions(input->plane(0)->info()->valid_region(),
-                                                           input->plane(1)->info()->valid_region());
-
-    if(input_plane_count == 3)
-    {
-        intersect_region = intersect_valid_regions(intersect_region, input->plane(2)->info()->valid_region());
-    }
-
-    output_access.set_valid_region(win, intersect_region);
-
-    INEKernel::configure(win);
-}
-
-void NEColorConvertKernel::configure(const IImage *input, IMultiImage *output)
-{
-    ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
-    ARM_COMPUTE_ERROR_ON_TENSOR_NOT_2D(input);
-
-    set_shape_if_empty(*output->plane(0)->info(), input->info()->tensor_shape());
-
-    switch(output->info()->format())
-    {
-        case Format::NV12:
-        {
-            TensorShape subsampled_shape = input->info()->tensor_shape();
-            subsampled_shape.set(0, subsampled_shape[0] / 2);
-            subsampled_shape.set(1, subsampled_shape[1] / 2);
-
-            set_shape_if_empty(*output->plane(1)->info(), subsampled_shape);
-
-            ARM_COMPUTE_ERROR_ON_MISMATCHING_DIMENSIONS(subsampled_shape, output->plane(1)->info()->tensor_shape());
-            break;
-        }
-        case Format::IYUV:
-        {
-            TensorShape subsampled_shape = input->info()->tensor_shape();
-            subsampled_shape.set(0, subsampled_shape[0] / 2);
-            subsampled_shape.set(1, subsampled_shape[1] / 2);
-
-            set_shape_if_empty(*output->plane(1)->info(), subsampled_shape);
-            set_shape_if_empty(*output->plane(2)->info(), subsampled_shape);
-
-            ARM_COMPUTE_ERROR_ON_MISMATCHING_DIMENSIONS(subsampled_shape, output->plane(1)->info()->tensor_shape());
-            ARM_COMPUTE_ERROR_ON_MISMATCHING_DIMENSIONS(subsampled_shape, output->plane(2)->info()->tensor_shape());
-            break;
-        }
-        case Format::YUV444:
-            set_shape_if_empty(*output->plane(1)->info(), input->info()->tensor_shape());
-            set_shape_if_empty(*output->plane(2)->info(), input->info()->tensor_shape());
-
-            ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(input, output->plane(1));
-            ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(input, output->plane(2));
-            break;
-        default:
-            ARM_COMPUTE_ERROR("Not supported");
-    }
-
-    ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(input, output->plane(0));
-
-    unsigned int num_elems_processed_per_iteration = 0;
-
-    switch(input->info()->format())
-    {
-        case Format::RGB888:
-        {
-            switch(output->info()->format())
-            {
-                case Format::NV12:
-                    _func                             = colorconvert_rgb_to_nv12<false>;
-                    num_elems_processed_per_iteration = 16;
-                    break;
-                case Format::IYUV:
-                    _func                             = colorconvert_rgb_to_iyuv<false>;
-                    num_elems_processed_per_iteration = 16;
-                    break;
-                case Format::YUV444:
-                    _func                             = colorconvert_rgb_to_yuv4<false>;
-                    num_elems_processed_per_iteration = 16;
-                    break;
-                default:
-                    ARM_COMPUTE_ERROR("Not supported");
-                    break;
-            }
-            break;
-        }
-        case Format::RGBA8888:
-        {
-            switch(output->info()->format())
-            {
-                case Format::NV12:
-                    _func                             = colorconvert_rgb_to_nv12<true>;
-                    num_elems_processed_per_iteration = 16;
-                    break;
-                case Format::IYUV:
-                    _func                             = colorconvert_rgb_to_iyuv<true>;
-                    num_elems_processed_per_iteration = 16;
-                    break;
-                case Format::YUV444:
-                    _func                             = colorconvert_rgb_to_yuv4<true>;
-                    num_elems_processed_per_iteration = 16;
-                    break;
-                default:
-                    ARM_COMPUTE_ERROR("Not supported");
-                    break;
-            }
-            break;
-        }
-        case Format::UYVY422:
-        {
-            switch(output->info()->format())
-            {
-                case Format::NV12:
-                    _func                             = colorconvert_yuyv_to_nv12<false>;
-                    num_elems_processed_per_iteration = 32;
-                    break;
-                case Format::IYUV:
-                    _func                             = colorconvert_yuyv_to_iyuv<false>;
-                    num_elems_processed_per_iteration = 32;
-                    break;
-                default:
-                    ARM_COMPUTE_ERROR("Not supported");
-                    break;
-            }
-            break;
-        }
-        case Format::YUYV422:
-        {
-            switch(output->info()->format())
-            {
-                case Format::NV12:
-                    _func                             = colorconvert_yuyv_to_nv12<true>;
-                    num_elems_processed_per_iteration = 32;
-                    break;
-                case Format::IYUV:
-                    _func                             = colorconvert_yuyv_to_iyuv<true>;
-                    num_elems_processed_per_iteration = 32;
-                    break;
-                default:
-                    ARM_COMPUTE_ERROR("Not supported");
-                    break;
-            }
-            break;
-        }
-        default:
-            ARM_COMPUTE_ERROR("Not supported");
-            break;
-    }
-
-    _input  = input;
-    _output = output;
-
-    // Configure kernel window
-    Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration));
-
-    float sub_sampling = 1.f;
-
-    if((input->info()->format() != Format::RGB888 || output->info()->format() != Format::YUV444) && (input->info()->format() != Format::RGBA8888 || output->info()->format() != Format::YUV444))
-    {
-        win.set_dimension_step(Window::DimY, 2);
-        sub_sampling = 0.5f;
-    }
-
-    unsigned int output_plane_count = 3;
-
-    if(output->info()->format() == Format::NV12 || output->info()->format() == Format::NV21)
-    {
-        output_plane_count = 2;
-    }
-
-    AccessWindowHorizontal output0_access(output->plane(0)->info(), 0, num_elems_processed_per_iteration);
-    AccessWindowRectangle  output1_access(output->plane(1)->info(), 0, 0, num_elems_processed_per_iteration, 1, sub_sampling, sub_sampling);
-    AccessWindowRectangle  output2_access(output_plane_count == 2 ? nullptr : output->plane(2)->info(), 0, 0, num_elems_processed_per_iteration, 1, sub_sampling, sub_sampling);
-
-    update_window_and_padding(win,
-                              AccessWindowHorizontal(input->info(), 0, num_elems_processed_per_iteration),
-                              output0_access,
-                              output1_access,
-                              output2_access);
-
-    output0_access.set_valid_region(win, input->info()->valid_region());
-    output1_access.set_valid_region(win, input->info()->valid_region());
-    output2_access.set_valid_region(win, input->info()->valid_region());
-
-    INEKernel::configure(win);
-}
-
-void NEColorConvertKernel::configure(const IMultiImage *input, IMultiImage *output)
-{
-    ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
-    ARM_COMPUTE_ERROR_ON(input == output);
-
-    set_shape_if_empty(*output->plane(0)->info(), input->plane(0)->info()->tensor_shape());
-
-    switch(output->info()->format())
-    {
-        case Format::NV12:
-        {
-            TensorShape subsampled_shape = input->plane(0)->info()->tensor_shape();
-            subsampled_shape.set(0, subsampled_shape[0] / 2);
-            subsampled_shape.set(1, subsampled_shape[1] / 2);
-
-            set_shape_if_empty(*output->plane(1)->info(), subsampled_shape);
-
-            ARM_COMPUTE_ERROR_ON_MISMATCHING_DIMENSIONS(subsampled_shape, output->plane(1)->info()->tensor_shape());
-            break;
-        }
-        case Format::IYUV:
-        {
-            TensorShape subsampled_shape = input->plane(0)->info()->tensor_shape();
-            subsampled_shape.set(0, subsampled_shape[0] / 2);
-            subsampled_shape.set(1, subsampled_shape[1] / 2);
-
-            set_shape_if_empty(*output->plane(1)->info(), subsampled_shape);
-            set_shape_if_empty(*output->plane(2)->info(), subsampled_shape);
-
-            ARM_COMPUTE_ERROR_ON_MISMATCHING_DIMENSIONS(subsampled_shape, output->plane(1)->info()->tensor_shape());
-            ARM_COMPUTE_ERROR_ON_MISMATCHING_DIMENSIONS(subsampled_shape, output->plane(2)->info()->tensor_shape());
-            break;
-        }
-        case Format::YUV444:
-            set_shape_if_empty(*output->plane(1)->info(), input->plane(0)->info()->tensor_shape());
-            set_shape_if_empty(*output->plane(2)->info(), input->plane(0)->info()->tensor_shape());
-
-            ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(input->plane(0), output->plane(1));
-            ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(input->plane(0), output->plane(2));
-            break;
-        default:
-            ARM_COMPUTE_ERROR("Not supported");
-    }
-
-    ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(input->plane(0), output->plane(0));
-
-    switch(input->info()->format())
-    {
-        case Format::NV12:
-        {
-            switch(output->info()->format())
-            {
-                case Format::IYUV:
-                    _func = colorconvert_nv12_to_iyuv<true>;
-                    break;
-                case Format::YUV444:
-                    _func = colorconvert_nv12_to_yuv4<true>;
-                    break;
-                default:
-                    ARM_COMPUTE_ERROR("Not supported");
-                    break;
-            }
-            break;
-        }
-        case Format::NV21:
-        {
-            switch(output->info()->format())
-            {
-                case Format::IYUV:
-                    _func = colorconvert_nv12_to_iyuv<false>;
-                    break;
-                case Format::YUV444:
-                    _func = colorconvert_nv12_to_yuv4<false>;
-                    break;
-                default:
-                    ARM_COMPUTE_ERROR("Not supported");
-                    break;
-            }
-            break;
-        }
-        case Format::IYUV:
-        {
-            switch(output->info()->format())
-            {
-                case Format::NV12:
-                    _func = colorconvert_iyuv_to_nv12;
-                    break;
-                case Format::YUV444:
-                    _func = colorconvert_iyuv_to_yuv4;
-                    break;
-                default:
-                    ARM_COMPUTE_ERROR("Not supported");
-                    break;
-            }
-            break;
-        }
-        default:
-            ARM_COMPUTE_ERROR("Not supported");
-            break;
-    }
-
-    _input  = input;
-    _output = output;
-
-    constexpr unsigned int num_elems_processed_per_iteration = 32;
-    constexpr float        input_sub_sampling                = 0.5f;
-    const float            output_sub_sampling               = output->info()->format() == Format::YUV444 ? 1.f : 0.5f;
-
-    // Configure kernel window
-    Window win = calculate_max_window(*input->plane(0)->info(), Steps(num_elems_processed_per_iteration));
-    win.set_dimension_step(Window::DimY, 2);
-
-    unsigned int input_plane_count = 3;
-
-    if(input->info()->format() == Format::NV12 || input->info()->format() == Format::NV21)
-    {
-        input_plane_count = 2;
-    }
-
-    unsigned int output_plane_count = 3;
-
-    if(output->info()->format() == Format::NV12 || output->info()->format() == Format::NV21)
-    {
-        output_plane_count = 2;
-    }
-
-    AccessWindowHorizontal output0_access(output->plane(0)->info(), 0, num_elems_processed_per_iteration);
-    AccessWindowRectangle  output1_access(output->plane(1)->info(), 0, 0, num_elems_processed_per_iteration, 1, output_sub_sampling, output_sub_sampling);
-    AccessWindowRectangle  output2_access(output_plane_count == 2 ? nullptr : output->plane(2)->info(), 0, 0, num_elems_processed_per_iteration, 1, output_sub_sampling, output_sub_sampling);
-
-    update_window_and_padding(win,
-                              AccessWindowHorizontal(input->plane(0)->info(), 0, num_elems_processed_per_iteration),
-                              AccessWindowRectangle(input->plane(1)->info(), 0, 0, num_elems_processed_per_iteration, 1, input_sub_sampling, input_sub_sampling),
-                              AccessWindowRectangle(input_plane_count == 2 ? nullptr : input->plane(2)->info(), 0, 0, num_elems_processed_per_iteration, 1, input_sub_sampling, input_sub_sampling),
-                              output0_access,
-                              output1_access,
-                              output2_access);
-
-    ValidRegion intersect_region = intersect_valid_regions(input->plane(0)->info()->valid_region(),
-                                                           input->plane(1)->info()->valid_region());
-
-    if(input_plane_count == 3)
-    {
-        intersect_region = intersect_valid_regions(intersect_region, input->plane(2)->info()->valid_region());
-    }
-
-    output0_access.set_valid_region(win, intersect_region);
-    output1_access.set_valid_region(win, intersect_region);
-    output2_access.set_valid_region(win, intersect_region);
-
-    INEKernel::configure(win);
-}
-
-void NEColorConvertKernel::run(const Window &window, const ThreadInfo &info)
-{
-    ARM_COMPUTE_UNUSED(info);
-    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
-    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
-    ARM_COMPUTE_ERROR_ON(_func == nullptr);
-
-    (*_func)(_input, _output, window);
-}
diff --git a/src/core/NEON/kernels/NEColorConvertKernel.h b/src/core/NEON/kernels/NEColorConvertKernel.h
deleted file mode 100644
index 1adb624aae..0000000000
--- a/src/core/NEON/kernels/NEColorConvertKernel.h
+++ /dev/null
@@ -1,93 +0,0 @@
-/*
- * Copyright (c) 2016-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_COLORCONVERTKERNEL_H
-#define ARM_COMPUTE_COLORCONVERTKERNEL_H
-
-#include "src/core/NEON/INEKernel.h"
-
-namespace arm_compute
-{
-class IMultiImage;
-class ITensor;
-using IImage = ITensor;
-
-/** Interface for the color convert kernel */
-class NEColorConvertKernel : public INEKernel
-{
-public:
-    const char *name() const override
-    {
-        return "NEColorConvertKernel";
-    }
-    /** Default constructor */
-    NEColorConvertKernel();
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    NEColorConvertKernel(const NEColorConvertKernel &) = delete;
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    NEColorConvertKernel &operator=(const NEColorConvertKernel &) = delete;
-    /** Allow instances of this class to be moved */
-    NEColorConvertKernel(NEColorConvertKernel &&) = default;
-    /** Allow instances of this class to be moved */
-    NEColorConvertKernel &operator=(NEColorConvertKernel &&) = default;
-    /** Default destructor */
-    ~NEColorConvertKernel() = default;
-
-    /** Set the input and output of the kernel
-     *
-     * @param[in]  input  Source tensor. Formats supported: RGBA8888/UYVY422/YUYV422/RGB888
-     * @param[out] output Destination tensor. Formats supported: RGB888 (if the formats of @p input are RGBA8888/UYVY422/YUYV422),
-     *                                                          RGBA8888 (if the formats of @p input are UYVY422/YUYV422/RGB888/),
-     *                                                          U8 (if the formats of @p input is RGB888)
-     */
-    void configure(const ITensor *input, ITensor *output);
-    /** Set the input and output of the kernel
-     *
-     * @param[in]  input  Multi-planar source image. Formats supported: NV12/NV21/IYUV
-     * @param[out] output Single-planar destination image. Formats supported: RGB888/RGBA8888
-     */
-    void configure(const IMultiImage *input, IImage *output);
-    /** Set the input and output of the kernel
-     *
-     * @param[in]  input  Single-planar source image. Formats supported: RGB888/RGBA8888/UYVY422/YUYV422
-     * @param[out] output Multi-planar destination image. Formats supported: NV12/IYUV/YUV444 (if the formats of @p input are RGB888/RGB8888)
-     */
-    void configure(const IImage *input, IMultiImage *output);
-    /** Set the input and output of the kernel
-     *
-     * @param[in]  input  Multi-planar source image. Formats supported: NV12/NV21/IYUV
-     * @param[out] output Multi-planar destination image. Formats supported: YUV444/IYUV (if the formats of @p input are NV12/NV21)/NV12 (if the format of  @p input is IYUV)
-     */
-    void configure(const IMultiImage *input, IMultiImage *output);
-
-    // Inherited methods overridden:
-    void run(const Window &window, const ThreadInfo &info) override;
-
-private:
-    using ColorConvertFunction = void(const void *__restrict input_ptr, void *__restrict output_ptr, const Window &win);
-    const void           *_input;
-    void                 *_output;
-    ColorConvertFunction *_func;
-};
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_NECOLORCONVERTKERNEL_H */
diff --git a/src/core/NEON/kernels/NEDerivativeKernel.cpp b/src/core/NEON/kernels/NEDerivativeKernel.cpp
deleted file mode 100644
index e5780ea264..0000000000
--- a/src/core/NEON/kernels/NEDerivativeKernel.cpp
+++ /dev/null
@@ -1,231 +0,0 @@
-/*
- * Copyright (c) 2016-2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "src/core/NEON/kernels/NEDerivativeKernel.h"
-
-#include "arm_compute/core/Error.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/ITensor.h"
-#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Types.h"
-#include "arm_compute/core/Validate.h"
-#include "src/core/helpers/AutoConfiguration.h"
-#include "src/core/helpers/WindowHelpers.h"
-
-#include <arm_neon.h>
-#include <cstddef>
-#include <cstdint>
-
-using namespace arm_compute;
-
-namespace arm_compute
-{
-class Coordinates;
-} // namespace arm_compute
-
-NEDerivativeKernel::NEDerivativeKernel()
-    : _func(nullptr), _input(nullptr), _output_x(nullptr), _output_y(nullptr)
-{
-}
-
-BorderSize NEDerivativeKernel::border_size() const
-{
-    return BorderSize(1);
-}
-
-void NEDerivativeKernel::configure(const ITensor *input, ITensor *output_x, ITensor *output_y, bool border_undefined)
-{
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
-    ARM_COMPUTE_ERROR_ON((output_x == nullptr) && (output_y == nullptr));
-
-    const bool run_der_x = output_x != nullptr;
-    const bool run_der_y = output_y != nullptr;
-
-    if(run_der_x)
-    {
-        ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output_x, 1, DataType::S16);
-    }
-
-    if(run_der_y)
-    {
-        ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output_y, 1, DataType::S16);
-    }
-
-    _input    = input;
-    _output_x = output_x;
-    _output_y = output_y;
-
-    constexpr unsigned int num_elems_processed_per_iteration = 16;
-    constexpr unsigned int num_rows_read_per_iteration       = 3;
-
-    Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration), border_undefined, border_size());
-
-    AccessWindowHorizontal out_x_access(output_x == nullptr ? nullptr : output_x->info(), 0, num_elems_processed_per_iteration);
-    AccessWindowHorizontal out_y_access(output_y == nullptr ? nullptr : output_y->info(), 0, num_elems_processed_per_iteration);
-
-    // TODO(COMPMID-1503) Fix x-access input bug in Neon kernel instead of '+2'
-    AccessWindowHorizontal in_x_access(input->info(), -border_size().left, num_elems_processed_per_iteration + 2);
-    AccessWindowRectangle  in_y_access(input->info(), 0, -border_size().left, num_elems_processed_per_iteration, num_rows_read_per_iteration);
-
-    // TODO(COMPMID-1503) Fix x-access input bug in Neon kernel instead of '+2'
-    AccessWindowRectangle in_xy_access(input->info(), -border_size().left, -border_size().top, num_elems_processed_per_iteration + 2, num_rows_read_per_iteration);
-
-    if(run_der_x && run_der_y)
-    {
-        _func = &NEDerivativeKernel::derivative_xy;
-        update_window_and_padding(win, in_xy_access, out_x_access, out_y_access);
-        out_y_access.set_valid_region(win, input->info()->valid_region(), border_undefined, border_size());
-        out_x_access.set_valid_region(win, input->info()->valid_region(), border_undefined, border_size());
-    }
-    else
-    {
-        if(run_der_x)
-        {
-            _func = &NEDerivativeKernel::derivative_x;
-            update_window_and_padding(win, in_x_access, out_x_access);
-            out_x_access.set_valid_region(win, input->info()->valid_region(), border_undefined, border_size());
-        }
-        else if(run_der_y)
-        {
-            _func = &NEDerivativeKernel::derivative_y;
-            update_window_and_padding(win, in_y_access, out_y_access);
-            out_y_access.set_valid_region(win, input->info()->valid_region(), border_undefined, border_size());
-        }
-        else
-        {
-            ARM_COMPUTE_ERROR("At least one output must be NOT NULL");
-        }
-    }
-
-    INEKernel::configure(win);
-}
-
-void NEDerivativeKernel::derivative_x(const Window &window)
-{
-    Iterator in(_input, window);
-    Iterator out_x(_output_x, window);
-
-    /* Apply 1-D centered point discrete derivative mask ([-1 0 1]) along the X direction */
-    execute_window_loop(window, [&](const Coordinates &)
-    {
-        /* Load left and right data */
-        const uint8x16_t l_data = vld1q_u8(in.ptr() - 1);
-        const uint8x16_t r_data = vld1q_u8(in.ptr() + 1);
-
-        /* Cast to int16 and perform the subtraction between the right and left data */
-        const int16x8_t out0 = vsubq_s16(vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(r_data))),
-                                         vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(l_data))));
-
-        /* Cast to int16 and perform the subtraction between the right and left data */
-        const int16x8_t out1 = vsubq_s16(vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(r_data))),
-                                         vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(l_data))));
-
-        /* Store result of derivative along the X direction */
-        vst1q_s16(reinterpret_cast<int16_t *>(out_x.ptr()), out0);
-        vst1q_s16(reinterpret_cast<int16_t *>(out_x.ptr()) + 8, out1);
-    },
-    in, out_x);
-}
-
-void NEDerivativeKernel::derivative_y(const Window &window)
-{
-    Iterator in(_input, window);
-    Iterator out_y(_output_y, window);
-
-    const size_t stride = _input->info()->strides_in_bytes()[1];
-
-    /* Apply 1-D centered point discrete derivative mask ([-1 0 1]^T) along the Y direction */
-    execute_window_loop(window, [&](const Coordinates &)
-    {
-        /* Load top and bottom data */
-        const uint8x16_t t_data = vld1q_u8(in.ptr() - stride);
-        const uint8x16_t b_data = vld1q_u8(in.ptr() + stride);
-
-        /* Cast to int16 and perform the subtraction between the bottom and top data */
-        const int16x8_t out0 = vsubq_s16(vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(b_data))),
-                                         vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(t_data))));
-
-        /* Cast to int16 and perform the subtraction between the bottom and top data */
-        const int16x8_t out1 = vsubq_s16(vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(b_data))),
-                                         vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(t_data))));
-
-        /* Store result of derivative along the Y direction */
-        vst1q_s16(reinterpret_cast<int16_t *>(out_y.ptr()), out0);
-        vst1q_s16(reinterpret_cast<int16_t *>(out_y.ptr()) + 8, out1);
-    },
-    in, out_y);
-}
-
-void NEDerivativeKernel::derivative_xy(const Window &window)
-{
-    Iterator in(_input, window);
-    Iterator out_x(_output_x, window);
-    Iterator out_y(_output_y, window);
-
-    const size_t stride = _input->info()->strides_in_bytes()[1];
-
-    /* Apply 1-D centered point discrete derivative masks ([-1 0 1] and [-1 0 1]^T) along the X and Y directions */
-    execute_window_loop(window, [&](const Coordinates &)
-    {
-        /* Load top, bottom, left and right data */
-        const uint8x16_t t_data = vld1q_u8(in.ptr() - stride);
-        const uint8x16_t b_data = vld1q_u8(in.ptr() + stride);
-        const uint8x16_t l_data = vld1q_u8(in.ptr() - 1);
-        const uint8x16_t r_data = vld1q_u8(in.ptr() + 1);
-
-        /* Cast to int16 and perform the subtraction between the bottom and top data */
-        const int16x8_t out0 = vsubq_s16(vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(b_data))),
-                                         vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(t_data))));
-
-        /* Cast to int16 and perform the subtraction between the bottom and top data */
-        const int16x8_t out1 = vsubq_s16(vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(b_data))),
-                                         vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(t_data))));
-
-        /* Cast to int16 and perform the subtraction between the right and left data */
-        const int16x8_t out2 = vsubq_s16(vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(r_data))),
-                                         vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(l_data))));
-
-        /* Cast to int16 and perform the subtraction between the right and left data */
-        const int16x8_t out3 = vsubq_s16(vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(r_data))),
-                                         vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(l_data))));
-
-        /* Store result of derivative along the Y direction */
-        vst1q_s16(reinterpret_cast<int16_t *>(out_y.ptr()), out0);
-        vst1q_s16(reinterpret_cast<int16_t *>(out_y.ptr()) + 8, out1);
-
-        /* Store result of derivative along the X direction */
-        vst1q_s16(reinterpret_cast<int16_t *>(out_x.ptr()), out2);
-        vst1q_s16(reinterpret_cast<int16_t *>(out_x.ptr()) + 8, out3);
-    },
-    in, out_x, out_y);
-}
-
-void NEDerivativeKernel::run(const Window &window, const ThreadInfo &info)
-{
-    ARM_COMPUTE_UNUSED(info);
-    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
-    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
-    ARM_COMPUTE_ERROR_ON(_func == nullptr);
-
-    (this->*_func)(window);
-}
diff --git a/src/core/NEON/kernels/NEDerivativeKernel.h b/src/core/NEON/kernels/NEDerivativeKernel.h
deleted file mode 100644
index 112b2b0b28..0000000000
--- a/src/core/NEON/kernels/NEDerivativeKernel.h
+++ /dev/null
@@ -1,100 +0,0 @@
-/*
- * Copyright (c) 2016-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_NEDERIVATIVEKERNEL_H
-#define ARM_COMPUTE_NEDERIVATIVEKERNEL_H
-
-#include "src/core/NEON/INEKernel.h"
-
-namespace arm_compute
-{
-class ITensor;
-
-/** Interface for the kernel to run the derivative along the X/Y directions on a tensor.
- *
- */
-class NEDerivativeKernel : public INEKernel
-{
-public:
-    const char *name() const override
-    {
-        return "NEDerivativeKernel";
-    }
-    /** Default constructor */
-    NEDerivativeKernel();
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    NEDerivativeKernel(const NEDerivativeKernel &) = delete;
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    NEDerivativeKernel &operator=(const NEDerivativeKernel &) = delete;
-    /** Allow instances of this class to be moved */
-    NEDerivativeKernel(NEDerivativeKernel &&) = default;
-    /** Allow instances of this class to be moved */
-    NEDerivativeKernel &operator=(NEDerivativeKernel &&) = default;
-    /** Default destructor */
-    ~NEDerivativeKernel() = default;
-    /** Initialise the kernel's sources, destination and border
-     *
-     * @note At least one of output_x or output_y must be set
-     *
-     * @param[in]  input            Source tensor. Data type supported: U8.
-     * @param[out] output_x         (Optional) Destination tensor for the X gradient. Data type supported: S16.
-     * @param[out] output_y         (Optional) Destination tensor for the Y gradient. Data type supported: S16.
-     * @param[in]  border_undefined True if the border mode is undefined. False if it's replicate or constant.
-     */
-    void configure(const ITensor *input, ITensor *output_x, ITensor *output_y, bool border_undefined);
-
-    // Inherited methods overridden:
-    void run(const Window &window, const ThreadInfo &info) override;
-    BorderSize border_size() const override;
-
-private:
-    /** Function to perform derivative along the X direction on the given window
-     *
-     * @param[in] window Region on which to execute the kernel
-     */
-    void derivative_x(const Window &window);
-    /** Function to perform derivative along the Y direction on the given window
-     *
-     * @param[in] window Region on which to execute the kernel
-     */
-    void derivative_y(const Window &window);
-    /** Function to perform derivative along the X and Y direction on the given window
-     *
-     * @param[in] window Region on which to execute the kernel
-     */
-    void derivative_xy(const Window &window);
-    /** Common signature for all the specialised derivative functions
-     *
-     * @param[in] window Region on which to execute the kernel.
-     */
-    using DerivativeFunction = void (NEDerivativeKernel::*)(const Window &window);
-    /** Derivative function to use for the particular tensor types passed to configure() */
-    DerivativeFunction _func;
-
-private:
-    const ITensor *_input;    /**< Input tensor */
-    ITensor       *_output_x; /**< Output tensor - Derivate along the X direction */
-    ITensor       *_output_y; /**< Output tensor - Derivate along the Y direction */
-};
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_NEDERIVATIVEKERNEL_H */
diff --git a/src/core/NEON/kernels/NEDilateKernel.cpp b/src/core/NEON/kernels/NEDilateKernel.cpp
deleted file mode 100644
index dc9ec22c71..0000000000
--- a/src/core/NEON/kernels/NEDilateKernel.cpp
+++ /dev/null
@@ -1,128 +0,0 @@
-/*
- * Copyright (c) 2016-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "src/core/NEON/kernels/NEDilateKernel.h"
-
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/ITensor.h"
-#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Validate.h"
-#include "src/core/NEON/INEKernel.h"
-#include "src/core/helpers/AutoConfiguration.h"
-#include "src/core/helpers/WindowHelpers.h"
-
-#include <arm_neon.h>
-
-namespace arm_compute
-{
-BorderSize NEDilateKernel::border_size() const
-{
-    return BorderSize(1);
-}
-
-void NEDilateKernel::configure(const ITensor *input, ITensor *output, bool border_undefined)
-{
-    ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
-    ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
-
-    _input  = input;
-    _output = output;
-
-    constexpr unsigned int num_elems_processed_per_iteration = 8;
-    constexpr unsigned int num_elems_read_per_iteration      = 16;
-    constexpr unsigned int num_elems_written_per_iteration   = 8;
-    constexpr unsigned int num_rows_read_per_iteration       = 3;
-
-    // Configure kernel window
-    Window                 win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration), border_undefined, border_size());
-    AccessWindowHorizontal output_access(output->info(), 0, num_elems_written_per_iteration);
-    AccessWindowRectangle  input_access(input->info(), -border_size().left, -border_size().top, num_elems_read_per_iteration, num_rows_read_per_iteration);
-
-    update_window_and_padding(win, input_access, output_access);
-
-    output_access.set_valid_region(win, input->info()->valid_region(), border_undefined, border_size());
-
-    INEKernel::configure(win);
-}
-
-void NEDilateKernel::run(const Window &window, const ThreadInfo &info)
-{
-    ARM_COMPUTE_UNUSED(info);
-    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
-    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INESimpleKernel::window(), window);
-
-    Iterator in(_input, window);
-    Iterator out(_output, window);
-
-    const size_t in_stride = _input->info()->strides_in_bytes()[1];
-
-    execute_window_loop(window, [&](const Coordinates &)
-    {
-        uint8_t         *in_ptr   = in.ptr() - 1;
-        const uint8x16_t top_data = vld1q_u8(in_ptr - in_stride);
-        const uint8x16_t mid_data = vld1q_u8(in_ptr);
-        const uint8x16_t bot_data = vld1q_u8(in_ptr + in_stride);
-
-        uint8x8_t top_high_data = vget_high_u8(top_data);
-        uint8x8_t top_low_data  = vget_low_u8(top_data);
-
-        uint8x8_t mid_high_data = vget_high_u8(mid_data);
-        uint8x8_t mid_low_data  = vget_low_u8(mid_data);
-
-        uint8x8_t bot_high_data = vget_high_u8(bot_data);
-        uint8x8_t bot_low_data  = vget_low_u8(bot_data);
-
-        uint8x8_t p0;
-        uint8x8_t p1;
-
-        p0 = top_low_data;
-        p1 = vext_u8(top_low_data, top_high_data, 1);
-        p0 = vmax_u8(p0, p1);
-
-        p1 = vext_u8(top_low_data, top_high_data, 2);
-        p0 = vmax_u8(p0, p1);
-
-        p1 = mid_low_data;
-        p0 = vmax_u8(p0, p1);
-
-        p1 = vext_u8(mid_low_data, mid_high_data, 1);
-        p0 = vmax_u8(p0, p1);
-
-        p1 = vext_u8(mid_low_data, mid_high_data, 2);
-        p0 = vmax_u8(p0, p1);
-
-        p1 = bot_low_data;
-        p0 = vmax_u8(p0, p1);
-
-        p1 = vext_u8(bot_low_data, bot_high_data, 1);
-        p0 = vmax_u8(p0, p1);
-
-        p1 = vext_u8(bot_low_data, bot_high_data, 2);
-        p0 = vmax_u8(p0, p1);
-
-        vst1_u8(out.ptr(), p0);
-    },
-    in, out);
-}
-} // namespace arm_compute
diff --git a/src/core/NEON/kernels/NEDilateKernel.h b/src/core/NEON/kernels/NEDilateKernel.h
deleted file mode 100644
index f1d34318ed..0000000000
--- a/src/core/NEON/kernels/NEDilateKernel.h
+++ /dev/null
@@ -1,65 +0,0 @@
-/*
- * Copyright (c) 2016-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_NEDILATEKERNEL_H
-#define ARM_COMPUTE_NEDILATEKERNEL_H
-
-#include "src/core/NEON/INESimpleKernel.h"
-
-namespace arm_compute
-{
-class ITensor;
-
-/** Interface for the kernel to perform boolean image dilatation */
-class NEDilateKernel : public INESimpleKernel
-{
-public:
-    const char *name() const override
-    {
-        return "NEDilateKernel";
-    }
-    /** Default constructor */
-    NEDilateKernel() = default;
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    NEDilateKernel(const NEDilateKernel &) = delete;
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    NEDilateKernel &operator=(const NEDilateKernel &) = delete;
-    /** Allow instances of this class to be moved */
-    NEDilateKernel(NEDilateKernel &&) = default;
-    /** Allow instances of this class to be moved */
-    NEDilateKernel &operator=(NEDilateKernel &&) = default;
-    /** Default destructor */
-    ~NEDilateKernel() = default;
-    /** Set the source, destination and border mode of the kernel
-     *
-     * @param[in]  input            Source tensor. Data type supported: U8
-     * @param[out] output           Destination tensor. Data type supported: U8
-     * @param[in]  border_undefined True if the border mode is undefined. False if it's replicate or constant.
-     */
-    void configure(const ITensor *input, ITensor *output, bool border_undefined);
-    // Inherited methods overridden:
-    void run(const Window &window, const ThreadInfo &info) override;
-    BorderSize border_size() const override;
-};
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_NEDILATEKERNEL_H */
diff --git a/src/core/NEON/kernels/NEErodeKernel.cpp b/src/core/NEON/kernels/NEErodeKernel.cpp
deleted file mode 100644
index 171a6c828f..0000000000
--- a/src/core/NEON/kernels/NEErodeKernel.cpp
+++ /dev/null
@@ -1,128 +0,0 @@
-/*
- * Copyright (c) 2016-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "src/core/NEON/kernels/NEErodeKernel.h"
-
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/ITensor.h"
-#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Validate.h"
-#include "src/core/NEON/INEKernel.h"
-#include "src/core/helpers/AutoConfiguration.h"
-#include "src/core/helpers/WindowHelpers.h"
-
-#include <arm_neon.h>
-
-namespace arm_compute
-{
-BorderSize NEErodeKernel::border_size() const
-{
-    return BorderSize(1);
-}
-
-void NEErodeKernel::configure(const ITensor *input, ITensor *output, bool border_undefined)
-{
-    ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
-    ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
-
-    _input  = input;
-    _output = output;
-
-    constexpr unsigned int num_elems_processed_per_iteration = 8;
-    constexpr unsigned int num_elems_read_per_iteration      = 16;
-    constexpr unsigned int num_elems_written_per_iteration   = 8;
-    constexpr unsigned int num_rows_read_per_iteration       = 3;
-
-    // Configure kernel window
-    Window                 win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration), border_undefined, border_size());
-    AccessWindowHorizontal output_access(output->info(), 0, num_elems_written_per_iteration);
-    AccessWindowRectangle  input_access(input->info(), -border_size().left, -border_size().top, num_elems_read_per_iteration, num_rows_read_per_iteration);
-
-    update_window_and_padding(win, input_access, output_access);
-
-    output_access.set_valid_region(win, input->info()->valid_region(), border_undefined, border_size());
-
-    INEKernel::configure(win);
-}
-
-void NEErodeKernel::run(const Window &window, const ThreadInfo &info)
-{
-    ARM_COMPUTE_UNUSED(info);
-    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
-    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INESimpleKernel::window(), window);
-
-    Iterator in(_input, window);
-    Iterator out(_output, window);
-
-    const size_t in_stride = _input->info()->strides_in_bytes()[1];
-
-    execute_window_loop(window, [&](const Coordinates &)
-    {
-        uint8_t         *in_ptr   = in.ptr() - 1;
-        const uint8x16_t top_data = vld1q_u8(in_ptr - in_stride);
-        const uint8x16_t mid_data = vld1q_u8(in_ptr);
-        const uint8x16_t bot_data = vld1q_u8(in_ptr + in_stride);
-
-        uint8x8_t top_high_data = vget_high_u8(top_data);
-        uint8x8_t top_low_data  = vget_low_u8(top_data);
-
-        uint8x8_t mid_high_data = vget_high_u8(mid_data);
-        uint8x8_t mid_low_data  = vget_low_u8(mid_data);
-
-        uint8x8_t bot_high_data = vget_high_u8(bot_data);
-        uint8x8_t bot_low_data  = vget_low_u8(bot_data);
-
-        uint8x8_t p0;
-        uint8x8_t p1;
-
-        p0 = top_low_data;
-        p1 = vext_u8(top_low_data, top_high_data, 1);
-        p0 = vmin_u8(p0, p1);
-
-        p1 = vext_u8(top_low_data, top_high_data, 2);
-        p0 = vmin_u8(p0, p1);
-
-        p1 = mid_low_data;
-        p0 = vmin_u8(p0, p1);
-
-        p1 = vext_u8(mid_low_data, mid_high_data, 1);
-        p0 = vmin_u8(p0, p1);
-
-        p1 = vext_u8(mid_low_data, mid_high_data, 2);
-        p0 = vmin_u8(p0, p1);
-
-        p1 = bot_low_data;
-        p0 = vmin_u8(p0, p1);
-
-        p1 = vext_u8(bot_low_data, bot_high_data, 1);
-        p0 = vmin_u8(p0, p1);
-
-        p1 = vext_u8(bot_low_data, bot_high_data, 2);
-        p0 = vmin_u8(p0, p1);
-
-        vst1_u8(out.ptr(), p0);
-    },
-    in, out);
-}
-} // namespace arm_compute
diff --git a/src/core/NEON/kernels/NEErodeKernel.h b/src/core/NEON/kernels/NEErodeKernel.h
deleted file mode 100644
index 54f286780b..0000000000
--- a/src/core/NEON/kernels/NEErodeKernel.h
+++ /dev/null
@@ -1,65 +0,0 @@
-/*
- * Copyright (c) 2016-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_NEERODEKERNEL_H
-#define ARM_COMPUTE_NEERODEKERNEL_H
-
-#include "src/core/NEON/INESimpleKernel.h"
-
-namespace arm_compute
-{
-class ITensor;
-
-/** Interface for the kernel to perform boolean image erosion */
-class NEErodeKernel : public INESimpleKernel
-{
-public:
-    const char *name() const override
-    {
-        return "NEErodeKernel";
-    }
-    /** Default constructor */
-    NEErodeKernel() = default;
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    NEErodeKernel(const NEErodeKernel &) = delete;
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    NEErodeKernel &operator=(const NEErodeKernel &) = delete;
-    /** Allow instances of this class to be moved */
-    NEErodeKernel(NEErodeKernel &&) = default;
-    /** Allow instances of this class to be moved */
-    NEErodeKernel &operator=(NEErodeKernel &&) = default;
-    /** Default destructor */
-    ~NEErodeKernel() = default;
-    /** Set the source, destination and border mode of the kernel
-     *
-     * @param[in]  input            Source tensor. Data type supported: U8
-     * @param[out] output           Destination tensor. Data type supported: U8
-     * @param[in]  border_undefined True if the border mode is undefined. False if it's replicate or constant.
-     */
-    void configure(const ITensor *input, ITensor *output, bool border_undefined);
-    // Inherited methods overridden:
-    void run(const Window &window, const ThreadInfo &info) override;
-    BorderSize border_size() const override;
-};
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_NEERODEKERNEL_H */
diff --git a/src/core/NEON/kernels/NEFastCornersKernel.cpp b/src/core/NEON/kernels/NEFastCornersKernel.cpp
deleted file mode 100644
index c9280d8dc0..0000000000
--- a/src/core/NEON/kernels/NEFastCornersKernel.cpp
+++ /dev/null
@@ -1,475 +0,0 @@
-/*
- * Copyright (c) 2016-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "src/core/NEON/kernels/NEFastCornersKernel.h"
-
-#include "arm_compute/core/Coordinates.h"
-#include "arm_compute/core/Error.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/Validate.h"
-#include "src/core/helpers/AutoConfiguration.h"
-#include "src/core/helpers/WindowHelpers.h"
-
-#include <algorithm>
-#include <arm_neon.h>
-#include <cstddef>
-#include <limits>
-
-using namespace arm_compute;
-
-NEFastCornersKernel::NEFastCornersKernel()
-    : INEKernel(), _input(nullptr), _output(nullptr), _threshold(0), _non_max_suppression(false)
-{
-}
-
-namespace
-{
-constexpr size_t PERMUTATIONS = 16;
-constexpr size_t PERM_SIZE    = 16;
-
-inline uint8x8x2_t create_permutation_index(size_t k)
-{
-    ARM_COMPUTE_ERROR_ON(k >= PERMUTATIONS);
-
-    static const std::array<std::array<uint8_t, PERMUTATIONS>, PERM_SIZE> permutations_table{ { { 0, 1, 2, 3, 4, 5, 6, 7, 8, 255, 255, 255, 255, 255, 255, 255 },
-            { 15, 0, 1, 2, 3, 4, 5, 6, 7, 255, 255, 255, 255, 255, 255, 255 },
-            { 14, 15, 0, 1, 2, 3, 4, 5, 6, 255, 255, 255, 255, 255, 255, 255 },
-            { 13, 14, 15, 0, 1, 2, 3, 4, 5, 255, 255, 255, 255, 255, 255, 255 },
-            { 12, 13, 14, 15, 0, 1, 2, 3, 4, 255, 255, 255, 255, 255, 255, 255 },
-            { 11, 12, 13, 14, 15, 0, 1, 2, 3, 255, 255, 255, 255, 255, 255, 255 },
-            { 10, 11, 12, 13, 14, 15, 0, 1, 2, 255, 255, 255, 255, 255, 255, 255 },
-            { 9, 10, 11, 12, 13, 14, 15, 0, 1, 255, 255, 255, 255, 255, 255, 255 },
-            { 8, 9, 10, 11, 12, 13, 14, 15, 0, 255, 255, 255, 255, 255, 255, 255 },
-            { 7, 8, 9, 10, 11, 12, 13, 14, 15, 255, 255, 255, 255, 255, 255, 255 },
-            { 6, 7, 8, 9, 10, 11, 12, 13, 14, 255, 255, 255, 255, 255, 255, 255 },
-            { 5, 6, 7, 8, 9, 10, 11, 12, 13, 255, 255, 255, 255, 255, 255, 255 },
-            { 4, 5, 6, 7, 8, 9, 10, 11, 12, 255, 255, 255, 255, 255, 255, 255 },
-            { 3, 4, 5, 6, 7, 8, 9, 10, 11, 255, 255, 255, 255, 255, 255, 255 },
-            { 2, 3, 4, 5, 6, 7, 8, 9, 10, 255, 255, 255, 255, 255, 255, 255 },
-            { 1, 2, 3, 4, 5, 6, 7, 8, 9, 255, 255, 255, 255, 255, 255, 255 }
-
-        } };
-
-    const uint8x8x2_t index =
-    {
-        {
-            vld1_u8(permutations_table[k].data()),
-            vld1_u8(permutations_table[k].data() + 8)
-        }
-    };
-
-    return index;
-}
-
-inline uint8x8x4_t create_circle_index_register()
-{
-    /*
-        This function creates the index registers to retrieve the 16 texels in the Bresenham circle of radius 3 with center in P.
-
-        . . F 0 1 . . .
-        . E . . . 2 . .
-        D . . . . . 3 .
-        C . . P . . 4 .
-        B . . . . . 5 .
-        . A . . . 6 . .
-        . . 9 8 7 . . .
-
-        Where . is an irrelevant texel value
-
-        We want to retrieve all texels [0,F]
-
-        The 4 registers in r will then be used to get these texels out of two tables in the function get_circle_texels()
-
-        The first table holds the top 4 rows of texels
-        . . F 0 1 . . .
-        . E . . . 2 . .
-        D . . . . . 3 .
-        C . . P . . 4 .
-
-        The second table the bottom 3 rows of texels
-        B . . . . . 5 .
-        . A . . . 6 . .
-        . . 9 8 7 . . .
-
-    */
-    static const std::array<uint8_t, 8> top_right =
-    {
-        /* The register r.val[0] will be used to retrieve these texels:
-        . . . 0 1 . . .
-        . . . . . 2 . .
-        . . . . . . 3 .
-        . . . . . . 4 .
-        */
-        3 /* top table, first row, elem 4, value 0 in the diagram above */,
-        4 /* top table, first row, elem 5, value 1 in the diagram above */,
-        13 /* top table, second row, elem 6, value 2 in the diagram above */,
-        22 /* top table, third row, elem 7, value 3 in the diagram above*/,
-        30 /* top table, fourth row, elem 7, value 4 in the diagram above*/,
-        255,
-        255,
-        255
-    };
-
-    static const std::array<uint8_t, 8> bottom_right =
-    {
-        /* The register r.val[1] will be used to retrieve these texels:
-        . . . . . . 5 .
-        . . . . . 6 . .
-        . . . . 7 . . .
-        */
-        255,
-        255,
-        255,
-        255,
-        255,
-        6 /* low table, first row, elem 7, value 5 in the diagram above*/,
-        13 /* low table, second row, elem 6, value 6 in the diagram above*/,
-        20 /* low table, third row, elem 5, value 7 in the diagram above*/
-    };
-
-    static const std::array<uint8_t, 8> top_left =
-    {
-        /* The register r.val[2] will be used to retrieve these texels:
-        . . F . . . . .
-        . E . . . . . .
-        D . . . . . . .
-        C . . . . . . .
-        */
-        255,
-        255,
-        255,
-        255,
-        24 /* top table, fourth row, elem 1, value C in the diagram above */,
-        16 /* top table, third row, elem 1, value D in the diagram above*/,
-        9 /* top table, second row, elem 2, value E in the diagram above*/,
-        2 /* top table, first row, elem 3, value F in the diagram above*/
-    };
-
-    static const std::array<uint8_t, 8> bottom_left =
-    {
-        /* The register r.val[3] will be used to retrieve these texels:
-        B . . . . . . .
-        . A . . . . . .
-        . . 9 8 . . . .
-        */
-        19 /* low table, third row, elem 4, value 8 in the diagram above */,
-        18 /* low table, third row, elem 3, value 9 in the diagram above */,
-        9 /* low table, second row, elem 2, value A in the diagram above */,
-        0 /* low table, first row, elem 1, value B in the diagram above */,
-        255,
-        255,
-        255,
-        255
-    };
-
-    const uint8x8x4_t reg =
-    {
-        {
-            vld1_u8(top_right.data()),
-            vld1_u8(bottom_right.data()),
-            vld1_u8(top_left.data()),
-            vld1_u8(bottom_left.data())
-        }
-    };
-
-    return reg;
-}
-
-inline uint8x16_t get_circle_texels(const uint8x8x4_t &index, const uint8x8x4_t &tbl_hi, const uint8x8x3_t &tbl_lo)
-{
-    /*
-        This function loads the 16 texels in the Bresenham circle of radius 3 into the register 'texels'.
-        The parameter 'index' is an array of indices which was previously setup in setup_circle_index_register().
-        tbl_hi and tbl_lo are the two tables holding the texels in the window [(-3,-3),(+3,+3)] for a given texel P
-    */
-    return vcombine_u8(vtbx3_u8(vtbl4_u8(tbl_hi, index.val[0]), tbl_lo, index.val[1]),
-                       vtbx3_u8(vtbl4_u8(tbl_hi, index.val[2]), tbl_lo, index.val[3]));
-}
-
-inline uint8x16_t get_permutation_texels(const uint8x8x2_t &permutation_index, const uint8x8x2_t &tbl_circle)
-{
-    /*
-        This function stores the 9 texels of a give permutation X in the neon register 'texels'
-
-        'tbl_circle' is a LUT with the texels 0 to F
-
-        . . F 0 1 . . .
-        . E . . . 2 . .
-        D . . . . . 3 .
-        C . . P . . 4 .
-        B . . . . . 5 .
-        . A . . . 6 . .
-        . . 9 8 7 . . .
-
-        'permutation_index' is one of the permutations below:
-
-        { 0, 1, 2, 3, 4, 5, 6, 7, 8},
-        { F, 0, 1, 2, 3, 4, 5, 6, 7},
-        { E, F, 0, 1, 2, 3, 4, 5, 6},
-        { D, E, F, 0, 1, 2, 3, 4, 5},
-        { C, D, E, F, 0, 1, 2, 3, 4},
-        { B, C, D, E, F, 0, 1, 2, 3},
-        { A, B, C, D, E, F, 0, 1, 2},
-        { 9, A, B, C, D, E, F, 0, 1},
-        { 8, 9, A, B, C, D, E, F, 0},
-        { 7, 8, 9, A, B, C, D, E, F},
-        { 6, 7, 8, 9, A, B, C, D, E},
-        { 5, 6, 7, 8, 9, A, B, C, D},
-        { 4, 5, 6, 7, 8, 9, A, B, C},
-        { 3, 4, 5, 6, 7, 8, 9, A, B},
-        { 2, 3, 4, 5, 6, 7, 8, 9, A},
-        { 1, 2, 3, 4, 5, 6, 7, 8, 9},
-    */
-    static const uint8x8_t perm_right = vdup_n_u8(255); // init to 255 so that vtbx preserves the original values of the lanes
-
-    return vcombine_u8(vtbl2_u8(tbl_circle, permutation_index.val[0]),
-                       vtbx2_u8(perm_right, tbl_circle, permutation_index.val[1]));
-}
-
-inline bool is_permutation_brighter(const uint8x16_t &permutation, const uint8x16_t &pg)
-{
-    const uint8x16_t res_gt = vcgtq_u8(permutation, pg);
-
-    return vget_lane_u64(vreinterpret_u64_u8(vand_u8(vget_high_u8(res_gt), vget_low_u8(res_gt))), 0) == std::numeric_limits<uint64_t>::max();
-}
-
-inline bool is_permutation_darker(const uint8x16_t &permutation, const uint8x16_t &pl)
-{
-    const uint8x16_t res_lt    = vcltq_u8(permutation, pl);
-    const uint64x2_t u64res_lt = vreinterpretq_u64_u8(res_lt);
-    const uint64_t   t3        = vgetq_lane_u64(u64res_lt, 0);
-    const uint64_t   t4        = vgetq_lane_u64(u64res_lt, 1);
-
-    return std::numeric_limits<uint64_t>::max() == t3 && 255 == t4;
-}
-
-inline bool is_permutation_corner(const uint8x16_t &permutation, const uint8x16_t &pg, const uint8x16_t &pl)
-{
-    return is_permutation_brighter(permutation, pg) || is_permutation_darker(permutation, pl);
-}
-
-inline bool point_is_fast_corner(uint8_t p, uint8_t threshold, const uint8x8x2_t &tbl_circle_texels, std::array<uint8x8x2_t, PERMUTATIONS> &perm_indices)
-{
-    /*
-        This function determines whether the point 'p' is a corner.
-    */
-    uint8x16_t pg = vqaddq_u8(vdupq_n_u8(p), vdupq_n_u8(threshold));
-    uint8x16_t pl = vqsubq_u8(vdupq_n_u8(p), vdupq_n_u8(threshold));
-
-    bool corner_detected = false;
-
-    for(size_t j = 0; !corner_detected && j < PERMUTATIONS; ++j)
-    {
-        const uint8x16_t pe_texels = get_permutation_texels(perm_indices[j], tbl_circle_texels);
-        corner_detected            = is_permutation_corner(pe_texels, pg, pl);
-    }
-
-    return corner_detected;
-}
-
-inline uint8x8x2_t create_circle_tbl(const std::array<uint8_t *const __restrict, 7> &buffer, size_t in_offset, const uint8x8x4_t &circle_index_r)
-{
-    /*
-        This function builds a LUT holding the 16 texels in the Brensenham circle radius 3.
-        circle_index_r is a vector of 4 registers to retrieve the texels from the two tables mentioned above.
-    */
-
-    //Load the texels in the window [(x-3,y-3),(x+3,y+3)].
-    //The top 4 rows are loaded in tbl_hi and the low 3 rows in tbl_lo.
-    //These two tables are then used to retrieve the texels in the Bresenham circle of radius 3.
-    const uint8x8x4_t tbl_window_hi =
-    {
-        {
-            vld1_u8(buffer[0] + in_offset),
-            vld1_u8(buffer[1] + in_offset),
-            vld1_u8(buffer[2] + in_offset),
-            vld1_u8(buffer[3] + in_offset)
-        }
-    };
-
-    const uint8x8x3_t tbl_window_lo =
-    {
-        {
-            vld1_u8(buffer[4] + in_offset),
-            vld1_u8(buffer[5] + in_offset),
-            vld1_u8(buffer[6] + in_offset)
-        }
-    };
-
-    const uint8x16_t circle_texels = get_circle_texels(circle_index_r, tbl_window_hi, tbl_window_lo);
-
-    const uint8x8x2_t tbl_circle_texels =
-    {
-        {
-            vget_low_u8(circle_texels),
-            vget_high_u8(circle_texels)
-        }
-    };
-
-    return tbl_circle_texels;
-}
-
-inline uint8_t get_point_score(uint8_t p, uint8_t tolerance, const uint8x8x2_t &tbl_circle, std::array<uint8x8x2_t, PERMUTATIONS> &perm_indices)
-{
-    uint8_t b = 255;
-    uint8_t a = tolerance;
-
-    while(b - a > 1)
-    {
-        const uint16_t ab = a + b;
-        const uint8_t  c  = ab >> 1;
-
-        if(point_is_fast_corner(p, c, tbl_circle, perm_indices))
-        {
-            a = c;
-        }
-        else
-        {
-            b = c;
-        }
-    }
-
-    return a;
-}
-} // namespace
-
-BorderSize NEFastCornersKernel::border_size() const
-{
-    return BorderSize(3);
-}
-
-void NEFastCornersKernel::configure(const IImage *input, IImage *output, uint8_t threshold, bool non_max_suppression, bool border_undefined)
-{
-    ARM_COMPUTE_ERROR_ON_TENSOR_NOT_2D(input);
-    ARM_COMPUTE_ERROR_ON_TENSOR_NOT_2D(output);
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8);
-    ARM_COMPUTE_ERROR_ON_MSG(border_undefined == false, "Not implemented");
-
-    _input               = input;
-    _output              = output;
-    _threshold           = threshold;
-    _non_max_suppression = non_max_suppression;
-
-    constexpr unsigned int num_elems_processed_per_iteration = 1;
-    constexpr unsigned int num_elems_read_per_iteration      = 8;
-    constexpr unsigned int num_elems_written_per_iteration   = 1;
-    constexpr unsigned int num_rows_read_per_iteration       = 7;
-
-    // Configure kernel window
-    Window                 win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration), border_undefined, border_size());
-    AccessWindowHorizontal output_access(output->info(), 0, num_elems_written_per_iteration);
-    AccessWindowRectangle  input_access(input->info(), -border_size().left, -border_size().top, num_elems_read_per_iteration, num_rows_read_per_iteration);
-
-    update_window_and_padding(win, input_access, output_access);
-
-    output_access.set_valid_region(win, input->info()->valid_region(), border_undefined, border_size());
-
-    INEKernel::configure(win);
-}
-
-void NEFastCornersKernel::run(const Window &window, const ThreadInfo &info)
-{
-    ARM_COMPUTE_UNUSED(info);
-    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
-    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
-
-    std::array<uint8x8x2_t, PERMUTATIONS> perm_index{ {} };
-    /*
-        We use a LUT loaded with 7 rows of uint8_t from the input image [-3,-3]...[+3,+3] to retrieve the texels in the Brensenham circle radius 3 and put them in one neon register uint8x16_t.
-        The three lines below setup the neon index registers to get these texels out from the table
-    */
-    const uint8x8x4_t circle_index_r = create_circle_index_register();
-    /*
-        We put the 16 texels (circle) in a LUT to easily generate all the permutations. The for block below setups the indices for each permutation.
-    */
-    for(size_t k = 0; k < PERMUTATIONS; ++k)
-    {
-        perm_index[k] = create_permutation_index(k);
-    }
-
-    Iterator in(_input, window);
-    Iterator out(_output, window);
-
-    const std::array<uint8_t *const __restrict, 7> in_row
-    {
-        _input->ptr_to_element(Coordinates(-3, -3)),
-        _input->ptr_to_element(Coordinates(-3, -2)),
-        _input->ptr_to_element(Coordinates(-3, -1)),
-        _input->ptr_to_element(Coordinates(-3, 0)),
-        _input->ptr_to_element(Coordinates(-3, 1)),
-        _input->ptr_to_element(Coordinates(-3, 2)),
-        _input->ptr_to_element(Coordinates(-3, 3))
-    };
-
-    auto is_rejected = [](uint8_t p, uint8_t q, uint8_t a, uint8_t b)
-    {
-        const bool p_is_in_ab = (a <= p) && (p <= b);
-        const bool q_is_in_ab = (a <= q) && (q <= b);
-        return p_is_in_ab && q_is_in_ab;
-    };
-
-    execute_window_loop(window, [&](const Coordinates &)
-    {
-        const size_t  in_offset = in.offset();
-        const uint8_t p0        = *in.ptr();
-        const uint8_t b         = std::min(p0 + _threshold, 255);
-        const uint8_t a         = std::max(p0 - _threshold, 0);
-        uint8_t       score     = 0;
-        /*
-            Fast check to discard points which cannot be corners and avoid the expensive computation of the potential 16 permutations
-
-            pixels 1 and 9 are examined, if both I1 and I9 are within [Ip - t, Ip + t], then candidate p is not a corner.
-        */
-        const uint8_t p1 = (in_offset + in_row[0])[3];
-        const uint8_t p9 = (in_offset + in_row[6])[3];
-
-        if(!is_rejected(p1, p9, a, b))
-        {
-            /* pixels 5 and 13 are further examined to check whether three of them are brighter than Ip + t or darker than Ip - t */
-            const uint8_t p5  = (in_offset + in_row[3])[6];
-            const uint8_t p13 = (in_offset + in_row[3])[0];
-
-            if(!is_rejected(p5, p13, a, b))
-            {
-                /* at this stage we use the full test with the 16 permutations to classify the point as corner or not */
-                const uint8x8x2_t tbl_circle_texel = create_circle_tbl(in_row, in_offset, circle_index_r);
-
-                if(point_is_fast_corner(p0, _threshold, tbl_circle_texel, perm_index))
-                {
-                    if(_non_max_suppression)
-                    {
-                        score = get_point_score(p0, _threshold, tbl_circle_texel, perm_index);
-                    }
-                    else
-                    {
-                        score = 1;
-                    }
-                }
-            }
-        }
-
-        *out.ptr() = score;
-    },
-    in, out);
-}
diff --git a/src/core/NEON/kernels/NEFastCornersKernel.h b/src/core/NEON/kernels/NEFastCornersKernel.h
deleted file mode 100644
index f981d72a03..0000000000
--- a/src/core/NEON/kernels/NEFastCornersKernel.h
+++ /dev/null
@@ -1,78 +0,0 @@
-/*
- * Copyright (c) 2016-2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_NEFASTCORNERSKERNEL_H
-#define ARM_COMPUTE_NEFASTCORNERSKERNEL_H
-
-#include "arm_compute/core/Types.h"
-#include "src/core/NEON/INEKernel.h"
-
-#include <cstdint>
-
-namespace arm_compute
-{
-class ITensor;
-using IImage = ITensor;
-
-/** Neon kernel to perform fast corners */
-class NEFastCornersKernel : public INEKernel
-{
-public:
-    const char *name() const override
-    {
-        return "NEFastCornersKernel";
-    }
-    /** Constructor */
-    NEFastCornersKernel();
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    NEFastCornersKernel(const NEFastCornersKernel &) = delete;
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    NEFastCornersKernel &operator=(const NEFastCornersKernel &) = delete;
-    /** Allow instances of this class to be moved */
-    NEFastCornersKernel(NEFastCornersKernel &&) = default;
-    /** Allow instances of this class to be moved */
-    NEFastCornersKernel &operator=(NEFastCornersKernel &&) = default;
-    /** Default destructor */
-    ~NEFastCornersKernel() = default;
-    /** Initialise the kernel.
-     *
-     * @param[in]  input               Source image. Data type supported: U8.
-     * @param[out] output              Output image. Data type supported: U8.
-     * @param[in]  threshold           Threshold on difference between intensity of the central pixel and pixels on Bresenham's circle of radius 3.
-     * @param[in]  non_max_suppression True if non-maxima suppresion is applied, false otherwise.
-     * @param[in]  border_undefined    True if the border mode is undefined. False if it's replicate or constant.
-     */
-    void configure(const IImage *input, IImage *output, uint8_t threshold, bool non_max_suppression, bool border_undefined);
-
-    // Inherited methods overridden:
-    void run(const Window &window, const ThreadInfo &info) override;
-    BorderSize border_size() const override;
-
-private:
-    const IImage *_input;               /**< source image */
-    IImage       *_output;              /**< inermediate results */
-    uint8_t       _threshold;           /**< threshold on difference between intensity */
-    bool          _non_max_suppression; /** true if non-maxima suppression is applied in the next stage */
-};
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_NEFASTCORNERSKERNEL_H */
diff --git a/src/core/NEON/kernels/NEGaussian3x3Kernel.cpp b/src/core/NEON/kernels/NEGaussian3x3Kernel.cpp
deleted file mode 100644
index 63b26ab7c0..0000000000
--- a/src/core/NEON/kernels/NEGaussian3x3Kernel.cpp
+++ /dev/null
@@ -1,135 +0,0 @@
-/*
- * Copyright (c) 2016-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "src/core/NEON/kernels/NEGaussian3x3Kernel.h"
-
-#include "arm_compute/core/Coordinates.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/ITensor.h"
-#include "arm_compute/core/Validate.h"
-#include "src/core/NEON/INEKernel.h"
-#include "src/core/helpers/AutoConfiguration.h"
-#include "src/core/helpers/WindowHelpers.h"
-
-#include <arm_neon.h>
-
-using namespace arm_compute;
-
-BorderSize NEGaussian3x3Kernel::border_size() const
-{
-    return BorderSize(1);
-}
-
-void NEGaussian3x3Kernel::configure(const ITensor *input, ITensor *output, bool border_undefined)
-{
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8);
-
-    _input  = input;
-    _output = output;
-
-    // Configure kernel window
-    constexpr unsigned int num_elems_processed_per_iteration = 8;
-    constexpr unsigned int num_elems_read_per_iteration      = 16;
-    constexpr unsigned int num_elems_written_per_iteration   = 8;
-    constexpr unsigned int num_rows_read_per_iteration       = 3;
-
-    Window                 win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration), border_undefined, border_size());
-    AccessWindowHorizontal output_access(output->info(), 0, num_elems_written_per_iteration);
-
-    update_window_and_padding(win,
-                              AccessWindowRectangle(input->info(), -border_size().left, -border_size().top, num_elems_read_per_iteration, num_rows_read_per_iteration),
-                              output_access);
-
-    output_access.set_valid_region(win, input->info()->valid_region(), border_undefined, border_size());
-
-    INEKernel::configure(win);
-}
-
-void NEGaussian3x3Kernel::run(const Window &window, const ThreadInfo &info)
-{
-    ARM_COMPUTE_UNUSED(info);
-    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
-    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INESimpleKernel::window(), window);
-
-    Iterator input(_input, window);
-    Iterator output(_output, window);
-
-    const uint8_t *input_bot_ptr = _input->ptr_to_element(Coordinates(-1, -1));
-    const uint8_t *input_mid_ptr = _input->ptr_to_element(Coordinates(-1, 0));
-    const uint8_t *input_top_ptr = _input->ptr_to_element(Coordinates(-1, +1));
-
-    static const int16x8_t two  = vdupq_n_s16(2);
-    static const int16x8_t four = vdupq_n_s16(4);
-
-    execute_window_loop(window, [&](const Coordinates &)
-    {
-        uint8x16_t top_data = vld1q_u8(input_top_ptr + input.offset());
-        uint8x16_t mid_data = vld1q_u8(input_mid_ptr + input.offset());
-        uint8x16_t bot_data = vld1q_u8(input_bot_ptr + input.offset());
-
-        const int16x8x2_t top_s16 =
-        {
-            {
-                vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(top_data))),
-                vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(top_data)))
-            }
-        };
-        const int16x8x2_t mid_s16 =
-        {
-            {
-                vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(mid_data))),
-                vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(mid_data)))
-            }
-        };
-        const int16x8x2_t bot_s16 =
-        {
-            {
-                vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(bot_data))),
-                vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(bot_data)))
-            }
-        };
-
-        //top left
-        int16x8_t out = top_s16.val[0];
-        //top mid
-        out = vmlaq_s16(out, vextq_s16(top_s16.val[0], top_s16.val[1], 1), two);
-        //top right
-        out = vaddq_s16(out, vextq_s16(top_s16.val[0], top_s16.val[1], 2));
-        //mid left
-        out = vmlaq_s16(out, mid_s16.val[0], two);
-        //mid mid
-        out = vmlaq_s16(out, vextq_s16(mid_s16.val[0], mid_s16.val[1], 1), four);
-        //mid right
-        out = vmlaq_s16(out, vextq_s16(mid_s16.val[0], mid_s16.val[1], 2), two);
-        //bot left
-        out = vaddq_s16(out, bot_s16.val[0]);
-        //bot mid
-        out = vmlaq_s16(out, vextq_s16(bot_s16.val[0], bot_s16.val[1], 1), two);
-        //bot right
-        out = vaddq_s16(out, vextq_s16(bot_s16.val[0], bot_s16.val[1], 2));
-
-        vst1_u8(output.ptr(), vqshrun_n_s16(out, 4));
-    },
-    input, output);
-}
diff --git a/src/core/NEON/kernels/NEGaussian3x3Kernel.h b/src/core/NEON/kernels/NEGaussian3x3Kernel.h
deleted file mode 100644
index 7ceea2e7c1..0000000000
--- a/src/core/NEON/kernels/NEGaussian3x3Kernel.h
+++ /dev/null
@@ -1,66 +0,0 @@
-/*
- * Copyright (c) 2016-2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_NEGAUSSIAN3x3KERNEL_H
-#define ARM_COMPUTE_NEGAUSSIAN3x3KERNEL_H
-
-#include "src/core/NEON/INESimpleKernel.h"
-
-namespace arm_compute
-{
-class ITensor;
-
-/** Neon kernel to perform a Gaussian 3x3 filter */
-class NEGaussian3x3Kernel : public INESimpleKernel
-{
-public:
-    const char *name() const override
-    {
-        return "NEGaussian3x3Kernel";
-    }
-    /** Constructor */
-    NEGaussian3x3Kernel() = default;
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    NEGaussian3x3Kernel(const NEGaussian3x3Kernel &) = delete;
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    NEGaussian3x3Kernel &operator=(const NEGaussian3x3Kernel &) = delete;
-    /** Allow instances of this class to be moved */
-    NEGaussian3x3Kernel(NEGaussian3x3Kernel &&) = default;
-    /** Allow instances of this class to be moved */
-    NEGaussian3x3Kernel &operator=(NEGaussian3x3Kernel &&) = default;
-    /** Default destructor */
-    ~NEGaussian3x3Kernel() = default;
-    /** Set the source, destination and border mode of the kernel
-     *
-     * @param[in]  input            Source tensor. Data type supported: U8
-     * @param[out] output           Destination tensor. Data type supported: same as @p input
-     * @param[in]  border_undefined True if the border mode is undefined. False if it's replicate or constant.
-     */
-    void configure(const ITensor *input, ITensor *output, bool border_undefined);
-
-    // Inherited methods overridden:
-    void run(const Window &window, const ThreadInfo &info) override;
-    BorderSize border_size() const override;
-};
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_NEGAUSSIAN3x3KERNEL_H */
diff --git a/src/core/NEON/kernels/NEGaussian5x5Kernel.cpp b/src/core/NEON/kernels/NEGaussian5x5Kernel.cpp
deleted file mode 100644
index ab2feb0dc2..0000000000
--- a/src/core/NEON/kernels/NEGaussian5x5Kernel.cpp
+++ /dev/null
@@ -1,211 +0,0 @@
-/*
- * Copyright (c) 2016-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "src/core/NEON/kernels/NEGaussian5x5Kernel.h"
-
-#include "arm_compute/core/Coordinates.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/ITensor.h"
-#include "arm_compute/core/Types.h"
-#include "arm_compute/core/Validate.h"
-#include "arm_compute/core/Window.h"
-#include "src/core/NEON/INEKernel.h"
-#include "src/core/helpers/AutoConfiguration.h"
-#include "src/core/helpers/WindowHelpers.h"
-
-#include <arm_neon.h>
-#include <cstddef>
-#include <cstdint>
-
-using namespace arm_compute;
-
-NEGaussian5x5HorKernel::NEGaussian5x5HorKernel()
-    : _border_size(0)
-{
-}
-
-BorderSize NEGaussian5x5HorKernel::border_size() const
-{
-    return _border_size;
-}
-
-void NEGaussian5x5HorKernel::configure(const ITensor *input, ITensor *output, bool border_undefined)
-{
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::S16);
-
-    _input       = input;
-    _output      = output;
-    _border_size = BorderSize(border_undefined ? 0 : 2, 2);
-
-    // Configure kernel window
-    constexpr unsigned int num_elems_processed_per_iteration = 8;
-    constexpr unsigned int num_elems_read_per_iteration      = 16;
-    constexpr unsigned int num_elems_written_per_iteration   = 8;
-
-    Window                 win = calculate_max_window_horizontal(*input->info(), Steps(num_elems_processed_per_iteration), border_undefined, border_size());
-    AccessWindowHorizontal output_access(output->info(), 0, num_elems_written_per_iteration);
-
-    update_window_and_padding(win,
-                              AccessWindowHorizontal(input->info(), -border_size().left, num_elems_read_per_iteration),
-                              output_access);
-
-    output_access.set_valid_region(win, input->info()->valid_region(), border_undefined, border_size());
-
-    INEKernel::configure(win);
-}
-
-void NEGaussian5x5HorKernel::run(const Window &window, const ThreadInfo &info)
-{
-    ARM_COMPUTE_UNUSED(info);
-    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
-    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
-
-    Window win_in(window);
-    win_in.shift(Window::DimX, -2);
-
-    Iterator input(_input, win_in);
-    Iterator output(_output, window);
-
-    static const int16x8_t six  = vdupq_n_s16(6);
-    static const int16x8_t four = vdupq_n_s16(4);
-
-    execute_window_loop(window, [&](const Coordinates &)
-    {
-        uint8x16_t data = vld1q_u8(input.ptr());
-
-        const int16x8x2_t data_s16 =
-        {
-            {
-                vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(data))),
-                vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(data)))
-            }
-        };
-
-        int16x8_t out = vaddq_s16(data_s16.val[0], vextq_s16(data_s16.val[0], data_s16.val[1], 4));
-        out           = vmlaq_s16(out, vextq_s16(data_s16.val[0], data_s16.val[1], 1), four);
-        out           = vmlaq_s16(out, vextq_s16(data_s16.val[0], data_s16.val[1], 2), six);
-        out           = vmlaq_s16(out, vextq_s16(data_s16.val[0], data_s16.val[1], 3), four);
-
-        vst1q_s16(reinterpret_cast<int16_t *>(output.ptr()), out);
-    },
-    input, output);
-}
-
-NEGaussian5x5VertKernel::NEGaussian5x5VertKernel()
-{
-}
-
-BorderSize NEGaussian5x5VertKernel::border_size() const
-{
-    return BorderSize{ 2, 0 };
-}
-
-void NEGaussian5x5VertKernel::configure(const ITensor *input, ITensor *output, bool border_undefined)
-{
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::S16);
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8);
-
-    _input  = input;
-    _output = output;
-
-    // Configure kernel window
-    constexpr unsigned int num_elems_processed_per_iteration = 16;
-    constexpr unsigned int num_elems_read_per_iteration      = 32;
-    constexpr unsigned int num_elems_written_per_iteration   = 16;
-    constexpr unsigned int num_rows_read_per_iteration       = 5;
-
-    Window                 win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration), border_undefined, border_size());
-    AccessWindowHorizontal output_access(output->info(), 0, num_elems_written_per_iteration);
-
-    update_window_and_padding(win,
-                              AccessWindowRectangle(input->info(), 0, -border_size().top, num_elems_read_per_iteration, num_rows_read_per_iteration),
-                              output_access);
-
-    output_access.set_valid_region(win, input->info()->valid_region(), border_undefined, border_size());
-
-    INEKernel::configure(win);
-}
-
-void NEGaussian5x5VertKernel::run(const Window &window, const ThreadInfo &info)
-{
-    ARM_COMPUTE_UNUSED(info);
-    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
-    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INESimpleKernel::window(), window);
-
-    Iterator input(_input, window);
-    Iterator output(_output, window);
-
-    const uint8_t *input_top2_ptr = _input->ptr_to_element(Coordinates(0, -2));
-    const uint8_t *input_top_ptr  = _input->ptr_to_element(Coordinates(0, -1));
-    const uint8_t *input_mid_ptr  = _input->ptr_to_element(Coordinates(0, 0));
-    const uint8_t *input_low_ptr  = _input->ptr_to_element(Coordinates(0, 1));
-    const uint8_t *input_low2_ptr = _input->ptr_to_element(Coordinates(0, 2));
-
-    const uint16x8_t six  = vdupq_n_u16(6);
-    const uint16x8_t four = vdupq_n_u16(4);
-
-    execute_window_loop(window, [&](const Coordinates &)
-    {
-        const size_t input_offset_high_s16 = input.offset();
-        const size_t input_offset_low_s16  = input.offset() + 16;
-
-        //HIGH DATA
-        //top2
-        uint16x8_t data_high = vreinterpretq_u16_s16(vld1q_s16(reinterpret_cast<const int16_t *>(input_top2_ptr + input_offset_high_s16)));
-        uint16x8_t out_high  = data_high;
-        //top
-        data_high = vreinterpretq_u16_s16(vld1q_s16(reinterpret_cast<const int16_t *>(input_top_ptr + input_offset_high_s16)));
-        out_high  = vmlaq_u16(out_high, data_high, four);
-        //mid
-        data_high = vreinterpretq_u16_s16(vld1q_s16(reinterpret_cast<const int16_t *>(input_mid_ptr + input_offset_high_s16)));
-        out_high  = vmlaq_u16(out_high, data_high, six);
-        //low
-        data_high = vreinterpretq_u16_s16(vld1q_s16(reinterpret_cast<const int16_t *>(input_low_ptr + input_offset_high_s16)));
-        out_high  = vmlaq_u16(out_high, data_high, four);
-        //low2
-        data_high = vreinterpretq_u16_s16(vld1q_s16(reinterpret_cast<const int16_t *>(input_low2_ptr + input_offset_high_s16)));
-        out_high  = vaddq_u16(out_high, data_high);
-
-        //LOW DATA
-        //top2
-        uint16x8_t data_low = vreinterpretq_u16_s16(vld1q_s16(reinterpret_cast<const int16_t *>(input_top2_ptr + input_offset_low_s16)));
-        uint16x8_t out_low  = data_low;
-        //top
-        data_low = vreinterpretq_u16_s16(vld1q_s16(reinterpret_cast<const int16_t *>(input_top_ptr + input_offset_low_s16)));
-        out_low  = vmlaq_u16(out_low, data_low, four);
-        //mid
-        data_low = vreinterpretq_u16_s16(vld1q_s16(reinterpret_cast<const int16_t *>(input_mid_ptr + input_offset_low_s16)));
-        out_low  = vmlaq_u16(out_low, data_low, six);
-        //low
-        data_low = vreinterpretq_u16_s16(vld1q_s16(reinterpret_cast<const int16_t *>(input_low_ptr + input_offset_low_s16)));
-        out_low  = vmlaq_u16(out_low, data_low, four);
-        //low2
-        data_low = vreinterpretq_u16_s16(vld1q_s16(reinterpret_cast<const int16_t *>(input_low2_ptr + input_offset_low_s16)));
-        out_low  = vaddq_u16(out_low, data_low);
-
-        vst1q_u8(output.ptr(), vcombine_u8(vqshrn_n_u16(out_high, 8),
-                                           vqshrn_n_u16(out_low, 8)));
-    },
-    input, output);
-}
diff --git a/src/core/NEON/kernels/NEGaussian5x5Kernel.h b/src/core/NEON/kernels/NEGaussian5x5Kernel.h
deleted file mode 100644
index 2c7262f827..0000000000
--- a/src/core/NEON/kernels/NEGaussian5x5Kernel.h
+++ /dev/null
@@ -1,103 +0,0 @@
-/*
- * Copyright (c) 2016-2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_NEGAUSSIAN5x5KERNEL_H
-#define ARM_COMPUTE_NEGAUSSIAN5x5KERNEL_H
-
-#include "src/core/NEON/INESimpleKernel.h"
-
-namespace arm_compute
-{
-class ITensor;
-
-/** Neon kernel to perform a Gaussian 5x5 filter (horizontal pass) */
-class NEGaussian5x5HorKernel : public INESimpleKernel
-{
-public:
-    const char *name() const override
-    {
-        return "NEGaussian5x5HorKernel";
-    }
-    /** Default constructor */
-    NEGaussian5x5HorKernel();
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    NEGaussian5x5HorKernel(NEGaussian5x5HorKernel &) = delete;
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    NEGaussian5x5HorKernel &operator=(NEGaussian5x5HorKernel &) = delete;
-    /** Allow instances of this class to be moved */
-    NEGaussian5x5HorKernel(NEGaussian5x5HorKernel &&) = default;
-    /** Allow instances of this class to be moved */
-    NEGaussian5x5HorKernel &operator=(NEGaussian5x5HorKernel &&) = default;
-    /** Default destructor */
-    ~NEGaussian5x5HorKernel() = default;
-
-    /** Initialise the kernel's source, destination and border mode.
-     *
-     * @param[in]  input            Source tensor. Data type supported: U8.
-     * @param[out] output           Destination tensor. Data type supported: S16.
-     * @param[in]  border_undefined True if the border mode is undefined. False if it's replicate or constant.
-     */
-    void configure(const ITensor *input, ITensor *output, bool border_undefined);
-
-    // Inherited methods overridden:
-    void run(const Window &window, const ThreadInfo &info) override;
-    BorderSize border_size() const override;
-
-private:
-    BorderSize _border_size;
-};
-
-/** Neon kernel to perform a Gaussian 5x5 filter (vertical pass) */
-class NEGaussian5x5VertKernel : public INESimpleKernel
-{
-public:
-    const char *name() const override
-    {
-        return "NEGaussian5x5VertKernel";
-    }
-    /** Default constructor */
-    NEGaussian5x5VertKernel();
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    NEGaussian5x5VertKernel(NEGaussian5x5VertKernel &) = delete;
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    NEGaussian5x5VertKernel &operator=(NEGaussian5x5VertKernel &) = delete;
-    /** Allow instances of this class to be moved */
-    NEGaussian5x5VertKernel(NEGaussian5x5VertKernel &&) = default;
-    /** Allow instances of this class to be moved */
-    NEGaussian5x5VertKernel &operator=(NEGaussian5x5VertKernel &&) = default;
-    /** Default destructor */
-    ~NEGaussian5x5VertKernel() = default;
-    /** Initialise the kernel's source, destination and border mode.
-     *
-     * @param[in]  input            Source tensor. Data type supported: S16.
-     * @param[out] output           Destination tensor, Data type supported: U8.
-     * @param[in]  border_undefined True if the border mode is undefined. False if it's replicate or constant.
-     */
-    void configure(const ITensor *input, ITensor *output, bool border_undefined);
-
-    // Inherited methods overridden:
-    void run(const Window &window, const ThreadInfo &info) override;
-    BorderSize border_size() const override;
-};
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_NEGAUSSIAN5x5KERNEL_H */
diff --git a/src/core/NEON/kernels/NEGaussianPyramidKernel.cpp b/src/core/NEON/kernels/NEGaussianPyramidKernel.cpp
deleted file mode 100644
index 49c8e9ec3e..0000000000
--- a/src/core/NEON/kernels/NEGaussianPyramidKernel.cpp
+++ /dev/null
@@ -1,272 +0,0 @@
-/*
- * Copyright (c) 2016-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "src/core/NEON/kernels/NEGaussianPyramidKernel.h"
-
-#include "arm_compute/core/Coordinates.h"
-#include "arm_compute/core/Error.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/ITensor.h"
-#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Types.h"
-#include "arm_compute/core/Validate.h"
-#include "arm_compute/core/Window.h"
-#include "src/core/NEON/INEKernel.h"
-#include "src/core/helpers/AutoConfiguration.h"
-#include "src/core/helpers/WindowHelpers.h"
-
-#include <arm_neon.h>
-#include <cstddef>
-#include <cstdint>
-#include <tuple>
-
-using namespace arm_compute;
-
-NEGaussianPyramidHorKernel::NEGaussianPyramidHorKernel()
-    : _l2_load_offset(0)
-{
-}
-
-BorderSize NEGaussianPyramidHorKernel::border_size() const
-{
-    return BorderSize{ 0, 2 };
-}
-
-void NEGaussianPyramidHorKernel::configure(const ITensor *input, ITensor *output)
-{
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::S16);
-    ARM_COMPUTE_ERROR_ON(input->info()->dimension(1) != output->info()->dimension(1));
-
-    for(size_t i = 2; i < Coordinates::num_max_dimensions; ++i)
-    {
-        ARM_COMPUTE_ERROR_ON(input->info()->dimension(i) != output->info()->dimension(i));
-    }
-
-    _input  = input;
-    _output = output;
-
-    // Configure kernel window
-    constexpr unsigned int num_elems_processed_per_iteration = 16;
-    constexpr unsigned int num_elems_read_per_iteration      = 32;
-    constexpr unsigned int num_elems_written_per_iteration   = 8;
-    const float            scale_x                           = static_cast<float>(output->info()->dimension(0)) / input->info()->dimension(0);
-
-    Window                 win = calculate_max_window_horizontal(*input->info(), Steps(num_elems_processed_per_iteration));
-    AccessWindowHorizontal output_access(output->info(), 0, num_elems_written_per_iteration, scale_x);
-
-    // Sub sampling selects odd pixels (1, 3, 5, ...) for images with even
-    // width and even pixels (0, 2, 4, ...) for images with odd width. (Whether
-    // a pixel is even or odd is determined based on the tensor shape not the
-    // valid region!)
-    // Thus the offset from which the first pixel (L2) for the convolution is
-    // loaded depends on the anchor and shape of the valid region.
-    // In the case of an even shape (= even image width) we need to load L2
-    // from -2 if the anchor is odd and from -1 if the anchor is even. That
-    // makes sure that L2 is always loaded from an odd pixel.
-    // On the other hand, for an odd shape (= odd image width) we need to load
-    // L2 from -1 if the anchor is odd and from -2 if the anchor is even to
-    // achieve the opposite effect.
-    // The condition can be simplified to checking whether anchor + shape is
-    // odd (-2) or even (-1) as only adding an odd and an even number will have
-    // an odd result.
-    _l2_load_offset = -border_size().left;
-
-    if((_input->info()->valid_region().anchor[0] + _input->info()->valid_region().shape[0]) % 2 == 0)
-    {
-        _l2_load_offset += 1;
-    }
-
-    // Replace input access with static window
-    update_window_and_padding(win,
-                              AccessWindowHorizontal(input->info(), _l2_load_offset, num_elems_read_per_iteration),
-                              output_access);
-
-    output->info()->set_valid_region(ValidRegion(Coordinates(), output->info()->tensor_shape()));
-
-    INEKernel::configure(win);
-}
-
-void NEGaussianPyramidHorKernel::run(const Window &window, const ThreadInfo &info)
-{
-    ARM_COMPUTE_UNUSED(info);
-    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
-    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
-    ARM_COMPUTE_ERROR_ON(window.x().step() % 2);
-
-    static const int16x8_t six  = vdupq_n_s16(6);
-    static const int16x8_t four = vdupq_n_s16(4);
-
-    Window win_in(window);
-    win_in.shift(Window::DimX, _l2_load_offset);
-
-    Iterator in(_input, win_in);
-
-    // The output is half the width of the input
-    Window win_out(window);
-    win_out.scale(Window::DimX, 0.5f);
-
-    Iterator out(_output, win_out);
-
-    execute_window_loop(window, [&](const Coordinates &)
-    {
-        const uint8x16x2_t data_2q   = vld2q_u8(in.ptr());
-        const uint8x16_t &data_even = data_2q.val[0];
-        const uint8x16_t &data_odd  = data_2q.val[1];
-
-        const int16x8_t data_l2 = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(data_even)));
-        const int16x8_t data_l1 = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(data_odd)));
-        const int16x8_t data_m  = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(vextq_u8(data_even, data_even, 1))));
-        const int16x8_t data_r1 = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(vextq_u8(data_odd, data_odd, 1))));
-        const int16x8_t data_r2 = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(vextq_u8(data_even, data_even, 2))));
-
-        int16x8_t out_val = vaddq_s16(data_l2, data_r2);
-        out_val           = vmlaq_s16(out_val, data_l1, four);
-        out_val           = vmlaq_s16(out_val, data_m, six);
-        out_val           = vmlaq_s16(out_val, data_r1, four);
-
-        vst1q_s16(reinterpret_cast<int16_t *>(out.ptr()), out_val);
-    },
-    in, out);
-}
-
-NEGaussianPyramidVertKernel::NEGaussianPyramidVertKernel()
-    : _t2_load_offset(0)
-{
-}
-
-BorderSize NEGaussianPyramidVertKernel::border_size() const
-{
-    return BorderSize{ 2, 0 };
-}
-
-void NEGaussianPyramidVertKernel::configure(const ITensor *input, ITensor *output)
-{
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::S16);
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8);
-    ARM_COMPUTE_ERROR_ON(input->info()->dimension(0) != output->info()->dimension(0));
-
-    for(size_t i = 2; i < Coordinates::num_max_dimensions; ++i)
-    {
-        ARM_COMPUTE_ERROR_ON(input->info()->dimension(i) != output->info()->dimension(i));
-    }
-
-    _input  = input;
-    _output = output;
-
-    // Configure kernel window
-    constexpr unsigned int num_elems_processed_per_iteration = 16;
-    constexpr unsigned int num_rows_processed_per_iteration  = 2;
-
-    constexpr unsigned int num_elems_written_per_iteration = 16;
-    constexpr unsigned int num_rows_written_per_iteration  = 1;
-
-    constexpr unsigned int num_elems_read_per_iteration = 16;
-    constexpr unsigned int num_rows_read_per_iteration  = 5;
-
-    const float scale_y = static_cast<float>(output->info()->dimension(1)) / input->info()->dimension(1);
-
-    Window                win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration, num_rows_processed_per_iteration));
-    AccessWindowRectangle output_access(output->info(), 0, 0, num_elems_written_per_iteration, num_rows_written_per_iteration, 1.f, scale_y);
-
-    // Determine whether we need to load even or odd rows. See above for a
-    // detailed explanation.
-    _t2_load_offset = -border_size().top;
-
-    if((_input->info()->valid_region().anchor[1] + _input->info()->valid_region().shape[1]) % 2 == 0)
-    {
-        _t2_load_offset += 1;
-    }
-
-    update_window_and_padding(win,
-                              AccessWindowRectangle(input->info(), 0, _t2_load_offset, num_elems_read_per_iteration, num_rows_read_per_iteration),
-                              output_access);
-
-    output->info()->set_valid_region(ValidRegion(Coordinates(), output->info()->tensor_shape()));
-
-    INEKernel::configure(win);
-}
-
-void NEGaussianPyramidVertKernel::run(const Window &window, const ThreadInfo &info)
-{
-    ARM_COMPUTE_UNUSED(info);
-    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
-    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
-    ARM_COMPUTE_ERROR_ON(window.x().step() != 16);
-    ARM_COMPUTE_ERROR_ON(window.y().step() % 2);
-    ARM_COMPUTE_ERROR_ON(_input->buffer() == nullptr);
-
-    static const uint16x8_t six  = vdupq_n_u16(6);
-    static const uint16x8_t four = vdupq_n_u16(4);
-
-    Window win_in(window);
-    // Need to load two times 8 values instead of 16 values once
-    win_in.set_dimension_step(Window::DimX, 8);
-    win_in.shift(Window::DimY, _t2_load_offset);
-
-    Iterator in(_input, win_in);
-
-    // Output's height is half of input's
-    Window win_out(window);
-    win_out.scale(Window::DimY, 0.5f);
-
-    Iterator out(_output, win_out);
-
-    const uint8_t *input_top2_ptr = _input->buffer() + _input->info()->offset_element_in_bytes(Coordinates(0, 0));
-    const uint8_t *input_top_ptr  = _input->buffer() + _input->info()->offset_element_in_bytes(Coordinates(0, 1));
-    const uint8_t *input_mid_ptr  = _input->buffer() + _input->info()->offset_element_in_bytes(Coordinates(0, 2));
-    const uint8_t *input_low_ptr  = _input->buffer() + _input->info()->offset_element_in_bytes(Coordinates(0, 3));
-    const uint8_t *input_low2_ptr = _input->buffer() + _input->info()->offset_element_in_bytes(Coordinates(0, 4));
-
-    execute_window_loop(window, [&](const Coordinates &)
-    {
-        // Low data
-        const uint16x8_t data_low_t2 = vreinterpretq_u16_s16(vld1q_s16(reinterpret_cast<const int16_t *>(input_top2_ptr + in.offset())));
-        const uint16x8_t data_low_t1 = vreinterpretq_u16_s16(vld1q_s16(reinterpret_cast<const int16_t *>(input_top_ptr + in.offset())));
-        const uint16x8_t data_low_m  = vreinterpretq_u16_s16(vld1q_s16(reinterpret_cast<const int16_t *>(input_mid_ptr + in.offset())));
-        const uint16x8_t data_low_b1 = vreinterpretq_u16_s16(vld1q_s16(reinterpret_cast<const int16_t *>(input_low_ptr + in.offset())));
-        const uint16x8_t data_low_b2 = vreinterpretq_u16_s16(vld1q_s16(reinterpret_cast<const int16_t *>(input_low2_ptr + in.offset())));
-
-        uint16x8_t out_low = vaddq_u16(data_low_t2, data_low_b2);
-        out_low            = vmlaq_u16(out_low, data_low_t1, four);
-        out_low            = vmlaq_u16(out_low, data_low_m, six);
-        out_low            = vmlaq_u16(out_low, data_low_b1, four);
-
-        in.increment(Window::DimX);
-
-        // High data
-        const uint16x8_t data_high_t2 = vreinterpretq_u16_s16(vld1q_s16(reinterpret_cast<const int16_t *>(input_top2_ptr + in.offset())));
-        const uint16x8_t data_high_t1 = vreinterpretq_u16_s16(vld1q_s16(reinterpret_cast<const int16_t *>(input_top_ptr + in.offset())));
-        const uint16x8_t data_high_m  = vreinterpretq_u16_s16(vld1q_s16(reinterpret_cast<const int16_t *>(input_mid_ptr + in.offset())));
-        const uint16x8_t data_high_b1 = vreinterpretq_u16_s16(vld1q_s16(reinterpret_cast<const int16_t *>(input_low_ptr + in.offset())));
-        const uint16x8_t data_high_b2 = vreinterpretq_u16_s16(vld1q_s16(reinterpret_cast<const int16_t *>(input_low2_ptr + in.offset())));
-
-        uint16x8_t out_high = vaddq_u16(data_high_t2, data_high_b2);
-        out_high            = vmlaq_u16(out_high, data_high_t1, four);
-        out_high            = vmlaq_u16(out_high, data_high_m, six);
-        out_high            = vmlaq_u16(out_high, data_high_b1, four);
-
-        vst1q_u8(out.ptr(), vcombine_u8(vqshrn_n_u16(out_low, 8), vqshrn_n_u16(out_high, 8)));
-    },
-    in, out);
-}
diff --git a/src/core/NEON/kernels/NEGaussianPyramidKernel.h b/src/core/NEON/kernels/NEGaussianPyramidKernel.h
deleted file mode 100644
index d943990535..0000000000
--- a/src/core/NEON/kernels/NEGaussianPyramidKernel.h
+++ /dev/null
@@ -1,105 +0,0 @@
-/*
- * Copyright (c) 2016-2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_NEGAUSSIANPYRAMIDKERNEL_H
-#define ARM_COMPUTE_NEGAUSSIANPYRAMIDKERNEL_H
-
-#include "src/core/NEON/INESimpleKernel.h"
-
-namespace arm_compute
-{
-class ITensor;
-
-/** Neon kernel to perform a GaussianPyramid (horizontal pass) */
-class NEGaussianPyramidHorKernel : public INESimpleKernel
-{
-public:
-    const char *name() const override
-    {
-        return "NEGaussianPyramidHorKernel";
-    }
-    /** Default constructor */
-    NEGaussianPyramidHorKernel();
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    NEGaussianPyramidHorKernel(NEGaussianPyramidHorKernel &) = delete;
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    NEGaussianPyramidHorKernel &operator=(NEGaussianPyramidHorKernel &) = delete;
-    /** Allow instances of this class to be moved */
-    NEGaussianPyramidHorKernel(NEGaussianPyramidHorKernel &&) = default;
-    /** Allow instances of this class to be moved */
-    NEGaussianPyramidHorKernel &operator=(NEGaussianPyramidHorKernel &&) = default;
-    /** Default destructor */
-    ~NEGaussianPyramidHorKernel() = default;
-
-    /** Initialise the kernel's source, destination and border mode.
-     *
-     * @param[in]  input  Source tensor. Data type supported: U8.
-     * @param[out] output Destination tensor. Output should have half the input width. Data type supported: S16.
-     */
-    void configure(const ITensor *input, ITensor *output);
-
-    // Inherited methods overridden:
-    void run(const Window &window, const ThreadInfo &info) override;
-    BorderSize border_size() const override;
-
-private:
-    int _l2_load_offset;
-};
-
-/** Neon kernel to perform a GaussianPyramid (vertical pass) */
-class NEGaussianPyramidVertKernel : public INESimpleKernel
-{
-public:
-    const char *name() const override
-    {
-        return "NEGaussianPyramidVertKernel";
-    }
-    /** Default constructor */
-    NEGaussianPyramidVertKernel();
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    NEGaussianPyramidVertKernel(NEGaussianPyramidVertKernel &) = delete;
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    NEGaussianPyramidVertKernel &operator=(NEGaussianPyramidVertKernel &) = delete;
-    /** Allow instances of this class to be moved */
-    NEGaussianPyramidVertKernel(NEGaussianPyramidVertKernel &&) = default;
-    /** Allow instances of this class to be moved */
-    NEGaussianPyramidVertKernel &operator=(NEGaussianPyramidVertKernel &&) = default;
-    /** Default destructor */
-    ~NEGaussianPyramidVertKernel() = default;
-
-    /** Initialise the kernel's source, destination and border mode.
-     *
-     * @param[in]  input  Source tensor. Data type supported: S16.
-     * @param[out] output Destination tensor. Output should have half the input height. Data type supported: U8.
-     */
-    void configure(const ITensor *input, ITensor *output);
-
-    // Inherited methods overridden:
-    void run(const Window &window, const ThreadInfo &info) override;
-    BorderSize border_size() const override;
-
-private:
-    int _t2_load_offset;
-};
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_NEGAUSSIANPYRAMIDKERNEL_H */
diff --git a/src/core/NEON/kernels/NEHOGDescriptorKernel.cpp b/src/core/NEON/kernels/NEHOGDescriptorKernel.cpp
deleted file mode 100644
index 089cd34e0c..0000000000
--- a/src/core/NEON/kernels/NEHOGDescriptorKernel.cpp
+++ /dev/null
@@ -1,806 +0,0 @@
-/*
- * Copyright (c) 2016-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "src/core/NEON/kernels/NEHOGDescriptorKernel.h"
-
-#include "arm_compute/core/Error.h"
-#include "arm_compute/core/HOGInfo.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/IAccessWindow.h"
-#include "arm_compute/core/Validate.h"
-#include "src/core/helpers/AutoConfiguration.h"
-#include "src/core/helpers/WindowHelpers.h"
-
-#include <algorithm>
-#include <arm_neon.h>
-#include <cstring>
-
-using namespace arm_compute;
-
-namespace
-{
-void cell_width_lt8(const int16_t *__restrict mag_row_ptr, const uint8_t *__restrict phase_row_ptr, float *__restrict output_ptr,
-                    size_t mag_stride, size_t phase_stride, size_t cell_width, size_t cell_height, size_t num_bins, float phase_scale)
-{
-    const float32x4_t        scale_f32    = vdupq_n_f32(phase_scale);
-    static const float32x4_t one_f32      = vdupq_n_f32(1.0f);
-    static const float32x4_t zerofive_f32 = vdupq_n_f32(0.5f);
-    static const int32x4_t   zero_s32     = vdupq_n_s32(0);
-    static const int32x4_t   one_s32      = vdupq_n_s32(1);
-    const int32x4_t          num_bins_s32 = vdupq_n_s32(num_bins);
-
-    memset(output_ptr, 0, sizeof(float) * num_bins);
-
-    for(size_t yc = 0; yc < cell_height; ++yc)
-    {
-        int32_t xc = 0;
-
-        for(; xc <= static_cast<int32_t>(cell_width) - 4; xc += 4)
-        {
-            // Load magnitude and phase values
-            const uint8x8_t phase_u8 = vld1_u8(phase_row_ptr + xc + yc * phase_stride);
-            const int16x4_t mag_s16  = vld1_s16(mag_row_ptr + xc + yc * mag_stride);
-
-            // Convert magnitude and phase to float
-            const float32x4_t mag_f32   = vcvtq_f32_s32(vmovl_s16(mag_s16));
-            float32x4_t       phase_f32 = vcvtq_f32_u32(vmovl_u16(vget_low_u16(vmovl_u8(phase_u8))));
-
-            // Scale phase: phase * scale + 0.5f
-            phase_f32 = vmlaq_f32(zerofive_f32, phase_f32, scale_f32);
-
-            // Compute histogram index.
-            int32x4_t hidx_s32 = vcvtq_s32_f32(phase_f32);
-
-            // Compute magnitude weights (w0 and w1)
-            const float32x4_t hidx_f32 = vcvtq_f32_s32(hidx_s32);
-
-            // w1 = phase_f32 - hidx_f32
-            const float32x4_t w1_f32 = vsubq_f32(phase_f32, hidx_f32);
-
-            // w0 = 1.0 - w1
-            const float32x4_t w0_f32 = vsubq_f32(one_f32, w1_f32);
-
-            // Compute contribute for splitting vote
-            const float32x4_t mag_w0_f32 = vmulq_f32(mag_f32, w0_f32);
-            const float32x4_t mag_w1_f32 = vmulq_f32(mag_f32, w1_f32);
-
-            // Weighted vote between 2 bins
-
-            // Check if the histogram index is equal to num_bins. If so, replace the index with 0
-            uint32x4_t mask = vceqq_s32(hidx_s32, num_bins_s32);
-            hidx_s32        = vbslq_s32(mask, zero_s32, hidx_s32);
-
-            // Bin 0
-            *(output_ptr + vgetq_lane_s32(hidx_s32, 0)) += vgetq_lane_f32(mag_w0_f32, 0);
-            *(output_ptr + vgetq_lane_s32(hidx_s32, 1)) += vgetq_lane_f32(mag_w0_f32, 1);
-            *(output_ptr + vgetq_lane_s32(hidx_s32, 2)) += vgetq_lane_f32(mag_w0_f32, 2);
-            *(output_ptr + vgetq_lane_s32(hidx_s32, 3)) += vgetq_lane_f32(mag_w0_f32, 3);
-
-            hidx_s32 = vaddq_s32(hidx_s32, one_s32);
-
-            // Check if the histogram index is equal to num_bins
-            mask     = vceqq_s32(hidx_s32, num_bins_s32);
-            hidx_s32 = vbslq_s32(mask, zero_s32, hidx_s32);
-
-            // Bin1
-            *(output_ptr + vgetq_lane_s32(hidx_s32, 0)) += vgetq_lane_f32(mag_w1_f32, 0);
-            *(output_ptr + vgetq_lane_s32(hidx_s32, 1)) += vgetq_lane_f32(mag_w1_f32, 1);
-            *(output_ptr + vgetq_lane_s32(hidx_s32, 2)) += vgetq_lane_f32(mag_w1_f32, 2);
-            *(output_ptr + vgetq_lane_s32(hidx_s32, 3)) += vgetq_lane_f32(mag_w1_f32, 3);
-        }
-
-        for(; xc < static_cast<int32_t>(cell_width); ++xc)
-        {
-            const float phase_value = *(phase_row_ptr + xc + yc * phase_stride) * phase_scale + 0.5f;
-            const float mag_value   = *(mag_row_ptr + xc + yc * mag_stride);
-
-            const float w1 = phase_value - std::floor(phase_value);
-
-            // The quantised phase is the histogram index [0, num_bins - 1] - Round
-            // Check limit of histogram index. If hidx == num_bins, hidx = 0
-            const auto hidx = static_cast<size_t>(phase_value) % num_bins;
-
-            // Weighted vote between 2 bins
-            *(output_ptr + hidx) += mag_value * (1.0f - w1);
-            *(output_ptr + ((hidx + 1) % (num_bins))) += mag_value * w1;
-        }
-    }
-}
-
-void cell_width_ge8(const int16_t *__restrict mag_row_ptr, const uint8_t *__restrict phase_row_ptr, float *__restrict output_ptr, size_t mag_stride, size_t phase_stride, size_t cell_width,
-                    size_t cell_height, size_t num_bins, float phase_scale)
-{
-    const float32x4_t        scale_f32    = vdupq_n_f32(phase_scale);
-    static const float32x4_t one_f32      = vdupq_n_f32(1.0f);
-    static const float32x4_t zerofive_f32 = vdupq_n_f32(0.5f);
-    static const int32x4_t   zero_s32     = vdupq_n_s32(0);
-    static const int32x4_t   one_s32      = vdupq_n_s32(1);
-    const int32x4_t          num_bins_s32 = vdupq_n_s32(num_bins);
-
-    memset(output_ptr, 0, sizeof(float) * num_bins);
-
-    for(size_t yc = 0; yc < cell_height; ++yc)
-    {
-        int32_t xc = 0;
-
-        for(; xc <= static_cast<int32_t>(cell_width) - 8; xc += 8)
-        {
-            // Load magnitude and phase values
-            const uint8x8_t phase_u8 = vld1_u8(phase_row_ptr + xc + yc * phase_stride);
-            const int16x8_t mag_s16  = vld1q_s16(mag_row_ptr + xc + yc * mag_stride);
-
-            // Convert phase to U16
-            const uint16x8_t phase_u16 = vmovl_u8(phase_u8);
-
-            // Convert magnitude to float32
-            const float32x4x2_t mag_f32 =
-            {
-                {
-                    vcvtq_f32_s32(vmovl_s16(vget_low_s16(mag_s16))),
-                    vcvtq_f32_s32(vmovl_s16(vget_high_s16(mag_s16)))
-                }
-            };
-
-            // Convert phase to float32
-            float32x4x2_t phase_f32 =
-            {
-                {
-                    vcvtq_f32_u32(vmovl_u16(vget_low_u16(phase_u16))),
-                    vcvtq_f32_u32(vmovl_u16(vget_high_u16(phase_u16)))
-                }
-            };
-
-            // Scale phase: phase * scale + 0.5f
-            phase_f32.val[0] = vmlaq_f32(zerofive_f32, phase_f32.val[0], scale_f32);
-            phase_f32.val[1] = vmlaq_f32(zerofive_f32, phase_f32.val[1], scale_f32);
-
-            // Compute histogram index.
-            int32x4x2_t hidx_s32 =
-            {
-                {
-                    vcvtq_s32_f32(phase_f32.val[0]),
-                    vcvtq_s32_f32(phase_f32.val[1])
-                }
-            };
-
-            // Compute magnitude weights (w0 and w1)
-            const float32x4x2_t hidx_f32 =
-            {
-                {
-                    vcvtq_f32_s32(hidx_s32.val[0]),
-                    vcvtq_f32_s32(hidx_s32.val[1])
-                }
-            };
-
-            float32x4x2_t w1_f32 =
-            {
-                {
-                    vsubq_f32(phase_f32.val[0], hidx_f32.val[0]),
-                    vsubq_f32(phase_f32.val[1], hidx_f32.val[1])
-                }
-            };
-
-            float32x4x2_t w0_f32 =
-            {
-                {
-                    vsubq_f32(one_f32, w1_f32.val[0]),
-                    vsubq_f32(one_f32, w1_f32.val[1])
-                }
-            };
-
-            // Compute contribute for splitting vote
-            const float32x4x2_t mag_w0_f32 =
-            {
-                {
-                    vmulq_f32(mag_f32.val[0], w0_f32.val[0]),
-                    vmulq_f32(mag_f32.val[1], w0_f32.val[1])
-                }
-            };
-
-            const float32x4x2_t mag_w1_f32 =
-            {
-                {
-                    vmulq_f32(mag_f32.val[0], w1_f32.val[0]),
-                    vmulq_f32(mag_f32.val[1], w1_f32.val[1])
-                }
-            };
-
-            // Weighted vote between 2 bins
-
-            // Check if the histogram index is equal to num_bins
-            uint32x4x2_t mask =
-            {
-                {
-                    vceqq_s32(hidx_s32.val[0], num_bins_s32),
-                    vceqq_s32(hidx_s32.val[1], num_bins_s32)
-                }
-            };
-
-            hidx_s32.val[0] = vbslq_s32(mask.val[0], zero_s32, hidx_s32.val[0]);
-            hidx_s32.val[1] = vbslq_s32(mask.val[1], zero_s32, hidx_s32.val[1]);
-
-            // First bin - Low
-            *(output_ptr + vgetq_lane_s32(hidx_s32.val[0], 0)) += vgetq_lane_f32(mag_w0_f32.val[0], 0);
-            *(output_ptr + vgetq_lane_s32(hidx_s32.val[0], 1)) += vgetq_lane_f32(mag_w0_f32.val[0], 1);
-            *(output_ptr + vgetq_lane_s32(hidx_s32.val[0], 2)) += vgetq_lane_f32(mag_w0_f32.val[0], 2);
-            *(output_ptr + vgetq_lane_s32(hidx_s32.val[0], 3)) += vgetq_lane_f32(mag_w0_f32.val[0], 3);
-
-            // First bin - high
-            *(output_ptr + vgetq_lane_s32(hidx_s32.val[1], 0)) += vgetq_lane_f32(mag_w0_f32.val[1], 0);
-            *(output_ptr + vgetq_lane_s32(hidx_s32.val[1], 1)) += vgetq_lane_f32(mag_w0_f32.val[1], 1);
-            *(output_ptr + vgetq_lane_s32(hidx_s32.val[1], 2)) += vgetq_lane_f32(mag_w0_f32.val[1], 2);
-            *(output_ptr + vgetq_lane_s32(hidx_s32.val[1], 3)) += vgetq_lane_f32(mag_w0_f32.val[1], 3);
-
-            hidx_s32.val[0] = vaddq_s32(hidx_s32.val[0], one_s32);
-            hidx_s32.val[1] = vaddq_s32(hidx_s32.val[1], one_s32);
-
-            // Check if the histogram index is equal to num_bins
-            mask.val[0] = vceqq_s32(hidx_s32.val[0], num_bins_s32);
-            mask.val[1] = vceqq_s32(hidx_s32.val[1], num_bins_s32);
-
-            hidx_s32.val[0] = vbslq_s32(mask.val[0], zero_s32, hidx_s32.val[0]);
-            hidx_s32.val[1] = vbslq_s32(mask.val[1], zero_s32, hidx_s32.val[1]);
-
-            // Second bin - Low
-            *(output_ptr + vgetq_lane_s32(hidx_s32.val[0], 0)) += vgetq_lane_f32(mag_w1_f32.val[0], 0);
-            *(output_ptr + vgetq_lane_s32(hidx_s32.val[0], 1)) += vgetq_lane_f32(mag_w1_f32.val[0], 1);
-            *(output_ptr + vgetq_lane_s32(hidx_s32.val[0], 2)) += vgetq_lane_f32(mag_w1_f32.val[0], 2);
-            *(output_ptr + vgetq_lane_s32(hidx_s32.val[0], 3)) += vgetq_lane_f32(mag_w1_f32.val[0], 3);
-
-            // Second bin - high
-            *(output_ptr + vgetq_lane_s32(hidx_s32.val[1], 0)) += vgetq_lane_f32(mag_w1_f32.val[1], 0);
-            *(output_ptr + vgetq_lane_s32(hidx_s32.val[1], 1)) += vgetq_lane_f32(mag_w1_f32.val[1], 1);
-            *(output_ptr + vgetq_lane_s32(hidx_s32.val[1], 2)) += vgetq_lane_f32(mag_w1_f32.val[1], 2);
-            *(output_ptr + vgetq_lane_s32(hidx_s32.val[1], 3)) += vgetq_lane_f32(mag_w1_f32.val[1], 3);
-        }
-
-        for(; xc < static_cast<int32_t>(cell_width); xc++)
-        {
-            const float phase_value = *(phase_row_ptr + xc + yc * phase_stride) * phase_scale + 0.5f;
-            const float mag_value   = *(mag_row_ptr + xc + yc * mag_stride);
-
-            const float w1 = phase_value - std::floor(phase_value);
-
-            // The quantised phase is the histogram index [0, num_bins - 1] - Round
-            // Check limit of histogram index. If hidx == num_bins, hidx = 0
-            const size_t hidx = static_cast<size_t>(phase_value) % num_bins;
-
-            // Weighted vote between 2 bins
-            *(output_ptr + hidx) += mag_value * (1.0f - w1);
-            *(output_ptr + ((hidx + 1) % (num_bins))) += mag_value * w1;
-        }
-    }
-}
-
-void l2_norm(const float *__restrict input_row_ptr, float *__restrict output_ptr, size_t input_stride,
-             size_t num_cells_per_block_height, size_t num_bins_block_x, size_t num_bins_block, float l2_hyst_threshold)
-{
-    ARM_COMPUTE_UNUSED(l2_hyst_threshold);
-
-    float       sum     = 0.0f;
-    float32x4_t sum_f32 = vdupq_n_f32(0.0f);
-
-    // Compute L2-Norm
-    for(size_t yc = 0; yc < num_cells_per_block_height; ++yc)
-    {
-        const float *const hist_ptr = input_row_ptr + yc * input_stride;
-
-        int32_t xc = 0;
-
-        for(; xc <= static_cast<int32_t>(num_bins_block_x) - 16; xc += 16)
-        {
-            const float32x4x4_t input_value =
-            {
-                {
-                    vld1q_f32(hist_ptr + xc + 0),
-                    vld1q_f32(hist_ptr + xc + 4),
-                    vld1q_f32(hist_ptr + xc + 8),
-                    vld1q_f32(hist_ptr + xc + 12)
-                }
-            };
-
-            // Compute input_value^2
-            sum_f32 = vmlaq_f32(sum_f32, input_value.val[0], input_value.val[0]);
-            sum_f32 = vmlaq_f32(sum_f32, input_value.val[1], input_value.val[1]);
-            sum_f32 = vmlaq_f32(sum_f32, input_value.val[2], input_value.val[2]);
-            sum_f32 = vmlaq_f32(sum_f32, input_value.val[3], input_value.val[3]);
-
-            vst1q_f32(&output_ptr[xc + 0 + yc * num_bins_block_x], input_value.val[0]);
-            vst1q_f32(&output_ptr[xc + 4 + yc * num_bins_block_x], input_value.val[1]);
-            vst1q_f32(&output_ptr[xc + 8 + yc * num_bins_block_x], input_value.val[2]);
-            vst1q_f32(&output_ptr[xc + 12 + yc * num_bins_block_x], input_value.val[3]);
-        }
-
-        // Compute left over
-        for(; xc < static_cast<int32_t>(num_bins_block_x); xc++)
-        {
-            const float input_value = hist_ptr[xc];
-
-            sum += input_value * input_value;
-
-            output_ptr[xc + yc * num_bins_block_x] = input_value;
-        }
-    }
-
-    sum += vgetq_lane_f32(sum_f32, 0);
-    sum += vgetq_lane_f32(sum_f32, 1);
-    sum += vgetq_lane_f32(sum_f32, 2);
-    sum += vgetq_lane_f32(sum_f32, 3);
-
-    const float       scale     = 1.0f / (std::sqrt(sum) + num_bins_block * 0.1f);
-    const float32x4_t scale_f32 = vdupq_n_f32(scale);
-
-    int32_t i = 0;
-
-    for(; i <= static_cast<int32_t>(num_bins_block) - 16; i += 16)
-    {
-        float32x4x4_t input_value =
-        {
-            {
-                vld1q_f32(&output_ptr[i + 0]),
-                vld1q_f32(&output_ptr[i + 4]),
-                vld1q_f32(&output_ptr[i + 8]),
-                vld1q_f32(&output_ptr[i + 12])
-            }
-        };
-
-        // Scale input_value
-        input_value.val[0] = vmulq_f32(input_value.val[0], scale_f32);
-        input_value.val[1] = vmulq_f32(input_value.val[1], scale_f32);
-        input_value.val[2] = vmulq_f32(input_value.val[2], scale_f32);
-        input_value.val[3] = vmulq_f32(input_value.val[3], scale_f32);
-
-        vst1q_f32(&output_ptr[i + 0], input_value.val[0]);
-        vst1q_f32(&output_ptr[i + 4], input_value.val[1]);
-        vst1q_f32(&output_ptr[i + 8], input_value.val[2]);
-        vst1q_f32(&output_ptr[i + 12], input_value.val[3]);
-    }
-
-    for(; i < static_cast<int32_t>(num_bins_block); ++i)
-    {
-        output_ptr[i] *= scale;
-    }
-}
-
-void l2hys_norm(const float *__restrict input_row_ptr, float *__restrict output_ptr, size_t input_stride, size_t num_cells_per_block_height, size_t num_bins_block_x, size_t num_bins_block,
-                float l2_hyst_threshold)
-{
-    float       sum     = 0.0f;
-    float32x4_t sum_f32 = vdupq_n_f32(0.0f);
-
-    // Compute L2-Hys
-    for(size_t yc = 0; yc < num_cells_per_block_height; ++yc)
-    {
-        const float *const hist_ptr = input_row_ptr + yc * input_stride;
-
-        int32_t xc = 0;
-
-        for(; xc <= static_cast<int32_t>(num_bins_block_x) - 16; xc += 16)
-        {
-            const float32x4x4_t input_value =
-            {
-                {
-                    vld1q_f32(hist_ptr + xc + 0),
-                    vld1q_f32(hist_ptr + xc + 4),
-                    vld1q_f32(hist_ptr + xc + 8),
-                    vld1q_f32(hist_ptr + xc + 12)
-                }
-            };
-
-            // Compute input_value^2
-            sum_f32 = vmlaq_f32(sum_f32, input_value.val[0], input_value.val[0]);
-            sum_f32 = vmlaq_f32(sum_f32, input_value.val[1], input_value.val[1]);
-            sum_f32 = vmlaq_f32(sum_f32, input_value.val[2], input_value.val[2]);
-            sum_f32 = vmlaq_f32(sum_f32, input_value.val[3], input_value.val[3]);
-
-            vst1q_f32(&output_ptr[xc + 0 + yc * num_bins_block_x], input_value.val[0]);
-            vst1q_f32(&output_ptr[xc + 4 + yc * num_bins_block_x], input_value.val[1]);
-            vst1q_f32(&output_ptr[xc + 8 + yc * num_bins_block_x], input_value.val[2]);
-            vst1q_f32(&output_ptr[xc + 12 + yc * num_bins_block_x], input_value.val[3]);
-        }
-
-        // Compute left over
-        for(; xc < static_cast<int32_t>(num_bins_block_x); ++xc)
-        {
-            const float input_value = hist_ptr[xc];
-
-            sum += input_value * input_value;
-
-            output_ptr[xc + yc * num_bins_block_x] = input_value;
-        }
-    }
-
-    sum += vgetq_lane_f32(sum_f32, 0);
-    sum += vgetq_lane_f32(sum_f32, 1);
-    sum += vgetq_lane_f32(sum_f32, 2);
-    sum += vgetq_lane_f32(sum_f32, 3);
-
-    float             scale                 = 1.0f / (std::sqrt(sum) + num_bins_block * 0.1f);
-    float32x4_t       scale_f32             = vdupq_n_f32(scale);
-    const float32x4_t l2_hyst_threshold_f32 = vdupq_n_f32(l2_hyst_threshold);
-
-    // Reset sum
-    sum_f32 = vdupq_n_f32(0.0f);
-    sum     = 0.0f;
-
-    int32_t i = 0;
-
-    for(; i <= static_cast<int32_t>(num_bins_block) - 16; i += 16)
-    {
-        float32x4x4_t input_value =
-        {
-            {
-                vld1q_f32(&output_ptr[i + 0]),
-                vld1q_f32(&output_ptr[i + 4]),
-                vld1q_f32(&output_ptr[i + 8]),
-                vld1q_f32(&output_ptr[i + 12])
-            }
-        };
-
-        // Scale input_value
-        input_value.val[0] = vmulq_f32(input_value.val[0], scale_f32);
-        input_value.val[1] = vmulq_f32(input_value.val[1], scale_f32);
-        input_value.val[2] = vmulq_f32(input_value.val[2], scale_f32);
-        input_value.val[3] = vmulq_f32(input_value.val[3], scale_f32);
-
-        // Clip input_value if over _threshold_l2hys
-        input_value.val[0] = vminq_f32(input_value.val[0], l2_hyst_threshold_f32);
-        input_value.val[1] = vminq_f32(input_value.val[1], l2_hyst_threshold_f32);
-        input_value.val[2] = vminq_f32(input_value.val[2], l2_hyst_threshold_f32);
-        input_value.val[3] = vminq_f32(input_value.val[3], l2_hyst_threshold_f32);
-
-        // Compute input_value^2
-        sum_f32 = vmlaq_f32(sum_f32, input_value.val[0], input_value.val[0]);
-        sum_f32 = vmlaq_f32(sum_f32, input_value.val[1], input_value.val[1]);
-        sum_f32 = vmlaq_f32(sum_f32, input_value.val[2], input_value.val[2]);
-        sum_f32 = vmlaq_f32(sum_f32, input_value.val[3], input_value.val[3]);
-
-        vst1q_f32(&output_ptr[i + 0], input_value.val[0]);
-        vst1q_f32(&output_ptr[i + 4], input_value.val[1]);
-        vst1q_f32(&output_ptr[i + 8], input_value.val[2]);
-        vst1q_f32(&output_ptr[i + 12], input_value.val[3]);
-    }
-
-    sum += vgetq_lane_f32(sum_f32, 0);
-    sum += vgetq_lane_f32(sum_f32, 1);
-    sum += vgetq_lane_f32(sum_f32, 2);
-    sum += vgetq_lane_f32(sum_f32, 3);
-
-    for(; i < static_cast<int32_t>(num_bins_block); ++i)
-    {
-        float input_value = output_ptr[i] * scale;
-
-        // Clip scaled input_value if over _threshold_L2hys
-        input_value = std::min(input_value, l2_hyst_threshold);
-
-        sum += input_value * input_value;
-
-        output_ptr[i] = input_value;
-    }
-
-    // We use the same constants of OpenCV
-    scale     = 1.0f / (std::sqrt(sum) + 1e-3f);
-    scale_f32 = vdupq_n_f32(scale);
-
-    // Rescale
-    i = 0;
-
-    for(; i <= static_cast<int32_t>(num_bins_block) - 16; i += 16)
-    {
-        float32x4x4_t input_value =
-        {
-            {
-                vld1q_f32(&output_ptr[i + 0]),
-                vld1q_f32(&output_ptr[i + 4]),
-                vld1q_f32(&output_ptr[i + 8]),
-                vld1q_f32(&output_ptr[i + 12])
-            }
-        };
-
-        // Scale input_value
-        input_value.val[0] = vmulq_f32(input_value.val[0], scale_f32);
-        input_value.val[1] = vmulq_f32(input_value.val[1], scale_f32);
-        input_value.val[2] = vmulq_f32(input_value.val[2], scale_f32);
-        input_value.val[3] = vmulq_f32(input_value.val[3], scale_f32);
-
-        vst1q_f32(&output_ptr[i + 0], input_value.val[0]);
-        vst1q_f32(&output_ptr[i + 4], input_value.val[1]);
-        vst1q_f32(&output_ptr[i + 8], input_value.val[2]);
-        vst1q_f32(&output_ptr[i + 12], input_value.val[3]);
-    }
-
-    for(; i < static_cast<int32_t>(num_bins_block); ++i)
-    {
-        // Store result
-        output_ptr[i] *= scale;
-    }
-}
-
-void l1_norm(const float *__restrict input_row_ptr, float *__restrict output_ptr, size_t input_stride, size_t num_cells_per_block_height, size_t num_bins_block_x, size_t num_bins_block,
-             float l2_hyst_threshold)
-{
-    ARM_COMPUTE_UNUSED(l2_hyst_threshold);
-
-    float       sum     = 0.0f;
-    float32x4_t sum_f32 = vdupq_n_f32(0.0f);
-
-    // Compute L1-Norm
-    for(size_t yc = 0; yc < num_cells_per_block_height; ++yc)
-    {
-        const float *const hist_ptr = input_row_ptr + yc * input_stride;
-
-        int32_t xc = 0;
-
-        for(; xc <= static_cast<int32_t>(num_bins_block_x) - 16; xc += 16)
-        {
-            const float32x4x4_t input_value =
-            {
-                {
-                    vld1q_f32(hist_ptr + xc + 0),
-                    vld1q_f32(hist_ptr + xc + 4),
-                    vld1q_f32(hist_ptr + xc + 8),
-                    vld1q_f32(hist_ptr + xc + 12)
-                }
-            };
-
-            // Compute |input_value|
-            sum_f32 += vabsq_f32(input_value.val[0]);
-            sum_f32 += vabsq_f32(input_value.val[1]);
-            sum_f32 += vabsq_f32(input_value.val[2]);
-            sum_f32 += vabsq_f32(input_value.val[3]);
-
-            vst1q_f32(&output_ptr[xc + 0 + yc * num_bins_block_x], input_value.val[0]);
-            vst1q_f32(&output_ptr[xc + 4 + yc * num_bins_block_x], input_value.val[1]);
-            vst1q_f32(&output_ptr[xc + 8 + yc * num_bins_block_x], input_value.val[2]);
-            vst1q_f32(&output_ptr[xc + 12 + yc * num_bins_block_x], input_value.val[3]);
-        }
-
-        for(; xc < static_cast<int32_t>(num_bins_block_x); xc++)
-        {
-            const float input_value = hist_ptr[xc];
-
-            sum += std::abs(input_value);
-
-            output_ptr[xc + yc * num_bins_block_x] = input_value;
-        }
-    }
-
-    sum += vgetq_lane_f32(sum_f32, 0);
-    sum += vgetq_lane_f32(sum_f32, 1);
-    sum += vgetq_lane_f32(sum_f32, 2);
-    sum += vgetq_lane_f32(sum_f32, 3);
-
-    const float       scale     = 1.0f / (std::sqrt(sum) + num_bins_block * 0.1f);
-    const float32x4_t scale_f32 = vdupq_n_f32(scale);
-
-    int32_t i = 0;
-
-    for(; i <= static_cast<int32_t>(num_bins_block) - 16; i += 16)
-    {
-        float32x4x4_t input_value =
-        {
-            {
-                vld1q_f32(&output_ptr[i + 0]),
-                vld1q_f32(&output_ptr[i + 4]),
-                vld1q_f32(&output_ptr[i + 8]),
-                vld1q_f32(&output_ptr[i + 12])
-            }
-        };
-
-        // Scale input_value
-        input_value.val[0] = vmulq_f32(input_value.val[0], scale_f32);
-        input_value.val[1] = vmulq_f32(input_value.val[1], scale_f32);
-        input_value.val[2] = vmulq_f32(input_value.val[2], scale_f32);
-        input_value.val[3] = vmulq_f32(input_value.val[3], scale_f32);
-
-        vst1q_f32(&output_ptr[i + 0], input_value.val[0]);
-        vst1q_f32(&output_ptr[i + 4], input_value.val[1]);
-        vst1q_f32(&output_ptr[i + 8], input_value.val[2]);
-        vst1q_f32(&output_ptr[i + 12], input_value.val[3]);
-    }
-
-    for(; i < static_cast<int32_t>(num_bins_block); ++i)
-    {
-        output_ptr[i] *= scale;
-    }
-}
-} // namespace
-
-NEHOGOrientationBinningKernel::NEHOGOrientationBinningKernel()
-    : _func(nullptr), _input_magnitude(nullptr), _input_phase(nullptr), _output(nullptr), _cell_width(0), _cell_height(0), _num_bins(0), _phase_scale(0)
-{
-}
-
-void NEHOGOrientationBinningKernel::configure(const ITensor *input_magnitude, const ITensor *input_phase, ITensor *output, const HOGInfo *hog_info)
-{
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input_magnitude, 1, DataType::S16);
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input_phase, 1, DataType::U8);
-    ARM_COMPUTE_ERROR_ON(hog_info == nullptr);
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, hog_info->num_bins(), DataType::F32);
-    ARM_COMPUTE_ERROR_ON(input_magnitude->info()->dimension(Window::DimX) != input_phase->info()->dimension(Window::DimX));
-    ARM_COMPUTE_ERROR_ON(input_magnitude->info()->dimension(Window::DimY) != input_phase->info()->dimension(Window::DimY));
-
-    _input_magnitude = input_magnitude;
-    _input_phase     = input_phase;
-    _output          = output;
-    _cell_width      = hog_info->cell_size().width;
-    _cell_height     = hog_info->cell_size().height;
-    _num_bins        = hog_info->num_bins();
-    _phase_scale     = (PhaseType::SIGNED == hog_info->phase_type() ? _num_bins / 360.0f : _num_bins / 180.0f);
-    _phase_scale *= (PhaseType::SIGNED == hog_info->phase_type() ? 360.0f / 255.0f : 1.0f);
-
-    if(_cell_width < 8)
-    {
-        _func = &cell_width_lt8;
-    }
-    else
-    {
-        _func = &cell_width_ge8;
-    }
-
-    constexpr unsigned int num_elems_processed_per_iteration = 1;
-    const unsigned int     num_elems_read_per_iteration      = 1;
-    const unsigned int     num_rows_read_per_iteration       = _cell_height;
-    const unsigned int     num_elems_written_per_iteration   = 1;
-
-    // Configure kernel window
-    Window                 win = calculate_max_window(*output->info(), Steps(num_elems_processed_per_iteration));
-    AccessWindowHorizontal output_access(output->info(), 0, num_elems_written_per_iteration);
-
-    update_window_and_padding(win,
-                              AccessWindowRectangle(input_magnitude->info(), 0, 0, num_elems_read_per_iteration, num_rows_read_per_iteration),
-                              AccessWindowRectangle(input_phase->info(), 0, 0, num_elems_read_per_iteration, num_rows_read_per_iteration),
-                              output_access);
-
-    output->info()->set_valid_region(ValidRegion(Coordinates(), output->info()->tensor_shape()));
-
-    INEKernel::configure(win);
-}
-
-void NEHOGOrientationBinningKernel::run(const Window &window, const ThreadInfo &info)
-{
-    ARM_COMPUTE_UNUSED(info);
-    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
-    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
-    ARM_COMPUTE_ERROR_ON(_func == nullptr);
-
-    const size_t mag_stride   = _input_magnitude->info()->strides_in_bytes()[Window::DimY] / pixel_size_from_format(_input_magnitude->info()->format());
-    const size_t phase_stride = _input_phase->info()->strides_in_bytes()[Window::DimY] / pixel_size_from_format(_input_phase->info()->format());
-
-    Window win_mag(window);
-    win_mag.set(Window::DimX, Window::Dimension(window.x().start() * _cell_width, window.x().start() * _cell_width, _cell_width));
-    win_mag.set(Window::DimY, Window::Dimension(window.y().start() * _cell_height, window.y().start() * _cell_height, _cell_height));
-
-    Window win_phase(win_mag);
-
-    Iterator mag(_input_magnitude, win_mag);
-    Iterator phase(_input_phase, win_phase);
-    Iterator out(_output, window);
-
-    execute_window_loop(window, [&](const Coordinates &)
-    {
-        const auto mag_row_ptr   = reinterpret_cast<const int16_t *>(mag.ptr());
-        const auto phase_row_ptr = reinterpret_cast<const uint8_t *>(phase.ptr());
-        const auto out_row_ptr   = reinterpret_cast<float *>(out.ptr());
-
-        (*_func)(mag_row_ptr, phase_row_ptr, out_row_ptr, mag_stride, phase_stride, _cell_width, _cell_height, _num_bins, _phase_scale);
-    },
-    mag, phase, out);
-}
-
-NEHOGBlockNormalizationKernel::NEHOGBlockNormalizationKernel()
-    : _func(nullptr), _input(nullptr), _output(nullptr), _num_cells_per_block(), _num_cells_per_block_stride(), _num_bins(0), _l2_hyst_threshold(0.0f)
-{
-}
-
-void NEHOGBlockNormalizationKernel::configure(const ITensor *input, ITensor *output, const HOGInfo *hog_info)
-{
-    ARM_COMPUTE_ERROR_ON(hog_info == nullptr);
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, hog_info->num_bins(), DataType::F32);
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_NOT_IN(output, DataType::F32);
-
-    // Number of cells per block
-    const Size2D num_cells_per_block(hog_info->block_size().width / hog_info->cell_size().width,
-                                     hog_info->block_size().height / hog_info->cell_size().height);
-
-    // Number of cells per block stride
-    const Size2D num_cells_per_block_stride(hog_info->block_stride().width / hog_info->cell_size().width,
-                                            hog_info->block_stride().height / hog_info->cell_size().height);
-
-    _input                      = input;
-    _output                     = output;
-    _l2_hyst_threshold          = hog_info->l2_hyst_threshold();
-    _num_cells_per_block        = num_cells_per_block;
-    _num_cells_per_block_stride = num_cells_per_block_stride;
-    _num_bins                   = hog_info->num_bins();
-
-    ARM_COMPUTE_ERROR_ON((output->info()->num_channels() != (_num_bins * num_cells_per_block.width * num_cells_per_block.height)));
-
-    switch(hog_info->normalization_type())
-    {
-        case HOGNormType::L2_NORM:
-            _func = &l2_norm;
-            break;
-        case HOGNormType::L2HYS_NORM:
-            _func = &l2hys_norm;
-            break;
-        case HOGNormType::L1_NORM:
-            _func = &l1_norm;
-            break;
-        default:
-            ARM_COMPUTE_ERROR_ON("Normalisation type not supported");
-            break;
-    }
-
-    constexpr unsigned int num_elems_processed_per_iteration = 1;
-    const unsigned int     num_elems_read_per_iteration      = 1;
-    const unsigned int     num_rows_read_per_iteration       = _num_cells_per_block.height;
-    const unsigned int     num_elems_written_per_iteration   = 1;
-    const unsigned int     num_rows_written_per_iteration    = _num_cells_per_block.height;
-
-    // Configure kernel window
-    Window                win = calculate_max_window(*output->info(), Steps(num_elems_processed_per_iteration));
-    AccessWindowRectangle output_access(output->info(), 0, 0, num_elems_written_per_iteration, num_rows_written_per_iteration);
-
-    update_window_and_padding(win,
-                              AccessWindowRectangle(input->info(), 0, 0, num_elems_read_per_iteration, num_rows_read_per_iteration),
-                              output_access);
-
-    output_access.set_valid_region(win, ValidRegion(Coordinates(), output->info()->tensor_shape()));
-
-    INEKernel::configure(win);
-}
-
-void NEHOGBlockNormalizationKernel::run(const Window &window, const ThreadInfo &info)
-{
-    ARM_COMPUTE_UNUSED(info);
-    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
-    ARM_COMPUTE_ERROR_ON(_func == nullptr);
-    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
-
-    // Get number of bins per block
-    const size_t num_bins_per_block = _output->info()->num_channels();
-
-    // Number of bins on the same row of the block
-    const int32_t num_bins_per_block_x = _num_cells_per_block.width * _num_bins;
-
-    const size_t input_stride = _input->info()->strides_in_bytes()[Window::DimY] / data_size_from_type(_input->info()->data_type());
-
-    Window win_in(window);
-    win_in.set_dimension_step(Window::DimX, _num_cells_per_block_stride.width);
-    win_in.set(Window::DimY, Window::Dimension(0, 0, 0));
-
-    Iterator in(_input, win_in);
-    Iterator out(_output, window);
-
-    // Normalises blocks
-    execute_window_loop(window, [&](const Coordinates & id)
-    {
-        const auto input_row_ptr = reinterpret_cast<const float *>(in.ptr() + id.y() * _num_cells_per_block_stride.height * _input->info()->strides_in_bytes()[Window::DimY]);
-        const auto out_row_ptr   = reinterpret_cast<float *>(out.ptr());
-
-        // Execute normalization function
-        (*_func)(input_row_ptr, out_row_ptr, input_stride, _num_cells_per_block.height, num_bins_per_block_x, num_bins_per_block, _l2_hyst_threshold);
-    },
-    in, out);
-}
diff --git a/src/core/NEON/kernels/NEHOGDescriptorKernel.h b/src/core/NEON/kernels/NEHOGDescriptorKernel.h
deleted file mode 100644
index e9cd47b099..0000000000
--- a/src/core/NEON/kernels/NEHOGDescriptorKernel.h
+++ /dev/null
@@ -1,149 +0,0 @@
-/*
- * Copyright (c) 2016-2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_NEHOGDESCRIPTORKERNEL_H
-#define ARM_COMPUTE_NEHOGDESCRIPTORKERNEL_H
-
-#include "arm_compute/core/IHOG.h"
-#include "arm_compute/core/Size2D.h"
-#include "src/core/NEON/INEKernel.h"
-
-namespace arm_compute
-{
-class ITensor;
-
-/** Neon kernel to perform HOG Orientation Binning */
-class NEHOGOrientationBinningKernel : public INEKernel
-{
-public:
-    const char *name() const override
-    {
-        return "NEHOGOrientationBinningKernel";
-    }
-    /** Default constructor */
-    NEHOGOrientationBinningKernel();
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    NEHOGOrientationBinningKernel(const NEHOGOrientationBinningKernel &) = delete;
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    NEHOGOrientationBinningKernel &operator=(const NEHOGOrientationBinningKernel &) = delete;
-    /** Allow instances of this class to be moved */
-    NEHOGOrientationBinningKernel(NEHOGOrientationBinningKernel &&) = default;
-    /** Allow instances of this class to be moved */
-    NEHOGOrientationBinningKernel &operator=(NEHOGOrientationBinningKernel &&) = default;
-    /** Default destructor */
-    ~NEHOGOrientationBinningKernel() = default;
-
-    /**  Initialise the kernel's inputs, output and HOG's metadata
-     *
-     * @param[in]  input_magnitude Input tensor which stores the magnitude of the gradient for each pixel. Data type supported: S16.
-     * @param[in]  input_phase     Input tensor which stores the phase of the gradient for each pixel. Data type supported: U8
-     * @param[out] output          Output tensor which stores the local HOG for each cell. Data type supported: F32. Number of channels supported: equal to the number of histogram bins per cell
-     * @param[in]  hog_info        HOG's metadata
-     */
-    void configure(const ITensor *input_magnitude, const ITensor *input_phase, ITensor *output, const HOGInfo *hog_info);
-
-    // Inherited methods overridden:
-    void run(const Window &window, const ThreadInfo &info) override;
-
-private:
-    /** Common signature for all the specialised block normalization functions
-     *
-     * @param[in]  mag_row_ptr   Pointer to the first row of the cell in the magnitude tensor
-     * @param[in]  phase_row_ptr Pointer to the first row of the cell in the phase tensor
-     * @param[out] output_ptr    Pointer to the output cell of hog space tensor
-     * @param[in]  mag_stride    Stride of the magnitude tensor
-     * @param[in]  phase_stride  Stride of the phase tensor
-     * @param[in]  cell_width    Width of the cell
-     * @param[in]  cell_height   Height of the cell
-     * @param[in]  num_bins      Number of bins for each cell
-     * @param[in]  phase_scale   Scale factor to apply to the phase in order to calculate the histogram index
-     */
-    using OrientBinFunc = void(const int16_t *__restrict mag_row_ptr, const uint8_t *__restrict phase_row_ptr, float *__restrict output_ptr, size_t mag_stride, size_t phase_stride, size_t cell_width,
-                               size_t cell_height, size_t num_bins, float phase_scale);
-    /** Orientation binning function to use for the particular cell width passed to configure() */
-    OrientBinFunc *_func;
-    const ITensor *_input_magnitude;
-    const ITensor *_input_phase;
-    ITensor       *_output;
-    size_t         _cell_width;
-    size_t         _cell_height;
-    size_t         _num_bins;
-    float          _phase_scale;
-};
-
-/** Neon kernel to perform HOG block normalization */
-class NEHOGBlockNormalizationKernel : public INEKernel
-{
-public:
-    const char *name() const override
-    {
-        return "NEHOGBlockNormalizationKernel";
-    }
-    /** Default constructor */
-    NEHOGBlockNormalizationKernel();
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    NEHOGBlockNormalizationKernel(const NEHOGBlockNormalizationKernel &) = delete;
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    NEHOGBlockNormalizationKernel &operator=(const NEHOGBlockNormalizationKernel &) = delete;
-    /** Allow instances of this class to be moved */
-    NEHOGBlockNormalizationKernel(NEHOGBlockNormalizationKernel &&) = default;
-    /** Allow instances of this class to be moved */
-    NEHOGBlockNormalizationKernel &operator=(NEHOGBlockNormalizationKernel &&) = default;
-    /** Default destructor */
-    ~NEHOGBlockNormalizationKernel() = default;
-
-    /** Initialise the kernel's input, output and HOG's metadata
-     *
-     * @param[in]  input    Input tensor which stores the local HOG for each cell. Data type supported: F32. Number of channels supported: equal to the number of histogram bins per cell
-     * @param[out] output   Output tensor which stores the normalised blocks. Data type supported: F32. Number of channels supported: equal to the number of histogram bins per block
-     * @param[in]  hog_info HOG's metadata
-     */
-    void configure(const ITensor *input, ITensor *output, const HOGInfo *hog_info);
-
-    // Inherited methods overridden:
-    void run(const Window &window, const ThreadInfo &info) override;
-
-private:
-    /** Common signature for all the specialised block normalization functions
-     *
-     * @param[in]  input_row_ptr              Pointer to the first row of the block in the input hog space tensor
-     * @param[out] output_ptr                 Pointer to the output block of the hog normalized space
-     * @param[in]  input_stride               Stride of the input hog space tensor
-     * @param[in]  num_cells_per_block_height Number of cells per block along the Y direction
-     * @param[in]  num_bins_block_x           Number of bins per block along the X direction
-     * @param[in]  num_bins_block             Number of total bins per block
-     * @param[in]  l2_hyst_threshold          Threshold to use for l2 hysteresis normalization
-     */
-    using BlockNormFunc = void(const float *input_row_ptr, float *output_ptr, size_t input_stride, size_t num_cells_per_block_height, size_t num_bins_block_x, size_t num_bins_block,
-                               float l2_hyst_threshold);
-    /** Block normalization function to use for the particular normalization type passed to configure() */
-    BlockNormFunc *_func;
-    const ITensor *_input;
-    ITensor       *_output;
-    Size2D         _num_cells_per_block;
-    Size2D         _num_cells_per_block_stride;
-    size_t         _num_bins;
-    float          _l2_hyst_threshold;
-};
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_NEHOGDESCRIPTORKERNEL_H */
diff --git a/src/core/NEON/kernels/NEHOGDetectorKernel.cpp b/src/core/NEON/kernels/NEHOGDetectorKernel.cpp
deleted file mode 100644
index cba1d5538a..0000000000
--- a/src/core/NEON/kernels/NEHOGDetectorKernel.cpp
+++ /dev/null
@@ -1,189 +0,0 @@
-/*
- * Copyright (c) 2016-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "src/core/NEON/kernels/NEHOGDetectorKernel.h"
-
-#include "arm_compute/core/Error.h"
-#include "arm_compute/core/HOGInfo.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/IAccessWindow.h"
-#include "arm_compute/core/Validate.h"
-#include "src/core/helpers/AutoConfiguration.h"
-#include "src/core/helpers/WindowHelpers.h"
-
-#include <arm_neon.h>
-
-using namespace arm_compute;
-
-NEHOGDetectorKernel::NEHOGDetectorKernel()
-    : _input(nullptr), _detection_windows(), _hog_descriptor(nullptr), _bias(0.0f), _threshold(0.0f), _idx_class(0), _num_bins_per_descriptor_x(0), _num_blocks_per_descriptor_y(0), _block_stride_width(0),
-      _block_stride_height(0), _detection_window_width(0), _detection_window_height(0), _max_num_detection_windows(0), _mutex()
-{
-}
-
-void NEHOGDetectorKernel::configure(const ITensor *input, const IHOG *hog, IDetectionWindowArray *detection_windows, const Size2D &detection_window_stride, float threshold, uint16_t idx_class)
-{
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_NOT_IN(input, DataType::F32);
-    ARM_COMPUTE_ERROR_ON(hog == nullptr);
-    ARM_COMPUTE_ERROR_ON(detection_windows == nullptr);
-    ARM_COMPUTE_ERROR_ON((detection_window_stride.width % hog->info()->block_stride().width) != 0);
-    ARM_COMPUTE_ERROR_ON((detection_window_stride.height % hog->info()->block_stride().height) != 0);
-
-    const Size2D &detection_window_size = hog->info()->detection_window_size();
-    const Size2D &block_size            = hog->info()->block_size();
-    const Size2D &block_stride          = hog->info()->block_stride();
-
-    _input                       = input;
-    _detection_windows           = detection_windows;
-    _threshold                   = threshold;
-    _idx_class                   = idx_class;
-    _hog_descriptor              = hog->descriptor();
-    _bias                        = _hog_descriptor[hog->info()->descriptor_size() - 1];
-    _num_bins_per_descriptor_x   = ((detection_window_size.width - block_size.width) / block_stride.width + 1) * input->info()->num_channels();
-    _num_blocks_per_descriptor_y = (detection_window_size.height - block_size.height) / block_stride.height + 1;
-    _block_stride_width          = block_stride.width;
-    _block_stride_height         = block_stride.height;
-    _detection_window_width      = detection_window_size.width;
-    _detection_window_height     = detection_window_size.height;
-    _max_num_detection_windows   = detection_windows->max_num_values();
-
-    ARM_COMPUTE_ERROR_ON((_num_bins_per_descriptor_x * _num_blocks_per_descriptor_y + 1) != hog->info()->descriptor_size());
-
-    // Get the number of blocks along the x and y directions of the input tensor
-    const ValidRegion &valid_region = input->info()->valid_region();
-    const size_t       num_blocks_x = valid_region.shape[0];
-    const size_t       num_blocks_y = valid_region.shape[1];
-
-    // Get the number of blocks along the x and y directions of the detection window
-    const size_t num_blocks_per_detection_window_x = detection_window_size.width / block_stride.width;
-    const size_t num_blocks_per_detection_window_y = detection_window_size.height / block_stride.height;
-
-    const size_t window_step_x = detection_window_stride.width / block_stride.width;
-    const size_t window_step_y = detection_window_stride.height / block_stride.height;
-
-    // Configure kernel window
-    Window win;
-    win.set(Window::DimX, Window::Dimension(0, floor_to_multiple(num_blocks_x - num_blocks_per_detection_window_x, window_step_x) + window_step_x, window_step_x));
-    win.set(Window::DimY, Window::Dimension(0, floor_to_multiple(num_blocks_y - num_blocks_per_detection_window_y, window_step_y) + window_step_y, window_step_y));
-
-    constexpr unsigned int num_elems_read_per_iteration = 1;
-    const unsigned int     num_rows_read_per_iteration  = _num_blocks_per_descriptor_y;
-
-    update_window_and_padding(win, AccessWindowRectangle(input->info(), 0, 0, num_elems_read_per_iteration, num_rows_read_per_iteration));
-
-    INEKernel::configure(win);
-}
-
-void NEHOGDetectorKernel::run(const Window &window, const ThreadInfo &info)
-{
-    ARM_COMPUTE_UNUSED(info);
-    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
-    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
-    ARM_COMPUTE_ERROR_ON(_hog_descriptor == nullptr);
-
-    const size_t in_step_y = _input->info()->strides_in_bytes()[Window::DimY] / data_size_from_type(_input->info()->data_type());
-
-    Iterator in(_input, window);
-
-    execute_window_loop(window, [&](const Coordinates & id)
-    {
-        const auto *in_row_ptr = reinterpret_cast<const float *>(in.ptr());
-
-        // Init score_f32 with 0
-        float32x4_t score_f32 = vdupq_n_f32(0.0f);
-
-        // Init score with bias
-        float score = _bias;
-
-        // Compute Linear SVM
-        for(size_t yb = 0; yb < _num_blocks_per_descriptor_y; ++yb, in_row_ptr += in_step_y)
-        {
-            int32_t xb = 0;
-
-            const int32_t offset_y = yb * _num_bins_per_descriptor_x;
-
-            for(; xb < static_cast<int32_t>(_num_bins_per_descriptor_x) - 16; xb += 16)
-            {
-                // Load descriptor values
-                const float32x4x4_t a_f32 =
-                {
-                    {
-                        vld1q_f32(&in_row_ptr[xb + 0]),
-                        vld1q_f32(&in_row_ptr[xb + 4]),
-                        vld1q_f32(&in_row_ptr[xb + 8]),
-                        vld1q_f32(&in_row_ptr[xb + 12])
-                    }
-                };
-
-                // Load detector values
-                const float32x4x4_t b_f32 =
-                {
-                    {
-                        vld1q_f32(&_hog_descriptor[xb + 0 + offset_y]),
-                        vld1q_f32(&_hog_descriptor[xb + 4 + offset_y]),
-                        vld1q_f32(&_hog_descriptor[xb + 8 + offset_y]),
-                        vld1q_f32(&_hog_descriptor[xb + 12 + offset_y])
-                    }
-                };
-
-                // Multiply accumulate
-                score_f32 = vmlaq_f32(score_f32, a_f32.val[0], b_f32.val[0]);
-                score_f32 = vmlaq_f32(score_f32, a_f32.val[1], b_f32.val[1]);
-                score_f32 = vmlaq_f32(score_f32, a_f32.val[2], b_f32.val[2]);
-                score_f32 = vmlaq_f32(score_f32, a_f32.val[3], b_f32.val[3]);
-            }
-
-            for(; xb < static_cast<int32_t>(_num_bins_per_descriptor_x); ++xb)
-            {
-                const float a = in_row_ptr[xb];
-                const float b = _hog_descriptor[xb + offset_y];
-
-                score += a * b;
-            }
-        }
-
-        score += vgetq_lane_f32(score_f32, 0);
-        score += vgetq_lane_f32(score_f32, 1);
-        score += vgetq_lane_f32(score_f32, 2);
-        score += vgetq_lane_f32(score_f32, 3);
-
-        if(score > _threshold)
-        {
-            if(_detection_windows->num_values() < _max_num_detection_windows)
-            {
-                DetectionWindow win;
-                win.x         = (id.x() * _block_stride_width);
-                win.y         = (id.y() * _block_stride_height);
-                win.width     = _detection_window_width;
-                win.height    = _detection_window_height;
-                win.idx_class = _idx_class;
-                win.score     = score;
-
-                arm_compute::unique_lock<arm_compute::Mutex> lock(_mutex);
-                _detection_windows->push_back(win);
-                lock.unlock();
-            }
-        }
-    },
-    in);
-}
diff --git a/src/core/NEON/kernels/NEHOGDetectorKernel.h b/src/core/NEON/kernels/NEHOGDetectorKernel.h
deleted file mode 100644
index e4c699fbfb..0000000000
--- a/src/core/NEON/kernels/NEHOGDetectorKernel.h
+++ /dev/null
@@ -1,89 +0,0 @@
-/*
- * Copyright (c) 2016-2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_NEHOGDETECTORKERNEL_H
-#define ARM_COMPUTE_NEHOGDETECTORKERNEL_H
-
-#include "arm_compute/core/IArray.h"
-#include "arm_compute/core/IHOG.h"
-#include "src/core/NEON/INEKernel.h"
-#include "support/Mutex.h"
-
-namespace arm_compute
-{
-class ITensor;
-
-/** Neon kernel to perform HOG detector kernel using linear SVM */
-class NEHOGDetectorKernel : public INEKernel
-{
-public:
-    const char *name() const override
-    {
-        return "NEHOGDetectorKernel";
-    }
-    /** Default constructor */
-    NEHOGDetectorKernel();
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    NEHOGDetectorKernel(const NEHOGDetectorKernel &) = delete;
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    NEHOGDetectorKernel &operator=(const NEHOGDetectorKernel &) = delete;
-    /** Prevent instances of this class from being moved (As this class contains non movable objects) */
-    NEHOGDetectorKernel(NEHOGDetectorKernel &&) = delete;
-    /** Prevent instances of this class from being moved (As this class contains non movable objects) */
-    NEHOGDetectorKernel &operator=(NEHOGDetectorKernel &&) = delete;
-    /** Default destructor */
-    ~NEHOGDetectorKernel() = default;
-
-    /** Initialise the kernel's input, HOG data-object, detection window, the stride of the detection window, the threshold and index of the object to detect
-     *
-     * @param[in]  input                   Input tensor which stores the HOG descriptor obtained with @ref NEHOGOrientationBinningKernel. Data type supported: F32. Number of channels supported: equal to the number of histogram bins per block
-     * @param[in]  hog                     HOG data object used by @ref NEHOGOrientationBinningKernel and  @ref NEHOGBlockNormalizationKernel
-     * @param[out] detection_windows       Array of @ref DetectionWindow. This array stores all the detected objects
-     * @param[in]  detection_window_stride Distance in pixels between 2 consecutive detection windows in x and y directions.
-     *                                     It must be multiple of the hog->info()->block_stride()
-     * @param[in]  threshold               (Optional) Threshold for the distance between features and SVM classifying plane
-     * @param[in]  idx_class               (Optional) Index of the class used for evaluating which class the detection window belongs to
-     */
-    void configure(const ITensor *input, const IHOG *hog, IDetectionWindowArray *detection_windows, const Size2D &detection_window_stride, float threshold = 0.0f, uint16_t idx_class = 0);
-
-    // Inherited methods overridden:
-    void run(const Window &window, const ThreadInfo &info) override;
-
-private:
-    const ITensor         *_input;
-    IDetectionWindowArray *_detection_windows;
-    const float           *_hog_descriptor;
-    float                  _bias;
-    float                  _threshold;
-    uint16_t               _idx_class;
-    size_t                 _num_bins_per_descriptor_x;
-    size_t                 _num_blocks_per_descriptor_y;
-    size_t                 _block_stride_width;
-    size_t                 _block_stride_height;
-    size_t                 _detection_window_width;
-    size_t                 _detection_window_height;
-    size_t                 _max_num_detection_windows;
-    arm_compute::Mutex     _mutex;
-};
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_NEHOGDETECTORKERNEL_H */
diff --git a/src/core/NEON/kernels/NEHarrisCornersKernel.cpp b/src/core/NEON/kernels/NEHarrisCornersKernel.cpp
deleted file mode 100644
index 4159e434b2..0000000000
--- a/src/core/NEON/kernels/NEHarrisCornersKernel.cpp
+++ /dev/null
@@ -1,817 +0,0 @@
-/*
- * Copyright (c) 2016-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "src/core/NEON/kernels/NEHarrisCornersKernel.h"
-
-#include "arm_compute/core/Coordinates.h"
-#include "arm_compute/core/Error.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Types.h"
-#include "arm_compute/core/Utils.h"
-#include "arm_compute/core/Validate.h"
-#include "arm_compute/core/Window.h"
-#include "src/core/helpers/AutoConfiguration.h"
-#include "src/core/helpers/WindowHelpers.h"
-
-#include <algorithm>
-#include <arm_neon.h>
-#include <cmath>
-#include <cstddef>
-
-using namespace arm_compute;
-
-template class arm_compute::NEHarrisScoreKernel<3>;
-template class arm_compute::NEHarrisScoreKernel<5>;
-template class arm_compute::NEHarrisScoreKernel<7>;
-template arm_compute::NEHarrisScoreKernel<3>::NEHarrisScoreKernel();
-template arm_compute::NEHarrisScoreKernel<5>::NEHarrisScoreKernel();
-template arm_compute::NEHarrisScoreKernel<7>::NEHarrisScoreKernel();
-
-namespace
-{
-inline float32x4_t harris_score(float32x4_t gx2, float32x4_t gy2, float32x4_t gxgy, float32x4_t sensitivity, float32x4_t strength_thresh)
-{
-    // Trace^2
-    float32x4_t trace2 = vaddq_f32(gx2, gy2);
-    trace2             = vmulq_f32(trace2, trace2);
-
-    // Det(A)
-    float32x4_t det = vmulq_f32(gx2, gy2);
-    det             = vmlsq_f32(det, gxgy, gxgy);
-
-    // Det(A) - sensitivity * trace^2
-    const float32x4_t mc = vmlsq_f32(det, sensitivity, trace2);
-
-    // mc > strength_thresh
-    const uint32x4_t mask = vcgtq_f32(mc, strength_thresh);
-
-    return vbslq_f32(mask, mc, vdupq_n_f32(0.0f));
-}
-
-inline void harris_score1x3_FLOAT_FLOAT_FLOAT(float32x4_t low_gx, float32x4_t low_gy, float32x4_t high_gx, float32x4_t high_gy, float32x4_t &gx2, float32x4_t &gy2, float32x4_t &gxgy,
-                                              float32x4_t norm_factor)
-{
-    // Normalize
-    low_gx  = vmulq_f32(low_gx, norm_factor);
-    low_gy  = vmulq_f32(low_gy, norm_factor);
-    high_gx = vmulq_f32(high_gx, norm_factor);
-    high_gy = vmulq_f32(high_gy, norm_factor);
-
-    const float32x4_t l_gx = low_gx;
-    const float32x4_t l_gy = low_gy;
-    const float32x4_t m_gx = vextq_f32(low_gx, high_gx, 1);
-    const float32x4_t m_gy = vextq_f32(low_gy, high_gy, 1);
-    const float32x4_t r_gx = vextq_f32(low_gx, high_gx, 2);
-    const float32x4_t r_gy = vextq_f32(low_gy, high_gy, 2);
-
-    // Gx*Gx
-    gx2 = vmlaq_f32(gx2, l_gx, l_gx);
-    gx2 = vmlaq_f32(gx2, m_gx, m_gx);
-    gx2 = vmlaq_f32(gx2, r_gx, r_gx);
-
-    // Gy*Gy
-    gy2 = vmlaq_f32(gy2, l_gy, l_gy);
-    gy2 = vmlaq_f32(gy2, m_gy, m_gy);
-    gy2 = vmlaq_f32(gy2, r_gy, r_gy);
-
-    // Gx*Gy
-    gxgy = vmlaq_f32(gxgy, l_gx, l_gy);
-    gxgy = vmlaq_f32(gxgy, m_gx, m_gy);
-    gxgy = vmlaq_f32(gxgy, r_gx, r_gy);
-}
-
-inline void harris_score1x5_FLOAT_FLOAT_FLOAT(float32x4_t low_gx, float32x4_t low_gy, float32x4_t high_gx, float32x4_t high_gy, float32x4_t &gx2, float32x4_t &gy2, float32x4_t &gxgy,
-                                              float32x4_t norm_factor)
-{
-    // Normalize
-    low_gx  = vmulq_f32(low_gx, norm_factor);
-    low_gy  = vmulq_f32(low_gy, norm_factor);
-    high_gx = vmulq_f32(high_gx, norm_factor);
-    high_gy = vmulq_f32(high_gy, norm_factor);
-
-    // L2 values
-    float32x4_t gx = low_gx;
-    float32x4_t gy = low_gy;
-
-    // Accumulate
-    gx2  = vmlaq_f32(gx2, gx, gx);
-    gy2  = vmlaq_f32(gy2, gy, gy);
-    gxgy = vmlaq_f32(gxgy, gx, gy);
-
-    // L1 values
-    gx = vextq_f32(low_gx, high_gx, 1);
-    gy = vextq_f32(low_gy, high_gy, 1);
-
-    // Accumulate
-    gx2  = vmlaq_f32(gx2, gx, gx);
-    gy2  = vmlaq_f32(gy2, gy, gy);
-    gxgy = vmlaq_f32(gxgy, gx, gy);
-
-    // M values
-    gx = vextq_f32(low_gx, high_gx, 2);
-    gy = vextq_f32(low_gy, high_gy, 2);
-
-    // Accumulate
-    gx2  = vmlaq_f32(gx2, gx, gx);
-    gy2  = vmlaq_f32(gy2, gy, gy);
-    gxgy = vmlaq_f32(gxgy, gx, gy);
-
-    // R1 values
-    gx = vextq_f32(low_gx, high_gx, 3);
-    gy = vextq_f32(low_gy, high_gy, 3);
-
-    // Accumulate
-    gx2  = vmlaq_f32(gx2, gx, gx);
-    gy2  = vmlaq_f32(gy2, gy, gy);
-    gxgy = vmlaq_f32(gxgy, gx, gy);
-
-    // R2 values
-    gx = high_gx;
-    gy = high_gy;
-
-    // Accumulate
-    gx2  = vmlaq_f32(gx2, gx, gx);
-    gy2  = vmlaq_f32(gy2, gy, gy);
-    gxgy = vmlaq_f32(gxgy, gx, gy);
-}
-
-inline void harris_score1x7_FLOAT_FLOAT_FLOAT(float32x4_t low_gx, float32x4_t low_gy, float32x4_t high_gx, float32x4_t high_gy, float32x4_t high_gx1, float32x4_t high_gy1, float32x4_t &gx2,
-                                              float32x4_t &gy2, float32x4_t &gxgy, float32x4_t norm_factor)
-{
-    // Normalize
-    low_gx  = vmulq_f32(low_gx, norm_factor);
-    low_gy  = vmulq_f32(low_gy, norm_factor);
-    high_gx = vmulq_f32(high_gx, norm_factor);
-    high_gy = vmulq_f32(high_gy, norm_factor);
-
-    // L3 values
-    float32x4_t gx = low_gx;
-    float32x4_t gy = low_gy;
-
-    // Accumulate
-    gx2  = vmlaq_f32(gx2, gx, gx);
-    gy2  = vmlaq_f32(gy2, gy, gy);
-    gxgy = vmlaq_f32(gxgy, gx, gy);
-
-    // L2 values
-    gx = vextq_f32(low_gx, high_gx, 1);
-    gy = vextq_f32(low_gy, high_gy, 1);
-
-    // Accumulate
-    gx2  = vmlaq_f32(gx2, gx, gx);
-    gy2  = vmlaq_f32(gy2, gy, gy);
-    gxgy = vmlaq_f32(gxgy, gx, gy);
-
-    // L1 values
-    gx = vextq_f32(low_gx, high_gx, 2);
-    gy = vextq_f32(low_gy, high_gy, 2);
-
-    // Accumulate
-    gx2  = vmlaq_f32(gx2, gx, gx);
-    gy2  = vmlaq_f32(gy2, gy, gy);
-    gxgy = vmlaq_f32(gxgy, gx, gy);
-
-    // M values
-    gx = vextq_f32(low_gx, high_gx, 3);
-    gy = vextq_f32(low_gy, high_gy, 3);
-
-    // Accumulate
-    gx2  = vmlaq_f32(gx2, gx, gx);
-    gy2  = vmlaq_f32(gy2, gy, gy);
-    gxgy = vmlaq_f32(gxgy, gx, gy);
-
-    // R1 values
-    gx = high_gx;
-    gy = high_gy;
-
-    // Accumulate
-    gx2  = vmlaq_f32(gx2, gx, gx);
-    gy2  = vmlaq_f32(gy2, gy, gy);
-    gxgy = vmlaq_f32(gxgy, gx, gy);
-
-    // Change tmp_low and tmp_high for calculating R2 and R3 values
-    low_gx  = high_gx;
-    low_gy  = high_gy;
-    high_gx = high_gx1;
-    high_gy = high_gy1;
-
-    // Normalize
-    high_gx = vmulq_f32(high_gx, norm_factor);
-    high_gy = vmulq_f32(high_gy, norm_factor);
-
-    // R2 values
-    gx = vextq_f32(low_gx, high_gx, 1);
-    gy = vextq_f32(low_gy, high_gy, 1);
-
-    // Accumulate
-    gx2  = vmlaq_f32(gx2, gx, gx);
-    gy2  = vmlaq_f32(gy2, gy, gy);
-    gxgy = vmlaq_f32(gxgy, gx, gy);
-
-    // R3 values
-    gx = vextq_f32(low_gx, high_gx, 2);
-    gy = vextq_f32(low_gy, high_gy, 2);
-
-    // Accumulate
-    gx2  = vmlaq_f32(gx2, gx, gx);
-    gy2  = vmlaq_f32(gy2, gy, gy);
-    gxgy = vmlaq_f32(gxgy, gx, gy);
-}
-
-inline void harris_score3x3_S16_S16_FLOAT(const void *__restrict input1_ptr, const void *__restrict input2_ptr, void *__restrict output_ptr, int32_t input_stride,
-                                          float in_norm_factor, float in_sensitivity, float in_strength_thresh)
-
-{
-    const auto     gx_ptr_0 = static_cast<const int16_t *__restrict>(input1_ptr) - 1;
-    const auto     gy_ptr_0 = static_cast<const int16_t *__restrict>(input2_ptr) - 1;
-    const int16_t *gx_ptr_1 = gx_ptr_0 + 4;
-    const int16_t *gy_ptr_1 = gy_ptr_0 + 4;
-    const auto     output   = static_cast<float *__restrict>(output_ptr);
-
-    // Gx^2, Gy^2 and Gx*Gy
-    float32x4x2_t gx2 =
-    {
-        {
-            vdupq_n_f32(0.0f),
-            vdupq_n_f32(0.0f)
-        }
-    };
-    float32x4x2_t gy2 =
-    {
-        {
-            vdupq_n_f32(0.0f),
-            vdupq_n_f32(0.0f)
-        }
-    };
-    float32x4x2_t gxgy =
-    {
-        {
-            vdupq_n_f32(0.0f),
-            vdupq_n_f32(0.0f)
-        }
-    };
-
-    // Row0
-    int16x8x2_t tmp_gx =
-    {
-        {
-            vld1q_s16(gx_ptr_0 - input_stride),
-            vld1q_s16(gx_ptr_1 - input_stride)
-        }
-    };
-    int16x8x2_t tmp_gy =
-    {
-        {
-            vld1q_s16(gy_ptr_0 - input_stride),
-            vld1q_s16(gy_ptr_1 - input_stride)
-        }
-    };
-    float32x4_t sensitivity     = vdupq_n_f32(in_sensitivity);
-    float32x4_t norm_factor     = vdupq_n_f32(in_norm_factor);
-    float32x4_t strength_thresh = vdupq_n_f32(in_strength_thresh);
-
-    float32x4_t low_gx  = vcvtq_f32_s32(vmovl_s16(vget_low_s16(tmp_gx.val[0])));
-    float32x4_t low_gy  = vcvtq_f32_s32(vmovl_s16(vget_low_s16(tmp_gy.val[0])));
-    float32x4_t high_gx = vcvtq_f32_s32(vmovl_s16(vget_high_s16(tmp_gx.val[0])));
-    float32x4_t high_gy = vcvtq_f32_s32(vmovl_s16(vget_high_s16(tmp_gy.val[0])));
-    harris_score1x3_FLOAT_FLOAT_FLOAT(low_gx, low_gy, high_gx, high_gy, gx2.val[0], gy2.val[0], gxgy.val[0], norm_factor);
-
-    low_gx  = vcvtq_f32_s32(vmovl_s16(vget_low_s16(tmp_gx.val[1])));
-    low_gy  = vcvtq_f32_s32(vmovl_s16(vget_low_s16(tmp_gy.val[1])));
-    high_gx = vcvtq_f32_s32(vmovl_s16(vget_high_s16(tmp_gx.val[1])));
-    high_gy = vcvtq_f32_s32(vmovl_s16(vget_high_s16(tmp_gy.val[1])));
-    harris_score1x3_FLOAT_FLOAT_FLOAT(low_gx, low_gy, high_gx, high_gy, gx2.val[1], gy2.val[1], gxgy.val[1], norm_factor);
-
-    // Row1
-    tmp_gx.val[0] = vld1q_s16(gx_ptr_0);
-    tmp_gy.val[0] = vld1q_s16(gy_ptr_0);
-    tmp_gx.val[1] = vld1q_s16(gx_ptr_1);
-    tmp_gy.val[1] = vld1q_s16(gy_ptr_1);
-
-    low_gx  = vcvtq_f32_s32(vmovl_s16(vget_low_s16(tmp_gx.val[0])));
-    low_gy  = vcvtq_f32_s32(vmovl_s16(vget_low_s16(tmp_gy.val[0])));
-    high_gx = vcvtq_f32_s32(vmovl_s16(vget_high_s16(tmp_gx.val[0])));
-    high_gy = vcvtq_f32_s32(vmovl_s16(vget_high_s16(tmp_gy.val[0])));
-    harris_score1x3_FLOAT_FLOAT_FLOAT(low_gx, low_gy, high_gx, high_gy, gx2.val[0], gy2.val[0], gxgy.val[0], norm_factor);
-
-    low_gx  = vcvtq_f32_s32(vmovl_s16(vget_low_s16(tmp_gx.val[1])));
-    low_gy  = vcvtq_f32_s32(vmovl_s16(vget_low_s16(tmp_gy.val[1])));
-    high_gx = vcvtq_f32_s32(vmovl_s16(vget_high_s16(tmp_gx.val[1])));
-    high_gy = vcvtq_f32_s32(vmovl_s16(vget_high_s16(tmp_gy.val[1])));
-    harris_score1x3_FLOAT_FLOAT_FLOAT(low_gx, low_gy, high_gx, high_gy, gx2.val[1], gy2.val[1], gxgy.val[1], norm_factor);
-
-    // Row2
-    tmp_gx.val[0] = vld1q_s16(gx_ptr_0 + input_stride);
-    tmp_gy.val[0] = vld1q_s16(gy_ptr_0 + input_stride);
-    tmp_gx.val[1] = vld1q_s16(gx_ptr_1 + input_stride);
-    tmp_gy.val[1] = vld1q_s16(gy_ptr_1 + input_stride);
-
-    low_gx  = vcvtq_f32_s32(vmovl_s16(vget_low_s16(tmp_gx.val[0])));
-    low_gy  = vcvtq_f32_s32(vmovl_s16(vget_low_s16(tmp_gy.val[0])));
-    high_gx = vcvtq_f32_s32(vmovl_s16(vget_high_s16(tmp_gx.val[0])));
-    high_gy = vcvtq_f32_s32(vmovl_s16(vget_high_s16(tmp_gy.val[0])));
-    harris_score1x3_FLOAT_FLOAT_FLOAT(low_gx, low_gy, high_gx, high_gy, gx2.val[0], gy2.val[0], gxgy.val[0], norm_factor);
-
-    low_gx  = vcvtq_f32_s32(vmovl_s16(vget_low_s16(tmp_gx.val[1])));
-    low_gy  = vcvtq_f32_s32(vmovl_s16(vget_low_s16(tmp_gy.val[1])));
-    high_gx = vcvtq_f32_s32(vmovl_s16(vget_high_s16(tmp_gx.val[1])));
-    high_gy = vcvtq_f32_s32(vmovl_s16(vget_high_s16(tmp_gy.val[1])));
-    harris_score1x3_FLOAT_FLOAT_FLOAT(low_gx, low_gy, high_gx, high_gy, gx2.val[1], gy2.val[1], gxgy.val[1], norm_factor);
-
-    // Calculate harris score
-    const float32x4x2_t mc =
-    {
-        {
-            harris_score(gx2.val[0], gy2.val[0], gxgy.val[0], sensitivity, strength_thresh),
-            harris_score(gx2.val[1], gy2.val[1], gxgy.val[1], sensitivity, strength_thresh)
-        }
-    };
-
-    // Store score
-    vst1q_f32(output + 0, mc.val[0]);
-    vst1q_f32(output + 4, mc.val[1]);
-}
-
-inline void harris_score3x3_S32_S32_FLOAT(const void *__restrict input1_ptr, const void *__restrict input2_ptr, void *__restrict output_ptr, int32_t input_stride,
-                                          float in_norm_factor, float in_sensitivity, float in_strength_thresh)
-{
-    auto           gx_ptr_0        = static_cast<const int32_t *__restrict>(input1_ptr) - 1;
-    auto           gy_ptr_0        = static_cast<const int32_t *__restrict>(input2_ptr) - 1;
-    const int32_t *gx_ptr_1        = gx_ptr_0 + 4;
-    const int32_t *gy_ptr_1        = gy_ptr_0 + 4;
-    const int32_t *gx_ptr_2        = gx_ptr_0 + 8;
-    const int32_t *gy_ptr_2        = gy_ptr_0 + 8;
-    const auto     output          = static_cast<float *__restrict>(output_ptr);
-    float32x4_t    sensitivity     = vdupq_n_f32(in_sensitivity);
-    float32x4_t    norm_factor     = vdupq_n_f32(in_norm_factor);
-    float32x4_t    strength_thresh = vdupq_n_f32(in_strength_thresh);
-
-    // Gx^2, Gy^2 and Gx*Gy
-    float32x4x2_t gx2 =
-    {
-        {
-            vdupq_n_f32(0.0f),
-            vdupq_n_f32(0.0f)
-        }
-    };
-    float32x4x2_t gy2 =
-    {
-        {
-            vdupq_n_f32(0.0f),
-            vdupq_n_f32(0.0f)
-        }
-    };
-    float32x4x2_t gxgy =
-    {
-        {
-            vdupq_n_f32(0.0f),
-            vdupq_n_f32(0.0f)
-        }
-    };
-
-    // Row0
-    float32x4_t low_gx  = vcvtq_f32_s32(vld1q_s32(gx_ptr_0 - input_stride));
-    float32x4_t low_gy  = vcvtq_f32_s32(vld1q_s32(gy_ptr_0 - input_stride));
-    float32x4_t high_gx = vcvtq_f32_s32(vld1q_s32(gx_ptr_1 - input_stride));
-    float32x4_t high_gy = vcvtq_f32_s32(vld1q_s32(gy_ptr_1 - input_stride));
-    harris_score1x3_FLOAT_FLOAT_FLOAT(low_gx, low_gy, high_gx, high_gy, gx2.val[0], gy2.val[0], gxgy.val[0], norm_factor);
-
-    low_gx  = vcvtq_f32_s32(vld1q_s32(gx_ptr_1 - input_stride));
-    low_gy  = vcvtq_f32_s32(vld1q_s32(gy_ptr_1 - input_stride));
-    high_gx = vcvtq_f32_s32(vld1q_s32(gx_ptr_2 - input_stride));
-    high_gy = vcvtq_f32_s32(vld1q_s32(gy_ptr_2 - input_stride));
-    harris_score1x3_FLOAT_FLOAT_FLOAT(low_gx, low_gy, high_gx, high_gy, gx2.val[1], gy2.val[1], gxgy.val[1], norm_factor);
-
-    // Row1
-    low_gx  = vcvtq_f32_s32(vld1q_s32(gx_ptr_0));
-    low_gy  = vcvtq_f32_s32(vld1q_s32(gy_ptr_0));
-    high_gx = vcvtq_f32_s32(vld1q_s32(gx_ptr_1));
-    high_gy = vcvtq_f32_s32(vld1q_s32(gy_ptr_1));
-    harris_score1x3_FLOAT_FLOAT_FLOAT(low_gx, low_gy, high_gx, high_gy, gx2.val[0], gy2.val[0], gxgy.val[0], norm_factor);
-
-    low_gx  = vcvtq_f32_s32(vld1q_s32(gx_ptr_1));
-    low_gy  = vcvtq_f32_s32(vld1q_s32(gy_ptr_1));
-    high_gx = vcvtq_f32_s32(vld1q_s32(gx_ptr_2));
-    high_gy = vcvtq_f32_s32(vld1q_s32(gy_ptr_2));
-    harris_score1x3_FLOAT_FLOAT_FLOAT(low_gx, low_gy, high_gx, high_gy, gx2.val[1], gy2.val[1], gxgy.val[1], norm_factor);
-
-    // Row2
-    low_gx  = vcvtq_f32_s32(vld1q_s32(gx_ptr_0 + input_stride));
-    low_gy  = vcvtq_f32_s32(vld1q_s32(gy_ptr_0 + input_stride));
-    high_gx = vcvtq_f32_s32(vld1q_s32(gx_ptr_1 + input_stride));
-    high_gy = vcvtq_f32_s32(vld1q_s32(gy_ptr_1 + input_stride));
-    harris_score1x3_FLOAT_FLOAT_FLOAT(low_gx, low_gy, high_gx, high_gy, gx2.val[0], gy2.val[0], gxgy.val[0], norm_factor);
-
-    low_gx  = vcvtq_f32_s32(vld1q_s32(gx_ptr_1 + input_stride));
-    low_gy  = vcvtq_f32_s32(vld1q_s32(gy_ptr_1 + input_stride));
-    high_gx = vcvtq_f32_s32(vld1q_s32(gx_ptr_2 + input_stride));
-    high_gy = vcvtq_f32_s32(vld1q_s32(gy_ptr_2 + input_stride));
-    harris_score1x3_FLOAT_FLOAT_FLOAT(low_gx, low_gy, high_gx, high_gy, gx2.val[1], gy2.val[1], gxgy.val[1], norm_factor);
-
-    // Calculate harris score
-    const float32x4x2_t mc =
-    {
-        {
-            harris_score(gx2.val[0], gy2.val[0], gxgy.val[0], sensitivity, strength_thresh),
-            harris_score(gx2.val[1], gy2.val[1], gxgy.val[1], sensitivity, strength_thresh)
-        }
-    };
-
-    // Store score
-    vst1q_f32(output + 0, mc.val[0]);
-    vst1q_f32(output + 4, mc.val[1]);
-}
-
-inline void harris_score5x5_S16_S16_FLOAT(const void *__restrict input1_ptr, const void *__restrict input2_ptr, void *__restrict output_ptr, int32_t input_stride,
-                                          float in_norm_factor, float in_sensitivity, float in_strength_thresh)
-{
-    auto           gx_ptr_0 = static_cast<const int16_t *__restrict>(input1_ptr) - 2 - 2 * input_stride;
-    auto           gy_ptr_0 = static_cast<const int16_t *__restrict>(input2_ptr) - 2 - 2 * input_stride;
-    const int16_t *gx_ptr_1 = gx_ptr_0 + 4;
-    const int16_t *gy_ptr_1 = gy_ptr_0 + 4;
-    const auto     output   = static_cast<float *__restrict>(output_ptr);
-
-    // Gx^2, Gy^2 and Gx*Gy
-    float32x4x2_t gx2 =
-    {
-        {
-            vdupq_n_f32(0.0f),
-            vdupq_n_f32(0.0f)
-        }
-    };
-    float32x4x2_t gy2 =
-    {
-        {
-            vdupq_n_f32(0.0f),
-            vdupq_n_f32(0.0f)
-        }
-    };
-    float32x4x2_t gxgy =
-    {
-        {
-            vdupq_n_f32(0.0f),
-            vdupq_n_f32(0.0f)
-        }
-    };
-    float32x4_t sensitivity     = vdupq_n_f32(in_sensitivity);
-    float32x4_t norm_factor     = vdupq_n_f32(in_norm_factor);
-    float32x4_t strength_thresh = vdupq_n_f32(in_strength_thresh);
-
-    for(int i = 0; i < 5; ++i)
-    {
-        const int16x8x2_t tmp_gx =
-        {
-            {
-                vld1q_s16(gx_ptr_0),
-                vld1q_s16(gx_ptr_1)
-            }
-        };
-        const int16x8x2_t tmp_gy =
-        {
-            {
-                vld1q_s16(gy_ptr_0),
-                vld1q_s16(gy_ptr_1)
-            }
-        };
-
-        float32x4_t low_gx  = vcvtq_f32_s32(vmovl_s16(vget_low_s16(tmp_gx.val[0])));
-        float32x4_t low_gy  = vcvtq_f32_s32(vmovl_s16(vget_low_s16(tmp_gy.val[0])));
-        float32x4_t high_gx = vcvtq_f32_s32(vmovl_s16(vget_high_s16(tmp_gx.val[0])));
-        float32x4_t high_gy = vcvtq_f32_s32(vmovl_s16(vget_high_s16(tmp_gy.val[0])));
-        harris_score1x5_FLOAT_FLOAT_FLOAT(low_gx, low_gy, high_gx, high_gy, gx2.val[0], gy2.val[0], gxgy.val[0], norm_factor);
-
-        low_gx  = vcvtq_f32_s32(vmovl_s16(vget_low_s16(tmp_gx.val[1])));
-        low_gy  = vcvtq_f32_s32(vmovl_s16(vget_low_s16(tmp_gy.val[1])));
-        high_gx = vcvtq_f32_s32(vmovl_s16(vget_high_s16(tmp_gx.val[1])));
-        high_gy = vcvtq_f32_s32(vmovl_s16(vget_high_s16(tmp_gy.val[1])));
-        harris_score1x5_FLOAT_FLOAT_FLOAT(low_gx, low_gy, high_gx, high_gy, gx2.val[1], gy2.val[1], gxgy.val[1], norm_factor);
-
-        // Update gx and gy pointer
-        gx_ptr_0 += input_stride;
-        gy_ptr_0 += input_stride;
-        gx_ptr_1 += input_stride;
-        gy_ptr_1 += input_stride;
-    }
-
-    // Calculate harris score
-    const float32x4x2_t mc =
-    {
-        {
-            harris_score(gx2.val[0], gy2.val[0], gxgy.val[0], sensitivity, strength_thresh),
-            harris_score(gx2.val[1], gy2.val[1], gxgy.val[1], sensitivity, strength_thresh)
-        }
-    };
-
-    // Store score
-    vst1q_f32(output + 0, mc.val[0]);
-    vst1q_f32(output + 4, mc.val[1]);
-}
-
-inline void harris_score5x5_S32_S32_FLOAT(const void *__restrict input1_ptr, const void *__restrict input2_ptr, void *__restrict output_ptr, int32_t input_stride,
-                                          float in_norm_factor, float in_sensitivity, float in_strength_thresh)
-
-{
-    auto           gx_ptr_0 = static_cast<const int32_t *__restrict>(input1_ptr) - 2 - 2 * input_stride;
-    auto           gy_ptr_0 = static_cast<const int32_t *__restrict>(input2_ptr) - 2 - 2 * input_stride;
-    const int32_t *gx_ptr_1 = gx_ptr_0 + 4;
-    const int32_t *gy_ptr_1 = gy_ptr_0 + 4;
-    const int32_t *gx_ptr_2 = gx_ptr_0 + 8;
-    const int32_t *gy_ptr_2 = gy_ptr_0 + 8;
-    const auto     output   = static_cast<float *__restrict>(output_ptr);
-
-    // Gx^2, Gy^2 and Gx*Gy
-    float32x4x2_t gx2 =
-    {
-        {
-            vdupq_n_f32(0.0f),
-            vdupq_n_f32(0.0f)
-        }
-    };
-    float32x4x2_t gy2 =
-    {
-        {
-            vdupq_n_f32(0.0f),
-            vdupq_n_f32(0.0f)
-        }
-    };
-    float32x4x2_t gxgy =
-    {
-        {
-            vdupq_n_f32(0.0f),
-            vdupq_n_f32(0.0f)
-        }
-    };
-    float32x4_t sensitivity     = vdupq_n_f32(in_sensitivity);
-    float32x4_t norm_factor     = vdupq_n_f32(in_norm_factor);
-    float32x4_t strength_thresh = vdupq_n_f32(in_strength_thresh);
-
-    for(int i = 0; i < 5; ++i)
-    {
-        const float32x4_t low_gx_0  = vcvtq_f32_s32(vld1q_s32(gx_ptr_0));
-        const float32x4_t low_gy_0  = vcvtq_f32_s32(vld1q_s32(gy_ptr_0));
-        const float32x4_t high_gx_0 = vcvtq_f32_s32(vld1q_s32(gx_ptr_1));
-        const float32x4_t high_gy_0 = vcvtq_f32_s32(vld1q_s32(gy_ptr_1));
-        harris_score1x5_FLOAT_FLOAT_FLOAT(low_gx_0, low_gy_0, high_gx_0, high_gy_0, gx2.val[0], gy2.val[0], gxgy.val[0], norm_factor);
-
-        const float32x4_t low_gx_1  = vcvtq_f32_s32(vld1q_s32(gx_ptr_1));
-        const float32x4_t low_gy_1  = vcvtq_f32_s32(vld1q_s32(gy_ptr_1));
-        const float32x4_t high_gx_1 = vcvtq_f32_s32(vld1q_s32(gx_ptr_2));
-        const float32x4_t high_gy_1 = vcvtq_f32_s32(vld1q_s32(gy_ptr_2));
-        harris_score1x5_FLOAT_FLOAT_FLOAT(low_gx_1, low_gy_1, high_gx_1, high_gy_1, gx2.val[1], gy2.val[1], gxgy.val[1], norm_factor);
-
-        // Update gx and gy pointer
-        gx_ptr_0 += input_stride;
-        gy_ptr_0 += input_stride;
-        gx_ptr_1 += input_stride;
-        gy_ptr_1 += input_stride;
-        gx_ptr_2 += input_stride;
-        gy_ptr_2 += input_stride;
-    }
-
-    // Calculate harris score
-    const float32x4x2_t mc =
-    {
-        {
-            harris_score(gx2.val[0], gy2.val[0], gxgy.val[0], sensitivity, strength_thresh),
-            harris_score(gx2.val[1], gy2.val[1], gxgy.val[1], sensitivity, strength_thresh)
-        }
-    };
-
-    // Store score
-    vst1q_f32(output + 0, mc.val[0]);
-    vst1q_f32(output + 4, mc.val[1]);
-}
-
-inline void harris_score7x7_S16_S16_FLOAT(const void *__restrict input1_ptr, const void *__restrict input2_ptr, void *__restrict output_ptr, int32_t input_stride,
-                                          float in_norm_factor, float in_sensitivity, float in_strength_thresh)
-{
-    auto           gx_ptr_0 = static_cast<const int16_t *__restrict>(input1_ptr) - 3 - 3 * input_stride;
-    auto           gy_ptr_0 = static_cast<const int16_t *__restrict>(input2_ptr) - 3 - 3 * input_stride;
-    const int16_t *gx_ptr_1 = gx_ptr_0 + 8;
-    const int16_t *gy_ptr_1 = gy_ptr_0 + 8;
-    const auto     output   = static_cast<float *__restrict>(output_ptr);
-
-    // Gx^2, Gy^2 and Gx*Gy
-    float32x4_t gx2             = vdupq_n_f32(0.0f);
-    float32x4_t gy2             = vdupq_n_f32(0.0f);
-    float32x4_t gxgy            = vdupq_n_f32(0.0f);
-    float32x4_t sensitivity     = vdupq_n_f32(in_sensitivity);
-    float32x4_t norm_factor     = vdupq_n_f32(in_norm_factor);
-    float32x4_t strength_thresh = vdupq_n_f32(in_strength_thresh);
-
-    for(int i = 0; i < 7; ++i)
-    {
-        const int16x8_t tmp0_gx = vld1q_s16(gx_ptr_0);
-        const int16x8_t tmp0_gy = vld1q_s16(gy_ptr_0);
-        const int16x4_t tmp1_gx = vld1_s16(gx_ptr_1);
-        const int16x4_t tmp1_gy = vld1_s16(gy_ptr_1);
-
-        float32x4_t low_gx   = vcvtq_f32_s32(vmovl_s16(vget_low_s16(tmp0_gx)));
-        float32x4_t low_gy   = vcvtq_f32_s32(vmovl_s16(vget_low_s16(tmp0_gy)));
-        float32x4_t high_gx  = vcvtq_f32_s32(vmovl_s16(vget_high_s16(tmp0_gx)));
-        float32x4_t high_gy  = vcvtq_f32_s32(vmovl_s16(vget_high_s16(tmp0_gy)));
-        float32x4_t high_gx1 = vcvtq_f32_s32(vmovl_s16(tmp1_gx));
-        float32x4_t high_gy1 = vcvtq_f32_s32(vmovl_s16(tmp1_gy));
-        harris_score1x7_FLOAT_FLOAT_FLOAT(low_gx, low_gy, high_gx, high_gy, high_gx1, high_gy1, gx2, gy2, gxgy, norm_factor);
-
-        // Update gx and gy pointer
-        gx_ptr_0 += input_stride;
-        gy_ptr_0 += input_stride;
-        gx_ptr_1 += input_stride;
-        gy_ptr_1 += input_stride;
-    }
-
-    // Calculate harris score
-    const float32x4_t mc = harris_score(gx2, gy2, gxgy, sensitivity, strength_thresh);
-
-    // Store score
-    vst1q_f32(output, mc);
-}
-
-inline void harris_score7x7_S32_S32_FLOAT(const void *__restrict input1_ptr, const void *__restrict input2_ptr, void *__restrict output_ptr, int32_t input_stride,
-                                          float in_norm_factor, float in_sensitivity, float in_strength_thresh)
-{
-    auto           gx_ptr_0 = static_cast<const int32_t *__restrict>(input1_ptr) - 3 - 3 * input_stride;
-    auto           gy_ptr_0 = static_cast<const int32_t *__restrict>(input2_ptr) - 3 - 3 * input_stride;
-    const int32_t *gx_ptr_1 = gx_ptr_0 + 4;
-    const int32_t *gy_ptr_1 = gy_ptr_0 + 4;
-    const int32_t *gx_ptr_2 = gx_ptr_1 + 4;
-    const int32_t *gy_ptr_2 = gy_ptr_1 + 4;
-    const auto     output   = static_cast<float *__restrict>(output_ptr);
-
-    // Gx^2, Gy^2 and Gx*Gy
-    float32x4_t gx2             = vdupq_n_f32(0.0f);
-    float32x4_t gy2             = vdupq_n_f32(0.0f);
-    float32x4_t gxgy            = vdupq_n_f32(0.0f);
-    float32x4_t sensitivity     = vdupq_n_f32(in_sensitivity);
-    float32x4_t norm_factor     = vdupq_n_f32(in_norm_factor);
-    float32x4_t strength_thresh = vdupq_n_f32(in_strength_thresh);
-
-    for(int i = 0; i < 7; ++i)
-    {
-        const float32x4_t low_gx   = vcvtq_f32_s32(vld1q_s32(gx_ptr_0));
-        const float32x4_t low_gy   = vcvtq_f32_s32(vld1q_s32(gy_ptr_0));
-        const float32x4_t high_gx  = vcvtq_f32_s32(vld1q_s32(gx_ptr_1));
-        const float32x4_t high_gy  = vcvtq_f32_s32(vld1q_s32(gy_ptr_1));
-        const float32x4_t high_gx1 = vcvtq_f32_s32(vld1q_s32(gx_ptr_2));
-        const float32x4_t high_gy1 = vcvtq_f32_s32(vld1q_s32(gy_ptr_2));
-        harris_score1x7_FLOAT_FLOAT_FLOAT(low_gx, low_gy, high_gx, high_gy, high_gx1, high_gy1, gx2, gy2, gxgy, norm_factor);
-
-        // Update gx and gy pointer
-        gx_ptr_0 += input_stride;
-        gy_ptr_0 += input_stride;
-        gx_ptr_1 += input_stride;
-        gy_ptr_1 += input_stride;
-        gx_ptr_2 += input_stride;
-        gy_ptr_2 += input_stride;
-    }
-
-    // Calculate harris score
-    const float32x4_t mc = harris_score(gx2, gy2, gxgy, sensitivity, strength_thresh);
-
-    // Store score
-    vst1q_f32(output, mc);
-}
-
-} // namespace
-
-INEHarrisScoreKernel::INEHarrisScoreKernel()
-    : _input1(nullptr), _input2(nullptr), _output(nullptr), _sensitivity(0.0f), _strength_thresh(0.0f), _norm_factor(0.0f), _border_size()
-{
-}
-
-template <int32_t block_size>
-NEHarrisScoreKernel<block_size>::NEHarrisScoreKernel()
-    : INEHarrisScoreKernel(), _func(nullptr)
-{
-}
-
-template <int32_t block_size>
-void NEHarrisScoreKernel<block_size>::run(const Window &window, const ThreadInfo &info)
-{
-    ARM_COMPUTE_UNUSED(info);
-    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
-    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
-    ARM_COMPUTE_ERROR_ON(_func == nullptr);
-
-    Iterator input1(_input1, window);
-    Iterator input2(_input2, window);
-    Iterator output(_output, window);
-
-    const size_t input_stride = _input1->info()->strides_in_bytes()[1] / element_size_from_data_type(_input1->info()->data_type());
-
-    execute_window_loop(window, [&](const Coordinates &)
-    {
-        (*_func)(input1.ptr(), input2.ptr(), output.ptr(), input_stride, _norm_factor, _sensitivity, _strength_thresh);
-    },
-    input1, input2, output);
-}
-
-template <int32_t block_size>
-BorderSize        NEHarrisScoreKernel<block_size>::border_size() const
-{
-    return _border_size;
-}
-
-template <int32_t block_size>
-void NEHarrisScoreKernel<block_size>::configure(const IImage *input1, const IImage *input2, IImage *output, float norm_factor, float strength_thresh, float sensitivity,
-                                                bool border_undefined)
-{
-    ARM_COMPUTE_ERROR_ON_TENSOR_NOT_2D(input1);
-    ARM_COMPUTE_ERROR_ON_TENSOR_NOT_2D(input2);
-    ARM_COMPUTE_ERROR_ON_TENSOR_NOT_2D(output);
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input1, 1, DataType::S16, DataType::S32);
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input2, 1, DataType::S16, DataType::S32);
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::F32);
-    ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input1, input2);
-    ARM_COMPUTE_ERROR_ON(0.0f == norm_factor);
-
-    _input1          = input1;
-    _input2          = input2;
-    _output          = output;
-    _sensitivity     = sensitivity;
-    _strength_thresh = strength_thresh;
-    _norm_factor     = norm_factor;
-    _border_size     = BorderSize(block_size / 2);
-
-    if(input1->info()->data_type() == DataType::S16)
-    {
-        switch(block_size)
-        {
-            case 3:
-                _func = &harris_score3x3_S16_S16_FLOAT;
-                break;
-            case 5:
-                _func = &harris_score5x5_S16_S16_FLOAT;
-                break;
-            case 7:
-                _func = &harris_score7x7_S16_S16_FLOAT;
-                break;
-            default:
-                ARM_COMPUTE_ERROR("Invalid block size");
-                break;
-        }
-    }
-    else
-    {
-        switch(block_size)
-        {
-            case 3:
-                _func = &harris_score3x3_S32_S32_FLOAT;
-                break;
-            case 5:
-                _func = &harris_score5x5_S32_S32_FLOAT;
-                break;
-            case 7:
-                _func = &harris_score7x7_S32_S32_FLOAT;
-                break;
-            default:
-                ARM_COMPUTE_ERROR("Invalid block size");
-                break;
-        }
-    }
-
-    ARM_COMPUTE_ERROR_ON(nullptr == _func);
-
-    constexpr unsigned int num_elems_processed_per_iteration = block_size != 7 ? 8 : 4;
-    constexpr unsigned int num_elems_read_per_iteration      = block_size != 7 ? 16 : 12;
-    constexpr unsigned int num_elems_written_per_iteration   = block_size != 7 ? 8 : 4;
-    constexpr unsigned int num_rows_read_per_iteration       = block_size;
-
-    // Configure kernel window
-    Window                 win = calculate_max_window(*input1->info(), Steps(num_elems_processed_per_iteration), border_undefined, border_size());
-    AccessWindowHorizontal output_access(output->info(), 0, num_elems_written_per_iteration);
-
-    update_window_and_padding(win,
-                              AccessWindowRectangle(input1->info(), -_border_size.left, -_border_size.top, num_elems_read_per_iteration, num_rows_read_per_iteration),
-                              AccessWindowRectangle(input2->info(), -_border_size.left, -_border_size.top, num_elems_read_per_iteration, num_rows_read_per_iteration),
-                              output_access);
-
-    ValidRegion valid_region = intersect_valid_regions(input1->info()->valid_region(),
-                                                       input2->info()->valid_region());
-
-    output_access.set_valid_region(win, valid_region, border_undefined, border_size());
-
-    INEKernel::configure(win);
-}
diff --git a/src/core/NEON/kernels/NEHarrisCornersKernel.h b/src/core/NEON/kernels/NEHarrisCornersKernel.h
deleted file mode 100644
index 85f80878cc..0000000000
--- a/src/core/NEON/kernels/NEHarrisCornersKernel.h
+++ /dev/null
@@ -1,105 +0,0 @@
-/*
- * Copyright (c) 2016-2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_NEHARRISCORNERSKERNEL_H
-#define ARM_COMPUTE_NEHARRISCORNERSKERNEL_H
-
-#include "arm_compute/core/CPP/kernels/CPPCornerCandidatesKernel.h"
-#include "arm_compute/core/CPP/kernels/CPPSortEuclideanDistanceKernel.h"
-#include "arm_compute/core/IArray.h"
-#include "src/core/NEON/INEKernel.h"
-
-#include <cstdint>
-
-namespace arm_compute
-{
-class ITensor;
-using IImage = ITensor;
-
-/** Common interface for all Harris Score kernels */
-class INEHarrisScoreKernel : public INEKernel
-{
-public:
-    /** Default constructor */
-    INEHarrisScoreKernel();
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    INEHarrisScoreKernel(const INEHarrisScoreKernel &) = delete;
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    INEHarrisScoreKernel &operator=(const INEHarrisScoreKernel &) = delete;
-    /** Allow instances of this class to be moved */
-    INEHarrisScoreKernel(INEHarrisScoreKernel &&) = default;
-    /** Allow instances of this class to be moved */
-    INEHarrisScoreKernel &operator=(INEHarrisScoreKernel &&) = default;
-    /** Default destructor */
-    ~INEHarrisScoreKernel() = default;
-
-public:
-    /** Setup the kernel parameters
-     *
-     * @param[in]  input1           Source image (gradient X). Data types supported: S16/S32
-     * @param[in]  input2           Source image (gradient Y). Data types supported: same as @ input1
-     * @param[out] output           Destination image (harris score). Data types supported: F32
-     * @param[in]  norm_factor      Normalization factor to use accordingly with the gradient size (Must be different from 0)
-     * @param[in]  strength_thresh  Minimum threshold with which to eliminate Harris Corner scores (computed using the normalized Sobel kernel).
-     * @param[in]  sensitivity      Sensitivity threshold k from the Harris-Stephens equation
-     * @param[in]  border_undefined True if the border mode is undefined. False if it's replicate or constant.
-     */
-    virtual void configure(const IImage *input1, const IImage *input2, IImage *output, float norm_factor, float strength_thresh, float sensitivity, bool border_undefined) = 0;
-
-protected:
-    const IImage *_input1;          /**< Source image - Gx component */
-    const IImage *_input2;          /**< Source image - Gy component */
-    IImage       *_output;          /**< Source image - Harris score */
-    float         _sensitivity;     /**< Sensitivity value */
-    float         _strength_thresh; /**< Threshold value */
-    float         _norm_factor;     /**< Normalization factor */
-    BorderSize    _border_size;     /**< Border size */
-};
-
-/** Template Neon kernel to perform Harris Score.
- *  The implementation supports 3, 5, and 7 for the block_size
- */
-template <int32_t block_size>
-class NEHarrisScoreKernel : public INEHarrisScoreKernel
-{
-public:
-    const char *name() const override
-    {
-        return "NEHarrisScoreKernel";
-    }
-    /** Default constructor */
-    NEHarrisScoreKernel();
-    // Inherited methods overridden:
-    void configure(const IImage *input1, const IImage *input2, IImage *output, float norm_factor, float strength_thresh, float sensitivity, bool border_undefined) override;
-    BorderSize border_size() const override;
-    void run(const Window &window, const ThreadInfo &info) override;
-
-private:
-    /** Common signature for all the specialised harris score functions */
-    using HarrisScoreFunction = void(const void *__restrict input1_ptr, const void *__restrict input2_ptr, void *__restrict output_ptr, int32_t input_stride,
-                                     float norm_factor, float sensitivity, float strength_thresh);
-    /** Harris Score function to use for the particular image types passed to configure() */
-    HarrisScoreFunction *_func;
-};
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_NEHARRISCORNERSKERNEL_H */
diff --git a/src/core/NEON/kernels/NEHistogramKernel.cpp b/src/core/NEON/kernels/NEHistogramKernel.cpp
deleted file mode 100644
index eddc3b29ab..0000000000
--- a/src/core/NEON/kernels/NEHistogramKernel.cpp
+++ /dev/null
@@ -1,249 +0,0 @@
-/*
- * Copyright (c) 2016-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "src/core/NEON/kernels/NEHistogramKernel.h"
-
-#include "arm_compute/core/Error.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/IDistribution1D.h"
-#include "arm_compute/core/ITensor.h"
-#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Types.h"
-#include "arm_compute/core/Window.h"
-#include "src/core/helpers/AutoConfiguration.h"
-#include "src/core/helpers/WindowHelpers.h"
-
-#include <algorithm>
-#include <arm_neon.h>
-#include <array>
-
-namespace arm_compute
-{
-class Coordinates;
-
-inline void NEHistogramKernel::merge_histogram(uint32_t *global_hist, const uint32_t *local_hist, size_t bins)
-{
-    arm_compute::lock_guard<arm_compute::Mutex> lock(_hist_mtx);
-
-    const unsigned int v_end = (bins / 4) * 4;
-
-    for(unsigned int b = 0; b < v_end; b += 4)
-    {
-        const uint32x4_t tmp_global = vld1q_u32(global_hist + b);
-        const uint32x4_t tmp_local  = vld1q_u32(local_hist + b);
-        vst1q_u32(global_hist + b, vaddq_u32(tmp_global, tmp_local));
-    }
-
-    for(unsigned int b = v_end; b < bins; ++b)
-    {
-        global_hist[b] += local_hist[b];
-    }
-}
-
-NEHistogramKernel::NEHistogramKernel()
-    : _func(nullptr), _input(nullptr), _output(nullptr), _local_hist(nullptr), _window_lut(nullptr), _hist_mtx()
-{
-}
-
-void NEHistogramKernel::histogram_U8(Window win, const ThreadInfo &info)
-{
-    ARM_COMPUTE_ERROR_ON(_output->buffer() == nullptr);
-
-    const size_t          bins       = _output->num_bins();
-    const int32_t         offset     = _output->offset();
-    const uint32_t        offrange   = offset + _output->range();
-    const uint32_t *const w_lut      = _window_lut;
-    uint32_t *const       local_hist = _local_hist + info.thread_id * bins;
-
-    // Clear local_histogram
-    std::fill_n(local_hist, bins, 0);
-
-    auto update_local_hist = [&](uint8_t p)
-    {
-        if(offset <= p && p < offrange)
-        {
-            ++local_hist[w_lut[p]];
-        }
-    };
-
-    const int x_start = win.x().start();
-    const int x_end   = win.x().end();
-
-    // Handle X dimension manually to split into two loops
-    // First one will use vector operations, second one processes the left over
-    // pixels
-    win.set(Window::DimX, Window::Dimension(0, 1, 1));
-
-    Iterator input(_input, win);
-
-    // Calculate local histogram
-    execute_window_loop(win, [&](const Coordinates &)
-    {
-        int x = x_start;
-
-        // Vector loop
-        for(; x <= x_end - 8; x += 8)
-        {
-            const uint8x8_t pixels = vld1_u8(input.ptr() + x);
-
-            update_local_hist(vget_lane_u8(pixels, 0));
-            update_local_hist(vget_lane_u8(pixels, 1));
-            update_local_hist(vget_lane_u8(pixels, 2));
-            update_local_hist(vget_lane_u8(pixels, 3));
-            update_local_hist(vget_lane_u8(pixels, 4));
-            update_local_hist(vget_lane_u8(pixels, 5));
-            update_local_hist(vget_lane_u8(pixels, 6));
-            update_local_hist(vget_lane_u8(pixels, 7));
-        }
-
-        // Process leftover pixels
-        for(; x < x_end; ++x)
-        {
-            update_local_hist(input.ptr()[x]);
-        }
-    },
-    input);
-
-    // Merge histograms
-    merge_histogram(_output->buffer(), local_hist, bins);
-}
-
-void NEHistogramKernel::histogram_fixed_U8(Window win, const ThreadInfo &info)
-{
-    ARM_COMPUTE_UNUSED(info);
-    ARM_COMPUTE_ERROR_ON(_output->buffer() == nullptr);
-
-    std::array<uint32_t, _max_range_size> local_hist{ { 0 } };
-
-    const int x_start = win.x().start();
-    const int x_end   = win.x().end();
-
-    // Handle X dimension manually to split into two loops
-    // First one will use vector operations, second one processes the left over
-    // pixels
-    win.set(Window::DimX, Window::Dimension(0, 1, 1));
-
-    Iterator input(_input, win);
-
-    // Calculate local histogram
-    execute_window_loop(win, [&](const Coordinates &)
-    {
-        int x = x_start;
-
-        // Vector loop
-        for(; x <= x_end - 8; x += 8)
-        {
-            const uint8x8_t pixels = vld1_u8(input.ptr() + x);
-
-            ++local_hist[vget_lane_u8(pixels, 0)];
-            ++local_hist[vget_lane_u8(pixels, 1)];
-            ++local_hist[vget_lane_u8(pixels, 2)];
-            ++local_hist[vget_lane_u8(pixels, 3)];
-            ++local_hist[vget_lane_u8(pixels, 4)];
-            ++local_hist[vget_lane_u8(pixels, 5)];
-            ++local_hist[vget_lane_u8(pixels, 6)];
-            ++local_hist[vget_lane_u8(pixels, 7)];
-        }
-
-        // Process leftover pixels
-        for(; x < x_end; ++x)
-        {
-            ++local_hist[input.ptr()[x]];
-        }
-    },
-    input);
-
-    // Merge histograms
-    merge_histogram(_output->buffer(), local_hist.data(), _max_range_size);
-}
-
-void NEHistogramKernel::calculate_window_lut() const
-{
-    const int32_t  offset = _output->offset();
-    const size_t   bins   = _output->num_bins();
-    const uint32_t range  = _output->range();
-
-    std::fill_n(_window_lut, offset, 0);
-
-    for(unsigned int p = offset; p < _max_range_size; ++p)
-    {
-        _window_lut[p] = ((p - offset) * bins) / range;
-    }
-}
-
-void NEHistogramKernel::configure(const IImage *input, IDistribution1D *output, uint32_t *local_hist, uint32_t *window_lut)
-{
-    ARM_COMPUTE_ERROR_ON_TENSOR_NOT_2D(input);
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
-    ARM_COMPUTE_ERROR_ON(nullptr == output);
-    ARM_COMPUTE_ERROR_ON(nullptr == local_hist);
-    ARM_COMPUTE_ERROR_ON(nullptr == window_lut);
-
-    _input      = input;
-    _output     = output;
-    _local_hist = local_hist;
-    _window_lut = window_lut;
-
-    //Check offset
-    ARM_COMPUTE_ERROR_ON_MSG(0 > _output->offset() || _output->offset() > static_cast<int32_t>(_max_range_size), "Offset is larger than the image value range.");
-
-    //Check range
-    ARM_COMPUTE_ERROR_ON_MSG(static_cast<int32_t>(_output->range()) > static_cast<int32_t>(_max_range_size) /* max range */, "Range larger than the image value range.");
-
-    // Calculate LUT
-    calculate_window_lut();
-
-    // Set appropriate function
-    _func = &NEHistogramKernel::histogram_U8;
-
-    Window win = calculate_max_window(*input->info(), Steps());
-
-    INEKernel::configure(win);
-}
-
-void NEHistogramKernel::configure(const IImage *input, IDistribution1D *output)
-{
-    ARM_COMPUTE_ERROR_ON_TENSOR_NOT_2D(input);
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
-    ARM_COMPUTE_ERROR_ON(nullptr == output);
-
-    _input  = input;
-    _output = output;
-
-    // Set appropriate function
-    _func = &NEHistogramKernel::histogram_fixed_U8;
-
-    Window win = calculate_max_window(*input->info(), Steps());
-
-    INEKernel::configure(win);
-}
-
-void NEHistogramKernel::run(const Window &window, const ThreadInfo &info)
-{
-    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
-    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
-    ARM_COMPUTE_ERROR_ON(_func == nullptr);
-
-    (this->*_func)(window, info);
-}
-} // namespace arm_compute
diff --git a/src/core/NEON/kernels/NEHistogramKernel.h b/src/core/NEON/kernels/NEHistogramKernel.h
deleted file mode 100644
index e14519ce25..0000000000
--- a/src/core/NEON/kernels/NEHistogramKernel.h
+++ /dev/null
@@ -1,135 +0,0 @@
-/*
- * Copyright (c) 2016-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_NEHISTOGRAMKERNEL_H
-#define ARM_COMPUTE_NEHISTOGRAMKERNEL_H
-
-#include "src/core/NEON/INEKernel.h"
-#include "support/Mutex.h"
-
-#include <cstddef>
-#include <cstdint>
-
-namespace arm_compute
-{
-class IDistribution1D;
-class ITensor;
-using IImage = ITensor;
-
-/** Interface for the histogram kernel */
-class NEHistogramKernel : public INEKernel
-{
-public:
-    const char *name() const override
-    {
-        return "NEHistogramKernel";
-    }
-    /** Default constructor */
-    NEHistogramKernel();
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    NEHistogramKernel(const NEHistogramKernel &) = delete;
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    NEHistogramKernel &operator=(const NEHistogramKernel &) = delete;
-    /** Prevent instances of this class from being moved (As this class contains non movable objects) */
-    NEHistogramKernel(NEHistogramKernel &&) = delete;
-    /** Prevent instances of this class from being moved (As this class contains non movable objects) */
-    NEHistogramKernel &operator=(NEHistogramKernel &&) = delete;
-    /** Default destructor */
-    ~NEHistogramKernel() = default;
-
-    /** Set the input image and the distribution output.
-     *
-     * @param[in]     input      Source image. Data type supported: U8.
-     * @param[out]    output     Destination distribution.
-     * @param[in,out] local_hist Array that the threads use to save their local histograms.
-     *                           It's size should be equal to (number_of_threads * num_bins),
-     *                           and the Window::thread_id() is used to determine the part of the array
-     *                           used by each thread.
-     * @param[out]    window_lut LUT with pre-calculated possible window values.
-     *                           The size of the LUT should be equal to max_range_size and it will be filled
-     *                           during the configure stage, while it re-used in every run, therefore can be
-     *                           safely shared among threads.
-     */
-    void configure(const IImage *input, IDistribution1D *output, uint32_t *local_hist, uint32_t *window_lut);
-    /** Set the input image and the distribution output.
-     *
-     * @note Used for histogram of fixed size equal to 256
-     *
-     * @param[in]  input  Source image. Data type supported: U8.
-     * @param[out] output Destination distribution which must be of 256 bins..
-     */
-    void configure(const IImage *input, IDistribution1D *output);
-
-    // Inherited methods overridden:
-    void run(const Window &window, const ThreadInfo &info) override;
-
-private:
-    /** Function to merge multiple partial histograms.
-     *
-     * @param[out] global_hist Pointer to the final histogram.
-     * @param[in]  local_hist  Pointer to the partial histograms.
-     * @param[in]  bins        Number of bins.
-     */
-    void merge_histogram(uint32_t *global_hist, const uint32_t *local_hist, size_t bins);
-    /** Function to merge multiple minimum values of partial histograms.
-     *
-     * @param[out] global_min Pointer to the global min value.
-     * @param[in]  local_min  Local min value.
-     */
-    void merge_min(uint8_t *global_min, const uint8_t &local_min);
-    /** Function to perform histogram on the given window
-     *
-     * @param[in] win  Region on which to execute the kernel
-     * @param[in] info Info about the executing thread
-     */
-    void histogram_U8(Window win, const ThreadInfo &info);
-    /** Function to perform histogram on the given window where histogram is
-     *         of fixed size 256 without ranges and offsets.
-     *
-     * @param[in] win  Region on which to execute the kernel
-     * @param[in] info Info about the executing thread
-     */
-    void histogram_fixed_U8(Window win, const ThreadInfo &info);
-    /** Pre-calculate the pixel windowing for every possible pixel
-     *
-     * Calculate (V - offset) * numBins / range where V is every possible pixel value.
-     *
-     * @note We currently support U8 image thus possible pixel values are between 0 and 255
-     */
-    void calculate_window_lut() const;
-    /** Common signature for all the specialised Histogram functions
-     *
-     * @param[in] window Region on which to execute the kernel.
-     */
-    using HistogramFunctionPtr = void (NEHistogramKernel::*)(Window window, const ThreadInfo &info);
-
-    HistogramFunctionPtr          _func; ///< Histogram function to use for the particular image types passed to configure()
-    const IImage                 *_input;
-    IDistribution1D              *_output;
-    uint32_t                     *_local_hist;
-    uint32_t                     *_window_lut;
-    arm_compute::Mutex            _hist_mtx;
-    static constexpr unsigned int _max_range_size{ 256 }; ///< 256 possible pixel values as we handle only U8 images
-};
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_NEHISTOGRAMKERNEL_H */
diff --git a/src/core/NEON/kernels/NEIntegralImageKernel.cpp b/src/core/NEON/kernels/NEIntegralImageKernel.cpp
deleted file mode 100644
index 6ee97eea30..0000000000
--- a/src/core/NEON/kernels/NEIntegralImageKernel.cpp
+++ /dev/null
@@ -1,144 +0,0 @@
-/*
- * Copyright (c) 2016-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "src/core/NEON/kernels/NEIntegralImageKernel.h"
-
-#include "arm_compute/core/Coordinates.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/ITensor.h"
-#include "arm_compute/core/Types.h"
-#include "arm_compute/core/Validate.h"
-#include "src/core/helpers/AutoConfiguration.h"
-#include "src/core/helpers/WindowHelpers.h"
-
-#include <arm_neon.h>
-#include <cstddef>
-#include <cstdint>
-
-using namespace arm_compute;
-
-void NEIntegralImageKernel::configure(const ITensor *input, ITensor *output)
-{
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U32);
-
-    _input  = input;
-    _output = output;
-
-    constexpr unsigned int num_elems_processed_per_iteration = 16;
-
-    // Configure kernel window
-    Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration));
-    // The kernel is effectively reading 17 values from -1 as it loads 16
-    // starting at -1 and also 16 starting at 0
-    AccessWindowRectangle  output_read_access(output->info(), -1, -1, num_elems_processed_per_iteration + 1, 1);
-    AccessWindowHorizontal output_write_access(output->info(), 0, num_elems_processed_per_iteration);
-
-    update_window_and_padding(win,
-                              AccessWindowHorizontal(input->info(), 0, num_elems_processed_per_iteration),
-                              output_read_access, output_write_access);
-
-    output_write_access.set_valid_region(win, input->info()->valid_region());
-
-    IKernel::configure(win);
-}
-
-BorderSize NEIntegralImageKernel::border_size() const
-{
-    return BorderSize{ 1, 0, 0, 1 };
-}
-
-bool NEIntegralImageKernel::is_parallelisable() const
-{
-    return false;
-}
-
-void NEIntegralImageKernel::run(const Window &window, const ThreadInfo &info)
-{
-    ARM_COMPUTE_UNUSED(info);
-    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
-    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INESimpleKernel::window(), window);
-
-    Iterator input(_input, window);
-    Iterator output(_output, window);
-
-    const auto output_top_left = reinterpret_cast<const uint32_t *>(_output->ptr_to_element(Coordinates(-1, -1)));
-    const auto output_top_mid  = reinterpret_cast<const uint32_t *>(_output->ptr_to_element(Coordinates(0, -1)));
-
-    execute_window_loop(window, [&](const Coordinates &)
-    {
-        const uint8x16_t input_pixels = vld1q_u8(input.ptr());
-
-        const uint16x8x2_t tmp =
-        {
-            {
-                vmovl_u8(vget_low_u8(input_pixels)),
-                vmovl_u8(vget_high_u8(input_pixels))
-            }
-        };
-
-        uint32x4x4_t pixels =
-        {
-            {
-                vmovl_u16(vget_low_u16(tmp.val[0])),
-                vmovl_u16(vget_high_u16(tmp.val[0])),
-                vmovl_u16(vget_low_u16(tmp.val[1])),
-                vmovl_u16(vget_high_u16(tmp.val[1]))
-            }
-        };
-
-        // Divide by four as pointer is now uint32 instead of uint8!
-        const size_t off = output.offset() / 4;
-
-        // Add top mid pixel values
-        const uint32_t *const top_mid_ptr = output_top_mid + off;
-
-        pixels.val[0] = vaddq_u32(vld1q_u32(top_mid_ptr), pixels.val[0]);
-        pixels.val[1] = vaddq_u32(vld1q_u32(top_mid_ptr + 4), pixels.val[1]);
-        pixels.val[2] = vaddq_u32(vld1q_u32(top_mid_ptr + 8), pixels.val[2]);
-        pixels.val[3] = vaddq_u32(vld1q_u32(top_mid_ptr + 12), pixels.val[3]);
-
-        // Subtract top left diagonal values
-        const auto            outptr       = reinterpret_cast<uint32_t *>(output.ptr());
-        const uint32_t *const top_left_ptr = output_top_left + off;
-
-        pixels.val[0] = vsubq_u32(pixels.val[0], vld1q_u32(top_left_ptr));
-        vst1q_u32(outptr, pixels.val[0]);
-
-        pixels.val[1] = vsubq_u32(pixels.val[1], vld1q_u32(top_left_ptr + 4));
-        vst1q_u32(outptr + 4, pixels.val[1]);
-
-        pixels.val[2] = vsubq_u32(pixels.val[2], vld1q_u32(top_left_ptr + 8));
-        vst1q_u32(outptr + 8, pixels.val[2]);
-
-        pixels.val[3] = vsubq_u32(pixels.val[3], vld1q_u32(top_left_ptr + 12));
-        vst1q_u32(outptr + 12, pixels.val[3]);
-
-        // Perform prefix summation
-        for(auto i = 0; i < 16; ++i)
-        {
-            outptr[i] += outptr[i - 1];
-        }
-    },
-    input, output);
-}
diff --git a/src/core/NEON/kernels/NEIntegralImageKernel.h b/src/core/NEON/kernels/NEIntegralImageKernel.h
deleted file mode 100644
index 8d92504317..0000000000
--- a/src/core/NEON/kernels/NEIntegralImageKernel.h
+++ /dev/null
@@ -1,66 +0,0 @@
-/*
- * Copyright (c) 2016-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_NEINTEGRALIMAGEKERNEL_H
-#define ARM_COMPUTE_NEINTEGRALIMAGEKERNEL_H
-
-#include "src/core/NEON/INESimpleKernel.h"
-
-namespace arm_compute
-{
-class ITensor;
-
-/** Kernel to perform an image integral on an image */
-class NEIntegralImageKernel : public INESimpleKernel
-{
-public:
-    const char *name() const override
-    {
-        return "NEIntegralImageKernel";
-    }
-    /** Default constructor */
-    NEIntegralImageKernel() = default;
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    NEIntegralImageKernel(const NEIntegralImageKernel &) = delete;
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    NEIntegralImageKernel &operator=(const NEIntegralImageKernel &) = delete;
-    /** Prevent instances of this class from being moved (As this class contains non movable objects) */
-    NEIntegralImageKernel(NEIntegralImageKernel &&) = delete;
-    /** Prevent instances of this class from being moved (As this class contains non movable objects) */
-    NEIntegralImageKernel &operator=(NEIntegralImageKernel &&) = delete;
-    /** Default destructor */
-    ~NEIntegralImageKernel() = default;
-    /** Set the source, destination and border mode of the kernel
-     *
-     * @param[in]  input  Source tensor. Data type supported: U8
-     * @param[out] output Destination tensor. Data type supported: U32
-     */
-    void configure(const ITensor *input, ITensor *output);
-
-    // Inherited methods overridden:
-    void run(const Window &window, const ThreadInfo &info) override;
-    BorderSize border_size() const override;
-    bool       is_parallelisable() const override;
-};
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_NEINTEGRALIMAGEKERNEL_H */
diff --git a/src/core/NEON/kernels/NEMagnitudePhaseKernel.cpp b/src/core/NEON/kernels/NEMagnitudePhaseKernel.cpp
deleted file mode 100644
index 205f67823d..0000000000
--- a/src/core/NEON/kernels/NEMagnitudePhaseKernel.cpp
+++ /dev/null
@@ -1,490 +0,0 @@
-/*
- * Copyright (c) 2016-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "src/core/NEON/kernels/NEMagnitudePhaseKernel.h"
-
-#include "arm_compute/core/Error.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/IAccessWindow.h"
-#include "arm_compute/core/ITensor.h"
-#include "arm_compute/core/Validate.h"
-#include "src/core/helpers/AutoConfiguration.h"
-#include "src/core/helpers/WindowHelpers.h"
-
-#include <arm_neon.h>
-#include <cstdint>
-
-using namespace arm_compute;
-
-namespace arm_compute
-{
-class Coordinates;
-} // namespace arm_compute
-
-namespace
-{
-// Defines for computing atan2
-constexpr float SCALE_FACTOR = 0.7111111111111111f;
-constexpr float PI           = 3.141592653589793f;
-constexpr float SCALE_180    = 180.0f / PI;
-constexpr float SCALE_360    = SCALE_180 * SCALE_FACTOR;
-constexpr float PI_4         = 0.7853981633974483f;
-constexpr float COEFF1       = 0.0663f;
-constexpr float COEFF2       = 0.2447f;
-} // namespace
-
-namespace
-{
-inline float32x4_t inv(float32x4_t x)
-{
-    float32x4_t result = vrecpeq_f32(x);
-    result             = vmulq_f32(vrecpsq_f32(x, result), result);
-    return result;
-}
-
-inline float32x4_t atan2_0_360(float32x4_t gx, float32x4_t gy)
-{
-    const float32x4_t zero       = vdupq_n_f32(0.0f);
-    const float32x4_t epsilon    = vdupq_n_f32(1e-9f);
-    const float32x4_t piover4    = vdupq_n_f32(PI_4);
-    const float32x4_t coeff1     = vdupq_n_f32(COEFF1);
-    const float32x4_t coeff2     = vdupq_n_f32(COEFF2);
-    const float32x4_t ninety     = vdupq_n_f32(90.0f * SCALE_FACTOR);
-    const float32x4_t oneeighty  = vdupq_n_f32(180.0f * SCALE_FACTOR);
-    const float32x4_t threesixty = vdupq_n_f32(360.0f * SCALE_FACTOR);
-    const float32x4_t scale      = vdupq_n_f32(SCALE_360);
-
-    float32x4_t abs_gx = vabsq_f32(gx);
-    float32x4_t abs_gy = vabsq_f32(gy);
-    float32x4_t tmin   = vminq_f32(abs_gx, abs_gy);
-    float32x4_t tmax   = vmaxq_f32(abs_gx, abs_gy);
-    float32x4_t z      = vmulq_f32(tmin, inv(vaddq_f32(tmax, epsilon)));
-    float32x4_t absz   = vabsq_f32(z);
-    float32x4_t term   = vmulq_f32(z, vsubq_f32(vdupq_n_f32(1.0f), absz));
-
-    /* Compute y = pi/4 * x - x*(abs(x)-1)*(0.2447+0.0663 * abs(x) */
-    float32x4_t result = vaddq_f32(coeff2, vmulq_f32(absz, coeff1));
-    result             = vmulq_f32(result, term);
-    result             = vmlaq_f32(result, piover4, z);
-
-    /* Radians to degrees conversion with applied a scale factor in order to have the result [0, 255]  */
-    result = vmulq_f32(result, scale);
-
-    /* If z > 1, result = 90 - result */
-    result = vbslq_f32(vcgeq_f32(abs_gx, abs_gy), result, vsubq_f32(ninety, result));
-
-    /* Choose correct quadrant */
-    result = vbslq_f32(vcltq_f32(gx, zero), vsubq_f32(oneeighty, result), result);
-    result = vbslq_f32(vcltq_f32(gy, zero), vsubq_f32(threesixty, result), result);
-
-    return result;
-}
-
-inline float32x4_t atan2_0_180(float32x4_t gx, float32x4_t gy)
-{
-    const float32x4_t zero       = vdupq_n_f32(0.0f);
-    const float32x4_t epsilon    = vdupq_n_f32(1e-9f); // epsilon used to avoiding division by 0
-    const float32x4_t piover4    = vdupq_n_f32(PI_4);
-    const float32x4_t coeff1     = vdupq_n_f32(COEFF1);
-    const float32x4_t coeff2     = vdupq_n_f32(COEFF2);
-    const float32x4_t ninety     = vdupq_n_f32(90.0f);
-    const float32x4_t oneeighty  = vdupq_n_f32(180.0f);
-    const float32x4_t threesixty = vdupq_n_f32(360.0f);
-    const float32x4_t scale      = vdupq_n_f32(SCALE_180);
-
-    float32x4_t abs_gx = vabsq_f32(gx);
-    float32x4_t abs_gy = vabsq_f32(gy);
-    float32x4_t tmin   = vminq_f32(abs_gx, abs_gy);
-    float32x4_t tmax   = vmaxq_f32(abs_gx, abs_gy);
-    float32x4_t z      = vmulq_f32(tmin, inv(vaddq_f32(tmax, epsilon)));
-    float32x4_t absz   = vabsq_f32(z);
-
-    /* Compute y = pi/4 * z - z*(abs(z)-1)*(0.2447+0.0663 * abs(z) */
-    float32x4_t term   = vmulq_f32(z, vsubq_f32(vdupq_n_f32(1.0f), absz));
-    float32x4_t result = vaddq_f32(coeff2, vmulq_f32(absz, coeff1));
-    result             = vmulq_f32(result, term);
-    result             = vmlaq_f32(result, piover4, z);
-
-    /* Radians to degrees conversion */
-    result = vmulq_f32(result, scale);
-
-    /* If z > 1, result = 90 - result */
-    result = vbslq_f32(vcgeq_f32(abs_gx, abs_gy), result, vsubq_f32(ninety, result));
-
-    /* Choose correct quadrant */
-    result = vbslq_f32(vcltq_f32(gx, zero), vsubq_f32(oneeighty, result), result);
-    result = vbslq_f32(vcltq_f32(gy, zero), vsubq_f32(threesixty, result), result);
-    result = vbslq_f32(vcgtq_f32(result, oneeighty), vsubq_f32(result, oneeighty), result);
-
-    return result;
-}
-
-inline float32x4_t invsqrtv(float32x4_t x)
-{
-    float32x4_t sqrt_reciprocal = vrsqrteq_f32(x);
-
-    sqrt_reciprocal = vmulq_f32(vrsqrtsq_f32(vmulq_f32(x, sqrt_reciprocal), sqrt_reciprocal),
-                                sqrt_reciprocal);
-    sqrt_reciprocal = vmulq_f32(vrsqrtsq_f32(vmulq_f32(x, sqrt_reciprocal), sqrt_reciprocal),
-                                sqrt_reciprocal);
-
-    return sqrt_reciprocal;
-}
-
-inline float32x4_t sqrtv(float32x4_t x)
-{
-    float32x4_t res = vdupq_n_f32(0.5f);
-    return vmlaq_f32(res, x, invsqrtv(x));
-}
-
-inline int16x8_t magnitude_l2(int16x8_t input1, int16x8_t input2)
-{
-    const int32x4x2_t square_x =
-    {
-        {
-            vmull_s16(vget_low_s16(input1), vget_low_s16(input1)),
-            vmull_s16(vget_high_s16(input1), vget_high_s16(input1))
-        }
-    };
-
-    const int32x4x2_t square_y =
-    {
-        {
-            vmull_s16(vget_low_s16(input2), vget_low_s16(input2)),
-            vmull_s16(vget_high_s16(input2), vget_high_s16(input2))
-        }
-    };
-
-    const uint32x4x2_t sum =
-    {
-        {
-            vaddq_u32(vreinterpretq_u32_s32(square_x.val[0]), vreinterpretq_u32_s32(square_y.val[0])),
-            vaddq_u32(vreinterpretq_u32_s32(square_x.val[1]), vreinterpretq_u32_s32(square_y.val[1]))
-        }
-    };
-
-    const float32x4x2_t res =
-    {
-        {
-            sqrtv(vcvtq_f32_u32(sum.val[0])),
-            sqrtv(vcvtq_f32_u32(sum.val[1]))
-        }
-    };
-
-    return vcombine_s16(vqmovn_s32(vcvtq_s32_f32(res.val[0])),
-                        vqmovn_s32(vcvtq_s32_f32(res.val[1])));
-}
-
-inline int16x8_t magnitude_l1(int16x8_t input1, int16x8_t input2)
-{
-    /* Saturating add */
-    return vqaddq_s16(vqabsq_s16(input1), vqabsq_s16(input2));
-}
-
-inline uint8x8_t phase_signed(int16x8_t input1, int16x8_t input2)
-{
-    const float32x4_t zeropointfive = vdupq_n_f32(0.5f);
-
-    float32x4_t inputx_f32_high = vcvtq_f32_s32(vmovl_s16(vget_high_s16(input1)));
-    float32x4_t inputx_f32_low  = vcvtq_f32_s32(vmovl_s16(vget_low_s16(input1)));
-    float32x4_t inputy_f32_high = vcvtq_f32_s32(vmovl_s16(vget_high_s16(input2)));
-    float32x4_t inputy_f32_low  = vcvtq_f32_s32(vmovl_s16(vget_low_s16(input2)));
-
-    /* Compute fast atan2 */
-    float32x4_t angle_high = atan2_0_360(inputx_f32_high, inputy_f32_high);
-    float32x4_t angle_low  = atan2_0_360(inputx_f32_low, inputy_f32_low);
-
-    angle_high = vaddq_f32(angle_high, zeropointfive);
-    angle_low  = vaddq_f32(angle_low, zeropointfive);
-
-    return vmovn_u16(vcombine_u16(vqmovun_s32(vcvtq_s32_f32(angle_low)),
-                                  vqmovun_s32(vcvtq_s32_f32(angle_high))));
-}
-
-inline uint8x8_t phase_unsigned(int16x8_t input1, int16x8_t input2)
-{
-    const float32x4_t zeropointfive = vdupq_n_f32(0.5f);
-
-    float32x4_t inputx_f32_high = vcvtq_f32_s32(vmovl_s16(vget_high_s16(input1)));
-    float32x4_t inputx_f32_low  = vcvtq_f32_s32(vmovl_s16(vget_low_s16(input1)));
-    float32x4_t inputy_f32_high = vcvtq_f32_s32(vmovl_s16(vget_high_s16(input2)));
-    float32x4_t inputy_f32_low  = vcvtq_f32_s32(vmovl_s16(vget_low_s16(input2)));
-
-    /* Compute fast atan2 */
-    float32x4_t angle_high = atan2_0_180(inputx_f32_high, inputy_f32_high);
-    float32x4_t angle_low  = atan2_0_180(inputx_f32_low, inputy_f32_low);
-
-    angle_high = vaddq_f32(angle_high, zeropointfive);
-    angle_low  = vaddq_f32(angle_low, zeropointfive);
-
-    return vmovn_u16(vcombine_u16(vqmovun_s32(vcvtq_s32_f32(angle_low)),
-                                  vqmovun_s32(vcvtq_s32_f32(angle_high))));
-}
-} // namespace
-
-template <MagnitudeType mag_type, PhaseType phase_type>
-NEMagnitudePhaseKernel<mag_type, phase_type>::NEMagnitudePhaseKernel()
-    : _func(nullptr), _gx(nullptr), _gy(nullptr), _magnitude(nullptr), _phase(nullptr)
-{
-}
-
-template <MagnitudeType mag_type, PhaseType phase_type>
-void NEMagnitudePhaseKernel<mag_type, phase_type>::configure(const ITensor *gx, const ITensor *gy, ITensor *magnitude, ITensor *phase)
-{
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(gx, 1, DataType::S16);
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(gy, 1, DataType::S16);
-    ARM_COMPUTE_ERROR_ON((nullptr == magnitude) && (nullptr == phase));
-
-    const bool run_mag   = magnitude != nullptr;
-    const bool run_phase = phase != nullptr;
-
-    if(run_mag)
-    {
-        ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(magnitude, 1, DataType::S16);
-    }
-
-    if(run_phase)
-    {
-        ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(phase, 1, DataType::U8);
-    }
-
-    _gx        = gx;
-    _gy        = gy;
-    _magnitude = magnitude;
-    _phase     = phase;
-
-    if(run_mag && run_phase)
-    {
-        /* Run magnitude and phase */
-        _func = &NEMagnitudePhaseKernel<mag_type, phase_type>::magnitude_phase;
-    }
-    else
-    {
-        if(run_mag)
-        {
-            /* Run magnitude */
-            _func = &NEMagnitudePhaseKernel<mag_type, phase_type>::magnitude;
-        }
-        else if(run_phase)
-        {
-            /* Run phase */
-            _func = &NEMagnitudePhaseKernel<mag_type, phase_type>::phase;
-        }
-        else
-        {
-            ARM_COMPUTE_ERROR("At least one output must be NOT NULL");
-        }
-    }
-
-    constexpr unsigned int num_elems_processed_per_iteration = 16;
-
-    // Configure kernel window
-    Window                 win = calculate_max_window(*gx->info(), Steps(num_elems_processed_per_iteration));
-    AccessWindowHorizontal magnitude_access(magnitude == nullptr ? nullptr : magnitude->info(), 0, num_elems_processed_per_iteration);
-    AccessWindowHorizontal phase_access(phase == nullptr ? nullptr : phase->info(), 0, num_elems_processed_per_iteration);
-
-    update_window_and_padding(win,
-                              AccessWindowHorizontal(gx->info(), 0, num_elems_processed_per_iteration),
-                              AccessWindowHorizontal(gy->info(), 0, num_elems_processed_per_iteration),
-                              magnitude_access,
-                              phase_access);
-
-    ValidRegion valid_region = intersect_valid_regions(gx->info()->valid_region(),
-                                                       gy->info()->valid_region());
-
-    magnitude_access.set_valid_region(win, valid_region);
-    phase_access.set_valid_region(win, valid_region);
-
-    INEKernel::configure(win);
-}
-
-template <MagnitudeType mag_type, PhaseType phase_type>
-void NEMagnitudePhaseKernel<mag_type, phase_type>::magnitude(const Window &window)
-{
-    Iterator gx(_gx, window);
-    Iterator gy(_gy, window);
-    Iterator magnitude(_magnitude, window);
-
-    execute_window_loop(window, [&](const Coordinates &)
-    {
-        const int16x8x2_t input1 =
-        {
-            {
-                vld1q_s16(reinterpret_cast<int16_t *>(gx.ptr())),
-                vld1q_s16(reinterpret_cast<int16_t *>(gx.ptr()) + 8)
-            }
-        };
-
-        const int16x8x2_t input2 =
-        {
-            {
-                vld1q_s16(reinterpret_cast<int16_t *>(gy.ptr())),
-                vld1q_s16(reinterpret_cast<int16_t *>(gy.ptr()) + 8)
-            }
-        };
-
-        /* Compute magnitude */
-        int16x8x2_t mag{ {} };
-
-        if(MagnitudeType::L2NORM == mag_type)
-        {
-            mag.val[0] = magnitude_l2(input1.val[0], input2.val[0]);
-            mag.val[1] = magnitude_l2(input1.val[1], input2.val[1]);
-        }
-        else
-        {
-            mag.val[0] = magnitude_l1(input1.val[0], input2.val[0]);
-            mag.val[1] = magnitude_l1(input1.val[1], input2.val[1]);
-        }
-
-        /* Store magnitude */
-        vst1q_s16(reinterpret_cast<int16_t *>(magnitude.ptr()), mag.val[0]);
-        vst1q_s16(reinterpret_cast<int16_t *>(magnitude.ptr()) + 8, mag.val[1]);
-    },
-    gx, gy, magnitude);
-}
-
-template <MagnitudeType mag_type, PhaseType phase_type>
-void NEMagnitudePhaseKernel<mag_type, phase_type>::phase(const Window &window)
-{
-    Iterator gx(_gx, window);
-    Iterator gy(_gy, window);
-    Iterator phase(_phase, window);
-
-    execute_window_loop(window, [&](const Coordinates &)
-    {
-        const int16x8x2_t input1 =
-        {
-            {
-                vld1q_s16(reinterpret_cast<int16_t *>(gx.ptr())),
-                vld1q_s16(reinterpret_cast<int16_t *>(gx.ptr()) + 8)
-            }
-        };
-
-        const int16x8x2_t input2 =
-        {
-            {
-                vld1q_s16(reinterpret_cast<int16_t *>(gy.ptr())),
-                vld1q_s16(reinterpret_cast<int16_t *>(gy.ptr()) + 8)
-            }
-        };
-
-        /* Compute phase */
-        uint8x8x2_t vphase{ {} };
-
-        if(PhaseType::SIGNED == phase_type)
-        {
-            vphase.val[0] = phase_signed(input1.val[0], input2.val[0]);
-            vphase.val[1] = phase_signed(input1.val[1], input2.val[1]);
-        }
-        else
-        {
-            vphase.val[0] = phase_unsigned(input1.val[0], input2.val[0]);
-            vphase.val[1] = phase_unsigned(input1.val[1], input2.val[1]);
-        }
-
-        /* Store phase */
-        vst1q_u8(phase.ptr(), vcombine_u8(vphase.val[0], vphase.val[1]));
-    },
-    gx, gy, phase);
-}
-
-template <MagnitudeType mag_type, PhaseType phase_type>
-void NEMagnitudePhaseKernel<mag_type, phase_type>::magnitude_phase(const Window &window)
-{
-    Iterator gx(_gx, window);
-    Iterator gy(_gy, window);
-    Iterator magnitude(_magnitude, window);
-    Iterator phase(_phase, window);
-
-    execute_window_loop(window, [&](const Coordinates &)
-    {
-        const int16x8x2_t input1 =
-        {
-            {
-                vld1q_s16(reinterpret_cast<int16_t *>(gx.ptr())),
-                vld1q_s16(reinterpret_cast<int16_t *>(gx.ptr()) + 8)
-            }
-        };
-
-        const int16x8x2_t input2 =
-        {
-            {
-                vld1q_s16(reinterpret_cast<int16_t *>(gy.ptr())),
-                vld1q_s16(reinterpret_cast<int16_t *>(gy.ptr()) + 8)
-            }
-        };
-
-        /* Compute magnitude */
-        int16x8x2_t mag{ {} };
-
-        if(MagnitudeType::L2NORM == mag_type)
-        {
-            mag.val[0] = magnitude_l2(input1.val[0], input2.val[0]);
-            mag.val[1] = magnitude_l2(input1.val[1], input2.val[1]);
-        }
-        else
-        {
-            mag.val[0] = magnitude_l1(input1.val[0], input2.val[0]);
-            mag.val[1] = magnitude_l1(input1.val[1], input2.val[1]);
-        }
-
-        /* Store magnitude */
-        vst1q_s16(reinterpret_cast<int16_t *>(magnitude.ptr()), mag.val[0]);
-        vst1q_s16(reinterpret_cast<int16_t *>(magnitude.ptr()) + 8, mag.val[1]);
-
-        /* Compute phase */
-        uint8x8x2_t vphase{ {} };
-
-        if(PhaseType::SIGNED == phase_type)
-        {
-            vphase.val[0] = phase_signed(input1.val[0], input2.val[0]);
-            vphase.val[1] = phase_signed(input1.val[1], input2.val[1]);
-        }
-        else
-        {
-            vphase.val[0] = phase_unsigned(input1.val[0], input2.val[0]);
-            vphase.val[1] = phase_unsigned(input1.val[1], input2.val[1]);
-        }
-
-        /* Store phase */
-        vst1q_u8(phase.ptr(), vcombine_u8(vphase.val[0], vphase.val[1]));
-    },
-    gx, gy, magnitude, phase);
-}
-
-template <MagnitudeType mag_type, PhaseType phase_type>
-void NEMagnitudePhaseKernel<mag_type, phase_type>::run(const Window &window, const ThreadInfo &info)
-{
-    ARM_COMPUTE_UNUSED(info);
-    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
-    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
-    ARM_COMPUTE_ERROR_ON(_func == nullptr);
-
-    (this->*_func)(window);
-}
-
-template class arm_compute::NEMagnitudePhaseKernel<MagnitudeType::L1NORM, PhaseType::SIGNED>;
-template class arm_compute::NEMagnitudePhaseKernel<MagnitudeType::L2NORM, PhaseType::SIGNED>;
-template class arm_compute::NEMagnitudePhaseKernel<MagnitudeType::L1NORM, PhaseType::UNSIGNED>;
-template class arm_compute::NEMagnitudePhaseKernel<MagnitudeType::L2NORM, PhaseType::UNSIGNED>;
diff --git a/src/core/NEON/kernels/NEMagnitudePhaseKernel.h b/src/core/NEON/kernels/NEMagnitudePhaseKernel.h
deleted file mode 100644
index 3803d05ce9..0000000000
--- a/src/core/NEON/kernels/NEMagnitudePhaseKernel.h
+++ /dev/null
@@ -1,101 +0,0 @@
-/*
- * Copyright (c) 2016-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_NEMAGNITUDEPHASEKERNEL_H
-#define ARM_COMPUTE_NEMAGNITUDEPHASEKERNEL_H
-
-#include "arm_compute/core/Types.h"
-#include "src/core/NEON/INEKernel.h"
-
-namespace arm_compute
-{
-class ITensor;
-
-/** Template interface for the kernel to compute magnitude and phase */
-template <MagnitudeType mag_type, PhaseType phase_type>
-class NEMagnitudePhaseKernel : public INEKernel
-{
-public:
-    const char *name() const override
-    {
-        return "NEMagnitudePhaseKernel";
-    }
-    /** Default constructor */
-    NEMagnitudePhaseKernel();
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    NEMagnitudePhaseKernel(const NEMagnitudePhaseKernel &) = delete;
-    /** Default move constructor */
-    NEMagnitudePhaseKernel(NEMagnitudePhaseKernel &&) = default;
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    NEMagnitudePhaseKernel &operator=(const NEMagnitudePhaseKernel &) = delete;
-    /** Default move assignment operator */
-    NEMagnitudePhaseKernel &operator=(NEMagnitudePhaseKernel &&) = default;
-    /** Destructor */
-    ~NEMagnitudePhaseKernel() = default;
-
-    /** Initialise the kernel's input, output.
-     *
-     * @note At least one of out1 or out2 must be set
-     *
-     * @param[in]  gx        Gradient X tensor. Data type supported: S16.
-     * @param[in]  gy        Gradient Y tensor. Data type supported: S16.
-     * @param[out] magnitude (Optional) The output tensor - Magnitude. Data type supported: S16.
-     * @param[out] phase     (Optional) The output tensor - Phase. Data type supported: U8.
-     */
-    void configure(const ITensor *gx, const ITensor *gy, ITensor *magnitude, ITensor *phase);
-
-    // Inherited methods overridden:
-    void run(const Window &window, const ThreadInfo &info) override;
-
-private:
-    /** Function to perform magnitude on the given window
-     *
-     * @param[in] window Region on which to execute the kernel
-     */
-    void magnitude(const Window &window);
-    /** Function to perform phase on the given window
-     *
-     * @param[in] window Region on which to execute the kernel
-     */
-    void phase(const Window &window);
-    /** Function to perform magnitude and phase on the given window
-     *
-     * @param[in] window Region on which to execute the kernel
-     */
-    void magnitude_phase(const Window &window);
-
-private:
-    /** Common signature for all the specialised MagnitudePhase functions
-     *
-     * @param[in] window Region on which to execute the kernel.
-     */
-    using MagnitudePhaseFunctionPtr = void (NEMagnitudePhaseKernel::*)(const Window &window);
-    /** MagnitudePhase function to use for the particular formats passed to configure() */
-    MagnitudePhaseFunctionPtr _func;
-    const ITensor            *_gx;        /**< Input gradient X */
-    const ITensor            *_gy;        /**< Input gradient Y */
-    ITensor                  *_magnitude; /**< Output - Magnitude */
-    ITensor                  *_phase;     /**< Output - Phase */
-};
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_NEMAGNITUDEPHASEKERNEL_H */
diff --git a/src/core/NEON/kernels/NEMeanStdDevKernel.cpp b/src/core/NEON/kernels/NEMeanStdDevKernel.cpp
deleted file mode 100644
index a6bb9f2ef7..0000000000
--- a/src/core/NEON/kernels/NEMeanStdDevKernel.cpp
+++ /dev/null
@@ -1,162 +0,0 @@
-/*
- * Copyright (c) 2016-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "src/core/NEON/kernels/NEMeanStdDevKernel.h"
-
-#include "arm_compute/core/Error.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/IAccessWindow.h"
-#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Types.h"
-#include "arm_compute/core/Validate.h"
-#include "src/core/helpers/AutoConfiguration.h"
-#include "src/core/helpers/WindowHelpers.h"
-
-#include <arm_neon.h>
-#include <cmath>
-#include <tuple>
-#include <utility>
-
-using namespace arm_compute;
-
-namespace arm_compute
-{
-class Coordinates;
-} // namespace arm_compute
-
-namespace
-{
-template <bool calc_sum_squared>
-std::pair<uint64x1_t, uint64x1_t> accumulate(const Window &window, Iterator &iterator)
-{
-    uint64x1_t sum         = vdup_n_u64(0);
-    uint64x1_t sum_squared = vdup_n_u64(0);
-
-    // Calculate sum
-    execute_window_loop(window, [&](const Coordinates &)
-    {
-        const uint8x16_t in_data = vld1q_u8(iterator.ptr());
-
-        // Sum of the low and high elements of data
-        const uint16x8_t tmp0 = vaddl_u8(vget_low_u8(in_data), vget_high_u8(in_data));
-        const uint32x4_t tmp1 = vaddl_u16(vget_low_u16(tmp0), vget_high_u16(tmp0));
-        const uint32x2_t tmp2 = vadd_u32(vget_low_u32(tmp1), vget_high_u32(tmp1));
-
-        // Update sum
-        sum = vpadal_u32(sum, tmp2);
-
-        if(calc_sum_squared)
-        {
-            const uint16x8_t square_data_low  = vmull_u8(vget_low_u8(in_data), vget_low_u8(in_data));
-            const uint16x8_t square_data_high = vmull_u8(vget_high_u8(in_data), vget_high_u8(in_data));
-
-            // Sum of the low and high elements of data
-            const uint32x4_t tmp0_low  = vaddl_u16(vget_low_u16(square_data_low), vget_high_u16(square_data_low));
-            const uint32x4_t tmp0_high = vaddl_u16(vget_low_u16(square_data_high), vget_high_u16(square_data_high));
-            const uint32x4_t tmp1      = vaddq_u32(tmp0_low, tmp0_high);
-            const uint32x2_t tmp2      = vadd_u32(vget_low_u32(tmp1), vget_high_u32(tmp1));
-
-            // Update sum
-            sum_squared = vpadal_u32(sum_squared, tmp2);
-        }
-    },
-    iterator);
-
-    return std::make_pair(sum, sum_squared);
-}
-} // namespace
-
-NEMeanStdDevKernel::NEMeanStdDevKernel()
-    : _input(nullptr), _mean(nullptr), _stddev(nullptr), _global_sum(nullptr), _global_sum_squared(nullptr), _mtx(), _border_size(0)
-{
-}
-
-BorderSize NEMeanStdDevKernel::border_size() const
-{
-    return _border_size;
-}
-
-void NEMeanStdDevKernel::configure(const IImage *input, float *mean, uint64_t *global_sum, float *stddev, uint64_t *global_sum_squared)
-{
-    ARM_COMPUTE_ERROR_ON_TENSOR_NOT_2D(input);
-    ARM_COMPUTE_ERROR_ON(nullptr == mean);
-    ARM_COMPUTE_ERROR_ON(nullptr == global_sum);
-    ARM_COMPUTE_ERROR_ON(stddev && nullptr == global_sum_squared);
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
-
-    _input              = input;
-    _mean               = mean;
-    _stddev             = stddev;
-    _global_sum         = global_sum;
-    _global_sum_squared = global_sum_squared;
-
-    constexpr unsigned int num_elems_processed_per_iteration = 16;
-
-    _border_size = BorderSize(ceil_to_multiple(input->info()->dimension(0), num_elems_processed_per_iteration) - input->info()->dimension(0));
-
-    // Configure kernel window
-    Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration));
-
-    update_window_and_padding(win, AccessWindowHorizontal(input->info(), 0, num_elems_processed_per_iteration));
-
-    INEKernel::configure(win);
-}
-
-void NEMeanStdDevKernel::run(const Window &window, const ThreadInfo &info)
-{
-    ARM_COMPUTE_UNUSED(info);
-    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
-    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
-    Iterator input(_input, window);
-
-    uint64x1_t local_sum         = vdup_n_u64(0);
-    uint64x1_t local_sum_squared = vdup_n_u64(0);
-
-    if(_stddev != nullptr)
-    {
-        std::tie(local_sum, local_sum_squared) = accumulate<true>(window, input);
-    }
-    else
-    {
-        std::tie(local_sum, local_sum_squared) = accumulate<false>(window, input);
-    }
-
-    const float num_pixels = _input->info()->dimension(0) * _input->info()->dimension(1);
-
-    // Merge sum and calculate mean and stddev
-    arm_compute::unique_lock<arm_compute::Mutex> lock(_mtx);
-
-    *_global_sum += vget_lane_u64(local_sum, 0);
-
-    const float mean = *_global_sum / num_pixels;
-    *_mean           = mean;
-
-    if(_stddev != nullptr)
-    {
-        const uint64_t tmp_sum_squared = vget_lane_u64(local_sum_squared, 0);
-        *_global_sum_squared += tmp_sum_squared;
-        *_stddev = std::sqrt((*_global_sum_squared / num_pixels) - (mean * mean));
-    }
-
-    lock.unlock();
-}
diff --git a/src/core/NEON/kernels/NEMeanStdDevKernel.h b/src/core/NEON/kernels/NEMeanStdDevKernel.h
deleted file mode 100644
index e694f3824d..0000000000
--- a/src/core/NEON/kernels/NEMeanStdDevKernel.h
+++ /dev/null
@@ -1,83 +0,0 @@
-/*
- * Copyright (c) 2016-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_NEMEANSTDDEVKERNEL_H
-#define ARM_COMPUTE_NEMEANSTDDEVKERNEL_H
-
-#include "src/core/NEON/INEKernel.h"
-#include "support/Mutex.h"
-
-#include <cstdint>
-
-namespace arm_compute
-{
-class ITensor;
-using IImage = ITensor;
-
-/** Interface for the kernel to calculate mean and standard deviation of input image pixels. */
-class NEMeanStdDevKernel : public INEKernel
-{
-public:
-    const char *name() const override
-    {
-        return "NEMeanStdDevKernel";
-    }
-    /** Default constructor */
-    NEMeanStdDevKernel();
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    NEMeanStdDevKernel(const NEMeanStdDevKernel &) = delete;
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    NEMeanStdDevKernel &operator=(const NEMeanStdDevKernel &) = delete;
-    /** Prevent instances of this class from being moved (As this class contains non movable objects) */
-    NEMeanStdDevKernel(NEMeanStdDevKernel &&) = delete;
-    /** Prevent instances of this class from being moved (As this class contains non movable objects) */
-    NEMeanStdDevKernel &operator=(NEMeanStdDevKernel &&) = delete;
-    /** Default destructor */
-    ~NEMeanStdDevKernel() = default;
-
-    /** Initialise the kernel's input and outputs.
-     *
-     * @param[in]  input              Input image. Data type supported: U8.
-     * @param[out] mean               Input average pixel value.
-     * @param[out] global_sum         Keeps global sum of pixel values.
-     * @param[out] stddev             (Optional) Output standard deviation of pixel values.
-     * @param[out] global_sum_squared (Optional if stddev is not set, required if stddev is set) Keeps global sum of squared pixel values.
-     */
-    void configure(const IImage *input, float *mean, uint64_t *global_sum, float *stddev = nullptr, uint64_t *global_sum_squared = nullptr);
-
-    // Inherited methods overridden:
-    void run(const Window &window, const ThreadInfo &info) override;
-
-    BorderSize border_size() const override;
-
-private:
-    const IImage      *_input;
-    float             *_mean;
-    float             *_stddev;
-    uint64_t          *_global_sum;
-    uint64_t          *_global_sum_squared;
-    arm_compute::Mutex _mtx;
-    BorderSize         _border_size;
-};
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_NEMEANSTDDEVKERNEL_H */
diff --git a/src/core/NEON/kernels/NEMedian3x3Kernel.cpp b/src/core/NEON/kernels/NEMedian3x3Kernel.cpp
deleted file mode 100644
index 0160edc650..0000000000
--- a/src/core/NEON/kernels/NEMedian3x3Kernel.cpp
+++ /dev/null
@@ -1,137 +0,0 @@
-/*
- * Copyright (c) 2016-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "src/core/NEON/kernels/NEMedian3x3Kernel.h"
-
-#include "arm_compute/core/Coordinates.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/IAccessWindow.h"
-#include "arm_compute/core/ITensor.h"
-#include "arm_compute/core/Validate.h"
-#include "src/core/NEON/INEKernel.h"
-#include "src/core/helpers/AutoConfiguration.h"
-#include "src/core/helpers/WindowHelpers.h"
-
-#include <arm_neon.h>
-#include <utility>
-
-using namespace arm_compute;
-
-namespace
-{
-inline void sort(uint8x8_t &a, uint8x8_t &b)
-{
-    const uint8x8_t min = vmin_u8(a, b);
-    const uint8x8_t max = vmax_u8(a, b);
-    a                   = min;
-    b                   = max;
-}
-} // namespace
-
-BorderSize NEMedian3x3Kernel::border_size() const
-{
-    return BorderSize(1);
-}
-
-void NEMedian3x3Kernel::configure(const ITensor *input, ITensor *output, bool border_undefined)
-{
-    _input  = input;
-    _output = output;
-
-    // Configure kernel window
-    constexpr unsigned int num_elems_processed_per_iteration = 8;
-    constexpr unsigned int num_elems_read_per_iteration      = 16;
-    constexpr unsigned int num_elems_written_per_iteration   = 8;
-    constexpr unsigned int num_rows_read_per_iteration       = 3;
-
-    Window                 win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration), border_undefined, border_size());
-    AccessWindowHorizontal output_access(output->info(), 0, num_elems_written_per_iteration);
-
-    update_window_and_padding(win,
-                              AccessWindowRectangle(input->info(), -border_size().left, -border_size().top, num_elems_read_per_iteration, num_rows_read_per_iteration),
-                              output_access);
-
-    output_access.set_valid_region(win, input->info()->valid_region(), border_undefined, border_size());
-
-    INEKernel::configure(win);
-}
-
-void NEMedian3x3Kernel::run(const Window &window, const ThreadInfo &info)
-{
-    ARM_COMPUTE_UNUSED(info);
-    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
-    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INESimpleKernel::window(), window);
-
-    const unsigned char *input_bot_ptr = _input->ptr_to_element(Coordinates(-1, -1));
-    const unsigned char *input_mid_ptr = _input->ptr_to_element(Coordinates(-1, 0));
-    const unsigned char *input_top_ptr = _input->ptr_to_element(Coordinates(-1, +1));
-
-    Iterator input(_input, window);
-    Iterator output(_output, window);
-
-    execute_window_loop(window, [&](const Coordinates &)
-    {
-        const uint8x16_t top_data = vld1q_u8(input_top_ptr + input.offset());
-        const uint8x16_t mid_data = vld1q_u8(input_mid_ptr + input.offset());
-        const uint8x16_t bot_data = vld1q_u8(input_bot_ptr + input.offset());
-
-        uint8x8_t p0 = vget_low_u8(top_data);
-        uint8x8_t p1 = vext_u8(vget_low_u8(top_data), vget_high_u8(top_data), 1);
-        uint8x8_t p2 = vext_u8(vget_low_u8(top_data), vget_high_u8(top_data), 2);
-        uint8x8_t p3 = vget_low_u8(mid_data);
-        uint8x8_t p4 = vext_u8(vget_low_u8(mid_data), vget_high_u8(mid_data), 1);
-        uint8x8_t p5 = vext_u8(vget_low_u8(mid_data), vget_high_u8(mid_data), 2);
-        uint8x8_t p6 = vget_low_u8(bot_data);
-        uint8x8_t p7 = vext_u8(vget_low_u8(bot_data), vget_high_u8(bot_data), 1);
-        uint8x8_t p8 = vext_u8(vget_low_u8(bot_data), vget_high_u8(bot_data), 2);
-
-        sort(p1, p2);
-        sort(p4, p5);
-        sort(p7, p8);
-
-        sort(p0, p1);
-        sort(p3, p4);
-        sort(p6, p7);
-
-        sort(p1, p2);
-        sort(p4, p5);
-        sort(p7, p8);
-
-        sort(p0, p3);
-        sort(p5, p8);
-        sort(p4, p7);
-
-        sort(p3, p6);
-        sort(p1, p4);
-        sort(p2, p5);
-
-        sort(p4, p7);
-        sort(p4, p2);
-        sort(p6, p4);
-
-        sort(p4, p2);
-
-        vst1_u8(output.ptr(), p4);
-    },
-    input, output);
-}
diff --git a/src/core/NEON/kernels/NEMedian3x3Kernel.h b/src/core/NEON/kernels/NEMedian3x3Kernel.h
deleted file mode 100644
index b9e28b3053..0000000000
--- a/src/core/NEON/kernels/NEMedian3x3Kernel.h
+++ /dev/null
@@ -1,66 +0,0 @@
-/*
- * Copyright (c) 2016-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_NEMEDIAN3x3KERNEL_H
-#define ARM_COMPUTE_NEMEDIAN3x3KERNEL_H
-
-#include "src/core/NEON/INESimpleKernel.h"
-
-namespace arm_compute
-{
-class ITensor;
-
-/** Kernel to perform a median filter on a tensor */
-class NEMedian3x3Kernel : public INESimpleKernel
-{
-public:
-    const char *name() const override
-    {
-        return "NEMedian3x3Kernel";
-    }
-    /** Default constructor */
-    NEMedian3x3Kernel() = default;
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    NEMedian3x3Kernel(const NEMedian3x3Kernel &) = delete;
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    NEMedian3x3Kernel &operator=(const NEMedian3x3Kernel &) = delete;
-    /** Allow instances of this class to be moved */
-    NEMedian3x3Kernel(NEMedian3x3Kernel &&) = default;
-    /** Allow instances of this class to be moved */
-    NEMedian3x3Kernel &operator=(NEMedian3x3Kernel &&) = default;
-    /** Default destructor */
-    ~NEMedian3x3Kernel() = default;
-    /** Set the source, destination and border mode of the kernel
-     *
-     * @param[in]  input            Source tensor. Data type supported: U8
-     * @param[out] output           Destination tensor. Data type supported: U8
-     * @param[in]  border_undefined True if the border mode is undefined. False if it's replicate or constant.
-     */
-    void configure(const ITensor *input, ITensor *output, bool border_undefined);
-
-    // Inherited methods overridden:
-    void run(const Window &window, const ThreadInfo &info) override;
-    BorderSize border_size() const override;
-};
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_NEMEDIAN3x3KERNEL_H */
diff --git a/src/core/NEON/kernels/NEMinMaxLocationKernel.cpp b/src/core/NEON/kernels/NEMinMaxLocationKernel.cpp
deleted file mode 100644
index 402e6f1811..0000000000
--- a/src/core/NEON/kernels/NEMinMaxLocationKernel.cpp
+++ /dev/null
@@ -1,478 +0,0 @@
-/*
- * Copyright (c) 2016-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "src/core/NEON/kernels/NEMinMaxLocationKernel.h"
-
-#include "arm_compute/core/Coordinates.h"
-#include "arm_compute/core/Error.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/IAccessWindow.h"
-#include "arm_compute/core/ITensor.h"
-#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Types.h"
-#include "arm_compute/core/Validate.h"
-#include "arm_compute/core/Window.h"
-#include "arm_compute/core/utils/misc/Utility.h"
-#include "src/core/helpers/AutoConfiguration.h"
-#include "src/core/helpers/WindowHelpers.h"
-
-#include <algorithm>
-#include <arm_neon.h>
-#include <climits>
-#include <cstddef>
-
-namespace arm_compute
-{
-NEMinMaxKernel::NEMinMaxKernel()
-    : _func(), _input(nullptr), _min(), _max(), _mtx()
-{
-}
-
-void NEMinMaxKernel::configure(const IImage *input, void *min, void *max)
-{
-    ARM_COMPUTE_ERROR_ON_TENSOR_NOT_2D(input);
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::S16, DataType::F32);
-    ARM_COMPUTE_ERROR_ON(nullptr == min);
-    ARM_COMPUTE_ERROR_ON(nullptr == max);
-
-    _input = input;
-    _min   = min;
-    _max   = max;
-
-    switch(_input->info()->data_type())
-    {
-        case DataType::U8:
-            _func = &NEMinMaxKernel::minmax_U8;
-            break;
-        case DataType::S16:
-            _func = &NEMinMaxKernel::minmax_S16;
-            break;
-        case DataType::F32:
-            _func = &NEMinMaxKernel::minmax_F32;
-            break;
-        default:
-            ARM_COMPUTE_ERROR("Unsupported data type");
-            break;
-    }
-
-    // Configure kernel window
-    constexpr unsigned int num_elems_processed_per_iteration = 1;
-
-    Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration));
-
-    INEKernel::configure(win);
-}
-
-void NEMinMaxKernel::run(const Window &window, const ThreadInfo &info)
-{
-    ARM_COMPUTE_UNUSED(info);
-    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
-    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
-    ARM_COMPUTE_ERROR_ON(_func == nullptr);
-
-    (this->*_func)(window);
-}
-
-void NEMinMaxKernel::reset()
-{
-    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
-    switch(_input->info()->data_type())
-    {
-        case DataType::U8:
-            *static_cast<int32_t *>(_min) = UCHAR_MAX;
-            *static_cast<int32_t *>(_max) = 0;
-            break;
-        case DataType::S16:
-            *static_cast<int32_t *>(_min) = SHRT_MAX;
-            *static_cast<int32_t *>(_max) = SHRT_MIN;
-            break;
-        case DataType::F32:
-            *static_cast<float *>(_min) = std::numeric_limits<float>::max();
-            *static_cast<float *>(_max) = std::numeric_limits<float>::lowest();
-            break;
-        default:
-            ARM_COMPUTE_ERROR("Unsupported data type");
-            break;
-    }
-}
-
-template <typename T>
-void NEMinMaxKernel::update_min_max(const T min, const T max)
-{
-    arm_compute::lock_guard<arm_compute::Mutex> lock(_mtx);
-
-    using type = typename std::conditional<std::is_same<T, float>::value, float, int32_t>::type;
-
-    auto min_ptr = static_cast<type *>(_min);
-    auto max_ptr = static_cast<type *>(_max);
-
-    if(min < *min_ptr)
-    {
-        *min_ptr = min;
-    }
-
-    if(max > *max_ptr)
-    {
-        *max_ptr = max;
-    }
-}
-
-void NEMinMaxKernel::minmax_U8(Window win)
-{
-    uint8x8_t carry_min = vdup_n_u8(UCHAR_MAX);
-    uint8x8_t carry_max = vdup_n_u8(0);
-
-    uint8_t carry_max_scalar = 0;
-    uint8_t carry_min_scalar = UCHAR_MAX;
-
-    const int x_start = win.x().start();
-    const int x_end   = win.x().end();
-
-    // Handle X dimension manually to split into two loops
-    // First one will use vector operations, second one processes the left over pixels
-    win.set(Window::DimX, Window::Dimension(0, 1, 1));
-
-    Iterator input(_input, win);
-
-    execute_window_loop(win, [&](const Coordinates &)
-    {
-        int x = x_start;
-
-        // Vector loop
-        for(; x <= x_end - 16; x += 16)
-        {
-            const uint8x16_t pixels  = vld1q_u8(input.ptr() + x);
-            const uint8x8_t  tmp_min = vmin_u8(vget_high_u8(pixels), vget_low_u8(pixels));
-            const uint8x8_t  tmp_max = vmax_u8(vget_high_u8(pixels), vget_low_u8(pixels));
-            carry_min                = vmin_u8(tmp_min, carry_min);
-            carry_max                = vmax_u8(tmp_max, carry_max);
-        }
-
-        // Process leftover pixels
-        for(; x < x_end; ++x)
-        {
-            const uint8_t pixel = input.ptr()[x];
-            carry_min_scalar    = std::min(pixel, carry_min_scalar);
-            carry_max_scalar    = std::max(pixel, carry_max_scalar);
-        }
-    },
-    input);
-
-    // Reduce result
-    carry_min = vpmin_u8(carry_min, carry_min);
-    carry_max = vpmax_u8(carry_max, carry_max);
-    carry_min = vpmin_u8(carry_min, carry_min);
-    carry_max = vpmax_u8(carry_max, carry_max);
-    carry_min = vpmin_u8(carry_min, carry_min);
-    carry_max = vpmax_u8(carry_max, carry_max);
-
-    // Extract max/min values
-    const uint8_t min_i = std::min(vget_lane_u8(carry_min, 0), carry_min_scalar);
-    const uint8_t max_i = std::max(vget_lane_u8(carry_max, 0), carry_max_scalar);
-
-    // Perform reduction of local min/max values
-    update_min_max(min_i, max_i);
-}
-
-void NEMinMaxKernel::minmax_S16(Window win)
-{
-    int16x4_t carry_min = vdup_n_s16(SHRT_MAX);
-    int16x4_t carry_max = vdup_n_s16(SHRT_MIN);
-
-    int16_t carry_max_scalar = SHRT_MIN;
-    int16_t carry_min_scalar = SHRT_MAX;
-
-    const int x_start = win.x().start();
-    const int x_end   = win.x().end();
-
-    // Handle X dimension manually to split into two loops
-    // First one will use vector operations, second one processes the left over pixels
-    win.set(Window::DimX, Window::Dimension(0, 1, 1));
-
-    Iterator input(_input, win);
-
-    execute_window_loop(win, [&](const Coordinates &)
-    {
-        int        x      = x_start;
-        const auto in_ptr = reinterpret_cast<const int16_t *>(input.ptr());
-
-        // Vector loop
-        for(; x <= x_end - 16; x += 16)
-        {
-            const int16x8x2_t pixels   = vld2q_s16(in_ptr + x);
-            const int16x8_t   tmp_min1 = vminq_s16(pixels.val[0], pixels.val[1]);
-            const int16x8_t   tmp_max1 = vmaxq_s16(pixels.val[0], pixels.val[1]);
-            const int16x4_t   tmp_min2 = vmin_s16(vget_high_s16(tmp_min1), vget_low_s16(tmp_min1));
-            const int16x4_t   tmp_max2 = vmax_s16(vget_high_s16(tmp_max1), vget_low_s16(tmp_max1));
-            carry_min                  = vmin_s16(tmp_min2, carry_min);
-            carry_max                  = vmax_s16(tmp_max2, carry_max);
-        }
-
-        // Process leftover pixels
-        for(; x < x_end; ++x)
-        {
-            const int16_t pixel = in_ptr[x];
-            carry_min_scalar    = std::min(pixel, carry_min_scalar);
-            carry_max_scalar    = std::max(pixel, carry_max_scalar);
-        }
-
-    },
-    input);
-
-    // Reduce result
-    carry_min = vpmin_s16(carry_min, carry_min);
-    carry_max = vpmax_s16(carry_max, carry_max);
-    carry_min = vpmin_s16(carry_min, carry_min);
-    carry_max = vpmax_s16(carry_max, carry_max);
-
-    // Extract max/min values
-    const int16_t min_i = std::min(vget_lane_s16(carry_min, 0), carry_min_scalar);
-    const int16_t max_i = std::max(vget_lane_s16(carry_max, 0), carry_max_scalar);
-
-    // Perform reduction of local min/max values
-    update_min_max(min_i, max_i);
-}
-
-void NEMinMaxKernel::minmax_F32(Window win)
-{
-    float32x2_t carry_min = vdup_n_f32(std::numeric_limits<float>::max());
-    float32x2_t carry_max = vdup_n_f32(std::numeric_limits<float>::lowest());
-
-    float carry_min_scalar = std::numeric_limits<float>::max();
-    float carry_max_scalar = std::numeric_limits<float>::lowest();
-
-    const int x_start = win.x().start();
-    const int x_end   = win.x().end();
-
-    // Handle X dimension manually to split into two loops
-    // First one will use vector operations, second one processes the left over pixels
-    win.set(Window::DimX, Window::Dimension(0, 1, 1));
-
-    Iterator input(_input, win);
-
-    execute_window_loop(win, [&](const Coordinates &)
-    {
-        int        x      = x_start;
-        const auto in_ptr = reinterpret_cast<const float *>(input.ptr());
-
-        // Vector loop
-        for(; x <= x_end - 8; x += 8)
-        {
-            const float32x4x2_t pixels   = vld2q_f32(in_ptr + x);
-            const float32x4_t   tmp_min1 = vminq_f32(pixels.val[0], pixels.val[1]);
-            const float32x4_t   tmp_max1 = vmaxq_f32(pixels.val[0], pixels.val[1]);
-            const float32x2_t   tmp_min2 = vmin_f32(vget_high_f32(tmp_min1), vget_low_f32(tmp_min1));
-            const float32x2_t   tmp_max2 = vmax_f32(vget_high_f32(tmp_max1), vget_low_f32(tmp_max1));
-            carry_min                    = vmin_f32(tmp_min2, carry_min);
-            carry_max                    = vmax_f32(tmp_max2, carry_max);
-        }
-
-        // Process leftover pixels
-        for(; x < x_end; ++x)
-        {
-            const float pixel = in_ptr[x];
-            carry_min_scalar  = std::min(pixel, carry_min_scalar);
-            carry_max_scalar  = std::max(pixel, carry_max_scalar);
-        }
-
-    },
-    input);
-
-    // Reduce result
-    carry_min = vpmin_f32(carry_min, carry_min);
-    carry_max = vpmax_f32(carry_max, carry_max);
-    carry_min = vpmin_f32(carry_min, carry_min);
-    carry_max = vpmax_f32(carry_max, carry_max);
-
-    // Extract max/min values
-    const float min_i = std::min(vget_lane_f32(carry_min, 0), carry_min_scalar);
-    const float max_i = std::max(vget_lane_f32(carry_max, 0), carry_max_scalar);
-
-    // Perform reduction of local min/max values
-    update_min_max(min_i, max_i);
-}
-
-NEMinMaxLocationKernel::NEMinMaxLocationKernel()
-    : _func(nullptr), _input(nullptr), _min(nullptr), _max(nullptr), _min_count(nullptr), _max_count(nullptr), _min_loc(nullptr), _max_loc(nullptr)
-{
-}
-
-bool NEMinMaxLocationKernel::is_parallelisable() const
-{
-    return false;
-}
-
-template <class T, std::size_t... N>
-struct NEMinMaxLocationKernel::create_func_table<T, utility::index_sequence<N...>>
-{
-    static const std::array<NEMinMaxLocationKernel::MinMaxLocFunction, sizeof...(N)> func_table;
-};
-
-template <class T, std::size_t... N>
-const std::array<NEMinMaxLocationKernel::MinMaxLocFunction, sizeof...(N)> NEMinMaxLocationKernel::create_func_table<T, utility::index_sequence<N...>>::func_table
-{
-    &NEMinMaxLocationKernel::minmax_loc<T, bool(N & 8), bool(N & 4), bool(N & 2), bool(N & 1)>...
-};
-
-void NEMinMaxLocationKernel::configure(const IImage *input, void *min, void *max,
-                                       ICoordinates2DArray *min_loc, ICoordinates2DArray *max_loc,
-                                       uint32_t *min_count, uint32_t *max_count)
-{
-    ARM_COMPUTE_ERROR_ON_TENSOR_NOT_2D(input);
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::S16, DataType::F32);
-    ARM_COMPUTE_ERROR_ON(nullptr == min);
-    ARM_COMPUTE_ERROR_ON(nullptr == max);
-
-    _input     = input;
-    _min       = min;
-    _max       = max;
-    _min_count = min_count;
-    _max_count = max_count;
-    _min_loc   = min_loc;
-    _max_loc   = max_loc;
-
-    unsigned int count_min = (nullptr != min_count ? 1 : 0);
-    unsigned int count_max = (nullptr != max_count ? 1 : 0);
-    unsigned int loc_min   = (nullptr != min_loc ? 1 : 0);
-    unsigned int loc_max   = (nullptr != max_loc ? 1 : 0);
-
-    unsigned int table_idx = (count_min << 3) | (count_max << 2) | (loc_min << 1) | loc_max;
-
-    switch(input->info()->data_type())
-    {
-        case DataType::U8:
-            _func = create_func_table<uint8_t, utility::index_sequence_t<16>>::func_table[table_idx];
-            break;
-        case DataType::S16:
-            _func = create_func_table<int16_t, utility::index_sequence_t<16>>::func_table[table_idx];
-            break;
-        case DataType::F32:
-            _func = create_func_table<float, utility::index_sequence_t<16>>::func_table[table_idx];
-            break;
-        default:
-            ARM_COMPUTE_ERROR("Unsupported data type");
-            break;
-    }
-
-    constexpr unsigned int num_elems_processed_per_iteration = 1;
-
-    // Configure kernel window
-    Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration));
-
-    update_window_and_padding(win, AccessWindowHorizontal(input->info(), 0, num_elems_processed_per_iteration));
-
-    INEKernel::configure(win);
-}
-
-void NEMinMaxLocationKernel::run(const Window &window, const ThreadInfo &info)
-{
-    ARM_COMPUTE_UNUSED(info);
-    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
-    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
-    ARM_COMPUTE_ERROR_ON(_func == nullptr);
-
-    (this->*_func)(window);
-}
-
-template <class T, bool count_min, bool count_max, bool loc_min, bool loc_max>
-void NEMinMaxLocationKernel::minmax_loc(const Window &win)
-{
-    if(count_min || count_max || loc_min || loc_max)
-    {
-        Iterator input(_input, win);
-
-        size_t min_count = 0;
-        size_t max_count = 0;
-
-        // Clear min location array
-        if(loc_min)
-        {
-            _min_loc->clear();
-        }
-
-        // Clear max location array
-        if(loc_max)
-        {
-            _max_loc->clear();
-        }
-
-        using type = typename std::conditional<std::is_same<T, float>::value, float, int32_t>::type;
-
-        auto min_ptr = static_cast<type *>(_min);
-        auto max_ptr = static_cast<type *>(_max);
-
-        execute_window_loop(win, [&](const Coordinates & id)
-        {
-            auto    in_ptr = reinterpret_cast<const T *>(input.ptr());
-            int32_t idx    = id.x();
-            int32_t idy    = id.y();
-
-            const T       pixel = *in_ptr;
-            Coordinates2D p{ idx, idy };
-
-            if(count_min || loc_min)
-            {
-                if(*min_ptr == pixel)
-                {
-                    if(count_min)
-                    {
-                        ++min_count;
-                    }
-
-                    if(loc_min)
-                    {
-                        _min_loc->push_back(p);
-                    }
-                }
-            }
-
-            if(count_max || loc_max)
-            {
-                if(*max_ptr == pixel)
-                {
-                    if(count_max)
-                    {
-                        ++max_count;
-                    }
-
-                    if(loc_max)
-                    {
-                        _max_loc->push_back(p);
-                    }
-                }
-            }
-        },
-        input);
-
-        if(count_min)
-        {
-            *_min_count = min_count;
-        }
-
-        if(count_max)
-        {
-            *_max_count = max_count;
-        }
-    }
-}
-} // namespace arm_compute
diff --git a/src/core/NEON/kernels/NEMinMaxLocationKernel.h b/src/core/NEON/kernels/NEMinMaxLocationKernel.h
deleted file mode 100644
index a24666096f..0000000000
--- a/src/core/NEON/kernels/NEMinMaxLocationKernel.h
+++ /dev/null
@@ -1,171 +0,0 @@
-/*
- * Copyright (c) 2016-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_NEMINMAXLOCATIONKERNEL_H
-#define ARM_COMPUTE_NEMINMAXLOCATIONKERNEL_H
-
-#include "arm_compute/core/IArray.h"
-#include "src/core/NEON/INEKernel.h"
-#include "support/Mutex.h"
-
-#include <cstdint>
-
-namespace arm_compute
-{
-class ITensor;
-using IImage = ITensor;
-
-/** Interface for the kernel to perform min max search on an image. */
-class NEMinMaxKernel : public INEKernel
-{
-public:
-    const char *name() const override
-    {
-        return "NEMinMaxKernel";
-    }
-    /** Default constructor */
-    NEMinMaxKernel();
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    NEMinMaxKernel(const NEMinMaxKernel &) = delete;
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    NEMinMaxKernel &operator=(const NEMinMaxKernel &) = delete;
-    /** Prevent instances of this class from being moved (As this class contains non movable objects) */
-    NEMinMaxKernel(NEMinMaxKernel &&) = delete;
-    /** Prevent instances of this class from being moved (As this class contains non movable objects) */
-    NEMinMaxKernel &operator=(NEMinMaxKernel &&) = delete;
-    /** Default destructor */
-    ~NEMinMaxKernel() = default;
-
-    /** Initialise the kernel's input and outputs.
-     *
-     * @param[in]  input Input Image. Data types supported: U8/S16/F32.
-     * @param[out] min   Minimum value of image. Data types supported: S32 if input type is U8/S16, F32 if input type is F32.
-     * @param[out] max   Maximum value of image. Data types supported: S32 if input type is U8/S16, F32 if input type is F32.
-     */
-    void configure(const IImage *input, void *min, void *max);
-    /** Resets global minimum and maximum. */
-    void reset();
-
-    // Inherited methods overridden:
-    void run(const Window &window, const ThreadInfo &info) override;
-
-private:
-    /** Performs the min/max algorithm on U8 images on a given window.
-     *
-     * @param win The window to run the algorithm on.
-     */
-    void minmax_U8(Window win);
-    /** Performs the min/max algorithm on S16 images on a given window.
-     *
-     * @param win The window to run the algorithm on.
-     */
-    void minmax_S16(Window win);
-    /** Performs the min/max algorithm on F32 images on a given window.
-     *
-     * @param win The window to run the algorithm on.
-     */
-    void minmax_F32(Window win);
-    /** Common signature for all the specialised MinMax functions
-     *
-     * @param[in] window Region on which to execute the kernel.
-     */
-    using MinMaxFunction = void (NEMinMaxKernel::*)(Window window);
-    /** MinMax function to use for the particular image types passed to configure() */
-    MinMaxFunction _func;
-    /** Helper to update min/max values **/
-    template <typename T>
-    void update_min_max(T min, T max);
-
-    const IImage      *_input; /**< Input image. */
-    void              *_min;   /**< Minimum value. */
-    void              *_max;   /**< Maximum value. */
-    arm_compute::Mutex _mtx;   /**< Mutex used for result reduction. */
-};
-
-/** Interface for the kernel to find min max locations of an image. */
-class NEMinMaxLocationKernel : public INEKernel
-{
-public:
-    const char *name() const override
-    {
-        return "NEMinMaxLocationKernel";
-    }
-    /** Default constructor */
-    NEMinMaxLocationKernel();
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    NEMinMaxLocationKernel(const NEMinMaxLocationKernel &) = delete;
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    NEMinMaxLocationKernel &operator=(const NEMinMaxLocationKernel &) = delete;
-    /** Allow instances of this class to be moved */
-    NEMinMaxLocationKernel(NEMinMaxLocationKernel &&) = default;
-    /** Allow instances of this class to be moved */
-    NEMinMaxLocationKernel &operator=(NEMinMaxLocationKernel &&) = default;
-    /** Default destructor */
-    ~NEMinMaxLocationKernel() = default;
-
-    /** Initialise the kernel's input and outputs.
-     *
-     * @param[in]  input     Input Image. Data types supported: U8/S16/F32.
-     * @param[out] min       Minimum value of image. Data types supported: S32 if input type is U8/S16, F32 if input type is F32.
-     * @param[out] max       Maximum value of image. Data types supported: S32 if input type is U8/S16, F32 if input type is F32.
-     * @param[out] min_loc   Array of minimum value locations.
-     * @param[out] max_loc   Array of maximum value locations.
-     * @param[out] min_count Number of minimum value encounters.
-     * @param[out] max_count Number of maximum value encounters.
-     */
-    void configure(const IImage *input, void *min, void *max,
-                   ICoordinates2DArray *min_loc = nullptr, ICoordinates2DArray *max_loc = nullptr,
-                   uint32_t *min_count = nullptr, uint32_t *max_count = nullptr);
-
-    // Inherited methods overridden:
-    void run(const Window &window, const ThreadInfo &info) override;
-    bool is_parallelisable() const override;
-
-private:
-    /** Performs the min/max location algorithm on T type images on a given window.
-     *
-     * @param win The window to run the algorithm on.
-     */
-    template <class T, bool count_min, bool count_max, bool loc_min, bool loc_max>
-    void minmax_loc(const Window &win);
-    /** Common signature for all the specialised MinMaxLoc functions
-     *
-     * @param[in] window Region on which to execute the kernel.
-     */
-    using MinMaxLocFunction = void (NEMinMaxLocationKernel::*)(const Window &window);
-    /** MinMaxLoc function to use for the particular image types passed to configure() */
-    MinMaxLocFunction _func;
-    /** Helper to create a function pointer table for the parameterized MinMaxLocation functions. */
-    template <class T, typename>
-    struct create_func_table;
-
-    const IImage        *_input;     /**< Input image. */
-    void                *_min;       /**< Minimum value. */
-    void                *_max;       /**< Maximum value. */
-    uint32_t            *_min_count; /**< Count of minimum value encounters. */
-    uint32_t            *_max_count; /**< Count of maximum value encounters. */
-    ICoordinates2DArray *_min_loc;   /**< Locations of minimum values. */
-    ICoordinates2DArray *_max_loc;   /**< Locations of maximum values. */
-};
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_NEMINMAXLOCATIONKERNEL_H */
diff --git a/src/core/NEON/kernels/NENonLinearFilterKernel.cpp b/src/core/NEON/kernels/NENonLinearFilterKernel.cpp
deleted file mode 100644
index 58c0acd404..0000000000
--- a/src/core/NEON/kernels/NENonLinearFilterKernel.cpp
+++ /dev/null
@@ -1,1018 +0,0 @@
-/*
- * Copyright (c) 2016-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "src/core/NEON/kernels/NENonLinearFilterKernel.h"
-
-#include "arm_compute/core/Coordinates.h"
-#include "arm_compute/core/Error.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/ITensor.h"
-#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Validate.h"
-#include "src/core/helpers/AutoConfiguration.h"
-#include "src/core/helpers/WindowHelpers.h"
-
-#include <algorithm>
-#include <arm_neon.h>
-#include <array>
-#include <tuple>
-#include <utility>
-
-namespace arm_compute
-{
-namespace
-{
-const uint8x16_t zero_u8 = vdupq_n_u8(0);
-
-template <size_t columns>
-inline uint8x8_t min_row(uint8x16_t row_data)
-{
-    uint8x8_t min = vget_low_u8(row_data);
-
-    for(size_t c = 1; c < columns; ++c)
-    {
-        row_data = vextq_u8(row_data, zero_u8, 1);
-        min      = vmin_u8(min, vget_low_u8(row_data));
-    }
-
-    return min;
-}
-
-template <size_t columns>
-inline uint8x8_t max_row(uint8x16_t row_data)
-{
-    uint8x8_t max = vget_low_u8(row_data);
-
-    for(size_t c = 1; c < columns; ++c)
-    {
-        row_data = vextq_u8(row_data, zero_u8, 1);
-        max      = vmax_u8(max, vget_low_u8(row_data));
-    }
-
-    return max;
-}
-
-inline void sort(uint8x8_t &a, uint8x8_t &b)
-{
-    const uint8x8_t min = vmin_u8(a, b);
-    const uint8x8_t max = vmax_u8(a, b);
-    a                   = min;
-    b                   = max;
-}
-
-// Sorting networks below were generated using http://pages.ripco.net/~jgamble/nw.html
-// Calculations that do not affect the median were removed.
-inline void sort5(uint8x8_t &p0, uint8x8_t &p1, uint8x8_t &p2, uint8x8_t &p3, uint8x8_t &p4)
-{
-    sort(p0, p1);
-    sort(p2, p3);
-    sort(p0, p2);
-    sort(p1, p3);
-    sort(p1, p2);
-    sort(p0, p4);
-    sort(p1, p4);
-    sort(p2, p4);
-}
-
-inline void sort9(uint8x8_t &p0, uint8x8_t &p1, uint8x8_t &p2,
-                  uint8x8_t &p3, uint8x8_t &p4, uint8x8_t &p5,
-                  uint8x8_t &p6, uint8x8_t &p7, uint8x8_t &p8)
-{
-    sort(p1, p2);
-    sort(p4, p5);
-    sort(p7, p8);
-    sort(p0, p1);
-    sort(p3, p4);
-    sort(p6, p7);
-    sort(p1, p2);
-    sort(p4, p5);
-    sort(p7, p8);
-    sort(p0, p3);
-    sort(p5, p8);
-    sort(p4, p7);
-    sort(p3, p6);
-    sort(p1, p4);
-    sort(p2, p5);
-    sort(p4, p7);
-    sort(p4, p2);
-    sort(p6, p4);
-    sort(p4, p2);
-}
-
-inline void sort21(std::array<uint8x8_t, 21> &p)
-{
-    sort(p[0], p[1]);
-    sort(p[2], p[3]);
-    sort(p[4], p[5]);
-    sort(p[6], p[7]);
-    sort(p[8], p[9]);
-    sort(p[10], p[11]);
-    sort(p[12], p[13]);
-    sort(p[14], p[15]);
-    sort(p[16], p[17]);
-    sort(p[18], p[19]);
-    sort(p[0], p[2]);
-    sort(p[1], p[3]);
-    sort(p[4], p[6]);
-    sort(p[5], p[7]);
-    sort(p[8], p[10]);
-    sort(p[9], p[11]);
-    sort(p[12], p[14]);
-    sort(p[13], p[15]);
-    sort(p[16], p[18]);
-    sort(p[17], p[19]);
-    sort(p[1], p[2]);
-    sort(p[5], p[6]);
-    sort(p[0], p[4]);
-    sort(p[3], p[7]);
-    sort(p[9], p[10]);
-    sort(p[13], p[14]);
-    sort(p[8], p[12]);
-    sort(p[11], p[15]);
-    sort(p[17], p[18]);
-    sort(p[16], p[20]);
-    sort(p[1], p[5]);
-    sort(p[2], p[6]);
-    sort(p[9], p[13]);
-    sort(p[10], p[14]);
-    sort(p[0], p[8]);
-    sort(p[7], p[15]);
-    sort(p[17], p[20]);
-    sort(p[1], p[4]);
-    sort(p[3], p[6]);
-    sort(p[9], p[12]);
-    sort(p[11], p[14]);
-    sort(p[18], p[20]);
-    sort(p[0], p[16]);
-    sort(p[2], p[4]);
-    sort(p[3], p[5]);
-    sort(p[10], p[12]);
-    sort(p[11], p[13]);
-    sort(p[1], p[9]);
-    sort(p[6], p[14]);
-    sort(p[19], p[20]);
-    sort(p[3], p[4]);
-    sort(p[11], p[12]);
-    sort(p[1], p[8]);
-    sort(p[2], p[10]);
-    sort(p[5], p[13]);
-    sort(p[7], p[14]);
-    sort(p[3], p[11]);
-    sort(p[2], p[8]);
-    sort(p[4], p[12]);
-    sort(p[7], p[13]);
-    sort(p[1], p[17]);
-    sort(p[3], p[10]);
-    sort(p[5], p[12]);
-    sort(p[1], p[16]);
-    sort(p[2], p[18]);
-    sort(p[3], p[9]);
-    sort(p[6], p[12]);
-    sort(p[2], p[16]);
-    sort(p[3], p[8]);
-    sort(p[7], p[12]);
-    sort(p[5], p[9]);
-    sort(p[6], p[10]);
-    sort(p[4], p[8]);
-    sort(p[7], p[11]);
-    sort(p[3], p[19]);
-    sort(p[5], p[8]);
-    sort(p[7], p[10]);
-    sort(p[3], p[18]);
-    sort(p[4], p[20]);
-    sort(p[6], p[8]);
-    sort(p[7], p[9]);
-    sort(p[3], p[17]);
-    sort(p[5], p[20]);
-    sort(p[7], p[8]);
-    sort(p[3], p[16]);
-    sort(p[6], p[20]);
-    sort(p[5], p[17]);
-    sort(p[7], p[20]);
-    sort(p[4], p[16]);
-    sort(p[6], p[18]);
-    sort(p[5], p[16]);
-    sort(p[7], p[19]);
-    sort(p[7], p[18]);
-    sort(p[6], p[16]);
-    sort(p[7], p[17]);
-    sort(p[10], p[18]);
-    sort(p[7], p[16]);
-    sort(p[9], p[17]);
-    sort(p[8], p[16]);
-    sort(p[9], p[16]);
-    sort(p[10], p[16]);
-}
-
-inline void sort25(std::array<uint8x8_t, 25> &p)
-{
-    sort(p[1], p[2]);
-    sort(p[0], p[1]);
-    sort(p[1], p[2]);
-    sort(p[4], p[5]);
-    sort(p[3], p[4]);
-    sort(p[4], p[5]);
-    sort(p[0], p[3]);
-    sort(p[2], p[5]);
-    sort(p[2], p[3]);
-    sort(p[1], p[4]);
-    sort(p[1], p[2]);
-    sort(p[3], p[4]);
-    sort(p[7], p[8]);
-    sort(p[6], p[7]);
-    sort(p[7], p[8]);
-    sort(p[10], p[11]);
-    sort(p[9], p[10]);
-    sort(p[10], p[11]);
-    sort(p[6], p[9]);
-    sort(p[8], p[11]);
-    sort(p[8], p[9]);
-    sort(p[7], p[10]);
-    sort(p[7], p[8]);
-    sort(p[9], p[10]);
-    sort(p[0], p[6]);
-    sort(p[4], p[10]);
-    sort(p[4], p[6]);
-    sort(p[2], p[8]);
-    sort(p[2], p[4]);
-    sort(p[6], p[8]);
-    sort(p[1], p[7]);
-    sort(p[5], p[11]);
-    sort(p[5], p[7]);
-    sort(p[3], p[9]);
-    sort(p[3], p[5]);
-    sort(p[7], p[9]);
-    sort(p[1], p[2]);
-    sort(p[3], p[4]);
-    sort(p[5], p[6]);
-    sort(p[7], p[8]);
-    sort(p[9], p[10]);
-    sort(p[13], p[14]);
-    sort(p[12], p[13]);
-    sort(p[13], p[14]);
-    sort(p[16], p[17]);
-    sort(p[15], p[16]);
-    sort(p[16], p[17]);
-    sort(p[12], p[15]);
-    sort(p[14], p[17]);
-    sort(p[14], p[15]);
-    sort(p[13], p[16]);
-    sort(p[13], p[14]);
-    sort(p[15], p[16]);
-    sort(p[19], p[20]);
-    sort(p[18], p[19]);
-    sort(p[19], p[20]);
-    sort(p[21], p[22]);
-    sort(p[23], p[24]);
-    sort(p[21], p[23]);
-    sort(p[22], p[24]);
-    sort(p[22], p[23]);
-    sort(p[18], p[21]);
-    sort(p[20], p[23]);
-    sort(p[20], p[21]);
-    sort(p[19], p[22]);
-    sort(p[22], p[24]);
-    sort(p[19], p[20]);
-    sort(p[21], p[22]);
-    sort(p[23], p[24]);
-    sort(p[12], p[18]);
-    sort(p[16], p[22]);
-    sort(p[16], p[18]);
-    sort(p[14], p[20]);
-    sort(p[20], p[24]);
-    sort(p[14], p[16]);
-    sort(p[18], p[20]);
-    sort(p[22], p[24]);
-    sort(p[13], p[19]);
-    sort(p[17], p[23]);
-    sort(p[17], p[19]);
-    sort(p[15], p[21]);
-    sort(p[15], p[17]);
-    sort(p[19], p[21]);
-    sort(p[13], p[14]);
-    sort(p[15], p[16]);
-    sort(p[17], p[18]);
-    sort(p[19], p[20]);
-    sort(p[21], p[22]);
-    sort(p[23], p[24]);
-    sort(p[0], p[12]);
-    sort(p[8], p[20]);
-    sort(p[8], p[12]);
-    sort(p[4], p[16]);
-    sort(p[16], p[24]);
-    sort(p[12], p[16]);
-    sort(p[2], p[14]);
-    sort(p[10], p[22]);
-    sort(p[10], p[14]);
-    sort(p[6], p[18]);
-    sort(p[6], p[10]);
-    sort(p[10], p[12]);
-    sort(p[1], p[13]);
-    sort(p[9], p[21]);
-    sort(p[9], p[13]);
-    sort(p[5], p[17]);
-    sort(p[13], p[17]);
-    sort(p[3], p[15]);
-    sort(p[11], p[23]);
-    sort(p[11], p[15]);
-    sort(p[7], p[19]);
-    sort(p[7], p[11]);
-    sort(p[11], p[13]);
-    sort(p[11], p[12]);
-}
-} // namespace
-
-NENonLinearFilterKernel::NENonLinearFilterKernel()
-    : _border_width(0), _input(nullptr), _output(nullptr), _mask(nullptr), _pattern(MatrixPattern::BOX), _function(NonLinearFilterFunction::MIN), _func_idx(0), _border_size()
-{
-}
-
-BorderSize NENonLinearFilterKernel::border_size() const
-{
-    return _border_size;
-}
-
-void NENonLinearFilterKernel::configure(const ITensor *input, ITensor *output, NonLinearFilterFunction function, unsigned int mask_size, MatrixPattern pattern, const uint8_t *mask,
-                                        bool border_undefined)
-{
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8);
-    ARM_COMPUTE_ERROR_ON(3 != mask_size && 5 != mask_size);
-    ARM_COMPUTE_ERROR_ON(MatrixPattern::OTHER == pattern && nullptr == mask);
-
-    // Set class variables
-    _border_size = BorderSize(mask_size / 2);
-    _input       = input;
-    _output      = output;
-    _mask        = mask;
-    _pattern     = pattern;
-    _function    = function;
-
-    // Configure kernel window
-    const unsigned int     num_elems_processed_per_iteration = (MatrixPattern::OTHER == pattern) ? 1 : 8;
-    constexpr unsigned int num_elems_read_per_iteration      = 16;
-
-    Window                 win = calculate_max_window(*input->info(), num_elems_processed_per_iteration, border_undefined, border_size());
-    AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration);
-    update_window_and_padding(win,
-                              AccessWindowRectangle(input->info(), -border_size().left, -border_size().top, num_elems_read_per_iteration, mask_size),
-                              output_access);
-    output_access.set_valid_region(win, input->info()->valid_region(), border_undefined, border_size());
-
-    INEKernel::configure(win);
-
-    // Define function index
-    _func_idx = (3 == mask_size) ? 0 : 1;
-
-    if(MatrixPattern::OTHER != pattern)
-    {
-        _func_idx = (_func_idx) * 3 + static_cast<unsigned int>(function);
-    }
-}
-
-void NENonLinearFilterKernel::fill_mask(uint8_t *mask, int cols, int rows, MatrixPattern pattern)
-{
-    unsigned int v = 0;
-
-    for(int r = 0; r < rows; ++r)
-    {
-        for(int c = 0; c < cols; ++c, ++v)
-        {
-            uint8_t val = 0;
-
-            switch(pattern)
-            {
-                case MatrixPattern::BOX:
-                    val = 255;
-                    break;
-                case MatrixPattern::CROSS:
-                    val = ((r == (rows / 2)) || (c == (cols / 2))) ? 255 : 0;
-                    break;
-                case MatrixPattern::DISK:
-                    val = (((r - rows / 2.0f + 0.5f) * (r - rows / 2.0f + 0.5f)) / ((rows / 2.0f) * (rows / 2.0f)) + ((c - cols / 2.0f + 0.5f) * (c - cols / 2.0f + 0.5f)) / ((cols / 2.0f) *
-                            (cols / 2.0f))) <= 1.0f ? 255 : 0;
-                    break;
-                default:
-                    return;
-            }
-
-            mask[v] = val;
-        }
-    }
-}
-
-template <>
-void NENonLinearFilterKernel::median_filter_box<3, 3>(const Window &win)
-{
-    Iterator input(_input, win);
-    Iterator output(_output, win);
-
-    const auto input_top_ptr = static_cast<const unsigned char *>(_input->ptr_to_element(Coordinates(-1, -1)));
-    const auto input_mid_ptr = static_cast<const unsigned char *>(_input->ptr_to_element(Coordinates(-1, 0)));
-    const auto input_bot_ptr = static_cast<const unsigned char *>(_input->ptr_to_element(Coordinates(-1, 1)));
-
-    execute_window_loop(win, [&](const Coordinates &)
-    {
-        const uint8x16_t top_data = vld1q_u8(input_top_ptr + input.offset());
-        const uint8x16_t mid_data = vld1q_u8(input_mid_ptr + input.offset());
-        const uint8x16_t bot_data = vld1q_u8(input_bot_ptr + input.offset());
-
-        uint8x8_t p0 = vget_low_u8(top_data);
-        uint8x8_t p1 = vext_u8(vget_low_u8(top_data), vget_high_u8(top_data), 1);
-        uint8x8_t p2 = vext_u8(vget_low_u8(top_data), vget_high_u8(top_data), 2);
-        uint8x8_t p3 = vget_low_u8(mid_data);
-        uint8x8_t p4 = vext_u8(vget_low_u8(mid_data), vget_high_u8(mid_data), 1);
-        uint8x8_t p5 = vext_u8(vget_low_u8(mid_data), vget_high_u8(mid_data), 2);
-        uint8x8_t p6 = vget_low_u8(bot_data);
-        uint8x8_t p7 = vext_u8(vget_low_u8(bot_data), vget_high_u8(bot_data), 1);
-        uint8x8_t p8 = vext_u8(vget_low_u8(bot_data), vget_high_u8(bot_data), 2);
-
-        sort9(p0, p1, p2, p3, p4, p5, p6, p7, p8);
-
-        vst1_u8(output.ptr(), p4);
-    },
-    input, output);
-}
-template <>
-void NENonLinearFilterKernel::median_filter_box<5, 5>(const Window &win)
-{
-    Iterator input(_input, win);
-    Iterator output(_output, win);
-
-    const auto input_top2_ptr = static_cast<const unsigned char *>(_input->ptr_to_element(Coordinates(-2, -2)));
-    const auto input_top_ptr  = static_cast<const unsigned char *>(_input->ptr_to_element(Coordinates(-2, -1)));
-    const auto input_mid_ptr  = static_cast<const unsigned char *>(_input->ptr_to_element(Coordinates(-2, 0)));
-    const auto input_bot_ptr  = static_cast<const unsigned char *>(_input->ptr_to_element(Coordinates(-2, 1)));
-    const auto input_bot2_ptr = static_cast<const unsigned char *>(_input->ptr_to_element(Coordinates(-2, 2)));
-
-    execute_window_loop(win, [&](const Coordinates &)
-    {
-        const uint8x16_t top2_data = vld1q_u8(input_top2_ptr + input.offset());
-        const uint8x16_t top_data  = vld1q_u8(input_top_ptr + input.offset());
-        const uint8x16_t mid_data  = vld1q_u8(input_mid_ptr + input.offset());
-        const uint8x16_t bot_data  = vld1q_u8(input_bot_ptr + input.offset());
-        const uint8x16_t bot2_data = vld1q_u8(input_bot2_ptr + input.offset());
-
-        const std::array<uint8x8_t, 10> d =
-        {
-            vget_low_u8(top2_data),
-            vget_high_u8(top2_data),
-            vget_low_u8(top_data),
-            vget_high_u8(top_data),
-            vget_low_u8(mid_data),
-            vget_high_u8(mid_data),
-            vget_low_u8(bot_data),
-            vget_high_u8(bot_data),
-            vget_low_u8(bot2_data),
-            vget_high_u8(bot2_data)
-        };
-
-        std::array<uint8x8_t, 25> p{ 0 };
-        for(unsigned int i = 0; i < 5; ++i)
-        {
-            const unsigned int idx_d = i * 2;
-            const unsigned int idx_p = i * 5;
-
-            p[idx_p]     = d[idx_d];
-            p[idx_p + 1] = vext_u8(d[idx_d], d[idx_d + 1], 1);
-            p[idx_p + 2] = vext_u8(d[idx_d], d[idx_d + 1], 2);
-            p[idx_p + 3] = vext_u8(d[idx_d], d[idx_d + 1], 3);
-            p[idx_p + 4] = vext_u8(d[idx_d], d[idx_d + 1], 4);
-        }
-
-        sort25(p);
-
-        vst1_u8(output.ptr(), p[12]);
-    },
-    input, output);
-}
-
-template <int mask_w, int mask_h>
-void NENonLinearFilterKernel::min_filter_box(const Window &win)
-{
-    static_assert(mask_w > 0, "Mask size must not be 0");
-    static_assert(mask_h > 0, "Mask size must not be 0");
-
-    Iterator input(_input, win);
-    Iterator output(_output, win);
-
-    const int k_row_half = mask_h / 2;
-    const int k_col_half = mask_w / 2;
-
-    // Set row pointers
-    std::array<const unsigned char *, mask_h> input_ptrs{ {} };
-    for(int i = -k_row_half; i <= k_row_half; ++i)
-    {
-        input_ptrs[k_row_half + i] = _input->buffer() + _input->info()->offset_element_in_bytes(Coordinates(-k_col_half, i));
-    }
-
-    execute_window_loop(win, [&](const Coordinates &)
-    {
-        // Get min of rows
-        uint8x16_t rows_min = vld1q_u8(input_ptrs[0] + input.offset());
-
-        for(unsigned int r = 1; r < mask_h; ++r)
-        {
-            const uint8x16_t data = vld1q_u8(input_ptrs[r] + input.offset());
-            rows_min              = vminq_u8(rows_min, data);
-        }
-
-        const uint8x8_t out = min_row<mask_w>(rows_min);
-
-        // Store result as U8
-        vst1_u8(output.ptr(), out);
-    },
-    input, output);
-}
-
-template <int mask_w, int mask_h>
-void NENonLinearFilterKernel::max_filter_box(const Window &win)
-{
-    static_assert(mask_w > 0, "Mask size must not be 0");
-    static_assert(mask_h > 0, "Mask size must not be 0");
-    ARM_COMPUTE_ERROR_ON(_input->buffer() == nullptr);
-
-    Iterator input(_input, win);
-    Iterator output(_output, win);
-
-    const int k_row_half = mask_h / 2;
-    const int k_col_half = mask_w / 2;
-
-    // Set row pointers
-    std::array<const unsigned char *, mask_h> input_ptrs{ {} };
-    for(int i = -k_row_half; i <= k_row_half; ++i)
-    {
-        input_ptrs[k_row_half + i] = _input->buffer() + _input->info()->offset_element_in_bytes(Coordinates(-k_col_half, i));
-    }
-
-    execute_window_loop(win, [&](const Coordinates &)
-    {
-        uint8x16_t rows_max = vld1q_u8(input_ptrs[0] + input.offset());
-
-        // Get max of rows
-        for(unsigned int r = 1; r < mask_h; ++r)
-        {
-            const uint8x16_t data = vld1q_u8(input_ptrs[r] + input.offset());
-            rows_max              = vmaxq_u8(rows_max, data);
-        }
-
-        // Get max of columns
-        const uint8x8_t out = max_row<mask_w>(rows_max);
-
-        // Store result as U8
-        vst1_u8(output.ptr(), out);
-    },
-    input, output);
-}
-
-template <>
-void NENonLinearFilterKernel::median_filter_cross<3, 3>(const Window &win)
-{
-    Iterator input(_input, win);
-    Iterator output(_output, win);
-
-    const auto input_top_ptr = static_cast<const unsigned char *>(_input->ptr_to_element(Coordinates(0, -1)));
-    const auto input_mid_ptr = static_cast<const unsigned char *>(_input->ptr_to_element(Coordinates(-1, 0)));
-    const auto input_bot_ptr = static_cast<const unsigned char *>(_input->ptr_to_element(Coordinates(0, 1)));
-
-    execute_window_loop(win, [&](const Coordinates &)
-    {
-        const uint8x8_t  top_data = vld1_u8(input_top_ptr + input.offset());
-        const uint8x16_t mid_data = vld1q_u8(input_mid_ptr + input.offset());
-        const uint8x8_t  bot_data = vld1_u8(input_bot_ptr + input.offset());
-
-        uint8x8_t p0 = top_data;
-        uint8x8_t p1 = vget_low_u8(mid_data);
-        uint8x8_t p2 = vext_u8(vget_low_u8(mid_data), vget_high_u8(mid_data), 1);
-        uint8x8_t p3 = vext_u8(vget_low_u8(mid_data), vget_high_u8(mid_data), 2);
-        uint8x8_t p4 = bot_data;
-
-        sort5(p0, p1, p2, p3, p4);
-
-        vst1_u8(output.ptr(), p2);
-    },
-    input, output);
-}
-
-template <>
-void NENonLinearFilterKernel::median_filter_cross<5, 5>(const Window &win)
-{
-    Iterator input(_input, win);
-    Iterator output(_output, win);
-
-    const auto input_top2_ptr = static_cast<const unsigned char *>(_input->ptr_to_element(Coordinates(0, -2)));
-    const auto input_top_ptr  = static_cast<const unsigned char *>(_input->ptr_to_element(Coordinates(0, -1)));
-    const auto input_mid_ptr  = static_cast<const unsigned char *>(_input->ptr_to_element(Coordinates(-2, 0)));
-    const auto input_bot_ptr  = static_cast<const unsigned char *>(_input->ptr_to_element(Coordinates(0, 1)));
-    const auto input_bot2_ptr = static_cast<const unsigned char *>(_input->ptr_to_element(Coordinates(0, 2)));
-
-    execute_window_loop(win, [&](const Coordinates &)
-    {
-        const uint8x8_t  top2_data = vld1_u8(input_top2_ptr + input.offset());
-        const uint8x8_t  top_data  = vld1_u8(input_top_ptr + input.offset());
-        const uint8x16_t mid_data  = vld1q_u8(input_mid_ptr + input.offset());
-        const uint8x8_t  bot_data  = vld1_u8(input_bot_ptr + input.offset());
-        const uint8x8_t  bot2_data = vld1_u8(input_bot2_ptr + input.offset());
-
-        uint8x8_t p0 = top2_data;
-        uint8x8_t p1 = top_data;
-        uint8x8_t p2 = vget_low_u8(mid_data);
-        uint8x8_t p3 = vext_u8(vget_low_u8(mid_data), vget_high_u8(mid_data), 1);
-        uint8x8_t p4 = vext_u8(vget_low_u8(mid_data), vget_high_u8(mid_data), 2);
-        uint8x8_t p5 = vext_u8(vget_low_u8(mid_data), vget_high_u8(mid_data), 3);
-        uint8x8_t p6 = vext_u8(vget_low_u8(mid_data), vget_high_u8(mid_data), 4);
-        uint8x8_t p7 = bot_data;
-        uint8x8_t p8 = bot2_data;
-
-        sort9(p0, p1, p2, p3, p4, p5, p6, p7, p8);
-
-        vst1_u8(output.ptr(), p4);
-    },
-    input, output);
-}
-
-template <int mask_w, int mask_h>
-void NENonLinearFilterKernel::min_filter_cross(const Window &win)
-{
-    static_assert(mask_w > 0, "Mask size must not be 0");
-    static_assert(mask_h > 0, "Mask size must not be 0");
-    ARM_COMPUTE_ERROR_ON(_input->buffer() == nullptr);
-
-    Iterator input(_input, win);
-    Iterator output(_output, win);
-
-    const int k_row_half = mask_h / 2;
-    const int k_col_half = mask_w / 2;
-
-    const unsigned char *mid_ptr = _input->buffer() + _input->info()->offset_element_in_bytes(Coordinates(-k_col_half, 0));
-
-    // Set row pointers
-    std::array<const unsigned char *, mask_h> input_ptrs{ {} };
-    for(int i = -k_row_half; i <= k_row_half; ++i)
-    {
-        input_ptrs[k_row_half + i] = _input->buffer() + _input->info()->offset_element_in_bytes(Coordinates(0, i));
-    }
-
-    execute_window_loop(win, [&](const Coordinates &)
-    {
-        uint8x8_t rows_min = vld1_u8(input_ptrs[0] + input.offset());
-
-        // Get min of rows
-        for(unsigned int r = 1; r < mask_h; ++r)
-        {
-            const uint8x8_t data = vld1_u8(input_ptrs[r] + input.offset());
-            rows_min             = vmin_u8(rows_min, data);
-        }
-
-        // Get min of middle row
-        const uint8x16_t data = vld1q_u8(mid_ptr + input.offset());
-        uint8x8_t        out  = min_row<mask_w>(data);
-
-        // Get final min
-        out = vmin_u8(out, rows_min);
-
-        // Store result as U8
-        vst1_u8(output.ptr(), out);
-    },
-    input, output);
-}
-
-template <int mask_w, int mask_h>
-void NENonLinearFilterKernel::max_filter_cross(const Window &win)
-{
-    static_assert(mask_w > 0, "Mask size must not be 0");
-    static_assert(mask_h > 0, "Mask size must not be 0");
-    ARM_COMPUTE_ERROR_ON(_input->buffer() == nullptr);
-
-    Iterator input(_input, win);
-    Iterator output(_output, win);
-
-    const int k_row_half = mask_h / 2;
-    const int k_col_half = mask_w / 2;
-
-    const unsigned char *mid_ptr = _input->buffer() + _input->info()->offset_element_in_bytes(Coordinates(-k_col_half, 0));
-
-    // Set row pointers
-    std::array<unsigned char *, mask_h> input_ptrs{ {} };
-    for(int i = -k_row_half; i <= k_row_half; ++i)
-    {
-        input_ptrs[k_row_half + i] = _input->buffer() + _input->info()->offset_element_in_bytes(Coordinates(0, i));
-    }
-
-    execute_window_loop(win, [&](const Coordinates &)
-    {
-        uint8x8_t rows_max = vld1_u8(input_ptrs[0] + input.offset());
-
-        // Get max of rows
-        for(unsigned int r = 1; r < mask_h; ++r)
-        {
-            const uint8x8_t data = vld1_u8(input_ptrs[r] + input.offset());
-            rows_max             = vmax_u8(rows_max, data);
-        }
-
-        // Get max of middle row
-        const uint8x16_t data = vld1q_u8(mid_ptr + input.offset());
-        uint8x8_t        out  = max_row<mask_w>(data);
-
-        // Get final max
-        out = vmax_u8(out, rows_max);
-
-        // Store result as U8
-        vst1_u8(output.ptr(), out);
-    },
-    input, output);
-}
-
-template <>
-void NENonLinearFilterKernel::median_filter_disk<5, 5>(const Window &win)
-{
-    Iterator input(_input, win);
-    Iterator output(_output, win);
-
-    static const uint8x16_t zero           = vdupq_n_u8(0);
-    const auto              input_top2_ptr = static_cast<const unsigned char *>(_input->ptr_to_element(Coordinates(-2, -2)));
-    const auto              input_top_ptr  = static_cast<const unsigned char *>(_input->ptr_to_element(Coordinates(-2, -1)));
-    const auto              input_mid_ptr  = static_cast<const unsigned char *>(_input->ptr_to_element(Coordinates(-2, 0)));
-    const auto              input_bot_ptr  = static_cast<const unsigned char *>(_input->ptr_to_element(Coordinates(-2, 1)));
-    const auto              input_bot2_ptr = static_cast<const unsigned char *>(_input->ptr_to_element(Coordinates(-2, 2)));
-
-    execute_window_loop(win, [&](const Coordinates &)
-    {
-        const uint8x16_t top2_data = vextq_u8(vld1q_u8(input_top2_ptr + input.offset()), zero, 1);
-        const uint8x16_t top_data  = vld1q_u8(input_top_ptr + input.offset());
-        const uint8x16_t mid_data  = vld1q_u8(input_mid_ptr + input.offset());
-        const uint8x16_t bot_data  = vld1q_u8(input_bot_ptr + input.offset());
-        const uint8x16_t bot2_data = vextq_u8(vld1q_u8(input_bot2_ptr + input.offset()), zero, 1);
-
-        std::array<uint8x8_t, 10> d =
-        {
-            vget_low_u8(top2_data),
-            vget_high_u8(top2_data),
-            vget_low_u8(top_data),
-            vget_high_u8(top_data),
-            vget_low_u8(mid_data),
-            vget_high_u8(mid_data),
-            vget_low_u8(bot_data),
-            vget_high_u8(bot_data),
-            vget_low_u8(bot2_data),
-            vget_high_u8(bot2_data)
-        };
-
-        std::array<uint8x8_t, 21> p{ 0 };
-        p[0]  = d[0];
-        p[1]  = vext_u8(d[0], d[1], 1);
-        p[2]  = vext_u8(d[0], d[1], 2);
-        p[18] = d[8];
-        p[19] = vext_u8(d[8], d[9], 1);
-        p[20] = vext_u8(d[8], d[9], 2);
-
-        for(unsigned int i = 0; i < 3; ++i)
-        {
-            const unsigned int idx_d = 2 + i * 2;
-            const unsigned int idx_p = 3 + i * 5;
-
-            p[idx_p]     = d[idx_d];
-            p[idx_p + 1] = vext_u8(d[idx_d], d[idx_d + 1], 1);
-            p[idx_p + 2] = vext_u8(d[idx_d], d[idx_d + 1], 2);
-            p[idx_p + 3] = vext_u8(d[idx_d], d[idx_d + 1], 3);
-            p[idx_p + 4] = vext_u8(d[idx_d], d[idx_d + 1], 4);
-        }
-
-        sort21(p);
-
-        vst1_u8(output.ptr(), p[10]);
-    },
-    input, output);
-}
-
-template <>
-void NENonLinearFilterKernel::min_filter_disk<5, 5>(const Window &win)
-{
-    Iterator input(_input, win);
-    Iterator output(_output, win);
-
-    static const uint8x16_t zero           = vdupq_n_u8(0);
-    const auto              input_top2_ptr = static_cast<const unsigned char *>(_input->ptr_to_element(Coordinates(-2, -2)));
-    const auto              input_top_ptr  = static_cast<const unsigned char *>(_input->ptr_to_element(Coordinates(-2, -1)));
-    const auto              input_mid_ptr  = static_cast<const unsigned char *>(_input->ptr_to_element(Coordinates(-2, 0)));
-    const auto              input_bot_ptr  = static_cast<const unsigned char *>(_input->ptr_to_element(Coordinates(-2, 1)));
-    const auto              input_bot2_ptr = static_cast<const unsigned char *>(_input->ptr_to_element(Coordinates(-2, 2)));
-
-    execute_window_loop(win, [&](const Coordinates &)
-    {
-        const uint8x16_t top2_data = vextq_u8(vld1q_u8(input_top2_ptr + input.offset()), zero, 1);
-        const uint8x16_t top_data  = vld1q_u8(input_top_ptr + input.offset());
-        const uint8x16_t mid_data  = vld1q_u8(input_mid_ptr + input.offset());
-        const uint8x16_t bot_data  = vld1q_u8(input_bot_ptr + input.offset());
-        const uint8x16_t bot2_data = vextq_u8(vld1q_u8(input_bot2_ptr + input.offset()), zero, 1);
-
-        const uint8x16_t rows_min_3 = vminq_u8(top2_data, bot2_data);
-        uint8x16_t       rows_min_5 = vminq_u8(top_data, bot_data);
-        rows_min_5                  = vminq_u8(rows_min_5, mid_data);
-
-        const uint8x8_t out_3 = min_row<3>(rows_min_3);
-        const uint8x8_t out_5 = min_row<5>(rows_min_5);
-
-        vst1_u8(output.ptr(), vmin_u8(out_3, out_5));
-    },
-    input, output);
-}
-
-template <>
-void NENonLinearFilterKernel::max_filter_disk<5, 5>(const Window &win)
-{
-    Iterator input(_input, win);
-    Iterator output(_output, win);
-
-    static const uint8x16_t zero           = vdupq_n_u8(0);
-    const auto              input_top2_ptr = static_cast<const unsigned char *>(_input->ptr_to_element(Coordinates(-2, -2)));
-    const auto              input_top_ptr  = static_cast<const unsigned char *>(_input->ptr_to_element(Coordinates(-2, -1)));
-    const auto              input_mid_ptr  = static_cast<const unsigned char *>(_input->ptr_to_element(Coordinates(-2, 0)));
-    const auto              input_bot_ptr  = static_cast<const unsigned char *>(_input->ptr_to_element(Coordinates(-2, 1)));
-    const auto              input_bot2_ptr = static_cast<const unsigned char *>(_input->ptr_to_element(Coordinates(-2, 2)));
-
-    execute_window_loop(win, [&](const Coordinates &)
-    {
-        const uint8x16_t top2_data = vextq_u8(vld1q_u8(input_top2_ptr + input.offset()), zero, 1);
-        const uint8x16_t top_data  = vld1q_u8(input_top_ptr + input.offset());
-        const uint8x16_t mid_data  = vld1q_u8(input_mid_ptr + input.offset());
-        const uint8x16_t bot_data  = vld1q_u8(input_bot_ptr + input.offset());
-        const uint8x16_t bot2_data = vextq_u8(vld1q_u8(input_bot2_ptr + input.offset()), zero, 1);
-
-        const uint8x16_t rows_max_3 = vmaxq_u8(top2_data, bot2_data);
-        uint8x16_t       rows_max_5 = vmaxq_u8(top_data, bot_data);
-        rows_max_5                  = vmaxq_u8(rows_max_5, mid_data);
-
-        const uint8x8_t out_3 = max_row<3>(rows_max_3);
-        const uint8x8_t out_5 = max_row<5>(rows_max_5);
-
-        vst1_u8(output.ptr(), vmax_u8(out_3, out_5));
-    },
-    input, output);
-}
-
-template <int mask_w, int mask_h>
-void NENonLinearFilterKernel::non_linear_filter_generic(const Window &win)
-{
-    Iterator input(_input, win);
-    Iterator output(_output, win);
-    ARM_COMPUTE_ERROR_ON(_input->buffer() == nullptr);
-
-    const int     k_row_half = mask_h / 2;
-    const int     k_col_half = mask_w / 2;
-    constexpr int mask_size  = mask_w * mask_h;
-
-    // Set row pointers
-    std::array<unsigned char *, mask_h> input_ptrs{ {} };
-    for(int i = -k_row_half; i <= k_row_half; ++i)
-    {
-        input_ptrs[k_row_half + i] = _input->buffer() + _input->info()->offset_element_in_bytes(Coordinates(-k_col_half, i));
-    }
-
-    std::array<uint8_t, mask_size> vals{ {} };
-
-    execute_window_loop(win, [&](const Coordinates &)
-    {
-        // Clear array
-        std::fill(std::begin(vals), std::end(vals), 0);
-
-        size_t v = 0;
-        size_t m = 0;
-
-        for(unsigned int r = 0; r < mask_h; ++r)
-        {
-            const auto in_ptr = static_cast<const uint8_t *>(input_ptrs[r] + input.offset());
-
-            for(unsigned int c = 0; c < mask_w; ++c, ++m)
-            {
-                if(_mask[m] == 255)
-                {
-                    vals[v] = in_ptr[c];
-                    ++v;
-                }
-            }
-        }
-
-        // Only do something if there is at least one non-zero element in the
-        // mask
-        if(v > 0)
-        {
-            std::sort(vals.begin(), vals.begin() + v);
-
-            switch(_function)
-            {
-                case NonLinearFilterFunction::MIN:
-                    *output.ptr() = vals[0];
-                    break;
-                case NonLinearFilterFunction::MAX:
-                    *output.ptr() = vals[v - 1];
-                    break;
-                case NonLinearFilterFunction::MEDIAN:
-                    *output.ptr() = vals[v / 2];
-                    break;
-                default:
-                    break;
-            }
-        }
-    },
-    input, output);
-}
-
-void NENonLinearFilterKernel::run(const Window &window, const ThreadInfo &info)
-{
-    ARM_COMPUTE_UNUSED(info);
-    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
-    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
-
-    using NonLinearFilterFunction = void (NENonLinearFilterKernel::*)(const Window & window);
-
-    // Function table for BOX pattern
-    static const std::array<NonLinearFilterFunction, 6> func_table_box =
-    {
-        {
-            &NENonLinearFilterKernel::median_filter_box<3, 3>,
-            &NENonLinearFilterKernel::min_filter_box<3, 3>,
-            &NENonLinearFilterKernel::max_filter_box<3, 3>,
-            &NENonLinearFilterKernel::median_filter_box<5, 5>,
-            &NENonLinearFilterKernel::min_filter_box<5, 5>,
-            &NENonLinearFilterKernel::max_filter_box<5, 5>,
-        }
-    };
-
-    // Function table for CROSS pattern
-    static const std::array<NonLinearFilterFunction, 6> func_table_cross =
-    {
-        {
-            &NENonLinearFilterKernel::median_filter_cross<3, 3>,
-            &NENonLinearFilterKernel::min_filter_cross<3, 3>,
-            &NENonLinearFilterKernel::max_filter_cross<3, 3>,
-            &NENonLinearFilterKernel::median_filter_cross<5, 5>,
-            &NENonLinearFilterKernel::min_filter_cross<5, 5>,
-            &NENonLinearFilterKernel::max_filter_cross<5, 5>,
-        }
-    };
-
-    // Function table for DISK pattern
-    static const std::array<NonLinearFilterFunction, 6> func_table_disk =
-    {
-        {
-            &NENonLinearFilterKernel::median_filter_box<3, 3>,
-            &NENonLinearFilterKernel::min_filter_box<3, 3>,
-            &NENonLinearFilterKernel::max_filter_box<3, 3>,
-            &NENonLinearFilterKernel::median_filter_disk<5, 5>,
-            &NENonLinearFilterKernel::min_filter_disk<5, 5>,
-            &NENonLinearFilterKernel::max_filter_disk<5, 5>,
-        }
-    };
-
-    // Function table for OTHER pattern
-    static const std::array<NonLinearFilterFunction, 2> func_table_generic =
-    {
-        {
-            &NENonLinearFilterKernel::non_linear_filter_generic<3, 3>,
-            &NENonLinearFilterKernel::non_linear_filter_generic<5, 5>,
-        }
-    };
-
-    switch(_pattern)
-    {
-        case MatrixPattern::BOX:
-            ARM_COMPUTE_ERROR_ON(_func_idx >= func_table_box.size());
-            (this->*func_table_box[_func_idx])(window);
-            break;
-        case MatrixPattern::CROSS:
-            ARM_COMPUTE_ERROR_ON(_func_idx >= func_table_cross.size());
-            (this->*func_table_cross[_func_idx])(window);
-            break;
-        case MatrixPattern::DISK:
-            ARM_COMPUTE_ERROR_ON(_func_idx >= func_table_disk.size());
-            (this->*func_table_disk[_func_idx])(window);
-            break;
-        case MatrixPattern::OTHER:
-        default:
-            ARM_COMPUTE_ERROR_ON(_func_idx >= func_table_generic.size());
-            (this->*func_table_generic[_func_idx])(window);
-            break;
-    }
-}
-} // namespace arm_compute
diff --git a/src/core/NEON/kernels/NENonLinearFilterKernel.h b/src/core/NEON/kernels/NENonLinearFilterKernel.h
deleted file mode 100644
index 3cef12e8ec..0000000000
--- a/src/core/NEON/kernels/NENonLinearFilterKernel.h
+++ /dev/null
@@ -1,153 +0,0 @@
-/*
- * Copyright (c) 2016-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_NENONLINEARFILTERKERNEL_H
-#define ARM_COMPUTE_NENONLINEARFILTERKERNEL_H
-
-#include "arm_compute/core/Types.h"
-#include "src/core/NEON/INEKernel.h"
-
-#include <cstdint>
-
-namespace arm_compute
-{
-class ITensor;
-
-/** Interface for the kernel to apply a non-linear filter */
-class NENonLinearFilterKernel : public INEKernel
-{
-public:
-    const char *name() const override
-    {
-        return "NENonLinearFilterKernel";
-    }
-    /** Default constructor */
-    NENonLinearFilterKernel();
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    NENonLinearFilterKernel(NENonLinearFilterKernel &) = delete;
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    NENonLinearFilterKernel &operator=(NENonLinearFilterKernel &) = delete;
-    /** Allow instances of this class to be moved */
-    NENonLinearFilterKernel(NENonLinearFilterKernel &&) = default;
-    /** Allow instances of this class to be moved */
-    NENonLinearFilterKernel &operator=(NENonLinearFilterKernel &&) = default;
-    /** Default destructor */
-    ~NENonLinearFilterKernel() = default;
-    /** Set the source, destination and border mode of the kernel
-     *
-     * @param[in]  input            Source tensor. Data type supported: U8
-     * @param[out] output           Destination tensor. Data type supported: U8
-     * @param[in]  function         Non linear function to perform
-     * @param[in]  mask_size        Mask size. Supported sizes: 3, 5
-     * @param[in]  pattern          Mask pattern
-     * @param[in]  mask             The given mask. Will be used only if pattern is specified to PATTERN_OTHER
-     * @param[in]  border_undefined True if the border mode is undefined. False if it's replicate or constant.
-     */
-    void configure(const ITensor *input, ITensor *output, NonLinearFilterFunction function, unsigned int mask_size, MatrixPattern pattern, const uint8_t *mask, bool border_undefined);
-
-    // Inherited methods overridden:
-    void run(const Window &window, const ThreadInfo &info) override;
-    BorderSize border_size() const override;
-
-private:
-    /** Fill mask with the corresponding given pattern.
-     *
-     * @param[in,out] mask    Mask to be filled according to pattern
-     * @param[in]     cols    Columns (width) of mask
-     * @param[in]     rows    Rows (height) of mask
-     * @param[in]     pattern Pattern to fill the mask according to
-     */
-    void fill_mask(uint8_t *mask, int cols, int rows, MatrixPattern pattern);
-    /** Apply a median filter when given mask pattern is defined as box.
-     *
-     * @param[in] win Window to apply the filter on.
-     */
-    template <int mask_w, int mask_h>
-    void median_filter_box(const Window &win);
-    /** Apply a min filter when given mask pattern is defined as box.
-     *
-     * @param[in] win Window to apply the filter on.
-     */
-    template <int mask_w, int mask_h>
-    void min_filter_box(const Window &win);
-    /** Apply a max filter when given mask pattern is defined as box.
-     *
-     * @param[in] win Window to apply the filter on.
-     */
-    template <int mask_w, int mask_h>
-    void max_filter_box(const Window &win);
-    /** Apply a median filter when given mask pattern is defined as cross.
-     *
-     * @param[in] win Window to apply the filter on.
-     */
-    template <int mask_w, int mask_h>
-    void median_filter_cross(const Window &win);
-    /** Apply a min filter when given mask pattern is defined as cross.
-     *
-     * @param[in] win Window to apply the filter on.
-     */
-    template <int mask_w, int mask_h>
-    void min_filter_cross(const Window &win);
-    /** Apply a max filter when given mask pattern is defined as cross.
-     *
-     * @param[in] win Window to apply the filter on.
-     */
-    template <int mask_w, int mask_h>
-    void max_filter_cross(const Window &win);
-    /** Apply a median filter when given mask pattern is defined as disk.
-     *
-     * @param[in] win Window to apply the filter on.
-     */
-    template <int mask_w, int mask_h>
-    void median_filter_disk(const Window &win);
-    /** Apply a min filter when given mask pattern is defined as disk.
-     *
-     * @param[in] win Window to apply the filter on.
-     */
-    template <int mask_w, int mask_h>
-    void min_filter_disk(const Window &win);
-    /** Apply a max filter when given mask pattern is defined as disk.
-     *
-     * @param[in] win Window to apply the filter on.
-     */
-    template <int mask_w, int mask_h>
-    void max_filter_disk(const Window &win);
-    /** Apply a non-linear filter when given mask has user-defined pattern.
-     *
-     * @param[in] win Window to apply the filter on.
-     */
-    template <int mask_w, int mask_h>
-    void non_linear_filter_generic(const Window &win);
-
-private:
-    unsigned int            _border_width;
-    const ITensor          *_input;
-    ITensor                *_output;
-    const uint8_t          *_mask;
-    MatrixPattern           _pattern;
-    NonLinearFilterFunction _function;
-    unsigned int            _func_idx;
-    BorderSize              _border_size;
-};
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_NENONLINEARFILTERKERNEL_H */
diff --git a/src/core/NEON/kernels/NENonMaximaSuppression3x3Kernel.h b/src/core/NEON/kernels/NENonMaximaSuppression3x3Kernel.h
index 7b0bc0c720..4194dac68e 100644
--- a/src/core/NEON/kernels/NENonMaximaSuppression3x3Kernel.h
+++ b/src/core/NEON/kernels/NENonMaximaSuppression3x3Kernel.h
@@ -34,7 +34,6 @@ class ITensor;
 
 /** Interface to perform Non-Maxima suppression over a 3x3 window using Neon
  *
- * @note Used by @ref NEFastCorners and @ref NEHarrisCorners
  */
 class NENonMaximaSuppression3x3Kernel : public INEKernel
 {
diff --git a/src/core/NEON/kernels/NERemapKernel.cpp b/src/core/NEON/kernels/NERemapKernel.cpp
deleted file mode 100644
index b334a11227..0000000000
--- a/src/core/NEON/kernels/NERemapKernel.cpp
+++ /dev/null
@@ -1,237 +0,0 @@
-/*
- * Copyright (c) 2017-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "src/core/NEON/kernels/NERemapKernel.h"
-
-#include "arm_compute/core/Error.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/ITensor.h"
-#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Validate.h"
-#include "arm_compute/core/Window.h"
-#include "src/core/AccessWindowStatic.h"
-#include "src/core/helpers/AutoConfiguration.h"
-#include "src/core/helpers/ScaleHelpers.h"
-#include "src/core/helpers/WindowHelpers.h"
-
-#include <arm_neon.h>
-#include <cstddef>
-#include <cstdint>
-
-using namespace arm_compute;
-
-namespace arm_compute
-{
-class Coordinates;
-} // namespace arm_compute
-
-namespace
-{
-inline int32x4_t offset_nearest_interpolation(const float *mapx_ptr, const float *mapy_ptr, const float32x4_t &width, const float32x4_t &height, const int32x4_t &stride)
-{
-    const float32x4_t lowerxy = vdupq_n_f32(-1.f);
-
-    float32x4_t x = vld1q_f32(mapx_ptr);
-    float32x4_t y = vld1q_f32(mapy_ptr);
-
-    // Clamp x coordinates
-    x = vmaxq_f32(lowerxy, vminq_f32(x, width));
-    y = vmaxq_f32(lowerxy, vminq_f32(y, height));
-
-    const int32x4_t x_s32 = vcvtq_s32_f32(x);
-    const int32x4_t y_s32 = vcvtq_s32_f32(y);
-
-    return vmlaq_s32(x_s32, y_s32, stride);
-}
-
-} // namespace
-
-NERemapKernel::NERemapKernel()
-    : _func(nullptr), _input(nullptr), _output(nullptr), _map_x(nullptr), _map_y(nullptr)
-{
-}
-
-BorderSize NERemapKernel::border_size() const
-{
-    return BorderSize(1);
-}
-
-void NERemapKernel::configure(const ITensor *input, const ITensor *map_x, const ITensor *map_y, ITensor *output, InterpolationPolicy policy)
-{
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8);
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(map_x, 1, DataType::F32);
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(map_y, 1, DataType::F32);
-
-    _input  = input;
-    _output = output;
-    _map_x  = map_x;
-    _map_y  = map_y;
-
-    switch(policy)
-    {
-        case InterpolationPolicy::NEAREST_NEIGHBOR:
-        {
-            _func = &NERemapKernel::remap_nearest;
-            break;
-        }
-        case InterpolationPolicy::BILINEAR:
-        {
-            _func = &NERemapKernel::remap_bilinear;
-            break;
-        }
-        default:
-            ARM_COMPUTE_ERROR("Unsupported interpolation mode");
-            break;
-    }
-
-    constexpr unsigned int num_elems_processed_per_iteration = 16;
-
-    // Configure kernel window
-    Window win = calculate_max_window(*output->info(), Steps(num_elems_processed_per_iteration));
-
-    const int total_right  = ceil_to_multiple(input->info()->dimension(0), num_elems_processed_per_iteration);
-    const int access_right = total_right + (((total_right - input->info()->dimension(0)) == 0) ? border_size().right : 0);
-
-    AccessWindowStatic input_access(input->info(), -border_size().left, -border_size().top, access_right, input->info()->dimension(1) + border_size().bottom);
-
-    AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration);
-    AccessWindowHorizontal mapx_access(map_x->info(), 0, num_elems_processed_per_iteration);
-    AccessWindowHorizontal mapy_access(map_y->info(), 0, num_elems_processed_per_iteration);
-
-    update_window_and_padding(win, input_access, mapx_access, mapy_access, output_access);
-
-    output_access.set_valid_region(win, ValidRegion(Coordinates(), output->info()->tensor_shape()));
-
-    INEKernel::configure(win);
-}
-
-void NERemapKernel::remap_nearest(const Window &window)
-{
-    // Don't increment in X and Y direction for the input tensor
-    // A pointer to the start of this plane is needed as base for the precomputed offsets
-    Window win_in(window);
-    win_in.set(Window::DimX, Window::Dimension(0, 0, 0));
-    win_in.set(Window::DimY, Window::Dimension(0, 0, 0));
-
-    Iterator in(_input, win_in);
-    Iterator out(_output, window);
-    Iterator mapx(_map_x, window);
-    Iterator mapy(_map_y, window);
-
-    const float32x4_t width     = vdupq_n_f32(static_cast<float>(_input->info()->dimension(0)));
-    const float32x4_t height    = vdupq_n_f32(static_cast<float>(_input->info()->dimension(1)));
-    const int32x4_t   in_stride = vdupq_n_s32(static_cast<int32_t>(_input->info()->strides_in_bytes()[1]));
-
-    execute_window_loop(window, [&](const Coordinates &)
-    {
-        const auto     mapx_ptr = reinterpret_cast<const float *>(mapx.ptr());
-        const auto     mapy_ptr = reinterpret_cast<const float *>(mapy.ptr());
-        const uint8_t *in_ptr   = in.ptr();
-
-        const int32x4_t offset0 = offset_nearest_interpolation(mapx_ptr + 0, mapy_ptr + 0, width, height, in_stride);
-        const int32x4_t offset1 = offset_nearest_interpolation(mapx_ptr + 4, mapy_ptr + 4, width, height, in_stride);
-        const int32x4_t offset2 = offset_nearest_interpolation(mapx_ptr + 8, mapy_ptr + 8, width, height, in_stride);
-        const int32x4_t offset3 = offset_nearest_interpolation(mapx_ptr + 12, mapy_ptr + 12, width, height, in_stride);
-
-        uint8x16_t tmp = vdupq_n_u8(0);
-        tmp            = vsetq_lane_u8(in_ptr[vgetq_lane_s32(offset0, 0)], tmp, 0);
-        tmp            = vsetq_lane_u8(in_ptr[vgetq_lane_s32(offset0, 1)], tmp, 1);
-        tmp            = vsetq_lane_u8(in_ptr[vgetq_lane_s32(offset0, 2)], tmp, 2);
-        tmp            = vsetq_lane_u8(in_ptr[vgetq_lane_s32(offset0, 3)], tmp, 3);
-        tmp            = vsetq_lane_u8(in_ptr[vgetq_lane_s32(offset1, 0)], tmp, 4);
-        tmp            = vsetq_lane_u8(in_ptr[vgetq_lane_s32(offset1, 1)], tmp, 5);
-        tmp            = vsetq_lane_u8(in_ptr[vgetq_lane_s32(offset1, 2)], tmp, 6);
-        tmp            = vsetq_lane_u8(in_ptr[vgetq_lane_s32(offset1, 3)], tmp, 7);
-        tmp            = vsetq_lane_u8(in_ptr[vgetq_lane_s32(offset2, 0)], tmp, 8);
-        tmp            = vsetq_lane_u8(in_ptr[vgetq_lane_s32(offset2, 1)], tmp, 9);
-        tmp            = vsetq_lane_u8(in_ptr[vgetq_lane_s32(offset2, 2)], tmp, 10);
-        tmp            = vsetq_lane_u8(in_ptr[vgetq_lane_s32(offset2, 3)], tmp, 11);
-        tmp            = vsetq_lane_u8(in_ptr[vgetq_lane_s32(offset3, 0)], tmp, 12);
-        tmp            = vsetq_lane_u8(in_ptr[vgetq_lane_s32(offset3, 1)], tmp, 13);
-        tmp            = vsetq_lane_u8(in_ptr[vgetq_lane_s32(offset3, 2)], tmp, 14);
-        tmp            = vsetq_lane_u8(in_ptr[vgetq_lane_s32(offset3, 3)], tmp, 15);
-        vst1q_u8(out.ptr(), tmp);
-    },
-    in, out, mapx, mapy);
-}
-
-void NERemapKernel::remap_bilinear(const Window &window)
-{
-    using namespace scale_helpers;
-
-    // Don't increment in X and Y direction for the input tensor
-    // A pointer to the start of this plane is needed as base for the precomputed offsets
-    Window win_in(window);
-    win_in.set(Window::DimX, Window::Dimension(0, 0, 0));
-    win_in.set(Window::DimY, Window::Dimension(0, 0, 0));
-
-    Iterator in(_input, win_in);
-    Iterator out(_output, window);
-    Iterator mapx(_map_x, window);
-    Iterator mapy(_map_y, window);
-
-    const size_t width     = _input->info()->dimension(0);
-    const size_t height    = _input->info()->dimension(1);
-    const size_t in_stride = _input->info()->strides_in_bytes()[1];
-
-    execute_window_loop(window, [&](const Coordinates &)
-    {
-        const auto     mapx_ptr = reinterpret_cast<float *>(mapx.ptr());
-        const auto     mapy_ptr = reinterpret_cast<float *>(mapy.ptr());
-        const uint8_t *in_ptr   = in.ptr();
-
-        uint8x8_t tmp0 = vdup_n_u8(0);
-        tmp0           = vset_lane_u8(pixel_bilinear_c1_clamp(in_ptr, in_stride, width, height, mapx_ptr[0], mapy_ptr[0]), tmp0, 0);
-        tmp0           = vset_lane_u8(pixel_bilinear_c1_clamp(in_ptr, in_stride, width, height, mapx_ptr[1], mapy_ptr[1]), tmp0, 1);
-        tmp0           = vset_lane_u8(pixel_bilinear_c1_clamp(in_ptr, in_stride, width, height, mapx_ptr[2], mapy_ptr[2]), tmp0, 2);
-        tmp0           = vset_lane_u8(pixel_bilinear_c1_clamp(in_ptr, in_stride, width, height, mapx_ptr[3], mapy_ptr[3]), tmp0, 3);
-        tmp0           = vset_lane_u8(pixel_bilinear_c1_clamp(in_ptr, in_stride, width, height, mapx_ptr[4], mapy_ptr[4]), tmp0, 4);
-        tmp0           = vset_lane_u8(pixel_bilinear_c1_clamp(in_ptr, in_stride, width, height, mapx_ptr[5], mapy_ptr[5]), tmp0, 5);
-        tmp0           = vset_lane_u8(pixel_bilinear_c1_clamp(in_ptr, in_stride, width, height, mapx_ptr[6], mapy_ptr[6]), tmp0, 6);
-        tmp0           = vset_lane_u8(pixel_bilinear_c1_clamp(in_ptr, in_stride, width, height, mapx_ptr[7], mapy_ptr[7]), tmp0, 7);
-
-        uint8x8_t tmp1 = vdup_n_u8(0);
-        tmp1           = vset_lane_u8(pixel_bilinear_c1_clamp(in_ptr, in_stride, width, height, mapx_ptr[8], mapy_ptr[8]), tmp1, 0);
-        tmp1           = vset_lane_u8(pixel_bilinear_c1_clamp(in_ptr, in_stride, width, height, mapx_ptr[9], mapy_ptr[9]), tmp1, 1);
-        tmp1           = vset_lane_u8(pixel_bilinear_c1_clamp(in_ptr, in_stride, width, height, mapx_ptr[10], mapy_ptr[10]), tmp1, 2);
-        tmp1           = vset_lane_u8(pixel_bilinear_c1_clamp(in_ptr, in_stride, width, height, mapx_ptr[11], mapy_ptr[11]), tmp1, 3);
-        tmp1           = vset_lane_u8(pixel_bilinear_c1_clamp(in_ptr, in_stride, width, height, mapx_ptr[12], mapy_ptr[12]), tmp1, 4);
-        tmp1           = vset_lane_u8(pixel_bilinear_c1_clamp(in_ptr, in_stride, width, height, mapx_ptr[13], mapy_ptr[13]), tmp1, 5);
-        tmp1           = vset_lane_u8(pixel_bilinear_c1_clamp(in_ptr, in_stride, width, height, mapx_ptr[14], mapy_ptr[14]), tmp1, 6);
-        tmp1           = vset_lane_u8(pixel_bilinear_c1_clamp(in_ptr, in_stride, width, height, mapx_ptr[15], mapy_ptr[15]), tmp1, 7);
-
-        vst1q_u8(out.ptr(), vcombine_u8(tmp0, tmp1));
-    },
-    in, out, mapx, mapy);
-}
-
-void NERemapKernel::run(const Window &window, const ThreadInfo &info)
-{
-    ARM_COMPUTE_UNUSED(info);
-    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
-    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
-    ARM_COMPUTE_ERROR_ON(_func == nullptr);
-
-    (this->*_func)(window);
-}
diff --git a/src/core/NEON/kernels/NERemapKernel.h b/src/core/NEON/kernels/NERemapKernel.h
deleted file mode 100644
index adc7f4bdd5..0000000000
--- a/src/core/NEON/kernels/NERemapKernel.h
+++ /dev/null
@@ -1,83 +0,0 @@
-/*
- * Copyright (c) 2016-2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_NEREMAPKERNEL_H
-#define ARM_COMPUTE_NEREMAPKERNEL_H
-
-#include "arm_compute/core/Types.h"
-#include "src/core/NEON/INEKernel.h"
-
-namespace arm_compute
-{
-class ITensor;
-
-/** Neon kernel to perform a remap on a tensor */
-class NERemapKernel : public INEKernel
-{
-public:
-    const char *name() const override
-    {
-        return "NERemapKernel";
-    }
-    /** Default constructor */
-    NERemapKernel();
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    NERemapKernel(const NERemapKernel &) = delete;
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    NERemapKernel &operator=(const NERemapKernel &) = delete;
-    /** Allow instances of this class to be moved */
-    NERemapKernel(NERemapKernel &&) = default;
-    /** Allow instances of this class to be moved */
-    NERemapKernel &operator=(NERemapKernel &&) = default;
-    /** Default destructor */
-    ~NERemapKernel() = default;
-
-    /** Initialize the kernel's input, output and border mode.
-     *
-     * @param[in]  input  Source tensor. Data type supported: U8.
-     * @param[in]  map_x  Map for X coordinates. Data type supported: F32.
-     * @param[in]  map_y  Map for Y coordinates. Data type supported: F32.
-     * @param[out] output Destination tensor. Data types supported: U8. All but the lowest two dimensions must be the same size as in the input tensor, i.e. remapping is only performed within the XY-plane.
-     * @param[in]  policy The interpolation type.
-     */
-    void configure(const ITensor *input, const ITensor *map_x, const ITensor *map_y, ITensor *output, InterpolationPolicy policy);
-
-    // Inherited methods overridden:
-    void run(const Window &window, const ThreadInfo &info) override;
-    BorderSize border_size() const override;
-
-private:
-    /** function to perform nearest interpolation on the given window */
-    void remap_nearest(const Window &window);
-    /** function to perform bilinear interpolation on the given window */
-    void remap_bilinear(const Window &window);
-    /** Remap function to use for the particular interpolation type passed to configure() */
-    void (NERemapKernel::*_func)(const Window &window);
-
-    const ITensor *_input;  /**< Input image */
-    ITensor       *_output; /**< Output image */
-    const ITensor *_map_x;  /**< Input remap x coordinates */
-    const ITensor *_map_y;  /**< Input remap y coordinates */
-};
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_NEREMAPKERNEL_H */
diff --git a/src/core/NEON/kernels/NEScharr3x3Kernel.cpp b/src/core/NEON/kernels/NEScharr3x3Kernel.cpp
deleted file mode 100644
index 58b8caa2b6..0000000000
--- a/src/core/NEON/kernels/NEScharr3x3Kernel.cpp
+++ /dev/null
@@ -1,262 +0,0 @@
-/*
- * Copyright (c) 2016-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "src/core/NEON/kernels/NEScharr3x3Kernel.h"
-
-#include "arm_compute/core/Coordinates.h"
-#include "arm_compute/core/Error.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/ITensor.h"
-#include "arm_compute/core/Types.h"
-#include "arm_compute/core/Validate.h"
-#include "src/core/helpers/AutoConfiguration.h"
-#include "src/core/helpers/WindowHelpers.h"
-
-#include <arm_neon.h>
-#include <cstdint>
-
-using namespace arm_compute;
-
-namespace
-{
-const int16x8_t three       = vdupq_n_s16(3);
-const int16x8_t minus_three = vdupq_n_s16(-3);
-const int16x8_t ten         = vdupq_n_s16(10);
-const int16x8_t minus_ten   = vdupq_n_s16(-10);
-
-inline int16x8_t scharr_y(const int16x8x2_t &top, const int16x8x2_t &bottom)
-{
-    // Top left
-    int16x8_t out = vmulq_s16(top.val[0], minus_three);
-    // Top center
-    out = vmlaq_s16(out, vextq_s16(top.val[0], top.val[1], 1), minus_ten);
-    // Top right
-    out = vmlaq_s16(out, vextq_s16(top.val[0], top.val[1], 2), minus_three);
-
-    // Bottom left
-    out = vmlaq_s16(out, bottom.val[0], three);
-    // Bottom center
-    out = vmlaq_s16(out, vextq_s16(bottom.val[0], bottom.val[1], 1), ten);
-    // Bottom right
-    out = vmlaq_s16(out, vextq_s16(bottom.val[0], bottom.val[1], 2), three);
-
-    return out;
-}
-
-inline int16x8_t scharr_x(const int16x8x2_t &top, const int16x8x2_t &middle, const int16x8x2_t &bottom)
-{
-    // Top left
-    int16x8_t out = vmulq_s16(top.val[0], minus_three);
-    // Top right
-    out = vmlaq_s16(out, vextq_s16(top.val[0], top.val[1], 2), three);
-
-    // Middle left
-    out = vmlaq_s16(out, middle.val[0], minus_ten);
-    // Middle right
-    out = vmlaq_s16(out, vextq_s16(middle.val[0], middle.val[1], 2), ten);
-
-    // Bottom left
-    out = vmlaq_s16(out, bottom.val[0], minus_three);
-    // Bottom right
-    out = vmlaq_s16(out, vextq_s16(bottom.val[0], bottom.val[1], 2), three);
-
-    return out;
-}
-} // namespace
-
-NEScharr3x3Kernel::NEScharr3x3Kernel()
-    : _run_scharr_x(false), _run_scharr_y(false), _input(nullptr), _output_x(nullptr), _output_y(nullptr)
-{
-}
-
-void NEScharr3x3Kernel::configure(const ITensor *input, ITensor *output_x, ITensor *output_y, bool border_undefined)
-{
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
-    ARM_COMPUTE_ERROR_ON((output_x == nullptr) && (output_y == nullptr));
-
-    _run_scharr_x = output_x != nullptr;
-    _run_scharr_y = output_y != nullptr;
-
-    if(_run_scharr_x)
-    {
-        ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output_x, 1, DataType::S16);
-    }
-
-    if(_run_scharr_y)
-    {
-        ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output_y, 1, DataType::S16);
-    }
-
-    _input    = input;
-    _output_x = output_x;
-    _output_y = output_y;
-
-    // Configure kernel window
-    constexpr unsigned int num_elems_processed_per_iteration = 8;
-    constexpr unsigned int num_elems_read_per_iteration      = 16;
-    constexpr unsigned int num_elems_written_per_iteration   = 8;
-    constexpr unsigned int num_rows_read_per_iteration       = 3;
-
-    Window                 win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration), border_undefined, border_size());
-    AccessWindowHorizontal output_x_access(output_x == nullptr ? nullptr : output_x->info(), 0, num_elems_written_per_iteration);
-    AccessWindowHorizontal output_y_access(output_y == nullptr ? nullptr : output_y->info(), 0, num_elems_written_per_iteration);
-
-    update_window_and_padding(win,
-                              AccessWindowRectangle(input->info(), -border_size().left, -border_size().top, num_elems_read_per_iteration, num_rows_read_per_iteration),
-                              output_x_access,
-                              output_y_access);
-
-    output_x_access.set_valid_region(win, input->info()->valid_region(), border_undefined, border_size());
-    output_y_access.set_valid_region(win, input->info()->valid_region(), border_undefined, border_size());
-
-    INEKernel::configure(win);
-}
-
-BorderSize NEScharr3x3Kernel::border_size() const
-{
-    return BorderSize(1);
-}
-
-void NEScharr3x3Kernel::run(const Window &window, const ThreadInfo &info)
-{
-    ARM_COMPUTE_UNUSED(info);
-    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
-    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
-
-    const unsigned char *const input_top_ptr = _input->ptr_to_element(Coordinates(-1, -1));
-    const unsigned char *const input_mid_ptr = _input->ptr_to_element(Coordinates(-1, 0));
-    const unsigned char *const input_bot_ptr = _input->ptr_to_element(Coordinates(-1, +1));
-
-    Iterator input(_input, window);
-    Iterator output_y;
-    Iterator output_x;
-
-    if(_run_scharr_y)
-    {
-        output_y = Iterator(_output_y, window);
-    }
-
-    if(_run_scharr_x)
-    {
-        output_x = Iterator(_output_x, window);
-    }
-
-    if(_run_scharr_x && _run_scharr_y)
-    {
-        execute_window_loop(window, [&](const Coordinates &)
-        {
-
-            const uint8x16_t top_data = vld1q_u8(input_top_ptr + input.offset());
-            const uint8x16_t mid_data = vld1q_u8(input_mid_ptr + input.offset());
-            const uint8x16_t bot_data = vld1q_u8(input_bot_ptr + input.offset());
-
-            const int16x8x2_t top_s16 =
-            {
-                {
-                    vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(top_data))),
-                    vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(top_data)))
-                }
-            };
-            const int16x8x2_t mid_s16 =
-            {
-                {
-                    vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(mid_data))),
-                    vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(mid_data)))
-                }
-            };
-            const int16x8x2_t bot_s16 =
-            {
-                {
-                    vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(bot_data))),
-                    vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(bot_data)))
-                }
-            };
-
-            vst1q_s16(reinterpret_cast<int16_t *>(output_x.ptr()), scharr_x(top_s16, mid_s16, bot_s16));
-            vst1q_s16(reinterpret_cast<int16_t *>(output_y.ptr()), scharr_y(top_s16, bot_s16));
-        },
-        input, output_x, output_y);
-    }
-    else if(_run_scharr_x)
-    {
-        execute_window_loop(window, [&](const Coordinates &)
-        {
-
-            const uint8x16_t top_data = vld1q_u8(input_top_ptr + input.offset());
-            const uint8x16_t mid_data = vld1q_u8(input_mid_ptr + input.offset());
-            const uint8x16_t bot_data = vld1q_u8(input_bot_ptr + input.offset());
-
-            const int16x8x2_t top_s16 =
-            {
-                {
-                    vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(top_data))),
-                    vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(top_data)))
-                }
-            };
-            const int16x8x2_t mid_s16 =
-            {
-                {
-                    vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(mid_data))),
-                    vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(mid_data)))
-                }
-            };
-            const int16x8x2_t bot_s16 =
-            {
-                {
-                    vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(bot_data))),
-                    vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(bot_data)))
-                }
-            };
-
-            vst1q_s16(reinterpret_cast<int16_t *>(output_x.ptr()), scharr_x(top_s16, mid_s16, bot_s16));
-        },
-        input, output_x);
-    }
-    else if(_run_scharr_y)
-    {
-        execute_window_loop(window, [&](const Coordinates &)
-        {
-
-            const uint8x16_t top_data = vld1q_u8(input_top_ptr + input.offset());
-            const uint8x16_t bot_data = vld1q_u8(input_bot_ptr + input.offset());
-
-            const int16x8x2_t top_s16 =
-            {
-                {
-                    vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(top_data))),
-                    vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(top_data)))
-                }
-            };
-            const int16x8x2_t bot_s16 =
-            {
-                {
-                    vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(bot_data))),
-                    vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(bot_data)))
-                }
-            };
-
-            vst1q_s16(reinterpret_cast<int16_t *>(output_y.ptr()), scharr_y(top_s16, bot_s16));
-        },
-        input, output_y);
-    }
-}
diff --git a/src/core/NEON/kernels/NEScharr3x3Kernel.h b/src/core/NEON/kernels/NEScharr3x3Kernel.h
deleted file mode 100644
index 920410ebb3..0000000000
--- a/src/core/NEON/kernels/NEScharr3x3Kernel.h
+++ /dev/null
@@ -1,86 +0,0 @@
-/*
- * Copyright (c) 2016-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_NESCHARR3x3KERNEL_H
-#define ARM_COMPUTE_NESCHARR3x3KERNEL_H
-
-#include "src/core/NEON/INEKernel.h"
-
-namespace arm_compute
-{
-class ITensor;
-
-/** Interface for the kernel to run a 3x3 Scharr filter on a tensor.
- *
-* @f[
-*      \mathbf{G}_x=\begin{vmatrix}
-*      -3 & 0 & +3\\
-*      -10& 0 & +10\\
-*      -3 & 0 & +3
-*      \end{vmatrix}
-* @f]
-*/
-class NEScharr3x3Kernel : public INEKernel
-{
-public:
-    const char *name() const override
-    {
-        return "NEScharr3x3Kernel";
-    }
-    /** Default constructor */
-    NEScharr3x3Kernel();
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    NEScharr3x3Kernel(const NEScharr3x3Kernel &) = delete;
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    NEScharr3x3Kernel &operator=(const NEScharr3x3Kernel &) = delete;
-    /** Allow instances of this class to be moved */
-    NEScharr3x3Kernel(NEScharr3x3Kernel &&) = default;
-    /** Allow instances of this class to be moved */
-    NEScharr3x3Kernel &operator=(NEScharr3x3Kernel &&) = default;
-    /** Default destructor */
-    ~NEScharr3x3Kernel() = default;
-
-    /** Initialise the kernel's source, destination and border.
-     *
-     * @note At least one of output_x or output_y must be set.
-     *
-     * @param[in]  input            Source tensor. Data type supported: U8.
-     * @param[out] output_x         (Optional) Destination tensor for the X gradient. Data type supported: S16.
-     * @param[out] output_y         (Optional) Destination tensor for the Y gradient. Data type supported: S16.
-     * @param[in]  border_undefined True if the border mode is undefined. False if it's replicate or constant.
-     */
-    void configure(const ITensor *input, ITensor *output_x, ITensor *output_y, bool border_undefined);
-
-    // Inherited methods overridden:
-    void run(const Window &window, const ThreadInfo &info) override;
-    BorderSize border_size() const override;
-
-private:
-    bool           _run_scharr_x; /**< Do we need to run Scharr X ? */
-    bool           _run_scharr_y; /**< Do we need to run Scharr Y ? */
-    const ITensor *_input;        /**< Input tensor */
-    ITensor       *_output_x;     /**< Output tensor for scharr X */
-    ITensor       *_output_y;     /**< Output tensor for scharr Y */
-};
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_NESCHARR3x3KERNEL_H */
diff --git a/src/core/NEON/kernels/NESobel3x3Kernel.cpp b/src/core/NEON/kernels/NESobel3x3Kernel.cpp
deleted file mode 100644
index ecf6b59c29..0000000000
--- a/src/core/NEON/kernels/NESobel3x3Kernel.cpp
+++ /dev/null
@@ -1,272 +0,0 @@
-/*
- * Copyright (c) 2016-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "src/core/NEON/kernels/NESobel3x3Kernel.h"
-
-#include "arm_compute/core/Coordinates.h"
-#include "arm_compute/core/Error.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/ITensor.h"
-#include "arm_compute/core/Types.h"
-#include "arm_compute/core/Validate.h"
-#include "src/core/helpers/AutoConfiguration.h"
-#include "src/core/helpers/WindowHelpers.h"
-
-#include <arm_neon.h>
-#include <cstdint>
-
-using namespace arm_compute;
-
-NESobel3x3Kernel::NESobel3x3Kernel()
-    : _run_sobel_x(false), _run_sobel_y(false), _input(nullptr), _output_x(nullptr), _output_y(nullptr)
-{
-}
-
-BorderSize NESobel3x3Kernel::border_size() const
-{
-    return BorderSize{ 1 };
-}
-
-void NESobel3x3Kernel::configure(const ITensor *input, ITensor *output_x, ITensor *output_y, bool border_undefined)
-{
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
-    ARM_COMPUTE_ERROR_ON((output_x == nullptr) && (output_y == nullptr));
-
-    _run_sobel_x = output_x != nullptr;
-    _run_sobel_y = output_y != nullptr;
-
-    if(_run_sobel_x)
-    {
-        ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output_x, 1, DataType::S16);
-    }
-
-    if(_run_sobel_y)
-    {
-        ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output_y, 1, DataType::S16);
-    }
-
-    _input    = input;
-    _output_x = output_x;
-    _output_y = output_y;
-
-    // Configure kernel window
-    constexpr unsigned int num_elems_processed_per_iteration = 8;
-    constexpr unsigned int num_elems_read_per_iteration      = 16;
-    constexpr unsigned int num_elems_written_per_iteration   = 8;
-    constexpr unsigned int num_rows_read_per_iteration       = 3;
-
-    Window                 win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration), border_undefined, border_size());
-    AccessWindowHorizontal output_x_access(output_x == nullptr ? nullptr : output_x->info(), 0, num_elems_written_per_iteration);
-    AccessWindowHorizontal output_y_access(output_y == nullptr ? nullptr : output_y->info(), 0, num_elems_written_per_iteration);
-
-    update_window_and_padding(win,
-                              AccessWindowRectangle(input->info(), -border_size().left, -border_size().top, num_elems_read_per_iteration, num_rows_read_per_iteration),
-                              output_x_access,
-                              output_y_access);
-
-    output_x_access.set_valid_region(win, input->info()->valid_region(), border_undefined, border_size());
-    output_y_access.set_valid_region(win, input->info()->valid_region(), border_undefined, border_size());
-
-    INEKernel::configure(win);
-}
-
-void NESobel3x3Kernel::run(const Window &window, const ThreadInfo &info)
-{
-    ARM_COMPUTE_UNUSED(info);
-    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
-    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
-
-    const unsigned char *const input_top_ptr = _input->ptr_to_element(Coordinates(-1, -1));
-    const unsigned char *const input_mid_ptr = _input->ptr_to_element(Coordinates(-1, 0));
-    const unsigned char *const input_bot_ptr = _input->ptr_to_element(Coordinates(-1, 1));
-
-    Iterator input(_input, window);
-    Iterator output_y;
-    Iterator output_x;
-
-    if(_run_sobel_y)
-    {
-        output_y = Iterator(_output_y, window);
-    }
-
-    if(_run_sobel_x)
-    {
-        output_x = Iterator(_output_x, window);
-    }
-
-    static const int16x8_t two      = vdupq_n_s16(2);
-    static const int16x8_t minustwo = vdupq_n_s16(-2);
-
-    if(_run_sobel_y && _run_sobel_x)
-    {
-        execute_window_loop(window, [&](const Coordinates &)
-        {
-            const uint8x16_t top_data = vld1q_u8(input_top_ptr + input.offset());
-            const uint8x16_t mid_data = vld1q_u8(input_mid_ptr + input.offset());
-            const uint8x16_t bot_data = vld1q_u8(input_bot_ptr + input.offset());
-
-            const int16x8x2_t top_s16 =
-            {
-                {
-                    vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(top_data))),
-                    vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(top_data)))
-                }
-            };
-            const int16x8x2_t mid_s16 =
-            {
-                {
-                    vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(mid_data))),
-                    vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(mid_data)))
-                }
-            };
-            const int16x8x2_t bot_s16 =
-            {
-                {
-                    vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(bot_data))),
-                    vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(bot_data)))
-                }
-            };
-
-            //SOBEL Y
-            //top left
-            int16x8_t out_y = vnegq_s16(top_s16.val[0]);
-            //top mid
-            out_y = vmlaq_s16(out_y, vextq_s16(top_s16.val[0], top_s16.val[1], 1), minustwo);
-            //top right
-            out_y = vsubq_s16(out_y, vextq_s16(top_s16.val[0], top_s16.val[1], 2));
-            //bot left
-            out_y = vaddq_s16(out_y, bot_s16.val[0]);
-            //bot mid
-            out_y = vmlaq_s16(out_y, vextq_s16(bot_s16.val[0], bot_s16.val[1], 1), two);
-            //bot right
-            out_y = vaddq_s16(out_y, vextq_s16(bot_s16.val[0], bot_s16.val[1], 2));
-
-            vst1q_s16(reinterpret_cast<int16_t *>(output_y.ptr()), out_y);
-
-            //SOBEL X
-            //top left
-            int16x8_t out_x = vnegq_s16(top_s16.val[0]);
-            //top right
-            out_x = vaddq_s16(out_x, vextq_s16(top_s16.val[0], top_s16.val[1], 2));
-            //mid left
-            out_x = vmlaq_s16(out_x, mid_s16.val[0], minustwo);
-            //mid right
-            out_x = vmlaq_s16(out_x, vextq_s16(mid_s16.val[0], mid_s16.val[1], 2), two);
-            //bot left
-            out_x = vsubq_s16(out_x, bot_s16.val[0]);
-            //bot right
-            out_x = vaddq_s16(out_x, vextq_s16(bot_s16.val[0], bot_s16.val[1], 2));
-
-            vst1q_s16(reinterpret_cast<int16_t *>(output_x.ptr()), out_x);
-        },
-        input, output_x, output_y);
-    }
-    else if(_run_sobel_x)
-    {
-        execute_window_loop(window, [&](const Coordinates &)
-        {
-            const uint8x16_t top_data = vld1q_u8(input_top_ptr + input.offset());
-            const uint8x16_t mid_data = vld1q_u8(input_mid_ptr + input.offset());
-            const uint8x16_t bot_data = vld1q_u8(input_bot_ptr + input.offset());
-
-            const int16x8x2_t top_s16 =
-            {
-                {
-                    vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(top_data))),
-                    vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(top_data)))
-                }
-            };
-            const int16x8x2_t mid_s16 =
-            {
-                {
-                    vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(mid_data))),
-                    vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(mid_data)))
-                }
-            };
-            const int16x8x2_t bot_s16 =
-            {
-                {
-                    vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(bot_data))),
-                    vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(bot_data)))
-                }
-            };
-
-            //SOBEL X
-            //top left
-            int16x8_t out = vnegq_s16(top_s16.val[0]);
-            //top right
-            out = vaddq_s16(out, vextq_s16(top_s16.val[0], top_s16.val[1], 2));
-            //mid left
-            out = vmlaq_s16(out, mid_s16.val[0], minustwo);
-            //mid right
-            out = vmlaq_s16(out, vextq_s16(mid_s16.val[0], mid_s16.val[1], 2), two);
-            //bot left
-            out = vsubq_s16(out, bot_s16.val[0]);
-            //bot right
-            out = vaddq_s16(out, vextq_s16(bot_s16.val[0], bot_s16.val[1], 2));
-
-            vst1q_s16(reinterpret_cast<int16_t *>(output_x.ptr()), out);
-        },
-        input, output_x);
-    }
-    else if(_run_sobel_y)
-    {
-        execute_window_loop(window, [&](const Coordinates &)
-        {
-            const uint8x16_t top_data = vld1q_u8(input_top_ptr + input.offset());
-            const uint8x16_t bot_data = vld1q_u8(input_bot_ptr + input.offset());
-
-            const int16x8x2_t top_s16 =
-            {
-                {
-                    vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(top_data))),
-                    vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(top_data)))
-                }
-            };
-            const int16x8x2_t bot_s16 =
-            {
-                {
-                    vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(bot_data))),
-                    vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(bot_data)))
-                }
-            };
-
-            //SOBEL Y
-            //top left
-            int16x8_t out = vnegq_s16(top_s16.val[0]);
-            //top mid
-            out = vmlaq_s16(out, vextq_s16(top_s16.val[0], top_s16.val[1], 1), minustwo);
-            //top right
-            out = vsubq_s16(out, vextq_s16(top_s16.val[0], top_s16.val[1], 2));
-            //bot left
-            out = vaddq_s16(out, bot_s16.val[0]);
-            //bot mid
-            out = vmlaq_s16(out, vextq_s16(bot_s16.val[0], bot_s16.val[1], 1), two);
-            //bot right
-            out = vaddq_s16(out, vextq_s16(bot_s16.val[0], bot_s16.val[1], 2));
-
-            vst1q_s16(reinterpret_cast<int16_t *>(output_y.ptr()), out);
-        },
-        input, output_y);
-    }
-}
diff --git a/src/core/NEON/kernels/NESobel3x3Kernel.h b/src/core/NEON/kernels/NESobel3x3Kernel.h
deleted file mode 100644
index 2c3eaf5eb7..0000000000
--- a/src/core/NEON/kernels/NESobel3x3Kernel.h
+++ /dev/null
@@ -1,86 +0,0 @@
-/*
- * Copyright (c) 2016-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_NESOBEL3x3KERNEL_H
-#define ARM_COMPUTE_NESOBEL3x3KERNEL_H
-
-#include "src/core/NEON/INEKernel.h"
-
-namespace arm_compute
-{
-class ITensor;
-
-/** Interface for the kernel to run a 3x3 Sobel X filter on a tensor.
- *
- * @f[
- *      \mathbf{G}_x=\begin{vmatrix}
- *      -1 & 0 & +1\\
- *      -2 & 0 & +2\\
- *      -1 & 0 & +1
- *      \end{vmatrix}
- * @f]
-*/
-class NESobel3x3Kernel : public INEKernel
-{
-public:
-    const char *name() const override
-    {
-        return "NESobel3x3Kernel";
-    }
-    /** Default constructor */
-    NESobel3x3Kernel();
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    NESobel3x3Kernel(const NESobel3x3Kernel &) = delete;
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    NESobel3x3Kernel &operator=(const NESobel3x3Kernel &) = delete;
-    /** Allow instances of this class to be moved */
-    NESobel3x3Kernel(NESobel3x3Kernel &&) = default;
-    /** Allow instances of this class to be moved */
-    NESobel3x3Kernel &operator=(NESobel3x3Kernel &&) = default;
-    /** Default destructor */
-    ~NESobel3x3Kernel() = default;
-
-    /** Initialise the kernel's source, destination and border mode.
-     *
-     * @note At least one of output_x or output_y must be set.
-     *
-     * @param[in]  input            Source tensor. Data type supported: U8.
-     * @param[out] output_x         (Optional) Destination tensor for the X gradient. Data type supported: S16.
-     * @param[out] output_y         (Optional) Destination tensor for the Y gradient. Data type supported: S16.
-     * @param[in]  border_undefined True if the border mode is undefined. False if it's replicate or constant.
-     */
-    void configure(const ITensor *input, ITensor *output_x, ITensor *output_y, bool border_undefined);
-
-    // Inherited methods overridden:
-    void run(const Window &window, const ThreadInfo &info) override;
-    BorderSize border_size() const override;
-
-private:
-    bool           _run_sobel_x; /**< Do we need to run Sobel X ? */
-    bool           _run_sobel_y; /**< Do we need to run Sobel Y ? */
-    const ITensor *_input;       /**< Input tensor */
-    ITensor       *_output_x;    /**< Output tensor for sobel X */
-    ITensor       *_output_y;    /**< Output tensor for sobel Y */
-};
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_NESOBEL3x3KERNEL_H */
diff --git a/src/core/NEON/kernels/NESobel5x5Kernel.cpp b/src/core/NEON/kernels/NESobel5x5Kernel.cpp
deleted file mode 100644
index 5a66b1f364..0000000000
--- a/src/core/NEON/kernels/NESobel5x5Kernel.cpp
+++ /dev/null
@@ -1,406 +0,0 @@
-/*
- * Copyright (c) 2016-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "src/core/NEON/kernels/NESobel5x5Kernel.h"
-
-#include "arm_compute/core/Coordinates.h"
-#include "arm_compute/core/Error.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/ITensor.h"
-#include "arm_compute/core/Types.h"
-#include "arm_compute/core/Validate.h"
-#include "arm_compute/core/Window.h"
-#include "src/core/helpers/AutoConfiguration.h"
-#include "src/core/helpers/WindowHelpers.h"
-
-#include <arm_neon.h>
-#include <cstddef>
-#include <cstdint>
-
-using namespace arm_compute;
-
-NESobel5x5HorKernel::NESobel5x5HorKernel()
-    : _input(nullptr), _output_x(nullptr), _output_y(nullptr), _run_sobel_x(false), _run_sobel_y(false), _border_size(0)
-{
-}
-
-BorderSize NESobel5x5HorKernel::border_size() const
-{
-    return _border_size;
-}
-
-void NESobel5x5HorKernel::configure(const ITensor *input, ITensor *output_x, ITensor *output_y, bool border_undefined)
-{
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
-    ARM_COMPUTE_ERROR_ON((output_x == nullptr) && (output_y == nullptr));
-
-    _run_sobel_x = output_x != nullptr;
-    _run_sobel_y = output_y != nullptr;
-
-    if(_run_sobel_x)
-    {
-        ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output_x, 1, DataType::S16);
-    }
-
-    if(_run_sobel_y)
-    {
-        ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output_y, 1, DataType::S16);
-    }
-
-    _input       = input;
-    _output_x    = output_x;
-    _output_y    = output_y;
-    _border_size = BorderSize(border_undefined ? 0 : 2, 2);
-
-    // Configure kernel window
-    constexpr unsigned int num_elems_processed_per_iteration = 8;
-    constexpr unsigned int num_elems_read_per_iteration      = 16;
-    constexpr unsigned int num_elems_written_per_iteration   = 8;
-
-    Window                 win = calculate_max_window_horizontal(*input->info(), Steps(num_elems_processed_per_iteration), border_undefined, border_size());
-    AccessWindowHorizontal output_x_access(output_x == nullptr ? nullptr : output_x->info(), 0, num_elems_written_per_iteration);
-    AccessWindowHorizontal output_y_access(output_y == nullptr ? nullptr : output_y->info(), 0, num_elems_written_per_iteration);
-
-    update_window_and_padding(win,
-                              AccessWindowHorizontal(input->info(), -border_size().left, num_elems_read_per_iteration),
-                              output_x_access,
-                              output_y_access);
-
-    output_x_access.set_valid_region(win, input->info()->valid_region(), border_undefined, border_size());
-    output_y_access.set_valid_region(win, input->info()->valid_region(), border_undefined, border_size());
-
-    INEKernel::configure(win);
-}
-
-void NESobel5x5HorKernel::run(const Window &window, const ThreadInfo &info)
-{
-    ARM_COMPUTE_UNUSED(info);
-    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
-    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
-
-    Window win_in(window);
-    win_in.shift(Window::DimX, -2);
-
-    Iterator input(_input, win_in);
-    Iterator output_x;
-    Iterator output_y;
-
-    if(_run_sobel_x)
-    {
-        output_x = Iterator(_output_x, window);
-    }
-
-    if(_run_sobel_y)
-    {
-        output_y = Iterator(_output_y, window);
-    }
-
-    if(_run_sobel_y && _run_sobel_x)
-    {
-        static const int16x8_t six      = vdupq_n_s16(6);
-        static const int16x8_t four     = vdupq_n_s16(4);
-        static const int16x8_t two      = vdupq_n_s16(2);
-        static const int16x8_t minustwo = vdupq_n_s16(-2);
-
-        execute_window_loop(window, [&](const Coordinates &)
-        {
-            const uint8x16_t data = vld1q_u8(input.ptr());
-
-            const int16x8x2_t data_s16 =
-            {
-                {
-                    vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(data))),
-                    vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(data)))
-                }
-            };
-
-            int16x8_t out_y = data_s16.val[0];
-            out_y           = vmlaq_s16(out_y, vextq_s16(data_s16.val[0], data_s16.val[1], 1), four);
-            out_y           = vmlaq_s16(out_y, vextq_s16(data_s16.val[0], data_s16.val[1], 2), six);
-            out_y           = vmlaq_s16(out_y, vextq_s16(data_s16.val[0], data_s16.val[1], 3), four);
-            out_y           = vaddq_s16(out_y, vextq_s16(data_s16.val[0], data_s16.val[1], 4));
-
-            vst1q_s16(reinterpret_cast<int16_t *>(output_y.ptr()), out_y);
-
-            int16x8_t out_x = vnegq_s16(data_s16.val[0]);
-            out_x           = vmlaq_s16(out_x, vextq_s16(data_s16.val[0], data_s16.val[1], 1), minustwo);
-            out_x           = vmlaq_s16(out_x, vextq_s16(data_s16.val[0], data_s16.val[1], 3), two);
-            out_x           = vaddq_s16(out_x, vextq_s16(data_s16.val[0], data_s16.val[1], 4));
-
-            vst1q_s16(reinterpret_cast<int16_t *>(output_x.ptr()), out_x);
-        },
-        input, output_x, output_y);
-    }
-    else if(_run_sobel_x)
-    {
-        static const int16x8_t two      = vdupq_n_s16(2);
-        static const int16x8_t minustwo = vdupq_n_s16(-2);
-
-        execute_window_loop(window, [&](const Coordinates &)
-        {
-            const uint8x16_t data = vld1q_u8(input.ptr());
-
-            const int16x8x2_t data_s16 =
-            {
-                {
-                    vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(data))),
-                    vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(data)))
-                }
-            };
-
-            int16x8_t out = vnegq_s16(data_s16.val[0]);
-            out           = vmlaq_s16(out, vextq_s16(data_s16.val[0], data_s16.val[1], 1), minustwo);
-            out           = vmlaq_s16(out, vextq_s16(data_s16.val[0], data_s16.val[1], 3), two);
-            out           = vaddq_s16(out, vextq_s16(data_s16.val[0], data_s16.val[1], 4));
-
-            vst1q_s16(reinterpret_cast<int16_t *>(output_x.ptr()), out);
-        },
-        input, output_x);
-    }
-    else if(_run_sobel_y)
-    {
-        static const int16x8_t six  = vdupq_n_s16(6);
-        static const int16x8_t four = vdupq_n_s16(4);
-
-        execute_window_loop(window, [&](const Coordinates &)
-        {
-            const uint8x16_t data = vld1q_u8(input.ptr());
-
-            const int16x8x2_t data_s16 =
-            {
-                {
-                    vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(data))),
-                    vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(data)))
-                }
-            };
-
-            int16x8_t out = data_s16.val[0];
-            out           = vmlaq_s16(out, vextq_s16(data_s16.val[0], data_s16.val[1], 1), four);
-            out           = vmlaq_s16(out, vextq_s16(data_s16.val[0], data_s16.val[1], 2), six);
-            out           = vmlaq_s16(out, vextq_s16(data_s16.val[0], data_s16.val[1], 3), four);
-            out           = vaddq_s16(out, vextq_s16(data_s16.val[0], data_s16.val[1], 4));
-
-            vst1q_s16(reinterpret_cast<int16_t *>(output_y.ptr()), out);
-        },
-        input, output_y);
-    }
-}
-
-NESobel5x5VertKernel::NESobel5x5VertKernel()
-    : _input_x(nullptr), _input_y(nullptr), _output_x(nullptr), _output_y(nullptr), _run_sobel_x(false), _run_sobel_y(false)
-{
-}
-
-BorderSize NESobel5x5VertKernel::border_size() const
-{
-    return BorderSize{ 2, 0 };
-}
-
-void NESobel5x5VertKernel::configure(ITensor *input_x, ITensor *input_y, ITensor *output_x, ITensor *output_y, bool border_undefined)
-{
-    ARM_COMPUTE_ERROR_ON((output_x == nullptr) && (output_y == nullptr));
-
-    _run_sobel_x = output_x != nullptr;
-    _run_sobel_y = output_y != nullptr;
-
-    if(_run_sobel_x)
-    {
-        ARM_COMPUTE_ERROR_ON_FORMAT_NOT_IN(input_x, Format::S16);
-        ARM_COMPUTE_ERROR_ON_FORMAT_NOT_IN(output_x, Format::S16);
-    }
-
-    if(_run_sobel_y)
-    {
-        ARM_COMPUTE_ERROR_ON_FORMAT_NOT_IN(input_y, Format::S16);
-        ARM_COMPUTE_ERROR_ON_FORMAT_NOT_IN(output_y, Format::S16);
-    }
-
-    _input_x  = input_x;
-    _input_y  = input_y;
-    _output_x = output_x;
-    _output_y = output_y;
-
-    const ITensor *const input = _run_sobel_x ? input_x : input_y;
-
-    // Configure kernel window
-    constexpr unsigned int num_elems_processed_per_iteration = 16;
-    constexpr unsigned int num_elems_read_per_iteration      = 16;
-    constexpr unsigned int num_elems_written_per_iteration   = 16;
-    constexpr unsigned int num_rows_read_per_iteration       = 5;
-
-    Window                 win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration), border_undefined, border_size());
-    AccessWindowHorizontal output_x_access(output_x == nullptr ? nullptr : output_x->info(), 0, num_elems_written_per_iteration);
-    AccessWindowHorizontal output_y_access(output_y == nullptr ? nullptr : output_y->info(), 0, num_elems_written_per_iteration);
-
-    update_window_and_padding(win,
-                              AccessWindowRectangle(input_x == nullptr ? nullptr : input_x->info(), 0, -border_size().top, num_elems_read_per_iteration, num_rows_read_per_iteration),
-                              AccessWindowRectangle(input_y == nullptr ? nullptr : input_y->info(), 0, -border_size().top, num_elems_read_per_iteration, num_rows_read_per_iteration),
-                              output_x_access,
-                              output_y_access);
-
-    output_x_access.set_valid_region(win, input->info()->valid_region(), border_undefined, border_size());
-    output_y_access.set_valid_region(win, input->info()->valid_region(), border_undefined, border_size());
-
-    INEKernel::configure(win);
-}
-
-void NESobel5x5VertKernel::run(const Window &window, const ThreadInfo &info)
-{
-    ARM_COMPUTE_UNUSED(info);
-    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
-    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
-
-    Iterator input_x;
-    Iterator input_y;
-    Iterator output_x;
-    Iterator output_y;
-
-    const int16_t *input_x_low2_ptr = nullptr;
-    const int16_t *input_x_low_ptr  = nullptr;
-    const int16_t *input_x_mid_ptr  = nullptr;
-    const int16_t *input_x_top_ptr  = nullptr;
-    const int16_t *input_x_top2_ptr = nullptr;
-
-    const int16_t *input_y_low2_ptr = nullptr;
-    const int16_t *input_y_low_ptr  = nullptr;
-    const int16_t *input_y_top_ptr  = nullptr;
-    const int16_t *input_y_top2_ptr = nullptr;
-
-    if(_run_sobel_x)
-    {
-        input_x          = Iterator(_input_x, window);
-        output_x         = Iterator(_output_x, window);
-        input_x_top2_ptr = reinterpret_cast<const int16_t *>(_input_x->ptr_to_element(Coordinates(0, -2)));
-        input_x_top_ptr  = reinterpret_cast<const int16_t *>(_input_x->ptr_to_element(Coordinates(0, -1)));
-        input_x_mid_ptr  = reinterpret_cast<const int16_t *>(_input_x->ptr_to_element(Coordinates(0, 0)));
-        input_x_low_ptr  = reinterpret_cast<const int16_t *>(_input_x->ptr_to_element(Coordinates(0, 1)));
-        input_x_low2_ptr = reinterpret_cast<const int16_t *>(_input_x->ptr_to_element(Coordinates(0, 2)));
-    }
-
-    if(_run_sobel_y)
-    {
-        input_y          = Iterator(_input_y, window);
-        output_y         = Iterator(_output_y, window);
-        input_y_top2_ptr = reinterpret_cast<const int16_t *>(_input_y->ptr_to_element(Coordinates(0, -2)));
-        input_y_top_ptr  = reinterpret_cast<const int16_t *>(_input_y->ptr_to_element(Coordinates(0, -1)));
-        input_y_low_ptr  = reinterpret_cast<const int16_t *>(_input_y->ptr_to_element(Coordinates(0, 1)));
-        input_y_low2_ptr = reinterpret_cast<const int16_t *>(_input_y->ptr_to_element(Coordinates(0, 2)));
-    }
-
-    static const int16x8_t six      = vdupq_n_s16(6);
-    static const int16x8_t four     = vdupq_n_s16(4);
-    static const int16x8_t two      = vdupq_n_s16(2);
-    static const int16x8_t minustwo = vdupq_n_s16(-2);
-
-    if(_run_sobel_x)
-    {
-        execute_window_loop(window, [&](const Coordinates &)
-        {
-            // Convert offset from uint8_t* to uint16_t*
-            const size_t input_offset_high_s16 = input_x.offset() / 2;
-            const size_t input_offset_low_s16  = input_offset_high_s16 + 8;
-
-            //HIGH DATA
-            //top2
-            int16x8_t data_high = vld1q_s16(input_x_top2_ptr + input_offset_high_s16);
-            int16x8_t out_high  = data_high;
-            //top
-            data_high = vld1q_s16(input_x_top_ptr + input_offset_high_s16);
-            out_high  = vmlaq_s16(out_high, data_high, four);
-            //mid
-            data_high = vld1q_s16(input_x_mid_ptr + input_offset_high_s16);
-            out_high  = vmlaq_s16(out_high, data_high, six);
-            //low
-            data_high = vld1q_s16(input_x_low_ptr + input_offset_high_s16);
-            out_high  = vmlaq_s16(out_high, data_high, four);
-            //low2
-            data_high = vld1q_s16(input_x_low2_ptr + input_offset_high_s16);
-            out_high  = vaddq_s16(out_high, data_high);
-
-            vst1q_s16((reinterpret_cast<int16_t *>(output_x.ptr())), out_high);
-
-            //LOW DATA
-            //top2
-            int16x8_t data_low = vld1q_s16(input_x_top2_ptr + input_offset_low_s16);
-            int16x8_t out_low  = data_low;
-            //top
-            data_low = vld1q_s16(input_x_top_ptr + input_offset_low_s16);
-            out_low  = vmlaq_s16(out_low, data_low, four);
-            //mid
-            data_low = vld1q_s16(input_x_mid_ptr + input_offset_low_s16);
-            out_low  = vmlaq_s16(out_low, data_low, six);
-            //low
-            data_low = vld1q_s16(input_x_low_ptr + input_offset_low_s16);
-            out_low  = vmlaq_s16(out_low, data_low, four);
-            //low2
-            data_low = vld1q_s16(input_x_low2_ptr + input_offset_low_s16);
-            out_low  = vaddq_s16(out_low, data_low);
-
-            vst1q_s16((reinterpret_cast<int16_t *>(output_x.ptr())) + 8, out_low);
-        },
-        input_x, output_x);
-    }
-
-    if(_run_sobel_y)
-    {
-        execute_window_loop(window, [&](const Coordinates &)
-        {
-            // Convert offset from uint8_t* to uint16_t*
-            const size_t input_offset_high_s16 = input_y.offset() / 2;
-            const size_t input_offset_low_s16  = input_offset_high_s16 + 8;
-
-            //HIGH DATA
-            //top2
-            int16x8_t data_high = vld1q_s16(input_y_top2_ptr + input_offset_high_s16);
-            int16x8_t out_high  = vnegq_s16(data_high);
-            //top
-            data_high = vld1q_s16(input_y_top_ptr + input_offset_high_s16);
-            out_high  = vmlaq_s16(out_high, data_high, minustwo);
-            //low
-            data_high = vld1q_s16(input_y_low_ptr + input_offset_high_s16);
-            out_high  = vmlaq_s16(out_high, data_high, two);
-            //low2
-            data_high = vld1q_s16(input_y_low2_ptr + input_offset_high_s16);
-            out_high  = vaddq_s16(out_high, data_high);
-
-            vst1q_s16((reinterpret_cast<int16_t *>(output_y.ptr())), out_high);
-
-            //LOW DATA
-            //top2
-            int16x8_t data_low = vld1q_s16(input_y_top2_ptr + input_offset_low_s16);
-            int16x8_t out_low  = vnegq_s16(data_low);
-            //top
-            data_low = vld1q_s16(input_y_top_ptr + input_offset_low_s16);
-            out_low  = vmlaq_s16(out_low, data_low, minustwo);
-            //low
-            data_low = vld1q_s16(input_y_low_ptr + input_offset_low_s16);
-            out_low  = vmlaq_s16(out_low, data_low, two);
-            //low2
-            data_low = vld1q_s16(input_y_low2_ptr + input_offset_low_s16);
-            out_low  = vaddq_s16(out_low, data_low);
-
-            vst1q_s16((reinterpret_cast<int16_t *>(output_y.ptr())) + 8, out_low);
-        },
-        input_y, output_y);
-    }
-}
diff --git a/src/core/NEON/kernels/NESobel5x5Kernel.h b/src/core/NEON/kernels/NESobel5x5Kernel.h
deleted file mode 100644
index bd5eb29296..0000000000
--- a/src/core/NEON/kernels/NESobel5x5Kernel.h
+++ /dev/null
@@ -1,126 +0,0 @@
-/*
- * Copyright (c) 2016-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_NESOBEL5x5KERNEL_H
-#define ARM_COMPUTE_NESOBEL5x5KERNEL_H
-
-#include "src/core/NEON/INEKernel.h"
-
-namespace arm_compute
-{
-class ITensor;
-
-/** Interface for the kernel to run the horizontal pass of 5x5 Sobel filter on a tensor.
- *
- */
-class NESobel5x5HorKernel : public INEKernel
-{
-public:
-    const char *name() const override
-    {
-        return "NESobel5x5HorKernel";
-    }
-    /** Default constructor */
-    NESobel5x5HorKernel();
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    NESobel5x5HorKernel(const NESobel5x5HorKernel &) = delete;
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    NESobel5x5HorKernel &operator=(const NESobel5x5HorKernel &) = delete;
-    /** Allow instances of this class to be moved */
-    NESobel5x5HorKernel(NESobel5x5HorKernel &&) = default;
-    /** Allow instances of this class to be moved */
-    NESobel5x5HorKernel &operator=(NESobel5x5HorKernel &&) = default;
-    /** Default destructor */
-    ~NESobel5x5HorKernel() = default;
-
-    /** Initialise the kernel's source, destination and border mode.
-     *
-     * @note At least one of output_x or output_y must be set
-     *
-     * @param[in]  input            Source tensor. Data type supported: U8.
-     * @param[out] output_x         (Optional) Destination tensor for the X gradient. Data type supported: S16.
-     * @param[out] output_y         (Optional) Destination tensor for the Y gradient. Data type supported: S16.
-     * @param[in]  border_undefined True if the border mode is undefined. False if it's replicate or constant.
-     */
-    void configure(const ITensor *input, ITensor *output_x, ITensor *output_y, bool border_undefined);
-
-    // Inherited methods overridden:
-    void run(const Window &window, const ThreadInfo &info) override;
-    BorderSize border_size() const override;
-
-private:
-    const ITensor *_input;       /**< Input tensor */
-    ITensor       *_output_x;    /**< X output of horizontal pass */
-    ITensor       *_output_y;    /**< Y output of horizontal pass */
-    bool           _run_sobel_x; /**< Do we need to run Sobel X? */
-    bool           _run_sobel_y; /**< Do we need to run Sobel Y? */
-    BorderSize     _border_size; /**< Border size */
-};
-
-/** Interface for the kernel to run the vertical pass of 5x5 Sobel Y filter on a tensor.
- *
-*/
-class NESobel5x5VertKernel : public INEKernel
-{
-public:
-    const char *name() const override
-    {
-        return "NESobel5x5VertKernel";
-    }
-    /** Default constructor */
-    NESobel5x5VertKernel();
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    NESobel5x5VertKernel(const NESobel5x5VertKernel &) = delete;
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    NESobel5x5VertKernel &operator=(const NESobel5x5VertKernel &) = delete;
-    /** Allow instances of this class to be moved */
-    NESobel5x5VertKernel(NESobel5x5VertKernel &&) = default;
-    /** Allow instances of this class to be moved */
-    NESobel5x5VertKernel &operator=(NESobel5x5VertKernel &&) = default;
-    /** Default destructor */
-    ~NESobel5x5VertKernel() = default;
-
-    /** Initialise the kernel's source, destination and border mode.
-     *
-     * @param[in]  input_x          Input for X (X output of hor pass). Data type supported: S16.
-     * @param[in]  input_y          Input for Y (Y output of hor pass). Data type supported: S16.
-     * @param[out] output_x         Destination tensor for the X gradient. Data type supported: S16.
-     * @param[out] output_y         Destination tensor for the Y gradient. Data type supported: S16.
-     * @param[in]  border_undefined True if the border mode is undefined. False if it's replicate or constant.
-     */
-    void configure(ITensor *input_x, ITensor *input_y, ITensor *output_x, ITensor *output_y, bool border_undefined);
-
-    // Inherited methods overridden:
-    void run(const Window &window, const ThreadInfo &info) override;
-    BorderSize border_size() const override;
-
-private:
-    ITensor *_input_x;     /**< X input (X output of the hor pass) */
-    ITensor *_input_y;     /**< Y input (Y output of the hor pass) */
-    ITensor *_output_x;    /**< X output of sobel */
-    ITensor *_output_y;    /**< Y output of sobel */
-    bool     _run_sobel_x; /**< Do we need to run sobel X? */
-    bool     _run_sobel_y; /**< Do we need to run sobel Y? */
-};
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_NESOBEL5x5KERNEL_H */
diff --git a/src/core/NEON/kernels/NESobel7x7Kernel.cpp b/src/core/NEON/kernels/NESobel7x7Kernel.cpp
deleted file mode 100644
index 835b333a10..0000000000
--- a/src/core/NEON/kernels/NESobel7x7Kernel.cpp
+++ /dev/null
@@ -1,524 +0,0 @@
-/*
- * Copyright (c) 2016-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "src/core/NEON/kernels/NESobel7x7Kernel.h"
-
-#include "arm_compute/core/Error.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/ITensor.h"
-#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Types.h"
-#include "arm_compute/core/Utils.h"
-#include "arm_compute/core/Validate.h"
-#include "src/core/helpers/AutoConfiguration.h"
-#include "src/core/helpers/WindowHelpers.h"
-
-#include <arm_neon.h>
-#include <cstdint>
-
-using namespace arm_compute;
-
-namespace arm_compute
-{
-class Coordinates;
-} // namespace arm_compute
-
-namespace
-{
-const int32x4_t minusfour = vdupq_n_s32(-4);
-const int32x4_t minusfive = vdupq_n_s32(-5);
-const int32x4_t four      = vdupq_n_s32(4);
-const int32x4_t five      = vdupq_n_s32(5);
-const int32x4_t six       = vdupq_n_s32(6);
-const int32x4_t fifteen   = vdupq_n_s32(15);
-const int32x4_t twenty    = vdupq_n_s32(20);
-
-inline int32x4x2_t compute_hor_sobel_x(const int32x4x4_t &data)
-{
-    int32x4x2_t out =
-    {
-        {
-            vnegq_s32(data.val[0]),
-            vnegq_s32(data.val[1])
-        }
-    };
-
-    out.val[0] = vmlaq_s32(out.val[0],
-                           vextq_s32(data.val[0], data.val[1], 1), minusfour);
-
-    out.val[0] = vmlaq_s32(out.val[0],
-                           vextq_s32(data.val[0], data.val[1], 2), minusfive);
-
-    out.val[0] = vmlaq_s32(out.val[0], data.val[1], five);
-
-    out.val[0] = vmlaq_s32(out.val[0],
-                           vextq_s32(data.val[1], data.val[2], 1), four);
-
-    out.val[0] = vaddq_s32(out.val[0],
-                           vextq_s32(data.val[1], data.val[2], 2));
-
-    out.val[1] = vmlaq_s32(out.val[1],
-                           vextq_s32(data.val[1], data.val[2], 1), minusfour);
-
-    out.val[1] = vmlaq_s32(out.val[1],
-                           vextq_s32(data.val[1], data.val[2], 2), minusfive);
-
-    out.val[1] = vmlaq_s32(out.val[1], data.val[2], five);
-
-    out.val[1] = vmlaq_s32(out.val[1],
-                           vextq_s32(data.val[2], data.val[3], 1), four);
-
-    out.val[1] = vaddq_s32(out.val[1],
-                           vextq_s32(data.val[2], data.val[3], 2));
-
-    return out;
-}
-
-inline int32x4x2_t compute_hor_sobel_y(const int32x4x4_t &data)
-{
-    int32x4x2_t out =
-    {
-        {
-            data.val[0],
-            data.val[1]
-        }
-    };
-
-    out.val[0] = vmlaq_s32(out.val[0],
-                           vextq_s32(data.val[0], data.val[1], 1), six);
-
-    out.val[0] = vmlaq_s32(out.val[0],
-                           vextq_s32(data.val[0], data.val[1], 2), fifteen);
-
-    out.val[0] = vmlaq_s32(out.val[0],
-                           vextq_s32(data.val[0], data.val[1], 3), twenty);
-
-    out.val[0] = vmlaq_s32(out.val[0], data.val[1], fifteen);
-
-    out.val[0] = vmlaq_s32(out.val[0],
-                           vextq_s32(data.val[1], data.val[2], 1), six);
-
-    out.val[0] = vaddq_s32(out.val[0],
-                           vextq_s32(data.val[1], data.val[2], 2));
-
-    out.val[1] = vmlaq_s32(out.val[1],
-                           vextq_s32(data.val[1], data.val[2], 1), six);
-
-    out.val[1] = vmlaq_s32(out.val[1],
-                           vextq_s32(data.val[1], data.val[2], 2), fifteen);
-
-    out.val[1] = vmlaq_s32(out.val[1],
-                           vextq_s32(data.val[1], data.val[2], 3), twenty);
-
-    out.val[1] = vmlaq_s32(out.val[1], data.val[2], fifteen);
-
-    out.val[1] = vmlaq_s32(out.val[1],
-                           vextq_s32(data.val[2], data.val[3], 1), six);
-
-    out.val[1] = vaddq_s32(out.val[1],
-                           vextq_s32(data.val[2], data.val[3], 2));
-
-    return out;
-}
-} // namespace
-
-NESobel7x7HorKernel::NESobel7x7HorKernel()
-    : _input(nullptr), _output_x(nullptr), _output_y(nullptr), _run_sobel_x(false), _run_sobel_y(false), _border_size(0)
-{
-}
-
-BorderSize NESobel7x7HorKernel::border_size() const
-{
-    return _border_size;
-}
-
-void NESobel7x7HorKernel::configure(const ITensor *input, ITensor *output_x, ITensor *output_y, bool border_undefined)
-{
-    ARM_COMPUTE_ERROR_ON_FORMAT_NOT_IN(input, Format::U8);
-    ARM_COMPUTE_ERROR_ON((output_x == nullptr) && (output_y == nullptr));
-
-    _run_sobel_x = output_x != nullptr;
-    _run_sobel_y = output_y != nullptr;
-
-    if(_run_sobel_x)
-    {
-        ARM_COMPUTE_ERROR_ON_FORMAT_NOT_IN(output_x, Format::S32);
-    }
-
-    if(_run_sobel_y)
-    {
-        ARM_COMPUTE_ERROR_ON_FORMAT_NOT_IN(output_y, Format::S32);
-    }
-
-    _input       = input;
-    _output_x    = output_x;
-    _output_y    = output_y;
-    _border_size = BorderSize(border_undefined ? 0 : 3, 3);
-
-    // Configure kernel window
-    constexpr unsigned int num_elems_processed_per_iteration = 8;
-    constexpr unsigned int num_elems_read_per_iteration      = 16;
-    constexpr unsigned int num_elems_written_per_iteration   = 8;
-
-    Window                 win = calculate_max_window_horizontal(*input->info(), Steps(num_elems_processed_per_iteration), border_undefined, border_size());
-    AccessWindowHorizontal output_x_access(output_x == nullptr ? nullptr : output_x->info(), 0, num_elems_written_per_iteration);
-    AccessWindowHorizontal output_y_access(output_y == nullptr ? nullptr : output_y->info(), 0, num_elems_written_per_iteration);
-
-    update_window_and_padding(win,
-                              AccessWindowHorizontal(input->info(), -border_size().left, num_elems_read_per_iteration),
-                              output_x_access,
-                              output_y_access);
-
-    output_x_access.set_valid_region(win, input->info()->valid_region(), border_undefined, border_size());
-    output_y_access.set_valid_region(win, input->info()->valid_region(), border_undefined, border_size());
-
-    INEKernel::configure(win);
-}
-
-void NESobel7x7HorKernel::run(const Window &window, const ThreadInfo &info)
-{
-    ARM_COMPUTE_UNUSED(info);
-    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
-    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
-
-    Iterator input(_input, window);
-    Iterator output_x;
-    Iterator output_y;
-
-    if(_run_sobel_x)
-    {
-        output_x = Iterator(_output_x, window);
-    }
-
-    if(_run_sobel_y)
-    {
-        output_y = Iterator(_output_y, window);
-    }
-
-    if(_run_sobel_y && _run_sobel_x)
-    {
-        execute_window_loop(window, [&](const Coordinates &)
-        {
-            const uint8x16_t data = vld1q_u8(input.ptr() - 3);
-
-            const uint16x8_t tmp_low_u16  = vmovl_u8(vget_low_u8(data));
-            const uint16x8_t tmp_high_u16 = vmovl_u8(vget_high_u8(data));
-
-            const int32x4x4_t data_s32 =
-            {
-                {
-                    vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(tmp_low_u16))),
-                    vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(tmp_low_u16))),
-                    vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(tmp_high_u16))),
-                    vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(tmp_high_u16)))
-                }
-            };
-
-            const int32x4x2_t out_y = compute_hor_sobel_y(data_s32);
-            vst1q_s32(reinterpret_cast<int32_t *>(output_y.ptr()), out_y.val[0]);
-            vst1q_s32(reinterpret_cast<int32_t *>(output_y.ptr()) + 4, out_y.val[1]);
-
-            const int32x4x2_t out_x = compute_hor_sobel_x(data_s32);
-            vst1q_s32(reinterpret_cast<int32_t *>(output_x.ptr()), out_x.val[0]);
-            vst1q_s32(reinterpret_cast<int32_t *>(output_x.ptr()) + 4, out_x.val[1]);
-        },
-        input, output_x, output_y);
-    }
-    else if(_run_sobel_x)
-    {
-        execute_window_loop(window, [&](const Coordinates &)
-        {
-            const uint8x16_t data = vld1q_u8(input.ptr() - 3);
-
-            const uint16x8_t tmp_low_u16  = vmovl_u8(vget_low_u8(data));
-            const uint16x8_t tmp_high_u16 = vmovl_u8(vget_high_u8(data));
-
-            const int32x4x4_t data_s32 =
-            {
-                {
-                    vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(tmp_low_u16))),
-                    vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(tmp_low_u16))),
-                    vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(tmp_high_u16))),
-                    vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(tmp_high_u16)))
-                }
-            };
-
-            const int32x4x2_t out = compute_hor_sobel_x(data_s32);
-            vst1q_s32(reinterpret_cast<int32_t *>(output_x.ptr()), out.val[0]);
-            vst1q_s32(reinterpret_cast<int32_t *>(output_x.ptr()) + 4, out.val[1]);
-        },
-        input, output_x);
-    }
-    else if(_run_sobel_y)
-    {
-        execute_window_loop(window, [&](const Coordinates &)
-        {
-            const uint8x16_t data = vld1q_u8(input.ptr() - 3);
-
-            const uint16x8_t tmp_low_u16  = vmovl_u8(vget_low_u8(data));
-            const uint16x8_t tmp_high_u16 = vmovl_u8(vget_high_u8(data));
-
-            const int32x4x4_t data_s32 =
-            {
-                {
-                    vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(tmp_low_u16))),
-                    vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(tmp_low_u16))),
-                    vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(tmp_high_u16))),
-                    vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(tmp_high_u16)))
-                }
-            };
-
-            const int32x4x2_t out = compute_hor_sobel_y(data_s32);
-            vst1q_s32(reinterpret_cast<int32_t *>(output_y.ptr()), out.val[0]);
-            vst1q_s32(reinterpret_cast<int32_t *>(output_y.ptr()) + 4, out.val[1]);
-        },
-        input, output_y);
-    }
-}
-
-NESobel7x7VertKernel::NESobel7x7VertKernel()
-    : _input_x(nullptr), _input_y(nullptr), _output_x(nullptr), _output_y(nullptr), _run_sobel_x(false), _run_sobel_y(false)
-{
-}
-
-BorderSize NESobel7x7VertKernel::border_size() const
-{
-    return BorderSize{ 3, 0 };
-}
-
-void NESobel7x7VertKernel::configure(const ITensor *input_x, const ITensor *input_y, ITensor *output_x, ITensor *output_y, bool border_undefined)
-{
-    ARM_COMPUTE_ERROR_ON((output_x == nullptr) && (output_y == nullptr));
-
-    _run_sobel_x = (output_x != nullptr);
-    _run_sobel_y = (output_y != nullptr);
-
-    if(_run_sobel_x)
-    {
-        ARM_COMPUTE_ERROR_ON_FORMAT_NOT_IN(input_x, Format::S32);
-        ARM_COMPUTE_ERROR_ON_FORMAT_NOT_IN(output_x, Format::S32);
-    }
-
-    if(_run_sobel_y)
-    {
-        ARM_COMPUTE_ERROR_ON_FORMAT_NOT_IN(input_y, Format::S32);
-        ARM_COMPUTE_ERROR_ON_FORMAT_NOT_IN(output_y, Format::S32);
-    }
-
-    _input_x  = input_x;
-    _input_y  = input_y;
-    _output_x = output_x;
-    _output_y = output_y;
-
-    const ITensor *const input = _run_sobel_x ? input_x : input_y;
-
-    // Configure kernel window
-    constexpr unsigned int num_elems_processed_per_iteration = 8;
-    constexpr unsigned int num_elems_read_per_iteration      = 8;
-    constexpr unsigned int num_elems_written_per_iteration   = 8;
-    constexpr unsigned int num_rows_read_per_iteration       = 7;
-
-    Window                 win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration), border_undefined, border_size());
-    AccessWindowHorizontal output_x_access(output_x == nullptr ? nullptr : output_x->info(), 0, num_elems_written_per_iteration);
-    AccessWindowHorizontal output_y_access(output_y == nullptr ? nullptr : output_y->info(), 0, num_elems_written_per_iteration);
-
-    update_window_and_padding(win,
-                              AccessWindowRectangle(input_x == nullptr ? nullptr : input_x->info(), 0, -border_size().top, num_elems_read_per_iteration, num_rows_read_per_iteration),
-                              AccessWindowRectangle(input_y == nullptr ? nullptr : input_y->info(), 0, -border_size().top, num_elems_read_per_iteration, num_rows_read_per_iteration),
-                              output_x_access,
-                              output_y_access);
-
-    output_x_access.set_valid_region(win, input->info()->valid_region(), border_undefined, border_size());
-    output_y_access.set_valid_region(win, input->info()->valid_region(), border_undefined, border_size());
-
-    INEKernel::configure(win);
-}
-
-void NESobel7x7VertKernel::run(const Window &window, const ThreadInfo &info)
-{
-    ARM_COMPUTE_UNUSED(info);
-    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
-    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
-
-    Iterator input_x;
-    Iterator input_y;
-    Iterator output_x;
-    Iterator output_y;
-
-    int32_t in_x_stride = 0;
-    int32_t in_y_stride = 0;
-
-    if(_run_sobel_x)
-    {
-        input_x     = Iterator(_input_x, window);
-        output_x    = Iterator(_output_x, window);
-        in_x_stride = _input_x->info()->strides_in_bytes()[1] / pixel_size_from_format(_input_x->info()->format());
-    }
-
-    if(_run_sobel_y)
-    {
-        input_y     = Iterator(_input_y, window);
-        output_y    = Iterator(_output_y, window);
-        in_y_stride = _input_y->info()->strides_in_bytes()[1] / pixel_size_from_format(_input_y->info()->format());
-    }
-
-    if(_run_sobel_x)
-    {
-        execute_window_loop(window, [&](const Coordinates &)
-        {
-            auto in_ptr = reinterpret_cast<int32_t *>(input_x.ptr()) - 3 * in_x_stride;
-
-            //top3
-            int32x4x2_t data =
-            {
-                {
-                    vld1q_s32(in_ptr),
-                    vld1q_s32(in_ptr + 4)
-                }
-            };
-
-            int32x4x2_t out = data;
-
-            //top2
-            in_ptr += in_x_stride;
-            data.val[0] = vld1q_s32(in_ptr);
-            out.val[0]  = vmlaq_s32(out.val[0], data.val[0], six);
-
-            data.val[1] = vld1q_s32(in_ptr + 4);
-            out.val[1]  = vmlaq_s32(out.val[1], data.val[1], six);
-
-            //top
-            in_ptr += in_x_stride;
-            data.val[0] = vld1q_s32(in_ptr);
-            out.val[0]  = vmlaq_s32(out.val[0], data.val[0], fifteen);
-
-            data.val[1] = vld1q_s32(in_ptr + 4);
-            out.val[1]  = vmlaq_s32(out.val[1], data.val[1], fifteen);
-
-            //mid
-            in_ptr += in_x_stride;
-            data.val[0] = vld1q_s32(in_ptr);
-            out.val[0]  = vmlaq_s32(out.val[0], data.val[0], twenty);
-
-            data.val[1] = vld1q_s32(in_ptr + 4);
-            out.val[1]  = vmlaq_s32(out.val[1], data.val[1], twenty);
-
-            //low
-            in_ptr += in_x_stride;
-            data.val[0] = vld1q_s32(in_ptr);
-            out.val[0]  = vmlaq_s32(out.val[0], data.val[0], fifteen);
-
-            data.val[1] = vld1q_s32(in_ptr + 4);
-            out.val[1]  = vmlaq_s32(out.val[1], data.val[1], fifteen);
-
-            //low2
-            in_ptr += in_x_stride;
-            data.val[0] = vld1q_s32(in_ptr);
-            out.val[0]  = vmlaq_s32(out.val[0], data.val[0], six);
-
-            data.val[1] = vld1q_s32(in_ptr + 4);
-            out.val[1]  = vmlaq_s32(out.val[1], data.val[1], six);
-
-            //low3
-            in_ptr += in_x_stride;
-            data.val[0] = vld1q_s32(in_ptr);
-            out.val[0]  = vaddq_s32(out.val[0], data.val[0]);
-
-            data.val[1] = vld1q_s32(in_ptr + 4);
-            out.val[1]  = vaddq_s32(out.val[1], data.val[1]);
-
-            vst1q_s32(reinterpret_cast<int32_t *>(output_x.ptr()) + 0, out.val[0]);
-            vst1q_s32(reinterpret_cast<int32_t *>(output_x.ptr()) + 4, out.val[1]);
-        },
-        input_x, output_x);
-    }
-
-    if(_run_sobel_y)
-    {
-        execute_window_loop(window, [&](const Coordinates &)
-        {
-            auto in_ptr = reinterpret_cast<int32_t *>(input_y.ptr()) - 3 * in_y_stride;
-
-            //top3
-            int32x4x2_t data =
-            {
-                {
-                    vld1q_s32(in_ptr),
-                    vld1q_s32(in_ptr + 4)
-                }
-            };
-
-            int32x4x2_t out =
-            {
-                {
-                    vnegq_s32(data.val[0]),
-                    vnegq_s32(data.val[1])
-                }
-            };
-
-            //top2
-            in_ptr += in_y_stride;
-            data.val[0] = vld1q_s32(in_ptr);
-            out.val[0]  = vmlaq_s32(out.val[0], data.val[0], minusfour);
-
-            data.val[1] = vld1q_s32(in_ptr + 4);
-            out.val[1]  = vmlaq_s32(out.val[1], data.val[1], minusfour);
-
-            //top
-            in_ptr += in_y_stride;
-            data.val[0] = vld1q_s32(in_ptr);
-            out.val[0]  = vmlaq_s32(out.val[0], data.val[0], minusfive);
-
-            data.val[1] = vld1q_s32(in_ptr + 4);
-            out.val[1]  = vmlaq_s32(out.val[1], data.val[1], minusfive);
-
-            //low
-            in_ptr += (2 * in_y_stride);
-            data.val[0] = vld1q_s32(in_ptr);
-            out.val[0]  = vmlaq_s32(out.val[0], data.val[0], five);
-
-            data.val[1] = vld1q_s32(in_ptr + 4);
-            out.val[1]  = vmlaq_s32(out.val[1], data.val[1], five);
-
-            //low2
-            in_ptr += in_y_stride;
-            data.val[0] = vld1q_s32(in_ptr);
-            out.val[0]  = vmlaq_s32(out.val[0], data.val[0], four);
-
-            data.val[1] = vld1q_s32(in_ptr + 4);
-            out.val[1]  = vmlaq_s32(out.val[1], data.val[1], four);
-
-            //low3
-            in_ptr += in_y_stride;
-            data.val[0] = vld1q_s32(in_ptr);
-            out.val[0]  = vaddq_s32(out.val[0], data.val[0]);
-
-            data.val[1] = vld1q_s32(in_ptr + 4);
-            out.val[1]  = vaddq_s32(out.val[1], data.val[1]);
-
-            vst1q_s32(reinterpret_cast<int32_t *>(output_y.ptr()) + 0, out.val[0]);
-            vst1q_s32(reinterpret_cast<int32_t *>(output_y.ptr()) + 4, out.val[1]);
-        },
-        input_y, output_y);
-    }
-}
diff --git a/src/core/NEON/kernels/NESobel7x7Kernel.h b/src/core/NEON/kernels/NESobel7x7Kernel.h
deleted file mode 100644
index c5a3899bab..0000000000
--- a/src/core/NEON/kernels/NESobel7x7Kernel.h
+++ /dev/null
@@ -1,130 +0,0 @@
-/*
- * Copyright (c) 2016-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_NESOBEL7x7KERNEL_H
-#define ARM_COMPUTE_NESOBEL7x7KERNEL_H
-
-#include "src/core/NEON/INEKernel.h"
-
-namespace arm_compute
-{
-class ITensor;
-
-/** Interface for the kernel to run the horizontal pass of 7x7 Sobel filter on a tensor.
- *
- */
-class NESobel7x7HorKernel : public INEKernel
-{
-public:
-    const char *name() const override
-    {
-        return "NESobel7x7HorKernel";
-    }
-    /** Default constructor */
-    NESobel7x7HorKernel();
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    NESobel7x7HorKernel(const NESobel7x7HorKernel &) = delete;
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    NESobel7x7HorKernel &operator=(const NESobel7x7HorKernel &) = delete;
-    /** Allow instances of this class to be moved */
-    NESobel7x7HorKernel(NESobel7x7HorKernel &&) = default;
-    /** Allow instances of this class to be moved */
-    NESobel7x7HorKernel &operator=(NESobel7x7HorKernel &&) = default;
-    /** Default destructor */
-    ~NESobel7x7HorKernel() = default;
-
-    /** Initialise the kernel's source, destination and border mode.
-     *
-     * @note At least one of output_x or output_y must be set.
-     *
-     * @param[in]  input            Source tensor. Data type supported: U8.
-     * @param[out] output_x         (Optional) Destination tensor for the X gradient. Data type supported: S32.
-     * @param[out] output_y         (Optional) Destination tensor for the Y gradient. Data type supported: S32.
-     * @param[in]  border_undefined True if the border mode is undefined. False if it's replicate or constant.
-     */
-    void configure(const ITensor *input, ITensor *output_x, ITensor *output_y, bool border_undefined);
-
-    // Inherited methods overridden:
-    void run(const Window &window, const ThreadInfo &info) override;
-    BorderSize border_size() const override;
-
-private:
-    const ITensor *_input;       /**< Input tensor */
-    ITensor       *_output_x;    /**< X output of horizontal pass */
-    ITensor       *_output_y;    /**< Y output of horizontal pass */
-    bool           _run_sobel_x; /**< Do we need to run Sobel X? */
-    bool           _run_sobel_y; /**< Do we need to run Sobel Y? */
-    BorderSize     _border_size; /**< Border size */
-};
-
-/** Interface for the kernel to run the vertical pass of 7x7 Sobel Y filter on a tensor.
- *
-*/
-class NESobel7x7VertKernel : public INEKernel
-{
-public:
-    const char *name() const override
-    {
-        return "NESobel7x7VertKernel";
-    }
-    /** Default constructor */
-    NESobel7x7VertKernel();
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    NESobel7x7VertKernel(const NESobel7x7VertKernel &) = delete;
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    NESobel7x7VertKernel &operator=(const NESobel7x7VertKernel &) = delete;
-    /** Allow instances of this class to be moved */
-    NESobel7x7VertKernel(NESobel7x7VertKernel &&) = default;
-    /** Allow instances of this class to be moved */
-    NESobel7x7VertKernel &operator=(NESobel7x7VertKernel &&) = default;
-    /** Default destructor */
-    ~NESobel7x7VertKernel() = default;
-
-    /** Initialise the kernel's source, destination and border mode.
-     *
-     * @note At least one of output_x or output_y must be set
-     * @note If output_x is set then input_x must be set too
-     * @note If output_y is set then input_y must be set too
-     *
-     * @param[in]  input_x          (Optional) Input for X (X output of hor pass). Data type supported: S32.
-     * @param[in]  input_y          (Optional) Input for Y (Y output of hor pass). Data type supported: S32.
-     * @param[out] output_x         (Optional) Destination tensor for the X gradient. Data type supported: S32.
-     * @param[out] output_y         (Optional) Destination tensor for the Y gradient. Data type supported: S32.
-     * @param[in]  border_undefined True if the border mode is undefined. False if it's replicate or constant.
-     */
-    void configure(const ITensor *input_x, const ITensor *input_y, ITensor *output_x, ITensor *output_y, bool border_undefined);
-
-    // Inherited methods overridden:
-    void run(const Window &window, const ThreadInfo &info) override;
-    BorderSize border_size() const override;
-
-private:
-    const ITensor *_input_x;     /**< X input (X output of the hor pass) */
-    const ITensor *_input_y;     /**< Y input (Y output of the hor pass) */
-    ITensor       *_output_x;    /**< X output of sobel */
-    ITensor       *_output_y;    /**< Y output of sobel */
-    bool           _run_sobel_x; /**< Do we need to run sobel X? */
-    bool           _run_sobel_y; /**< Do we need to run sobel Y? */
-};
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_NESOBEL7x7KERNEL_H */
diff --git a/src/core/NEON/kernels/NETableLookupKernel.cpp b/src/core/NEON/kernels/NETableLookupKernel.cpp
deleted file mode 100644
index 19ce7f0352..0000000000
--- a/src/core/NEON/kernels/NETableLookupKernel.cpp
+++ /dev/null
@@ -1,143 +0,0 @@
-/*
- * Copyright (c) 2016-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "src/core/NEON/kernels/NETableLookupKernel.h"
-
-#include "arm_compute/core/Error.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/ILut.h"
-#include "arm_compute/core/ITensor.h"
-#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Utils.h"
-#include "arm_compute/core/Validate.h"
-
-#include <cstddef>
-#include <cstdint>
-
-using namespace arm_compute;
-
-namespace arm_compute
-{
-class Coordinates;
-
-constexpr unsigned int num_num_elems_processed_per_iteration = 16;
-} // namespace arm_compute
-
-NETableLookupKernel::NETableLookupKernel()
-    : _func(nullptr), _lut(nullptr)
-{
-}
-
-template <class T>
-void NETableLookupKernel::tableLookup(const Window &window)
-{
-    uint32_t     offset = _lut->index_offset();
-    size_t       count  = _lut->num_elements();
-    const auto   lut    = reinterpret_cast<const T *>(_lut->buffer());
-    unsigned int step   = num_num_elems_processed_per_iteration;
-
-    ARM_COMPUTE_ERROR_ON(lut == nullptr);
-
-    Iterator input  = Iterator(_input, window);
-    Iterator output = Iterator(_output, window);
-
-    execute_window_loop(window, [&](const Coordinates &)
-    {
-        auto input_ptr  = reinterpret_cast<const T *>(input.ptr());
-        auto output_ptr = reinterpret_cast<T *>(output.ptr());
-
-        for(unsigned int i = 0; i < step; ++i, ++input_ptr, ++output_ptr)
-        {
-            const int32_t index = offset + *input_ptr;
-
-            if(0 <= index && index < static_cast<int32_t>(count))
-            {
-                *output_ptr = lut[index];
-            }
-        }
-    },
-    input, output);
-}
-
-namespace arm_compute
-{
-template <>
-void NETableLookupKernel::tableLookup<uint8_t>(const Window &window)
-{
-    const uint8_t *const lut  = _lut->buffer();
-    unsigned int         step = num_num_elems_processed_per_iteration;
-
-    ARM_COMPUTE_ERROR_ON(lut == nullptr);
-
-    Iterator input  = Iterator(_input, window);
-    Iterator output = Iterator(_output, window);
-
-    execute_window_loop(window, [&](const Coordinates &)
-    {
-        const uint8_t *input_ptr  = input.ptr();
-        uint8_t       *output_ptr = output.ptr();
-
-        for(unsigned int i = 0; i < step; ++i)
-        {
-            *output_ptr++ = lut[*input_ptr++];
-        }
-    },
-    input, output);
-}
-} // namespace arm_compute
-
-void NETableLookupKernel::configure(const ITensor *input, const ILut *lut, ITensor *output)
-{
-    ARM_COMPUTE_ERROR_ON(input == nullptr);
-    ARM_COMPUTE_ERROR_ON(lut == nullptr);
-    ARM_COMPUTE_ERROR_ON(output == nullptr);
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::S16);
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::S16);
-    ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
-
-    _lut = lut;
-
-    if(input->info()->data_type() == DataType::U8 && output->info()->data_type() == DataType::U8)
-    {
-        _func = &NETableLookupKernel::tableLookup<uint8_t>;
-    }
-    else if(input->info()->data_type() == DataType::S16 && output->info()->data_type() == DataType::S16)
-    {
-        _func = &NETableLookupKernel::tableLookup<int16_t>;
-    }
-    else
-    {
-        ARM_COMPUTE_ERROR("Unsupported combination of input and output DataType.");
-    }
-
-    INESimpleKernel::configure(input, output, num_num_elems_processed_per_iteration);
-}
-
-void NETableLookupKernel::run(const Window &window, const ThreadInfo &info)
-{
-    ARM_COMPUTE_UNUSED(info);
-    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
-    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INESimpleKernel::window(), window);
-    ARM_COMPUTE_ERROR_ON(_func == nullptr);
-    (this->*_func)(window);
-}
diff --git a/src/core/NEON/kernels/NETableLookupKernel.h b/src/core/NEON/kernels/NETableLookupKernel.h
deleted file mode 100644
index 7937999b46..0000000000
--- a/src/core/NEON/kernels/NETableLookupKernel.h
+++ /dev/null
@@ -1,82 +0,0 @@
-/*
- * Copyright (c) 2016-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_NETABLELOOKUPKERNEL_H
-#define ARM_COMPUTE_NETABLELOOKUPKERNEL_H
-
-#include "src/core/NEON/INESimpleKernel.h"
-
-namespace arm_compute
-{
-class ITensor;
-class ILut;
-
-/** Interface for the kernel to perform table lookup calculations. */
-class NETableLookupKernel : public INESimpleKernel
-{
-public:
-    const char *name() const override
-    {
-        return "NETableLookupKernel";
-    }
-    /** Default constructor */
-    NETableLookupKernel();
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    NETableLookupKernel(const NETableLookupKernel &) = delete;
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    NETableLookupKernel &operator=(const NETableLookupKernel &) = delete;
-    /** Allow instances of this class to be moved */
-    NETableLookupKernel(NETableLookupKernel &&) = default;
-    /** Allow instances of this class to be moved */
-    NETableLookupKernel &operator=(NETableLookupKernel &&) = default;
-    /** Default destructor */
-    ~NETableLookupKernel() = default;
-    /** Initialise the kernel's input, lut and output.
-     *
-     * @param[in]  input  An input tensor. Data types supported: U8/S16.
-     * @param[in]  lut    The input LUT.
-     * @param[out] output The output tensor. Data types supported: same as @p input
-     */
-    void configure(const ITensor *input, const ILut *lut, ITensor *output);
-
-    // Inherited methods overridden:
-    void run(const Window &window, const ThreadInfo &info) override;
-
-private:
-    /** Perform table lookup on a given window.
-     *
-     * @param window window Region on which to execute the kernel.
-     */
-    template <class T>
-    void tableLookup(const Window &window);
-    /** Common signature for all the specialised lut functions
-     *
-     * @param[in] window Region on which to execute the kernel.
-     */
-    using TableLookupFunction = void (NETableLookupKernel::*)(const Window &window);
-    /** Sub function to use for the particular tensor types passed to configure() */
-    TableLookupFunction _func;
-    const ILut         *_lut;
-};
-} // namespace arm_compute
-#endif /* ARM_COMPUTE_NETABLELOOKUPKERNEL_H */
diff --git a/src/core/NEON/kernels/NEThresholdKernel.cpp b/src/core/NEON/kernels/NEThresholdKernel.cpp
deleted file mode 100644
index 108f29f377..0000000000
--- a/src/core/NEON/kernels/NEThresholdKernel.cpp
+++ /dev/null
@@ -1,217 +0,0 @@
-/*
- * Copyright (c) 2016-2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "src/core/NEON/kernels/NEThresholdKernel.h"
-
-#include "arm_compute/core/Error.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/ITensor.h"
-#include "arm_compute/core/Validate.h"
-#include "src/core/helpers/AutoConfiguration.h"
-#include "src/core/helpers/WindowHelpers.h"
-
-#include "src/core/NEON/wrapper/wrapper.h"
-
-namespace arm_compute
-{
-namespace
-{
-Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, const ThresholdKernelInfo &info)
-{
-    ARM_COMPUTE_UNUSED(info);
-    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
-    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8);
-
-    // Checks performed when output is configured
-    if((output != nullptr) && (output->total_size() != 0))
-    {
-        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output);
-        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
-    }
-
-    return Status{};
-}
-
-std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *output)
-{
-    // Configure kernel window
-    Window win = calculate_max_window(*input, Steps());
-
-    // Output auto inizialitation if not yet initialized
-    auto_init_if_empty(*output, *input->clone());
-
-    // NEThresholdKernel doesn't need padding so update_window_and_padding() can be skipped
-    Coordinates coord;
-    coord.set_num_dimensions(output->num_dimensions());
-    output->set_valid_region(ValidRegion(coord, output->tensor_shape()));
-
-    return std::make_pair(Status{}, win);
-}
-} // namespace
-
-NEThresholdKernel::NEThresholdKernel()
-    : _func(nullptr), _input(nullptr), _output(nullptr), _info()
-{
-}
-
-void NEThresholdKernel::configure(const ITensor *input, ITensor *output, const ThresholdKernelInfo &info)
-{
-    ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
-    ARM_COMPUTE_ERROR_THROW_ON(validate(input->info(), output->info(), info));
-
-    _input  = input;
-    _output = output;
-    _info   = info;
-
-    switch(_info.type)
-    {
-        case ThresholdType::BINARY:
-            _func = &NEThresholdKernel::run_binary;
-            break;
-        case ThresholdType::RANGE:
-            _func = &NEThresholdKernel::run_range;
-            break;
-        default:
-            ARM_COMPUTE_ERROR("Thresholding type not recognized");
-            break;
-    }
-
-    // Configure kernel window
-    auto win_config = validate_and_configure_window(input->info(), output->info());
-    ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
-    ICPPKernel::configure(win_config.second);
-}
-
-Status NEThresholdKernel::validate(const ITensorInfo *input, const ITensorInfo *output, const ThresholdKernelInfo &info)
-{
-    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, info));
-    ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(), output->clone().get()).first);
-
-    return Status{};
-}
-
-inline void NEThresholdKernel::run_binary(const Window &window)
-{
-    /** Neon vector tag type. */
-    using Type         = uint8_t;
-    using ExactTagType = typename wrapper::traits::neon_bitvector_tag_t<Type, wrapper::traits::BitWidth::W128>;
-
-    const int  window_step_x  = 16 / sizeof(Type);
-    const auto window_start_x = static_cast<int>(window.x().start());
-    const auto window_end_x   = static_cast<int>(window.x().end());
-
-    Window win_collapsed = window.collapse_if_possible(window, Window::DimZ);
-    win_collapsed.set(Window::DimX, Window::Dimension(0, 1, 1));
-
-    const uint8_t threshold   = _info.threshold;
-    const uint8_t true_value  = _info.true_value;
-    const uint8_t false_value = _info.false_value;
-
-    const auto vthreshold   = wrapper::vdup_n(threshold, ExactTagType{});
-    const auto vtrue_value  = wrapper::vdup_n(true_value, ExactTagType{});
-    const auto vfalse_value = wrapper::vdup_n(false_value, ExactTagType{});
-
-    Iterator input(_input, win_collapsed);
-    Iterator output(_output, win_collapsed);
-
-    execute_window_loop(win_collapsed, [&](const Coordinates &)
-    {
-        const auto input_ptr  = reinterpret_cast<const Type *>(input.ptr());
-        const auto output_ptr = reinterpret_cast<Type *>(output.ptr());
-
-        int x = window_start_x;
-        for(; x <= (window_end_x - window_step_x); x += window_step_x)
-        {
-            const auto vdata = wrapper::vloadq(input_ptr + x);
-            const auto vmask = wrapper::vcgt(vdata, vthreshold);
-            wrapper::vstore(output_ptr + x, wrapper::vbsl(vmask, vtrue_value, vfalse_value));
-        }
-
-        for(; x < window_end_x; ++x)
-        {
-            const Type data   = *(reinterpret_cast<const Type *>(input_ptr + x));
-            *(output_ptr + x) = (data > threshold) ? true_value : false_value;
-        }
-    },
-    input, output);
-}
-
-inline void NEThresholdKernel::run_range(const Window &window)
-{
-    /** Neon vector tag type. */
-    using Type         = uint8_t;
-    using ExactTagType = typename wrapper::traits::neon_bitvector_tag_t<Type, wrapper::traits::BitWidth::W128>;
-
-    const int  window_step_x  = 16 / sizeof(Type);
-    const auto window_start_x = static_cast<int>(window.x().start());
-    const auto window_end_x   = static_cast<int>(window.x().end());
-
-    Window win_collapsed = window.collapse_if_possible(window, Window::DimZ);
-    win_collapsed.set(Window::DimX, Window::Dimension(0, 1, 1));
-
-    const uint8_t lower_threshold = _info.threshold;
-    const uint8_t upper_threshold = _info.upper;
-    const uint8_t true_value      = _info.true_value;
-    const uint8_t false_value     = _info.false_value;
-
-    const auto vlower_threshold = wrapper::vdup_n(lower_threshold, ExactTagType{});
-    const auto vupper_threshold = wrapper::vdup_n(upper_threshold, ExactTagType{});
-    const auto vtrue_value      = wrapper::vdup_n(true_value, ExactTagType{});
-    const auto vfalse_value     = wrapper::vdup_n(false_value, ExactTagType{});
-
-    Iterator input(_input, win_collapsed);
-    Iterator output(_output, win_collapsed);
-
-    execute_window_loop(win_collapsed, [&](const Coordinates &)
-    {
-        const auto input_ptr  = reinterpret_cast<const Type *>(input.ptr());
-        const auto output_ptr = reinterpret_cast<Type *>(output.ptr());
-
-        int x = window_start_x;
-        for(; x <= (window_end_x - window_step_x); x += window_step_x)
-        {
-            const auto vdata = wrapper::vloadq(input_ptr + x);
-            auto       vmask = wrapper::vcle(vdata, vupper_threshold);
-            vmask            = wrapper::vand(wrapper::vcge(vdata, vlower_threshold), vmask);
-            wrapper::vstore(output_ptr + x, wrapper::vbsl(vmask, vtrue_value, vfalse_value));
-        }
-
-        for(; x < window_end_x; ++x)
-        {
-            const Type data   = *(reinterpret_cast<const Type *>(input_ptr + x));
-            *(output_ptr + x) = (data <= upper_threshold && data >= lower_threshold) ? true_value : false_value;
-        }
-    },
-    input, output);
-}
-
-void NEThresholdKernel::run(const Window &window, const ThreadInfo &info)
-{
-    ARM_COMPUTE_UNUSED(info);
-    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
-    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
-    ARM_COMPUTE_ERROR_ON(_func == nullptr);
-
-    (this->*_func)(window);
-}
-} // namespace arm_compute
diff --git a/src/core/NEON/kernels/NEThresholdKernel.h b/src/core/NEON/kernels/NEThresholdKernel.h
deleted file mode 100644
index 6b3b3866b0..0000000000
--- a/src/core/NEON/kernels/NEThresholdKernel.h
+++ /dev/null
@@ -1,88 +0,0 @@
-/*
- * Copyright (c) 2016-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_NETHRESHOLDKERNEL_H
-#define ARM_COMPUTE_NETHRESHOLDKERNEL_H
-
-#include "arm_compute/core/KernelDescriptors.h"
-#include "arm_compute/core/Types.h"
-#include "src/core/NEON/INEKernel.h"
-
-namespace arm_compute
-{
-class ITensor;
-
-/** Interface for the thresholding kernel */
-class NEThresholdKernel : public INEKernel
-{
-public:
-    const char *name() const override
-    {
-        return "NEThresholdKernel";
-    }
-    /** Default constructor */
-    NEThresholdKernel();
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    NEThresholdKernel(const NEThresholdKernel &) = delete;
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    NEThresholdKernel &operator=(const NEThresholdKernel &) = delete;
-    /** Allow instances of this class to be moved */
-    NEThresholdKernel(NEThresholdKernel &&) = default;
-    /** Allow instances of this class to be moved */
-    NEThresholdKernel &operator=(NEThresholdKernel &&) = default;
-    /** Default destructor */
-    ~NEThresholdKernel() = default;
-    /** Initialise the kernel's input, output and threshold parameters.
-     *
-     * @param[in]  input  An input tensor. Data type supported: U8
-     * @param[out] output The output tensor. Data type supported: U8.
-     * @param[in]  info   Threshold kernel descriptor
-     */
-    void configure(const ITensor *input, ITensor *output, const ThresholdKernelInfo &info);
-    /** Static function to check if given info will lead to a valid configuration of @ref NEThresholdKernel
-     *
-     * @param[in] input  Input tensor info. Data type supported: U8
-     * @param[in] output Output tensor info. Data type supported: U8
-     * @param[in] info   Threshold kernel descriptor
-     *
-     * @return A status containing an error code in case of failure
-     */
-    static Status validate(const ITensorInfo *input, const ITensorInfo *output, const ThresholdKernelInfo &info);
-
-    // Inherited methods overridden:
-    void run(const Window &window, const ThreadInfo &info) override;
-
-private:
-    /** run binary thresholding on the given window */
-    void run_binary(const Window &window);
-    /** run range thresholding on the given window */
-    void run_range(const Window &window);
-
-    void (NEThresholdKernel::*_func)(const Window &window);
-
-    const ITensor      *_input;  /**< Input */
-    ITensor            *_output; /**< Output */
-    ThresholdKernelInfo _info;   /**< Threshold descriptor */
-};
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_NETHRESHOLDKERNEL_H */
diff --git a/src/core/NEON/kernels/NEWarpKernel.cpp b/src/core/NEON/kernels/NEWarpKernel.cpp
deleted file mode 100644
index 1ae076153b..0000000000
--- a/src/core/NEON/kernels/NEWarpKernel.cpp
+++ /dev/null
@@ -1,807 +0,0 @@
-/*
- * Copyright (c) 2016-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "src/core/NEON/kernels/NEWarpKernel.h"
-
-#include "arm_compute/core/Coordinates.h"
-#include "arm_compute/core/Error.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/ITensor.h"
-#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Validate.h"
-#include "arm_compute/core/Window.h"
-#include "src/core/AccessWindowStatic.h"
-#include "src/core/helpers/AutoConfiguration.h"
-#include "src/core/helpers/ScaleHelpers.h"
-#include "src/core/helpers/WindowHelpers.h"
-
-#include <cstddef>
-
-using namespace arm_compute;
-
-namespace
-{
-inline uint8_t nearest_interpolation(const uint8_t *in_ptr, int x, int y, size_t stride)
-{
-    return in_ptr[x + y * stride];
-}
-} // namespace
-
-INEWarpKernel::INEWarpKernel()
-    : _func(nullptr), _input(nullptr), _output(nullptr), _constant_border_value(0), _matrix()
-{
-}
-
-BorderSize INEWarpKernel::border_size() const
-{
-    return BorderSize(1);
-}
-
-void INEWarpKernel::run(const Window &window, const ThreadInfo &info)
-{
-    ARM_COMPUTE_UNUSED(info);
-    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
-    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
-    ARM_COMPUTE_ERROR_ON(_func == nullptr);
-
-    (this->*_func)(window);
-}
-
-void INEWarpKernel::configure(const ITensor *input, ITensor *output, const std::array<float, 9> &matrix, BorderMode border_mode, uint8_t constant_border_value)
-{
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8);
-
-    _matrix                = matrix;
-    _constant_border_value = constant_border_value;
-
-    switch(border_mode)
-    {
-        case BorderMode::UNDEFINED:
-            _func = &INEWarpKernel::warp_undefined;
-            break;
-        case BorderMode::CONSTANT:
-            _func = &INEWarpKernel::warp_constant;
-            break;
-        case BorderMode::REPLICATE:
-            _func = &INEWarpKernel::warp_replicate;
-            break;
-        default:
-            ARM_COMPUTE_ERROR("Border mode not supported");
-            break;
-    }
-
-    _input  = input;
-    _output = output;
-
-    // Configure kernel window
-    Window win = calculate_max_window(*output->info(), Steps(1U));
-
-    const ValidRegion &input_valid_region = input->info()->valid_region();
-
-    // Reads can occur within the valid region of the input
-    AccessWindowStatic input_access(input->info(),
-                                    input_valid_region.anchor[0] - border_size().left, input_valid_region.anchor[1] - border_size().top,
-                                    input_valid_region.anchor[0] + input_valid_region.shape[0] + border_size().right,
-                                    input_valid_region.anchor[1] + input_valid_region.shape[1] + border_size().bottom);
-    AccessWindowHorizontal output_access(output->info(), 0, 1);
-
-    update_window_and_padding(win, input_access, output_access);
-
-    output_access.set_valid_region(win, ValidRegion(Coordinates(), output->info()->tensor_shape()));
-
-    INEKernel::configure(win);
-}
-
-template <InterpolationPolicy interpolation>
-void NEWarpAffineKernel<interpolation>::warp_undefined(const Window &window)
-{
-    // Don't increment in X and Y direction for the input tensor
-    // A pointer to the start of this plane is needed as base for the precomputed offsets
-    Window win_in(window);
-    win_in.set(Window::DimX, Window::Dimension(0, 0, 0));
-    win_in.set(Window::DimY, Window::Dimension(0, 0, 0));
-
-    Iterator in(_input, win_in);
-    Iterator out(_output, window);
-
-    const int    min_x  = _input->info()->valid_region().anchor[0];
-    const int    max_x  = min_x + _input->info()->valid_region().shape[0];
-    const int    min_y  = _input->info()->valid_region().anchor[1];
-    const int    max_y  = min_y + _input->info()->valid_region().shape[1];
-    const size_t stride = _input->info()->strides_in_bytes()[1];
-
-    // x0 = M01 * x + M01 * y + M02
-    // y0 = M11 * x + M11 * y + M12
-    const float M00 = _matrix[0];
-    const float M10 = _matrix[1];
-    const float M01 = _matrix[0 + 1 * 2];
-    const float M11 = _matrix[1 + 1 * 2];
-    const float M02 = _matrix[0 + 2 * 2];
-    const float M12 = _matrix[1 + 2 * 2];
-
-    // "M00 * x" and "M10 * x", when x = window.x.start
-    const float start_x0 = M00 * window.x().start();
-    const float start_y0 = M10 * window.x().start();
-
-    // Current row
-    int y_cur  = window.y().start();
-    int z_cur  = window.z().start();
-    int d3_cur = window[3].start();
-    int d4_cur = window[4].start();
-    int d5_cur = window[5].start();
-
-    // const_x0 and const_y0 are the constant parts of x0 and y0 during the row processing
-    float const_x0 = M01 * y_cur + M02;
-    float const_y0 = M11 * y_cur + M12;
-
-    // Affine warp coordinates
-    float x0 = start_x0 + const_x0;
-    float y0 = start_y0 + const_y0;
-
-    execute_window_loop(window, [&](const Coordinates & id)
-    {
-        // Check if we are processing a new row. If so, update the current processed row (y_cur), x0, y0 and z0
-        if((y_cur != id.y()) || (z_cur != id.z()) || (d3_cur != id[3]) || (d4_cur != id[4]) || (d5_cur != id[5]))
-        {
-            y_cur  = id.y();
-            z_cur  = id.z();
-            d3_cur = id[3];
-            d4_cur = id[4];
-            d5_cur = id[5];
-
-            const_x0 = M01 * y_cur + M02;
-            const_y0 = M11 * y_cur + M12;
-
-            x0 = start_x0 + const_x0;
-            y0 = start_y0 + const_y0;
-        }
-
-        // Only write to output if x0 and y0 are within the valid region.
-        // Otherwise the read value would be undefined.
-        if((min_y <= y0) && (y0 < max_y) && (min_x <= x0) && (x0 < max_x))
-        {
-            switch(interpolation)
-            {
-                case InterpolationPolicy::NEAREST_NEIGHBOR:
-                    *out.ptr() = nearest_interpolation(in.ptr(), x0, y0, stride);
-                    break;
-                case InterpolationPolicy::BILINEAR:
-                    *out.ptr() = scale_helpers::pixel_bilinear_c1(in.ptr(), stride, x0, y0);
-                    break;
-                default:
-                    ARM_COMPUTE_ERROR("Interpolation not supported");
-            }
-        }
-
-        x0 += M00;
-        y0 += M10;
-    },
-    in, out);
-}
-
-template <InterpolationPolicy interpolation>
-void NEWarpAffineKernel<interpolation>::warp_constant(const Window &window)
-{
-    // Don't increment in X and Y direction for the input tensor
-    // A pointer to the start of this plane is needed as base for the precomputed offsets
-    Window win_in(window);
-    win_in.set(Window::DimX, Window::Dimension(0, 0, 0));
-    win_in.set(Window::DimY, Window::Dimension(0, 0, 0));
-
-    Iterator in(_input, win_in);
-    Iterator out(_output, window);
-
-    const int    min_x  = _input->info()->valid_region().anchor[0];
-    const int    max_x  = min_x + _input->info()->valid_region().shape[0];
-    const int    min_y  = _input->info()->valid_region().anchor[1];
-    const int    max_y  = min_y + _input->info()->valid_region().shape[1];
-    const size_t stride = _input->info()->strides_in_bytes()[1];
-
-    // x0 = M01 * x + M01 * y + M02
-    // y0 = M11 * x + M11 * y + M12
-    const float M00 = _matrix[0];
-    const float M10 = _matrix[1];
-    const float M01 = _matrix[0 + 1 * 2];
-    const float M11 = _matrix[1 + 1 * 2];
-    const float M02 = _matrix[0 + 2 * 2];
-    const float M12 = _matrix[1 + 2 * 2];
-
-    // "M00 * x" and "M10 * x", when x = window.x.start
-    const float start_x0 = M00 * window.x().start();
-    const float start_y0 = M10 * window.x().start();
-
-    // Current row
-    int y_cur  = window.y().start();
-    int z_cur  = window.z().start();
-    int d3_cur = window[3].start();
-    int d4_cur = window[4].start();
-    int d5_cur = window[5].start();
-
-    // const_x0 and const_y0 are the constant parts of x0 and y0 during the row processing
-    float const_x0 = M01 * y_cur + M02;
-    float const_y0 = M11 * y_cur + M12;
-
-    // Affine warp coordinates
-    float x0 = start_x0 + const_x0;
-    float y0 = start_y0 + const_y0;
-
-    execute_window_loop(window, [&](const Coordinates & id)
-    {
-        // Check if we are processing a new row. If so, update the current processed row (y_cur), x0, y0 and z0
-        if((y_cur != id.y()) || (z_cur != id.z()) || (d3_cur != id[3]) || (d4_cur != id[4]) || (d5_cur != id[5]))
-        {
-            y_cur  = id.y();
-            z_cur  = id.z();
-            d3_cur = id[3];
-            d4_cur = id[4];
-            d5_cur = id[5];
-
-            const_x0 = M01 * y_cur + M02;
-            const_y0 = M11 * y_cur + M12;
-
-            x0 = start_x0 + const_x0;
-            y0 = start_y0 + const_y0;
-        }
-
-        // Only use input values if x0 and y0 are within the valid region.
-        // Otherwise write the constant border value.
-        if((min_y <= y0) && (y0 < max_y) && (min_x <= x0) && (x0 < max_x))
-        {
-            switch(interpolation)
-            {
-                case InterpolationPolicy::NEAREST_NEIGHBOR:
-                    *out.ptr() = nearest_interpolation(in.ptr(), x0, y0, stride);
-                    break;
-                case InterpolationPolicy::BILINEAR:
-                    *out.ptr() = scale_helpers::pixel_bilinear_c1(in.ptr(), stride, x0, y0);
-                    break;
-                default:
-                    ARM_COMPUTE_ERROR("Interpolation not supported");
-            }
-        }
-        else
-        {
-            switch(interpolation)
-            {
-                case InterpolationPolicy::NEAREST_NEIGHBOR:
-                    *out.ptr() = _constant_border_value;
-                    break;
-                case InterpolationPolicy::BILINEAR:
-                {
-                    const auto xi   = utility::clamp<int>(std::floor(x0), min_x - 1, max_x);
-                    const auto yi   = utility::clamp<int>(std::floor(y0), min_y - 1, max_y);
-                    const auto xi_1 = utility::clamp<int>(std::floor(x0 + 1), min_x - 1, max_x);
-                    const auto yi_1 = utility::clamp<int>(std::floor(y0 + 1), min_y - 1, max_y);
-
-                    const float dx  = x0 - std::floor(x0);
-                    const float dy  = y0 - std::floor(y0);
-                    const float dx1 = 1.0f - dx;
-                    const float dy1 = 1.0f - dy;
-
-                    const float a00 = *(in.ptr() + xi + yi * stride);
-                    const float a01 = *(in.ptr() + xi_1 + yi * stride);
-                    const float a10 = *(in.ptr() + xi + yi_1 * stride);
-                    const float a11 = *(in.ptr() + xi_1 + yi_1 * stride);
-
-                    *out.ptr() = a00 * (dx1 * dy1) + a01 * (dx * dy1) + a10 * (dx1 * dy) + a11 * (dx * dy);
-                }
-                break;
-                default:
-                    ARM_COMPUTE_ERROR("Interpolation not supported");
-            }
-        }
-
-        x0 += M00;
-        y0 += M10;
-    },
-    in, out);
-}
-
-template <InterpolationPolicy interpolation>
-void NEWarpAffineKernel<interpolation>::warp_replicate(const Window &window)
-{
-    // Don't increment in X and Y direction for the input tensor
-    // A pointer to the start of this plane is needed as base for the precomputed offsets
-    Window win_in(window);
-    win_in.set(Window::DimX, Window::Dimension(0, 0, 0));
-    win_in.set(Window::DimY, Window::Dimension(0, 0, 0));
-
-    Iterator in(_input, win_in);
-    Iterator out(_output, window);
-
-    const int    min_x  = _input->info()->valid_region().anchor[0];
-    const int    max_x  = min_x + _input->info()->valid_region().shape[0];
-    const int    min_y  = _input->info()->valid_region().anchor[1];
-    const int    max_y  = min_y + _input->info()->valid_region().shape[1];
-    const size_t stride = _input->info()->strides_in_bytes()[1];
-
-    // Current row
-    int y_cur  = window.y().start();
-    int z_cur  = window.z().start();
-    int d3_cur = window[3].start();
-    int d4_cur = window[4].start();
-    int d5_cur = window[5].start();
-
-    const float M00 = _matrix[0];
-    const float M10 = _matrix[1];
-    const float M01 = _matrix[0 + 1 * 2];
-    const float M11 = _matrix[1 + 1 * 2];
-    const float M02 = _matrix[0 + 2 * 2];
-    const float M12 = _matrix[1 + 2 * 2];
-
-    // "M00 * x" and "M10 * x", when x = window.x.start
-    const float start_x0 = M00 * window.x().start();
-    const float start_y0 = M10 * window.x().start();
-
-    // const_x0 and const_y0 are the constant parts of x0 and y0 during the row processing
-    float const_x0 = M01 * y_cur + M02;
-    float const_y0 = M11 * y_cur + M12;
-
-    float x0 = start_x0 + const_x0;
-    float y0 = start_y0 + const_y0;
-
-    execute_window_loop(window, [&](const Coordinates & id)
-    {
-        // Check if we are processing a new row. If so, update the current processed row (y_cur), x0, y0 and z0
-        if((y_cur != id.y()) || (z_cur != id.z()) || (d3_cur != id[3]) || (d4_cur != id[4]) || (d5_cur != id[5]))
-        {
-            y_cur  = id.y();
-            z_cur  = id.z();
-            d3_cur = id[3];
-            d4_cur = id[4];
-            d5_cur = id[5];
-
-            const_x0 = M01 * y_cur + M02;
-            const_y0 = M11 * y_cur + M12;
-
-            x0 = start_x0 + const_x0;
-            y0 = start_y0 + const_y0;
-        }
-
-        // Only load from (x0, y0) if the point is within the valid region.
-        // Otherwise load from the edge of the valid region.
-        if((min_y <= y0) && (y0 < max_y) && (min_x <= x0) && (x0 < max_x))
-        {
-            switch(interpolation)
-            {
-                case InterpolationPolicy::NEAREST_NEIGHBOR:
-                    *out.ptr() = nearest_interpolation(in.ptr(), x0, y0, stride);
-                    break;
-                case InterpolationPolicy::BILINEAR:
-                    *out.ptr() = scale_helpers::pixel_bilinear_c1(in.ptr(), stride, x0, y0);
-                    break;
-                default:
-                    ARM_COMPUTE_ERROR("Interpolation not supported");
-            }
-        }
-        else
-        {
-            // Clamp coordinates
-            const auto xi = utility::clamp<int>(std::floor(x0), min_x, max_x - 1);
-            const auto yi = utility::clamp<int>(std::floor(y0), min_y, max_y - 1);
-            switch(interpolation)
-            {
-                case InterpolationPolicy::NEAREST_NEIGHBOR:
-                    *out.ptr() = *(in.ptr() + xi + yi * stride);
-                    break;
-                case InterpolationPolicy::BILINEAR:
-                {
-                    const auto xi_1 = utility::clamp<int>(std::floor(x0 + 1), min_x, max_x - 1);
-                    const auto yi_1 = utility::clamp<int>(std::floor(y0 + 1), min_y, max_y - 1);
-
-                    const float dx  = x0 - std::floor(x0);
-                    const float dy  = y0 - std::floor(y0);
-                    const float dx1 = 1.0f - dx;
-                    const float dy1 = 1.0f - dy;
-
-                    const float a00 = *(in.ptr() + xi + yi * stride);
-                    const float a01 = *(in.ptr() + xi_1 + yi * stride);
-                    const float a10 = *(in.ptr() + xi + yi_1 * stride);
-                    const float a11 = *(in.ptr() + xi_1 + yi_1 * stride);
-
-                    *out.ptr() = a00 * (dx1 * dy1) + a01 * (dx * dy1) + a10 * (dx1 * dy) + a11 * (dx * dy);
-                }
-                break;
-                default:
-                    ARM_COMPUTE_ERROR("Interpolation not supported");
-            }
-        }
-
-        x0 += M00;
-        y0 += M10;
-    },
-    in, out);
-}
-
-template <InterpolationPolicy interpolation>
-void NEWarpPerspectiveKernel<interpolation>::warp_undefined(const Window &window)
-{
-    // Don't increment in X and Y direction for the input tensor
-    // A pointer to the start of this plane is needed as base for the precomputed offsets
-    Window win_in(window);
-    win_in.set(Window::DimX, Window::Dimension(0, 0, 0));
-    win_in.set(Window::DimY, Window::Dimension(0, 0, 0));
-
-    Iterator in(_input, win_in);
-    Iterator out(_output, window);
-
-    const int    min_x  = _input->info()->valid_region().anchor[0];
-    const int    max_x  = min_x + _input->info()->valid_region().shape[0];
-    const int    min_y  = _input->info()->valid_region().anchor[1];
-    const int    max_y  = min_y + _input->info()->valid_region().shape[1];
-    const size_t stride = _input->info()->strides_in_bytes()[1];
-
-    // x0 = M00 * x + M01 * y + M02
-    // y0 = M10 * x + M11 * y + M12
-    // z0 = M20 * x + M21 * y + M22
-    // xn = x0 / z0
-    // yn = y0 / z0
-    const float M00 = _matrix[0];
-    const float M10 = _matrix[1];
-    const float M20 = _matrix[2];
-    const float M01 = _matrix[0 + 1 * 3];
-    const float M11 = _matrix[1 + 1 * 3];
-    const float M21 = _matrix[2 + 1 * 3];
-    const float M02 = _matrix[0 + 2 * 3];
-    const float M12 = _matrix[1 + 2 * 3];
-    const float M22 = _matrix[2 + 2 * 3];
-
-    // "M00 * x", "M10 * x" and "M20 * x", when x = window.x.start
-    const float start_x0 = M00 * window.x().start();
-    const float start_y0 = M10 * window.x().start();
-    const float start_z0 = M20 * window.x().start();
-
-    // Current row
-    int y_cur  = window.y().start();
-    int z_cur  = window.z().start();
-    int d3_cur = window[3].start();
-    int d4_cur = window[4].start();
-    int d5_cur = window[5].start();
-
-    // const_x0, const_y0 and const_z0 are the constant parts of x0, y0 and z0 during the row processing
-    float const_x0 = M01 * y_cur + M02;
-    float const_y0 = M11 * y_cur + M12;
-    float const_z0 = M21 * y_cur + M22;
-
-    // Perspective warp coordinates
-    float x0 = start_x0 + const_x0;
-    float y0 = start_y0 + const_y0;
-    float z0 = start_z0 + const_z0;
-
-    execute_window_loop(window, [&](const Coordinates & id)
-    {
-        // Check if we are processing a new row. If so, update the current processed row (y_cur), x0, y0 and z0
-        if((y_cur != id.y()) || (z_cur != id.z()) || (d3_cur != id[3]) || (d4_cur != id[4]) || (d5_cur != id[5]))
-        {
-            y_cur  = id.y();
-            z_cur  = id.z();
-            d3_cur = id[3];
-            d4_cur = id[4];
-            d5_cur = id[5];
-
-            const_x0 = M01 * y_cur + M02;
-            const_y0 = M11 * y_cur + M12;
-            const_z0 = M21 * y_cur + M22;
-
-            x0 = start_x0 + const_x0;
-            y0 = start_y0 + const_y0;
-            z0 = start_z0 + const_z0;
-        }
-
-        const float xn = x0 / z0;
-        const float yn = y0 / z0;
-
-        // Only write to output if xn and yn are within the valid region.
-        // Otherwise the read value would be undefined.
-        if((min_y <= yn) && (yn < max_y) && (min_x <= xn) && (xn < max_x))
-        {
-            switch(interpolation)
-            {
-                case InterpolationPolicy::NEAREST_NEIGHBOR:
-                    *out.ptr() = nearest_interpolation(in.ptr(), xn, yn, stride);
-                    break;
-                case InterpolationPolicy::BILINEAR:
-                    *out.ptr() = scale_helpers::pixel_bilinear_c1(in.ptr(), stride, xn, yn);
-                    break;
-                default:
-                    ARM_COMPUTE_ERROR("Interpolation not supported");
-            }
-        }
-
-        x0 += M00;
-        y0 += M10;
-        z0 += M20;
-    },
-    in, out);
-}
-
-template <InterpolationPolicy interpolation>
-void NEWarpPerspectiveKernel<interpolation>::warp_constant(const Window &window)
-{
-    // Don't increment in X and Y direction for the input tensor
-    // A pointer to the start of this plane is needed as base for the precomputed offsets
-    Window win_in(window);
-    win_in.set(Window::DimX, Window::Dimension(0, 0, 0));
-    win_in.set(Window::DimY, Window::Dimension(0, 0, 0));
-
-    Iterator in(_input, win_in);
-    Iterator out(_output, window);
-
-    const int    min_x  = _input->info()->valid_region().anchor[0];
-    const int    max_x  = min_x + _input->info()->valid_region().shape[0];
-    const int    min_y  = _input->info()->valid_region().anchor[1];
-    const int    max_y  = min_y + _input->info()->valid_region().shape[1];
-    const size_t stride = _input->info()->strides_in_bytes()[1];
-
-    // x0 = M00 * x + M01 * y + M02
-    // y0 = M10 * x + M11 * y + M12
-    // z0 = M20 * x + M21 * y + M22
-    // xn = x0 / z0
-    // yn = y0 / z0
-    const float M00 = _matrix[0];
-    const float M10 = _matrix[1];
-    const float M20 = _matrix[2];
-    const float M01 = _matrix[0 + 1 * 3];
-    const float M11 = _matrix[1 + 1 * 3];
-    const float M21 = _matrix[2 + 1 * 3];
-    const float M02 = _matrix[0 + 2 * 3];
-    const float M12 = _matrix[1 + 2 * 3];
-    const float M22 = _matrix[2 + 2 * 3];
-
-    // "M00 * x", "M10 * x" and "M20 * x", when x = window.x.start
-    const float start_x0 = M00 * window.x().start();
-    const float start_y0 = M10 * window.x().start();
-    const float start_z0 = M20 * window.x().start();
-
-    // Current row
-    int y_cur  = window.y().start();
-    int z_cur  = window.z().start();
-    int d3_cur = window[3].start();
-    int d4_cur = window[4].start();
-    int d5_cur = window[5].start();
-
-    // const_x0, const_y0 and const_z0 are the constant parts of x0, y0 and z0 during the row processing
-    float const_x0 = M01 * y_cur + M02;
-    float const_y0 = M11 * y_cur + M12;
-    float const_z0 = M21 * y_cur + M22;
-
-    // Perspective warp coordinates
-    float x0 = start_x0 + const_x0;
-    float y0 = start_y0 + const_y0;
-    float z0 = start_z0 + const_z0;
-
-    execute_window_loop(window, [&](const Coordinates & id)
-    {
-        // Check if we are processing a new row. If so, update the current processed row (y_cur), x0, y0 and z0
-        if((y_cur != id.y()) || (z_cur != id.z()) || (d3_cur != id[3]) || (d4_cur != id[4]) || (d5_cur != id[5]))
-        {
-            y_cur  = id.y();
-            z_cur  = id.z();
-            d3_cur = id[3];
-            d4_cur = id[4];
-            d5_cur = id[5];
-
-            const_x0 = M01 * y_cur + M02;
-            const_y0 = M11 * y_cur + M12;
-            const_z0 = M21 * y_cur + M22;
-
-            x0 = start_x0 + const_x0;
-            y0 = start_y0 + const_y0;
-            z0 = start_z0 + const_z0;
-        }
-
-        const float xn = x0 / z0;
-        const float yn = y0 / z0;
-
-        // Only use input values if xn and yn are within the valid region.
-        if((min_y <= yn) && (yn < max_y) && (min_x <= xn) && (xn < max_x))
-        {
-            switch(interpolation)
-            {
-                case InterpolationPolicy::NEAREST_NEIGHBOR:
-                    *out.ptr() = nearest_interpolation(in.ptr(), xn, yn, stride);
-                    break;
-                case InterpolationPolicy::BILINEAR:
-                    *out.ptr() = scale_helpers::pixel_bilinear_c1(in.ptr(), stride, xn, yn);
-                    break;
-                default:
-                    ARM_COMPUTE_ERROR("Interpolation not supported");
-            }
-        }
-        else
-        {
-            switch(interpolation)
-            {
-                case InterpolationPolicy::NEAREST_NEIGHBOR:
-                    *out.ptr() = _constant_border_value;
-                    break;
-                case InterpolationPolicy::BILINEAR:
-                {
-                    const auto xi   = utility::clamp<int>(std::floor(xn), min_x - 1, max_x);
-                    const auto yi   = utility::clamp<int>(std::floor(yn), min_y - 1, max_y);
-                    const auto xi_1 = utility::clamp<int>(std::floor(xn + 1), min_x - 1, max_x);
-                    const auto yi_1 = utility::clamp<int>(std::floor(yn + 1), min_y - 1, max_y);
-
-                    const float dx  = xn - std::floor(xn);
-                    const float dy  = yn - std::floor(yn);
-                    const float dx1 = 1.0f - dx;
-                    const float dy1 = 1.0f - dy;
-
-                    const float a00 = *(in.ptr() + xi + yi * stride);
-                    const float a01 = *(in.ptr() + xi_1 + yi * stride);
-                    const float a10 = *(in.ptr() + xi + yi_1 * stride);
-                    const float a11 = *(in.ptr() + xi_1 + yi_1 * stride);
-
-                    *out.ptr() = a00 * (dx1 * dy1) + a01 * (dx * dy1) + a10 * (dx1 * dy) + a11 * (dx * dy);
-                }
-                break;
-                default:
-                    ARM_COMPUTE_ERROR("Interpolation not supported");
-            }
-        }
-
-        x0 += M00;
-        y0 += M10;
-        z0 += M20;
-    },
-    in, out);
-}
-
-template <InterpolationPolicy interpolation>
-void NEWarpPerspectiveKernel<interpolation>::warp_replicate(const Window &window)
-{
-    // Don't increment in X and Y direction for the input tensor
-    // A pointer to the start of this plane is needed as base for the precomputed offsets
-    Window win_in(window);
-    win_in.set(Window::DimX, Window::Dimension(0, 0, 0));
-    win_in.set(Window::DimY, Window::Dimension(0, 0, 0));
-
-    Iterator in(_input, win_in);
-    Iterator out(_output, window);
-
-    const int    min_x  = _input->info()->valid_region().anchor[0];
-    const int    max_x  = min_x + _input->info()->valid_region().shape[0];
-    const int    min_y  = _input->info()->valid_region().anchor[1];
-    const int    max_y  = min_y + _input->info()->valid_region().shape[1];
-    const size_t stride = _input->info()->strides_in_bytes()[1];
-
-    // Current row
-    int y_cur  = window.y().start();
-    int z_cur  = window.z().start();
-    int d3_cur = window[3].start();
-    int d4_cur = window[4].start();
-    int d5_cur = window[5].start();
-
-    // x0 = M00 * x + M01 * y + M02
-    // y0 = M10 * x + M11 * y + M12
-    // z0 = M20 * x + M21 * y + M22
-    // xn = x0 / z0
-    // yn = y0 / z0
-    const float M00 = _matrix[0];
-    const float M10 = _matrix[1];
-    const float M20 = _matrix[2];
-    const float M01 = _matrix[0 + 1 * 3];
-    const float M11 = _matrix[1 + 1 * 3];
-    const float M21 = _matrix[2 + 1 * 3];
-    const float M02 = _matrix[0 + 2 * 3];
-    const float M12 = _matrix[1 + 2 * 3];
-    const float M22 = _matrix[2 + 2 * 3];
-
-    // "M00 * x", "M10 * x" and "M20 * x", when x = window.x.start
-    const float start_x0 = M00 * window.x().start();
-    const float start_y0 = M10 * window.x().start();
-    const float start_z0 = M20 * window.x().start();
-
-    // const_x0, const_y0 and const_z0 are the constant parts of x0, y0 and z0 during the row processing
-    float const_x0 = M01 * y_cur + M02;
-    float const_y0 = M11 * y_cur + M12;
-    float const_z0 = M21 * y_cur + M22;
-
-    // Perspective warp coordinates
-    float x0 = start_x0 + const_x0;
-    float y0 = start_y0 + const_y0;
-    float z0 = start_z0 + const_z0;
-
-    execute_window_loop(window, [&](const Coordinates & id)
-    {
-        // Check if we are processing a new row. If so, update the current processed row (y_cur), x0, y0 and z0
-        if((y_cur != id.y()) || (z_cur != id.z()) || (d3_cur != id[3]) || (d4_cur != id[4]) || (d5_cur != id[5]))
-        {
-            y_cur  = id.y();
-            z_cur  = id.z();
-            d3_cur = id[3];
-            d4_cur = id[4];
-            d5_cur = id[5];
-
-            const_x0 = M01 * y_cur + M02;
-            const_y0 = M11 * y_cur + M12;
-            const_z0 = M21 * y_cur + M22;
-
-            x0 = start_x0 + const_x0;
-            y0 = start_y0 + const_y0;
-            z0 = start_z0 + const_z0;
-        }
-
-        const float xn = x0 / z0;
-        const float yn = y0 / z0;
-
-        // Only load from (x0, y0) if the point is within the valid region.
-        if((min_y <= yn) && (yn < max_y) && (min_x <= xn) && (xn < max_x))
-        {
-            switch(interpolation)
-            {
-                case InterpolationPolicy::NEAREST_NEIGHBOR:
-                    *out.ptr() = nearest_interpolation(in.ptr(), xn, yn, stride);
-                    break;
-                case InterpolationPolicy::BILINEAR:
-                    *out.ptr() = scale_helpers::pixel_bilinear_c1(in.ptr(), stride, xn, yn);
-                    break;
-                default:
-                    ARM_COMPUTE_ERROR("Interpolation not supported");
-            }
-        }
-        else
-        {
-            // Clamp coordinates
-            const auto xi = utility::clamp<int>(std::floor(xn), min_x, max_x - 1);
-            const auto yi = utility::clamp<int>(std::floor(yn), min_y, max_y - 1);
-            switch(interpolation)
-            {
-                case InterpolationPolicy::NEAREST_NEIGHBOR:
-                    *out.ptr() = *(in.ptr() + xi + yi * stride);
-                    break;
-                case InterpolationPolicy::BILINEAR:
-                {
-                    const auto xi_1 = utility::clamp<int>(std::floor(xn + 1), min_x, max_x - 1);
-                    const auto yi_1 = utility::clamp<int>(std::floor(yn + 1), min_y, max_y - 1);
-
-                    const float dx  = xn - std::floor(xn);
-                    const float dy  = yn - std::floor(yn);
-                    const float dx1 = 1.0f - dx;
-                    const float dy1 = 1.0f - dy;
-
-                    const float a00 = *(in.ptr() + xi + yi * stride);
-                    const float a01 = *(in.ptr() + xi_1 + yi * stride);
-                    const float a10 = *(in.ptr() + xi + yi_1 * stride);
-                    const float a11 = *(in.ptr() + xi_1 + yi_1 * stride);
-
-                    *out.ptr() = a00 * (dx1 * dy1) + a01 * (dx * dy1) + a10 * (dx1 * dy) + a11 * (dx * dy);
-                }
-                break;
-                default:
-                    ARM_COMPUTE_ERROR("Interpolation not supported");
-            }
-        }
-
-        x0 += M00;
-        y0 += M10;
-        z0 += M20;
-    },
-    in, out);
-}
-
-template class arm_compute::NEWarpAffineKernel<InterpolationPolicy::NEAREST_NEIGHBOR>;
-template class arm_compute::NEWarpAffineKernel<InterpolationPolicy::BILINEAR>;
-template class arm_compute::NEWarpPerspectiveKernel<InterpolationPolicy::NEAREST_NEIGHBOR>;
-template class arm_compute::NEWarpPerspectiveKernel<InterpolationPolicy::BILINEAR>;
diff --git a/src/core/NEON/kernels/NEWarpKernel.h b/src/core/NEON/kernels/NEWarpKernel.h
deleted file mode 100644
index 2c4cb55e3c..0000000000
--- a/src/core/NEON/kernels/NEWarpKernel.h
+++ /dev/null
@@ -1,131 +0,0 @@
-/*
- * Copyright (c) 2016-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_NEWARPKERNEL_H
-#define ARM_COMPUTE_NEWARPKERNEL_H
-
-#include "arm_compute/core/Types.h"
-#include "src/core/NEON/INEKernel.h"
-
-#include <array>
-#include <cstdint>
-namespace arm_compute
-{
-class ITensor;
-
-/** Common interface for warp affine and warp perspective */
-class INEWarpKernel : public INEKernel
-{
-public:
-    /** Default constructor */
-    INEWarpKernel();
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    INEWarpKernel(const INEWarpKernel &) = delete;
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    INEWarpKernel &operator=(const INEWarpKernel &) = delete;
-    /** Allow instances of this class to be moved */
-    INEWarpKernel(INEWarpKernel &&) = default;
-    /** Allow instances of this class to be moved */
-    INEWarpKernel &operator=(INEWarpKernel &&) = default;
-    /** Default destructor */
-    ~INEWarpKernel() = default;
-    /** Initialise the kernel's input, output and border mode.
-     *
-     * @param[in]  input                 Source tensor. Data type supported: U8.
-     * @param[out] output                Destination tensor. Data type supported: U8.
-     * @param[in]  matrix                The perspective or affine matrix to use. Must be 2x3 for affine and 3x3 for perspective of type float.
-     *                                   The matrix argument requires 9 values, for the affine case the last 3 values are ignored.
-     * @param[in]  border_mode           Strategy to use for borders
-     * @param[in]  constant_border_value Constant value used for filling the border.
-     */
-    virtual void configure(const ITensor *input, ITensor *output, const std::array<float, 9> &matrix, BorderMode border_mode, uint8_t constant_border_value);
-
-    // Inherited methods overridden:
-    void run(const Window &window, const ThreadInfo &info) override;
-
-    // Inherited methods overridden:
-    BorderSize border_size() const override;
-
-protected:
-    /** function to perform warp affine or warp perspective on the given window when border mode == UNDEFINED
-     *
-     * @param[in] window Region on which to execute the kernel
-     */
-    virtual void warp_undefined(const Window &window) = 0;
-    /** function to perform warp affine or warp perspective on the given window when border mode == CONSTANT
-     *
-     * @param[in] window Region on which to execute the kernel
-     */
-    virtual void warp_constant(const Window &window) = 0;
-    /** function to perform warp affine or warp perspective on the given window when border mode == REPLICATE
-     *
-     * @param[in] window Region on which to execute the kernel
-     */
-    virtual void warp_replicate(const Window &window) = 0;
-    /** Common signature for all the specialised warp functions
-     *
-     * @param[in] window Region on which to execute the kernel.
-     */
-    void (INEWarpKernel::*_func)(const Window &window);
-
-    const ITensor *_input;                 /**< Input Tensor */
-    ITensor       *_output;                /**< Output Tensor */
-    uint8_t        _constant_border_value; /**< Constant value used for filling the border. This value is used for those pixels out of the ROI when the border mode is CONSTANT */
-    std::array<float, 9> _matrix;          /**< The affine or perspective matrix. Must be 2x3 for warp affine or 3x3 for warp perspective of type float. */
-};
-
-/** Template interface for the kernel to compute warp affine
- *
- */
-template <InterpolationPolicy interpolation>
-class NEWarpAffineKernel : public INEWarpKernel
-{
-private:
-    const char *name() const override
-    {
-        return "NEWarpAffineKernel";
-    }
-    // Inherited methods overridden:
-    void warp_undefined(const Window &window) override;
-    void warp_constant(const Window &window) override;
-    void warp_replicate(const Window &window) override;
-};
-
-/** Template interface for the kernel to compute warp perspective
- *
- */
-template <InterpolationPolicy interpolation>
-class NEWarpPerspectiveKernel : public INEWarpKernel
-{
-private:
-    const char *name() const override
-    {
-        return "NEWarpPerspectiveKernel";
-    }
-    // Inherited methods overridden:
-    void warp_undefined(const Window &window) override;
-    void warp_constant(const Window &window) override;
-    void warp_replicate(const Window &window) override;
-};
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_NEWARPKERNEL_H */
-- 
cgit v1.2.1