11 files changed, 467 insertions, 670 deletions
diff --git a/src/core/CPP/CPPTypes.cpp b/src/core/CPP/CPPTypes.cpp
index e4c3b7793f..ef0518ed3d 100644
--- a/src/core/CPP/CPPTypes.cpp
+++ b/src/core/CPP/CPPTypes.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018 ARM Limited.
+ * Copyright (c) 2018-2022, 2024 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -26,93 +26,139 @@
 
 #include "arm_compute/core/Error.h"
 
-#ifndef BARE_METAL
-#include <sched.h>
-#endif /* defined(BARE_METAL) */
+#include "src/common/cpuinfo/CpuInfo.h"
+#include "src/common/cpuinfo/CpuIsaInfo.h"
+#include "src/core/NEON/kernels/arm_gemm/utils.hpp"
 
-using namespace arm_compute;
-
-void CPUInfo::set_fp16(const bool fp16)
+namespace arm_compute
 {
-    _fp16 = fp16;
-}
+struct CPUInfo::Impl // Private state of CPUInfo, owned through _impl
+{
+    cpuinfo::CpuInfo info{}; // Replaced by cpuinfo::CpuInfo::build() in the CPUInfo constructor
+    unsigned int     L1_cache_size = 32768; // 32 * 1024, returned by get_L1_cache_size(); unit presumably bytes - TODO confirm
+    unsigned int     L2_cache_size = 262144; // 256 * 1024, returned by get_L2_cache_size(); unit presumably bytes - TODO confirm
+};
 
-void CPUInfo::set_dotprod(const bool dotprod)
+CPUInfo &CPUInfo::get()
 {
-    _dotprod = dotprod;
+    static CPUInfo _cpuinfo;
+    return _cpuinfo;
 }
 
-void CPUInfo::set_cpu_model(unsigned int cpuid, CPUModel model)
+CPUInfo::CPUInfo() : _impl(std::make_unique<Impl>())
 {
-    ARM_COMPUTE_ERROR_ON(cpuid >= _percpu.size());
-    if(_percpu.size() > cpuid)
-    {
-        _percpu[cpuid] = model;
-    }
+    _impl->info = cpuinfo::CpuInfo::build();
 }
 
+CPUInfo::~CPUInfo() = default;
+
 unsigned int CPUInfo::get_cpu_num() const
 {
-    return _percpu.size();
+    return _impl->info.num_cpus();
 }
+
 bool CPUInfo::has_fp16() const
 {
-    return _fp16;
+    return _impl->info.has_fp16();
+}
+
+bool CPUInfo::has_bf16() const
+{
+    return _impl->info.has_bf16();
+}
+
+bool CPUInfo::has_svebf16() const
+{
+    return _impl->info.has_svebf16();
 }
 
 bool CPUInfo::has_dotprod() const
 {
-    return _dotprod;
+    return _impl->info.has_dotprod();
 }
 
-CPUModel CPUInfo::get_cpu_model(unsigned int cpuid) const
+bool CPUInfo::has_svef32mm() const
 {
-    if(cpuid < _percpu.size())
-    {
-        return _percpu[cpuid];
-    }
-    return CPUModel::GENERIC;
+    return _impl->info.has_svef32mm();
 }
 
-unsigned int CPUInfo::get_L1_cache_size() const
+bool CPUInfo::has_i8mm() const
 {
-    return _L1_cache_size;
+    return _impl->info.has_i8mm();
 }
 
-void CPUInfo::set_L1_cache_size(unsigned int size)
+bool CPUInfo::has_svei8mm() const
 {
-    _L1_cache_size = size;
+    return _impl->info.has_svei8mm();
 }
 
-unsigned int CPUInfo::get_L2_cache_size() const
+bool CPUInfo::has_sve() const
 {
-    return _L2_cache_size;
+    return _impl->info.has_sve();
 }
 
-void CPUInfo::set_L2_cache_size(unsigned int size)
+bool CPUInfo::has_sve2() const
 {
-    _L2_cache_size = size;
+    return _impl->info.has_sve2();
 }
 
-void CPUInfo::set_cpu_num(unsigned int cpu_count)
+bool CPUInfo::has_sme() const
 {
-    _percpu.resize(cpu_count);
+    return _impl->info.has_sme();
 }
 
-CPUInfo::CPUInfo()
-    : _percpu(1)
+bool CPUInfo::has_sme2() const
 {
-    // The core library knows nothing about the CPUs so we set only 1 CPU to be generic.
-    // The runtime NESCheduler will initialise this vector with the correct CPU models.
-    // See void detect_cpus_configuration(CPUInfo &cpuinfo) in CPPUtils.h
-    _percpu[0] = CPUModel::GENERIC;
+    return _impl->info.has_sme2();
 }
 
 CPUModel CPUInfo::get_cpu_model() const
 {
-#if defined(BARE_METAL) || (!defined(__arm__) && !defined(__aarch64__))
-    return get_cpu_model(0);
-#else  /* defined(BARE_METAL) || (!defined(__arm__) && !defined(__aarch64__)) */
-    return get_cpu_model(sched_getcpu());
-#endif /* defined(BARE_METAL) || (!defined(__arm__) && !defined(__aarch64__)) */
+    return _impl->info.cpu_model();
+}
+
+CPUModel CPUInfo::get_cpu_model(unsigned int cpuid) const
+{
+    return _impl->info.cpu_model(cpuid); // Out-of-range cpuid handling is left to cpuinfo::CpuInfo - not visible here
+}
+
+cpuinfo::CpuIsaInfo CPUInfo::get_isa() const
+{
+    return _impl->info.isa(); // Returned by value, so the caller receives its own CpuIsaInfo copy
+}
+
+unsigned int CPUInfo::get_L1_cache_size() const
+{
+    return _impl->L1_cache_size; // Impl's in-class default; set_L1_cache_size() is removed by this patch
+}
+
+unsigned int CPUInfo::get_L2_cache_size() const
+{
+    return _impl->L2_cache_size; // Impl's in-class default; set_L2_cache_size() is removed by this patch
+}
+
+unsigned long CPUInfo::get_sme2_vector_length() const
+{
+#ifdef ARM_COMPUTE_ENABLE_SME2
+    return arm_gemm::utils::sme::get_vector_length<int8_t>(); // presumably counted in int8_t elements - TODO confirm
+#else  // ARM_COMPUTE_ENABLE_SME2
+    return 0; // Built without ARM_COMPUTE_ENABLE_SME2: always 0, whatever the CPU supports
+#endif // ARM_COMPUTE_ENABLE_SME2
+}
+bool CPUInfo::cpu_has_little_mid_big() const
+{
+#if defined(__ANDROID__)
+    return _impl->info.has_little_mid_big(); // Only Android builds query the detected CPU info
+#else  /* defined(__ANDROID__) */
+    return false; // Every non-Android build reports false without inspecting the CPU
+#endif /* defined(__ANDROID__) */
+}
+unsigned int CPUInfo::get_cpu_num_excluding_little() const
+{
+#if defined(__ANDROID__)
+    return _impl->info.not_little_num_cpus(); // Android: count supplied by cpuinfo::CpuInfo
+#else  /* defined(__ANDROID__) */
+    return get_cpu_num(); // Elsewhere no core is excluded: same value as get_cpu_num()
+#endif /* defined(__ANDROID__) */
 }
+} // namespace arm_compute
diff --git a/src/core/CPP/ICPPSimpleKernel.cpp b/src/core/CPP/ICPPSimpleKernel.cpp
deleted file mode 100644
index 01fb016ffe..0000000000
--- a/src/core/CPP/ICPPSimpleKernel.cpp
+++ /dev/null
@@ -1,74 +0,0 @@
-/*
- * Copyright (c) 2016-2018 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/core/CPP/ICPPSimpleKernel.h"
-
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/IAccessWindow.h"
-#include "arm_compute/core/ITensor.h"
-
-namespace arm_compute
-{
-namespace
-{
-std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *output, unsigned int num_elems_processed_per_iteration,
-                                                        bool border_undefined, const arm_compute::BorderSize &border_size)
-{
-    // Configure kernel window
-    Window                 win = calculate_max_window(*input, Steps(num_elems_processed_per_iteration), border_undefined, border_size);
-    AccessWindowHorizontal input_access(input, 0, num_elems_processed_per_iteration);
-    AccessWindowHorizontal output_access(output, 0, num_elems_processed_per_iteration);
-
-    bool window_changed = update_window_and_padding(win, input_access, output_access);
-
-    output_access.set_valid_region(win, input->valid_region(), border_undefined, border_size);
-
-    Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
-    return std::make_pair(err, win);
-}
-} // namespace
-
-ICPPSimpleKernel::ICPPSimpleKernel()
-    : _input{ nullptr }, _output{ nullptr }
-{
-}
-
-void ICPPSimpleKernel::configure(const ITensor *input, ITensor *output, unsigned int num_elems_processed_per_iteration, bool border_undefined, const BorderSize &border_size)
-{
-    _input  = input;
-    _output = output;
-
-    // Configure kernel window
-    auto win_config = validate_and_configure_window(input->info(), output->info(), num_elems_processed_per_iteration, border_undefined, border_size);
-    ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
-    ICPPKernel::configure(win_config.second);
-}
-
-Status ICPPSimpleKernel::validate(const ITensorInfo *input, const ITensorInfo *output, unsigned int num_elems_processed_per_iteration,
-                                  bool border_undefined, const arm_compute::BorderSize &border_size)
-{
-    ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(), output->clone().get(), num_elems_processed_per_iteration, border_undefined, border_size).first);
-    return Status{};
-}
-
-} // namespace arm_compute
-\ No newline at end of file
diff --git a/src/core/CPP/Validate.h b/src/core/CPP/Validate.h
new file mode 100644
index 0000000000..fe253508cf
--- /dev/null
+++ b/src/core/CPP/Validate.h
@@ -0,0 +1,126 @@
+/*
+ * Copyright (c) 2018-2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CPP_VALIDATE_H
+#define ARM_COMPUTE_CPP_VALIDATE_H
+
+#include "arm_compute/core/CPP/CPPTypes.h"
+#include "arm_compute/core/Validate.h"
+
+namespace arm_compute
+{
+/** Return an error if the tensor info is F16 and the CPU lacks FP16 or the FP16 kernels are not compiled in.
+ *
+ * @param[in] function    Function in which the error occurred.
+ * @param[in] file        Name of the file where the error occurred.
+ * @param[in] line        Line on which the error occurred.
+ * @param[in] tensor_info Tensor info to validate; a nullptr is checked first and reported at the same location.
+ *
+ * @return Status (empty Status{} when no check fails)
+ */
+inline Status
+error_on_unsupported_cpu_fp16(const char *function, const char *file, const int line, const ITensorInfo *tensor_info)
+{
+    bool fp16_kernels_enabled = false; // Flipped below only when both FP16 build macros are defined
+#if defined(ARM_COMPUTE_ENABLE_FP16) && defined(ENABLE_FP16_KERNELS)
+    fp16_kernels_enabled = true;
+#endif /* defined(ARM_COMPUTE_ENABLE_FP16) && defined(ENABLE_FP16_KERNELS) */
+
+    ARM_COMPUTE_RETURN_ERROR_ON_LOC(tensor_info == nullptr, function, file, line);
+    ARM_COMPUTE_RETURN_ERROR_ON_LOC_MSG(
+        (tensor_info->data_type() == DataType::F16) && (!CPUInfo::get().has_fp16() || !fp16_kernels_enabled), function,
+        file, line, "This CPU architecture does not support F16 data type, you need v8.2 or above");
+    return Status{}; // NOTE(review): the message above is also used when only the build flags are missing
+}
+
+/** Return an error if the tensor info is BFLOAT16 and the CPU lacks BF16 or BF16 support is not compiled in.
+ *
+ * @param[in] function    Function in which the error occurred.
+ * @param[in] file        Name of the file where the error occurred.
+ * @param[in] line        Line on which the error occurred.
+ * @param[in] tensor_info Tensor info to validate; a nullptr is checked first and reported at the same location.
+ *
+ * @return Status (empty Status{} when no check fails)
+ */
+inline Status
+error_on_unsupported_cpu_bf16(const char *function, const char *file, const int line, const ITensorInfo *tensor_info)
+{
+    bool bf16_kernels_enabled = false; // Flipped below only when ARM_COMPUTE_ENABLE_BF16 is defined
+#if defined(ARM_COMPUTE_ENABLE_BF16)
+    bf16_kernels_enabled = true;
+#endif /* defined(ARM_COMPUTE_ENABLE_BF16) */
+
+    ARM_COMPUTE_RETURN_ERROR_ON_LOC(tensor_info == nullptr, function, file, line);
+    ARM_COMPUTE_RETURN_ERROR_ON_LOC_MSG(
+        (tensor_info->data_type() == DataType::BFLOAT16) && (!CPUInfo::get().has_bf16() || !bf16_kernels_enabled),
+        function, file, line, "This CPU architecture does not support BFloat16 data type, you need v8.6 or above");
+    return Status{}; // NOTE(review): the message above is also used when only the build flag is missing
+}
+
+/** Return an error if the tensor is F16 and the CPU lacks FP16 or the FP16 kernels are not compiled in.
+ *
+ * @param[in] function Function in which the error occurred.
+ * @param[in] file     Name of the file where the error occurred.
+ * @param[in] line     Line on which the error occurred.
+ * @param[in] tensor   Tensor to validate; after a nullptr check its info() goes to the ITensorInfo overload.
+ *
+ * @return Status
+ */
+inline Status
+error_on_unsupported_cpu_fp16(const char *function, const char *file, const int line, const ITensor *tensor)
+{
+    ARM_COMPUTE_RETURN_ERROR_ON_LOC(tensor == nullptr, function, file, line);
+    ARM_COMPUTE_RETURN_ON_ERROR(::arm_compute::error_on_unsupported_cpu_fp16(function, file, line, tensor->info()));
+    return Status{};
+}
+
+/** Return an error if the tensor is BFLOAT16 and the CPU lacks BF16 or BF16 support is not compiled in.
+ *
+ * @param[in] function Function in which the error occurred.
+ * @param[in] file     Name of the file where the error occurred.
+ * @param[in] line     Line on which the error occurred.
+ * @param[in] tensor   Tensor to validate; after a nullptr check its info() goes to the ITensorInfo overload.
+ *
+ * @return Status
+ */
+inline Status
+error_on_unsupported_cpu_bf16(const char *function, const char *file, const int line, const ITensor *tensor)
+{
+    ARM_COMPUTE_RETURN_ERROR_ON_LOC(tensor == nullptr, function, file, line);
+    ARM_COMPUTE_RETURN_ON_ERROR(::arm_compute::error_on_unsupported_cpu_bf16(function, file, line, tensor->info()));
+    return Status{};
+}
+
+#define ARM_COMPUTE_ERROR_ON_CPU_F16_UNSUPPORTED(tensor) \
+    ARM_COMPUTE_ERROR_THROW_ON(::arm_compute::error_on_unsupported_cpu_fp16(__func__, __FILE__, __LINE__, tensor))
+
+#define ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(tensor) \
+    ARM_COMPUTE_RETURN_ON_ERROR(::arm_compute::error_on_unsupported_cpu_fp16(__func__, __FILE__, __LINE__, tensor))
+
+#define ARM_COMPUTE_ERROR_ON_CPU_BF16_UNSUPPORTED(tensor) \
+    ARM_COMPUTE_ERROR_THROW_ON(::arm_compute::error_on_unsupported_cpu_bf16(__func__, __FILE__, __LINE__, tensor))
+
+#define ARM_COMPUTE_RETURN_ERROR_ON_CPU_BF16_UNSUPPORTED(tensor) \
+    ARM_COMPUTE_RETURN_ON_ERROR(::arm_compute::error_on_unsupported_cpu_bf16(__func__, __FILE__, __LINE__, tensor))
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_CPP_VALIDATE_H */
diff --git a/src/core/CPP/kernels/CPPBoxWithNonMaximaSuppressionLimitKernel.cpp b/src/core/CPP/kernels/CPPBoxWithNonMaximaSuppressionLimitKernel.cpp
index 3058a0c977..02686eb4f6 100644
--- a/src/core/CPP/kernels/CPPBoxWithNonMaximaSuppressionLimitKernel.cpp
+++ b/src/core/CPP/kernels/CPPBoxWithNonMaximaSuppressionLimitKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2019 ARM Limited.
+ * Copyright (c) 2018-2020, 2022 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -23,9 +23,10 @@
  */
 #include "arm_compute/core/CPP/kernels/CPPBoxWithNonMaximaSuppressionLimitKernel.h"
 
-#include "arm_compute/core/Error.h"
 #include "arm_compute/core/Helpers.h"
 
+#include "src/core/helpers/WindowHelpers.h"
+
 #include <algorithm>
 #include <cmath>
 
@@ -34,7 +35,11 @@ namespace arm_compute
 namespace
 {
 template <typename T>
-std::vector<int> SoftNMS(const ITensor *proposals, std::vector<std::vector<T>> &scores_in, std::vector<int> inds, const BoxNMSLimitInfo &info, int class_id)
+std::vector<int> SoftNMS(const ITensor               *proposals,
+                         std::vector<std::vector<T>> &scores_in,
+                         std::vector<int>             inds,
+                         const BoxNMSLimitInfo       &info,
+                         int                          class_id)
 {
     std::vector<int> keep;
     const int        proposals_width = proposals->info()->dimension(1);
@@ -45,7 +50,7 @@ std::vector<int> SoftNMS(const ITensor *proposals, std::vector<std::vector<T>> &
     std::vector<T> y2(proposals_width);
     std::vector<T> areas(proposals_width);
 
-    for(int i = 0; i < proposals_width; ++i)
+    for (int i = 0; i < proposals_width; ++i)
     {
         x1[i]    = *reinterpret_cast<T *>(proposals->ptr_to_element(Coordinates(class_id * 4, i)));
         y1[i]    = *reinterpret_cast<T *>(proposals->ptr_to_element(Coordinates(class_id * 4 + 1, i)));
@@ -56,13 +61,13 @@ std::vector<int> SoftNMS(const ITensor *proposals, std::vector<std::vector<T>> &
 
     // Note: Soft NMS scores have already been initialized with input scores
 
-    while(!inds.empty())
+    while (!inds.empty())
     {
         // Find proposal with max score among remaining proposals
         int max_pos = 0;
-        for(unsigned int i = 1; i < inds.size(); ++i)
+        for (unsigned int i = 1; i < inds.size(); ++i)
         {
-            if(scores_in[class_id][inds.at(i)] > scores_in[class_id][inds.at(max_pos)])
+            if (scores_in[class_id][inds.at(i)] > scores_in[class_id][inds.at(max_pos)])
             {
                 max_pos = i;
             }
@@ -75,7 +80,7 @@ std::vector<int> SoftNMS(const ITensor *proposals, std::vector<std::vector<T>> &
         inds.erase(inds.begin());
 
         std::vector<int> sorted_indices_temp;
-        for(auto idx : inds)
+        for (auto idx : inds)
         {
             const auto xx1 = std::max(x1[idx], x1[element]);
             const auto yy1 = std::max(y1[idx], y1[element]);
@@ -89,7 +94,7 @@ std::vector<int> SoftNMS(const ITensor *proposals, std::vector<std::vector<T>> &
 
             // Update scores based on computed IoU, overlap threshold and NMS method
             T weight;
-            switch(info.soft_nms_method())
+            switch (info.soft_nms_method())
             {
                 case NMSType::LINEAR:
                     weight = (ovr > info.nms()) ? (1.f - ovr) : 1.f;
@@ -106,7 +111,7 @@ std::vector<int> SoftNMS(const ITensor *proposals, std::vector<std::vector<T>> &
 
             // Discard boxes with new scores below min threshold and update pending indices
             scores_in[class_id][idx] *= weight;
-            if(scores_in[class_id][idx] >= info.soft_nms_min_score_thres())
+            if (scores_in[class_id][idx] >= info.soft_nms_min_score_thres())
             {
                 sorted_indices_temp.push_back(idx);
             }
@@ -118,7 +123,10 @@ std::vector<int> SoftNMS(const ITensor *proposals, std::vector<std::vector<T>> &
 }
 
 template <typename T>
-std::vector<int> NonMaximaSuppression(const ITensor *proposals, std::vector<int> sorted_indices, const BoxNMSLimitInfo &info, int class_id)
+std::vector<int> NonMaximaSuppression(const ITensor         *proposals,
+                                      std::vector<int>       sorted_indices,
+                                      const BoxNMSLimitInfo &info,
+                                      int                    class_id)
 {
     std::vector<int> keep;
 
@@ -130,7 +138,7 @@ std::vector<int> NonMaximaSuppression(const ITensor *proposals, std::vector<int>
     std::vector<T> y2(proposals_width);
     std::vector<T> areas(proposals_width);
 
-    for(int i = 0; i < proposals_width; ++i)
+    for (int i = 0; i < proposals_width; ++i)
     {
         x1[i]    = *reinterpret_cast<T *>(proposals->ptr_to_element(Coordinates(class_id * 4, i)));
         y1[i]    = *reinterpret_cast<T *>(proposals->ptr_to_element(Coordinates(class_id * 4 + 1, i)));
@@ -139,7 +147,7 @@ std::vector<int> NonMaximaSuppression(const ITensor *proposals, std::vector<int>
         areas[i] = (x2[i] - x1[i] + 1.0) * (y2[i] - y1[i] + 1.0);
     }
 
-    while(!sorted_indices.empty())
+    while (!sorted_indices.empty())
     {
         int i = sorted_indices.at(0);
         keep.push_back(i);
@@ -148,7 +156,7 @@ std::vector<int> NonMaximaSuppression(const ITensor *proposals, std::vector<int>
         std::vector<int> new_indices;
         sorted_indices_temp.erase(sorted_indices_temp.begin());
 
-        for(unsigned int j = 0; j < sorted_indices_temp.size(); ++j)
+        for (unsigned int j = 0; j < sorted_indices_temp.size(); ++j)
         {
             const float xx1 = std::max(x1[sorted_indices_temp.at(j)], x1[i]);
             const float yy1 = std::max(y1[sorted_indices_temp.at(j)], y1[i]);
@@ -163,8 +171,9 @@ std::vector<int> NonMaximaSuppression(const ITensor *proposals, std::vector<int>
             const float ctr_y = yy1 + (h / 2);
 
             // If suppress_size is specified, filter the boxes based on their size and position
-            const bool keep_size = !info.suppress_size() || (w >= info.min_size() && h >= info.min_size() && ctr_x < info.im_width() && ctr_y < info.im_height());
-            if(ovr <= info.nms() && keep_size)
+            const bool keep_size = !info.suppress_size() || (w >= info.min_size() && h >= info.min_size() &&
+                                                             ctr_x < info.im_width() && ctr_y < info.im_height());
+            if (ovr <= info.nms() && keep_size)
             {
                 new_indices.push_back(j);
             }
@@ -172,7 +181,7 @@ std::vector<int> NonMaximaSuppression(const ITensor *proposals, std::vector<int>
 
         const unsigned int new_indices_size = new_indices.size();
         std::vector<int>   new_sorted_indices(new_indices_size);
-        for(unsigned int i = 0; i < new_indices_size; ++i)
+        for (unsigned int i = 0; i < new_indices_size; ++i)
         {
             new_sorted_indices[i] = sorted_indices[new_indices[i] + 1];
         }
@@ -184,7 +193,15 @@ std::vector<int> NonMaximaSuppression(const ITensor *proposals, std::vector<int>
 } // namespace
 
 CPPBoxWithNonMaximaSuppressionLimitKernel::CPPBoxWithNonMaximaSuppressionLimitKernel()
-    : _scores_in(nullptr), _boxes_in(nullptr), _batch_splits_in(nullptr), _scores_out(nullptr), _boxes_out(nullptr), _classes(nullptr), _batch_splits_out(nullptr), _keeps(nullptr), _keeps_size(nullptr),
+    : _scores_in(nullptr),
+      _boxes_in(nullptr),
+      _batch_splits_in(nullptr),
+      _scores_out(nullptr),
+      _boxes_out(nullptr),
+      _classes(nullptr),
+      _batch_splits_out(nullptr),
+      _keeps(nullptr),
+      _keeps_size(nullptr),
       _info()
 {
 }
@@ -197,7 +214,7 @@ bool CPPBoxWithNonMaximaSuppressionLimitKernel::is_parallelisable() const
 template <typename T>
 void CPPBoxWithNonMaximaSuppressionLimitKernel::run_nmslimit()
 {
-    const int                     batch_size   = _batch_splits_in == nullptr ? 1 : _batch_splits_in->info()->dimension(0);
+    const int                     batch_size = _batch_splits_in == nullptr ? 1 : _batch_splits_in->info()->dimension(0);
     const int                     num_classes  = _scores_in->info()->dimension(0);
     const int                     scores_count = _scores_in->info()->dimension(1);
     std::vector<int>              total_keep_per_batch(batch_size);
@@ -205,53 +222,48 @@ void CPPBoxWithNonMaximaSuppressionLimitKernel::run_nmslimit()
     int                           total_keep_count = 0;
 
     std::vector<std::vector<T>> in_scores(num_classes, std::vector<T>(scores_count));
-    for(int i = 0; i < scores_count; ++i)
+    for (int i = 0; i < scores_count; ++i)
     {
-        for(int j = 0; j < num_classes; ++j)
+        for (int j = 0; j < num_classes; ++j)
         {
             in_scores[j][i] = *reinterpret_cast<const T *>(_scores_in->ptr_to_element(Coordinates(j, i)));
         }
     }
 
-    int offset        = 0;
     int cur_start_idx = 0;
-    for(int b = 0; b < batch_size; ++b)
+    for (int b = 0; b < batch_size; ++b)
     {
-        const int num_boxes = _batch_splits_in == nullptr ? 1 : static_cast<int>(*reinterpret_cast<T *>(_batch_splits_in->ptr_to_element(Coordinates(b))));
         // Skip first class if there is more than 1 except if the number of classes is 1.
         const int j_start = (num_classes == 1 ? 0 : 1);
-        for(int j = j_start; j < num_classes; ++j)
+        for (int j = j_start; j < num_classes; ++j)
         {
             std::vector<T>   cur_scores(scores_count);
             std::vector<int> inds;
-            for(int i = 0; i < scores_count; ++i)
+            for (int i = 0; i < scores_count; ++i)
             {
                 const T score = in_scores[j][i];
                 cur_scores[i] = score;
 
-                if(score > _info.score_thresh())
+                if (score > _info.score_thresh())
                 {
                     inds.push_back(i);
                 }
             }
-            if(_info.soft_nms_enabled())
+            if (_info.soft_nms_enabled())
             {
                 keeps[j] = SoftNMS(_boxes_in, in_scores, inds, _info, j);
             }
             else
             {
                 std::sort(inds.data(), inds.data() + inds.size(),
-                          [&cur_scores](int lhs, int rhs)
-                {
-                    return cur_scores[lhs] > cur_scores[rhs];
-                });
+                          [&cur_scores](int lhs, int rhs) { return cur_scores[lhs] > cur_scores[rhs]; });
 
                 keeps[j] = NonMaximaSuppression<T>(_boxes_in, inds, _info, j);
             }
             total_keep_count += keeps[j].size();
         }
 
-        if(_info.detections_per_im() > 0 && total_keep_count > _info.detections_per_im())
+        if (_info.detections_per_im() > 0 && total_keep_count > _info.detections_per_im())
         {
             // merge all scores (represented by indices) together and sort
             auto get_all_scores_sorted = [&in_scores, &keeps, total_keep_count]()
@@ -259,10 +271,10 @@ void CPPBoxWithNonMaximaSuppressionLimitKernel::run_nmslimit()
                 std::vector<T> ret(total_keep_count);
 
                 int ret_idx = 0;
-                for(unsigned int i = 1; i < keeps.size(); ++i)
+                for (unsigned int i = 1; i < keeps.size(); ++i)
                 {
                     auto &cur_keep = keeps[i];
-                    for(auto &ckv : cur_keep)
+                    for (auto &ckv : cur_keep)
                     {
                         ret[ret_idx++] = in_scores[i][ckv];
                     }
@@ -275,13 +287,13 @@ void CPPBoxWithNonMaximaSuppressionLimitKernel::run_nmslimit()
 
             auto    all_scores_sorted = get_all_scores_sorted();
             const T image_thresh      = all_scores_sorted[all_scores_sorted.size() - _info.detections_per_im()];
-            for(int j = 1; j < num_classes; ++j)
+            for (int j = 1; j < num_classes; ++j)
             {
                 auto            &cur_keep = keeps[j];
                 std::vector<int> new_keeps_j;
-                for(auto &k : cur_keep)
+                for (auto &k : cur_keep)
                 {
-                    if(in_scores[j][k] >= image_thresh)
+                    if (in_scores[j][k] >= image_thresh)
                     {
                         new_keeps_j.push_back(k);
                     }
@@ -295,59 +307,78 @@ void CPPBoxWithNonMaximaSuppressionLimitKernel::run_nmslimit()
 
         // Write results
         int cur_out_idx = 0;
-        for(int j = j_start; j < num_classes; ++j)
+        for (int j = j_start; j < num_classes; ++j)
         {
-            auto     &cur_keep        = keeps[j];
-            auto      cur_out_scores  = reinterpret_cast<T *>(_scores_out->ptr_to_element(Coordinates(cur_start_idx + cur_out_idx)));
-            auto      cur_out_classes = reinterpret_cast<T *>(_classes->ptr_to_element(Coordinates(cur_start_idx + cur_out_idx)));
-            const int box_column      = (cur_start_idx + cur_out_idx) * 4;
-
-            for(unsigned int k = 0; k < cur_keep.size(); ++k)
+            auto &cur_keep = keeps[j];
+            auto  cur_out_scores =
+                reinterpret_cast<T *>(_scores_out->ptr_to_element(Coordinates(cur_start_idx + cur_out_idx)));
+            auto cur_out_classes =
+                reinterpret_cast<T *>(_classes->ptr_to_element(Coordinates(cur_start_idx + cur_out_idx)));
+            const int box_column = (cur_start_idx + cur_out_idx) * 4;
+
+            for (unsigned int k = 0; k < cur_keep.size(); ++k)
             {
-                cur_out_scores[k]     = in_scores[j][cur_keep[k]];
-                cur_out_classes[k]    = static_cast<T>(j);
-                auto cur_out_box_row0 = reinterpret_cast<T *>(_boxes_out->ptr_to_element(Coordinates(box_column + 0, k)));
-                auto cur_out_box_row1 = reinterpret_cast<T *>(_boxes_out->ptr_to_element(Coordinates(box_column + 1, k)));
-                auto cur_out_box_row2 = reinterpret_cast<T *>(_boxes_out->ptr_to_element(Coordinates(box_column + 2, k)));
-                auto cur_out_box_row3 = reinterpret_cast<T *>(_boxes_out->ptr_to_element(Coordinates(box_column + 3, k)));
-                *cur_out_box_row0     = *reinterpret_cast<const T *>(_boxes_in->ptr_to_element(Coordinates(j * 4 + 0, cur_keep[k])));
-                *cur_out_box_row1     = *reinterpret_cast<const T *>(_boxes_in->ptr_to_element(Coordinates(j * 4 + 1, cur_keep[k])));
-                *cur_out_box_row2     = *reinterpret_cast<const T *>(_boxes_in->ptr_to_element(Coordinates(j * 4 + 2, cur_keep[k])));
-                *cur_out_box_row3     = *reinterpret_cast<const T *>(_boxes_in->ptr_to_element(Coordinates(j * 4 + 3, cur_keep[k])));
+                cur_out_scores[k]  = in_scores[j][cur_keep[k]];
+                cur_out_classes[k] = static_cast<T>(j);
+                auto cur_out_box_row0 =
+                    reinterpret_cast<T *>(_boxes_out->ptr_to_element(Coordinates(box_column + 0, k)));
+                auto cur_out_box_row1 =
+                    reinterpret_cast<T *>(_boxes_out->ptr_to_element(Coordinates(box_column + 1, k)));
+                auto cur_out_box_row2 =
+                    reinterpret_cast<T *>(_boxes_out->ptr_to_element(Coordinates(box_column + 2, k)));
+                auto cur_out_box_row3 =
+                    reinterpret_cast<T *>(_boxes_out->ptr_to_element(Coordinates(box_column + 3, k)));
+                *cur_out_box_row0 =
+                    *reinterpret_cast<const T *>(_boxes_in->ptr_to_element(Coordinates(j * 4 + 0, cur_keep[k])));
+                *cur_out_box_row1 =
+                    *reinterpret_cast<const T *>(_boxes_in->ptr_to_element(Coordinates(j * 4 + 1, cur_keep[k])));
+                *cur_out_box_row2 =
+                    *reinterpret_cast<const T *>(_boxes_in->ptr_to_element(Coordinates(j * 4 + 2, cur_keep[k])));
+                *cur_out_box_row3 =
+                    *reinterpret_cast<const T *>(_boxes_in->ptr_to_element(Coordinates(j * 4 + 3, cur_keep[k])));
             }
 
             cur_out_idx += cur_keep.size();
         }
 
-        if(_keeps != nullptr)
+        if (_keeps != nullptr)
         {
             cur_out_idx = 0;
-            for(int j = 0; j < num_classes; ++j)
+            for (int j = 0; j < num_classes; ++j)
             {
-                for(unsigned int i = 0; i < keeps[j].size(); ++i)
+                for (unsigned int i = 0; i < keeps[j].size(); ++i)
                 {
-                    *reinterpret_cast<T *>(_keeps->ptr_to_element(Coordinates(cur_start_idx + cur_out_idx + i))) = static_cast<T>(keeps[j].at(i));
+                    *reinterpret_cast<T *>(_keeps->ptr_to_element(Coordinates(cur_start_idx + cur_out_idx + i))) =
+                        static_cast<T>(keeps[j].at(i));
                 }
-                *reinterpret_cast<uint32_t *>(_keeps_size->ptr_to_element(Coordinates(j + b * num_classes))) = keeps[j].size();
+                *reinterpret_cast<uint32_t *>(_keeps_size->ptr_to_element(Coordinates(j + b * num_classes))) =
+                    keeps[j].size();
                 cur_out_idx += keeps[j].size();
             }
         }
 
-        offset += num_boxes;
         cur_start_idx += total_keep_count;
     }
 
-    if(_batch_splits_out != nullptr)
+    if (_batch_splits_out != nullptr)
     {
-        for(int b = 0; b < batch_size; ++b)
+        for (int b = 0; b < batch_size; ++b)
         {
             *reinterpret_cast<float *>(_batch_splits_out->ptr_to_element(Coordinates(b))) = total_keep_per_batch[b];
         }
     }
 }
 
-void CPPBoxWithNonMaximaSuppressionLimitKernel::configure(const ITensor *scores_in, const ITensor *boxes_in, const ITensor *batch_splits_in, ITensor *scores_out, ITensor *boxes_out, ITensor *classes,
-                                                          ITensor *batch_splits_out, ITensor *keeps, ITensor *keeps_size, const BoxNMSLimitInfo info)
+void CPPBoxWithNonMaximaSuppressionLimitKernel::configure(const ITensor        *scores_in,
+                                                          const ITensor        *boxes_in,
+                                                          const ITensor        *batch_splits_in,
+                                                          ITensor              *scores_out,
+                                                          ITensor              *boxes_out,
+                                                          ITensor              *classes,
+                                                          ITensor              *batch_splits_out,
+                                                          ITensor              *keeps,
+                                                          ITensor              *keeps_size,
+                                                          const BoxNMSLimitInfo info)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(scores_in, boxes_in, scores_out, boxes_out, classes);
     ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(scores_in, 1, DataType::F16, DataType::F32);
@@ -355,25 +386,28 @@ void CPPBoxWithNonMaximaSuppressionLimitKernel::configure(const ITensor *scores_
     const unsigned int num_classes = scores_in->info()->dimension(0);
 
     ARM_COMPUTE_UNUSED(num_classes);
-    ARM_COMPUTE_ERROR_ON_MSG((4 * num_classes) != boxes_in->info()->dimension(0), "First dimension of input boxes must be of size 4*num_classes");
-    ARM_COMPUTE_ERROR_ON_MSG(scores_in->info()->dimension(1) != boxes_in->info()->dimension(1), "Input scores and input boxes must have the same number of rows");
+    ARM_COMPUTE_ERROR_ON_MSG((4 * num_classes) != boxes_in->info()->dimension(0),
+                             "First dimension of input boxes must be of size 4*num_classes");
+    ARM_COMPUTE_ERROR_ON_MSG(scores_in->info()->dimension(1) != boxes_in->info()->dimension(1),
+                             "Input scores and input boxes must have the same number of rows");
 
     ARM_COMPUTE_ERROR_ON(scores_out->info()->dimension(0) != boxes_out->info()->dimension(1));
     ARM_COMPUTE_ERROR_ON(boxes_out->info()->dimension(0) != 4);
     ARM_COMPUTE_ERROR_ON(scores_out->info()->dimension(0) != classes->info()->dimension(0));
-    if(keeps != nullptr)
+    if (keeps != nullptr)
     {
-        ARM_COMPUTE_ERROR_ON_MSG(keeps_size == nullptr, "keeps_size cannot be nullptr if keeps has to be provided as output");
+        ARM_COMPUTE_ERROR_ON_MSG(keeps_size == nullptr,
+                                 "keeps_size cannot be nullptr if keeps has to be provided as output");
         ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(scores_in, keeps);
         ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(keeps_size, 1, DataType::U32);
         ARM_COMPUTE_ERROR_ON(scores_out->info()->dimension(0) != keeps->info()->dimension(0));
         ARM_COMPUTE_ERROR_ON(num_classes != keeps_size->info()->dimension(0));
     }
-    if(batch_splits_in != nullptr)
+    if (batch_splits_in != nullptr)
     {
         ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(scores_in, batch_splits_in);
     }
-    if(batch_splits_out != nullptr)
+    if (batch_splits_out != nullptr)
     {
         ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(scores_in, batch_splits_out);
     }
@@ -402,7 +436,7 @@ void CPPBoxWithNonMaximaSuppressionLimitKernel::run(const Window &window, const
     ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
     ARM_COMPUTE_ERROR_ON_MISMATCHING_WINDOWS(IKernel::window(), window);
 
-    switch(_scores_in->info()->data_type())
+    switch (_scores_in->info()->data_type())
     {
         case DataType::F32:
             run_nmslimit<float>();
diff --git a/src/core/CPP/kernels/CPPCornerCandidatesKernel.cpp b/src/core/CPP/kernels/CPPCornerCandidatesKernel.cpp
deleted file mode 100644
index 739f389fdb..0000000000
--- a/src/core/CPP/kernels/CPPCornerCandidatesKernel.cpp
+++ /dev/null
@@ -1,110 +0,0 @@
-/*
- * Copyright (c) 2017-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/core/CPP/kernels/CPPCornerCandidatesKernel.h"
-
-#include "arm_compute/core/Coordinates.h"
-#include "arm_compute/core/Error.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/IAccessWindow.h"
-#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Types.h"
-#include "arm_compute/core/Utils.h"
-#include "arm_compute/core/Validate.h"
-#include "arm_compute/core/Window.h"
-
-#include "support/Mutex.h"
-
-using namespace arm_compute;
-
-namespace
-{
-inline void check_corner(float x, float y, float strength, InternalKeypoint *output, int32_t *num_corner_candidates, arm_compute::Mutex *corner_candidates_mutex)
-{
-    if(strength != 0.0f)
-    {
-        /* Set index and update num_corner_candidate */
-        arm_compute::unique_lock<arm_compute::Mutex> lock(*corner_candidates_mutex);
-
-        const int32_t idx = *num_corner_candidates;
-
-        *num_corner_candidates += 1;
-
-        lock.unlock();
-
-        /* Add keypoint */
-        output[idx] = std::make_tuple(x, y, strength);
-    }
-}
-
-inline void corner_candidates(const float *__restrict input, InternalKeypoint *__restrict output, int32_t x, int32_t y, int32_t *num_corner_candidates, arm_compute::Mutex *corner_candidates_mutex)
-{
-    check_corner(x, y, *input, output, num_corner_candidates, corner_candidates_mutex);
-}
-} // namespace
-
-bool keypoint_compare(const InternalKeypoint &lhs, const InternalKeypoint &rhs)
-{
-    return std::get<2>(lhs) > std::get<2>(rhs);
-}
-
-CPPCornerCandidatesKernel::CPPCornerCandidatesKernel()
-    : _num_corner_candidates(nullptr), _corner_candidates_mutex(), _input(nullptr), _output(nullptr)
-{
-}
-
-void CPPCornerCandidatesKernel::configure(const IImage *input, InternalKeypoint *output, int32_t *num_corner_candidates)
-{
-    ARM_COMPUTE_ERROR_ON_TENSOR_NOT_2D(input);
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32);
-    ARM_COMPUTE_ERROR_ON(nullptr == output);
-    ARM_COMPUTE_ERROR_ON(nullptr == num_corner_candidates);
-    ARM_COMPUTE_ERROR_ON(*num_corner_candidates != 0);
-
-    _input                 = input;
-    _output                = output;
-    _num_corner_candidates = num_corner_candidates;
-
-    const unsigned int num_elems_processed_per_iteration = 1;
-
-    // Configure kernel window
-    Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration));
-
-    update_window_and_padding(win, AccessWindowHorizontal(input->info(), 0, num_elems_processed_per_iteration));
-
-    INEKernel::configure(win);
-}
-
-void CPPCornerCandidatesKernel::run(const Window &window, const ThreadInfo &info)
-{
-    ARM_COMPUTE_UNUSED(info);
-    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
-    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
-    Iterator input(_input, window);
-
-    execute_window_loop(window, [&](const Coordinates & id)
-    {
-        corner_candidates(reinterpret_cast<float *>(input.ptr()), &_output[0], id.x(), id.y(), _num_corner_candidates, &_corner_candidates_mutex);
-    },
-    input);
-}
diff --git a/src/core/CPP/kernels/CPPDetectionWindowNonMaximaSuppressionKernel.cpp b/src/core/CPP/kernels/CPPDetectionWindowNonMaximaSuppressionKernel.cpp
deleted file mode 100644
index 5037ac55cb..0000000000
--- a/src/core/CPP/kernels/CPPDetectionWindowNonMaximaSuppressionKernel.cpp
+++ /dev/null
@@ -1,141 +0,0 @@
-/*
- * Copyright (c) 2017-2018 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/core/CPP/kernels/CPPDetectionWindowNonMaximaSuppressionKernel.h"
-
-#include "arm_compute/core/Error.h"
-#include "arm_compute/core/Helpers.h"
-
-#include <algorithm>
-#include <cmath>
-
-using namespace arm_compute;
-
-namespace
-{
-bool compare_detection_window(const DetectionWindow &lhs, const DetectionWindow &rhs)
-{
-    if(lhs.idx_class < rhs.idx_class)
-    {
-        return true;
-    }
-    if(rhs.idx_class < lhs.idx_class)
-    {
-        return false;
-    }
-
-    // idx_classes are equal so compare by score
-    if(lhs.score > rhs.score)
-    {
-        return true;
-    }
-    if(rhs.score > lhs.score)
-    {
-        return false;
-    }
-
-    return false;
-}
-} // namespace
-
-CPPDetectionWindowNonMaximaSuppressionKernel::CPPDetectionWindowNonMaximaSuppressionKernel()
-    : _input_output(nullptr), _min_distance(0.0f)
-{
-}
-
-bool CPPDetectionWindowNonMaximaSuppressionKernel::is_parallelisable() const
-{
-    return false;
-}
-
-void CPPDetectionWindowNonMaximaSuppressionKernel::configure(IDetectionWindowArray *input_output, float min_distance)
-{
-    ARM_COMPUTE_ERROR_ON(nullptr == input_output);
-
-    _input_output = input_output;
-    _min_distance = min_distance;
-
-    IKernel::configure(Window()); // Default 1 iteration window
-}
-
-void CPPDetectionWindowNonMaximaSuppressionKernel::run(const Window &window, const ThreadInfo &info)
-{
-    ARM_COMPUTE_UNUSED(info);
-    ARM_COMPUTE_UNUSED(window);
-    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
-    ARM_COMPUTE_ERROR_ON_MISMATCHING_WINDOWS(IKernel::window(), window);
-    ARM_COMPUTE_ERROR_ON(_input_output->buffer() == nullptr);
-
-    const size_t num_candidates = _input_output->num_values();
-    size_t       num_detections = 0;
-
-    // Sort list of candidates by idx_class and then score
-    std::sort(_input_output->buffer(), _input_output->buffer() + num_candidates, compare_detection_window);
-
-    const float min_distance_pow2 = _min_distance * _min_distance;
-
-    // Euclidean distance
-    for(size_t i = 0; i < num_candidates; ++i)
-    {
-        if(0.0f != _input_output->at(i).score)
-        {
-            DetectionWindow cur;
-            cur.x         = _input_output->at(i).x;
-            cur.y         = _input_output->at(i).y;
-            cur.width     = _input_output->at(i).width;
-            cur.height    = _input_output->at(i).height;
-            cur.idx_class = _input_output->at(i).idx_class;
-            cur.score     = _input_output->at(i).score;
-
-            // Store window
-            _input_output->at(num_detections) = cur;
-
-            ++num_detections;
-
-            const float xc = cur.x + cur.width * 0.5f;
-            const float yc = cur.y + cur.height * 0.5f;
-
-            for(size_t k = i + 1; k < (num_candidates) && (cur.idx_class == _input_output->at(k).idx_class); ++k)
-            {
-                const float xn = _input_output->at(k).x + _input_output->at(k).width * 0.5f;
-                const float yn = _input_output->at(k).y + _input_output->at(k).height * 0.5f;
-
-                const float dx = std::fabs(xn - xc);
-                const float dy = std::fabs(yn - yc);
-
-                if(dx < _min_distance && dy < _min_distance)
-                {
-                    const float d = dx * dx + dy * dy;
-
-                    if(d < min_distance_pow2)
-                    {
-                        // Invalidate detection window
-                        _input_output->at(k).score = 0.0f;
-                    }
-                }
-            }
-        }
-    }
-
-    _input_output->resize(num_detections);
-}
diff --git a/src/core/CPP/kernels/CPPNonMaximumSuppressionKernel.cpp b/src/core/CPP/kernels/CPPNonMaximumSuppressionKernel.cpp
index 7ea59ba65b..1224ec14a7 100644
--- a/src/core/CPP/kernels/CPPNonMaximumSuppressionKernel.cpp
+++ b/src/core/CPP/kernels/CPPNonMaximumSuppressionKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2020 ARM Limited.
+ * Copyright (c) 2019-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -23,25 +23,34 @@
  */
 #include "arm_compute/core/CPP/kernels/CPPNonMaximumSuppressionKernel.h"
 
-#include "arm_compute/core/Error.h"
 #include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/Validate.h"
 
+#include "src/core/helpers/AutoConfiguration.h"
+#include "src/core/helpers/WindowHelpers.h"
+
 #include <algorithm>
 
 namespace arm_compute
 {
 namespace
 {
-Status validate_arguments(const ITensorInfo *bboxes, const ITensorInfo *scores, const ITensorInfo *output_indices, unsigned int max_output_size,
-                          const float score_threshold, const float iou_threshold)
+Status validate_arguments(const ITensorInfo *bboxes,
+                          const ITensorInfo *scores,
+                          const ITensorInfo *output_indices,
+                          unsigned int       max_output_size,
+                          const float        score_threshold,
+                          const float        iou_threshold)
 {
     ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(bboxes, scores, output_indices);
     ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(bboxes, 1, DataType::F32);
     ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output_indices, 1, DataType::S32);
-    ARM_COMPUTE_RETURN_ERROR_ON_MSG(bboxes->num_dimensions() > 2, "The bboxes tensor must be a 2-D float tensor of shape [4, num_boxes].");
-    ARM_COMPUTE_RETURN_ERROR_ON_MSG(scores->num_dimensions() > 1, "The scores tensor must be a 1-D float tensor of shape [num_boxes].");
-    ARM_COMPUTE_RETURN_ERROR_ON_MSG(output_indices->num_dimensions() > 1, "The indices must be 1-D integer tensor of shape [M], where max_output_size <= M");
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(bboxes->num_dimensions() > 2,
+                                    "The bboxes tensor must be a 2-D float tensor of shape [4, num_boxes].");
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(scores->num_dimensions() > 1,
+                                    "The scores tensor must be a 1-D float tensor of shape [num_boxes].");
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(output_indices->num_dimensions() > 1,
+                                    "The indices must be 1-D integer tensor of shape [M], where max_output_size <= M");
     ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(bboxes, scores);
     ARM_COMPUTE_RETURN_ERROR_ON_MSG(output_indices->dimension(0) == 0, "Indices tensor must be bigger than 0");
     ARM_COMPUTE_RETURN_ERROR_ON_MSG(max_output_size == 0, "Max size cannot be 0");
@@ -53,15 +62,26 @@ Status validate_arguments(const ITensorInfo *bboxes, const ITensorInfo *scores,
 } // namespace
 
 CPPNonMaximumSuppressionKernel::CPPNonMaximumSuppressionKernel()
-    : _input_bboxes(nullptr), _input_scores(nullptr), _output_indices(nullptr), _max_output_size(0), _score_threshold(0.f), _iou_threshold(0.f), _num_boxes(0)
+    : _input_bboxes(nullptr),
+      _input_scores(nullptr),
+      _output_indices(nullptr),
+      _max_output_size(0),
+      _score_threshold(0.f),
+      _iou_threshold(0.f),
+      _num_boxes(0)
 {
 }
 
-void CPPNonMaximumSuppressionKernel::configure(const ITensor *input_bboxes, const ITensor *input_scores, ITensor *output_indices,
-                                               unsigned int max_output_size, const float score_threshold, const float iou_threshold)
+void CPPNonMaximumSuppressionKernel::configure(const ITensor *input_bboxes,
+                                               const ITensor *input_scores,
+                                               ITensor       *output_indices,
+                                               unsigned int   max_output_size,
+                                               const float    score_threshold,
+                                               const float    iou_threshold)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(input_bboxes, input_scores, output_indices);
-    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input_bboxes->info(), input_scores->info(), output_indices->info(), max_output_size, score_threshold, iou_threshold));
+    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input_bboxes->info(), input_scores->info(), output_indices->info(),
+                                                  max_output_size, score_threshold, iou_threshold));
 
     auto_init_if_empty(*output_indices->info(), TensorShape(max_output_size), 1, DataType::U8, QuantizationInfo());
 
@@ -80,10 +100,15 @@ void CPPNonMaximumSuppressionKernel::configure(const ITensor *input_bboxes, cons
     ICPPKernel::configure(win);
 }
 
-Status CPPNonMaximumSuppressionKernel::validate(const ITensorInfo *bboxes, const ITensorInfo *scores, const ITensorInfo *output_indices,
-                                                unsigned int max_output_size, const float score_threshold, const float iou_threshold)
+Status CPPNonMaximumSuppressionKernel::validate(const ITensorInfo *bboxes,
+                                                const ITensorInfo *scores,
+                                                const ITensorInfo *output_indices,
+                                                unsigned int       max_output_size,
+                                                const float        score_threshold,
+                                                const float        iou_threshold)
 {
-    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(bboxes, scores, output_indices, max_output_size, score_threshold, iou_threshold));
+    ARM_COMPUTE_RETURN_ON_ERROR(
+        validate_arguments(bboxes, scores, output_indices, max_output_size, score_threshold, iou_threshold));
     return Status{};
 }
 
@@ -97,10 +122,10 @@ void CPPNonMaximumSuppressionKernel::run(const Window &window, const ThreadInfo
     // Auxiliary tensors
     std::vector<int>   indices_above_thd;
     std::vector<float> scores_above_thd;
-    for(unsigned int i = 0; i < _num_boxes; ++i)
+    for (unsigned int i = 0; i < _num_boxes; ++i)
     {
         const float score_i = *(reinterpret_cast<float *>(_input_scores->ptr_to_element(Coordinates(i))));
-        if(score_i >= _score_threshold)
+        if (score_i >= _score_threshold)
         {
             scores_above_thd.emplace_back(score_i);
             indices_above_thd.emplace_back(i);
@@ -112,12 +137,9 @@ void CPPNonMaximumSuppressionKernel::run(const Window &window, const ThreadInfo
     std::vector<unsigned int> sorted_indices;
     sorted_indices.resize(num_above_thd);
     std::iota(sorted_indices.data(), sorted_indices.data() + num_above_thd, 0);
-    std::sort(std::begin(sorted_indices),
-              std::end(sorted_indices),
+    std::sort(std::begin(sorted_indices), std::end(sorted_indices),
               [&](unsigned int first, unsigned int second)
-    {
-        return scores_above_thd[first] > scores_above_thd[second];
-    });
+              { return scores_above_thd[first] > scores_above_thd[second]; });
 
     // Number of output is the minimum between max_detection and the scores above the threshold
     const unsigned int num_output = std::min(_max_output_size, num_above_thd);
@@ -125,19 +147,20 @@ void CPPNonMaximumSuppressionKernel::run(const Window &window, const ThreadInfo
     std::vector<bool>  visited(num_above_thd, false);
 
     // Keep only boxes with small IoU
-    for(unsigned int i = 0; i < num_above_thd; ++i)
+    for (unsigned int i = 0; i < num_above_thd; ++i)
     {
         // Check if the output is full
-        if(output_idx >= num_output)
+        if (output_idx >= num_output)
         {
             break;
         }
 
         // Check if it was already visited, if not add it to the output and update the indices counter
-        if(!visited[sorted_indices[i]])
+        if (!visited[sorted_indices[i]])
         {
-            *(reinterpret_cast<int *>(_output_indices->ptr_to_element(Coordinates(output_idx)))) = indices_above_thd[sorted_indices[i]];
-            visited[sorted_indices[i]]                                                           = true;
+            *(reinterpret_cast<int *>(_output_indices->ptr_to_element(Coordinates(output_idx)))) =
+                indices_above_thd[sorted_indices[i]];
+            visited[sorted_indices[i]] = true;
             ++output_idx;
         }
         else
@@ -146,28 +169,36 @@ void CPPNonMaximumSuppressionKernel::run(const Window &window, const ThreadInfo
         }
 
         // Once added one element at the output check if the next ones overlap and can be skipped
-        for(unsigned int j = i + 1; j < num_above_thd; ++j)
+        for (unsigned int j = i + 1; j < num_above_thd; ++j)
         {
-            if(!visited[sorted_indices[j]])
+            if (!visited[sorted_indices[j]])
             {
                 // Calculate IoU
                 const unsigned int i_index = indices_above_thd[sorted_indices[i]];
                 const unsigned int j_index = indices_above_thd[sorted_indices[j]];
                 // Box-corner format: xmin, ymin, xmax, ymax
-                const auto box_i_xmin = *(reinterpret_cast<float *>(_input_bboxes->ptr_to_element(Coordinates(0, i_index))));
-                const auto box_i_ymin = *(reinterpret_cast<float *>(_input_bboxes->ptr_to_element(Coordinates(1, i_index))));
-                const auto box_i_xmax = *(reinterpret_cast<float *>(_input_bboxes->ptr_to_element(Coordinates(2, i_index))));
-                const auto box_i_ymax = *(reinterpret_cast<float *>(_input_bboxes->ptr_to_element(Coordinates(3, i_index))));
-
-                const auto box_j_xmin = *(reinterpret_cast<float *>(_input_bboxes->ptr_to_element(Coordinates(0, j_index))));
-                const auto box_j_ymin = *(reinterpret_cast<float *>(_input_bboxes->ptr_to_element(Coordinates(1, j_index))));
-                const auto box_j_xmax = *(reinterpret_cast<float *>(_input_bboxes->ptr_to_element(Coordinates(2, j_index))));
-                const auto box_j_ymax = *(reinterpret_cast<float *>(_input_bboxes->ptr_to_element(Coordinates(3, j_index))));
+                const auto box_i_xmin =
+                    *(reinterpret_cast<float *>(_input_bboxes->ptr_to_element(Coordinates(0, i_index))));
+                const auto box_i_ymin =
+                    *(reinterpret_cast<float *>(_input_bboxes->ptr_to_element(Coordinates(1, i_index))));
+                const auto box_i_xmax =
+                    *(reinterpret_cast<float *>(_input_bboxes->ptr_to_element(Coordinates(2, i_index))));
+                const auto box_i_ymax =
+                    *(reinterpret_cast<float *>(_input_bboxes->ptr_to_element(Coordinates(3, i_index))));
+
+                const auto box_j_xmin =
+                    *(reinterpret_cast<float *>(_input_bboxes->ptr_to_element(Coordinates(0, j_index))));
+                const auto box_j_ymin =
+                    *(reinterpret_cast<float *>(_input_bboxes->ptr_to_element(Coordinates(1, j_index))));
+                const auto box_j_xmax =
+                    *(reinterpret_cast<float *>(_input_bboxes->ptr_to_element(Coordinates(2, j_index))));
+                const auto box_j_ymax =
+                    *(reinterpret_cast<float *>(_input_bboxes->ptr_to_element(Coordinates(3, j_index))));
 
                 const float area_i = (box_i_xmax - box_i_xmin) * (box_i_ymax - box_i_ymin);
                 const float area_j = (box_j_xmax - box_j_xmin) * (box_j_ymax - box_j_ymin);
                 float       overlap;
-                if(area_i <= 0 || area_j <= 0)
+                if (area_i <= 0 || area_j <= 0)
                 {
                     overlap = 0.0f;
                 }
@@ -177,11 +208,12 @@ void CPPNonMaximumSuppressionKernel::run(const Window &window, const ThreadInfo
                     const auto x_min_intersection = std::max<float>(box_i_xmin, box_j_xmin);
                     const auto y_max_intersection = std::min<float>(box_i_ymax, box_j_ymax);
                     const auto x_max_intersection = std::min<float>(box_i_xmax, box_j_xmax);
-                    const auto area_intersection  = std::max<float>(y_max_intersection - y_min_intersection, 0.0f) * std::max<float>(x_max_intersection - x_min_intersection, 0.0f);
-                    overlap                       = area_intersection / (area_i + area_j - area_intersection);
+                    const auto area_intersection  = std::max<float>(y_max_intersection - y_min_intersection, 0.0f) *
+                                                   std::max<float>(x_max_intersection - x_min_intersection, 0.0f);
+                    overlap = area_intersection / (area_i + area_j - area_intersection);
                 }
 
-                if(overlap > _iou_threshold)
+                if (overlap > _iou_threshold)
                 {
                     visited[sorted_indices[j]] = true;
                 }
@@ -190,7 +222,7 @@ void CPPNonMaximumSuppressionKernel::run(const Window &window, const ThreadInfo
     }
     // The output could be full but not the output indices tensor
     // Instead return values not valid we put -1
-    for(; output_idx < _max_output_size; ++output_idx)
+    for (; output_idx < _max_output_size; ++output_idx)
     {
         *(reinterpret_cast<int *>(_output_indices->ptr_to_element(Coordinates(output_idx)))) = -1;
     }
diff --git a/src/core/CPP/kernels/CPPPermuteKernel.cpp b/src/core/CPP/kernels/CPPPermuteKernel.cpp
index 9d89836589..e68090d82b 100644
--- a/src/core/CPP/kernels/CPPPermuteKernel.cpp
+++ b/src/core/CPP/kernels/CPPPermuteKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -23,14 +23,12 @@
  */
 #include "arm_compute/core/CPP/kernels/CPPPermuteKernel.h"
 
-#include "arm_compute/core/Error.h"
 #include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/ITensor.h"
-#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Types.h"
-#include "arm_compute/core/Validate.h"
 #include "arm_compute/core/utils/misc/ShapeCalculator.h"
 
+#include "src/core/helpers/AutoConfiguration.h"
+#include "src/core/helpers/WindowHelpers.h"
+
 #include <cstddef>
 #include <cstdint>
 
@@ -46,7 +44,7 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, c
     const TensorShape output_shape = misc::shape_calculator::compute_permutation_output_shape(*input, perm);
 
     // Validate configured output
-    if(output->total_size() != 0)
+    if (output->total_size() != 0)
     {
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output->tensor_shape(), output_shape);
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
@@ -68,7 +66,7 @@ void CPPPermuteKernel::run_permute(const Window &window)
     // Create output window
     Window                  window_out(window);
     const Window::Dimension zero_window = Window::Dimension(0, 0, 0);
-    for(size_t d = 0; d <= _perm.num_dimensions(); ++d)
+    for (size_t d = 0; d <= _perm.num_dimensions(); ++d)
     {
         window_out.set(d, zero_window);
     }
@@ -77,28 +75,32 @@ void CPPPermuteKernel::run_permute(const Window &window)
     Iterator in(_input, window);
     Iterator out(_output, window_out);
 
-    if(_input->info()->num_dimensions() <= 3)
+    if (_input->info()->num_dimensions() <= 3)
     {
-        execute_window_loop(window, [&](const Coordinates & id)
-        {
-            const int idx                             = id[0] * perm_strides[0] + id[1] * perm_strides[1] + id[2] * perm_strides[2];
-            *(reinterpret_cast<T *>(out.ptr() + idx)) = *(reinterpret_cast<const T *>(in.ptr()));
-        },
-        in, out);
+        execute_window_loop(
+            window,
+            [&](const Coordinates &id)
+            {
+                const int idx = id[0] * perm_strides[0] + id[1] * perm_strides[1] + id[2] * perm_strides[2];
+                *(reinterpret_cast<T *>(out.ptr() + idx)) = *(reinterpret_cast<const T *>(in.ptr()));
+            },
+            in, out);
     }
-    else if(_input->info()->num_dimensions() >= 4)
+    else if (_input->info()->num_dimensions() >= 4)
     {
-        execute_window_loop(window, [&](const Coordinates & id)
-        {
-            const int idx                             = id[0] * perm_strides[0] + id[1] * perm_strides[1] + id[2] * perm_strides[2] + id[3] * perm_strides[3];
-            *(reinterpret_cast<T *>(out.ptr() + idx)) = *(reinterpret_cast<const T *>(in.ptr()));
-        },
-        in, out);
+        execute_window_loop(
+            window,
+            [&](const Coordinates &id)
+            {
+                const int idx = id[0] * perm_strides[0] + id[1] * perm_strides[1] + id[2] * perm_strides[2] +
+                                id[3] * perm_strides[3];
+                *(reinterpret_cast<T *>(out.ptr() + idx)) = *(reinterpret_cast<const T *>(in.ptr()));
+            },
+            in, out);
     }
 }
 
-CPPPermuteKernel::CPPPermuteKernel()
-    : _func(), _input(nullptr), _output(nullptr), _perm()
+CPPPermuteKernel::CPPPermuteKernel() : _func(), _input(nullptr), _output(nullptr), _perm()
 {
 }
 
@@ -116,7 +118,7 @@ void CPPPermuteKernel::configure(const ITensor *input, ITensor *output, const Pe
     _output = output;
     _perm   = perm;
 
-    switch(input->info()->element_size())
+    switch (input->info()->element_size())
     {
         case 1:
             _func = &CPPPermuteKernel::run_permute<uint8_t>;
@@ -155,7 +157,7 @@ void CPPPermuteKernel::run(const Window &window, const ThreadInfo &info)
     ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
     ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICPPKernel::window(), window);
 
-    if(_func != nullptr)
+    if (_func != nullptr)
     {
         (this->*_func)(window);
     }
diff --git a/src/core/CPP/kernels/CPPSortEuclideanDistanceKernel.cpp b/src/core/CPP/kernels/CPPSortEuclideanDistanceKernel.cpp
deleted file mode 100644
index edc5e409c2..0000000000
--- a/src/core/CPP/kernels/CPPSortEuclideanDistanceKernel.cpp
+++ /dev/null
@@ -1,117 +0,0 @@
-/*
- * Copyright (c) 2017 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/core/CPP/kernels/CPPSortEuclideanDistanceKernel.h"
-
-#include "arm_compute/core/Coordinates.h"
-#include "arm_compute/core/Error.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Types.h"
-#include "arm_compute/core/Utils.h"
-#include "arm_compute/core/Validate.h"
-#include "arm_compute/core/Window.h"
-
-#include <cmath>
-
-using namespace arm_compute;
-
-namespace
-{
-bool keypoint_compare(const InternalKeypoint &lhs, const InternalKeypoint &rhs)
-{
-    return std::get<2>(lhs) > std::get<2>(rhs);
-}
-} // namespace
-
-CPPSortEuclideanDistanceKernel::CPPSortEuclideanDistanceKernel()
-    : _num_corner_candidates(), _min_distance(0.0f), _in_out(nullptr), _output(nullptr)
-{
-}
-
-void CPPSortEuclideanDistanceKernel::configure(InternalKeypoint *in_out, IKeyPointArray *output, const int32_t *num_corner_candidates, float min_distance)
-{
-    ARM_COMPUTE_ERROR_ON(nullptr == in_out);
-    ARM_COMPUTE_ERROR_ON(nullptr == output);
-    ARM_COMPUTE_ERROR_ON(nullptr == num_corner_candidates);
-    ARM_COMPUTE_ERROR_ON(!((min_distance > 0) && (min_distance <= 30)));
-
-    _in_out                = in_out;
-    _output                = output;
-    _min_distance          = min_distance * min_distance; // We compare squares of distances
-    _num_corner_candidates = num_corner_candidates;
-    ICPPKernel::configure(Window()); // Default 1 iteration window
-}
-
-bool CPPSortEuclideanDistanceKernel::is_parallelisable() const
-{
-    return false;
-}
-
-void CPPSortEuclideanDistanceKernel::run(const Window &window, const ThreadInfo &info)
-{
-    ARM_COMPUTE_UNUSED(info);
-    ARM_COMPUTE_UNUSED(window);
-    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
-    ARM_COMPUTE_ERROR_ON_MISMATCHING_WINDOWS(ICPPKernel::window(), window);
-
-    const int32_t num_corner_candidates = *_num_corner_candidates;
-
-    /* Sort list of corner candidates */
-    std::sort(_in_out, _in_out + num_corner_candidates, keypoint_compare);
-
-    /* Euclidean distance */
-    for(int32_t i = 0; i < num_corner_candidates; ++i)
-    {
-        if(std::get<2>(_in_out[i]) != 0.0f)
-        {
-            KeyPoint   keypt;
-            const auto xc = std::get<0>(_in_out[i]);
-            const auto yc = std::get<1>(_in_out[i]);
-
-            keypt.x               = xc;
-            keypt.y               = yc;
-            keypt.strength        = std::get<2>(_in_out[i]);
-            keypt.tracking_status = 1;
-
-            /* Store corner */
-            _output->push_back(keypt);
-            for(int32_t k = i + 1; k < num_corner_candidates; ++k)
-            {
-                const float dx = std::fabs(std::get<0>(_in_out[k]) - xc);
-                const float dy = std::fabs(std::get<1>(_in_out[k]) - yc);
-
-                if((dx < _min_distance) && (dy < _min_distance))
-                {
-                    const float d = (dx * dx + dy * dy);
-
-                    if(d < _min_distance)
-                    {
-                        /* Invalidate keypoint */
-                        std::get<2>(_in_out[k]) = 0.0f;
-                    }
-                }
-            }
-        }
-    }
-}
diff --git a/src/core/CPP/kernels/CPPTopKVKernel.cpp b/src/core/CPP/kernels/CPPTopKVKernel.cpp
index 7f284d4e1e..6ffb68e770 100644
--- a/src/core/CPP/kernels/CPPTopKVKernel.cpp
+++ b/src/core/CPP/kernels/CPPTopKVKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2020 ARM Limited.
+ * Copyright (c) 2019-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -22,46 +22,46 @@
  * SOFTWARE.
  */
 #include "arm_compute/core/CPP/kernels/CPPTopKVKernel.h"
-#include "arm_compute/core/Coordinates.h"
-#include "arm_compute/core/Error.h"
+
 #include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Types.h"
-#include "arm_compute/core/Utils.h"
-#include "arm_compute/core/Validate.h"
-#include "arm_compute/core/Window.h"
 #include "arm_compute/core/utils/misc/Traits.h"
 
+#include "src/core/helpers/AutoConfiguration.h"
+#include "src/core/helpers/WindowHelpers.h"
+
 namespace arm_compute
 {
 namespace
 {
-template <typename T,
-          typename std::enable_if<utils::traits::is_floating_point<T>::value, int>::type = 0>
+template <typename T, typename std::enable_if<utils::traits::is_floating_point<T>::value, int>::type = 0>
 inline bool greater_than(T a, T b)
 {
     const T epsilon = std::numeric_limits<T>::epsilon();
     return (a - b > epsilon);
 }
 
-template < typename T,
-           typename std::enable_if < !utils::traits::is_floating_point<T>::value, int >::type = 0 >
+template <typename T, typename std::enable_if<!utils::traits::is_floating_point<T>::value, int>::type = 0>
 inline bool greater_than(T a, T b)
 {
     return (a > b);
 }
 
-Status validate_arguments(const ITensorInfo *predictions, const ITensorInfo *targets, ITensorInfo *output, const unsigned int k)
+Status validate_arguments(const ITensorInfo *predictions,
+                          const ITensorInfo *targets,
+                          ITensorInfo       *output,
+                          const unsigned int k)
 {
     ARM_COMPUTE_UNUSED(k);
-    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(predictions, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::S32, DataType::F16, DataType::F32);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(predictions, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED,
+                                                         DataType::S32, DataType::F16, DataType::F32);
     ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(targets, 1, DataType::U32);
 
     ARM_COMPUTE_RETURN_ERROR_ON(predictions->num_dimensions() > 2);
     ARM_COMPUTE_RETURN_ERROR_ON(targets->num_dimensions() > 1);
     ARM_COMPUTE_RETURN_ERROR_ON(targets->dimension(0) != predictions->dimension(1));
     // Validate configured output
-    if(output->total_size() != 0)
+    if (output->total_size() != 0)
     {
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output->tensor_shape(), targets->tensor_shape());
         ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8);
@@ -74,22 +74,23 @@ Status validate_arguments(const ITensorInfo *predictions, const ITensorInfo *tar
 template <typename T>
 void CPPTopKVKernel::run_topkv()
 {
-    for(unsigned int i = 0; i < _batch_size; ++i)
+    for (unsigned int i = 0; i < _batch_size; ++i)
     {
-        const auto target_class_id = *reinterpret_cast<uint32_t *>(_targets->ptr_to_element(Coordinates{ i }));
-        const auto predicted_value = *reinterpret_cast<T *>(_predictions->ptr_to_element(Coordinates{ target_class_id, i }));
+        const auto target_class_id = *reinterpret_cast<uint32_t *>(_targets->ptr_to_element(Coordinates{i}));
+        const auto predicted_value =
+            *reinterpret_cast<T *>(_predictions->ptr_to_element(Coordinates{target_class_id, i}));
 
         // The variable rank indicates how many values there are before the target_class_id
         unsigned int rank = 0;
-        for(unsigned int j = 0; (j < _num_classes) && (rank < _k); ++j)
+        for (unsigned int j = 0; (j < _num_classes) && (rank < _k); ++j)
         {
-            const auto current_prediction = *reinterpret_cast<T *>(_predictions->ptr_to_element(Coordinates{ j, i }));
-            if(greater_than(current_prediction, predicted_value))
+            const auto current_prediction = *reinterpret_cast<T *>(_predictions->ptr_to_element(Coordinates{j, i}));
+            if (greater_than(current_prediction, predicted_value))
             {
                 rank++;
             }
         }
-        *(_output->ptr_to_element(Coordinates{ i })) = static_cast<uint8_t>(rank < _k);
+        *(_output->ptr_to_element(Coordinates{i})) = static_cast<uint8_t>(rank < _k);
     }
 }
 
@@ -98,7 +99,10 @@ CPPTopKVKernel::CPPTopKVKernel()
 {
 }
 
-void CPPTopKVKernel::configure(const ITensor *predictions, const ITensor *targets, ITensor *output, const unsigned int k)
+void CPPTopKVKernel::configure(const ITensor     *predictions,
+                               const ITensor     *targets,
+                               ITensor           *output,
+                               const unsigned int k)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(predictions, targets, output);
 
@@ -117,7 +121,10 @@ void CPPTopKVKernel::configure(const ITensor *predictions, const ITensor *target
     ICPPKernel::configure(Window()); // Default 1 iteration window
 }
 
-Status CPPTopKVKernel::validate(const ITensorInfo *predictions, const ITensorInfo *targets, ITensorInfo *output, const unsigned int k)
+Status CPPTopKVKernel::validate(const ITensorInfo *predictions,
+                                const ITensorInfo *targets,
+                                ITensorInfo       *output,
+                                const unsigned int k)
 {
     ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(predictions, targets, output, k));
     return Status{};
@@ -131,7 +138,7 @@ bool CPPTopKVKernel::is_parallelisable() const
 void CPPTopKVKernel::run(const Window &window, const ThreadInfo &info)
 {
     ARM_COMPUTE_UNUSED(window, info);
-    switch(_predictions->info()->data_type())
+    switch (_predictions->info()->data_type())
     {
         case DataType::F32:
             run_topkv<float>();
diff --git a/src/core/CPP/kernels/CPPUpsampleKernel.cpp b/src/core/CPP/kernels/CPPUpsampleKernel.cpp
index 8348b4335e..b1efe32446 100644
--- a/src/core/CPP/kernels/CPPUpsampleKernel.cpp
+++ b/src/core/CPP/kernels/CPPUpsampleKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 ARM Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -23,21 +23,16 @@
  */
 #include "arm_compute/core/CPP/kernels/CPPUpsampleKernel.h"
 
-#include "arm_compute/core/Error.h"
 #include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/ITensor.h"
-#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Types.h"
-#include "arm_compute/core/Validate.h"
-#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+
+#include "src/core/helpers/WindowHelpers.h"
 
 #include <cstddef>
 #include <cstdint>
 
 namespace arm_compute
 {
-CPPUpsampleKernel::CPPUpsampleKernel()
-    : _input(nullptr), _output(nullptr), _info()
+CPPUpsampleKernel::CPPUpsampleKernel() : _input(nullptr), _output(nullptr), _info()
 {
 }
 
@@ -87,7 +82,7 @@ void CPPUpsampleKernel::run(const Window &window, const ThreadInfo &info)
     const size_t element_size  = _input->info()->element_size();
 
     // The fill value is normally 0, but for quantized types '0' corresponds to the offset
-    switch(_output->info()->data_type())
+    switch (_output->info()->data_type())
     {
         case DataType::QASYMM8:
         {
@@ -107,7 +102,7 @@ void CPPUpsampleKernel::run(const Window &window, const ThreadInfo &info)
 
     // Create window
     Window window_out(window);
-    if(data_layout == DataLayout::NCHW)
+    if (data_layout == DataLayout::NCHW)
     {
         window_out.set(Window::DimX, Window::Dimension(start_width, end_width, stride_width));
         window_out.set(Window::DimY, Window::Dimension(start_height, end_height, stride_height));
@@ -122,10 +117,7 @@ void CPPUpsampleKernel::run(const Window &window, const ThreadInfo &info)
     Iterator in(_input, window);
     Iterator out(_output, window_out);
 
-    execute_window_loop(window, [&](const Coordinates &)
-    {
-        memcpy(out.ptr(), in.ptr(), element_size);
-    },
-    in, out);
+    execute_window_loop(
+        window, [&](const Coordinates &) { memcpy(out.ptr(), in.ptr(), element_size); }, in, out);
 }
-} // namespace arm_compute
-\ No newline at end of file
+} // namespace arm_compute