From 46d44d26183d835d209d7ef1b9023e217dd4019d Mon Sep 17 00:00:00 2001
From: Yair Schwarzbaum <yair.schwarzbaum@arm.com>
Date: Wed, 12 Jan 2022 16:38:58 +0200
Subject: Enable kernel selection testing (Phase #2)

Resolves COMPMID-4987
Change-Id: I1201ca3eae107989d13b6a2c6d9560de24fe112d
Signed-off-by: Yair Schwarzbaum <yair.schwarzbaum@arm.com>
Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/7015
Tested-by: Arm Jenkins <bsgcomp@arm.com>
Reviewed-by: Giorgio Arena <giorgio.arena@arm.com>
Comments-Addressed: Arm Jenkins <bsgcomp@arm.com>
---
 src/cpu/ICpuKernel.h                               |   6 +-
 src/cpu/kernels/CpuActivationKernel.h              |   2 +-
 src/cpu/kernels/CpuAddKernel.cpp                   |   4 +-
 src/cpu/kernels/CpuAddKernel.h                     |   2 +-
 src/cpu/kernels/CpuCastKernel.h                    |   2 +-
 src/cpu/kernels/CpuCol2ImKernel.h                  |   2 +-
 src/cpu/kernels/CpuConcatenateBatchKernel.h        |   2 +-
 src/cpu/kernels/CpuConcatenateDepthKernel.h        |   2 +-
 src/cpu/kernels/CpuConcatenateHeightKernel.h       |   2 +-
 src/cpu/kernels/CpuConcatenateWidthKernel.h        |   2 +-
 .../CpuConvertFullyConnectedWeightsKernel.h        |   2 +-
 .../kernels/CpuConvertQuantizedSignednessKernel.h  |   2 +-
 src/cpu/kernels/CpuCopyKernel.h                    |   2 +-
 src/cpu/kernels/CpuDepthwiseConv2dNativeKernel.h   |   2 +-
 src/cpu/kernels/CpuDequantizeKernel.h              |   2 +-
 src/cpu/kernels/CpuDirectConv2dKernel.h            |   2 +-
 src/cpu/kernels/CpuDirectConv2dOutputStageKernel.h |   2 +-
 src/cpu/kernels/CpuDirectConv3dKernel.h            |   2 +-
 src/cpu/kernels/CpuElementwiseKernel.h             |   2 +-
 src/cpu/kernels/CpuElementwiseUnaryKernel.cpp      |   2 +-
 src/cpu/kernels/CpuElementwiseUnaryKernel.h        |   2 +-
 src/cpu/kernels/CpuFillKernel.h                    |   2 +-
 src/cpu/kernels/CpuFloorKernel.h                   |   2 +-
 src/cpu/kernels/CpuGemmInterleave4x4Kernel.h       |   2 +-
 src/cpu/kernels/CpuGemmLowpMatrixMultiplyKernel.h  |   2 +-
 src/cpu/kernels/CpuGemmLowpMatrixReductionKernel.h |   4 +-
 .../kernels/CpuGemmLowpOffsetContributionKernel.h  |   2 +-
 ...puGemmLowpOffsetContributionOutputStageKernel.h |   2 +-
 .../CpuGemmLowpQuantizeDownInt32ScaleKernel.h      |   2 +-
 ...antizeDownInt32ToInt16ScaleByFixedPointKernel.h |   2 +-
 ...uantizeDownInt32ToInt8ScaleByFixedPointKernel.h |   2 +-
 ...antizeDownInt32ToUint8ScaleByFixedPointKernel.h |   2 +-
 src/cpu/kernels/CpuGemmMatrixAdditionKernel.h      |   2 +-
 src/cpu/kernels/CpuGemmMatrixMultiplyKernel.h      |   2 +-
 src/cpu/kernels/CpuGemmTranspose1xWKernel.h        |   2 +-
 src/cpu/kernels/CpuIm2ColKernel.h                  |   2 +-
 src/cpu/kernels/CpuMulKernel.h                     |   4 +-
 src/cpu/kernels/CpuPermuteKernel.h                 |   2 +-
 src/cpu/kernels/CpuPool2dKernel.cpp                |   6 +-
 src/cpu/kernels/CpuPool2dKernel.h                  |   2 +-
 src/cpu/kernels/CpuQuantizeKernel.h                |   2 +-
 src/cpu/kernels/CpuReshapeKernel.h                 |   2 +-
 src/cpu/kernels/CpuScaleKernel.h                   |   2 +-
 src/cpu/kernels/CpuSoftmaxKernel.cpp               | 256 +++++++--------------
 src/cpu/kernels/CpuSoftmaxKernel.h                 |  36 +--
 src/cpu/kernels/CpuSubKernel.h                     |   2 +-
 src/cpu/kernels/CpuTransposeKernel.h               |   2 +-
 src/cpu/kernels/CpuWeightsReshapeKernel.h          |   2 +-
 src/cpu/kernels/CpuWinogradConv2dKernel.h          |   6 +-
 .../CpuDepthwiseConv2dAssemblyWrapperKernel.h      |   2 +-
 .../internal/CpuPool2dAssemblyWrapperKernel.h      |   2 +-
 src/cpu/operators/CpuConcatenate.h                 |   4 +-
 src/cpu/operators/CpuSoftmax.h                     |   6 +-
 53 files changed, 171 insertions(+), 247 deletions(-)

diff --git a/src/cpu/ICpuKernel.h b/src/cpu/ICpuKernel.h
index 03aec5c08e..8f4106240d 100644
--- a/src/cpu/ICpuKernel.h
+++ b/src/cpu/ICpuKernel.h
@@ -37,12 +37,8 @@ enum class KernelSelectionType
     Supported  /**< Retrieve the best implementation available for the given Cpu ISA that is supported by the current build */
 };
 
-using ICpuKernel = arm_compute::ICPPKernel;
-
 template <class Derived>
-/* This is a temp name for stage 1 process of adding UT for multi-ISA.
-In the next stage NewICpuKernel will be called ICpuKernel again */
-class NewICpuKernel : public ICPPKernel
+class ICpuKernel : public ICPPKernel
 {
 public:
     /** Micro-kernel selector
diff --git a/src/cpu/kernels/CpuActivationKernel.h b/src/cpu/kernels/CpuActivationKernel.h
index ac974850aa..b0476303f0 100644
--- a/src/cpu/kernels/CpuActivationKernel.h
+++ b/src/cpu/kernels/CpuActivationKernel.h
@@ -34,7 +34,7 @@ namespace cpu
 namespace kernels
 {
 /** Interface for the activation kernel */
-class CpuActivationKernel : public NewICpuKernel<CpuActivationKernel>
+class CpuActivationKernel : public ICpuKernel<CpuActivationKernel>
 {
 private:
     using ActivationKernelPtr = std::add_pointer<void(const ITensor *, ITensor *, const ActivationLayerInfo &, const Window &)>::type;
diff --git a/src/cpu/kernels/CpuAddKernel.cpp b/src/cpu/kernels/CpuAddKernel.cpp
index deb7379aea..d06621fae0 100644
--- a/src/cpu/kernels/CpuAddKernel.cpp
+++ b/src/cpu/kernels/CpuAddKernel.cpp
@@ -214,7 +214,7 @@ void CpuAddKernel::configure(const ITensorInfo *src0, const ITensorInfo *src1, I
     // Configure kernel window
     auto win_config = validate_and_configure_window(*src0, *src1, *dst);
     ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
-    NewICpuKernel::configure(win_config.second);
+    ICpuKernel::configure(win_config.second);
 }
 
 Status CpuAddKernel::validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst, ConvertPolicy policy)
@@ -231,7 +231,7 @@ void CpuAddKernel::run_op(ITensorPack &tensors, const Window &window, const Thre
 {
     ARM_COMPUTE_UNUSED(info);
     ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
-    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(NewICpuKernel::window(), window);
+    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICpuKernel::window(), window);
 
     ARM_COMPUTE_ERROR_ON(tensors.empty());
     ARM_COMPUTE_ERROR_ON(_run_method == nullptr);
diff --git a/src/cpu/kernels/CpuAddKernel.h b/src/cpu/kernels/CpuAddKernel.h
index 93b86de4ae..6638135580 100644
--- a/src/cpu/kernels/CpuAddKernel.h
+++ b/src/cpu/kernels/CpuAddKernel.h
@@ -34,7 +34,7 @@ namespace cpu
 namespace kernels
 {
 /** Interface for the kernel to perform addition between two tensors */
-class CpuAddKernel : public NewICpuKernel<CpuAddKernel>
+class CpuAddKernel : public ICpuKernel<CpuAddKernel>
 {
 private:
     using AddKernelPtr = std::add_pointer<void(const ITensor *, const ITensor *, ITensor *, const ConvertPolicy &, const Window &)>::type;
diff --git a/src/cpu/kernels/CpuCastKernel.h b/src/cpu/kernels/CpuCastKernel.h
index 9aeb537044..7679178fa1 100644
--- a/src/cpu/kernels/CpuCastKernel.h
+++ b/src/cpu/kernels/CpuCastKernel.h
@@ -37,7 +37,7 @@ namespace kernels
  *
  * @note When casting between quantized types the scale and zeroPoint are ignored
  */
-class CpuCastKernel : public NewICpuKernel<CpuCastKernel>
+class CpuCastKernel : public ICpuKernel<CpuCastKernel>
 {
 public:
     CpuCastKernel() = default;
diff --git a/src/cpu/kernels/CpuCol2ImKernel.h b/src/cpu/kernels/CpuCol2ImKernel.h
index 43be476b2f..deafcc14df 100644
--- a/src/cpu/kernels/CpuCol2ImKernel.h
+++ b/src/cpu/kernels/CpuCol2ImKernel.h
@@ -52,7 +52,7 @@ namespace kernels
  * \end{array} \right)
  * @f]
  */
-class CpuCol2ImKernel : public NewICpuKernel<CpuCol2ImKernel>
+class CpuCol2ImKernel : public ICpuKernel<CpuCol2ImKernel>
 {
 public:
     /** Default constructor */
diff --git a/src/cpu/kernels/CpuConcatenateBatchKernel.h b/src/cpu/kernels/CpuConcatenateBatchKernel.h
index 2b5946571b..0de68a5d64 100644
--- a/src/cpu/kernels/CpuConcatenateBatchKernel.h
+++ b/src/cpu/kernels/CpuConcatenateBatchKernel.h
@@ -36,7 +36,7 @@ namespace kernels
 /** Interface for the batch concatenate kernel.
  *  The input tensor will be concatenated into the output tensor.
  */
-class CpuConcatenateBatchKernel : public NewICpuKernel<CpuConcatenateBatchKernel>
+class CpuConcatenateBatchKernel : public ICpuKernel<CpuConcatenateBatchKernel>
 {
 public:
     CpuConcatenateBatchKernel() = default;
diff --git a/src/cpu/kernels/CpuConcatenateDepthKernel.h b/src/cpu/kernels/CpuConcatenateDepthKernel.h
index 90b68d3a06..5a0edb95bb 100644
--- a/src/cpu/kernels/CpuConcatenateDepthKernel.h
+++ b/src/cpu/kernels/CpuConcatenateDepthKernel.h
@@ -40,7 +40,7 @@ namespace kernels
 /** Interface for the depth concatenate kernel.
  *  The input tensor will be concatenated into the output tensor.
  */
-class CpuConcatenateDepthKernel : public NewICpuKernel<CpuConcatenateDepthKernel>
+class CpuConcatenateDepthKernel : public ICpuKernel<CpuConcatenateDepthKernel>
 {
 public:
     CpuConcatenateDepthKernel() = default;
diff --git a/src/cpu/kernels/CpuConcatenateHeightKernel.h b/src/cpu/kernels/CpuConcatenateHeightKernel.h
index 8ace9809cc..74d5d0c2c3 100644
--- a/src/cpu/kernels/CpuConcatenateHeightKernel.h
+++ b/src/cpu/kernels/CpuConcatenateHeightKernel.h
@@ -36,7 +36,7 @@ namespace kernels
 /** Interface for the height concatenate kernel.
  *  The source tensor will be concatenated into the destination tensor.
  */
-class CpuConcatenateHeightKernel : public NewICpuKernel<CpuConcatenateHeightKernel>
+class CpuConcatenateHeightKernel : public ICpuKernel<CpuConcatenateHeightKernel>
 {
 public:
     CpuConcatenateHeightKernel() = default;
diff --git a/src/cpu/kernels/CpuConcatenateWidthKernel.h b/src/cpu/kernels/CpuConcatenateWidthKernel.h
index d5f2ef24d6..418bc51b33 100644
--- a/src/cpu/kernels/CpuConcatenateWidthKernel.h
+++ b/src/cpu/kernels/CpuConcatenateWidthKernel.h
@@ -37,7 +37,7 @@ namespace kernels
 /** Interface for the width concatenate kernel.
  *  The source tensor will be concatenated into the destination tensor.
  */
-class CpuConcatenateWidthKernel : public NewICpuKernel<CpuConcatenateWidthKernel>
+class CpuConcatenateWidthKernel : public ICpuKernel<CpuConcatenateWidthKernel>
 {
 public:
     CpuConcatenateWidthKernel() = default;
diff --git a/src/cpu/kernels/CpuConvertFullyConnectedWeightsKernel.h b/src/cpu/kernels/CpuConvertFullyConnectedWeightsKernel.h
index 001a6fcab0..9a1393323b 100644
--- a/src/cpu/kernels/CpuConvertFullyConnectedWeightsKernel.h
+++ b/src/cpu/kernels/CpuConvertFullyConnectedWeightsKernel.h
@@ -41,7 +41,7 @@ namespace kernels
  *
  * @note This function assumes the weights are already reshaped (transposed)
  */
-class CpuConvertFullyConnectedWeightsKernel : public NewICpuKernel<CpuConvertFullyConnectedWeightsKernel>
+class CpuConvertFullyConnectedWeightsKernel : public ICpuKernel<CpuConvertFullyConnectedWeightsKernel>
 {
 public:
     CpuConvertFullyConnectedWeightsKernel() = default;
diff --git a/src/cpu/kernels/CpuConvertQuantizedSignednessKernel.h b/src/cpu/kernels/CpuConvertQuantizedSignednessKernel.h
index 9d5ee39126..b5eaf65487 100644
--- a/src/cpu/kernels/CpuConvertQuantizedSignednessKernel.h
+++ b/src/cpu/kernels/CpuConvertQuantizedSignednessKernel.h
@@ -34,7 +34,7 @@ namespace cpu
 namespace kernels
 {
 /** Kernel to convert asymmetric signed to asymmetric signed and vice-versa */
-class CpuConvertQuantizedSignednessKernel : public NewICpuKernel<CpuConvertQuantizedSignednessKernel>
+class CpuConvertQuantizedSignednessKernel : public ICpuKernel<CpuConvertQuantizedSignednessKernel>
 {
 public:
     CpuConvertQuantizedSignednessKernel() = default;
diff --git a/src/cpu/kernels/CpuCopyKernel.h b/src/cpu/kernels/CpuCopyKernel.h
index ee4adeb4eb..c9ef8eba76 100644
--- a/src/cpu/kernels/CpuCopyKernel.h
+++ b/src/cpu/kernels/CpuCopyKernel.h
@@ -34,7 +34,7 @@ namespace cpu
 namespace kernels
 {
 /** Kernel to perform a copy between two tensors */
-class CpuCopyKernel : public NewICpuKernel<CpuCopyKernel>
+class CpuCopyKernel : public ICpuKernel<CpuCopyKernel>
 {
 public:
     CpuCopyKernel() = default;
diff --git a/src/cpu/kernels/CpuDepthwiseConv2dNativeKernel.h b/src/cpu/kernels/CpuDepthwiseConv2dNativeKernel.h
index eae682bb6d..e23a0fac87 100644
--- a/src/cpu/kernels/CpuDepthwiseConv2dNativeKernel.h
+++ b/src/cpu/kernels/CpuDepthwiseConv2dNativeKernel.h
@@ -40,7 +40,7 @@ namespace cpu
 namespace kernels
 {
 /** Interface for the kernel to run a depthwise convolution native on a tensor. */
-class CpuDepthwiseConv2dNativeKernel : public NewICpuKernel<CpuDepthwiseConv2dNativeKernel>
+class CpuDepthwiseConv2dNativeKernel : public ICpuKernel<CpuDepthwiseConv2dNativeKernel>
 {
 public:
     CpuDepthwiseConv2dNativeKernel() = default;
diff --git a/src/cpu/kernels/CpuDequantizeKernel.h b/src/cpu/kernels/CpuDequantizeKernel.h
index 834c039a76..cfa991dc74 100644
--- a/src/cpu/kernels/CpuDequantizeKernel.h
+++ b/src/cpu/kernels/CpuDequantizeKernel.h
@@ -34,7 +34,7 @@ namespace cpu
 namespace kernels
 {
 /** Interface for the dequantization layer kernel. */
-class CpuDequantizeKernel : public NewICpuKernel<CpuDequantizeKernel>
+class CpuDequantizeKernel : public ICpuKernel<CpuDequantizeKernel>
 {
 public:
     CpuDequantizeKernel() = default;
diff --git a/src/cpu/kernels/CpuDirectConv2dKernel.h b/src/cpu/kernels/CpuDirectConv2dKernel.h
index 09fa5898cc..6ec4d4ee04 100644
--- a/src/cpu/kernels/CpuDirectConv2dKernel.h
+++ b/src/cpu/kernels/CpuDirectConv2dKernel.h
@@ -34,7 +34,7 @@ namespace cpu
 namespace kernels
 {
 /** Interface for the kernel to perform Direct Convolution Layer. */
-class CpuDirectConv2dKernel : public NewICpuKernel<CpuDirectConv2dKernel>
+class CpuDirectConv2dKernel : public ICpuKernel<CpuDirectConv2dKernel>
 {
 public:
     CpuDirectConv2dKernel() = default;
diff --git a/src/cpu/kernels/CpuDirectConv2dOutputStageKernel.h b/src/cpu/kernels/CpuDirectConv2dOutputStageKernel.h
index 95011f79aa..d3ef17b7c9 100644
--- a/src/cpu/kernels/CpuDirectConv2dOutputStageKernel.h
+++ b/src/cpu/kernels/CpuDirectConv2dOutputStageKernel.h
@@ -40,7 +40,7 @@ namespace kernels
  * @note For quantized computations (i.e. @p src of S32 type) the output data type for auto-initialization must be passed as part
  *       of the @ref DirectConvolutionLayerOutputStageKernelInfo.
  */
-class CpuDirectConv2dOutputStageKernel : public NewICpuKernel<CpuDirectConv2dOutputStageKernel>
+class CpuDirectConv2dOutputStageKernel : public ICpuKernel<CpuDirectConv2dOutputStageKernel>
 {
 public:
     CpuDirectConv2dOutputStageKernel() = default;
diff --git a/src/cpu/kernels/CpuDirectConv3dKernel.h b/src/cpu/kernels/CpuDirectConv3dKernel.h
index 6ae70bd3b7..688f368b9f 100644
--- a/src/cpu/kernels/CpuDirectConv3dKernel.h
+++ b/src/cpu/kernels/CpuDirectConv3dKernel.h
@@ -35,7 +35,7 @@ namespace cpu
 namespace kernels
 {
 /** Interface for the kernel to perform 3D Direct Convolution Layer. */
-class CpuDirectConv3dKernel : public NewICpuKernel<CpuDirectConv3dKernel>
+class CpuDirectConv3dKernel : public ICpuKernel<CpuDirectConv3dKernel>
 {
 private:
     /* Template function for convolution 3d NDHWC */
diff --git a/src/cpu/kernels/CpuElementwiseKernel.h b/src/cpu/kernels/CpuElementwiseKernel.h
index bb081cbec1..8cd5d58a96 100644
--- a/src/cpu/kernels/CpuElementwiseKernel.h
+++ b/src/cpu/kernels/CpuElementwiseKernel.h
@@ -39,7 +39,7 @@ namespace kernels
  * @f[ dst(x,y) = OP(src0(x,y), src1(x,y))@f]
  *
  */
-class CpuElementwiseKernel : public NewICpuKernel<CpuElementwiseKernel>
+class CpuElementwiseKernel : public ICpuKernel<CpuElementwiseKernel>
 {
 public:
     CpuElementwiseKernel() = default;
diff --git a/src/cpu/kernels/CpuElementwiseUnaryKernel.cpp b/src/cpu/kernels/CpuElementwiseUnaryKernel.cpp
index 79c4896924..e8211fe93e 100644
--- a/src/cpu/kernels/CpuElementwiseUnaryKernel.cpp
+++ b/src/cpu/kernels/CpuElementwiseUnaryKernel.cpp
@@ -108,7 +108,7 @@ void CpuElementwiseUnaryKernel::configure(ElementWiseUnary op, const ITensorInfo
 
     auto shape_and_window = compute_output_shape_and_window(src.tensor_shape());
     auto_init_if_empty(dst, shape_and_window.first, 1, src.data_type());
-    NewICpuKernel::configure(shape_and_window.second);
+    ICpuKernel::configure(shape_and_window.second);
 }
 
 Status CpuElementwiseUnaryKernel::validate(ElementWiseUnary op, const ITensorInfo &src, const ITensorInfo &dst)
diff --git a/src/cpu/kernels/CpuElementwiseUnaryKernel.h b/src/cpu/kernels/CpuElementwiseUnaryKernel.h
index c520b89618..138049a60c 100644
--- a/src/cpu/kernels/CpuElementwiseUnaryKernel.h
+++ b/src/cpu/kernels/CpuElementwiseUnaryKernel.h
@@ -39,7 +39,7 @@ namespace kernels
  * Element-wise operation is computed by:
  * @f[ dst(x) = OP(src(x))@f]
  */
-class CpuElementwiseUnaryKernel : public NewICpuKernel<CpuElementwiseUnaryKernel>
+class CpuElementwiseUnaryKernel : public ICpuKernel<CpuElementwiseUnaryKernel>
 {
 private:
     using ElementwiseUnaryUkernelPtr = std::add_pointer<void(const ITensor *, ITensor *, const Window &, ElementWiseUnary)>::type;
diff --git a/src/cpu/kernels/CpuFillKernel.h b/src/cpu/kernels/CpuFillKernel.h
index 5262ecc5c6..ce41afc462 100644
--- a/src/cpu/kernels/CpuFillKernel.h
+++ b/src/cpu/kernels/CpuFillKernel.h
@@ -35,7 +35,7 @@ namespace cpu
 namespace kernels
 {
 /** Kernel for filling a tensor with a given constant value */
-class CpuFillKernel : public NewICpuKernel<CpuFillKernel>
+class CpuFillKernel : public ICpuKernel<CpuFillKernel>
 {
 public:
     CpuFillKernel() = default;
diff --git a/src/cpu/kernels/CpuFloorKernel.h b/src/cpu/kernels/CpuFloorKernel.h
index 2b102a0515..35ab534ca8 100644
--- a/src/cpu/kernels/CpuFloorKernel.h
+++ b/src/cpu/kernels/CpuFloorKernel.h
@@ -34,7 +34,7 @@ namespace cpu
 namespace kernels
 {
 /** Cpu accelarated kernel to perform a floor operation */
-class CpuFloorKernel : public NewICpuKernel<CpuFloorKernel>
+class CpuFloorKernel : public ICpuKernel<CpuFloorKernel>
 {
 private:
     using FloorKernelPtr = std::add_pointer<void(const void *, void *, int)>::type;
diff --git a/src/cpu/kernels/CpuGemmInterleave4x4Kernel.h b/src/cpu/kernels/CpuGemmInterleave4x4Kernel.h
index 13b46142c4..4fb6a52a8b 100644
--- a/src/cpu/kernels/CpuGemmInterleave4x4Kernel.h
+++ b/src/cpu/kernels/CpuGemmInterleave4x4Kernel.h
@@ -52,7 +52,7 @@ namespace kernels
  *
  * After this operation, the dst matrix will have the following shape: [ height * 4, ceil(width / 4.0f) ]
  */
-class CpuGemmInterleave4x4Kernel : public NewICpuKernel<CpuGemmInterleave4x4Kernel>
+class CpuGemmInterleave4x4Kernel : public ICpuKernel<CpuGemmInterleave4x4Kernel>
 {
 public:
     CpuGemmInterleave4x4Kernel() = default;
diff --git a/src/cpu/kernels/CpuGemmLowpMatrixMultiplyKernel.h b/src/cpu/kernels/CpuGemmLowpMatrixMultiplyKernel.h
index 6d06f12e54..2cc789d6d9 100644
--- a/src/cpu/kernels/CpuGemmLowpMatrixMultiplyKernel.h
+++ b/src/cpu/kernels/CpuGemmLowpMatrixMultiplyKernel.h
@@ -43,7 +43,7 @@ namespace kernels
  *  -# Compute the int32 matrix product of the resulting a * b and store the result as int32
  *
  */
-class CpuGemmLowpMatrixMultiplyKernel : public NewICpuKernel<CpuGemmLowpMatrixMultiplyKernel>
+class CpuGemmLowpMatrixMultiplyKernel : public ICpuKernel<CpuGemmLowpMatrixMultiplyKernel>
 {
 public:
     /** Default constructor */
diff --git a/src/cpu/kernels/CpuGemmLowpMatrixReductionKernel.h b/src/cpu/kernels/CpuGemmLowpMatrixReductionKernel.h
index 6cced66b47..e469629cdb 100644
--- a/src/cpu/kernels/CpuGemmLowpMatrixReductionKernel.h
+++ b/src/cpu/kernels/CpuGemmLowpMatrixReductionKernel.h
@@ -40,7 +40,7 @@ namespace kernels
  * @note This stage is needed to handle the offset of matrix product
  *       https://github.com/google/gemmlowp/blob/master/doc/low-precision.md
  */
-class CpuGemmLowpMatrixAReductionKernel : public NewICpuKernel<CpuGemmLowpMatrixAReductionKernel>
+class CpuGemmLowpMatrixAReductionKernel : public ICpuKernel<CpuGemmLowpMatrixAReductionKernel>
 {
 public:
     /** Default constructor */
@@ -98,7 +98,7 @@ private:
  * @note This stage is needed to handle the offset of matrix product
  *       https://github.com/google/gemmlowp/blob/master/doc/low-precision.md
  */
-class CpuGemmLowpMatrixBReductionKernel : public NewICpuKernel<CpuGemmLowpMatrixBReductionKernel>
+class CpuGemmLowpMatrixBReductionKernel : public ICpuKernel<CpuGemmLowpMatrixBReductionKernel>
 {
 public:
     /** Default constructor */
diff --git a/src/cpu/kernels/CpuGemmLowpOffsetContributionKernel.h b/src/cpu/kernels/CpuGemmLowpOffsetContributionKernel.h
index 1d70c0619e..3514ca811d 100644
--- a/src/cpu/kernels/CpuGemmLowpOffsetContributionKernel.h
+++ b/src/cpu/kernels/CpuGemmLowpOffsetContributionKernel.h
@@ -46,7 +46,7 @@ namespace kernels
  *                   (a_offset * b_offset * k)
  *
  */
-class CpuGemmLowpOffsetContributionKernel : public NewICpuKernel<CpuGemmLowpOffsetContributionKernel>
+class CpuGemmLowpOffsetContributionKernel : public ICpuKernel<CpuGemmLowpOffsetContributionKernel>
 {
 public:
     /** Default constructor */
diff --git a/src/cpu/kernels/CpuGemmLowpOffsetContributionOutputStageKernel.h b/src/cpu/kernels/CpuGemmLowpOffsetContributionOutputStageKernel.h
index 13c64f4631..ad8b05e49a 100644
--- a/src/cpu/kernels/CpuGemmLowpOffsetContributionOutputStageKernel.h
+++ b/src/cpu/kernels/CpuGemmLowpOffsetContributionOutputStageKernel.h
@@ -63,7 +63,7 @@ namespace kernels
  *                        (a_offset * b_offset * k)
  */
 
-class CpuGemmLowpOffsetContributionOutputStageKernel : public NewICpuKernel<CpuGemmLowpOffsetContributionOutputStageKernel>
+class CpuGemmLowpOffsetContributionOutputStageKernel : public ICpuKernel<CpuGemmLowpOffsetContributionOutputStageKernel>
 {
 public:
     /** Default constructor */
diff --git a/src/cpu/kernels/CpuGemmLowpQuantizeDownInt32ScaleKernel.h b/src/cpu/kernels/CpuGemmLowpQuantizeDownInt32ScaleKernel.h
index f6e8c816f3..c7813edcd7 100644
--- a/src/cpu/kernels/CpuGemmLowpQuantizeDownInt32ScaleKernel.h
+++ b/src/cpu/kernels/CpuGemmLowpQuantizeDownInt32ScaleKernel.h
@@ -51,7 +51,7 @@ namespace kernels
  *  -#  -to the [-128..127] range and cast to QASYMM8_SIGNED.
  *
  */
-class CpuGemmLowpQuantizeDownInt32ScaleKernel : public NewICpuKernel<CpuGemmLowpQuantizeDownInt32ScaleKernel>
+class CpuGemmLowpQuantizeDownInt32ScaleKernel : public ICpuKernel<CpuGemmLowpQuantizeDownInt32ScaleKernel>
 {
 public:
     CpuGemmLowpQuantizeDownInt32ScaleKernel() = default;
diff --git a/src/cpu/kernels/CpuGemmLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel.h b/src/cpu/kernels/CpuGemmLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel.h
index a9e2560657..681d099695 100644
--- a/src/cpu/kernels/CpuGemmLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel.h
+++ b/src/cpu/kernels/CpuGemmLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel.h
@@ -48,7 +48,7 @@ namespace kernels
  *  -# Clamp the resulting int32 values to the [-32768, 32767] range and cast to QSYMM16.
  *
  */
-class CpuGemmLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel : public NewICpuKernel<CpuGemmLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel>
+class CpuGemmLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel : public ICpuKernel<CpuGemmLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel>
 {
 public:
     CpuGemmLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel() = default;
diff --git a/src/cpu/kernels/CpuGemmLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel.h b/src/cpu/kernels/CpuGemmLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel.h
index bfac8681a5..3e615b935e 100644
--- a/src/cpu/kernels/CpuGemmLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel.h
+++ b/src/cpu/kernels/CpuGemmLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel.h
@@ -49,7 +49,7 @@ namespace kernels
  *  -# Clamp the resulting int32 values to the [-128..127] range and cast to QASYMM8_SIGNED.
  *
  */
-class CpuGemmLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel : public NewICpuKernel<CpuGemmLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel>
+class CpuGemmLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel : public ICpuKernel<CpuGemmLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel>
 {
 public:
     CpuGemmLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel() = default;
diff --git a/src/cpu/kernels/CpuGemmLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel.h b/src/cpu/kernels/CpuGemmLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel.h
index 5e5683cfc3..b773fdfdcf 100644
--- a/src/cpu/kernels/CpuGemmLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel.h
+++ b/src/cpu/kernels/CpuGemmLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel.h
@@ -49,7 +49,7 @@ namespace kernels
  *  -# Clamp the resulting int32 values to the [0..255] range and cast to QASYMM8.
  *
  */
-class CpuGemmLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel : public NewICpuKernel<CpuGemmLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel>
+class CpuGemmLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel : public ICpuKernel<CpuGemmLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel>
 {
 public:
     CpuGemmLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel() = default;
diff --git a/src/cpu/kernels/CpuGemmMatrixAdditionKernel.h b/src/cpu/kernels/CpuGemmMatrixAdditionKernel.h
index 64338259e9..4a748218d1 100644
--- a/src/cpu/kernels/CpuGemmMatrixAdditionKernel.h
+++ b/src/cpu/kernels/CpuGemmMatrixAdditionKernel.h
@@ -41,7 +41,7 @@ namespace kernels
  *        - MTX_0 = A * B * alpha, where MTX_0 is the output of @ref CpuGemmMatrixMultiplyKernel
  *        - MTX_1 = C
  */
-class CpuGemmMatrixAdditionKernel : public NewICpuKernel<CpuGemmMatrixAdditionKernel>
+class CpuGemmMatrixAdditionKernel : public ICpuKernel<CpuGemmMatrixAdditionKernel>
 {
 public:
     CpuGemmMatrixAdditionKernel() = default;
diff --git a/src/cpu/kernels/CpuGemmMatrixMultiplyKernel.h b/src/cpu/kernels/CpuGemmMatrixMultiplyKernel.h
index 757b46e9a7..9c3dc8b1a0 100644
--- a/src/cpu/kernels/CpuGemmMatrixMultiplyKernel.h
+++ b/src/cpu/kernels/CpuGemmMatrixMultiplyKernel.h
@@ -39,7 +39,7 @@ namespace kernels
  * @note If the output tensor is a vector and the data type is F32, the implementation assumes that the first input tensor @p lhs is a vector and the second input tensor @p rhs a matrix. The implementation also assumes that both tensors have not been reshaped
  *
  */
-class CpuGemmMatrixMultiplyKernel : public NewICpuKernel<CpuGemmMatrixMultiplyKernel>
+class CpuGemmMatrixMultiplyKernel : public ICpuKernel<CpuGemmMatrixMultiplyKernel>
 {
 public:
     CpuGemmMatrixMultiplyKernel() = default;
diff --git a/src/cpu/kernels/CpuGemmTranspose1xWKernel.h b/src/cpu/kernels/CpuGemmTranspose1xWKernel.h
index 2acda35947..0ca92641b7 100644
--- a/src/cpu/kernels/CpuGemmTranspose1xWKernel.h
+++ b/src/cpu/kernels/CpuGemmTranspose1xWKernel.h
@@ -68,7 +68,7 @@ namespace kernels
  * @note The output matrix will have the following shape: [ height * W, ceil(width / W) ], where W = (16 / element size of the tensor)
  *
  */
-class CpuGemmTranspose1xWKernel : public NewICpuKernel<CpuGemmTranspose1xWKernel>
+class CpuGemmTranspose1xWKernel : public ICpuKernel<CpuGemmTranspose1xWKernel>
 {
 public:
     CpuGemmTranspose1xWKernel() = default;
diff --git a/src/cpu/kernels/CpuIm2ColKernel.h b/src/cpu/kernels/CpuIm2ColKernel.h
index d789adef95..8160310da6 100644
--- a/src/cpu/kernels/CpuIm2ColKernel.h
+++ b/src/cpu/kernels/CpuIm2ColKernel.h
@@ -58,7 +58,7 @@ namespace kernels
  * \end{array} \right)
  * @f]
  */
-class CpuIm2ColKernel : public NewICpuKernel<CpuIm2ColKernel>
+class CpuIm2ColKernel : public ICpuKernel<CpuIm2ColKernel>
 {
 public:
     /** Default constructor */
diff --git a/src/cpu/kernels/CpuMulKernel.h b/src/cpu/kernels/CpuMulKernel.h
index 3ab198510f..85fcf88a96 100644
--- a/src/cpu/kernels/CpuMulKernel.h
+++ b/src/cpu/kernels/CpuMulKernel.h
@@ -34,7 +34,7 @@ namespace cpu
 namespace kernels
 {
 /** Interface for the kernel to perform multiplication between two tensors */
-class CpuMulKernel : public NewICpuKernel<CpuMulKernel>
+class CpuMulKernel : public ICpuKernel<CpuMulKernel>
 {
 public:
     CpuMulKernel() = default;
@@ -118,7 +118,7 @@ private:
 };
 
 /** Interface for the complex pixelwise multiplication kernel. */
-class CpuComplexMulKernel : public NewICpuKernel<CpuComplexMulKernel>
+class CpuComplexMulKernel : public ICpuKernel<CpuComplexMulKernel>
 {
 public:
     CpuComplexMulKernel() = default;
diff --git a/src/cpu/kernels/CpuPermuteKernel.h b/src/cpu/kernels/CpuPermuteKernel.h
index aae28582b1..9e1b93318e 100644
--- a/src/cpu/kernels/CpuPermuteKernel.h
+++ b/src/cpu/kernels/CpuPermuteKernel.h
@@ -34,7 +34,7 @@ namespace cpu
 namespace kernels
 {
 /** Kernel to perform tensor permutation given a permutation vector */
-class CpuPermuteKernel : public NewICpuKernel<CpuPermuteKernel>
+class CpuPermuteKernel : public ICpuKernel<CpuPermuteKernel>
 {
 public:
     CpuPermuteKernel() = default;
diff --git a/src/cpu/kernels/CpuPool2dKernel.cpp b/src/cpu/kernels/CpuPool2dKernel.cpp
index 953a9ffb67..d0ca2d285d 100644
--- a/src/cpu/kernels/CpuPool2dKernel.cpp
+++ b/src/cpu/kernels/CpuPool2dKernel.cpp
@@ -315,7 +315,7 @@ void CpuPool2dKernel::configure(ITensorInfo *src, ITensorInfo *dst, const Poolin
     {
         // Configure kernel window
         Window win = calculate_max_window(*dst, Steps());
-        NewICpuKernel::configure(win);
+        ICpuKernel::configure(win);
     }
     else
     {
@@ -323,7 +323,7 @@ void CpuPool2dKernel::configure(ITensorInfo *src, ITensorInfo *dst, const Poolin
         auto win_config = validate_and_configure_window(src, dst, indices, pool_info, _num_elems_processed_per_iteration,
                                                         pool_size.x(), pool_size.y());
         ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
-        NewICpuKernel::configure(win_config.second);
+        ICpuKernel::configure(win_config.second);
     }
 }
 
@@ -356,7 +356,7 @@ void CpuPool2dKernel::run_op(ITensorPack &tensors, const Window &window, const T
 {
     ARM_COMPUTE_UNUSED(info);
     ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
-    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(NewICpuKernel::window(), window);
+    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICpuKernel::window(), window);
     ARM_COMPUTE_ERROR_ON(_run_method == nullptr);
 
     const ITensor *src     = tensors.get_const_tensor(TensorType::ACL_SRC_0);
diff --git a/src/cpu/kernels/CpuPool2dKernel.h b/src/cpu/kernels/CpuPool2dKernel.h
index 7fd3247d6d..c952ea839d 100644
--- a/src/cpu/kernels/CpuPool2dKernel.h
+++ b/src/cpu/kernels/CpuPool2dKernel.h
@@ -35,7 +35,7 @@ namespace cpu
 namespace kernels
 {
 /** Interface for the pooling layer kernel */
-class CpuPool2dKernel : public NewICpuKernel<CpuPool2dKernel>
+class CpuPool2dKernel : public ICpuKernel<CpuPool2dKernel>
 {
 private:
     using PoolingKernelPtr = std::add_pointer<void(const ITensor *, ITensor *, ITensor *, PoolingLayerInfo &, const Window &, const Window &)>::type;
diff --git a/src/cpu/kernels/CpuQuantizeKernel.h b/src/cpu/kernels/CpuQuantizeKernel.h
index 709e1c89c7..28690bea54 100644
--- a/src/cpu/kernels/CpuQuantizeKernel.h
+++ b/src/cpu/kernels/CpuQuantizeKernel.h
@@ -37,7 +37,7 @@ namespace kernels
  *
  * @note The implementation supports only 3D input tensors
  */
-class CpuQuantizeKernel : public NewICpuKernel<CpuQuantizeKernel>
+class CpuQuantizeKernel : public ICpuKernel<CpuQuantizeKernel>
 {
 public:
     CpuQuantizeKernel() = default;
diff --git a/src/cpu/kernels/CpuReshapeKernel.h b/src/cpu/kernels/CpuReshapeKernel.h
index 6a5c528ecd..17302c6731 100644
--- a/src/cpu/kernels/CpuReshapeKernel.h
+++ b/src/cpu/kernels/CpuReshapeKernel.h
@@ -34,7 +34,7 @@ namespace cpu
 namespace kernels
 {
 /** Interface for the kernel to perform tensor reshaping */
-class CpuReshapeKernel : public NewICpuKernel<CpuReshapeKernel>
+class CpuReshapeKernel : public ICpuKernel<CpuReshapeKernel>
 {
 public:
     CpuReshapeKernel() = default;
diff --git a/src/cpu/kernels/CpuScaleKernel.h b/src/cpu/kernels/CpuScaleKernel.h
index 94bbdb72a0..e0e9e387bd 100644
--- a/src/cpu/kernels/CpuScaleKernel.h
+++ b/src/cpu/kernels/CpuScaleKernel.h
@@ -35,7 +35,7 @@ namespace cpu
 namespace kernels
 {
 /** Arm(R) Neon(TM) kernel to perform scaling on a tensor */
-class CpuScaleKernel : public NewICpuKernel<CpuScaleKernel>
+class CpuScaleKernel : public ICpuKernel<CpuScaleKernel>
 {
 private:
     /** Scale function to use for the particular function to use */
diff --git a/src/cpu/kernels/CpuSoftmaxKernel.cpp b/src/cpu/kernels/CpuSoftmaxKernel.cpp
index 054adfa23c..6766b10120 100644
--- a/src/cpu/kernels/CpuSoftmaxKernel.cpp
+++ b/src/cpu/kernels/CpuSoftmaxKernel.cpp
@@ -22,7 +22,6 @@
  * SOFTWARE.
  */
 #include "src/cpu/kernels/CpuSoftmaxKernel.h"
-
 #include "arm_compute/core/Error.h"
 #include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/ITensor.h"
@@ -30,12 +29,10 @@
 #include "arm_compute/core/Validate.h"
 #include "arm_compute/core/Window.h"
 #include "src/core/CPP/Validate.h"
+#include "src/core/common/Registrars.h"
 #include "src/core/helpers/AutoConfiguration.h"
 #include "src/core/helpers/WindowHelpers.h"
-
-#include "src/core/common/Registrars.h"
 #include "src/cpu/kernels/softmax/list.h"
-
 namespace arm_compute
 {
 namespace cpu
@@ -44,164 +41,60 @@ namespace kernels
 {
 namespace
 {
-struct SoftmaxSelectorData
-{
-    DataType       dt;
-    const CPUInfo &ci;
-};
-using SoftmaxSelectorPtr          = std::add_pointer<bool(const SoftmaxSelectorData &data)>::type;
-using SoftmaxLogits1DMaxKernelPtr = std::add_pointer<void(const ITensor *, ITensor *, const Window &)>::type;
-using SoftmaxLogits1DKernelPtr    = std::add_pointer<void(const ITensor *, const ITensor *, void *const, ITensor *, float, bool, const Window &)>::type;
-
-struct SoftmaxLogits1DKernel
-{
-    const char              *name;
-    const SoftmaxSelectorPtr is_selected;
-    SoftmaxLogits1DKernelPtr ukernel;
-};
-
-struct SoftmaxLogits1DMaxKernel
-{
-    const char                 *name;
-    const SoftmaxSelectorPtr    is_selected;
-    SoftmaxLogits1DMaxKernelPtr ukernel;
-};
-
-static const SoftmaxLogits1DKernel available_logits_1d_kernels[] =
-{
-#if defined(ARM_COMPUTE_ENABLE_SVE)
-    {
-        "sve_fp32_softmax_logits_1d",
-        [](const SoftmaxSelectorData & data) { return (data.dt == DataType::F32) && data.ci.has_sve(); },
-        REGISTER_FP32_SVE(arm_compute::cpu::sve_fp32_softmax)
-    },
-    {
-        "sve_fp16_softmax_logits_1d",
-        [](const SoftmaxSelectorData & data) { return (data.dt == DataType::F16) && data.ci.has_sve(); },
-        REGISTER_FP16_SVE(arm_compute::cpu::sve_fp16_softmax)
-    },
-#endif /* defined(ARM_COMPUTE_ENABLE_SVE) */
-
-#if defined(ARM_COMPUTE_ENABLE_NEON)
-    {
-        "neon_fp32_softmax_logits_1d",
-        [](const SoftmaxSelectorData & data) { return (data.dt == DataType::F32); },
-        REGISTER_FP32_NEON(arm_compute::cpu::neon_fp32_softmax)
-    },
-#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
-    {
-        "neon_fp16_softmax_logits_1d",
-        [](const SoftmaxSelectorData & data) { return (data.dt == DataType::F16); },
-        REGISTER_FP16_NEON(arm_compute::cpu::neon_fp16_softmax)
-    },
-#endif /* defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) */
-#endif /* defined(ARM_COMPUTE_ENABLE_NEON) */
-
-#if defined(ARM_COMPUTE_ENABLE_SVE2)
-    {
-        "sve2_qu8_softmax_logits_1d",
-        [](const SoftmaxSelectorData & data) { return (data.dt == DataType::QASYMM8) && data.ci.has_sve2(); },
-        REGISTER_QASYMM8_SVE2(arm_compute::cpu::sve2_qasymm8_softmax)
-    },
-    {
-        "sve2_qs8_softmax_logits_1d",
-        [](const SoftmaxSelectorData & data) { return (data.dt == DataType::QASYMM8_SIGNED) && data.ci.has_sve2(); },
-        REGISTER_QASYMM8_SIGNED_SVE2(arm_compute::cpu::sve2_qasymm8_signed_softmax)
-    },
-#endif /* defined(ARM_COMPUTE_ENABLE_SVE2) */
-#if defined(ARM_COMPUTE_ENABLE_NEON)
-    {
-        "neon_qu8_softmax_logits_1d",
-        [](const SoftmaxSelectorData & data) { return (data.dt == DataType::QASYMM8); },
-        REGISTER_QASYMM8_NEON(arm_compute::cpu::neon_qasymm8_softmax)
-    },
-    {
-        "neon_qs8_softmax_logits_1d",
-        [](const SoftmaxSelectorData & data) { return (data.dt == DataType::QASYMM8_SIGNED); },
-        REGISTER_QASYMM8_SIGNED_NEON(arm_compute::cpu::neon_qasymm8_signed_softmax)
-    },
-#endif //defined(ARM_COMPUTE_ENABLE_NEON)
-};
-
-static const SoftmaxLogits1DMaxKernel available_logits_1d_max_kernels[] =
+/* Softmax Logits 1D Max - micro-kernels that identify the max value of 1D logits */
+static const std::vector<CpuLogits1DMaxKernel::SoftmaxLogits1DMaxKernel> available_kernels_max_logits =
 {
 #if defined(ARM_COMPUTE_ENABLE_SVE)
     {
         "sve_fp32_logits_1d_max",
-        [](const SoftmaxSelectorData & data) { return (data.dt == DataType::F32) && data.ci.has_sve(); },
+        [](const DataTypeISASelectorData & data) { return (data.dt == DataType::F32) && data.isa.sve; },
         REGISTER_FP32_SVE(arm_compute::cpu::sve_fp32_logits)
     },
     {
         "sve_fp16_logits_1d_max",
-        [](const SoftmaxSelectorData & data) { return (data.dt == DataType::F16) && data.ci.has_sve(); },
+        [](const DataTypeISASelectorData & data) { return (data.dt == DataType::F16) && data.isa.sve; },
         REGISTER_FP16_SVE(arm_compute::cpu::sve_fp16_logits)
     },
     {
         "sve_qu8_logits_1d_max",
-        [](const SoftmaxSelectorData & data) { return (data.dt == DataType::QASYMM8) && data.ci.has_sve(); },
+        [](const DataTypeISASelectorData & data) { return (data.dt == DataType::QASYMM8) && data.isa.sve; },
         REGISTER_QASYMM8_SVE(arm_compute::cpu::sve_qasymm8_logits)
     },
     {
         "sve_qs8_logits_1d_max",
-        [](const SoftmaxSelectorData & data) { return (data.dt == DataType::QASYMM8_SIGNED) && data.ci.has_sve(); },
+        [](const DataTypeISASelectorData & data) { return (data.dt == DataType::QASYMM8_SIGNED) && data.isa.sve; },
         REGISTER_QASYMM8_SIGNED_SVE(arm_compute::cpu::sve_qasymm8_signed_logits)
     },
 #endif /* defined(ARM_COMPUTE_ENABLE_SVE) */
 #if defined(ARM_COMPUTE_ENABLE_NEON)
     {
         "neon_fp32_logits_1d_max",
-        [](const SoftmaxSelectorData & data) { return (data.dt == DataType::F32); },
+        [](const DataTypeISASelectorData & data) { return (data.dt == DataType::F32); },
         REGISTER_FP32_NEON(arm_compute::cpu::neon_fp32_logits)
     },
 #if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
     {
         "neon_fp16_logits_1d_max",
-        [](const SoftmaxSelectorData & data) { return (data.dt == DataType::F16); },
+        [](const DataTypeISASelectorData & data) { return (data.dt == DataType::F16); },
         REGISTER_FP16_NEON(arm_compute::cpu::neon_fp16_logits)
     },
 #endif /* defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) */
     {
         "neon_qu8_logits_1d_max",
-        [](const SoftmaxSelectorData & data) { return (data.dt == DataType::QASYMM8); },
+        [](const DataTypeISASelectorData & data) { return (data.dt == DataType::QASYMM8); },
         REGISTER_QASYMM8_NEON(arm_compute::cpu::neon_qasymm8_logits)
     },
     {
         "neon_qs8_logits_1d_max",
-        [](const SoftmaxSelectorData & data) { return (data.dt == DataType::QASYMM8_SIGNED); },
+        [](const DataTypeISASelectorData & data) { return (data.dt == DataType::QASYMM8_SIGNED); },
         REGISTER_QASYMM8_SIGNED_NEON(arm_compute::cpu::neon_qasymm8_singed_logits)
     },
 #endif /* defined(ARM_COMPUTE_ENABLE_NEON) */
 };
-
-const SoftmaxLogits1DKernel *get_implementation_logits(const SoftmaxSelectorData &data)
-{
-    for(const auto &uk : available_logits_1d_kernels)
-    {
-        if(uk.is_selected({ data.dt, CPUInfo::get() }))
-        {
-            return &uk;
-        }
-    }
-    return nullptr;
-}
-
-const SoftmaxLogits1DMaxKernel *get_implementation_logits_max(const SoftmaxSelectorData &data)
-{
-    for(const auto &uk : available_logits_1d_max_kernels)
-    {
-        if(uk.is_selected({ data.dt, CPUInfo::get() }))
-        {
-            return &uk;
-        }
-    }
-    return nullptr;
-}
-
 Status validate_arguments_logits_1d_max(const ITensorInfo &input, const ITensorInfo &output)
 {
     ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(&input);
     ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&input, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::F16, DataType::F32);
-
     // Validate in case of configured output
     if(output.total_size() != 0)
     {
@@ -209,58 +102,104 @@ Status validate_arguments_logits_1d_max(const ITensorInfo &input, const ITensorI
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(&input, &output);
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output.tensor_shape(), TensorShape(input.tensor_shape()).set(0, 1));
     }
-
     return Status{};
 }
-
-} // namespace
-
+} //namespace
+const std::vector<CpuLogits1DMaxKernel::SoftmaxLogits1DMaxKernel> &CpuLogits1DMaxKernel::get_available_kernels()
+{
+    return available_kernels_max_logits; // file-scope micro-kernel table defined above; handed out by const reference
+}
 void CpuLogits1DMaxKernel::configure(const ITensorInfo *src, ITensorInfo *dst)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst);
     ARM_COMPUTE_ERROR_THROW_ON(validate_arguments_logits_1d_max(*src, *dst));
-
     // Softmax across the x dimension
     const TensorShape output_shape = TensorShape(src->tensor_shape()).set(0, 1);
     // Output auto initialization if not yet initialized
     auto_init_if_empty(*dst, output_shape, 1, src->data_type(), src->quantization_info());
-
-    const auto *uk = get_implementation_logits_max(SoftmaxSelectorData{ src->data_type(), CPUInfo::get() });
+    const auto *uk = get_implementation(DataTypeISASelectorData{ src->data_type(), CPUInfo::get().get_isa() });
     ARM_COMPUTE_ERROR_ON_NULLPTR(uk);
-
     _run_method = uk->ukernel;
     _name       = std::string("CpuLogits1DMaxKernel").append("/").append(uk->name);
-
-    Window win = calculate_max_window(*src, Steps());
+    Window win  = calculate_max_window(*src, Steps());
     ICpuKernel::configure(win);
 }
-
 Status CpuLogits1DMaxKernel::validate(const ITensorInfo *src, const ITensorInfo *dst)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst);
     ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments_logits_1d_max(*src, *dst));
-
     return Status{};
 }
-
 void CpuLogits1DMaxKernel::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info)
 {
     ARM_COMPUTE_UNUSED(info);
     ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
     ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICpuKernel::window(), window);
     ARM_COMPUTE_ERROR_ON(_run_method == nullptr);
-
     const auto src = tensors.get_const_tensor(TensorType::ACL_SRC);
     auto       dst = tensors.get_tensor(TensorType::ACL_DST);
-
     _run_method(src, dst, window);
 }
-
 const char *CpuLogits1DMaxKernel::name() const
 {
     return _name.c_str();
 }
 
+/* Softmax Logits 1D - softmax/log-softmax micro-kernels (F32/F16/QASYMM8/QASYMM8_SIGNED) using a pre-computed max */
+template <bool                                                                             IS_LOG>
+static const std::vector<typename CpuLogits1DSoftmaxKernel<IS_LOG>::SoftmaxLogits1DKernel> available_kernels_logits =
+{
+#if defined(ARM_COMPUTE_ENABLE_SVE)
+    {
+        "sve_fp32_softmax_logits_1d",
+        [](const DataTypeISASelectorData & data) { return (data.dt == DataType::F32) && data.isa.sve; },
+        REGISTER_FP32_SVE(arm_compute::cpu::sve_fp32_softmax)
+    },
+    {
+        "sve_fp16_softmax_logits_1d",
+        [](const DataTypeISASelectorData & data) { return (data.dt == DataType::F16) && data.isa.sve; },
+        REGISTER_FP16_SVE(arm_compute::cpu::sve_fp16_softmax)
+    },
+#endif /* defined(ARM_COMPUTE_ENABLE_SVE) */
+#if defined(ARM_COMPUTE_ENABLE_NEON)
+    {
+        "neon_fp32_softmax_logits_1d",
+        [](const DataTypeISASelectorData & data) { return (data.dt == DataType::F32); },
+        REGISTER_FP32_NEON(arm_compute::cpu::neon_fp32_softmax)
+    },
+#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
+    {
+        "neon_fp16_softmax_logits_1d",
+        [](const DataTypeISASelectorData & data) { return (data.dt == DataType::F16); },
+        REGISTER_FP16_NEON(arm_compute::cpu::neon_fp16_softmax)
+    },
+#endif /* defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) */
+#endif /* defined(ARM_COMPUTE_ENABLE_NEON) */
+#if defined(ARM_COMPUTE_ENABLE_SVE2)
+    {
+        "sve2_qu8_softmax_logits_1d",
+        [](const DataTypeISASelectorData & data) { return (data.dt == DataType::QASYMM8) && data.isa.sve2; },
+        REGISTER_QASYMM8_SVE2(arm_compute::cpu::sve2_qasymm8_softmax)
+    },
+    {
+        "sve2_qs8_softmax_logits_1d",
+        [](const DataTypeISASelectorData & data) { return (data.dt == DataType::QASYMM8_SIGNED) && data.isa.sve2; },
+        REGISTER_QASYMM8_SIGNED_SVE2(arm_compute::cpu::sve2_qasymm8_signed_softmax)
+    },
+#endif /* defined(ARM_COMPUTE_ENABLE_SVE2) */
+#if defined(ARM_COMPUTE_ENABLE_NEON)
+    {
+        "neon_qu8_softmax_logits_1d",
+        [](const DataTypeISASelectorData & data) { return (data.dt == DataType::QASYMM8); },
+        REGISTER_QASYMM8_NEON(arm_compute::cpu::neon_qasymm8_softmax)
+    },
+    {
+        "neon_qs8_softmax_logits_1d",
+        [](const DataTypeISASelectorData & data) { return (data.dt == DataType::QASYMM8_SIGNED); },
+        REGISTER_QASYMM8_SIGNED_NEON(arm_compute::cpu::neon_qasymm8_signed_softmax)
+    },
+#endif //defined(ARM_COMPUTE_ENABLE_NEON)
+};
 namespace
 {
 Status validate_arguments_logits_softmax(const ITensorInfo &src, const ITensorInfo &max,
@@ -270,14 +209,11 @@ Status validate_arguments_logits_softmax(const ITensorInfo &src, const ITensorIn
     // Check input
     ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(&src);
     ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::F16, DataType::F32);
-
     const bool is_quantized_asymmetric = is_data_type_quantized_asymmetric(src.data_type());
-
     // Check max
     ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&src, &max);
     ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(TensorShape(src.tensor_shape()).set(0, 1), max.tensor_shape());
     ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(&src, &max);
-
     // Check output if configured
     if(dst.total_size() != 0)
     {
@@ -286,7 +222,6 @@ Status validate_arguments_logits_softmax(const ITensorInfo &src, const ITensorIn
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(&src, &dst);
         ARM_COMPUTE_RETURN_ERROR_ON(dst.quantization_info() != output_quantization);
     }
-
     // Check tmp if configured
     if(tmp.total_size() != 0)
     {
@@ -296,84 +231,69 @@ Status validate_arguments_logits_softmax(const ITensorInfo &src, const ITensorIn
         // on the maximum number of threads that will run in parallel.
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(&src, &tmp);
     }
-
     return Status{};
 }
 } // namespace
-
+template <bool IS_LOG>
+const std::vector<typename CpuLogits1DSoftmaxKernel<IS_LOG>::SoftmaxLogits1DKernel> &CpuLogits1DSoftmaxKernel<IS_LOG>::get_available_kernels()
+{
+    return available_kernels_logits<IS_LOG>; // variable template: the table is selected by the IS_LOG instantiation
+}
 template <bool IS_LOG>
 void CpuLogits1DSoftmaxKernel<IS_LOG>::configure(const ITensorInfo *src, const ITensorInfo *max, ITensorInfo *dst, const float beta, ITensorInfo *tmp)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(src, max, dst, tmp);
     ARM_COMPUTE_ERROR_THROW_ON(validate_arguments_logits_softmax(*src, *max, *dst, beta, *tmp, IS_LOG));
-
     // Configure kernel window
     const bool is_quantized_asymmetric = is_data_type_quantized_asymmetric(src->data_type());
-
     // Output auto initialization if not yet initialized
     const QuantizationInfo output_quantization = is_quantized_asymmetric ? arm_compute::get_softmax_output_quantization_info(src->data_type(), IS_LOG) : dst->quantization_info();
     auto_init_if_empty(*dst, TensorInfo(*src).set_quantization_info(output_quantization).reset_padding());
-
     // Tmp auto initialization if not yet initialized
     const DataType tmp_data_type = is_quantized_asymmetric ? DataType::F32 : src->data_type();
     auto_init_if_empty(*tmp, TensorInfo(*src).set_data_type(tmp_data_type).reset_padding());
-
-    const auto *uk = get_implementation_logits(SoftmaxSelectorData{ src->data_type(), CPUInfo::get() });
+    const auto *uk = CpuLogits1DSoftmaxKernel<IS_LOG>::get_implementation(DataTypeISASelectorData{ src->data_type(), CPUInfo::get().get_isa() });
     ARM_COMPUTE_ERROR_ON_NULLPTR(uk);
-
     std::string kernel_name = IS_LOG ? std::string("CpuLogits1DLogSoftmaxKernel") : std::string("CpuLogits1DSoftmaxKernel");
-
-    _beta       = beta;
-    _run_method = uk->ukernel;
-    _name       = kernel_name.append("/").append(uk->name);
-
+    _beta                   = beta;
+    _run_method             = uk->ukernel;
+    _name                   = kernel_name.append("/").append(uk->name);
     // Configure kernel window
     Window win = calculate_max_window(*max, Steps());
-
-    ICpuKernel::configure(win);
+    ICPPKernel::configure(win);
 }
-
 template <bool IS_LOG>
 Status CpuLogits1DSoftmaxKernel<IS_LOG>::validate(const ITensorInfo *src, const ITensorInfo *max,
                                                   const ITensorInfo *dst, const float beta, const ITensorInfo *tmp)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(src, max, dst, tmp);
     ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments_logits_softmax(*src, *max, *dst, beta, *tmp, IS_LOG));
-
     return Status{};
 }
-
 template <bool IS_LOG>
 void CpuLogits1DSoftmaxKernel<IS_LOG>::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info)
 {
     ARM_COMPUTE_UNUSED(info);
     ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
-    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICpuKernel::window(), window);
+    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICPPKernel::window(), window);
     ARM_COMPUTE_ERROR_ON(_run_method == nullptr);
-
-    const auto src = tensors.get_const_tensor(TensorType::ACL_SRC_0);
-    auto       max = tensors.get_tensor(TensorType::ACL_SRC_1);
-    auto       dst = tensors.get_tensor(TensorType::ACL_DST_0);
-    auto       tmp = tensors.get_tensor(TensorType::ACL_DST_1);
-
+    const auto         src                               = tensors.get_const_tensor(TensorType::ACL_SRC_0);
+    auto               max                               = tensors.get_tensor(TensorType::ACL_SRC_1);
+    auto               dst                               = tensors.get_tensor(TensorType::ACL_DST_0);
+    auto               tmp                               = tensors.get_tensor(TensorType::ACL_DST_1);
     const unsigned int num_elems_processed_per_iteration = src->info()->valid_region().shape.x();
     const unsigned int tmp_size_for_thread               = tmp->info()->element_size() * num_elems_processed_per_iteration;
-
     ARM_COMPUTE_ERROR_ON(tmp->info()->total_size() < (info.num_threads * tmp_size_for_thread));
-
     void *tmp_for_thread = tmp->buffer() + (info.thread_id * tmp_size_for_thread);
     _run_method(src, max, tmp_for_thread, dst, _beta, IS_LOG, window);
 }
-
 template <bool IS_LOG>
 const char    *CpuLogits1DSoftmaxKernel<IS_LOG>::name() const
 {
     return _name.c_str();
 }
-
 template class CpuLogits1DSoftmaxKernel<true>;
 template class CpuLogits1DSoftmaxKernel<false>;
-
 } // namespace kernels
 } // namespace cpu
 } // namespace arm_compute
diff --git a/src/cpu/kernels/CpuSoftmaxKernel.h b/src/cpu/kernels/CpuSoftmaxKernel.h
index f317662620..df7d3f7d9b 100644
--- a/src/cpu/kernels/CpuSoftmaxKernel.h
+++ b/src/cpu/kernels/CpuSoftmaxKernel.h
@@ -23,10 +23,8 @@
  */
 #ifndef ARM_COMPUTE_CPU_SOFTMAX_KERNEL_H
 #define ARM_COMPUTE_CPU_SOFTMAX_KERNEL_H
-
 #include "src/core/common/Macros.h"
 #include "src/cpu/ICpuKernel.h"
-
 namespace arm_compute
 {
 namespace cpu
@@ -34,8 +32,11 @@ namespace cpu
 namespace kernels
 {
 /** Interface for the identifying the max value of 1D Logits */
-class CpuLogits1DMaxKernel : public NewICpuKernel<CpuLogits1DMaxKernel>
+class CpuLogits1DMaxKernel : public ICpuKernel<CpuLogits1DMaxKernel>
 {
+private:
+    using SoftmaxLogits1DMaxKernelPtr = std::add_pointer<void(const ITensor *, ITensor *, const Window &)>::type;
+
 public:
     CpuLogits1DMaxKernel() = default;
     ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(CpuLogits1DMaxKernel);
@@ -52,27 +53,31 @@ public:
      * @return a status
      */
     static Status validate(const ITensorInfo *src, const ITensorInfo *dst);
-
     // Inherited methods overridden:
     void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override;
     const char *name() const override;
-
-private:
-    using SoftmaxLogits1DMaxKernelPtr = std::add_pointer<void(const ITensor *, ITensor *, const Window &)>::type;
+    struct SoftmaxLogits1DMaxKernel
+    {
+        const char                  *name;
+        const DataTypeISASelectorPtr is_selected;
+        SoftmaxLogits1DMaxKernelPtr  ukernel;
+    };
+    static const std::vector<SoftmaxLogits1DMaxKernel> &get_available_kernels();
 
 private:
     SoftmaxLogits1DMaxKernelPtr _run_method{ nullptr };
     std::string                 _name{};
 };
-
 /** Interface for softmax computation for QASYMM8 with pre-computed max. */
 template <bool IS_LOG = false>
-class CpuLogits1DSoftmaxKernel : public NewICpuKernel<CpuLogits1DSoftmaxKernel<IS_LOG>>
+class CpuLogits1DSoftmaxKernel : public ICpuKernel<CpuLogits1DSoftmaxKernel<IS_LOG>>
 {
+private:
+    using SoftmaxLogits1DKernelPtr = std::add_pointer<void(const ITensor *, const ITensor *, void *const, ITensor *, float, bool, const Window &)>::type;
+
 public:
     CpuLogits1DSoftmaxKernel() = default;
     ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(CpuLogits1DSoftmaxKernel);
-
     /** Set the input and output tensors.
      *
      * @param[in]  src  Source tensor info. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32.
@@ -92,13 +97,16 @@ public:
      */
     static Status validate(const ITensorInfo *src, const ITensorInfo *max,
                            const ITensorInfo *dst, const float beta, const ITensorInfo *tmp);
-
     // Inherited methods overridden:
     void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override;
     const char *name() const override;
-
-private:
-    using SoftmaxLogits1DKernelPtr = std::add_pointer<void(const ITensor *, const ITensor *, void *const, ITensor *, float, bool, const Window &)>::type;
+    struct SoftmaxLogits1DKernel
+    {
+        const char                  *name;
+        const DataTypeISASelectorPtr is_selected;
+        SoftmaxLogits1DKernelPtr     ukernel;
+    };
+    static const std::vector<SoftmaxLogits1DKernel> &get_available_kernels();
 
 private:
     float                    _beta{ 1.0f };
diff --git a/src/cpu/kernels/CpuSubKernel.h b/src/cpu/kernels/CpuSubKernel.h
index 70f34b1b57..323a3f1316 100644
--- a/src/cpu/kernels/CpuSubKernel.h
+++ b/src/cpu/kernels/CpuSubKernel.h
@@ -34,7 +34,7 @@ namespace cpu
 namespace kernels
 {
 /** Interface for the kernel to perform subtraction between two tensors */
-class CpuSubKernel : public NewICpuKernel<CpuSubKernel>
+class CpuSubKernel : public ICpuKernel<CpuSubKernel>
 {
 private:
     using SubKernelPtr = std::add_pointer<void(const ITensor *, const ITensor *, ITensor *, const ConvertPolicy &, const Window &)>::type;
diff --git a/src/cpu/kernels/CpuTransposeKernel.h b/src/cpu/kernels/CpuTransposeKernel.h
index 7e1ee5f73d..cb85daeb40 100644
--- a/src/cpu/kernels/CpuTransposeKernel.h
+++ b/src/cpu/kernels/CpuTransposeKernel.h
@@ -34,7 +34,7 @@ namespace cpu
 namespace kernels
 {
 /** Kernel which transposes the elements of a matrix */
-class CpuTransposeKernel : public NewICpuKernel<CpuTransposeKernel>
+class CpuTransposeKernel : public ICpuKernel<CpuTransposeKernel>
 {
 public:
     CpuTransposeKernel() = default;
diff --git a/src/cpu/kernels/CpuWeightsReshapeKernel.h b/src/cpu/kernels/CpuWeightsReshapeKernel.h
index 6c2d7ef6f9..1a260edc96 100644
--- a/src/cpu/kernels/CpuWeightsReshapeKernel.h
+++ b/src/cpu/kernels/CpuWeightsReshapeKernel.h
@@ -56,7 +56,7 @@ namespace kernels
  * \end{array} \right)
  * @f]
  */
-class CpuWeightsReshapeKernel : public NewICpuKernel<CpuWeightsReshapeKernel>
+class CpuWeightsReshapeKernel : public ICpuKernel<CpuWeightsReshapeKernel>
 {
 public:
     /** Default constructor */
diff --git a/src/cpu/kernels/CpuWinogradConv2dKernel.h b/src/cpu/kernels/CpuWinogradConv2dKernel.h
index 0c4e28c394..6909216d94 100644
--- a/src/cpu/kernels/CpuWinogradConv2dKernel.h
+++ b/src/cpu/kernels/CpuWinogradConv2dKernel.h
@@ -35,7 +35,7 @@ namespace arm_compute
 namespace cpu
 {
 /** Interface for the kernel to perform Winograd input transform. */
-class ICpuWinogradConv2dTransformInputKernel : public NewICpuKernel<ICpuWinogradConv2dTransformInputKernel>
+class ICpuWinogradConv2dTransformInputKernel : public ICpuKernel<ICpuWinogradConv2dTransformInputKernel>
 {
 public:
     /** Get the working space required to perform the transformation.
@@ -216,7 +216,7 @@ private:
 };
 
 /** Interface for the kernel to perform Winograd output transform. */
-class ICpuWinogradConv2dTransformOutputKernel : public NewICpuKernel<ICpuWinogradConv2dTransformOutputKernel>
+class ICpuWinogradConv2dTransformOutputKernel : public ICpuKernel<ICpuWinogradConv2dTransformOutputKernel>
 {
 public:
     /** Get the working space required to perform the transformation.
@@ -418,7 +418,7 @@ private:
 };
 
 /** Interface for the kernel to perform Winograd weights transform. */
-class ICpuWinogradConv2dTransformWeightsKernel : public NewICpuKernel<ICpuWinogradConv2dTransformWeightsKernel>
+class ICpuWinogradConv2dTransformWeightsKernel : public ICpuKernel<ICpuWinogradConv2dTransformWeightsKernel>
 {
 public:
     /** Prevent instances of this class from being copied (As this class contains pointers) */
diff --git a/src/cpu/kernels/internal/CpuDepthwiseConv2dAssemblyWrapperKernel.h b/src/cpu/kernels/internal/CpuDepthwiseConv2dAssemblyWrapperKernel.h
index ea51d5d54d..a32a7a3ec8 100644
--- a/src/cpu/kernels/internal/CpuDepthwiseConv2dAssemblyWrapperKernel.h
+++ b/src/cpu/kernels/internal/CpuDepthwiseConv2dAssemblyWrapperKernel.h
@@ -45,7 +45,7 @@ namespace cpu
 namespace kernels
 {
 /** This class is a wrapper for the depthwise convolution assembly kernels.  */
-class CpuDepthwiseConv2dAssemblyWrapperKernel final : public NewICpuKernel<CpuDepthwiseConv2dAssemblyWrapperKernel>
+class CpuDepthwiseConv2dAssemblyWrapperKernel final : public ICpuKernel<CpuDepthwiseConv2dAssemblyWrapperKernel>
 {
 public:
     /** Default constructor */
diff --git a/src/cpu/kernels/internal/CpuPool2dAssemblyWrapperKernel.h b/src/cpu/kernels/internal/CpuPool2dAssemblyWrapperKernel.h
index daa3168beb..8713d5c54d 100644
--- a/src/cpu/kernels/internal/CpuPool2dAssemblyWrapperKernel.h
+++ b/src/cpu/kernels/internal/CpuPool2dAssemblyWrapperKernel.h
@@ -46,7 +46,7 @@ namespace kernels
   * execute a single assembly kernel in the context of an NEFunction.
   *
   */
-class CpuPool2dAssemblyWrapperKernel final : public NewICpuKernel<CpuPool2dAssemblyWrapperKernel>
+class CpuPool2dAssemblyWrapperKernel final : public ICpuKernel<CpuPool2dAssemblyWrapperKernel>
 {
 public:
     /** Constructor
diff --git a/src/cpu/operators/CpuConcatenate.h b/src/cpu/operators/CpuConcatenate.h
index 001ac68162..eb11926b48 100644
--- a/src/cpu/operators/CpuConcatenate.h
+++ b/src/cpu/operators/CpuConcatenate.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021-2022 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -67,7 +67,7 @@ public:
     void run(ITensorPack &tensors) override;
 
 private:
-    std::vector<std::unique_ptr<ICpuKernel>> _concat_kernels{};
+    std::vector<std::unique_ptr<ICPPKernel>> _concat_kernels{};
     unsigned int                             _num_srcs{ 0 };
     unsigned int                             _axis{ 0 };
 };
diff --git a/src/cpu/operators/CpuSoftmax.h b/src/cpu/operators/CpuSoftmax.h
index 20f3f006d3..64df8704f9 100644
--- a/src/cpu/operators/CpuSoftmax.h
+++ b/src/cpu/operators/CpuSoftmax.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021-2022 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -92,8 +92,8 @@ private:
 
     CpuPermute                  _permute_input;
     CpuPermute                  _permute_output;
-    std::unique_ptr<ICpuKernel> _max_kernel;
-    std::unique_ptr<ICpuKernel> _softmax_kernel;
+    std::unique_ptr<ICPPKernel> _max_kernel;
+    std::unique_ptr<ICPPKernel> _softmax_kernel;
 
     TensorInfo _max;
     TensorInfo _tmp;
-- 
cgit v1.2.1