diff options
-rw-r--r-- | Android.bp | 5 | ||||
-rw-r--r-- | arm_compute/runtime/NEON/functions/NEElementwiseOperations.h | 231 | ||||
-rw-r--r-- | arm_compute/runtime/NEON/functions/NEPReluLayer.h | 8 | ||||
-rw-r--r-- | docs/00_introduction.dox | 10 | ||||
-rw-r--r-- | src/core/NEON/NEKernels.h | 1 | ||||
-rw-r--r-- | src/core/cpu/kernels/CpuElementwiseKernel.cpp (renamed from src/core/NEON/kernels/NEElementwiseOperationKernel.cpp) | 165 | ||||
-rw-r--r-- | src/core/cpu/kernels/CpuElementwiseKernel.h (renamed from src/core/NEON/kernels/NEElementwiseOperationKernel.h) | 105 | ||||
-rw-r--r-- | src/core/cpu/kernels/elementwise/neon/elementwise_list.h (renamed from src/core/NEON/kernels/elementwise/impl/elementwise_list.h) | 0 | ||||
-rw-r--r-- | src/core/cpu/kernels/elementwise/neon/elementwise_quantized_list.h (renamed from src/core/NEON/kernels/elementwise/impl/elementwise_quantized_list.h) | 2 | ||||
-rw-r--r-- | src/core/cpu/kernels/elementwise/sve/elementwise_list.h (renamed from src/core/SVE/kernels/elementwise/impl/elementwise_list.h) | 0 | ||||
-rw-r--r-- | src/core/cpu/kernels/elementwise/sve/elementwise_quantized_list.h (renamed from src/core/SVE/kernels/elementwise/impl/elementwise_quantized_list.h) | 2 | ||||
-rw-r--r-- | src/runtime/NEON/functions/NEElementwiseOperations.cpp (renamed from src/runtime/NEON/functions/NEElementwiseOperators.cpp) | 187 | ||||
-rw-r--r-- | src/runtime/NEON/functions/NEPReluLayer.cpp | 8 | ||||
-rw-r--r-- | src/runtime/cpu/operators/CpuElementwise.cpp | 125 | ||||
-rw-r--r-- | src/runtime/cpu/operators/CpuElementwise.h | 234 |
15 files changed, 591 insertions, 492 deletions
diff --git a/Android.bp b/Android.bp index 7851eb6cee..6984bbe7ea 100644 --- a/Android.bp +++ b/Android.bp @@ -245,7 +245,6 @@ cc_library_static { "src/core/NEON/kernels/NEDilateKernel.cpp", "src/core/NEON/kernels/NEDirectConvolutionLayerKernel.cpp", "src/core/NEON/kernels/NEDirectConvolutionLayerOutputStageKernel.cpp", - "src/core/NEON/kernels/NEElementwiseOperationKernel.cpp", "src/core/NEON/kernels/NEElementwiseUnaryKernel.cpp", "src/core/NEON/kernels/NEErodeKernel.cpp", "src/core/NEON/kernels/NEFFTDigitReverseKernel.cpp", @@ -409,6 +408,7 @@ cc_library_static { "src/core/cpu/kernels/CpuConcatenateHeightKernel.cpp", "src/core/cpu/kernels/CpuConcatenateWidthKernel.cpp", "src/core/cpu/kernels/CpuCopyKernel.cpp", + "src/core/cpu/kernels/CpuElementwiseKernel.cpp", "src/core/cpu/kernels/CpuFillKernel.cpp", "src/core/cpu/kernels/CpuFloorKernel.cpp", "src/core/cpu/kernels/CpuPermuteKernel.cpp", @@ -678,7 +678,7 @@ cc_library_static { "src/runtime/NEON/functions/NEDetectionPostProcessLayer.cpp", "src/runtime/NEON/functions/NEDilate.cpp", "src/runtime/NEON/functions/NEDirectConvolutionLayer.cpp", - "src/runtime/NEON/functions/NEElementwiseOperators.cpp", + "src/runtime/NEON/functions/NEElementwiseOperations.cpp", "src/runtime/NEON/functions/NEElementwiseUnaryLayer.cpp", "src/runtime/NEON/functions/NEEqualizeHistogram.cpp", "src/runtime/NEON/functions/NEErode.cpp", @@ -787,6 +787,7 @@ cc_library_static { "src/runtime/cpu/operators/CpuAdd.cpp", "src/runtime/cpu/operators/CpuConcatenate.cpp", "src/runtime/cpu/operators/CpuCopy.cpp", + "src/runtime/cpu/operators/CpuElementwise.cpp", "src/runtime/cpu/operators/CpuFill.cpp", "src/runtime/cpu/operators/CpuFloor.cpp", "src/runtime/cpu/operators/CpuPermute.cpp", diff --git a/arm_compute/runtime/NEON/functions/NEElementwiseOperations.h b/arm_compute/runtime/NEON/functions/NEElementwiseOperations.h index 5c755e96ac..44b70bbe85 100644 --- a/arm_compute/runtime/NEON/functions/NEElementwiseOperations.h +++ 
b/arm_compute/runtime/NEON/functions/NEElementwiseOperations.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2020 Arm Limited. + * Copyright (c) 2018-2021 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -32,7 +32,7 @@ namespace arm_compute { class ITensor; -/** Basic function to run @ref NEArithmeticOperationKernel for max +/** Basic function to run @ref cpu::kernels::CpuArithmeticKernel for max * * @note The tensor data type for the inputs must be QASYMM8/QASYMM8_SIGNED/S16/F16/S32/F32. * @note The function performs a max operation between two tensors. @@ -60,7 +60,7 @@ public: * @param[in] act_info (Optional) Activation layer information in case of a fused activation. Currently not supported. */ void configure(ITensor *input1, ITensor *input2, ITensor *output, const ActivationLayerInfo &act_info = ActivationLayerInfo()); - /** Static function to check if given info will lead to a valid configuration of @ref NEArithmeticOperationKernel for max + /** Static function to check if given info will lead to a valid configuration of @ref cpu::kernels::CpuArithmeticKernel for max * * @param[in] input1 First tensor input info. Data types supported: QASYMM8/QASYMM8_SIGNED/S16/F16/S32/F32. * @param[in] input2 Second tensor input info. Data types supported: Same as @p input1. @@ -79,7 +79,7 @@ private: std::unique_ptr<Impl> _impl; }; -/** Basic function to run @ref NEArithmeticOperationKernel for min +/** Basic function to run @ref cpu::kernels::CpuArithmeticKernel for min * * @note The tensor data type for the inputs must be QASYMM8/QASYMM8_SIGNED/S16/F16/S32/F32. * @note The function performs a min operation between two tensors. @@ -107,7 +107,7 @@ public: * @param[in] act_info (Optional) Activation layer information in case of a fused activation. Currently not supported. 
*/ void configure(ITensor *input1, ITensor *input2, ITensor *output, const ActivationLayerInfo &act_info = ActivationLayerInfo()); - /** Static function to check if given info will lead to a valid configuration of @ref NEArithmeticOperationKernel for min + /** Static function to check if given info will lead to a valid configuration of @ref cpu::kernels::CpuArithmeticKernel for min * * @param[in] input1 First tensor input info. Data types supported: QASYMM8/QASYMM8_SIGNED/S16/F16/S32/F32. * @param[in] input2 Second tensor input info. Data types supported: Same as @p input1. @@ -126,7 +126,7 @@ private: std::unique_ptr<Impl> _impl; }; -/** Basic function to run @ref NEArithmeticOperationKernel for squared difference +/** Basic function to run @ref cpu::kernels::CpuArithmeticKernel for squared difference * * @note The tensor data type for the inputs must be QASYMM8/QASYMM8_SIGNED/S16/F16/S32/F32. * @note The function performs a squared different operation between two tensors (i.e., out[i] = (in1[i] - in2[i])^2 @@ -154,7 +154,7 @@ public: * @param[in] act_info (Optional) Activation layer information in case of a fused activation. Currently not supported. */ void configure(ITensor *input1, ITensor *input2, ITensor *output, const ActivationLayerInfo &act_info = ActivationLayerInfo()); - /** Static function to check if given info will lead to a valid configuration of @ref NEArithmeticOperationKernel for squared difference + /** Static function to check if given info will lead to a valid configuration of @ref cpu::kernels::CpuArithmeticKernel for squared difference * * @param[in] input1 First tensor input info. Data types supported: QASYMM8/QASYMM8_SIGNED/S16/F16/S32/F32. * @param[in] input2 Second tensor input info. Data types supported: Same as @p input1. 
@@ -173,7 +173,7 @@ private: std::unique_ptr<Impl> _impl; }; -/** Basic function to run @ref NEArithmeticOperationKernel for division +/** Basic function to run @ref cpu::kernels::CpuArithmeticKernel for division * * @note The tensor data type for the inputs must be F16/F32. * @note The function performs a squared different operation between two tensors (i.e., out[i] = in1[i] / in2[i]) @@ -201,7 +201,7 @@ public: * @param[in] act_info (Optional) Activation layer information in case of a fused activation. Currently not supported. */ void configure(ITensor *input1, ITensor *input2, ITensor *output, const ActivationLayerInfo &act_info = ActivationLayerInfo()); - /** Static function to check if given info will lead to a valid configuration of @ref NEArithmeticOperationKernel for division + /** Static function to check if given info will lead to a valid configuration of @ref cpu::kernels::CpuArithmeticKernel for division * * @param[in] input1 First tensor input info. Data types supported: F16/F32. * @param[in] input2 Second tensor input info. Data types supported: Same as @p input1. @@ -220,7 +220,7 @@ private: std::unique_ptr<Impl> _impl; }; -/** Basic function to run @ref NEArithmeticOperationKernel for power +/** Basic function to run @ref cpu::kernels::CpuArithmeticKernel for power * * @note The tensor data type for the inputs must be F16/F32. * @note The function performs a elementwise power of in1 to in2 (i.e., out[i] = in1[i] ^ in2[i]) @@ -249,7 +249,7 @@ public: * @param[in] act_info (Optional) Activation layer information in case of a fused activation. Currently not supported. 
*/ void configure(ITensor *input1, ITensor *input2, ITensor *output, const ActivationLayerInfo &act_info = ActivationLayerInfo()); - /** Static function to check if given info will lead to a valid configuration of @ref NEArithmeticOperationKernel for power + /** Static function to check if given info will lead to a valid configuration of @ref cpu::kernels::CpuArithmeticKernel for power * * @param[in] input1 First tensor input info. Data types supported: F16/F32. * @param[in] input2 Second tensor input info. Data types supported: Same as @p input1. @@ -268,7 +268,7 @@ private: std::unique_ptr<Impl> _impl; }; -/** Basic function to run @ref NEComparisonOperationKernel. +/** Basic function to run @ref cpu::kernels::CpuComparisonKernel. * * @note The tensor data type for the inputs must be U8/QASYMM8/QASYMM8_SIGNED/S16/F16/S32/F32. * @note The function performs a comparison operation between two tensors. @@ -296,7 +296,7 @@ public: * @param[in] op Comparison Operation to be performed. */ void configure(ITensor *input1, ITensor *input2, ITensor *output, ComparisonOperation op); - /** Static function to check if given info will lead to a valid configuration of @ref NEComparisonOperationKernel + /** Static function to check if given info will lead to a valid configuration of @ref cpu::kernels::CpuComparisonKernel * * @param[in] input1 First tensor input info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/F16/S32/F32. * @param[in] input2 Second tensor input info. Data types supported: Same as @p input1. @@ -315,7 +315,7 @@ private: std::unique_ptr<Impl> _impl; }; -/** Basic function to run @ref NEComparisonOperationKernel +/** Basic function to run @ref cpu::kernels::CpuComparisonKernel * * @note The tensor data type for the inputs must be U8/QASYMM8/QASYMM8_SIGNED/S16/F16/S32/F32. * @note The function performs a comparison operation between two tensors. @@ -343,7 +343,7 @@ public: * @param[out] output Output tensor. Data types supported: U16/U32. 
*/ void configure(ITensor *input1, ITensor *input2, ITensor *output); - /** Static function to check if given info will lead to a valid configuration of @ref NEComparisonOperationKernel + /** Static function to check if given info will lead to a valid configuration of @ref cpu::kernels::CpuComparisonKernel * * @param[in] input1 First tensor input info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/F16/S32/F32. * @param[in] input2 Second tensor input info. Data types supported: Same as @p input1. @@ -374,206 +374,5 @@ using NELess = NEElementwiseComparisonStatic<ComparisonOperation::Less>; /** Basic function to run less-equal comparison. */ using NELessEqual = NEElementwiseComparisonStatic<ComparisonOperation::LessEqual>; -namespace experimental -{ -/** Basic function to run @ref NEArithmeticOperationKernel for max - * - * @note The tensor data type for the inputs must be QASYMM8/QASYMM8_SIGNED/S16/F16/S32/F32. - * @note The function performs a max operation between two tensors. - */ -class NEElementwiseMax : public INEOperator -{ -public: - /** Initialise the kernel's inputs, output and conversion policy. - * - * @param[in, out] input1 First tensor input info. Data types supported: QASYMM8/QASYMM8_SIGNED/S16/F16/S32/F32. - * @param[in, out] input2 Second tensor input info. Data types supported: Same as @p input1. - * @param[out] output Output tensor info. Data types supported: Same as @p input1. - */ - void configure(const ITensorInfo *input1, const ITensorInfo *input2, ITensorInfo *output); - /** Static function to check if given info will lead to a valid configuration of @ref NEArithmeticOperationKernel for max - * - * @param[in] input1 First tensor input info. Data types supported: QASYMM8/QASYMM8_SIGNED/S16/F16/S32/F32. - * @param[in] input2 Second tensor input info. Data types supported: Same as @p input1. - * @param[in] output Output tensor info. Data types supported: Same as @p input1. 
- * - * @return a status - */ - static Status validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output); -}; - -/** Basic function to run @ref NEArithmeticOperationKernel for min - * - * @note The tensor data type for the inputs must be QASYMM8/QASYMM8_SIGNED/S16/F16/S32/F32. - * @note The function performs a min operation between two tensors. - */ -class NEElementwiseMin : public INEOperator -{ -public: - /** Initialise the kernel's inputs, output and conversion policy. - * - * @param[in, out] input1 First tensor input info. Data types supported: QASYMM8/QASYMM8_SIGNED/S16/F16/S32/F32. - * @param[in, out] input2 Second tensor input info. Data types supported: Same as @p input1. - * @param[out] output Output tensor info. Data types supported: Same as @p input1. - */ - void configure(const ITensorInfo *input1, const ITensorInfo *input2, ITensorInfo *output); - /** Static function to check if given info will lead to a valid configuration of @ref NEArithmeticOperationKernel for min - * - * @param[in] input1 First tensor input info. Data types supported: QASYMM8/QASYMM8_SIGNED/S16/F16/S32/F32. - * @param[in] input2 Second tensor input info. Data types supported: Same as @p input1. - * @param[in] output Output tensor info. Data types supported: Same as @p input1. - * - * @return a status - */ - static Status validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output); -}; - -/** Basic function to run @ref NEArithmeticOperationKernel for squared difference - * - * @note The tensor data type for the inputs must be QASYMM8/QASYMM8_SIGNED/S16/F16/S32/F32. - * @note The function performs a squared different operation between two tensors (i.e., out[i] = (in1[i] - in2[i])^2 - */ -class NEElementwiseSquaredDiff : public INEOperator -{ -public: - /** Initialise the kernel's inputs, output and conversion policy. - * - * @param[in, out] input1 First tensor input info. 
Data types supported: QASYMM8/QASYMM8_SIGNED/S16/F16/S32/F32. - * @param[in, out] input2 Second tensor input info. Data types supported: Same as @p input1. - * @param[out] output Output tensor info. Data types supported: Same as @p input1. - */ - void configure(const ITensorInfo *input1, const ITensorInfo *input2, ITensorInfo *output); - /** Static function to check if given info will lead to a valid configuration of @ref NEArithmeticOperationKernel for squared difference - * - * @param[in] input1 First tensor input info. Data types supported: QASYMM8/QASYMM8_SIGNED/S16/F16/S32/F32. - * @param[in] input2 Second tensor input info. Data types supported: Same as @p input1. - * @param[in] output Output tensor info. Data types supported: Same as @p input1. - * - * @return a status - */ - static Status validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output); -}; - -/** Basic function to run @ref NEArithmeticOperationKernel for division - * - * @note The tensor data type for the inputs must be S32/F16/F32. - * @note The function performs a division operation between two tensors (i.e., out[i] = in1[i] / in2[i]) - */ -class NEElementwiseDivision : public INEOperator -{ -public: - /** Initialise the kernel's inputs, output and conversion policy. - * - * @param[in, out] input1 First tensor input info. Data types supported: S32/F16/F32. - * @param[in, out] input2 Second tensor input info. Data types supported: Same as @p input1. - * @param[out] output Output tensor info. Data types supported: Same as @p input1. - */ - void configure(const ITensorInfo *input1, const ITensorInfo *input2, ITensorInfo *output); - /** Static function to check if given info will lead to a valid configuration of @ref NEArithmeticOperationKernel for division - * - * @param[in] input1 First tensor input info. Data types supported: S32/F16/F32. - * @param[in] input2 Second tensor input info. Data types supported: Same as @p input1. 
- * @param[in] output Output tensor info. Data types supported: Same as @p input1. - * - * @return a status - */ - static Status validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output); -}; - -/** Basic function to run @ref NEArithmeticOperationKernel for power - * - * @note The tensor data type for the inputs must be F16/F32. - * @note The function performs a elementwise power of in1 to in2 (i.e., out[i] = in1[i] ^ in2[i]) - * @note For an exponent that is a float, this function will only work with a positive base. - */ -class NEElementwisePower : public INEOperator -{ -public: - /** Initialise the kernel's inputs, output and conversion policy. - * - * @param[in, out] input1 First tensor input info. Data types supported: F16/F32. - * @param[in, out] input2 Second tensor input info. Data types supported: Same as @p input1. - * @param[out] output Output tensor info. Data types supported: Same as @p input1. - */ - void configure(const ITensorInfo *input1, const ITensorInfo *input2, ITensorInfo *output); - /** Static function to check if given info will lead to a valid configuration of @ref NEArithmeticOperationKernel for power - * - * @param[in] input1 First tensor input info. Data types supported: F16/F32. - * @param[in] input2 Second tensor input info. Data types supported: Same as @p input1. - * @param[in] output Output tensor info. Data types supported: Same as @p input1. - * - * @return a status - */ - static Status validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output); -}; - -/** Basic function to run @ref NEComparisonOperationKernel. - * - * @note The tensor data type for the inputs must be QASYMM8/QASYMM8_SIGNED/S16/F16/S32/F32. - * @note The function performs a comparison operation between two tensors. - */ -class NEElementwiseComparison : public INEOperator -{ -public: - /** Initialise the kernel's inputs, output and conversion policy. 
- * - * @param[in, out] input1 First tensor input info. Data types supported: QASYMM8/QASYMM8_SIGNED/S16/F16/S32/F32. - * @param[in, out] input2 Second tensor input info. Data types supported: Same as @p input1. - * @param[out] output Output tensor info. Data types supported: U16/U32. - * @param[in] op Comparison Operation to be performed. - */ - void configure(const ITensorInfo *input1, const ITensorInfo *input2, ITensorInfo *output, ComparisonOperation op); - /** Static function to check if given info will lead to a valid configuration of @ref NEComparisonOperationKernel - * - * @param[in] input1 First tensor input info. Data types supported: QASYMM8/QASYMM8_SIGNED/S16/F16/S32/F32. - * @param[in] input2 Second tensor input info. Data types supported: Same as @p input1. - * @param[in] output Output tensor info. Data types supported: U16/U32. - * @param[in] op Comparison Operation to be performed. - * - * @return a status - */ - static Status validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, ComparisonOperation op); -}; - -/** Basic function to run @ref NEComparisonOperationKernel - * - * @note The tensor data type for the inputs must be QASYMM8/QASYMM8_SIGNED/S16/F16/S32/F32. - * @note The function performs a comparison operation between two tensors. - */ -template <ComparisonOperation op> -class NEElementwiseComparisonStatic : public INEOperator -{ -public: - /** Initialise the kernel's inputs, output and conversion policy. - * - * @param[in, out] input1 First tensor input info. Data types supported: QASYMM8/QASYMM8_SIGNED/S16/F16/S32/F32. - * @param[in, out] input2 Second tensor input info. Data types supported: Same as @p input1. - * @param[out] output Output tensor info. Data types supported: U16/U32. 
- */ - void configure(const ITensorInfo *input1, const ITensorInfo *input2, ITensorInfo *output); - /** Static function to check if given info will lead to a valid configuration of @ref NEComparisonOperationKernel - * - * @param[in] input1 First tensor input info. Data types supported: QASYMM8/QASYMM8_SIGNED/S16/F16/S32/F32. - * @param[in] input2 Second tensor input info. Data types supported: Same as @p input1. - * @param[in] output Output tensor info. Data types supported: U16/U32. - * - * @return a status - */ - static Status validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output); -}; - -/** Basic function to run equal comparison. */ -using NEEqual = NEElementwiseComparisonStatic<ComparisonOperation::Equal>; -/** Basic function to run not equal comparison. */ -using NENotEqual = NEElementwiseComparisonStatic<ComparisonOperation::NotEqual>; -/** Basic function to run greater comparison. */ -using NEGreater = NEElementwiseComparisonStatic<ComparisonOperation::Greater>; -/** Basic function to run greater-equal comparison. */ -using NEGreaterEqual = NEElementwiseComparisonStatic<ComparisonOperation::GreaterEqual>; -/** Basic function to run less comparison. */ -using NELess = NEElementwiseComparisonStatic<ComparisonOperation::Less>; -/** Basic function to run less-equal comparison. */ -using NELessEqual = NEElementwiseComparisonStatic<ComparisonOperation::LessEqual>; -} // namespace experimental } // namespace arm_compute #endif /* ARM_COMPUTE_NEELEMENTWISEOPERATIONS_H */ diff --git a/arm_compute/runtime/NEON/functions/NEPReluLayer.h b/arm_compute/runtime/NEON/functions/NEPReluLayer.h index 358e633000..12ffb8da7b 100644 --- a/arm_compute/runtime/NEON/functions/NEPReluLayer.h +++ b/arm_compute/runtime/NEON/functions/NEPReluLayer.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2020 Arm Limited. + * Copyright (c) 2019-2021 Arm Limited. 
* * SPDX-License-Identifier: MIT * @@ -35,7 +35,7 @@ class ITensorInfo; namespace experimental { -/** Basic function to run @ref NEArithmeticOperationKernel for PRELU +/** Basic function to run @ref cpu::kernels::CpuArithmeticKernel for PRELU * * @note The function implements an activation layer with the PRELU activation function. */ @@ -49,7 +49,7 @@ public: * @param[out] output Destination tensor info. Data type supported: same as @p input */ void configure(const ITensorInfo *input, const ITensorInfo *alpha, ITensorInfo *output); - /** Static function to check if given info will lead to a valid configuration of @ref NEComparisonOperationKernel + /** Static function to check if given info will lead to a valid configuration of @ref cpu::kernels::CpuComparisonKernel * * @param[in] input Source tensor info. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32. * @param[in] alpha Source alpha tensor info. Data types supported: same of @p input. @@ -61,7 +61,7 @@ public: }; } // namespace experimental -/** Basic function to run @ref NEArithmeticOperationKernel for PRELU +/** Basic function to run @ref cpu::kernels::CpuArithmeticKernel for PRELU * * @note The function implements an activation layer with the PRELU activation function. */ diff --git a/docs/00_introduction.dox b/docs/00_introduction.dox index cbdc983159..f2dddbea6f 100644 --- a/docs/00_introduction.dox +++ b/docs/00_introduction.dox @@ -124,8 +124,8 @@ v20.11 Public major release - NEArithmeticSubtractionKernel - @ref NEPixelWiseMultiplication - @ref NEPixelWiseMultiplicationKernel - - @ref NEElementwiseDivision - - @ref NEDivisionOperationKernel + - NEElementwiseDivision + - NEDivisionOperationKernel - Interface change - Properly support softmax axis to have the same meaning as other major frameworks. That is, axis now defines the dimension on which Softmax/Logsoftmax is performed. E.g. for input of shape 4x5x6 and axis=1, softmax will be applied to 4x6=24 vectors of size 5. 
@@ -569,7 +569,7 @@ v20.02 Public major release - @ref CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel - @ref CLGEMMLowpMatrixMultiplyNativeKernel - @ref NEActivationLayer - - @ref NEComparisonOperationKernel + - NEComparisonOperationKernel - @ref NEConvolutionLayer - @ref NEDepthwiseConvolutionLayer - NEDepthwiseConvolutionLayer3x3Kernel @@ -821,7 +821,7 @@ v19.02 Public major release - New Neon kernels / functions: - @ref NETileKernel / @ref NETile - @ref NEFuseBatchNormalizationKernel / @ref NEFuseBatchNormalization - - @ref NEElementwiseOperationKernel + - NEElementwiseOperationKernel - @ref NEElementwiseMax - @ref NEElementwiseMin - @ref NEElementwiseSquaredDiff @@ -842,7 +842,7 @@ v19.02 Public major release - @ref NEGatherKernel / @ref NEGather - @ref NEElementwiseComparison - @ref NEElementwiseComparisonStatic - - @ref NEComparisonOperationKernel + - NEComparisonOperationKernel - @ref NEElementwiseDivision - New OpenCL kernels / functions: - @ref CLSelectKernel / @ref CLSelect diff --git a/src/core/NEON/NEKernels.h b/src/core/NEON/NEKernels.h index a678a86e4c..c009a6d3af 100644 --- a/src/core/NEON/NEKernels.h +++ b/src/core/NEON/NEKernels.h @@ -54,7 +54,6 @@ #include "src/core/NEON/kernels/NEDilateKernel.h" #include "src/core/NEON/kernels/NEDirectConvolutionLayerKernel.h" #include "src/core/NEON/kernels/NEDirectConvolutionLayerOutputStageKernel.h" -#include "src/core/NEON/kernels/NEElementwiseOperationKernel.h" #include "src/core/NEON/kernels/NEElementwiseUnaryKernel.h" #include "src/core/NEON/kernels/NEErodeKernel.h" #include "src/core/NEON/kernels/NEFFTDigitReverseKernel.h" diff --git a/src/core/NEON/kernels/NEElementwiseOperationKernel.cpp b/src/core/cpu/kernels/CpuElementwiseKernel.cpp index b250465e14..ab915b9d72 100644 --- a/src/core/NEON/kernels/NEElementwiseOperationKernel.cpp +++ b/src/core/cpu/kernels/CpuElementwiseKernel.cpp @@ -21,16 +21,16 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. 
*/ -#include "src/core/NEON/kernels/NEElementwiseOperationKernel.h" +#include "src/core/cpu/kernels/CpuElementwiseKernel.h" #include "arm_compute/core/Helpers.h" #include "arm_compute/core/IAccessWindow.h" #include "src/core/CPP/Validate.h" -#include "src/core/NEON/kernels/elementwise/impl/elementwise_list.h" -#include "src/core/NEON/kernels/elementwise/impl/elementwise_quantized_list.h" -#include "src/core/SVE/kernels/elementwise/impl/elementwise_list.h" -#include "src/core/SVE/kernels/elementwise/impl/elementwise_quantized_list.h" #include "src/core/common/Registrars.h" +#include "src/core/cpu/kernels/elementwise/neon/elementwise_list.h" +#include "src/core/cpu/kernels/elementwise/neon/elementwise_quantized_list.h" +#include "src/core/cpu/kernels/elementwise/sve/elementwise_list.h" +#include "src/core/cpu/kernels/elementwise/sve/elementwise_quantized_list.h" #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/WindowHelpers.h" @@ -38,10 +38,14 @@ namespace arm_compute { +namespace cpu +{ +namespace kernels +{ namespace { using ElementwiseSelector = std::add_pointer<bool(DataType)>::type; -using UKernelType = NEElementwiseOperationKernel::ElementwiseFunction; +using UKernelType = CpuElementwiseKernel::ElementwiseFunction; struct ElementwiseKernel { const char *name; @@ -154,12 +158,7 @@ configure_comp_func(const ITensorInfo *input1, const ITensorInfo *input2, ITenso } } // namespace -NEElementwiseOperationKernel::NEElementwiseOperationKernel() - : _function(nullptr), _input1(nullptr), _input2(nullptr), _output(nullptr) -{ -} - -Status NEElementwiseOperationKernel::validate_arguments_common(const ITensorInfo &input1, const ITensorInfo &input2, const ITensorInfo &output) +Status CpuElementwiseKernel::validate_arguments_common(const ITensorInfo &input1, const ITensorInfo &input2, const ITensorInfo &output) { ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(&input1); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&input1, &input2); @@ -178,7 
+177,7 @@ Status NEElementwiseOperationKernel::validate_arguments_common(const ITensorInfo return Status{}; } -void NEElementwiseOperationKernel::configure_common(const ITensorInfo *input1, const ITensorInfo *input2, ITensorInfo *output) +void CpuElementwiseKernel::configure_common(const ITensorInfo *input1, const ITensorInfo *input2, ITensorInfo *output) { ARM_COMPUTE_ERROR_ON_NULLPTR(input1, input2, output); @@ -192,45 +191,33 @@ void NEElementwiseOperationKernel::configure_common(const ITensorInfo *input1, c Window win = calculate_max_window(valid_region); - INEKernel::configure(win); + ICpuKernel::configure(win); } -void NEElementwiseOperationKernel::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) +void CpuElementwiseKernel::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) { ARM_COMPUTE_UNUSED(info, window); ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); - ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window); - ARM_COMPUTE_ERROR_ON(_function == nullptr); - _function(tensors.get_const_tensor(TensorType::ACL_SRC_0), - tensors.get_const_tensor(TensorType::ACL_SRC_1), - tensors.get_tensor(TensorType::ACL_DST), window); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICpuKernel::window(), window); + + auto src0 = tensors.get_const_tensor(TensorType::ACL_SRC_0); + auto src1 = tensors.get_const_tensor(TensorType::ACL_SRC_1); + auto dst = tensors.get_tensor(TensorType::ACL_DST); + + auto function = get_implementation(src0->info(), src1->info(), dst->info()); + ARM_COMPUTE_ERROR_ON(function == nullptr); + function(src0, src1, dst, window); } /** Arithmetic operators (min, max, squared_diff) */ -void NEArithmeticOperationKernel::configure(ArithmeticOperation op, const ITensorInfo *input1, const ITensorInfo *input2, ITensorInfo *output) +void CpuArithmeticKernel::configure(ArithmeticOperation op, const ITensorInfo *input1, const ITensorInfo *input2, ITensorInfo *output) { 
ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(*input1, *input2, *output)); configure_common(input1, input2, output); - switch(op) - { - case ArithmeticOperation::MAX: - _function = configure_arithm_func<ArithmeticOperation::MAX>(input1, input2, output); - break; - case ArithmeticOperation::MIN: - _function = configure_arithm_func<ArithmeticOperation::MIN>(input1, input2, output); - break; - case ArithmeticOperation::SQUARED_DIFF: - _function = configure_arithm_func<ArithmeticOperation::SQUARED_DIFF>(input1, input2, output); - break; - case ArithmeticOperation::PRELU: - _function = configure_arithm_func<ArithmeticOperation::PRELU>(input1, input2, output); - break; - default: - ARM_COMPUTE_ERROR("NOT_SUPPORTED!"); - } + _op = op; } -Status NEArithmeticOperationKernel::validate_arguments(const ITensorInfo &input1, const ITensorInfo &input2, const ITensorInfo &output) +Status CpuArithmeticKernel::validate_arguments(const ITensorInfo &input1, const ITensorInfo &input2, const ITensorInfo &output) { ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&input1, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::S16, DataType::F16, DataType::S32, DataType::F32); // Validate in case of configured output @@ -241,7 +228,7 @@ Status NEArithmeticOperationKernel::validate_arguments(const ITensorInfo &input1 return validate_arguments_common(input1, input2, output); } -Status NEArithmeticOperationKernel::validate(ArithmeticOperation op, const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output) +Status CpuArithmeticKernel::validate(ArithmeticOperation op, const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output) { ARM_COMPUTE_UNUSED(op); ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input1, input2, output); @@ -249,22 +236,45 @@ Status NEArithmeticOperationKernel::validate(ArithmeticOperation op, const ITens return Status{}; } +std::function<CpuElementwiseKernel::ElementwiseFunction> +CpuArithmeticKernel::get_implementation(const 
ITensorInfo *input1, const ITensorInfo *input2, ITensorInfo *output) +{ + switch(_op) + { + case ArithmeticOperation::MAX: + return configure_arithm_func<ArithmeticOperation::MAX>(input1, input2, output); + case ArithmeticOperation::MIN: + return configure_arithm_func<ArithmeticOperation::MIN>(input1, input2, output); + case ArithmeticOperation::SQUARED_DIFF: + return configure_arithm_func<ArithmeticOperation::SQUARED_DIFF>(input1, input2, output); + case ArithmeticOperation::PRELU: + return configure_arithm_func<ArithmeticOperation::PRELU>(input1, input2, output); + case ArithmeticOperation::DIV: + return configure_arithm_func<ArithmeticOperation::DIV>(input1, input2, output); + case ArithmeticOperation::POWER: + return configure_arithm_func<ArithmeticOperation::POWER>(input1, input2, output); + default: + ARM_COMPUTE_ERROR("NOT_SUPPORTED!"); + } + return nullptr; +} + /** The division operator */ -void NEDivisionOperationKernel::configure(const ITensorInfo *input1, const ITensorInfo *input2, ITensorInfo *output) +void CpuDivisionKernel::configure(const ITensorInfo *input1, const ITensorInfo *input2, ITensorInfo *output) { ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(*input1, *input2, *output)); configure_common(input1, input2, output); - _function = configure_arithm_func<ArithmeticOperation::DIV>(input1, input2, output); + _op = ArithmeticOperation::DIV; } -Status NEDivisionOperationKernel::validate_arguments(const ITensorInfo &input1, const ITensorInfo &input2, const ITensorInfo &output) +Status CpuDivisionKernel::validate_arguments(const ITensorInfo &input1, const ITensorInfo &input2, const ITensorInfo &output) { ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&input1, 1, DataType::S32, DataType::F16, DataType::F32); - return NEArithmeticOperationKernel::validate_arguments(input1, input2, output); + return CpuArithmeticKernel::validate_arguments(input1, input2, output); } -Status NEDivisionOperationKernel::validate(const ITensorInfo *input1, const 
ITensorInfo *input2, const ITensorInfo *output) +Status CpuDivisionKernel::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input1, input2, output); ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(*input1, *input2, *output)); @@ -272,20 +282,20 @@ Status NEDivisionOperationKernel::validate(const ITensorInfo *input1, const ITen } /** The power operator */ -void NEPowerOperationKernel::configure(const ITensorInfo *input1, const ITensorInfo *input2, ITensorInfo *output) +void CpuPowerKernel::configure(const ITensorInfo *input1, const ITensorInfo *input2, ITensorInfo *output) { ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(*input1, *input2, *output)); configure_common(input1, input2, output); - _function = configure_arithm_func<ArithmeticOperation::POWER>(input1, input2, output); + _op = ArithmeticOperation::POWER; } -Status NEPowerOperationKernel::validate_arguments(const ITensorInfo &input1, const ITensorInfo &input2, const ITensorInfo &output) +Status CpuPowerKernel::validate_arguments(const ITensorInfo &input1, const ITensorInfo &input2, const ITensorInfo &output) { ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&input1, 1, DataType::F16, DataType::F32); - return NEArithmeticOperationKernel::validate_arguments(input1, input2, output); + return CpuArithmeticKernel::validate_arguments(input1, input2, output); } -Status NEPowerOperationKernel::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output) +Status CpuPowerKernel::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input1, input2, output); ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(*input1, *input2, *output)); @@ -293,36 +303,14 @@ Status NEPowerOperationKernel::validate(const ITensorInfo *input1, const ITensor } /** Comparison operators (equal, not equal, less than, greater than, less than or equal, greater than or 
equal) */ -void NEComparisonOperationKernel::configure(ComparisonOperation op, const ITensorInfo *input1, const ITensorInfo *input2, ITensorInfo *output) +void CpuComparisonKernel::configure(ComparisonOperation op, const ITensorInfo *input1, const ITensorInfo *input2, ITensorInfo *output) { ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(*input1, *input2, *output)); configure_common(input1, input2, output); - switch(op) - { - case ComparisonOperation::Equal: - _function = configure_comp_func<ComparisonOperation::Equal>(input1, input2, output); - break; - case ComparisonOperation::NotEqual: - _function = configure_comp_func<ComparisonOperation::NotEqual>(input1, input2, output); - break; - case ComparisonOperation::Greater: - _function = configure_comp_func<ComparisonOperation::Greater>(input1, input2, output); - break; - case ComparisonOperation::GreaterEqual: - _function = configure_comp_func<ComparisonOperation::GreaterEqual>(input1, input2, output); - break; - case ComparisonOperation::Less: - _function = configure_comp_func<ComparisonOperation::Less>(input1, input2, output); - break; - case ComparisonOperation::LessEqual: - _function = configure_comp_func<ComparisonOperation::LessEqual>(input1, input2, output); - break; - default: - ARM_COMPUTE_ERROR("NOT_SUPPORTED!"); - } + _op = op; } -Status NEComparisonOperationKernel::validate_arguments(const ITensorInfo &input1, const ITensorInfo &input2, const ITensorInfo &output) +Status CpuComparisonKernel::validate_arguments(const ITensorInfo &input1, const ITensorInfo &input2, const ITensorInfo &output) { ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&input1, 1, DataType::U8, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::S16, DataType::F16, DataType::S32, DataType::F32); // Validate in case of configured output @@ -333,11 +321,36 @@ Status NEComparisonOperationKernel::validate_arguments(const ITensorInfo &input1 return validate_arguments_common(input1, input2, output); } -Status 
NEComparisonOperationKernel::validate(ComparisonOperation op, const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output) +Status CpuComparisonKernel::validate(ComparisonOperation op, const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output) { ARM_COMPUTE_UNUSED(op); ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input1, input2, output); ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(*input1, *input2, *output)); return Status{}; } + +std::function<CpuElementwiseKernel::ElementwiseFunction> +CpuComparisonKernel::get_implementation(const ITensorInfo *input1, const ITensorInfo *input2, ITensorInfo *output) +{ + switch(_op) + { + case ComparisonOperation::Equal: + return configure_comp_func<ComparisonOperation::Equal>(input1, input2, output); + case ComparisonOperation::NotEqual: + return configure_comp_func<ComparisonOperation::NotEqual>(input1, input2, output); + case ComparisonOperation::Greater: + return configure_comp_func<ComparisonOperation::Greater>(input1, input2, output); + case ComparisonOperation::GreaterEqual: + return configure_comp_func<ComparisonOperation::GreaterEqual>(input1, input2, output); + case ComparisonOperation::Less: + return configure_comp_func<ComparisonOperation::Less>(input1, input2, output); + case ComparisonOperation::LessEqual: + return configure_comp_func<ComparisonOperation::LessEqual>(input1, input2, output); + default: + ARM_COMPUTE_ERROR("NOT_SUPPORTED!"); + } + return nullptr; +} +} // namespace kernels +} // namespace cpu } // namespace arm_compute diff --git a/src/core/NEON/kernels/NEElementwiseOperationKernel.h b/src/core/cpu/kernels/CpuElementwiseKernel.h index b0037d357f..92cf880172 100644 --- a/src/core/NEON/kernels/NEElementwiseOperationKernel.h +++ b/src/core/cpu/kernels/CpuElementwiseKernel.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2020 Arm Limited. + * Copyright (c) 2021 Arm Limited. 
* * SPDX-License-Identifier: MIT * @@ -21,41 +21,35 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ -#ifndef ARM_COMPUTE_NEELEMENTWISEOPERATIONKERNEL_H -#define ARM_COMPUTE_NEELEMENTWISEOPERATIONKERNEL_H +#ifndef ARM_COMPUTE_CPU_ELEMENTWISE_KERNEL_H +#define ARM_COMPUTE_CPU_ELEMENTWISE_KERNEL_H -#include "arm_compute/core/Types.h" -#include "src/core/NEON/INEKernel.h" +#include "src/core/common/Macros.h" +#include "src/core/cpu/ICpuKernel.h" namespace arm_compute { class ITensor; - +namespace cpu +{ +namespace kernels +{ /** Interface for an element-wise operation kernel * * Element-wise operation is computed by: * @f[ output(x,y) = OP(input1(x,y), input2(x,y))@f] * */ -class NEElementwiseOperationKernel : public INEKernel +class CpuElementwiseKernel : public ICpuKernel { public: const char *name() const override { - return "NEElementwiseOperationKernel"; + return "CpuElementwiseKernel"; } - /** Default constructor */ - NEElementwiseOperationKernel(); - /** Prevent instances of this class from being copied (As this class contains pointers) */ - NEElementwiseOperationKernel(const NEElementwiseOperationKernel &) = delete; - /** Prevent instances of this class from being copied (As this class contains pointers) */ - NEElementwiseOperationKernel &operator=(const NEElementwiseOperationKernel &) = delete; - /** Allow instances of this class to be moved */ - NEElementwiseOperationKernel(NEElementwiseOperationKernel &&) = default; - /** Allow instances of this class to be moved */ - NEElementwiseOperationKernel &operator=(NEElementwiseOperationKernel &&) = default; - /** Default destructor */ - ~NEElementwiseOperationKernel() = default; + + CpuElementwiseKernel() = default; + ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(CpuElementwiseKernel); /** Common signature for all the specialised arithmetic functions * @@ -64,7 +58,7 @@ public: * @param[out] output Output tensor info. Data types supported: Dependent on subclass. 
* @param[in] window Region on which to execute the kernel. */ - using ElementwiseFunction = void(const ITensor *input1, const ITensor *input2, ITensor *output, const Window &window); + using ElementwiseFunction = void(const ITensor *, const ITensor *, ITensor *, const Window &); // Inherited methods overridden: void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override; @@ -83,19 +77,22 @@ protected: */ void configure_common(const ITensorInfo *input1, const ITensorInfo *input2, ITensorInfo *output); - /** Function to use for the particular tensor types passed to configure() */ - std::function<void(const ITensor *input1, const ITensor *input2, ITensor *output, const Window &window)> _function; - - const ITensor *_input1; - const ITensor *_input2; - ITensor *_output; + /** Function to get the micro kernel implementation + * + * @param[in] input1 First input tensor information + * @param[in] input2 Second input tensor information + * @param[in] output Output tensor information + * + * @return the function instance for the micro kernel + */ + virtual std::function<ElementwiseFunction> get_implementation(const ITensorInfo *input1, const ITensorInfo *input2, ITensorInfo *output) = 0; }; -class NEArithmeticOperationKernel : public NEElementwiseOperationKernel +class CpuArithmeticKernel : public CpuElementwiseKernel { public: /** Default constructor */ - NEArithmeticOperationKernel() = default; + CpuArithmeticKernel() = default; /** Configure kernel * @@ -106,7 +103,7 @@ public: */ void configure(ArithmeticOperation op, const ITensorInfo *input1, const ITensorInfo *input2, ITensorInfo *output); - /** Static function to check if given info will lead to a valid configuration of @ref NEArithmeticOperationKernel + /** Static function to check if given info will lead to a valid configuration of @ref cpu::kernels::CpuArithmeticKernel * * @param[in] op Arithmetic operation to be executed. * @param[in] input1 First tensor input info. 
Data types supported: QASYMM8/S16/F16/S32/F32. @@ -120,13 +117,26 @@ public: protected: // Inherited methods overridden: static Status validate_arguments(const ITensorInfo &input1, const ITensorInfo &input2, const ITensorInfo &output); + + ArithmeticOperation _op{}; + +private: + /** Function to get the micro kernel implementation + * + * @param[in] input1 First input tensor information + * @param[in] input2 Second input tensor information + * @param[in] output Output tensor information + * + * @return the function instance for the micro kernel + */ + std::function<ElementwiseFunction> get_implementation(const ITensorInfo *input1, const ITensorInfo *input2, ITensorInfo *output) override; }; -class NEDivisionOperationKernel : public NEArithmeticOperationKernel +class CpuDivisionKernel : public CpuArithmeticKernel { public: /** Default constructor */ - NEDivisionOperationKernel() = default; + CpuDivisionKernel() = default; /** Configure kernel * @@ -136,7 +146,7 @@ public: */ void configure(const ITensorInfo *input1, const ITensorInfo *input2, ITensorInfo *output); - /** Static function to check if given info will lead to a valid configuration of @ref NEDivisionOperationKernel + /** Static function to check if given info will lead to a valid configuration of @ref CpuDivisionKernel * * @param[in] input1 First tensor input info. Data types supported: S32/F16/F32. * @param[in] input2 Second tensor input info. Data types supported: Same as @p input1. 
@@ -151,11 +161,11 @@ protected: static Status validate_arguments(const ITensorInfo &input1, const ITensorInfo &input2, const ITensorInfo &output); }; -class NEPowerOperationKernel : public NEArithmeticOperationKernel +class CpuPowerKernel : public CpuArithmeticKernel { public: /** Default constructor */ - NEPowerOperationKernel() = default; + CpuPowerKernel() = default; /** Configure kernel * @@ -165,7 +175,7 @@ public: */ void configure(const ITensorInfo *input1, const ITensorInfo *input2, ITensorInfo *output); - /** Static function to check if given info will lead to a valid configuration of @ref NEPowerOperationKernel + /** Static function to check if given info will lead to a valid configuration of @ref CpuPowerKernel * * @param[in] input1 First tensor input info. Data types supported: F16/F32. * @param[in] input2 Second tensor input info. Data types supported: Same as @p input1. @@ -180,11 +190,11 @@ protected: static Status validate_arguments(const ITensorInfo &input1, const ITensorInfo &input2, const ITensorInfo &output); }; -class NEComparisonOperationKernel : public NEElementwiseOperationKernel +class CpuComparisonKernel : public CpuElementwiseKernel { public: /** Default constructor */ - NEComparisonOperationKernel() = default; + CpuComparisonKernel() = default; /** Configure kernel * @@ -195,7 +205,7 @@ public: */ void configure(ComparisonOperation op, const ITensorInfo *input1, const ITensorInfo *input2, ITensorInfo *output); - /** Static function to check if given info will lead to a valid configuration of @ref NEComparisonOperationKernel + /** Static function to check if given info will lead to a valid configuration of @ref cpu::kernels::CpuComparisonKernel * * @param[in] op Comparison operation to be executed. * @param[in] input1 First tensor input info. Data types supported: QASYMM8/QASYMM8_SIGNED/S16/F16/S32/F32. 
@@ -209,6 +219,21 @@ public: protected: // Inherited methods overridden: static Status validate_arguments(const ITensorInfo &input1, const ITensorInfo &input2, const ITensorInfo &output); + +private: + /** Function to get the micro kernel implementation + * + * @param[in] input1 First input tensor information + * @param[in] input2 Second input tensor information + * @param[in] output Output tensor information + * + * @return the function instance for the micro kernel + */ + std::function<ElementwiseFunction> get_implementation(const ITensorInfo *input1, const ITensorInfo *input2, ITensorInfo *output) override; + + ComparisonOperation _op{}; }; +} // namespace kernels +} // namespace cpu } // namespace arm_compute -#endif /* ARM_COMPUTE_NEELEMENTWISEOPERATIONKERNEL_H */ +#endif /* ARM_COMPUTE_CPU_ELEMENTWISE_KERNEL_H */
\ No newline at end of file diff --git a/src/core/NEON/kernels/elementwise/impl/elementwise_list.h b/src/core/cpu/kernels/elementwise/neon/elementwise_list.h index 43e44be5e2..43e44be5e2 100644 --- a/src/core/NEON/kernels/elementwise/impl/elementwise_list.h +++ b/src/core/cpu/kernels/elementwise/neon/elementwise_list.h diff --git a/src/core/NEON/kernels/elementwise/impl/elementwise_quantized_list.h b/src/core/cpu/kernels/elementwise/neon/elementwise_quantized_list.h index fd1fb002ad..1ff4632f5c 100644 --- a/src/core/NEON/kernels/elementwise/impl/elementwise_quantized_list.h +++ b/src/core/cpu/kernels/elementwise/neon/elementwise_quantized_list.h @@ -24,7 +24,7 @@ #ifndef SRC_CORE_NEON_KERNELS_ELEMENTWISE_QUANTIZED_LIST_H #define SRC_CORE_NEON_KERNELS_ELEMENTWISE_QUANTIZED_LIST_H -#include "src/core/NEON/kernels/elementwise/impl/elementwise_list.h" +#include "src/core/cpu/kernels/elementwise/neon/elementwise_list.h" namespace arm_compute { diff --git a/src/core/SVE/kernels/elementwise/impl/elementwise_list.h b/src/core/cpu/kernels/elementwise/sve/elementwise_list.h index 83c3355de4..83c3355de4 100644 --- a/src/core/SVE/kernels/elementwise/impl/elementwise_list.h +++ b/src/core/cpu/kernels/elementwise/sve/elementwise_list.h diff --git a/src/core/SVE/kernels/elementwise/impl/elementwise_quantized_list.h b/src/core/cpu/kernels/elementwise/sve/elementwise_quantized_list.h index e85b0891f5..b6342c727c 100644 --- a/src/core/SVE/kernels/elementwise/impl/elementwise_quantized_list.h +++ b/src/core/cpu/kernels/elementwise/sve/elementwise_quantized_list.h @@ -26,7 +26,7 @@ #if defined(__ARM_FEATURE_SVE2) -#include "src/core/SVE/kernels/elementwise/impl/elementwise_list.h" +#include "src/core/cpu/kernels/elementwise/sve/elementwise_list.h" namespace arm_compute { diff --git a/src/runtime/NEON/functions/NEElementwiseOperators.cpp b/src/runtime/NEON/functions/NEElementwiseOperations.cpp index badcf2e997..946bbb24b8 100644 --- 
a/src/runtime/NEON/functions/NEElementwiseOperators.cpp +++ b/src/runtime/NEON/functions/NEElementwiseOperations.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2020 Arm Limited. + * Copyright (c) 2018-2021 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -21,9 +21,9 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ -#include "arm_compute/core/Validate.h" #include "arm_compute/runtime/NEON/functions/NEElementwiseOperations.h" -#include <src/core/NEON/kernels/NEElementwiseOperationKernel.h> +#include "arm_compute/core/Validate.h" +#include "src/runtime/cpu/operators/CpuElementwise.h" #include "arm_compute/core/ITensor.h" @@ -31,109 +31,12 @@ namespace arm_compute { -namespace experimental -{ -void NEElementwiseMax::configure(const ITensorInfo *input1, const ITensorInfo *input2, ITensorInfo *output) -{ - auto k = std::make_unique<NEArithmeticOperationKernel>(); - k->configure(ArithmeticOperation::MAX, input1, input2, output); - _kernel = std::move(k); -} - -Status NEElementwiseMax::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output) -{ - return NEArithmeticOperationKernel::validate(ArithmeticOperation::MAX, input1, input2, output); -} - -void NEElementwiseMin::configure(const ITensorInfo *input1, const ITensorInfo *input2, ITensorInfo *output) -{ - auto k = std::make_unique<NEArithmeticOperationKernel>(); - k->configure(ArithmeticOperation::MIN, input1, input2, output); - _kernel = std::move(k); -} - -Status NEElementwiseMin::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output) -{ - return NEArithmeticOperationKernel::validate(ArithmeticOperation::MIN, input1, input2, output); -} - -void NEElementwiseSquaredDiff::configure(const ITensorInfo *input1, const ITensorInfo *input2, ITensorInfo *output) -{ - auto k = std::make_unique<NEArithmeticOperationKernel>(); - k->configure(ArithmeticOperation::SQUARED_DIFF, input1, input2, output); - _kernel = 
std::move(k); -} - -Status NEElementwiseSquaredDiff::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output) -{ - return NEArithmeticOperationKernel::validate(ArithmeticOperation::SQUARED_DIFF, input1, input2, output); -} - -void NEElementwiseDivision::configure(const ITensorInfo *input1, const ITensorInfo *input2, ITensorInfo *output) -{ - auto k = std::make_unique<NEDivisionOperationKernel>(); - k->configure(input1, input2, output); - _kernel = std::move(k); -} - -Status NEElementwiseDivision::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output) -{ - return NEDivisionOperationKernel::validate(input1, input2, output); -} - -void NEElementwisePower::configure(const ITensorInfo *input1, const ITensorInfo *input2, ITensorInfo *output) -{ - auto k = std::make_unique<NEPowerOperationKernel>(); - k->configure(input1, input2, output); - _kernel = std::move(k); -} - -Status NEElementwisePower::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output) -{ - return NEPowerOperationKernel::validate(input1, input2, output); -} - -template <ComparisonOperation COP> -void NEElementwiseComparisonStatic<COP>::configure(const ITensorInfo *input1, const ITensorInfo *input2, ITensorInfo *output) -{ - auto k = std::make_unique<NEComparisonOperationKernel>(); - k->configure(COP, input1, input2, output); - _kernel = std::move(k); -} - -template <ComparisonOperation COP> -Status NEElementwiseComparisonStatic<COP>::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output) -{ - return NEComparisonOperationKernel::validate(COP, input1, input2, output); -} - -void NEElementwiseComparison::configure(const ITensorInfo *input1, const ITensorInfo *input2, ITensorInfo *output, ComparisonOperation op) -{ - auto k = std::make_unique<NEComparisonOperationKernel>(); - k->configure(op, input1, input2, output); - _kernel = std::move(k); -} - -Status 
NEElementwiseComparison::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, ComparisonOperation op) -{ - return NEComparisonOperationKernel::validate(op, input1, input2, output); -} - -// Supported Specializations -template class NEElementwiseComparisonStatic<ComparisonOperation::Equal>; -template class NEElementwiseComparisonStatic<ComparisonOperation::NotEqual>; -template class NEElementwiseComparisonStatic<ComparisonOperation::Greater>; -template class NEElementwiseComparisonStatic<ComparisonOperation::GreaterEqual>; -template class NEElementwiseComparisonStatic<ComparisonOperation::Less>; -template class NEElementwiseComparisonStatic<ComparisonOperation::LessEqual>; -} // namespace experimental - struct NEElementwiseMax::Impl { - const ITensor *src_0{ nullptr }; - const ITensor *src_1{ nullptr }; - ITensor *dst{ nullptr }; - std::unique_ptr<experimental::NEElementwiseMax> op{ nullptr }; + const ITensor *src_0{ nullptr }; + const ITensor *src_1{ nullptr }; + ITensor *dst{ nullptr }; + std::unique_ptr<cpu::CpuElementwiseMax> op{ nullptr }; }; NEElementwiseMax::NEElementwiseMax() @@ -150,14 +53,14 @@ void NEElementwiseMax::configure(ITensor *input1, ITensor *input2, ITensor *outp _impl->src_0 = input1; _impl->src_1 = input2; _impl->dst = output; - _impl->op = std::make_unique<experimental::NEElementwiseMax>(); + _impl->op = std::make_unique<cpu::CpuElementwiseMax>(); _impl->op->configure(input1->info(), input2->info(), output->info()); } Status NEElementwiseMax::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, const ActivationLayerInfo &act_info) { ARM_COMPUTE_RETURN_ERROR_ON(act_info.enabled()); - return experimental::NEElementwiseMax::validate(input1, input2, output); + return cpu::CpuElementwiseMax::validate(input1, input2, output); } void NEElementwiseMax::run() @@ -171,10 +74,10 @@ void NEElementwiseMax::run() struct NEElementwiseMin::Impl { - const ITensor *src_0{ nullptr }; - 
const ITensor *src_1{ nullptr }; - ITensor *dst{ nullptr }; - std::unique_ptr<experimental::NEElementwiseMin> op{ nullptr }; + const ITensor *src_0{ nullptr }; + const ITensor *src_1{ nullptr }; + ITensor *dst{ nullptr }; + std::unique_ptr<cpu::CpuElementwiseMin> op{ nullptr }; }; NEElementwiseMin::NEElementwiseMin() @@ -191,14 +94,14 @@ void NEElementwiseMin::configure(ITensor *input1, ITensor *input2, ITensor *outp _impl->src_0 = input1; _impl->src_1 = input2; _impl->dst = output; - _impl->op = std::make_unique<experimental::NEElementwiseMin>(); + _impl->op = std::make_unique<cpu::CpuElementwiseMin>(); _impl->op->configure(input1->info(), input2->info(), output->info()); } Status NEElementwiseMin::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, const ActivationLayerInfo &act_info) { ARM_COMPUTE_RETURN_ERROR_ON(act_info.enabled()); - return experimental::NEElementwiseMin::validate(input1, input2, output); + return cpu::CpuElementwiseMin::validate(input1, input2, output); } void NEElementwiseMin::run() @@ -212,10 +115,10 @@ void NEElementwiseMin::run() struct NEElementwiseSquaredDiff::Impl { - const ITensor *src_0{ nullptr }; - const ITensor *src_1{ nullptr }; - ITensor *dst{ nullptr }; - std::unique_ptr<experimental::NEElementwiseSquaredDiff> op{ nullptr }; + const ITensor *src_0{ nullptr }; + const ITensor *src_1{ nullptr }; + ITensor *dst{ nullptr }; + std::unique_ptr<cpu::CpuElementwiseSquaredDiff> op{ nullptr }; }; NEElementwiseSquaredDiff::NEElementwiseSquaredDiff() @@ -232,14 +135,14 @@ void NEElementwiseSquaredDiff::configure(ITensor *input1, ITensor *input2, ITens _impl->src_0 = input1; _impl->src_1 = input2; _impl->dst = output; - _impl->op = std::make_unique<experimental::NEElementwiseSquaredDiff>(); + _impl->op = std::make_unique<cpu::CpuElementwiseSquaredDiff>(); _impl->op->configure(input1->info(), input2->info(), output->info()); } Status NEElementwiseSquaredDiff::validate(const ITensorInfo *input1, const 
ITensorInfo *input2, const ITensorInfo *output, const ActivationLayerInfo &act_info) { ARM_COMPUTE_RETURN_ERROR_ON(act_info.enabled()); - return experimental::NEElementwiseSquaredDiff::validate(input1, input2, output); + return cpu::CpuElementwiseSquaredDiff::validate(input1, input2, output); } void NEElementwiseSquaredDiff::run() @@ -253,10 +156,10 @@ void NEElementwiseSquaredDiff::run() struct NEElementwiseDivision::Impl { - const ITensor *src_0{ nullptr }; - const ITensor *src_1{ nullptr }; - ITensor *dst{ nullptr }; - std::unique_ptr<experimental::NEElementwiseDivision> op{ nullptr }; + const ITensor *src_0{ nullptr }; + const ITensor *src_1{ nullptr }; + ITensor *dst{ nullptr }; + std::unique_ptr<cpu::CpuElementwiseDivision> op{ nullptr }; }; NEElementwiseDivision::NEElementwiseDivision() @@ -273,14 +176,14 @@ void NEElementwiseDivision::configure(ITensor *input1, ITensor *input2, ITensor _impl->src_0 = input1; _impl->src_1 = input2; _impl->dst = output; - _impl->op = std::make_unique<experimental::NEElementwiseDivision>(); + _impl->op = std::make_unique<cpu::CpuElementwiseDivision>(); _impl->op->configure(input1->info(), input2->info(), output->info()); } Status NEElementwiseDivision::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, const ActivationLayerInfo &act_info) { ARM_COMPUTE_RETURN_ERROR_ON(act_info.enabled()); - return experimental::NEElementwiseDivision::validate(input1, input2, output); + return cpu::CpuElementwiseDivision::validate(input1, input2, output); } void NEElementwiseDivision::run() @@ -294,10 +197,10 @@ void NEElementwiseDivision::run() struct NEElementwisePower::Impl { - const ITensor *src_0{ nullptr }; - const ITensor *src_1{ nullptr }; - ITensor *dst{ nullptr }; - std::unique_ptr<experimental::NEElementwisePower> op{ nullptr }; + const ITensor *src_0{ nullptr }; + const ITensor *src_1{ nullptr }; + ITensor *dst{ nullptr }; + std::unique_ptr<cpu::CpuElementwisePower> op{ nullptr }; }; 
NEElementwisePower::NEElementwisePower() @@ -314,14 +217,14 @@ void NEElementwisePower::configure(ITensor *input1, ITensor *input2, ITensor *ou _impl->src_0 = input1; _impl->src_1 = input2; _impl->dst = output; - _impl->op = std::make_unique<experimental::NEElementwisePower>(); + _impl->op = std::make_unique<cpu::CpuElementwisePower>(); _impl->op->configure(input1->info(), input2->info(), output->info()); } Status NEElementwisePower::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, const ActivationLayerInfo &act_info) { ARM_COMPUTE_RETURN_ERROR_ON(act_info.enabled()); - return experimental::NEElementwisePower::validate(input1, input2, output); + return cpu::CpuElementwisePower::validate(input1, input2, output); } void NEElementwisePower::run() @@ -336,10 +239,10 @@ void NEElementwisePower::run() template <ComparisonOperation COP> struct NEElementwiseComparisonStatic<COP>::Impl { - const ITensor *src_0{ nullptr }; - const ITensor *src_1{ nullptr }; - ITensor *dst{ nullptr }; - std::unique_ptr<experimental::NEElementwiseComparisonStatic<COP>> op{ nullptr }; + const ITensor *src_0{ nullptr }; + const ITensor *src_1{ nullptr }; + ITensor *dst{ nullptr }; + std::unique_ptr<cpu::CpuElementwiseComparisonStatic<COP>> op{ nullptr }; }; template <ComparisonOperation COP> @@ -360,14 +263,14 @@ void NEElementwiseComparisonStatic<COP>::configure(ITensor *input1, ITensor *inp _impl->src_0 = input1; _impl->src_1 = input2; _impl->dst = output; - _impl->op = std::make_unique<experimental::NEElementwiseComparisonStatic<COP>>(); + _impl->op = std::make_unique<cpu::CpuElementwiseComparisonStatic<COP>>(); _impl->op->configure(input1->info(), input2->info(), output->info()); } template <ComparisonOperation COP> Status NEElementwiseComparisonStatic<COP>::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output) { - return experimental::NEElementwiseComparisonStatic<COP>::validate(input1, input2, output); + return 
cpu::CpuElementwiseComparisonStatic<COP>::validate(input1, input2, output); } template <ComparisonOperation COP> @@ -382,10 +285,10 @@ void NEElementwiseComparisonStatic<COP>::run() struct NEElementwiseComparison::Impl { - const ITensor *src_0{ nullptr }; - const ITensor *src_1{ nullptr }; - ITensor *dst{ nullptr }; - std::unique_ptr<experimental::NEElementwiseComparison> op{ nullptr }; + const ITensor *src_0{ nullptr }; + const ITensor *src_1{ nullptr }; + ITensor *dst{ nullptr }; + std::unique_ptr<cpu::CpuElementwiseComparison> op{ nullptr }; }; NEElementwiseComparison::NEElementwiseComparison() @@ -401,13 +304,13 @@ void NEElementwiseComparison::configure(ITensor *input1, ITensor *input2, ITenso _impl->src_0 = input1; _impl->src_1 = input2; _impl->dst = output; - _impl->op = std::make_unique<experimental::NEElementwiseComparison>(); + _impl->op = std::make_unique<cpu::CpuElementwiseComparison>(); _impl->op->configure(input1->info(), input2->info(), output->info(), op); } Status NEElementwiseComparison::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, ComparisonOperation op) { - return experimental::NEElementwiseComparison::validate(input1, input2, output, op); + return cpu::CpuElementwiseComparison::validate(input1, input2, output, op); } void NEElementwiseComparison::run() diff --git a/src/runtime/NEON/functions/NEPReluLayer.cpp b/src/runtime/NEON/functions/NEPReluLayer.cpp index fe656c0be0..d79235747b 100644 --- a/src/runtime/NEON/functions/NEPReluLayer.cpp +++ b/src/runtime/NEON/functions/NEPReluLayer.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2020 Arm Limited. + * Copyright (c) 2019-2021 Arm Limited. 
* * SPDX-License-Identifier: MIT * * @@ -24,7 +24,7 @@ #include "arm_compute/runtime/NEON/functions/NEPReluLayer.h" #include "arm_compute/core/ITensor.h" -#include "src/core/NEON/kernels/NEElementwiseOperationKernel.h" +#include "src/core/cpu/kernels/CpuElementwiseKernel.h" namespace arm_compute { @@ -32,14 +32,14 @@ namespace experimental { void NEPRelu::configure(const ITensorInfo *input, const ITensorInfo *alpha, ITensorInfo *output) { - auto k = std::make_unique<NEArithmeticOperationKernel>(); + auto k = std::make_unique<cpu::kernels::CpuArithmeticKernel>(); k->configure(ArithmeticOperation::PRELU, input, alpha, output); _kernel = std::move(k); } Status NEPRelu::validate(const ITensorInfo *input, const ITensorInfo *alpha, const ITensorInfo *output) { - return NEArithmeticOperationKernel::validate(ArithmeticOperation::PRELU, input, alpha, output); + return cpu::kernels::CpuArithmeticKernel::validate(ArithmeticOperation::PRELU, input, alpha, output); } } // namespace experimental diff --git a/src/runtime/cpu/operators/CpuElementwise.cpp b/src/runtime/cpu/operators/CpuElementwise.cpp new file mode 100644 index 0000000000..322bd09c43 --- /dev/null +++ b/src/runtime/cpu/operators/CpuElementwise.cpp @@ -0,0 +1,125 @@ +/* + * Copyright (c) 2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "src/runtime/cpu/operators/CpuElementwise.h" +#include "src/core/cpu/kernels/CpuElementwiseKernel.h" + +namespace arm_compute +{ +namespace cpu +{ +void CpuElementwiseMax::configure(const ITensorInfo *input1, const ITensorInfo *input2, ITensorInfo *output) +{ + auto k = std::make_unique<kernels::CpuArithmeticKernel>(); + k->configure(ArithmeticOperation::MAX, input1, input2, output); + _kernel = std::move(k); +} + +Status CpuElementwiseMax::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output) +{ + return kernels::CpuArithmeticKernel::validate(ArithmeticOperation::MAX, input1, input2, output); +} + +void CpuElementwiseMin::configure(const ITensorInfo *input1, const ITensorInfo *input2, ITensorInfo *output) +{ + auto k = std::make_unique<kernels::CpuArithmeticKernel>(); + k->configure(ArithmeticOperation::MIN, input1, input2, output); + _kernel = std::move(k); +} + +Status CpuElementwiseMin::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output) +{ + return kernels::CpuArithmeticKernel::validate(ArithmeticOperation::MIN, input1, input2, output); +} + +void CpuElementwiseSquaredDiff::configure(const ITensorInfo *input1, const ITensorInfo *input2, ITensorInfo *output) +{ + auto k = std::make_unique<kernels::CpuArithmeticKernel>(); + k->configure(ArithmeticOperation::SQUARED_DIFF, input1, input2, output); + _kernel = std::move(k); +} + +Status CpuElementwiseSquaredDiff::validate(const ITensorInfo 
*input1, const ITensorInfo *input2, const ITensorInfo *output) +{ + return kernels::CpuArithmeticKernel::validate(ArithmeticOperation::SQUARED_DIFF, input1, input2, output); +} + +void CpuElementwiseDivision::configure(const ITensorInfo *input1, const ITensorInfo *input2, ITensorInfo *output) +{ + auto k = std::make_unique<kernels::CpuDivisionKernel>(); + k->configure(input1, input2, output); + _kernel = std::move(k); +} + +Status CpuElementwiseDivision::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output) +{ + return kernels::CpuDivisionKernel::validate(input1, input2, output); +} + +void CpuElementwisePower::configure(const ITensorInfo *input1, const ITensorInfo *input2, ITensorInfo *output) +{ + auto k = std::make_unique<kernels::CpuPowerKernel>(); + k->configure(input1, input2, output); + _kernel = std::move(k); +} + +Status CpuElementwisePower::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output) +{ + return kernels::CpuPowerKernel::validate(input1, input2, output); +} + +template <ComparisonOperation COP> +void CpuElementwiseComparisonStatic<COP>::configure(const ITensorInfo *input1, const ITensorInfo *input2, ITensorInfo *output) +{ + auto k = std::make_unique<kernels::CpuComparisonKernel>(); + k->configure(COP, input1, input2, output); + _kernel = std::move(k); +} + +template <ComparisonOperation COP> +Status CpuElementwiseComparisonStatic<COP>::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output) +{ + return kernels::CpuComparisonKernel::validate(COP, input1, input2, output); +} + +void CpuElementwiseComparison::configure(const ITensorInfo *input1, const ITensorInfo *input2, ITensorInfo *output, ComparisonOperation op) +{ + auto k = std::make_unique<kernels::CpuComparisonKernel>(); + k->configure(op, input1, input2, output); + _kernel = std::move(k); +} + +Status CpuElementwiseComparison::validate(const ITensorInfo *input1, const ITensorInfo 
*input2, const ITensorInfo *output, ComparisonOperation op) +{ + return kernels::CpuComparisonKernel::validate(op, input1, input2, output); +} + +// Supported Specializations +template class CpuElementwiseComparisonStatic<ComparisonOperation::Equal>; +template class CpuElementwiseComparisonStatic<ComparisonOperation::NotEqual>; +template class CpuElementwiseComparisonStatic<ComparisonOperation::Greater>; +template class CpuElementwiseComparisonStatic<ComparisonOperation::GreaterEqual>; +template class CpuElementwiseComparisonStatic<ComparisonOperation::Less>; +template class CpuElementwiseComparisonStatic<ComparisonOperation::LessEqual>; +} // namespace cpu +} // namespace arm_compute
\ No newline at end of file diff --git a/src/runtime/cpu/operators/CpuElementwise.h b/src/runtime/cpu/operators/CpuElementwise.h new file mode 100644 index 0000000000..611a374c26 --- /dev/null +++ b/src/runtime/cpu/operators/CpuElementwise.h @@ -0,0 +1,234 @@ +/* + * Copyright (c) 2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef ARM_COMPUTE_CPU_ELEMENTWISE_H +#define ARM_COMPUTE_CPU_ELEMENTWISE_H + +#include "src/runtime/cpu/ICpuOperator.h" + +namespace arm_compute +{ +namespace cpu +{ +/** Basic function to run @ref cpu::kernels::CpuArithmeticKernel for max + * + * @note The tensor data type for the inputs must be QASYMM8/QASYMM8_SIGNED/S16/F16/S32/F32. + * @note The function performs a max operation between two tensors. + */ +class CpuElementwiseMax : public ICpuOperator +{ +public: + /** Initialise the kernel's inputs, output and conversion policy. 
+ * + * @param[in, out] input1 First tensor input info. Data types supported: QASYMM8/QASYMM8_SIGNED/S16/F16/S32/F32. + * @param[in, out] input2 Second tensor input info. Data types supported: Same as @p input1. + * @param[out] output Output tensor info. Data types supported: Same as @p input1. + */ + void configure(const ITensorInfo *input1, const ITensorInfo *input2, ITensorInfo *output); + /** Static function to check if given info will lead to a valid configuration of @ref cpu::kernels::CpuArithmeticKernel for max + * + * @param[in] input1 First tensor input info. Data types supported: QASYMM8/QASYMM8_SIGNED/S16/F16/S32/F32. + * @param[in] input2 Second tensor input info. Data types supported: Same as @p input1. + * @param[in] output Output tensor info. Data types supported: Same as @p input1. + * + * @return a status + */ + static Status validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output); +}; + +/** Basic function to run @ref cpu::kernels::CpuArithmeticKernel for min + * + * @note The tensor data type for the inputs must be QASYMM8/QASYMM8_SIGNED/S16/F16/S32/F32. + * @note The function performs a min operation between two tensors. + */ +class CpuElementwiseMin : public ICpuOperator +{ +public: + /** Initialise the kernel's inputs, output and conversion policy. + * + * @param[in, out] input1 First tensor input info. Data types supported: QASYMM8/QASYMM8_SIGNED/S16/F16/S32/F32. + * @param[in, out] input2 Second tensor input info. Data types supported: Same as @p input1. + * @param[out] output Output tensor info. Data types supported: Same as @p input1. + */ + void configure(const ITensorInfo *input1, const ITensorInfo *input2, ITensorInfo *output); + /** Static function to check if given info will lead to a valid configuration of @ref cpu::kernels::CpuArithmeticKernel for min + * + * @param[in] input1 First tensor input info. Data types supported: QASYMM8/QASYMM8_SIGNED/S16/F16/S32/F32. 
+ * @param[in] input2 Second tensor input info. Data types supported: Same as @p input1. + * @param[in] output Output tensor info. Data types supported: Same as @p input1. + * + * @return a status + */ + static Status validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output); +}; + +/** Basic function to run @ref cpu::kernels::CpuArithmeticKernel for squared difference + * + * @note The tensor data type for the inputs must be QASYMM8/QASYMM8_SIGNED/S16/F16/S32/F32. + * @note The function performs a squared difference operation between two tensors (i.e., out[i] = (in1[i] - in2[i])^2) + */ +class CpuElementwiseSquaredDiff : public ICpuOperator +{ +public: + /** Initialise the kernel's inputs, output and conversion policy. + * + * @param[in, out] input1 First tensor input info. Data types supported: QASYMM8/QASYMM8_SIGNED/S16/F16/S32/F32. + * @param[in, out] input2 Second tensor input info. Data types supported: Same as @p input1. + * @param[out] output Output tensor info. Data types supported: Same as @p input1. + */ + void configure(const ITensorInfo *input1, const ITensorInfo *input2, ITensorInfo *output); + /** Static function to check if given info will lead to a valid configuration of @ref cpu::kernels::CpuArithmeticKernel for squared difference + * + * @param[in] input1 First tensor input info. Data types supported: QASYMM8/QASYMM8_SIGNED/S16/F16/S32/F32. + * @param[in] input2 Second tensor input info. Data types supported: Same as @p input1. + * @param[in] output Output tensor info. Data types supported: Same as @p input1. + * + * @return a status + */ + static Status validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output); +}; + +/** Basic function to run @ref cpu::kernels::CpuArithmeticKernel for division + * + * @note The tensor data type for the inputs must be S32/F16/F32. 
+ * @note The function performs a division operation between two tensors (i.e., out[i] = in1[i] / in2[i]) + */ +class CpuElementwiseDivision : public ICpuOperator +{ +public: + /** Initialise the kernel's inputs, output and conversion policy. + * + * @param[in, out] input1 First tensor input info. Data types supported: S32/F16/F32. + * @param[in, out] input2 Second tensor input info. Data types supported: Same as @p input1. + * @param[out] output Output tensor info. Data types supported: Same as @p input1. + */ + void configure(const ITensorInfo *input1, const ITensorInfo *input2, ITensorInfo *output); + /** Static function to check if given info will lead to a valid configuration of @ref cpu::kernels::CpuArithmeticKernel for division + * + * @param[in] input1 First tensor input info. Data types supported: S32/F16/F32. + * @param[in] input2 Second tensor input info. Data types supported: Same as @p input1. + * @param[in] output Output tensor info. Data types supported: Same as @p input1. + * + * @return a status + */ + static Status validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output); +}; + +/** Basic function to run @ref cpu::kernels::CpuArithmeticKernel for power + * + * @note The tensor data type for the inputs must be F16/F32. + * @note The function performs an elementwise power of in1 to in2 (i.e., out[i] = in1[i] ^ in2[i]) + * @note For an exponent that is a float, this function will only work with a positive base. + */ +class CpuElementwisePower : public ICpuOperator +{ +public: + /** Initialise the kernel's inputs, output and conversion policy. + * + * @param[in, out] input1 First tensor input info. Data types supported: F16/F32. + * @param[in, out] input2 Second tensor input info. Data types supported: Same as @p input1. + * @param[out] output Output tensor info. Data types supported: Same as @p input1. 
+ */ + void configure(const ITensorInfo *input1, const ITensorInfo *input2, ITensorInfo *output); + /** Static function to check if given info will lead to a valid configuration of @ref cpu::kernels::CpuArithmeticKernel for power + * + * @param[in] input1 First tensor input info. Data types supported: F16/F32. + * @param[in] input2 Second tensor input info. Data types supported: Same as @p input1. + * @param[in] output Output tensor info. Data types supported: Same as @p input1. + * + * @return a status + */ + static Status validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output); +}; + +/** Basic function to run @ref cpu::kernels::CpuComparisonKernel. + * + * @note The tensor data type for the inputs must be QASYMM8/QASYMM8_SIGNED/S16/F16/S32/F32. + * @note The function performs a comparison operation between two tensors. + */ +class CpuElementwiseComparison : public ICpuOperator +{ +public: + /** Initialise the kernel's inputs, output and conversion policy. + * + * @param[in, out] input1 First tensor input info. Data types supported: QASYMM8/QASYMM8_SIGNED/S16/F16/S32/F32. + * @param[in, out] input2 Second tensor input info. Data types supported: Same as @p input1. + * @param[out] output Output tensor info. Data types supported: U16/U32. + * @param[in] op Comparison Operation to be performed. + */ + void configure(const ITensorInfo *input1, const ITensorInfo *input2, ITensorInfo *output, ComparisonOperation op); + /** Static function to check if given info will lead to a valid configuration of @ref cpu::kernels::CpuComparisonKernel + * + * @param[in] input1 First tensor input info. Data types supported: QASYMM8/QASYMM8_SIGNED/S16/F16/S32/F32. + * @param[in] input2 Second tensor input info. Data types supported: Same as @p input1. + * @param[in] output Output tensor info. Data types supported: U16/U32. + * @param[in] op Comparison Operation to be performed. 
+ * + * @return a status + */ + static Status validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, ComparisonOperation op); +}; + +/** Basic function to run @ref cpu::kernels::CpuComparisonKernel + * + * @note The tensor data type for the inputs must be QASYMM8/QASYMM8_SIGNED/S16/F16/S32/F32. + * @note The function performs a comparison operation between two tensors. + */ +template <ComparisonOperation op> +class CpuElementwiseComparisonStatic : public ICpuOperator +{ +public: + /** Initialise the kernel's inputs, output and conversion policy. + * + * @param[in, out] input1 First tensor input info. Data types supported: QASYMM8/QASYMM8_SIGNED/S16/F16/S32/F32. + * @param[in, out] input2 Second tensor input info. Data types supported: Same as @p input1. + * @param[out] output Output tensor info. Data types supported: U16/U32. + */ + void configure(const ITensorInfo *input1, const ITensorInfo *input2, ITensorInfo *output); + /** Static function to check if given info will lead to a valid configuration of @ref cpu::kernels::CpuComparisonKernel + * + * @param[in] input1 First tensor input info. Data types supported: QASYMM8/QASYMM8_SIGNED/S16/F16/S32/F32. + * @param[in] input2 Second tensor input info. Data types supported: Same as @p input1. + * @param[in] output Output tensor info. Data types supported: U16/U32. + * + * @return a status + */ + static Status validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output); +}; + +/** Basic function to run equal comparison. */ +using NEEqual = CpuElementwiseComparisonStatic<ComparisonOperation::Equal>; +/** Basic function to run not equal comparison. */ +using NENotEqual = CpuElementwiseComparisonStatic<ComparisonOperation::NotEqual>; +/** Basic function to run greater comparison. */ +using NEGreater = CpuElementwiseComparisonStatic<ComparisonOperation::Greater>; +/** Basic function to run greater-equal comparison. 
*/ +using NEGreaterEqual = CpuElementwiseComparisonStatic<ComparisonOperation::GreaterEqual>; +/** Basic function to run less comparison. */ +using NELess = CpuElementwiseComparisonStatic<ComparisonOperation::Less>; +/** Basic function to run less-equal comparison. */ +using NELessEqual = CpuElementwiseComparisonStatic<ComparisonOperation::LessEqual>; +} // namespace cpu +} // namespace arm_compute + +#endif /* ARM_COMPUTE_CPU_ELEMENTWISE_H */
\ No newline at end of file |