Diffstat (limited to 'src/core/cpu/kernels')
-rw-r--r--  src/core/cpu/kernels/CpuDepthwiseConv2dNativeKernel.cpp            (renamed from src/core/cpu/kernels/CpuDepthwiseConvolutionNativeKernel.cpp)  147
-rw-r--r--  src/core/cpu/kernels/CpuDepthwiseConv2dNativeKernel.h              (renamed from src/core/cpu/kernels/CpuDepthwiseConvolutionNativeKernel.h)     40
-rw-r--r--  src/core/cpu/kernels/CpuDirectConv2dKernel.cpp                     (renamed from src/core/cpu/kernels/CpuDirectConvolutionKernel.cpp)            16
-rw-r--r--  src/core/cpu/kernels/CpuDirectConv2dKernel.h                       (renamed from src/core/cpu/kernels/CpuDirectConvolutionKernel.h)              25
-rw-r--r--  src/core/cpu/kernels/CpuDirectConv2dOutputStageKernel.cpp          (renamed from src/core/cpu/kernels/CpuDirectConvolutionOutputStageKernel.cpp) 16
-rw-r--r--  src/core/cpu/kernels/CpuDirectConv2dOutputStageKernel.h            (renamed from src/core/cpu/kernels/CpuDirectConvolutionOutputStageKernel.h)   26
-rw-r--r--  src/core/cpu/kernels/CpuPool2dKernel.cpp                           (renamed from src/core/cpu/kernels/CpuPoolingKernel.cpp)                      14
-rw-r--r--  src/core/cpu/kernels/CpuPool2dKernel.h                             (renamed from src/core/cpu/kernels/CpuPoolingKernel.h)                        21
-rw-r--r--  src/core/cpu/kernels/internal/CpuPool2dAssemblyWrapperKernel.cpp   (renamed from src/core/cpu/kernels/CpuPoolingAssemblyWrapperKernel.cpp)       16
-rw-r--r--  src/core/cpu/kernels/internal/CpuPool2dAssemblyWrapperKernel.h     (renamed from src/core/cpu/kernels/CpuPoolingAssemblyWrapperKernel.h)         26
10 files changed, 159 insertions(+), 188 deletions(-)
diff --git a/src/core/cpu/kernels/CpuDepthwiseConvolutionNativeKernel.cpp b/src/core/cpu/kernels/CpuDepthwiseConv2dNativeKernel.cpp
index a5d1b61c08..4ddb35f2d5 100644
--- a/src/core/cpu/kernels/CpuDepthwiseConvolutionNativeKernel.cpp
+++ b/src/core/cpu/kernels/CpuDepthwiseConv2dNativeKernel.cpp
@@ -21,7 +21,7 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
-#include "src/core/cpu/kernels/CpuDepthwiseConvolutionNativeKernel.h"
+#include "src/core/cpu/kernels/CpuDepthwiseConv2dNativeKernel.h"
#include "arm_compute/core/ITensor.h"
#include "arm_compute/core/ITensorInfo.h"
@@ -74,7 +74,7 @@ struct DepthwiseConvolutionRunInfo
const size_t input_width;
const size_t input_depth;
- DepthwiseConvolutionRunInfo(const ITensorInfo &input, const ITensorInfo &weights, const PadStrideInfo &conv_info, const Window &w, uint32_t depth_multiplier = 1)
+ DepthwiseConvolutionRunInfo(const ITensorInfo &input, const ITensorInfo &weights, const PadStrideInfo &conv_info, const Window &w, uint32_t depth_multiplier = 1) // NOLINT
: num_read_elements_per_iteration((depth_multiplier == 1 ? (vector_size / element_size_from_data_type(input.data_type())) : 1)),
x_start(w.x().start()),
x_end(w.x().end()),
@@ -110,14 +110,14 @@ inline bool is_valid_input_region(int32_t base_w, uint32_t base_h, uint32_t w, u
}
template <typename T>
-void depthwise_loop_multiplier1_fp(const ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, const PadStrideInfo &conv_info,
+void depthwise_loop_multiplier1_fp(const ITensor *src, const ITensor *weights, const ITensor *biases, ITensor *dst, const PadStrideInfo &conv_info,
const Size2D &dilation, const Window &window, bool has_biases)
{
constexpr auto element_per_vector = vector_size / sizeof(T);
using VectorType = typename wrapper::traits::neon_vector<T, element_per_vector>::type;
using TagType = typename wrapper::traits::neon_vector<T, element_per_vector>::tag_type;
- const auto run_info = DepthwiseConvolutionRunInfo(*input->info(), *weights->info(), conv_info, window);
+ const auto run_info = DepthwiseConvolutionRunInfo(*src->info(), *weights->info(), conv_info, window);
const VectorType zero_vector = wrapper::vdup_n(static_cast<T>(0), TagType{});
@@ -135,9 +135,9 @@ void depthwise_loop_multiplier1_fp(const ITensor *input, const ITensor *weights,
Window win_output = window;
win_output.set(Window::DimX, dim_manual_loop);
- Iterator input_it(input, win_input);
+ Iterator input_it(src, win_input);
Iterator weights_it(weights, win_weights);
- Iterator output_it(output, win_output);
+ Iterator output_it(dst, win_output);
Iterator biases_it{};
if(has_biases)
@@ -224,10 +224,10 @@ void depthwise_loop_multiplier1_fp(const ITensor *input, const ITensor *weights,
}
template <typename T>
-void depthwise_loop_generic_fp(const ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, const PadStrideInfo &conv_info,
+void depthwise_loop_generic_fp(const ITensor *src, const ITensor *weights, const ITensor *biases, ITensor *dst, const PadStrideInfo &conv_info,
const Size2D &dilation, unsigned int depth_multiplier, const Window &window, bool has_biases)
{
- const auto run_info = DepthwiseConvolutionRunInfo(*input->info(), *weights->info(), conv_info, window, depth_multiplier);
+ const auto run_info = DepthwiseConvolutionRunInfo(*src->info(), *weights->info(), conv_info, window, depth_multiplier);
Window execution_window = window;
execution_window.set(Window::DimX, Window::Dimension(0, run_info.input_depth, 1));
@@ -246,9 +246,9 @@ void depthwise_loop_generic_fp(const ITensor *input, const ITensor *weights, con
Window win_output = window;
win_output.set_dimension_step(Window::DimX, run_info.x_step);
- Iterator input_it(input, win_input);
+ Iterator input_it(src, win_input);
Iterator weights_it(weights, win_weights);
- Iterator output_it(output, win_output);
+ Iterator output_it(dst, win_output);
Iterator biases_it{};
if(has_biases)
@@ -306,23 +306,24 @@ void depthwise_loop_generic_fp(const ITensor *input, const ITensor *weights, con
}
template <typename T, typename TW>
-void depthwise_loop_multiplier1_quantized(const ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, const PadStrideInfo &conv_info,
- const Size2D &dilation, std::vector<int> output_multiplier, std::vector<int> output_shift, const Window &window, bool has_biases)
+void depthwise_loop_multiplier1_quantized(const ITensor *src, const ITensor *weights, const ITensor *biases, ITensor *dst, const PadStrideInfo &conv_info,
+ const Size2D &dilation, std::vector<int> output_multiplier, std::vector<int> output_shift, const Window &window, bool has_biases) // NOLINT
{
+ ARM_COMPUTE_UNUSED(output_multiplier, output_shift);
constexpr auto element_per_vector = vector_size / sizeof(T);
using VectorType = typename wrapper::traits::neon_vector<T, element_per_vector>::type;
using TagType = typename wrapper::traits::neon_vector<T, element_per_vector>::tag_type;
using AccType = int32_t;
using AccArrayType = std::array<AccType, element_per_vector>;
- const auto out_of_bound_value = PixelValue(static_cast<uint64_t>(0), input->info()->data_type(), input->info()->quantization_info()).get<T>();
+ const auto out_of_bound_value = PixelValue(static_cast<uint64_t>(0), src->info()->data_type(), src->info()->quantization_info()).get<T>();
const auto out_of_bound_vector = wrapper::vdup_n(static_cast<T>(out_of_bound_value), TagType{});
- const auto run_info = DepthwiseConvolutionRunInfo(*input->info(), *weights->info(), conv_info, window);
+ const auto run_info = DepthwiseConvolutionRunInfo(*src->info(), *weights->info(), conv_info, window);
- const int32_t input_qoffset = input->info()->quantization_info().uniform().offset;
+ const int32_t input_qoffset = src->info()->quantization_info().uniform().offset;
const int32_t weights_qoffset = weights->info()->quantization_info().uniform().offset;
- const int32_t output_qoffset = output->info()->quantization_info().uniform().offset;
+ const int32_t output_qoffset = dst->info()->quantization_info().uniform().offset;
const int32_t k_offset = run_info.weights_width * run_info.weights_height * input_qoffset * weights_qoffset;
Window execution_window = window;
@@ -339,9 +340,9 @@ void depthwise_loop_multiplier1_quantized(const ITensor *input, const ITensor *w
Window win_output = window;
win_output.set(Window::DimX, dim_manual_loop);
- Iterator input_it(input, win_input);
+ Iterator input_it(src, win_input);
Iterator weights_it(weights, win_weights);
- Iterator output_it(output, win_output);
+ Iterator output_it(dst, win_output);
Iterator biases_it{};
if(has_biases)
@@ -482,18 +483,18 @@ void depthwise_loop_multiplier1_quantized(const ITensor *input, const ITensor *w
}
template <typename T, typename TW>
-void depthwise_loop_generic_quantized(const ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, const PadStrideInfo &conv_info,
- const Size2D &dilation, unsigned int depth_multiplier, std::vector<int> output_multiplier, std::vector<int> output_shift, const Window &window, bool has_biases)
+void depthwise_loop_generic_quantized(const ITensor *src, const ITensor *weights, const ITensor *biases, ITensor *dst, const PadStrideInfo &conv_info,
+ const Size2D &dilation, unsigned int depth_multiplier, std::vector<int> output_multiplier, std::vector<int> output_shift, const Window &window, bool has_biases) // NOLINT
{
using AccType = int32_t;
- const auto run_info = DepthwiseConvolutionRunInfo(*input->info(), *weights->info(), conv_info, window, depth_multiplier);
+ const auto run_info = DepthwiseConvolutionRunInfo(*src->info(), *weights->info(), conv_info, window, depth_multiplier);
- const auto out_of_bound_value = PixelValue(static_cast<uint64_t>(0), input->info()->data_type(), input->info()->quantization_info()).get<T>();
+ const auto out_of_bound_value = PixelValue(static_cast<uint64_t>(0), src->info()->data_type(), src->info()->quantization_info()).get<T>();
- const int32_t input_qoffset = input->info()->quantization_info().uniform().offset;
+ const int32_t input_qoffset = src->info()->quantization_info().uniform().offset;
const int32_t weights_qoffset = weights->info()->quantization_info().uniform().offset;
- const int32_t output_qoffset = output->info()->quantization_info().uniform().offset;
+ const int32_t output_qoffset = dst->info()->quantization_info().uniform().offset;
const int32_t k_offset = run_info.weights_width * run_info.weights_height * input_qoffset * weights_qoffset;
Window execution_window = window;
@@ -512,9 +513,9 @@ void depthwise_loop_generic_quantized(const ITensor *input, const ITensor *weigh
Window win_output = window;
win_output.set_dimension_step(Window::DimX, run_info.x_step);
- Iterator input_it(input, win_input);
+ Iterator input_it(src, win_input);
Iterator weights_it(weights, win_weights);
- Iterator output_it(output, win_output);
+ Iterator output_it(dst, win_output);
Iterator biases_it{};
if(has_biases)
@@ -585,8 +586,8 @@ void depthwise_loop_generic_quantized(const ITensor *input, const ITensor *weigh
}
template <typename T, typename TW>
-void depthwise_loop_pow2_quantized_per_tensor(const ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, const PadStrideInfo &conv_info,
- const Size2D &dilation, unsigned int depth_multiplier, std::vector<int> output_multiplier, std::vector<int> output_shift, const Window &window, bool has_biases)
+void depthwise_loop_pow2_quantized_per_tensor(const ITensor *src, const ITensor *weights, const ITensor *biases, ITensor *dst, const PadStrideInfo &conv_info,
+ const Size2D &dilation, unsigned int depth_multiplier, std::vector<int> output_multiplier, std::vector<int> output_shift, const Window &window, bool has_biases) // NOLINT
{
constexpr int half_vec = vector_size / 2;
@@ -595,11 +596,11 @@ void depthwise_loop_pow2_quantized_per_tensor(const ITensor *input, const ITenso
using AccVectorTagType = typename wrapper::traits::neon_vector<AccType, half_vec>::tag_type;
using TagType = typename wrapper::traits::neon_vector<T, vector_size>::tag_type;
- const auto run_info = DepthwiseConvolutionRunInfo(*input->info(), *weights->info(), conv_info, window, depth_multiplier);
+ const auto run_info = DepthwiseConvolutionRunInfo(*src->info(), *weights->info(), conv_info, window, depth_multiplier);
- const auto input_qoffset_vec = wrapper::vreinterpret(wrapper::vmovl(wrapper::vdup_n(static_cast<T>(input->info()->quantization_info().uniform().offset), TagType{})));
+ const auto input_qoffset_vec = wrapper::vreinterpret(wrapper::vmovl(wrapper::vdup_n(static_cast<T>(src->info()->quantization_info().uniform().offset), TagType{})));
const auto weights_qoffset_vec = wrapper::vreinterpret(wrapper::vmovl(wrapper::vdup_n(static_cast<TW>(weights->info()->quantization_info().uniform().offset), TagType{})));
- const auto output_qoffset_vec = wrapper::vdup_n(output->info()->quantization_info().uniform().offset, arm_compute::wrapper::traits::vector_128_tag{});
+ const auto output_qoffset_vec = wrapper::vdup_n(dst->info()->quantization_info().uniform().offset, arm_compute::wrapper::traits::vector_128_tag{});
const auto lower = wrapper::vdup_n(static_cast<AccType>(std::numeric_limits<T>::lowest()), AccVectorTagType{});
const auto upper = wrapper::vdup_n(static_cast<AccType>(std::numeric_limits<T>::max()), AccVectorTagType{});
@@ -624,9 +625,9 @@ void depthwise_loop_pow2_quantized_per_tensor(const ITensor *input, const ITenso
Window win_output = window;
win_output.set_dimension_step(Window::DimX, run_info.x_step);
- Iterator input_it(input, win_input);
+ Iterator input_it(src, win_input);
Iterator weights_it(weights, win_weights);
- Iterator output_it(output, win_output);
+ Iterator output_it(dst, win_output);
Iterator biases_it{};
if(has_biases)
@@ -722,16 +723,16 @@ void depthwise_loop_pow2_quantized_per_tensor(const ITensor *input, const ITenso
input_it, weights_it, biases_it, output_it);
}
-Status validate_arguments(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const ConvolutionInfo &info)
+Status validate_arguments(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst, const ConvolutionInfo &info)
{
- ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weights, output);
- ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(input);
- ARM_COMPUTE_RETURN_ERROR_ON(input->data_layout() == DataLayout::UNKNOWN);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::F16, DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, weights, dst);
+ ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(src);
+ ARM_COMPUTE_RETURN_ERROR_ON(src->data_layout() == DataLayout::UNKNOWN);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::F16, DataType::F32);
ARM_COMPUTE_RETURN_ERROR_ON(info.depth_multiplier == 0);
- ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(1) + (weights->dimension(1) - 1) * (info.dilation.x() - 1) > input->dimension(1) + info.pad_stride_info.pad_left() + info.pad_stride_info.pad_right());
- ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(2) + (weights->dimension(2) - 1) * (info.dilation.y() - 1) > input->dimension(2) + info.pad_stride_info.pad_top() + info.pad_stride_info.pad_bottom());
- ARM_COMPUTE_RETURN_ERROR_ON((input->dimension(0) * info.depth_multiplier) != weights->dimension(0));
+ ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(1) + (weights->dimension(1) - 1) * (info.dilation.x() - 1) > src->dimension(1) + info.pad_stride_info.pad_left() + info.pad_stride_info.pad_right());
+ ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(2) + (weights->dimension(2) - 1) * (info.dilation.y() - 1) > src->dimension(2) + info.pad_stride_info.pad_top() + info.pad_stride_info.pad_bottom());
+ ARM_COMPUTE_RETURN_ERROR_ON((src->dimension(0) * info.depth_multiplier) != weights->dimension(0));
ARM_COMPUTE_RETURN_ERROR_ON((info.dilation.x() < 1) || (info.dilation.y() < 1));
ARM_COMPUTE_RETURN_ERROR_ON((info.pad_stride_info.stride().first < 1) || (info.pad_stride_info.stride().second < 1));
@@ -742,7 +743,7 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *weights,
}
else
{
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, weights);
}
if(biases != nullptr)
@@ -750,7 +751,7 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *weights,
ARM_COMPUTE_RETURN_ERROR_ON(biases->num_dimensions() > 1);
ARM_COMPUTE_RETURN_ERROR_ON(biases->dimension(0) != weights->dimension(0));
- if(is_data_type_quantized_asymmetric(input->data_type()))
+ if(is_data_type_quantized_asymmetric(src->data_type()))
{
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(biases, 1, DataType::S32);
}
@@ -760,36 +761,36 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *weights,
}
}
- if(output->total_size() != 0)
+ if(dst->total_size() != 0)
{
- const TensorShape output_shape = misc::shape_calculator::compute_depthwise_convolution_shape(*input, *weights, info);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output->tensor_shape(), output_shape);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+ const TensorShape output_shape = misc::shape_calculator::compute_depthwise_convolution_shape(*src, *weights, info);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(dst->tensor_shape(), output_shape);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst);
}
return Status{};
}
} // namespace
-CpuDepthwiseConvolutionNativeKernel::CpuDepthwiseConvolutionNativeKernel()
+CpuDepthwiseConv2dNativeKernel::CpuDepthwiseConv2dNativeKernel()
: _func(), _conv_info(), _depth_multiplier(1), _dilation(), _output_multiplier(), _output_shift(), _has_biases()
{
}
-void CpuDepthwiseConvolutionNativeKernel::configure(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, ITensorInfo *output, const ConvolutionInfo &info)
+void CpuDepthwiseConv2dNativeKernel::configure(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, ITensorInfo *dst, const ConvolutionInfo &info)
{
- ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output);
- ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input, weights, (biases != nullptr) ? biases : nullptr, output, info));
+ ARM_COMPUTE_ERROR_ON_NULLPTR(src, weights, dst);
+ ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, weights, (biases != nullptr) ? biases : nullptr, dst, info));
_conv_info = info.pad_stride_info;
_depth_multiplier = info.depth_multiplier;
_dilation = info.dilation;
_has_biases = (biases != nullptr);
- if(is_data_type_quantized(input->data_type()))
+ if(is_data_type_quantized(src->data_type()))
{
- const auto input_scale = input->quantization_info().uniform().scale;
- const auto output_scale = output->quantization_info().uniform().scale;
+ const auto input_scale = src->quantization_info().uniform().scale;
+ const auto output_scale = dst->quantization_info().uniform().scale;
auto weights_scale = weights->quantization_info().scale();
if(!is_data_type_quantized_per_channel(weights->data_type()))
@@ -815,50 +816,50 @@ void CpuDepthwiseConvolutionNativeKernel::configure(const ITensorInfo *input, co
switch(weights->data_type())
{
case DataType::QASYMM8:
- _func = &CpuDepthwiseConvolutionNativeKernel::run_depthwise<uint8_t, uint8_t>;
+ _func = &CpuDepthwiseConv2dNativeKernel::run_depthwise<uint8_t, uint8_t>;
break;
case DataType::QASYMM8_SIGNED:
- _func = &CpuDepthwiseConvolutionNativeKernel::run_depthwise<int8_t, int8_t>;
+ _func = &CpuDepthwiseConv2dNativeKernel::run_depthwise<int8_t, int8_t>;
break;
case DataType::QSYMM8_PER_CHANNEL:
- if(input->data_type() == DataType::QASYMM8)
+ if(src->data_type() == DataType::QASYMM8)
{
- _func = &CpuDepthwiseConvolutionNativeKernel::run_depthwise<uint8_t, int8_t>;
+ _func = &CpuDepthwiseConv2dNativeKernel::run_depthwise<uint8_t, int8_t>;
}
else
{
- _func = &CpuDepthwiseConvolutionNativeKernel::run_depthwise<int8_t, int8_t>;
+ _func = &CpuDepthwiseConv2dNativeKernel::run_depthwise<int8_t, int8_t>;
}
break;
#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
case DataType::F16:
- _func = &CpuDepthwiseConvolutionNativeKernel::run_depthwise<float16_t, float16_t>;
+ _func = &CpuDepthwiseConv2dNativeKernel::run_depthwise<float16_t, float16_t>;
break;
#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
case DataType::F32:
- _func = &CpuDepthwiseConvolutionNativeKernel::run_depthwise<float, float>;
+ _func = &CpuDepthwiseConv2dNativeKernel::run_depthwise<float, float>;
break;
default:
ARM_COMPUTE_ERROR("Data type not supported");
break;
}
- const TensorShape output_shape = misc::shape_calculator::compute_depthwise_convolution_shape(*input, *weights, info);
- auto_init_if_empty(*output, input->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(output_shape).set_quantization_info(output->quantization_info()));
+ const TensorShape output_shape = misc::shape_calculator::compute_depthwise_convolution_shape(*src, *weights, info);
+ auto_init_if_empty(*dst, src->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(output_shape).set_quantization_info(dst->quantization_info()));
- Window win = calculate_max_window(*output, Steps());
+ Window win = calculate_max_window(*dst, Steps());
ICpuKernel::configure(win);
}
-Status CpuDepthwiseConvolutionNativeKernel::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const ConvolutionInfo &info)
+Status CpuDepthwiseConv2dNativeKernel::validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst, const ConvolutionInfo &info)
{
- ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, weights, biases, output, info));
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, weights, biases, dst, info));
return Status{};
}
-template <typename T, typename TW, CpuDepthwiseConvolutionNativeKernel::FloatEnalber<T>>
-void CpuDepthwiseConvolutionNativeKernel::run_depthwise(const ITensor *src, const ITensor *weights, const ITensor *biases,
- ITensor *dst, const Window &window, bool has_biases)
+template <typename T, typename TW, CpuDepthwiseConv2dNativeKernel::FloatEnalber<T>>
+void CpuDepthwiseConv2dNativeKernel::run_depthwise(const ITensor *src, const ITensor *weights, const ITensor *biases,
+ ITensor *dst, const Window &window, bool has_biases)
{
ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICpuKernel::window(), window);
@@ -873,9 +874,9 @@ void CpuDepthwiseConvolutionNativeKernel::run_depthwise(const ITensor *src, cons
}
}
-template <typename T, typename TW, CpuDepthwiseConvolutionNativeKernel::Quantized8bitEnalber<T>>
-void CpuDepthwiseConvolutionNativeKernel::run_depthwise(const ITensor *src, const ITensor *weights, const ITensor *biases,
- ITensor *dst, const Window &window, bool has_biases)
+template <typename T, typename TW, CpuDepthwiseConv2dNativeKernel::Quantized8bitEnalber<T>>
+void CpuDepthwiseConv2dNativeKernel::run_depthwise(const ITensor *src, const ITensor *weights, const ITensor *biases,
+ ITensor *dst, const Window &window, bool has_biases)
{
ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICpuKernel::window(), window);
@@ -900,7 +901,7 @@ void CpuDepthwiseConvolutionNativeKernel::run_depthwise(const ITensor *src, cons
}
}
-void CpuDepthwiseConvolutionNativeKernel::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info)
+void CpuDepthwiseConv2dNativeKernel::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info)
{
ARM_COMPUTE_UNUSED(info);
ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
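The quantized branch of configure() above folds input_scale * weights_scale / output_scale into the _output_multiplier/_output_shift vectors consumed by the loops. A minimal, self-contained sketch of the gemmlowp-style decomposition this relies on (the library's actual helper lives in arm_compute::quantization and may differ in name and signature):

#include <cmath>
#include <cstdint>

// Decompose a rescale factor m, assumed to lie in (0, 1), into a Q0.31
// fixed-point multiplier and a right shift such that
// m ~= quantized * 2^-31 * 2^-right_shift.
void quantize_multiplier_smaller_than_one(float m, int32_t &quantized, int32_t &right_shift)
{
    int exponent = 0;
    const double fraction = std::frexp(m, &exponent); // m = fraction * 2^exponent, fraction in [0.5, 1)
    auto q_fixed = static_cast<int64_t>(std::lround(fraction * (1LL << 31)));
    if(q_fixed == (1LL << 31)) // rounding carried into 2^31: renormalise
    {
        q_fixed /= 2;
        ++exponent;
    }
    quantized   = static_cast<int32_t>(q_fixed);
    right_shift = -exponent; // non-negative whenever m < 1
}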
diff --git a/src/core/cpu/kernels/CpuDepthwiseConvolutionNativeKernel.h b/src/core/cpu/kernels/CpuDepthwiseConv2dNativeKernel.h
index 242536d441..559c46dc93 100644
--- a/src/core/cpu/kernels/CpuDepthwiseConvolutionNativeKernel.h
+++ b/src/core/cpu/kernels/CpuDepthwiseConv2dNativeKernel.h
@@ -21,8 +21,8 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
-#ifndef ARM_COMPUTE_CPU_DEPTHWISECONVOLUTIONNATIVEKERNEL_H
-#define ARM_COMPUTE_CPU_DEPTHWISECONVOLUTIONNATIVEKERNEL_H
+#ifndef ARM_COMPUTE_CPU_DEPTHWISECONV2DNATIVEKERNEL_H
+#define ARM_COMPUTE_CPU_DEPTHWISECONV2DNATIVEKERNEL_H
#include "arm_compute/core/utils/misc/Traits.h"
#include "src/core/common/Macros.h"
@@ -40,46 +40,38 @@ namespace cpu
namespace kernels
{
/** Interface for the kernel to run a depthwise convolution native on a tensor. */
-class CpuDepthwiseConvolutionNativeKernel : public ICpuKernel
+class CpuDepthwiseConv2dNativeKernel : public ICpuKernel
{
public:
const char *name() const override
{
- return "CpuDepthwiseConvolutionNativeKernel";
+ return "CpuDepthwiseConv2dNativeKernel";
}
/** Default constructor */
- CpuDepthwiseConvolutionNativeKernel();
- ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(CpuDepthwiseConvolutionNativeKernel);
+ CpuDepthwiseConv2dNativeKernel();
+ ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(CpuDepthwiseConv2dNativeKernel);
/** Initialize the function's source, destination and parameters.
*
* @note Supported data layouts: NHWC
*
- * @param[in] input Source tensor. DataType supported: QASYMM8/QASYMM8_SIGNED/F16/F32.
+ * @param[in] src Source tensor. DataType supported: QASYMM8/QASYMM8_SIGNED/F16/F32.
* @param[in] weights Weights tensor. This is a 3D tensor with dimensions [IFM, W, H].
- * Data type supported: Same as @p input or QASYMM8/QASYMM8_SIGNED/QSYMM8_PER_CHANNEL when @p input is QASYMM8/QASYMM8_SIGNED.
+ * Data type supported: Same as @p src or QASYMM8/QASYMM8_SIGNED/QSYMM8_PER_CHANNEL when @p src is QASYMM8/QASYMM8_SIGNED.
* @param[in] biases Biases tensor. A 1D tensor with dimensions [IFM]. Must be nullptr if not needed.
- * Data type supported: Same as @p input, S32 when input is QASYMM8/QASYMM8_SIGNED.
- * @param[out] output Destination tensor. Data type supported: Same as @p input.
+ * Data type supported: Same as @p src, S32 when src is QASYMM8/QASYMM8_SIGNED.
+ * @param[out] dst Destination tensor. Data type supported: Same as @p src.
* @param[in] info Depthwise convolution meta-data.
*
*/
- void configure(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, ITensorInfo *output, const ConvolutionInfo &info);
- /** Static function to check if given info will lead to a valid configuration of @ref CpuDepthwiseConvolutionNativeKernel
+ void configure(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, ITensorInfo *dst, const ConvolutionInfo &info);
+ /** Static function to check if given info will lead to a valid configuration
*
- * @note Supported data layouts: NHWC
- *
- * @param[in] input Source tensor info. DataType supported: QASYMM8/QASYMM8_SIGNED/F16/F32.
- * @param[in] weights Weights tensor info. This is a 3D tensor with dimensions [IFM, W, H].
- * Data type supported: Same as @p input or QASYMM8/QASYMM8_SIGNED/QSYMM8_PER_CHANNEL when @p input is QASYMM8/QASYMM8_SIGNED.
- * @param[in] biases Biases tensor info. A 1D tensor with dimensions [IFM]. Must be nullptr if not needed.
- * Data type supported: Same as @p input, S32 when input is QASYMM8/QASYMM8_SIGNED.
- * @param[in] output Destination tensor info. Data type supported: Same as @p input.
- * @param[in] info Depthwise convolution meta-data.
+ * Similar to CpuDepthwiseConv2dNativeKernel::configure()
*
* @return a status
*/
- static Status validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const ConvolutionInfo &info);
+ static Status validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst, const ConvolutionInfo &info);
// Inherited methods overridden:
void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override;
@@ -101,7 +93,7 @@ private:
*
* @param[in] window Region on which to execute the kernel.
*/
- using DepthwiseFunctionPtr = void (CpuDepthwiseConvolutionNativeKernel::*)(const ITensor *src, const ITensor *weights, const ITensor *bias, ITensor *dst, const Window &window, bool has_biases);
+ using DepthwiseFunctionPtr = void (CpuDepthwiseConv2dNativeKernel::*)(const ITensor *src, const ITensor *weights, const ITensor *bias, ITensor *dst, const Window &window, bool has_biases);
DepthwiseFunctionPtr _func;
PadStrideInfo _conv_info;
@@ -114,4 +106,4 @@ private:
} // namespace kernels
} // namespace cpu
} // namespace arm_compute
-#endif /* ARM_COMPUTE_CPU_DEPTHWISECONVOLUTIONNATIVEKERNEL_H */
+#endif /* ARM_COMPUTE_CPU_DEPTHWISECONV2DNATIVEKERNEL_H */
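Putting the doxygen above into practice, a hypothetical call site for the renamed kernel might look as follows; the shapes, the NHWC set-up and the ConvolutionInfo field order are illustrative assumptions, not taken from this patch:

using namespace arm_compute;

// 3x3 depthwise convolution over a 64-channel NHWC F32 tensor. dst is left
// empty so that configure() can auto-initialise it.
TensorInfo src(TensorShape(64U, 16U, 16U), 1, DataType::F32);
TensorInfo weights(TensorShape(64U, 3U, 3U), 1, DataType::F32);
TensorInfo bias(TensorShape(64U), 1, DataType::F32);
TensorInfo dst{};
src.set_data_layout(DataLayout::NHWC);
weights.set_data_layout(DataLayout::NHWC);

const ConvolutionInfo info{ PadStrideInfo(1, 1, 1, 1), 1 /* depth_multiplier */, ActivationLayerInfo(), Size2D(1U, 1U) };

cpu::kernels::CpuDepthwiseConv2dNativeKernel dwc;
ARM_COMPUTE_ERROR_THROW_ON(cpu::kernels::CpuDepthwiseConv2dNativeKernel::validate(&src, &weights, &bias, &dst, info));
dwc.configure(&src, &weights, &bias, &dst, info);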
diff --git a/src/core/cpu/kernels/CpuDirectConvolutionKernel.cpp b/src/core/cpu/kernels/CpuDirectConv2dKernel.cpp
index 4f46eb2bf6..c0fc41525e 100644
--- a/src/core/cpu/kernels/CpuDirectConvolutionKernel.cpp
+++ b/src/core/cpu/kernels/CpuDirectConv2dKernel.cpp
@@ -21,7 +21,7 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
-#include "src/core/cpu/kernels/CpuDirectConvolutionKernel.h"
+#include "src/core/cpu/kernels/CpuDirectConv2dKernel.h"
#include "src/core/NEON/kernels/detail/NEDirectConvolutionDetail.h"
#include "src/core/NEON/wrapper/wrapper.h"
@@ -995,7 +995,7 @@ bool have_zero_x_internal_padding(ITensorInfo *src, ITensorInfo *weights)
} // namespace
template <typename T>
-void CpuDirectConvolutionKernel::convolve_nhwc_optimized(const Window &window, const ITensor *src, const ITensor *weights, ITensor *dst)
+void CpuDirectConv2dKernel::convolve_nhwc_optimized(const Window &window, const ITensor *src, const ITensor *weights, ITensor *dst)
{
// This function assumes that input and weights have not padding in channel
@@ -1116,7 +1116,7 @@ void CpuDirectConvolutionKernel::convolve_nhwc_optimized(const Window &window, c
}
template <typename T>
-void CpuDirectConvolutionKernel::convolve_nhwc(const Window &window, const ITensor *src, const ITensor *weights, ITensor *dst)
+void CpuDirectConv2dKernel::convolve_nhwc(const Window &window, const ITensor *src, const ITensor *weights, ITensor *dst)
{
// Declare useful types
using vtype = wrapper::traits::neon_bitvector<T, wrapper::traits::BitWidth::W128>;
@@ -1219,12 +1219,12 @@ void CpuDirectConvolutionKernel::convolve_nhwc(const Window &window, const ITens
out);
}
-BorderSize CpuDirectConvolutionKernel::border_size() const
+BorderSize CpuDirectConv2dKernel::border_size() const
{
return _border_size;
}
-void CpuDirectConvolutionKernel::configure(ITensorInfo *src, ITensorInfo *weights, ITensorInfo *dst, const PadStrideInfo &conv_info)
+void CpuDirectConv2dKernel::configure(ITensorInfo *src, ITensorInfo *weights, ITensorInfo *dst, const PadStrideInfo &conv_info)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(src, weights, dst);
@@ -1263,7 +1263,7 @@ void CpuDirectConvolutionKernel::configure(ITensorInfo *src, ITensorInfo *weight
ICpuKernel::configure(win_config.second);
}
-Status CpuDirectConvolutionKernel::validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *dst, const PadStrideInfo &conv_info)
+Status CpuDirectConv2dKernel::validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *dst, const PadStrideInfo &conv_info)
{
unsigned int num_weight_elems_read_per_row = 0;
unsigned int num_elems_read_per_iteration = 0;
@@ -1283,7 +1283,7 @@ Status CpuDirectConvolutionKernel::validate(const ITensorInfo *src, const ITenso
return Status{};
}
-void CpuDirectConvolutionKernel::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info)
+void CpuDirectConv2dKernel::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info)
{
ARM_COMPUTE_UNUSED(info);
ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
@@ -1376,7 +1376,7 @@ void CpuDirectConvolutionKernel::run_op(ITensorPack &tensors, const Window &wind
}
}
}
-const char *CpuDirectConvolutionKernel::name() const
+const char *CpuDirectConv2dKernel::name() const
{
return "CpuDirectConvolutionLayerKernel";
}
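The two NHWC code paths renamed above are picked at run time from have_zero_x_internal_padding(); a reconstructed sketch of that dispatch for an F32 workload (control flow inferred from the hunks, not copied from the patch):

// The optimized path is only legal when neither src nor weights carry
// internal padding along the channel (x) dimension, since it consumes
// channels as one contiguous block.
if(have_zero_x_internal_padding(src->info(), weights->info()))
{
    convolve_nhwc_optimized<float>(window, src, weights, dst);
}
else
{
    convolve_nhwc<float>(window, src, weights, dst);
}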
diff --git a/src/core/cpu/kernels/CpuDirectConvolutionKernel.h b/src/core/cpu/kernels/CpuDirectConv2dKernel.h
index fb8218394b..62ed96f255 100644
--- a/src/core/cpu/kernels/CpuDirectConvolutionKernel.h
+++ b/src/core/cpu/kernels/CpuDirectConv2dKernel.h
@@ -21,8 +21,8 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
-#ifndef ARM_COMPUTE_CPU_DIRECTCONVOLUTION_KERNEL_H
-#define ARM_COMPUTE_CPU_DIRECTCONVOLUTION_KERNEL_H
+#ifndef ARM_COMPUTE_CPU_DIRECTCONV2D_KERNEL_H
+#define ARM_COMPUTE_CPU_DIRECTCONV2D_KERNEL_H
#include "src/core/common/Macros.h"
#include "src/core/cpu/ICpuKernel.h"
@@ -35,13 +35,13 @@ namespace cpu
namespace kernels
{
/** Interface for the kernel to perform Direct Convolution Layer. */
-class CpuDirectConvolutionKernel : public ICpuKernel
+class CpuDirectConv2dKernel : public ICpuKernel
{
public:
/** Default constructor */
- CpuDirectConvolutionKernel() = default;
- ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(CpuDirectConvolutionKernel);
- /** Set the input, weights, and output tensors.
+ CpuDirectConv2dKernel() = default;
+ ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(CpuDirectConv2dKernel);
+ /** Set the src, weights, and dst tensors.
*
* @note: DirectConvolution only works in the following configurations:
* 1x1 convolution with stride_x = 1/2/3, stride_y = 1/2/3
@@ -57,16 +57,9 @@ public:
* @param[in] conv_info Contains padding and stride information described in @ref PadStrideInfo.
*/
void configure(ITensorInfo *src, ITensorInfo *weights, ITensorInfo *dst, const PadStrideInfo &conv_info);
- /** Static function to check if given info will lead to a valid configuration of @ref CpuDirectConvolutionKernel
+ /** Static function to check if given info will lead to a valid configuration
*
- * @param[in] src The input tensor to convolve. 3 lower dimensions represent a single input [width, height, IFM],
- * while every optional dimension from 4 and above represent a batch of inputs. Data types supported: F16/F32.
- * @param[in] weights Weights tensor. Weights are 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM].
- * The 3rd dimension must be the same as the input's volume 3rd dimension.
- * Data type supported:Same as @p input.
- * @param[in] dst Output tensor.
- * The 3rd dimensions must be equal to the 4th dimension of the @p kernels tensor. Data types supported: F16/F32
- * @param[in] conv_info Contains padding and stride information described in @ref PadStrideInfo.
+ * Similar to CpuDirectConv2dKernel::configure()
*
* @return a status
*/
@@ -97,4 +90,4 @@ private:
} // namespace kernels
} // namespace cpu
} // namespace arm_compute
-#endif /*ARM_COMPUTE_CPU_DIRECTCONVOLUTION_KERNEL_H */
+#endif /*ARM_COMPUTE_CPU_DIRECTCONV2D_KERNEL_H */
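A hypothetical validate-then-configure sequence for the renamed direct convolution kernel, here a 1x1 stride-1 F32 case; the shapes and set-up are illustrative assumptions:

using namespace arm_compute;

// src is [width, height, IFM]; weights are [kernel_x, kernel_y, IFM, OFM];
// dst is auto-initialised by configure().
TensorInfo src(TensorShape(32U, 32U, 16U), 1, DataType::F32);
TensorInfo weights(TensorShape(1U, 1U, 16U, 8U), 1, DataType::F32);
TensorInfo dst{};

const PadStrideInfo conv_info(1, 1, 0, 0);

cpu::kernels::CpuDirectConv2dKernel conv;
if(cpu::kernels::CpuDirectConv2dKernel::validate(&src, &weights, &dst, conv_info))
{
    conv.configure(&src, &weights, &dst, conv_info);
}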
diff --git a/src/core/cpu/kernels/CpuDirectConvolutionOutputStageKernel.cpp b/src/core/cpu/kernels/CpuDirectConv2dOutputStageKernel.cpp
index 5f7a574e5a..662d052941 100644
--- a/src/core/cpu/kernels/CpuDirectConvolutionOutputStageKernel.cpp
+++ b/src/core/cpu/kernels/CpuDirectConv2dOutputStageKernel.cpp
@@ -21,7 +21,7 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
-#include "src/core/cpu/kernels/CpuDirectConvolutionOutputStageKernel.h"
+#include "src/core/cpu/kernels/CpuDirectConv2dOutputStageKernel.h"
#include "arm_compute/core/Error.h"
#include "arm_compute/core/Helpers.h"
@@ -384,8 +384,8 @@ void output_stage_nhwc(ITensor *src, const ITensor *bias, const Window &window,
}
} // namespace
-void CpuDirectConvolutionOutputStageKernel::configure(ITensorInfo *src, const ITensorInfo *bias, ITensorInfo *dst,
- const DirectConvolutionLayerOutputStageKernelInfo &info)
+void CpuDirectConv2dOutputStageKernel::configure(ITensorInfo *src, const ITensorInfo *bias, ITensorInfo *dst,
+ const DirectConvolutionLayerOutputStageKernelInfo &info)
{
ARM_COMPUTE_UNUSED(bias);
// Perform validation step
@@ -483,14 +483,14 @@ void CpuDirectConvolutionOutputStageKernel::configure(ITensorInfo *src, const IT
}
}
-Status CpuDirectConvolutionOutputStageKernel::validate(const ITensorInfo *src, const ITensorInfo *bias, const ITensorInfo *dst,
- const DirectConvolutionLayerOutputStageKernelInfo &info)
+Status CpuDirectConv2dOutputStageKernel::validate(const ITensorInfo *src, const ITensorInfo *bias, const ITensorInfo *dst,
+ const DirectConvolutionLayerOutputStageKernelInfo &info)
{
ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, bias, dst, info));
return Status{};
}
-void CpuDirectConvolutionOutputStageKernel::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info)
+void CpuDirectConv2dOutputStageKernel::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info)
{
ARM_COMPUTE_UNUSED(info);
ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
@@ -504,9 +504,9 @@ void CpuDirectConvolutionOutputStageKernel::run_op(ITensorPack &tensors, const W
(*_func)(src, bias, window, dst, _result_fixedpoint_multiplier, _result_shift, _result_offset_after_shift);
}
-const char *CpuDirectConvolutionOutputStageKernel::name() const
+const char *CpuDirectConv2dOutputStageKernel::name() const
{
- return "CpuDirectConvolutionOutputStageKernel";
+ return "CpuDirectConv2dOutputStageKernel";
}
} // namespace kernels
} // namespace cpu
diff --git a/src/core/cpu/kernels/CpuDirectConvolutionOutputStageKernel.h b/src/core/cpu/kernels/CpuDirectConv2dOutputStageKernel.h
index 9eeab194cb..62bc5d41c9 100644
--- a/src/core/cpu/kernels/CpuDirectConvolutionOutputStageKernel.h
+++ b/src/core/cpu/kernels/CpuDirectConv2dOutputStageKernel.h
@@ -21,8 +21,8 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
-#ifndef ARM_COMPUTE_CPU_DIRECTCONVOLUTION_OUTPUTSTAGE_KERNEL_H
-#define ARM_COMPUTE_CPU_DIRECTCONVOLUTION_OUTPUTSTAGE_KERNEL_H
+#ifndef ARM_COMPUTE_CPU_DIRECTCONV2D_OUTPUTSTAGE_KERNEL_H
+#define ARM_COMPUTE_CPU_DIRECTCONV2D_OUTPUTSTAGE_KERNEL_H
#include "arm_compute/core/KernelDescriptors.h"
#include "src/core/common/Macros.h"
@@ -41,33 +41,27 @@ namespace kernels
* @note For quantized computations (i.e. @p src of S32 type) the output data type for auto-initialization must be passed as part
* of the @ref DirectConvolutionLayerOutputStageKernelInfo.
*/
-class CpuDirectConvolutionOutputStageKernel : public ICpuKernel
+class CpuDirectConv2dOutputStageKernel : public ICpuKernel
{
public:
/** Default constructor */
- CpuDirectConvolutionOutputStageKernel() = default;
- ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(CpuDirectConvolutionOutputStageKernel);
+ CpuDirectConv2dOutputStageKernel() = default;
+ ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(CpuDirectConv2dOutputStageKernel);
/** Set the accumulate buffer and the biases of the kernel.
*
- * @param[in, out] src Input to add the bias to. If @p output is not specified then accumulation is done in-place.
+ * @param[in, out] src Input to add the bias to. If @p dst is not specified then accumulation is done in-place.
* Data type supported: F16/F32/S32
* @param[in] bias (Optional) The shared bias tensor to add. It must be 1D Tensor. Data type supported: Same as @p src
- * @param[out] dst (Optional) If the output tensor is specified the accumulation is done out-of-place. (Defaults to nullptr)
+ * @param[out] dst (Optional) If the dst tensor is specified the accumulation is done out-of-place. (Defaults to nullptr)
* Note that in-place computation is only supported for F16/F32. For S32 this must not be nullptr.
* Data type supported: F16/F32 or QASYMM8/QASYMM8_SIGNED if @p src is S32
* @param[in] info (Optional) DirectConvolutionLayerOutputStageKernel descriptor metadata
*/
void configure(ITensorInfo *src, const ITensorInfo *bias = nullptr, ITensorInfo *dst = nullptr,
const DirectConvolutionLayerOutputStageKernelInfo &info = DirectConvolutionLayerOutputStageKernelInfo());
- /** Static function to check if given info will lead to a valid configuration of @ref CpuDirectConvolutionOutputStageKernel
+ /** Static function to check if given info will lead to a valid configuration
*
- * @param[in] src Input to add the bias to. If @p output is not specified then accumulation is done in-place.
- * Data type supported: F16/F32/S32
- * @param[in] bias (Optional) The shared bias tensor to add. It must be 1D Tensor. Data type supported: Same as @p src
- * @param[in] dst (Optional) If the output tensor is specified the accumulation is done out-of-place. (Defaults to nullptr)
- * Note that in-place computation is only supported for F16/F32. For S32 this must not be nullptr.
- * Data type supported: F16/F32 or QASYMM8/QASYMM8_SIGNED if @p src is S32
- * @param[in] info (Optional) DirectConvolutionLayerOutputStageKernel descriptor metadata
+ * Similar to CpuDirectConv2dOutputStageKernel::configure()
*
* @return a status
*/
@@ -90,4 +84,4 @@ private:
} // namespace kernels
} // namespace cpu
} // namespace arm_compute
-#endif /*ARM_COMPUTE_CPU_DIRECTCONVOLUTION_OUTPUTSTAGE_KERNEL_H */
+#endif /*ARM_COMPUTE_CPU_DIRECTCONV2D_OUTPUTSTAGE_KERNEL_H */
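Per the doxygen above, the S32 case must run out-of-place with requantization metadata supplied through the descriptor; a hypothetical configuration (the DirectConvolutionLayerOutputStageKernelInfo field names are assumed from arm_compute/core/KernelDescriptors.h, and the values are illustrative):

using namespace arm_compute;

// Requantize an S32 accumulator tensor to QASYMM8 while adding the bias.
TensorInfo acc(TensorShape(8U, 16U, 16U), 1, DataType::S32);
TensorInfo bias(TensorShape(8U), 1, DataType::S32);
TensorInfo dst(TensorShape(8U, 16U, 16U), 1, DataType::QASYMM8);

DirectConvolutionLayerOutputStageKernelInfo os_info{};
os_info.result_fixedpoint_multiplier = 1 << 30; // 0.5 in Q0.31
os_info.result_shift                 = 1;
os_info.result_offset_after_shift    = 10;
os_info.output_data_type             = DataType::QASYMM8;

cpu::kernels::CpuDirectConv2dOutputStageKernel stage;
ARM_COMPUTE_ERROR_THROW_ON(cpu::kernels::CpuDirectConv2dOutputStageKernel::validate(&acc, &bias, &dst, os_info));
stage.configure(&acc, &bias, &dst, os_info);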
diff --git a/src/core/cpu/kernels/CpuPoolingKernel.cpp b/src/core/cpu/kernels/CpuPool2dKernel.cpp
index a55f60d7ad..e6f5890685 100644
--- a/src/core/cpu/kernels/CpuPoolingKernel.cpp
+++ b/src/core/cpu/kernels/CpuPool2dKernel.cpp
@@ -21,7 +21,7 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
-#include "src/core/cpu/kernels/CpuPoolingKernel.h"
+#include "src/core/cpu/kernels/CpuPool2dKernel.h"
#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/TensorInfo.h"
@@ -374,12 +374,12 @@ std::pair<Status, Window> validate_and_configure_window(ITensorInfo *src, ITenso
}
} // namespace
-BorderSize CpuPoolingKernel::border_size() const
+BorderSize CpuPool2dKernel::border_size() const
{
return _border_size;
}
-void CpuPoolingKernel::configure(ITensorInfo *src, ITensorInfo *dst, const PoolingLayerInfo &pool_info, ITensorInfo *indices)
+void CpuPool2dKernel::configure(ITensorInfo *src, ITensorInfo *dst, const PoolingLayerInfo &pool_info, ITensorInfo *indices)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst);
const PadStrideInfo pad_stride_info = pool_info.pad_stride_info;
@@ -420,7 +420,7 @@ void CpuPoolingKernel::configure(ITensorInfo *src, ITensorInfo *dst, const Pooli
}
}
-Status CpuPoolingKernel::validate(const ITensorInfo *src, const ITensorInfo *dst, const PoolingLayerInfo &pool_info, const ITensorInfo *indices)
+Status CpuPool2dKernel::validate(const ITensorInfo *src, const ITensorInfo *dst, const PoolingLayerInfo &pool_info, const ITensorInfo *indices)
{
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src);
@@ -446,7 +446,7 @@ Status CpuPoolingKernel::validate(const ITensorInfo *src, const ITensorInfo *dst
return Status{};
}
-void CpuPoolingKernel::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info)
+void CpuPool2dKernel::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info)
{
ARM_COMPUTE_UNUSED(info);
ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
@@ -505,9 +505,9 @@ void CpuPoolingKernel::run_op(ITensorPack &tensors, const Window &window, const
uk->ukernel(src, dst, indices, _pool_info, window_src, window);
}
-const char *CpuPoolingKernel::name() const
+const char *CpuPool2dKernel::name() const
{
- return "CpuPoolingKernel";
+ return "CpuPool2dKernel";
}
} // namespace kernels
} // namespace cpu
diff --git a/src/core/cpu/kernels/CpuPoolingKernel.h b/src/core/cpu/kernels/CpuPool2dKernel.h
index 87d8f67119..95298004e9 100644
--- a/src/core/cpu/kernels/CpuPoolingKernel.h
+++ b/src/core/cpu/kernels/CpuPool2dKernel.h
@@ -21,8 +21,8 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
-#ifndef ARM_COMPUTE_CPU_POOLING_KERNEL_H
-#define ARM_COMPUTE_CPU_POOLING_KERNEL_H
+#ifndef ARM_COMPUTE_CPU_POOL2D_KERNEL_H
+#define ARM_COMPUTE_CPU_POOL2D_KERNEL_H
#include "arm_compute/core/Types.h"
#include "src/core/common/Macros.h"
@@ -35,12 +35,12 @@ namespace cpu
namespace kernels
{
/** Interface for the pooling layer kernel */
-class CpuPoolingKernel : public ICpuKernel
+class CpuPool2dKernel : public ICpuKernel
{
public:
/** Default constructor */
- CpuPoolingKernel() = default;
- ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(CpuPoolingKernel);
+ CpuPool2dKernel() = default;
+ ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(CpuPool2dKernel);
/** Configure kernel for a given list of arguments
*
* @note F16 are supported for pool sizes 2 and 3 only
@@ -51,14 +51,9 @@ public:
* @param[out] indices (optional) The indices of the maximal values. Data type supported: U32.
*/
void configure(ITensorInfo *src, ITensorInfo *dst, const PoolingLayerInfo &pool_info, ITensorInfo *indices = nullptr);
- /** Static function to check if given info will lead to a valid configuration of @ref CpuPoolingKernel
+ /** Static function to check if given info will lead to a valid configuration
*
- * @note F16 are supported for pool sizes 2 and 3 only
- *
- * @param[in] src Source tensor info. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32.
- * @param[in] dst Destination tensor info. Data types supported: Same as @p src.
- * @param[in] pool_info Contains pooling operation information described in @ref PoolingLayerInfo.
- * @param[in] indices (optional) The indices of the maximal values. Data type supported: U32.
+ * Similar to CpuPool2dKernel::configure()
*
* @return a status
*/
@@ -80,4 +75,4 @@ private:
} // namespace kernels
} // namespace cpu
} // namespace arm_compute
-#endif /*ARM_COMPUTE_CPU_POOLING_KERNEL_H */
+#endif /*ARM_COMPUTE_CPU_POOL2D_KERNEL_H */
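As a usage sketch for the renamed pooling kernel; the PoolingLayerInfo constructor form, shapes and layout are assumptions for illustration:

using namespace arm_compute;

// 2x2 max pooling with stride 2 over an NHWC F32 tensor; dst is
// auto-initialised by configure() and indices are not requested.
TensorInfo src(TensorShape(16U, 32U, 32U), 1, DataType::F32);
TensorInfo dst{};
src.set_data_layout(DataLayout::NHWC);

const PoolingLayerInfo pool_info(PoolingType::MAX, 2, DataLayout::NHWC, PadStrideInfo(2, 2, 0, 0));

cpu::kernels::CpuPool2dKernel pool;
ARM_COMPUTE_ERROR_THROW_ON(cpu::kernels::CpuPool2dKernel::validate(&src, &dst, pool_info));
pool.configure(&src, &dst, pool_info);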
diff --git a/src/core/cpu/kernels/CpuPoolingAssemblyWrapperKernel.cpp b/src/core/cpu/kernels/internal/CpuPool2dAssemblyWrapperKernel.cpp
index ccf73883f0..c78ffb9848 100644
--- a/src/core/cpu/kernels/CpuPoolingAssemblyWrapperKernel.cpp
+++ b/src/core/cpu/kernels/internal/CpuPool2dAssemblyWrapperKernel.cpp
@@ -21,7 +21,7 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
-#include "src/core/cpu/kernels/CpuPoolingAssemblyWrapperKernel.h"
+#include "src/core/cpu/kernels/internal/CpuPool2dAssemblyWrapperKernel.h"
#include "arm_compute/core/Utils.h"
#include "arm_compute/core/Validate.h"
#include "arm_compute/core/utils/misc/ShapeCalculator.h"
@@ -41,7 +41,7 @@ namespace kernels
{
using namespace arm_compute::misc::shape_calculator;
-void CpuPoolingAssemblyWrapperKernel::configure(const ITensorInfo *src, ITensorInfo *dst, const PoolingLayerInfo &info, const CPUInfo &cpu_info)
+void CpuPool2dAssemblyWrapperKernel::configure(const ITensorInfo *src, ITensorInfo *dst, const PoolingLayerInfo &info, const CPUInfo &cpu_info)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst);
@@ -88,7 +88,7 @@ void CpuPoolingAssemblyWrapperKernel::configure(const ITensorInfo *src, ITensorI
INEKernel::configure(win);
}
-Status CpuPoolingAssemblyWrapperKernel::validate(const ITensorInfo *src, const ITensorInfo *dst, const PoolingLayerInfo &info)
+Status CpuPool2dAssemblyWrapperKernel::validate(const ITensorInfo *src, const ITensorInfo *dst, const PoolingLayerInfo &info)
{
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst);
@@ -136,7 +136,7 @@ Status CpuPoolingAssemblyWrapperKernel::validate(const ITensorInfo *src, const I
return Status{};
}
-void CpuPoolingAssemblyWrapperKernel::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info)
+void CpuPool2dAssemblyWrapperKernel::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(_kernel_asm.get());
ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
@@ -170,18 +170,18 @@ void CpuPoolingAssemblyWrapperKernel::run_op(ITensorPack &tensors, const Window
working_space, info.thread_id, info.num_threads);
}
-size_t CpuPoolingAssemblyWrapperKernel::get_working_size(unsigned int num_threads) const
+size_t CpuPool2dAssemblyWrapperKernel::get_working_size(unsigned int num_threads) const
{
return _kernel_asm->get_working_size(num_threads);
}
-bool CpuPoolingAssemblyWrapperKernel::is_configured() const
+bool CpuPool2dAssemblyWrapperKernel::is_configured() const
{
return _kernel_asm != nullptr;
}
template <typename Typesrc, typename Typedst>
-void CpuPoolingAssemblyWrapperKernel::create_arm_pooling(const ITensorInfo *src, ITensorInfo *dst, const PoolingLayerInfo &info, const CPUInfo &cpu_info)
+void CpuPool2dAssemblyWrapperKernel::create_arm_pooling(const ITensorInfo *src, ITensorInfo *dst, const PoolingLayerInfo &info, const CPUInfo &cpu_info)
{
const arm_conv::pooling::PoolingType pool_type = (info.pool_type == PoolingType::AVG) ? arm_conv::pooling::PoolingType::AVERAGE : arm_conv::pooling::PoolingType::MAX;
@@ -220,7 +220,7 @@ void CpuPoolingAssemblyWrapperKernel::create_arm_pooling(const ITensorInfo *src,
}
template <typename Typesrc, typename Typedst>
-void CpuPoolingAssemblyWrapperKernel::create_arm_pooling_requant(const ITensorInfo *src, ITensorInfo *dst, const PoolingLayerInfo &info, const CPUInfo &cpu_info)
+void CpuPool2dAssemblyWrapperKernel::create_arm_pooling_requant(const ITensorInfo *src, ITensorInfo *dst, const PoolingLayerInfo &info, const CPUInfo &cpu_info)
{
const arm_conv::pooling::PoolingType pool_type = (info.pool_type == PoolingType::AVG) ? arm_conv::pooling::PoolingType::AVERAGE : arm_conv::pooling::PoolingType::MAX;
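The two factory templates above differ in whether a requantization step is fused into the assembly kernel; the choice between them presumably hinges on the src/dst quantization infos, along the lines of this reconstructed fragment (condition assumed, not shown in these hunks):

// Inside configure(), for a QASYMM8 workload (sketch):
if(src->quantization_info() != dst->quantization_info())
{
    create_arm_pooling_requant<uint8_t, uint8_t>(src, dst, info, cpu_info);
}
else
{
    create_arm_pooling<uint8_t, uint8_t>(src, dst, info, cpu_info);
}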
diff --git a/src/core/cpu/kernels/CpuPoolingAssemblyWrapperKernel.h b/src/core/cpu/kernels/internal/CpuPool2dAssemblyWrapperKernel.h
index 34ec452deb..3afa4c16a4 100644
--- a/src/core/cpu/kernels/CpuPoolingAssemblyWrapperKernel.h
+++ b/src/core/cpu/kernels/internal/CpuPool2dAssemblyWrapperKernel.h
@@ -21,8 +21,8 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
-#ifndef ARM_COMPUTE_CPU_POOLING_ASSEMBLY_WRAPPER_KERNEL_H
-#define ARM_COMPUTE_CPU_POOLING_ASSEMBLY_WRAPPER_KERNEL_H
+#ifndef ARM_COMPUTE_CPU_POOL2D_ASSEMBLY_WRAPPER_KERNEL_H
+#define ARM_COMPUTE_CPU_POOL2D_ASSEMBLY_WRAPPER_KERNEL_H
#include "arm_compute/core/Types.h"
#include "src/core/NEON/kernels/assembly/pooling.hpp"
@@ -41,23 +41,21 @@ namespace kernels
*
* Some kernels were written in assembly and highly optimised for specific
* CPUs like A53 or A55. The arm compute library creates an instance of
- * CpuPoolingAssemblyWrapperKernel and other auxiliary data structures to
+ * CpuPool2dAssemblyWrapperKernel and other auxiliary data structures to
* execute a single assembly kernel in the context of an NEFunction.
*
*/
-class CpuPoolingAssemblyWrapperKernel final : public ICpuKernel
+class CpuPool2dAssemblyWrapperKernel final : public ICpuKernel
{
public:
/** Constructor
*/
- CpuPoolingAssemblyWrapperKernel() = default;
- CpuPoolingAssemblyWrapperKernel(CpuPoolingAssemblyWrapperKernel &) = delete;
- CpuPoolingAssemblyWrapperKernel(CpuPoolingAssemblyWrapperKernel &&) = default;
- CpuPoolingAssemblyWrapperKernel &operator=(CpuPoolingAssemblyWrapperKernel &) = delete;
+ CpuPool2dAssemblyWrapperKernel() = default;
+ ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(CpuPool2dAssemblyWrapperKernel);
const char *name() const override
{
- return "CpuPoolingAssemblyWrapperKernel";
+ return "CpuPool2dAssemblyWrapperKernel";
}
/** Initialise the kernel's src and dst.
@@ -69,13 +67,11 @@ public:
*/
void configure(const ITensorInfo *src, ITensorInfo *dst, const PoolingLayerInfo &info, const CPUInfo &cpu_info);
- /** Indicates whether or not this function can be used to process the given parameters.
+ /** Static function to check if given info will lead to a valid configuration
*
- * @param[in] src Source tensor info. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32.
- * @param[in] dst Destination tensor to store the result of pooling. Data types supported: same as @p src.
- * @param[in] info Pooling meta-data
+ * Similar to CpuPool2dAssemblyWrapperKernel::configure()
*
- * @return a status.
+ * @return a status
*/
static Status validate(const ITensorInfo *src, const ITensorInfo *dst, const PoolingLayerInfo &info);
@@ -120,4 +116,4 @@ private:
} // namespace kernels
} // namespace cpu
} // namespace arm_compute
-#endif /* ARM_COMPUTE_CPU_POOLING_ASSEMBLY_WRAPPER_KERNEL_H */
+#endif /* ARM_COMPUTE_CPU_POOL2D_ASSEMBLY_WRAPPER_KERNEL_H */
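Unlike the kernels above, the assembly wrapper also exposes a per-thread scratch buffer through get_working_size(); a sketch of the full drive pattern (the ITensorPack slot ids and the scheduler call are assumptions about how such internal kernels are typically driven, not part of this patch):

using namespace arm_compute;

// src/dst are Tensors already allocated to match src_info/dst_info
// (set-up elided for brevity).
cpu::kernels::CpuPool2dAssemblyWrapperKernel asm_pool;
asm_pool.configure(&src_info, &dst_info, pool_info, NEScheduler::get().cpu_info());

// Size the scratch workspace for the number of worker threads.
const size_t ws_size = asm_pool.get_working_size(NEScheduler::get().num_threads());
Tensor workspace;
workspace.allocator()->init(TensorInfo(TensorShape(ws_size), 1, DataType::U8));
workspace.allocator()->allocate();

ITensorPack pack;
pack.add_tensor(TensorType::ACL_SRC, &src);
pack.add_tensor(TensorType::ACL_DST, &dst);
pack.add_tensor(TensorType::ACL_INT_0, &workspace); // scratch slot id assumed

NEScheduler::get().schedule_op(&asm_pool, Window::DimY, asm_pool.window(), pack);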