From 7784c837afd5844fb6dc4d166ff253d983abfd2d Mon Sep 17 00:00:00 2001
From: Abe Mbise <abe.mbise@arm.com>
Date: Thu, 31 May 2018 16:48:41 +0100
Subject: COMPMID-1167: Validation for NEDepthwiseConvolutionLayer

Change-Id: I9689e1a0627dc015dd2ce98417e4c97bb55581bb
Reviewed-on: https://eu-gerrit-1.euhpc.arm.com/131327
Reviewed-by: Anthony Barbier <anthony.barbier@arm.com>
Tested-by: Jenkins <bsgcomp@arm.com>
---
 .../kernels/NEDepthwiseConvolutionLayer3x3Kernel.h |  21 ++-
 .../core/NEON/kernels/NEDepthwiseIm2ColKernel.h    |  15 ++
 .../NEON/kernels/NEDepthwiseVectorToTensorKernel.h |   8 +
 .../NEON/kernels/NEDepthwiseWeightsReshapeKernel.h |  10 +
 .../kernels/NEGEMMMatrixVectorMultiplyKernel.h     |  10 +
 arm_compute/core/utils/misc/ShapeCalculator.h      |  13 ++
 .../NEON/functions/NEDepthwiseConvolutionLayer.h   |  28 +++
 .../CL/kernels/CLDepthwiseVectorToTensorKernel.cpp |  20 +-
 .../NEDepthwiseConvolutionLayer3x3Kernel.cpp       | 204 +++++++++++++--------
 src/core/NEON/kernels/NEDepthwiseIm2ColKernel.cpp  |  30 ++-
 .../kernels/NEDepthwiseVectorToTensorKernel.cpp    |  38 +++-
 .../kernels/NEDepthwiseWeightsReshapeKernel.cpp    |  38 ++--
 .../kernels/NEGEMMMatrixVectorMultiplyKernel.cpp   |  64 +++++--
 .../NEON/functions/NEDepthwiseConvolutionLayer.cpp |  62 +++++++
 .../validation/NEON/DepthwiseConvolutionLayer.cpp  | 177 +++++++++++++-----
 15 files changed, 558 insertions(+), 180 deletions(-)

diff --git a/arm_compute/core/NEON/kernels/NEDepthwiseConvolutionLayer3x3Kernel.h b/arm_compute/core/NEON/kernels/NEDepthwiseConvolutionLayer3x3Kernel.h
index bd9e7eb781..3ffafd858f 100644
--- a/arm_compute/core/NEON/kernels/NEDepthwiseConvolutionLayer3x3Kernel.h
+++ b/arm_compute/core/NEON/kernels/NEDepthwiseConvolutionLayer3x3Kernel.h
@@ -52,9 +52,11 @@ public:
     /** Default move assignment operator */
     NEDepthwiseConvolutionLayer3x3Kernel &operator=(NEDepthwiseConvolutionLayer3x3Kernel &&) = default;
     /** Initialize the function's source, destination, conv and border_size.
+     *
+     * @note Supported data layouts: NCHW and NHWC
      *
      * @param[in]  input            Source tensor. DataType supported: QASYMM8, F32.
-     * @param[in]  weights          Weights tensor. This is a 3D tensor with dimensions [3, 3, IFM]. Data type supported: Same as @p input.
+     * @param[in]  weights          Weights tensor. This is a 3D tensor with dimensions [3, 3, IFM] for NCHW or [IFM, 3, 3] if NHWC data layout. Data type supported: Same as @p input.
      * @param[out] output           Destination tensor. Data type supported: Same as @p input.
      * @param[in]  conv_info        Padding and stride information to use for the convolution.
      * @param[in]  depth_multiplier (Optional) Multiplier to apply to the input's depth in order to retrieve the output's depth. Defaults to 1.
@@ -66,8 +68,8 @@ public:
      * @param[in] input_shape      Input shape
      * @param[in] conv_info        Padding and stride information to use for the convolution.
      * @param[in] dt               Data type of the input and weights
-     * @param[in] data_layout      (Optional) Data layout of the input and weights tensor
      * @param[in] depth_multiplier (Optional) Multiplier to apply to the input's depth in order to retrieve the output's depth. Defaults to 1.
+     * @param[in] data_layout      (Optional) Data layout of the input and weights tensor
      *
      * @return True if the optimized kernels can be executed else false
      */
@@ -75,6 +77,20 @@ public:
     /** Generates the convolver object */
     void generate_convolver();
 
+    /** Static function to check if given info will lead to a valid configuration of @ref NEDepthwiseConvolutionLayer3x3Kernel
+     *
+     * @note Supported data layouts: NCHW and NHWC
+     *
+     * @param[in] input            Source tensor. DataType supported: QASYMM8, F32.
+     * @param[in] weights          Weights tensor. This is a 3D tensor with dimensions [3, 3, IFM] for NCHW or [IFM, 3, 3] if NHWC data layout. Data type supported: Same as @p input.
+     * @param[in] output           Destination tensor. Data type supported: Same as @p input.
+     * @param[in] conv_info        Padding and stride information to use for the convolution.
+     * @param[in] depth_multiplier (Optional) Multiplier to apply to the input's depth in order to retrieve the output's depth. Defaults to 1.
+     *
+     * @return a status
+     */
+    static Status validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *output, const PadStrideInfo &conv_info, unsigned int depth_multiplier = 1);
+
     // Inherited methods overridden:
     void run(const Window &window, const ThreadInfo &info) override;
     BorderSize border_size() const override;
@@ -82,6 +98,7 @@ public:
 private:
     void configure_generic();
     void configure_optimized();
+
     void run_generic(const Window &window, const ThreadInfo &info);
     void run_optimized(const Window &window, const ThreadInfo &info);
     /** Creates an optimized backend convolver object
diff --git a/arm_compute/core/NEON/kernels/NEDepthwiseIm2ColKernel.h b/arm_compute/core/NEON/kernels/NEDepthwiseIm2ColKernel.h
index 9c11cfa425..0d61d3ea38 100644
--- a/arm_compute/core/NEON/kernels/NEDepthwiseIm2ColKernel.h
+++ b/arm_compute/core/NEON/kernels/NEDepthwiseIm2ColKernel.h
@@ -65,6 +65,21 @@ public:
      */
     void configure(const ITensor *input, ITensor *output, const Size2D &kernel_dims, const PadStrideInfo &conv_info, bool has_bias = false, unsigned int depth_multiplier = 1);
 
+    /** Static function to check if given info will lead to a valid configuration of @ref NEDepthwiseIm2ColKernel
+     *
+     * @param[in] input            The input tensor to convert. 3 lower dimensions represent a single input [width, height, IFM],
+     *                             while every optional dimension from 4 and above represent a batch of inputs. Data types supported: QASYMM8, F32
+     * @param[in] output           The output tensor. First 3 lower dimensions represent a transform of each 3D input,
+     *                             while every dimension above 3 represents a batch. Data types supported: Same as @p input
+     * @param[in] kernel_dims      The kernel dimensions (width and height).
+     * @param[in] conv_info        Contains padding and stride information described in @ref PadStrideInfo.
+     * @param[in] has_bias         Boolean that specifies if the depthwise convolution has bias.
+     * @param[in] depth_multiplier (Optional) Multiplier to apply to the input's depth in order to retrieve the output's depth. Defaults to 1.
+     *
+     * @return a status
+     */
+    static Status validate(const ITensorInfo *input, const ITensorInfo *output, const Size2D &kernel_dims, const PadStrideInfo &conv_info, bool has_bias = false, unsigned int depth_multiplier = 1);
+
     // Inherited methods overridden:
     void run(const Window &window, const ThreadInfo &info) override;
 
diff --git a/arm_compute/core/NEON/kernels/NEDepthwiseVectorToTensorKernel.h b/arm_compute/core/NEON/kernels/NEDepthwiseVectorToTensorKernel.h
index 458cbd7812..00977a91b4 100644
--- a/arm_compute/core/NEON/kernels/NEDepthwiseVectorToTensorKernel.h
+++ b/arm_compute/core/NEON/kernels/NEDepthwiseVectorToTensorKernel.h
@@ -62,6 +62,14 @@ public:
      * @param[in]  conv_h The converted tensor's height.
      */
     void configure(const ITensor *input, ITensor *output, size_t conv_w, size_t conv_h);
+    /** Static function to check if given info will lead to a valid configuration of @ref NEDepthwiseVectorToTensorKernel
+     *
+     * @param[in] input  The input vector to convert. Data type supported: QASYMM8/S32/F32.
+     * @param[in] output The output tensor. 3 lower dimensions represent a single input [width, height, IFM]. Data type supported: same as @p input.
+     * @param[in] conv_w The converted tensor's width.
+     * @param[in] conv_h The converted tensor's height.
+     */
+    static Status validate(const ITensorInfo *input, const ITensorInfo *output, size_t conv_w, size_t conv_h);
 
     // Inherited methods overridden:
     void run(const Window &window, const ThreadInfo &info) override;
diff --git a/arm_compute/core/NEON/kernels/NEDepthwiseWeightsReshapeKernel.h b/arm_compute/core/NEON/kernels/NEDepthwiseWeightsReshapeKernel.h
index d00e8a46ed..b78684f993 100644
--- a/arm_compute/core/NEON/kernels/NEDepthwiseWeightsReshapeKernel.h
+++ b/arm_compute/core/NEON/kernels/NEDepthwiseWeightsReshapeKernel.h
@@ -59,6 +59,16 @@ public:
      */
     void configure(const ITensor *input, ITensor *output, const ITensor *biases);
 
+    /** Static function to check if given info will lead to a valid configuration of @ref NEDepthwiseWeightsReshapeKernel
+     *
+     * @param[in] input  The input tensor to convert. 3 lower dimensions represent a single input [width, height, IFM]. Data type supported: QASYMM8, F32.
+     * @param[in] output The output tensor. Data type supported: same as @p input.
+     * @param[in] biases (Optional) The input biases to add. Shape [IFM]. Data type supported: same as @p input.
+     *
+     * @return a status
+     */
+    static Status validate(const ITensorInfo *input, const ITensorInfo *output, const ITensorInfo *biases);
+
     // Inherited methods overridden:
     void run(const Window &window, const ThreadInfo &info) override;
 
diff --git a/arm_compute/core/NEON/kernels/NEGEMMMatrixVectorMultiplyKernel.h b/arm_compute/core/NEON/kernels/NEGEMMMatrixVectorMultiplyKernel.h
index a05d591850..7dddaca3a0 100644
--- a/arm_compute/core/NEON/kernels/NEGEMMMatrixVectorMultiplyKernel.h
+++ b/arm_compute/core/NEON/kernels/NEGEMMMatrixVectorMultiplyKernel.h
@@ -56,6 +56,16 @@ public:
      */
     void configure(const ITensor *input0, const ITensor *input1, ITensor *output);
 
+    /** Static function to check if given info will lead to a valid configuration of @ref NEGEMMMatrixVectorMultiplyKernel
+     *
+     * @param[in] input0 First Input tensor. Data types supported: QASYMM8/F32
+     * @param[in] input1 Second Input tensor. Data types supported: same as @p input0.
+     * @param[in] output Output tensor which stores the interleaved matrix. Data type supported: same as @p input0, S32 for QASYMM8 input.
+     *
+     * @return a status
+     */
+    static Status validate(const ITensorInfo *input0, const ITensorInfo *input1, const ITensorInfo *output);
+
     // Inherited methods overridden:
     void run(const Window &window, const ThreadInfo &info) override;
     BorderSize border_size() const override;
diff --git a/arm_compute/core/utils/misc/ShapeCalculator.h b/arm_compute/core/utils/misc/ShapeCalculator.h
index 221387649f..9bf6b046b4 100644
--- a/arm_compute/core/utils/misc/ShapeCalculator.h
+++ b/arm_compute/core/utils/misc/ShapeCalculator.h
@@ -36,6 +36,19 @@ namespace misc
 {
 namespace shape_calculator
 {
+inline TensorShape compute_vector_to_tensor_output_shape(const TensorShape &input, size_t conv_w, size_t conv_h, const DataLayout &data_layout)
+{
+    const size_t idx_w = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
+    const size_t idx_h = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
+    const size_t idx_c = get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL);
+
+    TensorShape output_shape(input);
+    output_shape.set(idx_w, conv_w);
+    output_shape.set(idx_h, conv_h);
+    output_shape.set(idx_c, input.x() / (conv_w * conv_h));
+
+    return output_shape;
+}
 inline TensorShape compute_permutation_output_shape(const ITensorInfo &input, const PermutationVector &perm)
 {
     TensorShape output_shape = input.tensor_shape();
diff --git a/arm_compute/runtime/NEON/functions/NEDepthwiseConvolutionLayer.h b/arm_compute/runtime/NEON/functions/NEDepthwiseConvolutionLayer.h
index aa4cace7c2..1317fb740e 100644
--- a/arm_compute/runtime/NEON/functions/NEDepthwiseConvolutionLayer.h
+++ b/arm_compute/runtime/NEON/functions/NEDepthwiseConvolutionLayer.h
@@ -65,6 +65,20 @@ public:
      */
     void configure(ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, const PadStrideInfo &conv_info, unsigned int depth_multiplier = 1);
 
+    /** Static function to check if given info will lead to a valid configuration of @ref NEDepthwiseConvolutionLayer3x3
+     *
+     * @param[in] input            Source tensor. Data type supported: QASYMM8/F32. (Written to only for border filling).
+     * @param[in] weights          Weights tensor. These are 3D tensors with shape [3, 3, IFM]. Data type supported: Same as @p input.
+     * @param[in] biases           (Optional) Biases tensor. A 1D tensor with shape [IFM]. Must be nullptr if not needed.
+     *                             Data type supported: Same as @p input.
+     * @param[in] output           Destination tensor. Data type supported: same as @p input.
+     * @param[in] conv_info        Padding and stride information to use for the convolution.
+     * @param[in] depth_multiplier (Optional) Multiplier to apply to the input's depth in order to retrieve the output's depth. Defaults to 1.
+     *
+     * @return a status
+     */
+    static Status validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info, unsigned int depth_multiplier = 1);
+
     // Inherited methods overriden:
     void run() override;
 
@@ -120,6 +134,20 @@ public:
      */
     void configure(ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, const PadStrideInfo &conv_info, unsigned int depth_multiplier = 1);
 
+    /** Static function to check if given info will lead to a valid configuration of @ref NEDepthwiseConvolutionLayer
+     *
+     * @param[in] input            Source tensor. Data type supported: QASYMM8/F32. (Written to only for border filling).
+     * @param[in] output           Destination tensor. Data type supported: same as @p input.
+     * @param[in] weights          Weights tensor. These are 3D tensors with shape [kernel_x, kernel_y, IFM]. Data type supported: Same as @p input.
+     * @param[in] biases           (Optional) Biases tensor. A 1D tensor with shape [IFM]. Must be nullptr if not needed.
+     *                             Data type supported: Same as @p input, S32 when input is QASYMM8.
+     * @param[in] conv_info        Padding and stride information to use for the convolution.
+     * @param[in] depth_multiplier (Optional) Multiplier to apply to the input's depth in order to retrieve the output's depth. Defaults to 1.
+     *
+     * @return a status
+     */
+    static Status validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info, unsigned int depth_multiplier = 1);
+
     // Inherited methods overriden:
     void run() override;
     void prepare() override;
diff --git a/src/core/CL/kernels/CLDepthwiseVectorToTensorKernel.cpp b/src/core/CL/kernels/CLDepthwiseVectorToTensorKernel.cpp
index e124ee42f3..67b2cc9f55 100644
--- a/src/core/CL/kernels/CLDepthwiseVectorToTensorKernel.cpp
+++ b/src/core/CL/kernels/CLDepthwiseVectorToTensorKernel.cpp
@@ -31,26 +31,14 @@
 #include "arm_compute/core/Error.h"
 #include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/Types.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
 #include "support/ToolchainSupport.h"
 
 using namespace arm_compute;
+using namespace arm_compute::misc::shape_calculator;
 
 namespace
 {
-TensorShape compute_output_shape(const TensorShape &input, size_t conv_w, size_t conv_h, const DataLayout &data_layout)
-{
-    const size_t idx_w = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
-    const size_t idx_h = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
-    const size_t idx_c = get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL);
-
-    TensorShape output_shape(input);
-    output_shape.set(idx_w, conv_w);
-    output_shape.set(idx_h, conv_h);
-    output_shape.set(idx_c, input.x() / (conv_w * conv_h));
-
-    return output_shape;
-}
-
 Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, size_t conv_w, size_t conv_h)
 {
     ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(input);
@@ -58,7 +46,7 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, s
 
     if(output->total_size() != 0)
     {
-        TensorShape output_shape = compute_output_shape(input->tensor_shape(), conv_w, conv_h, output->data_layout());
+        TensorShape output_shape = compute_vector_to_tensor_output_shape(input->tensor_shape(), conv_w, conv_h, output->data_layout());
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output->tensor_shape(), output_shape);
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
     }
@@ -77,7 +65,7 @@ void CLDepthwiseVectorToTensorKernel::configure(const ICLTensor *input, ICLTenso
     ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
 
     // Output auto inizialitation if not yet initialized
-    TensorShape output_shape = compute_output_shape(input->info()->tensor_shape(), conv_w, conv_h, output->info()->data_layout());
+    TensorShape output_shape = compute_vector_to_tensor_output_shape(input->info()->tensor_shape(), conv_w, conv_h, output->info()->data_layout());
     auto_init_if_empty(*output->info(), input->info()->clone()->set_tensor_shape(output_shape));
 
     ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), conv_w, conv_h));
diff --git a/src/core/NEON/kernels/NEDepthwiseConvolutionLayer3x3Kernel.cpp b/src/core/NEON/kernels/NEDepthwiseConvolutionLayer3x3Kernel.cpp
index 09728e2a8d..62dabc8d32 100644
--- a/src/core/NEON/kernels/NEDepthwiseConvolutionLayer3x3Kernel.cpp
+++ b/src/core/NEON/kernels/NEDepthwiseConvolutionLayer3x3Kernel.cpp
@@ -144,6 +144,112 @@ inline void convolve_3x3(const Window &window, unsigned int num_elems_written_pe
             ARM_COMPUTE_ERROR("Not implemented");
     }
 }
+
+Status validate_arguments(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *output, const PadStrideInfo &conv_info, unsigned int depth_multiplier, bool is_optimized)
+{
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::F32);
+    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights);
+
+    if(is_optimized)
+    {
+        ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(1) != 3 || weights->dimension(2) != 3);
+    }
+    else
+    {
+        ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(0) != 3 || weights->dimension(1) != 3);
+        ARM_COMPUTE_RETURN_ERROR_ON(conv_info.stride().first < 1 || conv_info.stride().first > 3);
+    }
+
+    if(output->total_size() != 0)
+    {
+        const TensorShape output_shape = compute_depthwise_convolution_shape(*input, *weights, conv_info, depth_multiplier);
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output->tensor_shape(), output_shape);
+
+        ARM_COMPUTE_RETURN_ERROR_ON(is_data_type_quantized_asymmetric(input->data_type()) && (output->data_type() != DataType::S32));
+        ARM_COMPUTE_RETURN_ERROR_ON(is_data_type_float(input->data_type()) && (output->data_type() != DataType::F32));
+    }
+
+    return Status{};
+}
+
+std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *weights, ITensorInfo *output, const PadStrideInfo &conv_info, unsigned int depth_multiplier, bool is_optimized,
+                                                        IDepthwiseConvolution *convolver = nullptr)
+{
+    Window win;
+    bool   window_changed = false;
+
+    if(is_optimized)
+    {
+        if(convolver != nullptr)
+        {
+            auto win_last = convolver->get_window();
+            win.set(Window::DimX, Window::Dimension(0, win_last, 1));
+
+            // Auto-configure output
+            bool        same_padding = conv_info.has_padding();
+            TensorShape output_shape{ input->tensor_shape() };
+
+            output_shape.set(1, convolver->output_size(output_shape.y(), same_padding)); // Set width
+            output_shape.set(2, convolver->output_size(output_shape.z(), same_padding)); // Set height
+
+            // Output auto initialization if not yet initialized
+            auto_init_if_empty(*output, input->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(output_shape));
+
+            // Configure window (optimised)
+            // Set padding in channels
+            const int num_channels = weights->dimension(0);
+            if((num_channels >= 128) && (num_channels % 16 == 0))
+            {
+                input->extend_padding(PaddingSize(0, 4, 0, 0));
+                weights->extend_padding(PaddingSize(0, 4, 0, 0));
+                output->extend_padding(PaddingSize(0, 4, 0, 0));
+            }
+        }
+    }
+    else
+    {
+        // Get convolved dimensions
+        const TensorShape output_shape = compute_depthwise_convolution_shape(*input, *weights, conv_info, depth_multiplier);
+        const DataType    output_dt    = (input->data_type() == DataType::QASYMM8) ? DataType::S32 : input->data_type();
+
+        // Output auto initialization if not yet initialized
+        auto_init_if_empty(*output, input->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(output_shape).set_data_type(output_dt));
+
+        // Configure kernel window (generic)
+        const unsigned int conv_stride_x = conv_info.stride().first;
+        const unsigned int conv_stride_y = conv_info.stride().second;
+        const unsigned int conv_pad_top  = conv_info.pad_top();
+        const unsigned int conv_pad_left = conv_info.pad_left();
+
+        unsigned int num_elems_written_per_iteration = 16 >> conv_stride_x;
+        unsigned int num_elems_read_per_iteration    = 0;
+
+        switch(input->data_type())
+        {
+            case DataType::QASYMM8:
+                num_elems_read_per_iteration = 16;
+                break;
+            case DataType::F32:
+                num_elems_read_per_iteration = 12;
+                break;
+            default:
+                ARM_COMPUTE_ERROR("Data type not supported.");
+        }
+
+        // Configure kernel window
+        win = calculate_max_window(*output, Steps(num_elems_written_per_iteration));
+
+        AccessWindowRectangle  input_access(input, -conv_pad_left, -conv_pad_top, num_elems_read_per_iteration, 3, conv_stride_x, conv_stride_y);
+        AccessWindowStatic     weights_access(weights, 0, 0, 3, 3);
+        AccessWindowHorizontal output_access(output, 0, num_elems_written_per_iteration);
+
+        window_changed = update_window_and_padding(win, input_access, weights_access, output_access);
+        output_access.set_valid_region(win, ValidRegion(Coordinates(), output->tensor_shape()));
+    }
+
+    Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
+    return std::make_pair(err, win);
+}
 } // namespace
 
 NEDepthwiseConvolutionLayer3x3Kernel::NEDepthwiseConvolutionLayer3x3Kernel()
@@ -159,8 +265,7 @@ BorderSize NEDepthwiseConvolutionLayer3x3Kernel::border_size() const
 void NEDepthwiseConvolutionLayer3x3Kernel::configure(const ITensor *input, const ITensor *weights, ITensor *output, const PadStrideInfo &conv_info, unsigned int depth_multiplier,
                                                      DataLayout data_layout)
 {
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::F32);
-    ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights);
+    ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output);
 
     _input            = input;
     _output           = output;
@@ -177,6 +282,17 @@ void NEDepthwiseConvolutionLayer3x3Kernel::configure(const ITensor *input, const
     (_run_optimized) ? configure_optimized() : configure_generic();
 }
 
+Status NEDepthwiseConvolutionLayer3x3Kernel::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *output, const PadStrideInfo &conv_info, unsigned int depth_multiplier)
+{
+    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weights, output);
+
+    bool is_optimized = NEDepthwiseConvolutionLayer3x3Kernel::is_optimized_execution_possible(input->tensor_shape(), conv_info, input->data_type(), depth_multiplier, input->data_layout());
+
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, weights, output, conv_info, depth_multiplier, is_optimized));
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(), weights->clone().get(), output->clone().get(), conv_info, depth_multiplier, is_optimized).first);
+    return Status{};
+}
+
 void NEDepthwiseConvolutionLayer3x3Kernel::run(const Window &window, const ThreadInfo &info)
 {
     ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
@@ -227,90 +343,26 @@ void NEDepthwiseConvolutionLayer3x3Kernel::generate_convolver()
 
 void NEDepthwiseConvolutionLayer3x3Kernel::configure_generic()
 {
-    ARM_COMPUTE_ERROR_ON(_weights->info()->dimension(0) != 3 || _weights->info()->dimension(1) != 3);
-
-    // Get convolved dimensions
-    const TensorShape output_shape = compute_depthwise_convolution_shape(*_input->info(), *_weights->info(), _conv_info, _depth_multiplier);
-    const DataType    output_dt    = (_input->info()->data_type() == DataType::QASYMM8) ? DataType::S32 : _input->info()->data_type();
-
-    // Output auto inizialitation if not yet initialized
-    auto_init_if_empty(*_output->info(),
-                       _input->info()->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(output_shape).set_data_type(output_dt));
-
-    ARM_COMPUTE_ERROR_ON_MISMATCHING_DIMENSIONS(_output->info()->tensor_shape(), output_shape);
-
-    const unsigned int conv_stride_x   = _conv_info.stride().first;
-    const unsigned int conv_stride_y   = _conv_info.stride().second;
-    const unsigned int conv_pad_top    = _conv_info.pad_top();
-    const unsigned int conv_pad_right  = _conv_info.pad_right();
-    const unsigned int conv_pad_bottom = _conv_info.pad_bottom();
-    const unsigned int conv_pad_left   = _conv_info.pad_left();
-
-    ARM_COMPUTE_ERROR_ON(conv_stride_x < 1 || conv_stride_x > 3);
-
-    unsigned int num_elems_read_per_iteration = 0;
-    switch(_input->info()->data_type())
-    {
-        case DataType::QASYMM8:
-            num_elems_read_per_iteration     = 16;
-            _num_elems_written_per_iteration = 16 >> conv_stride_x;
-            break;
-        case DataType::F32:
-            num_elems_read_per_iteration     = 12;
-            _num_elems_written_per_iteration = 16 >> conv_stride_x;
-            break;
-        default:
-            ARM_COMPUTE_ERROR("Data type not supported.");
-    }
-    _border_size = BorderSize(conv_pad_top, conv_pad_right, conv_pad_bottom, conv_pad_left);
-
-    // Configure kernel window
-    Window win = calculate_max_window(*_output->info(), Steps(_num_elems_written_per_iteration));
-
-    AccessWindowRectangle input_access(_input->info(), -conv_pad_left, -conv_pad_top,
-                                       num_elems_read_per_iteration, 3,
-                                       conv_stride_x, conv_stride_y);
-    AccessWindowStatic     weights_access(_weights->info(), 0, 0, 3, 3);
-    AccessWindowHorizontal output_access(_output->info(), 0, _num_elems_written_per_iteration);
+    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(_input->info(), _weights->info(), _output->info(), _conv_info, _depth_multiplier, _run_optimized));
 
-    update_window_and_padding(win, input_access, weights_access, output_access);
-    output_access.set_valid_region(win, ValidRegion(Coordinates(), _output->info()->tensor_shape()));
+    _num_elems_written_per_iteration = 16 >> _conv_info.stride().first;
+    _border_size                     = BorderSize(_conv_info.pad_top(), _conv_info.pad_right(), _conv_info.pad_bottom(), _conv_info.pad_left());
 
-    INEKernel::configure(win);
+    auto win_config = validate_and_configure_window(_input->info(), _weights->info(), _output->info(), _conv_info, _depth_multiplier, false);
+    ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
+    INEKernel::configure(win_config.second);
 }
 
 void NEDepthwiseConvolutionLayer3x3Kernel::configure_optimized()
 {
-    ARM_COMPUTE_ERROR_ON(_weights->info()->dimension(1) != 3 || _weights->info()->dimension(2) != 3);
+    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(_input->info(), _weights->info(), _output->info(), _conv_info, _depth_multiplier, _run_optimized));
 
     _border_size = BorderSize(0, 0);
     _convolver   = create_convolver_object(_conv_info, _weights, _input, _output);
 
-    // Auto-configure output
-    bool        same_padding = _conv_info.has_padding();
-    TensorShape output_shape{ _input->info()->tensor_shape() };
-
-    output_shape.set(1, _convolver->output_size(output_shape.y(), same_padding)); // Set width
-    output_shape.set(2, _convolver->output_size(output_shape.z(), same_padding)); // Set height
-
-    // Output auto inizialitation if not yet initialized
-    auto_init_if_empty(*_output->info(),
-                       _input->info()->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(output_shape));
-
-    // Set padding in channels
-    const int num_channels = _weights->info()->dimension(0);
-    if((num_channels >= 128) && (num_channels % 16 == 0))
-    {
-        _input->info()->extend_padding(PaddingSize(0, 4, 0, 0));
-        _weights->info()->extend_padding(PaddingSize(0, 4, 0, 0));
-        _output->info()->extend_padding(PaddingSize(0, 4, 0, 0));
-    }
-
-    // Configure window
-    Window win;
-    auto   win_last = _convolver->get_window();
-    win.set(Window::DimX, Window::Dimension(0, win_last, 1));
-    INEKernel::configure(win);
+    auto win_config = validate_and_configure_window(_input->info(), _weights->info(), _output->info(), _conv_info, _depth_multiplier, true, _convolver.get());
+    ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
+    INEKernel::configure(win_config.second);
 }
 
 void NEDepthwiseConvolutionLayer3x3Kernel::run_generic(const Window &window, const ThreadInfo &info)
diff --git a/src/core/NEON/kernels/NEDepthwiseIm2ColKernel.cpp b/src/core/NEON/kernels/NEDepthwiseIm2ColKernel.cpp
index 5b43e2b14f..91b29cdf03 100644
--- a/src/core/NEON/kernels/NEDepthwiseIm2ColKernel.cpp
+++ b/src/core/NEON/kernels/NEDepthwiseIm2ColKernel.cpp
@@ -37,6 +37,21 @@
 
 using namespace arm_compute;
 
+namespace
+{
+Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, const Size2D &kernel_dims, const PadStrideInfo &conv_info, bool has_bias, unsigned int depth_multiplier)
+{
+    ARM_COMPUTE_UNUSED(conv_info);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::F16, DataType::F32);
+    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+    ARM_COMPUTE_RETURN_ERROR_ON(is_data_type_quantized_asymmetric(input->data_type()) && has_bias);
+    ARM_COMPUTE_RETURN_ERROR_ON((input->dimension(2) * depth_multiplier) != output->dimension(2));
+    ARM_COMPUTE_RETURN_ERROR_ON(output->dimension(0) != (kernel_dims.width * kernel_dims.height + ((has_bias) ? 1 : 0)));
+
+    return Status{};
+}
+} // namespace
+
 template <typename T>
 void NEDepthwiseIm2ColKernel::run_generic(const Window &window)
 {
@@ -120,11 +135,9 @@ NEDepthwiseIm2ColKernel::NEDepthwiseIm2ColKernel()
 
 void NEDepthwiseIm2ColKernel::configure(const ITensor *input, ITensor *output, const Size2D &kernel_dims, const PadStrideInfo &conv_info, bool has_bias, unsigned int depth_multiplier)
 {
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::F16, DataType::F32);
-    ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
-    ARM_COMPUTE_ERROR_ON(is_data_type_quantized_asymmetric(input->info()->data_type()) && has_bias);
-    ARM_COMPUTE_ERROR_ON((input->info()->dimension(2) * depth_multiplier) != output->info()->dimension(2));
-    ARM_COMPUTE_ERROR_ON(output->info()->dimension(0) != (kernel_dims.width * kernel_dims.height + ((has_bias) ? 1 : 0)));
+    ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
+
+    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), kernel_dims, conv_info, has_bias, depth_multiplier));
 
     _input            = input;
     _output           = output;
@@ -158,6 +171,13 @@ void NEDepthwiseIm2ColKernel::configure(const ITensor *input, ITensor *output, c
     INEKernel::configure(win);
 }
 
+Status NEDepthwiseIm2ColKernel::validate(const ITensorInfo *input, const ITensorInfo *output, const Size2D &kernel_dims, const PadStrideInfo &conv_info, bool has_bias, unsigned int depth_multiplier) // Runs the same validate_arguments() checks as configure(), on tensor infos, and reports the outcome as a Status
+{
+    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output); // Both infos are dereferenced by validate_arguments(), so null pointers are rejected first
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, kernel_dims, conv_info, has_bias, depth_multiplier)); // Forward the first failing check, if any
+    return Status{};
+}
+
 void NEDepthwiseIm2ColKernel::run(const Window &window, const ThreadInfo &info)
 {
     ARM_COMPUTE_UNUSED(info);
diff --git a/src/core/NEON/kernels/NEDepthwiseVectorToTensorKernel.cpp b/src/core/NEON/kernels/NEDepthwiseVectorToTensorKernel.cpp
index 86a6d1c1a8..fe141bef56 100644
--- a/src/core/NEON/kernels/NEDepthwiseVectorToTensorKernel.cpp
+++ b/src/core/NEON/kernels/NEDepthwiseVectorToTensorKernel.cpp
@@ -34,8 +34,27 @@
 #include "arm_compute/core/Types.h"
 #include "arm_compute/core/Validate.h"
 #include "arm_compute/core/Window.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
 
 using namespace arm_compute;
+using namespace arm_compute::misc::shape_calculator;
+
+namespace
+{
+Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, size_t conv_w, size_t conv_h) // Argument checks shared by configure() and validate(); yields a default Status when nothing is rejected
+{
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::S32, DataType::F16, DataType::F32); // Accepted input types: single-channel QASYMM8, S32, F16 or F32
+
+    if(output->total_size() != 0) // Output is only checked when it already has a non-zero size; presumably an empty info is one configure() will auto-initialise - confirm
+    {
+        TensorShape output_shape = compute_vector_to_tensor_output_shape(input->tensor_shape(), conv_w, conv_h, output->data_layout()); // Shape the output is expected to have for this input and conv_w x conv_h
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output->tensor_shape(), output_shape);
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); // Output data type has to match the input
+    }
+
+    return Status{};
+}
+} // namespace
 
 template <typename T>
 void NEDepthwiseVectorToTensorKernel::vector_to_tensor(const Window &window)
@@ -76,19 +95,13 @@ NEDepthwiseVectorToTensorKernel::NEDepthwiseVectorToTensorKernel()
 
 void NEDepthwiseVectorToTensorKernel::configure(const ITensor *input, ITensor *output, size_t conv_w, size_t conv_h)
 {
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::S32, DataType::F16, DataType::F32);
-    ARM_COMPUTE_ERROR_ON_NULLPTR(output);
-
-    TensorShape output_shape = input->info()->tensor_shape();
-    output_shape.set(0, conv_w);
-    output_shape.set(1, conv_h);
-    output_shape.set(2, input->info()->tensor_shape()[0] / (conv_w * conv_h));
+    ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
 
     // Output auto inizialitation if not yet initialized
+    TensorShape output_shape = compute_vector_to_tensor_output_shape(input->info()->tensor_shape(), conv_w, conv_h, output->info()->data_layout());
     auto_init_if_empty(*output->info(), input->info()->clone()->set_tensor_shape(output_shape));
 
-    ARM_COMPUTE_ERROR_ON_MISMATCHING_DIMENSIONS(output->info()->tensor_shape(), output_shape);
-    ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), conv_w, conv_h));
 
     _input     = input;
     _output    = output;
@@ -121,6 +134,13 @@ void NEDepthwiseVectorToTensorKernel::configure(const ITensor *input, ITensor *o
     INEKernel::configure(win);
 }
 
+Status NEDepthwiseVectorToTensorKernel::validate(const ITensorInfo *input, const ITensorInfo *output, size_t conv_w, size_t conv_h) // Runs the same validate_arguments() checks as configure(), on tensor infos, and reports the outcome as a Status
+{
+    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output); // Both infos are dereferenced by validate_arguments(), so null pointers are rejected first
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, conv_w, conv_h)); // Forward the first failing check, if any
+    return Status{};
+}
+
 void NEDepthwiseVectorToTensorKernel::run(const Window &window, const ThreadInfo &info)
 {
     ARM_COMPUTE_UNUSED(info);
diff --git a/src/core/NEON/kernels/NEDepthwiseWeightsReshapeKernel.cpp b/src/core/NEON/kernels/NEDepthwiseWeightsReshapeKernel.cpp
index 47fcf12874..2c7a379c25 100644
--- a/src/core/NEON/kernels/NEDepthwiseWeightsReshapeKernel.cpp
+++ b/src/core/NEON/kernels/NEDepthwiseWeightsReshapeKernel.cpp
@@ -77,6 +77,24 @@ void weights_reshape(const ITensor *input, const ITensor *bias, ITensor *output,
     },
     in, out);
 }
+
+Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, const ITensorInfo *biases) // Argument checks shared by configure() and validate(); biases may be nullptr
+{
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::F16, DataType::F32); // Accepted input types: single-channel QASYMM8, F16 or F32
+    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); // Output data type has to match the input
+    ARM_COMPUTE_RETURN_ERROR_ON(is_data_type_quantized_asymmetric(input->data_type()) && (biases != nullptr)); // A biases info is not accepted together with a quantized asymmetric input
+    ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(2) != output->dimension(1)); // Output dimension 1 must equal input dimension 2
+    ARM_COMPUTE_RETURN_ERROR_ON(output->dimension(0) != (input->dimension(0) * input->dimension(1) + ((biases != nullptr) ? 1 : 0))); // Output dimension 0 must be input dimension 0 * dimension 1, plus one extra element when biases is given
+
+    if(biases != nullptr) // The remaining checks only apply when a biases info was supplied
+    {
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, biases); // biases data type has to match the input
+        ARM_COMPUTE_RETURN_ERROR_ON(biases->dimension(0) != input->dimension(2)); // Dimension 0 of biases must equal input dimension 2
+        ARM_COMPUTE_RETURN_ERROR_ON(biases->num_dimensions() > 1); // biases must not have more than one dimension
+    }
+
+    return Status{};
+}
 } // namespace
 
 NEDepthwiseWeightsReshapeKernel::NEDepthwiseWeightsReshapeKernel()
@@ -86,18 +104,9 @@ NEDepthwiseWeightsReshapeKernel::NEDepthwiseWeightsReshapeKernel()
 
 void NEDepthwiseWeightsReshapeKernel::configure(const ITensor *input, ITensor *output, const ITensor *biases)
 {
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::F16, DataType::F32);
-    ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
-    ARM_COMPUTE_ERROR_ON(is_data_type_quantized_asymmetric(input->info()->data_type()) && (biases != nullptr));
-    ARM_COMPUTE_ERROR_ON(input->info()->dimension(2) != output->info()->dimension(1));
-    ARM_COMPUTE_ERROR_ON(output->info()->dimension(0) != (input->info()->dimension(0) * input->info()->dimension(1) + ((biases != nullptr) ? 1 : 0)));
+    ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
 
-    if(biases != nullptr)
-    {
-        ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, biases);
-        ARM_COMPUTE_ERROR_ON(biases->info()->dimension(0) != input->info()->dimension(2));
-        ARM_COMPUTE_ERROR_ON(biases->info()->num_dimensions() > 1);
-    }
+    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), (biases != nullptr) ? biases->info() : nullptr));
 
     _input  = input;
     _output = output;
@@ -135,6 +144,13 @@ void NEDepthwiseWeightsReshapeKernel::configure(const ITensor *input, ITensor *o
     INEKernel::configure(win);
 }
 
+Status NEDepthwiseWeightsReshapeKernel::validate(const ITensorInfo *input, const ITensorInfo *output, const ITensorInfo *biases) // Runs the same validate_arguments() checks as configure(), on tensor infos, and reports the outcome as a Status
+{
+    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output); // biases is left out of the null check: validate_arguments() handles a nullptr biases itself
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, biases)); // Forward the first failing check, if any
+    return Status{};
+}
+
 void NEDepthwiseWeightsReshapeKernel::run(const Window &window, const ThreadInfo &info)
 {
     ARM_COMPUTE_UNUSED(info);
diff --git a/src/core/NEON/kernels/NEGEMMMatrixVectorMultiplyKernel.cpp b/src/core/NEON/kernels/NEGEMMMatrixVectorMultiplyKernel.cpp
index 8588f43edf..238786953b 100644
--- a/src/core/NEON/kernels/NEGEMMMatrixVectorMultiplyKernel.cpp
+++ b/src/core/NEON/kernels/NEGEMMMatrixVectorMultiplyKernel.cpp
@@ -39,6 +39,43 @@
 
 using namespace arm_compute;
 
+namespace
+{
+Status validate_arguments(const ITensorInfo *input0, const ITensorInfo *input1, const ITensorInfo *output) // Data type and shape checks shared by configure() and validate()
+{
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input0, 1, DataType::QASYMM8, DataType::F32); // Accepted input0 types: single-channel QASYMM8 or F32
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_NOT_IN(output, DataType::S32, DataType::F32); // Accepted output types: S32 or F32
+    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input0, input1); // Both inputs must share one data type
+    ARM_COMPUTE_RETURN_ERROR_ON(is_data_type_quantized_asymmetric(input0->data_type()) && (output->data_type() != DataType::S32)); // Quantized asymmetric input requires an S32 output
+    ARM_COMPUTE_RETURN_ERROR_ON(is_data_type_float(input0->data_type()) && (output->data_type() != DataType::F32)); // Floating-point input requires an F32 output
+
+    ARM_COMPUTE_RETURN_ERROR_ON(input0->num_dimensions() == input1->num_dimensions()); // Inputs reporting the same number of dimensions are rejected
+    ARM_COMPUTE_RETURN_ERROR_ON(input0->dimension(2) != input1->dimension(1)); // input0 dimension 2 must equal input1 dimension 1
+    ARM_COMPUTE_RETURN_ERROR_ON(input0->dimension(DataLayoutDimension::HEIGHT) != output->dimension(DataLayoutDimension::HEIGHT)); // NOTE(review): dimension() is called with a plain index everywhere else in this function; confirm which index the enum resolves to here
+    ARM_COMPUTE_RETURN_ERROR_ON(input1->dimension(DataLayoutDimension::WIDTH) != output->dimension(DataLayoutDimension::WIDTH)); // NOTE(review): same question as the line above
+
+    return Status{};
+}
+
+std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input0, ITensorInfo *input1, ITensorInfo *output) // Builds the execution window; the Status is an error when update_window_and_padding() reports a change
+{
+    const unsigned int num_elems_read_per_iteration = 16 / input0->element_size(); // Elements per iteration: 16 divided by the element size (16 bytes' worth, assuming element_size() is in bytes)
+
+    Window win = calculate_max_window(*input0, Steps(num_elems_read_per_iteration)); // Window is derived from input0 and stepped by the per-iteration element count
+
+    AccessWindowHorizontal input0_access(input0, 0, num_elems_read_per_iteration);
+    AccessWindowHorizontal input1_access(input1, 0, num_elems_read_per_iteration);
+    AccessWindowStatic     output_access(output, 0, 0, output->dimension(0), output->dimension(1)); // Output access spans the full extent of output dimensions 0 and 1
+
+    bool window_changed = update_window_and_padding(win, input0_access, input1_access, output_access);
+
+    output->set_valid_region(ValidRegion(Coordinates(), output->tensor_shape())); // Valid region is set to the whole output shape, anchored at default Coordinates
+
+    Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{}; // A changed window is reported to the caller as insufficient padding
+    return std::make_pair(err, win);
+}
+} // namespace
+
 template <typename I0, typename I1, typename O>
 void NEGEMMMatrixVectorMultiplyKernel::matrix_vector_multiply(const Window &window_in, const Window &window_w, const Window &window_out)
 {
@@ -175,10 +212,9 @@ BorderSize NEGEMMMatrixVectorMultiplyKernel::border_size() const
 
 void NEGEMMMatrixVectorMultiplyKernel::configure(const ITensor *input0, const ITensor *input1, ITensor *output)
 {
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input0, 1, DataType::QASYMM8, DataType::F32);
-    ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input0, input1);
-    ARM_COMPUTE_ERROR_ON(is_data_type_quantized_asymmetric(input0->info()->data_type()) && (output->info()->data_type() != DataType::S32));
-    ARM_COMPUTE_ERROR_ON(input0->info()->dimension(2) != input1->info()->dimension(1));
+    ARM_COMPUTE_ERROR_ON_NULLPTR(input0, input1, output);
+
+    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input0->info(), input1->info(), output->info()));
 
     _input0 = input0;
     _input1 = input1;
@@ -203,17 +239,17 @@ void NEGEMMMatrixVectorMultiplyKernel::configure(const ITensor *input0, const IT
     const unsigned int border_x = ceil_to_multiple(input0->info()->dimension(0), num_elems_read_per_iteration) - input0->info()->dimension(0);
     _border_size                = BorderSize(0, border_x);
 
-    Window win = calculate_max_window(*input0->info(), Steps(num_elems_read_per_iteration));
-
-    AccessWindowHorizontal input0_access(input0->info(), 0, num_elems_read_per_iteration);
-    AccessWindowHorizontal input1_access(input1->info(), 0, num_elems_read_per_iteration);
-    AccessWindowStatic     output_access(output->info(), 0, 0, output->info()->dimension(0), output->info()->dimension(1));
-
-    update_window_and_padding(win, input0_access, input1_access, output_access);
-
-    _output->info()->set_valid_region(ValidRegion(Coordinates(), _output->info()->tensor_shape()));
+    auto win_config = validate_and_configure_window(input0->info(), input1->info(), output->info());
+    ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
+    INEKernel::configure(win_config.second);
+}
 
-    INEKernel::configure(win);
+Status NEGEMMMatrixVectorMultiplyKernel::validate(const ITensorInfo *input0, const ITensorInfo *input1, const ITensorInfo *output) // Argument checks followed by a trial window configuration, reported as a Status
+{
+    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input0, input1, output); // All three infos are dereferenced below, so null pointers are rejected first
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input0, input1, output)); // Forward the first failing argument check, if any
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input0->clone().get(), input1->clone().get(), output->clone().get()).first); // Window configuration takes non-const infos, so it runs on temporary clones and only its Status is kept
+    return Status{};
 }
 
 void NEGEMMMatrixVectorMultiplyKernel::run(const Window &window, const ThreadInfo &info)
diff --git a/src/runtime/NEON/functions/NEDepthwiseConvolutionLayer.cpp b/src/runtime/NEON/functions/NEDepthwiseConvolutionLayer.cpp
index 1d65dde2a6..3b54ed62c7 100644
--- a/src/runtime/NEON/functions/NEDepthwiseConvolutionLayer.cpp
+++ b/src/runtime/NEON/functions/NEDepthwiseConvolutionLayer.cpp
@@ -123,6 +123,16 @@ void NEDepthwiseConvolutionLayer3x3::configure(ITensor *input, const ITensor *we
     }
 }
 
+Status NEDepthwiseConvolutionLayer3x3::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info,
+                                                unsigned int depth_multiplier) // Checks pointers and data layout here, then defers everything else to the 3x3 kernel's validate()
+{
+    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weights, output); // biases is not in this list, so a nullptr biases is accepted
+    ARM_COMPUTE_UNUSED(biases); // No bias shape or type check is performed by this function. NOTE(review): confirm whether biases should be validated here
+    ARM_COMPUTE_RETURN_ERROR_ON(input->data_layout() != DataLayout::NCHW && input->data_layout() != DataLayout::NHWC); // Only NCHW and NHWC layouts are accepted
+
+    return NEDepthwiseConvolutionLayer3x3Kernel::validate(input, weights, output, conv_info, depth_multiplier); // The kernel-level result is returned unchanged
+}
+
 void NEDepthwiseConvolutionLayer3x3::run()
 {
     if(_is_first_run && _is_optimized)
@@ -263,6 +273,58 @@ void NEDepthwiseConvolutionLayer::configure(ITensor *input, const ITensor *weigh
     _v2mm_output.allocator()->allocate();
 }
 
+Status NEDepthwiseConvolutionLayer::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info,
+                                             unsigned int depth_multiplier) // Builds the intermediate tensor infos locally and validates each kernel stage in turn
+{
+    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weights, output); // biases is not in this list, so a nullptr biases is accepted
+    ARM_COMPUTE_RETURN_ERROR_ON(input->data_layout() != DataLayout::NCHW && input->data_layout() != DataLayout::NHWC); // Only NCHW and NHWC layouts are accepted
+
+    const bool         is_quantized = is_data_type_quantized_asymmetric(input->data_type());
+    const bool         append_bias  = (biases != nullptr) && !is_quantized; // Bias is passed to the im2col / weights-reshape stages only for non-quantized input
+    const TensorShape  output_shape = shape_calculator::compute_depthwise_convolution_shape(*input, *weights, conv_info, depth_multiplier);
+    const size_t       weights_w    = weights->dimension(0);
+    const size_t       weights_h    = weights->dimension(1);
+    const size_t       weights_z    = weights->dimension(2);
+    const unsigned int conv_w       = output_shape.x();
+    const unsigned int conv_h       = output_shape.y();
+    const size_t       patch_size   = weights_w * weights_h + (append_bias ? 1 : 0); // One element per kernel tap, plus one when the bias is appended
+    const size_t       conv_size    = conv_w * conv_h; // x * y extent of the computed output shape
+
+    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output->tensor_shape(), output_shape); // The caller's output must already have the computed shape
+
+    // Im2Col configuration
+    TensorShape shape_im2col = input->tensor_shape();
+    shape_im2col.set(0, patch_size);
+    shape_im2col.set(1, conv_size);
+    shape_im2col.set(2, weights_z);
+    TensorInfo input_reshaped(input->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(shape_im2col)); // Local stand-in for the im2col output; built from a clone so the caller's info is not modified
+    ARM_COMPUTE_RETURN_ON_ERROR(NEDepthwiseIm2ColKernel::validate(input, &input_reshaped, Size2D(weights_w, weights_h), conv_info, append_bias, depth_multiplier));
+
+    // Weights reshape configuration
+    const TensorShape shape_weights_reshape(patch_size, weights_z);
+    TensorInfo        weights_reshaped(weights->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(shape_weights_reshape)); // Local stand-in for the reshaped weights
+    ARM_COMPUTE_RETURN_ON_ERROR(NEDepthwiseWeightsReshapeKernel::validate(weights, &weights_reshaped, append_bias ? biases : nullptr)); // biases is withheld here unless it is being appended
+
+    // GEMV configuration
+    DataType    v2mm_dt        = (input->data_type() == DataType::QASYMM8) ? DataType::S32 : input->data_type(); // QASYMM8 input uses an S32 GEMV output; any other type is kept as is
+    TensorShape shape_v2mm_out = input->tensor_shape();
+    shape_v2mm_out.set(0, conv_size * weights_z);
+    shape_v2mm_out.set(1, 1);
+    shape_v2mm_out.set(2, 1);
+    TensorInfo v2mm_output(input->clone()->set_is_resizable(true).reset_padding().set_data_type(v2mm_dt).set_tensor_shape(shape_v2mm_out)); // Local stand-in for the GEMV output
+    ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMMatrixVectorMultiplyKernel::validate(&input_reshaped, &weights_reshaped, &v2mm_output));
+
+    TensorInfo output_reshaped(v2mm_output.clone()->set_is_resizable(true).reset_padding().set_tensor_shape(output_shape)); // Same data type as the GEMV output, reshaped to the final output shape
+    ARM_COMPUTE_RETURN_ON_ERROR(NEDepthwiseVectorToTensorKernel::validate(&v2mm_output, (is_quantized) ? &output_reshaped : output, conv_w, conv_h)); // Quantized: checked against the local intermediate info; otherwise against the caller's output
+
+    if(is_quantized)
+    {
+        ARM_COMPUTE_RETURN_ON_ERROR(NEDirectConvolutionLayerOutputStageKernel::validate(&output_reshaped, biases, output)); // On the quantized path biases (possibly nullptr) is handed to the output stage instead of being appended earlier
+    }
+
+    return Status{};
+}
+
 void NEDepthwiseConvolutionLayer::run()
 {
     prepare();
diff --git a/tests/validation/NEON/DepthwiseConvolutionLayer.cpp b/tests/validation/NEON/DepthwiseConvolutionLayer.cpp
index b1cc491ac8..956fd741df 100644
--- a/tests/validation/NEON/DepthwiseConvolutionLayer.cpp
+++ b/tests/validation/NEON/DepthwiseConvolutionLayer.cpp
@@ -54,56 +54,139 @@ const auto depth_multipliers = framework::dataset::make("DepthMultiplier", { 1,
 TEST_SUITE(NEON)
 TEST_SUITE(DepthwiseConvLayer)
 
-DATA_TEST_CASE(Configuration, framework::DatasetMode::ALL, combine(combine(framework::dataset::concat(datasets::SmallDepthwiseConvolutionLayerDataset3x3(),
-                                                                                                      datasets::LargeDepthwiseConvolutionLayerDataset3x3()),
-                                                                           depth_multipliers),
-                                                                   framework::dataset::make("DataType", DataType::F32)),
-               input_shape, kernel_size, info, depth_multiplier, data_type)
+// *INDENT-OFF*
+// clang-format off
+DATA_TEST_CASE(Validate3x3, framework::DatasetMode::ALL, zip(zip(zip(zip(zip(zip(
+               framework::dataset::make("InputInfo", { TensorInfo(TensorShape(32U, 18U, 2U), 1, DataType::F32),     // Mismatching data type input/weights
+                                                       TensorInfo(TensorShape(32U, 18U, 3U), 1, DataType::F32),     // Mismatching input feature maps
+                                                       TensorInfo(TensorShape(32U, 18U, 2U), 1, DataType::F32),     // Unsupported weights dimensions
+                                                       TensorInfo(TensorShape(32U, 18U, 2U), 1, DataType::F32),     // Mismatching depth multiplier
+                                                       TensorInfo(TensorShape(32U, 18U, 2U), 1, DataType::F32),     // Invalid stride
+                                                       TensorInfo(TensorShape(32U, 18U, 2U), 1, DataType::F32),     // Invalid biases size
+                                                       TensorInfo(TensorShape(32U, 18U, 2U), 1, DataType::F32),     // Invalid biases dimensions
+                                                       TensorInfo(TensorShape(32U, 18U, 2U), 1, DataType::F32),     // Invalid output size
+                                                       TensorInfo(TensorShape(27U, 13U, 2U), 1, DataType::F32),     // Window shrink
+                                                     }),
+               framework::dataset::make("WeightsInfo", { TensorInfo(TensorShape(3U, 3U, 2U), 1, DataType::F16),
+                                                         TensorInfo(TensorShape(3U, 3U, 2U), 1, DataType::F32),
+                                                         TensorInfo(TensorShape(5U, 5U, 2U), 1, DataType::F32),
+                                                         TensorInfo(TensorShape(3U, 3U, 2U), 1, DataType::F32),
+                                                         TensorInfo(TensorShape(3U, 3U, 2U), 1, DataType::F32),
+                                                         TensorInfo(TensorShape(3U, 3U, 2U), 1, DataType::F32),
+                                                         TensorInfo(TensorShape(3U, 3U, 2U), 1, DataType::F32),
+                                                         TensorInfo(TensorShape(3U, 3U, 2U), 1, DataType::F32),
+                                                         TensorInfo(TensorShape(3U, 3U, 2U), 1, DataType::F32),
+                                                       })),
+               framework::dataset::make("BiasesInfo", { TensorInfo(TensorShape(2U), 1, DataType::F32),
+                                                        TensorInfo(TensorShape(2U), 1, DataType::F32),
+                                                        TensorInfo(TensorShape(2U), 1, DataType::F32),
+                                                        TensorInfo(TensorShape(2U), 1, DataType::F32),
+                                                        TensorInfo(TensorShape(2U), 1, DataType::F32),
+                                                        TensorInfo(TensorShape(4U), 1, DataType::F32),
+                                                        TensorInfo(TensorShape(2U, 2U), 1, DataType::F32),
+                                                        TensorInfo(TensorShape(2U), 1, DataType::F32),
+                                                        TensorInfo(TensorShape(2U), 1, DataType::F32),
+                                                      })),
+               framework::dataset::make("OutputInfo", { TensorInfo(TensorShape(30U, 16U, 2U), 1, DataType::F32),
+                                                        TensorInfo(TensorShape(30U, 16U, 2U), 1, DataType::F32),
+                                                        TensorInfo(TensorShape(30U, 16U, 2U), 1, DataType::F32),
+                                                        TensorInfo(TensorShape(30U, 16U, 2U), 1, DataType::F32),
+                                                        TensorInfo(TensorShape(30U, 16U, 2U), 1, DataType::F32),
+                                                        TensorInfo(TensorShape(30U, 16U, 2U), 1, DataType::F32),
+                                                        TensorInfo(TensorShape(30U, 16U, 2U), 1, DataType::F32),
+                                                        TensorInfo(TensorShape(32U, 18U, 2U), 1, DataType::F32),
+                                                        TensorInfo(TensorShape(25U, 11U, 2U), 1, DataType::F32),
+                                                      })),
+               framework::dataset::make("ConvInfo", { PadStrideInfo(1, 1, 0, 0),
+                                                      PadStrideInfo(1, 1, 0, 0),
+                                                      PadStrideInfo(1, 1, 0, 0),
+                                                      PadStrideInfo(1, 1, 0, 0),
+                                                      PadStrideInfo(4, 1, 0, 0),
+                                                      PadStrideInfo(1, 1, 0, 0),
+                                                      PadStrideInfo(1, 1, 0, 0),
+                                                      PadStrideInfo(1, 1, 0, 0),
+                                                      PadStrideInfo(1, 1, 0, 0),
+                                                     })),
+               framework::dataset::make("DepthMultiplier", { 1,
+                                                             1,
+                                                             1,
+                                                             3,
+                                                             1,
+                                                             1,
+                                                             1,
+                                                             1,
+                                                             1,
+                                                            })),
+               framework::dataset::make("Expected", { false, false, false, false, false, false, false, false, false })), // All nine rows above are expected to be rejected by validate()
+               input_info, weights_info, biases_info, output_info, conv_info, depth_multiplier, expected)
 {
-    // Get shapes
-    TensorShape weights_shape(kernel_size.width, kernel_size.height);
-
-    const TensorInfo  in_info(input_shape, 1, data_type);
-    const TensorInfo  we_info(weights_shape, 1, data_type);
-    const TensorShape output_shape = compute_depthwise_convolution_shape(in_info, we_info, info, depth_multiplier);
-
-    weights_shape.set(2, output_shape.z());
-
-    // Create tensors
-    Tensor            src     = create_tensor<Tensor>(input_shape, data_type);
-    Tensor            dst     = create_tensor<Tensor>(output_shape, data_type);
-    Tensor            weights = create_tensor<Tensor>(weights_shape, data_type);
-    const TensorShape bias_shape(weights_shape[2]);
-    Tensor            bias = create_tensor<Tensor>(bias_shape, data_type);
-
-    ARM_COMPUTE_EXPECT(src.info()->is_resizable(), framework::LogLevel::ERRORS);
-    ARM_COMPUTE_EXPECT(dst.info()->is_resizable(), framework::LogLevel::ERRORS);
-    ARM_COMPUTE_EXPECT(weights.info()->is_resizable(), framework::LogLevel::ERRORS);
-    ARM_COMPUTE_EXPECT(bias.info()->is_resizable(), framework::LogLevel::ERRORS);
-
-    // Create and configure function
-    NEDepthwiseConvolutionLayer3x3 depthwise_layer;
-    depthwise_layer.configure(&src, &weights, &bias, &dst, info, depth_multiplier);
-
-    // Validate valid region
-    const ValidRegion input_valid_region   = shape_to_valid_region(input_shape);
-    const ValidRegion output_valid_region  = shape_to_valid_region(output_shape);
-    const ValidRegion weights_valid_region = shape_to_valid_region(weights_shape);
-    const ValidRegion bias_valid_region    = shape_to_valid_region(bias_shape);
-
-    validate(src.info()->valid_region(), input_valid_region);
-    validate(dst.info()->valid_region(), output_valid_region);
-    validate(weights.info()->valid_region(), weights_valid_region);
-    validate(bias.info()->valid_region(), bias_valid_region);
+    bool is_valid = bool(NEDepthwiseConvolutionLayer3x3::validate(&input_info.clone()->set_is_resizable(false), &weights_info.clone()->set_is_resizable(false), &biases_info.clone()->set_is_resizable(false), &output_info.clone()->set_is_resizable(false), conv_info, depth_multiplier));
+    ARM_COMPUTE_EXPECT(is_valid == expected, framework::LogLevel::ERRORS); // The row passes when validate()'s verdict, converted to bool, equals the Expected column
+}
 
-    // Validate padding
-    bool              is_optimized_run = NEDepthwiseConvolutionLayer3x3Kernel::is_optimized_execution_possible(input_shape, info, data_type, depth_multiplier, DataLayout::NCHW);
-    const int         step_non_opt_dwc = 16 >> info.stride().first;
-    const int         step_bias_add    = 16 / src.info()->element_size();
-    const int         step             = is_optimized_run ? step_bias_add : std::max(step_non_opt_dwc, step_bias_add);
-    const PaddingSize padding          = PaddingCalculator(output_shape.x(), step).required_padding();
-    validate(dst.info()->padding(), padding);
+DATA_TEST_CASE(ValidateGeneric, framework::DatasetMode::ALL, zip(zip(zip(zip(zip(zip(
+                framework::dataset::make("InputInfo", { TensorInfo(TensorShape(27U, 13U, 2U), 1, DataType::F32),    // Mismatching data type input/weights
+                                                        TensorInfo(TensorShape(27U, 13U, 3U), 1, DataType::F32),    // Mismatching input feature maps
+                                                        TensorInfo(TensorShape(27U, 13U, 2U), 1, DataType::F32),    // Mismatching depth multiplier
+                                                        TensorInfo(TensorShape(27U, 13U, 2U), 1, DataType::F32),    // Invalid biases size
+                                                        TensorInfo(TensorShape(27U, 13U, 2U), 1, DataType::F32),    // Invalid biases dimensions
+                                                        TensorInfo(TensorShape(27U, 13U, 2U), 1, DataType::F32),    // Invalid output size
+                                                        TensorInfo(TensorShape(27U, 13U, 8U), 1, DataType::F32),    // Expected valid: F32, depth multiplier 2
+                                                        TensorInfo(TensorShape(32U, 13U, 8U), 1, DataType::QASYMM8), // Expected valid: QASYMM8, depth multiplier 3
+                                                      }),
+                framework::dataset::make("WeightsInfo", { TensorInfo(TensorShape(3U, 3U, 2U), 1, DataType::F16),
+                                                          TensorInfo(TensorShape(3U, 3U, 2U), 1, DataType::F32),
+                                                          TensorInfo(TensorShape(3U, 3U, 2U), 1, DataType::F32),
+                                                          TensorInfo(TensorShape(3U, 3U, 2U), 1, DataType::F32),
+                                                          TensorInfo(TensorShape(3U, 3U, 2U), 1, DataType::F32),
+                                                          TensorInfo(TensorShape(3U, 3U, 2U), 1, DataType::F32),
+                                                          TensorInfo(TensorShape(3U, 3U, 16U), 1, DataType::F32),
+                                                          TensorInfo(TensorShape(3U, 3U, 24U), 1, DataType::QASYMM8),
+                                                        })),
+                framework::dataset::make("BiasesInfo", { TensorInfo(TensorShape(2U), 1, DataType::F32),
+                                                         TensorInfo(TensorShape(2U), 1, DataType::F32),
+                                                         TensorInfo(TensorShape(2U), 1, DataType::F32),
+                                                         TensorInfo(TensorShape(4U), 1, DataType::F32),
+                                                         TensorInfo(TensorShape(2U, 2U), 1, DataType::F32),
+                                                         TensorInfo(TensorShape(2U), 1, DataType::F32),
+                                                         TensorInfo(TensorShape(16U), 1, DataType::F32),
+                                                         TensorInfo(TensorShape(24U), 1, DataType::S32),
+                                                       })),
+                framework::dataset::make("OutputInfo", { TensorInfo(TensorShape(25U, 11U, 2U), 1, DataType::F32),
+                                                         TensorInfo(TensorShape(25U, 11U, 2U), 1, DataType::F32),
+                                                         TensorInfo(TensorShape(25U, 11U, 2U), 1, DataType::F32),
+                                                         TensorInfo(TensorShape(25U, 11U, 2U), 1, DataType::F32),
+                                                         TensorInfo(TensorShape(25U, 11U, 2U), 1, DataType::F32),
+                                                         TensorInfo(TensorShape(27U, 13U, 2U), 1, DataType::F32),
+                                                         TensorInfo(TensorShape(25U, 11U, 16U), 1, DataType::F32),
+                                                         TensorInfo(TensorShape(32U, 11U, 24U), 1, DataType::QASYMM8),
+                                                       })),
+                framework::dataset::make("ConvInfo", { PadStrideInfo(1, 1, 0, 0),
+                                                       PadStrideInfo(1, 1, 0, 0),
+                                                       PadStrideInfo(1, 1, 0, 0),
+                                                       PadStrideInfo(1, 1, 0, 0),
+                                                       PadStrideInfo(1, 1, 0, 0),
+                                                       PadStrideInfo(1, 1, 0, 0),
+                                                       PadStrideInfo(1, 1, 0, 0),
+                                                       PadStrideInfo(1, 1, 1, 0),
+                                                      })),
+                framework::dataset::make("DepthMultiplier", { 1,
+                                                              1,
+                                                              3,
+                                                              1,
+                                                              1,
+                                                              1,
+                                                              2,
+                                                              3,
+                                                             })),
+                framework::dataset::make("Expected", { false, false, false, false, false, false, true, true })), // Only the last two rows are expected to pass validate()
+                input_info, weights_info, biases_info, output_info, conv_info, depth_multiplier, expected)
+{
+    bool is_valid = bool(NEDepthwiseConvolutionLayer::validate(&input_info.clone()->set_is_resizable(false), &weights_info.clone()->set_is_resizable(false), &biases_info.clone()->set_is_resizable(false), &output_info.clone()->set_is_resizable(false), conv_info, depth_multiplier));
+    ARM_COMPUTE_EXPECT(is_valid == expected, framework::LogLevel::ERRORS); // The row passes when validate()'s verdict, converted to bool, equals the Expected column
 }
+// clang-format on
+// *INDENT-ON*
 
 TEST_SUITE(Float)
 TEST_SUITE(F32)
-- 
cgit v1.2.1