From 7784c837afd5844fb6dc4d166ff253d983abfd2d Mon Sep 17 00:00:00 2001
From: Abe Mbise <abe.mbise@arm.com>
Date: Thu, 31 May 2018 16:48:41 +0100
Subject: COMPMID-1167: Validation for NEDepthwiseConvolutionLayer

Change-Id: I9689e1a0627dc015dd2ce98417e4c97bb55581bb
Reviewed-on: https://eu-gerrit-1.euhpc.arm.com/131327
Reviewed-by: Anthony Barbier <anthony.barbier@arm.com>
Tested-by: Jenkins <bsgcomp@arm.com>
---
 .../CL/kernels/CLDepthwiseVectorToTensorKernel.cpp |  20 +-
 .../NEDepthwiseConvolutionLayer3x3Kernel.cpp       | 204 +++++++++++++--------
 src/core/NEON/kernels/NEDepthwiseIm2ColKernel.cpp  |  30 ++-
 .../kernels/NEDepthwiseVectorToTensorKernel.cpp    |  38 +++-
 .../kernels/NEDepthwiseWeightsReshapeKernel.cpp    |  38 ++--
 .../kernels/NEGEMMMatrixVectorMultiplyKernel.cpp   |  64 +++++--
 .../NEON/functions/NEDepthwiseConvolutionLayer.cpp |  62 +++++++
 7 files changed, 325 insertions(+), 131 deletions(-)

(limited to 'src')

diff --git a/src/core/CL/kernels/CLDepthwiseVectorToTensorKernel.cpp b/src/core/CL/kernels/CLDepthwiseVectorToTensorKernel.cpp
index e124ee42f3..67b2cc9f55 100644
--- a/src/core/CL/kernels/CLDepthwiseVectorToTensorKernel.cpp
+++ b/src/core/CL/kernels/CLDepthwiseVectorToTensorKernel.cpp
@@ -31,26 +31,14 @@
 #include "arm_compute/core/Error.h"
 #include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/Types.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
 #include "support/ToolchainSupport.h"
 
 using namespace arm_compute;
+using namespace arm_compute::misc::shape_calculator;
 
 namespace
 {
-TensorShape compute_output_shape(const TensorShape &input, size_t conv_w, size_t conv_h, const DataLayout &data_layout)
-{
-    const size_t idx_w = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
-    const size_t idx_h = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
-    const size_t idx_c = get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL);
-
-    TensorShape output_shape(input);
-    output_shape.set(idx_w, conv_w);
-    output_shape.set(idx_h, conv_h);
-    output_shape.set(idx_c, input.x() / (conv_w * conv_h));
-
-    return output_shape;
-}
-
 Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, size_t conv_w, size_t conv_h)
 {
     ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(input);
@@ -58,7 +46,7 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, s
 
     if(output->total_size() != 0)
     {
-        TensorShape output_shape = compute_output_shape(input->tensor_shape(), conv_w, conv_h, output->data_layout());
+        TensorShape output_shape = compute_vector_to_tensor_output_shape(input->tensor_shape(), conv_w, conv_h, output->data_layout());
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output->tensor_shape(), output_shape);
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
     }
@@ -77,7 +65,7 @@ void CLDepthwiseVectorToTensorKernel::configure(const ICLTensor *input, ICLTenso
     ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
 
     // Output auto inizialitation if not yet initialized
-    TensorShape output_shape = compute_output_shape(input->info()->tensor_shape(), conv_w, conv_h, output->info()->data_layout());
+    TensorShape output_shape = compute_vector_to_tensor_output_shape(input->info()->tensor_shape(), conv_w, conv_h, output->info()->data_layout());
     auto_init_if_empty(*output->info(), input->info()->clone()->set_tensor_shape(output_shape));
 
     ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), conv_w, conv_h));
diff --git a/src/core/NEON/kernels/NEDepthwiseConvolutionLayer3x3Kernel.cpp b/src/core/NEON/kernels/NEDepthwiseConvolutionLayer3x3Kernel.cpp
index 09728e2a8d..62dabc8d32 100644
--- a/src/core/NEON/kernels/NEDepthwiseConvolutionLayer3x3Kernel.cpp
+++ b/src/core/NEON/kernels/NEDepthwiseConvolutionLayer3x3Kernel.cpp
@@ -144,6 +144,112 @@ inline void convolve_3x3(const Window &window, unsigned int num_elems_written_pe
             ARM_COMPUTE_ERROR("Not implemented");
     }
 }
+
+Status validate_arguments(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *output, const PadStrideInfo &conv_info, unsigned int depth_multiplier, bool is_optimized)
+{
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::F32);
+    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights);
+
+    if(is_optimized)
+    {
+        ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(1) != 3 || weights->dimension(2) != 3);
+    }
+    else
+    {
+        ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(0) != 3 || weights->dimension(1) != 3);
+        ARM_COMPUTE_RETURN_ERROR_ON(conv_info.stride().first < 1 || conv_info.stride().first > 3);
+    }
+
+    if(output->total_size() != 0)
+    {
+        const TensorShape output_shape = compute_depthwise_convolution_shape(*input, *weights, conv_info, depth_multiplier);
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output->tensor_shape(), output_shape);
+
+        ARM_COMPUTE_RETURN_ERROR_ON(is_data_type_quantized_asymmetric(input->data_type()) && (output->data_type() != DataType::S32));
+        ARM_COMPUTE_RETURN_ERROR_ON(is_data_type_float(input->data_type()) && (output->data_type() != DataType::F32));
+    }
+
+    return Status{};
+}
+
+std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *weights, ITensorInfo *output, const PadStrideInfo &conv_info, unsigned int depth_multiplier, bool is_optimized,
+                                                        IDepthwiseConvolution *convolver = nullptr)
+{
+    Window win;
+    bool   window_changed = false;
+
+    if(is_optimized)
+    {
+        if(convolver != nullptr)
+        {
+            auto win_last = convolver->get_window();
+            win.set(Window::DimX, Window::Dimension(0, win_last, 1));
+
+            // Auto-configure output
+            bool        same_padding = conv_info.has_padding();
+            TensorShape output_shape{ input->tensor_shape() };
+
+            output_shape.set(1, convolver->output_size(output_shape.y(), same_padding)); // Set width
+            output_shape.set(2, convolver->output_size(output_shape.z(), same_padding)); // Set height
+
+            // Output auto inizialitation if not yet initialized
+            auto_init_if_empty(*output, input->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(output_shape));
+
+            // Configure window (optimised)
+            // Set padding in channels
+            const int num_channels = weights->dimension(0);
+            if((num_channels >= 128) && (num_channels % 16 == 0))
+            {
+                input->extend_padding(PaddingSize(0, 4, 0, 0));
+                weights->extend_padding(PaddingSize(0, 4, 0, 0));
+                output->extend_padding(PaddingSize(0, 4, 0, 0));
+            }
+        }
+    }
+    else
+    {
+        // Get convolved dimensions
+        const TensorShape output_shape = compute_depthwise_convolution_shape(*input, *weights, conv_info, depth_multiplier);
+        const DataType    output_dt    = (input->data_type() == DataType::QASYMM8) ? DataType::S32 : input->data_type();
+
+        // Output auto inizialitation if not yet initialized
+        auto_init_if_empty(*output, input->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(output_shape).set_data_type(output_dt));
+
+        // Configure kernel window (generic)
+        const unsigned int conv_stride_x = conv_info.stride().first;
+        const unsigned int conv_stride_y = conv_info.stride().second;
+        const unsigned int conv_pad_top  = conv_info.pad_top();
+        const unsigned int conv_pad_left = conv_info.pad_left();
+
+        unsigned int num_elems_written_per_iteration = 16 >> conv_stride_x;
+        unsigned int num_elems_read_per_iteration    = 0;
+
+        switch(input->data_type())
+        {
+            case DataType::QASYMM8:
+                num_elems_read_per_iteration = 16;
+                break;
+            case DataType::F32:
+                num_elems_read_per_iteration = 12;
+                break;
+            default:
+                ARM_COMPUTE_ERROR("Data type not supported.");
+        }
+
+        // Configure kernel window
+        win = calculate_max_window(*output, Steps(num_elems_written_per_iteration));
+
+        AccessWindowRectangle  input_access(input, -conv_pad_left, -conv_pad_top, num_elems_read_per_iteration, 3, conv_stride_x, conv_stride_y);
+        AccessWindowStatic     weights_access(weights, 0, 0, 3, 3);
+        AccessWindowHorizontal output_access(output, 0, num_elems_written_per_iteration);
+
+        window_changed = update_window_and_padding(win, input_access, weights_access, output_access);
+        output_access.set_valid_region(win, ValidRegion(Coordinates(), output->tensor_shape()));
+    }
+
+    Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
+    return std::make_pair(err, win);
+}
 } // namespace
 
 NEDepthwiseConvolutionLayer3x3Kernel::NEDepthwiseConvolutionLayer3x3Kernel()
@@ -159,8 +265,7 @@ BorderSize NEDepthwiseConvolutionLayer3x3Kernel::border_size() const
 void NEDepthwiseConvolutionLayer3x3Kernel::configure(const ITensor *input, const ITensor *weights, ITensor *output, const PadStrideInfo &conv_info, unsigned int depth_multiplier,
                                                      DataLayout data_layout)
 {
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::F32);
-    ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights);
+    ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output);
 
     _input            = input;
     _output           = output;
@@ -177,6 +282,17 @@ void NEDepthwiseConvolutionLayer3x3Kernel::configure(const ITensor *input, const
     (_run_optimized) ? configure_optimized() : configure_generic();
 }
 
+Status NEDepthwiseConvolutionLayer3x3Kernel::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *output, const PadStrideInfo &conv_info, unsigned int depth_multiplier)
+{
+    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weights, output);
+
+    bool is_optimized = NEDepthwiseConvolutionLayer3x3Kernel::is_optimized_execution_possible(input->tensor_shape(), conv_info, input->data_type(), depth_multiplier, input->data_layout());
+
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, weights, output, conv_info, depth_multiplier, is_optimized));
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(), weights->clone().get(), output->clone().get(), conv_info, depth_multiplier, is_optimized).first);
+    return Status{};
+}
+
 void NEDepthwiseConvolutionLayer3x3Kernel::run(const Window &window, const ThreadInfo &info)
 {
     ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
@@ -227,90 +343,26 @@ void NEDepthwiseConvolutionLayer3x3Kernel::generate_convolver()
 
 void NEDepthwiseConvolutionLayer3x3Kernel::configure_generic()
 {
-    ARM_COMPUTE_ERROR_ON(_weights->info()->dimension(0) != 3 || _weights->info()->dimension(1) != 3);
-
-    // Get convolved dimensions
-    const TensorShape output_shape = compute_depthwise_convolution_shape(*_input->info(), *_weights->info(), _conv_info, _depth_multiplier);
-    const DataType    output_dt    = (_input->info()->data_type() == DataType::QASYMM8) ? DataType::S32 : _input->info()->data_type();
-
-    // Output auto inizialitation if not yet initialized
-    auto_init_if_empty(*_output->info(),
-                       _input->info()->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(output_shape).set_data_type(output_dt));
-
-    ARM_COMPUTE_ERROR_ON_MISMATCHING_DIMENSIONS(_output->info()->tensor_shape(), output_shape);
-
-    const unsigned int conv_stride_x   = _conv_info.stride().first;
-    const unsigned int conv_stride_y   = _conv_info.stride().second;
-    const unsigned int conv_pad_top    = _conv_info.pad_top();
-    const unsigned int conv_pad_right  = _conv_info.pad_right();
-    const unsigned int conv_pad_bottom = _conv_info.pad_bottom();
-    const unsigned int conv_pad_left   = _conv_info.pad_left();
-
-    ARM_COMPUTE_ERROR_ON(conv_stride_x < 1 || conv_stride_x > 3);
-
-    unsigned int num_elems_read_per_iteration = 0;
-    switch(_input->info()->data_type())
-    {
-        case DataType::QASYMM8:
-            num_elems_read_per_iteration     = 16;
-            _num_elems_written_per_iteration = 16 >> conv_stride_x;
-            break;
-        case DataType::F32:
-            num_elems_read_per_iteration     = 12;
-            _num_elems_written_per_iteration = 16 >> conv_stride_x;
-            break;
-        default:
-            ARM_COMPUTE_ERROR("Data type not supported.");
-    }
-    _border_size = BorderSize(conv_pad_top, conv_pad_right, conv_pad_bottom, conv_pad_left);
-
-    // Configure kernel window
-    Window win = calculate_max_window(*_output->info(), Steps(_num_elems_written_per_iteration));
-
-    AccessWindowRectangle input_access(_input->info(), -conv_pad_left, -conv_pad_top,
-                                       num_elems_read_per_iteration, 3,
-                                       conv_stride_x, conv_stride_y);
-    AccessWindowStatic     weights_access(_weights->info(), 0, 0, 3, 3);
-    AccessWindowHorizontal output_access(_output->info(), 0, _num_elems_written_per_iteration);
+    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(_input->info(), _weights->info(), _output->info(), _conv_info, _depth_multiplier, _run_optimized));
 
-    update_window_and_padding(win, input_access, weights_access, output_access);
-    output_access.set_valid_region(win, ValidRegion(Coordinates(), _output->info()->tensor_shape()));
+    _num_elems_written_per_iteration = 16 >> _conv_info.stride().first;
+    _border_size                     = BorderSize(_conv_info.pad_top(), _conv_info.pad_right(), _conv_info.pad_bottom(), _conv_info.pad_left());
 
-    INEKernel::configure(win);
+    auto win_config = validate_and_configure_window(_input->info(), _weights->info(), _output->info(), _conv_info, _depth_multiplier, false);
+    ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
+    INEKernel::configure(win_config.second);
 }
 
 void NEDepthwiseConvolutionLayer3x3Kernel::configure_optimized()
 {
-    ARM_COMPUTE_ERROR_ON(_weights->info()->dimension(1) != 3 || _weights->info()->dimension(2) != 3);
+    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(_input->info(), _weights->info(), _output->info(), _conv_info, _depth_multiplier, _run_optimized));
 
     _border_size = BorderSize(0, 0);
     _convolver   = create_convolver_object(_conv_info, _weights, _input, _output);
 
-    // Auto-configure output
-    bool        same_padding = _conv_info.has_padding();
-    TensorShape output_shape{ _input->info()->tensor_shape() };
-
-    output_shape.set(1, _convolver->output_size(output_shape.y(), same_padding)); // Set width
-    output_shape.set(2, _convolver->output_size(output_shape.z(), same_padding)); // Set height
-
-    // Output auto inizialitation if not yet initialized
-    auto_init_if_empty(*_output->info(),
-                       _input->info()->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(output_shape));
-
-    // Set padding in channels
-    const int num_channels = _weights->info()->dimension(0);
-    if((num_channels >= 128) && (num_channels % 16 == 0))
-    {
-        _input->info()->extend_padding(PaddingSize(0, 4, 0, 0));
-        _weights->info()->extend_padding(PaddingSize(0, 4, 0, 0));
-        _output->info()->extend_padding(PaddingSize(0, 4, 0, 0));
-    }
-
-    // Configure window
-    Window win;
-    auto   win_last = _convolver->get_window();
-    win.set(Window::DimX, Window::Dimension(0, win_last, 1));
-    INEKernel::configure(win);
+    auto win_config = validate_and_configure_window(_input->info(), _weights->info(), _output->info(), _conv_info, _depth_multiplier, true, _convolver.get());
+    ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
+    INEKernel::configure(win_config.second);
 }
 
 void NEDepthwiseConvolutionLayer3x3Kernel::run_generic(const Window &window, const ThreadInfo &info)
diff --git a/src/core/NEON/kernels/NEDepthwiseIm2ColKernel.cpp b/src/core/NEON/kernels/NEDepthwiseIm2ColKernel.cpp
index 5b43e2b14f..91b29cdf03 100644
--- a/src/core/NEON/kernels/NEDepthwiseIm2ColKernel.cpp
+++ b/src/core/NEON/kernels/NEDepthwiseIm2ColKernel.cpp
@@ -37,6 +37,21 @@
 
 using namespace arm_compute;
 
+namespace
+{
+Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, const Size2D &kernel_dims, const PadStrideInfo &conv_info, bool has_bias, unsigned int depth_multiplier)
+{
+    ARM_COMPUTE_UNUSED(conv_info);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::F16, DataType::F32);
+    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+    ARM_COMPUTE_RETURN_ERROR_ON(is_data_type_quantized_asymmetric(input->data_type()) && has_bias);
+    ARM_COMPUTE_RETURN_ERROR_ON((input->dimension(2) * depth_multiplier) != output->dimension(2));
+    ARM_COMPUTE_RETURN_ERROR_ON(output->dimension(0) != (kernel_dims.width * kernel_dims.height + ((has_bias) ? 1 : 0)));
+
+    return Status{};
+}
+} // namespace
+
 template <typename T>
 void NEDepthwiseIm2ColKernel::run_generic(const Window &window)
 {
@@ -120,11 +135,9 @@ NEDepthwiseIm2ColKernel::NEDepthwiseIm2ColKernel()
 
 void NEDepthwiseIm2ColKernel::configure(const ITensor *input, ITensor *output, const Size2D &kernel_dims, const PadStrideInfo &conv_info, bool has_bias, unsigned int depth_multiplier)
 {
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::F16, DataType::F32);
-    ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
-    ARM_COMPUTE_ERROR_ON(is_data_type_quantized_asymmetric(input->info()->data_type()) && has_bias);
-    ARM_COMPUTE_ERROR_ON((input->info()->dimension(2) * depth_multiplier) != output->info()->dimension(2));
-    ARM_COMPUTE_ERROR_ON(output->info()->dimension(0) != (kernel_dims.width * kernel_dims.height + ((has_bias) ? 1 : 0)));
+    ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
+
+    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), kernel_dims, conv_info, has_bias, depth_multiplier));
 
     _input            = input;
     _output           = output;
@@ -158,6 +171,13 @@ void NEDepthwiseIm2ColKernel::configure(const ITensor *input, ITensor *output, c
     INEKernel::configure(win);
 }
 
+Status NEDepthwiseIm2ColKernel::validate(const ITensorInfo *input, const ITensorInfo *output, const Size2D &kernel_dims, const PadStrideInfo &conv_info, bool has_bias, unsigned int depth_multiplier)
+{
+    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, kernel_dims, conv_info, has_bias, depth_multiplier));
+    return Status{};
+}
+
 void NEDepthwiseIm2ColKernel::run(const Window &window, const ThreadInfo &info)
 {
     ARM_COMPUTE_UNUSED(info);
diff --git a/src/core/NEON/kernels/NEDepthwiseVectorToTensorKernel.cpp b/src/core/NEON/kernels/NEDepthwiseVectorToTensorKernel.cpp
index 86a6d1c1a8..fe141bef56 100644
--- a/src/core/NEON/kernels/NEDepthwiseVectorToTensorKernel.cpp
+++ b/src/core/NEON/kernels/NEDepthwiseVectorToTensorKernel.cpp
@@ -34,8 +34,27 @@
 #include "arm_compute/core/Types.h"
 #include "arm_compute/core/Validate.h"
 #include "arm_compute/core/Window.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
 
 using namespace arm_compute;
+using namespace arm_compute::misc::shape_calculator;
+
+namespace
+{
+Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, size_t conv_w, size_t conv_h)
+{
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::S32, DataType::F16, DataType::F32);
+
+    if(output->total_size() != 0)
+    {
+        TensorShape output_shape = compute_vector_to_tensor_output_shape(input->tensor_shape(), conv_w, conv_h, output->data_layout());
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output->tensor_shape(), output_shape);
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+    }
+
+    return Status{};
+}
+} // namespace
 
 template <typename T>
 void NEDepthwiseVectorToTensorKernel::vector_to_tensor(const Window &window)
@@ -76,19 +95,13 @@ NEDepthwiseVectorToTensorKernel::NEDepthwiseVectorToTensorKernel()
 
 void NEDepthwiseVectorToTensorKernel::configure(const ITensor *input, ITensor *output, size_t conv_w, size_t conv_h)
 {
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::S32, DataType::F16, DataType::F32);
-    ARM_COMPUTE_ERROR_ON_NULLPTR(output);
-
-    TensorShape output_shape = input->info()->tensor_shape();
-    output_shape.set(0, conv_w);
-    output_shape.set(1, conv_h);
-    output_shape.set(2, input->info()->tensor_shape()[0] / (conv_w * conv_h));
+    ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
 
     // Output auto inizialitation if not yet initialized
+    TensorShape output_shape = compute_vector_to_tensor_output_shape(input->info()->tensor_shape(), conv_w, conv_h, output->info()->data_layout());
     auto_init_if_empty(*output->info(), input->info()->clone()->set_tensor_shape(output_shape));
 
-    ARM_COMPUTE_ERROR_ON_MISMATCHING_DIMENSIONS(output->info()->tensor_shape(), output_shape);
-    ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), conv_w, conv_h));
 
     _input     = input;
     _output    = output;
@@ -121,6 +134,13 @@ void NEDepthwiseVectorToTensorKernel::configure(const ITensor *input, ITensor *o
     INEKernel::configure(win);
 }
 
+Status NEDepthwiseVectorToTensorKernel::validate(const ITensorInfo *input, const ITensorInfo *output, size_t conv_w, size_t conv_h)
+{
+    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, conv_w, conv_h));
+    return Status{};
+}
+
 void NEDepthwiseVectorToTensorKernel::run(const Window &window, const ThreadInfo &info)
 {
     ARM_COMPUTE_UNUSED(info);
diff --git a/src/core/NEON/kernels/NEDepthwiseWeightsReshapeKernel.cpp b/src/core/NEON/kernels/NEDepthwiseWeightsReshapeKernel.cpp
index 47fcf12874..2c7a379c25 100644
--- a/src/core/NEON/kernels/NEDepthwiseWeightsReshapeKernel.cpp
+++ b/src/core/NEON/kernels/NEDepthwiseWeightsReshapeKernel.cpp
@@ -77,6 +77,24 @@ void weights_reshape(const ITensor *input, const ITensor *bias, ITensor *output,
     },
     in, out);
 }
+
+Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, const ITensorInfo *biases)
+{
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::F16, DataType::F32);
+    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+    ARM_COMPUTE_RETURN_ERROR_ON(is_data_type_quantized_asymmetric(input->data_type()) && (biases != nullptr));
+    ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(2) != output->dimension(1));
+    ARM_COMPUTE_RETURN_ERROR_ON(output->dimension(0) != (input->dimension(0) * input->dimension(1) + ((biases != nullptr) ? 1 : 0)));
+
+    if(biases != nullptr)
+    {
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, biases);
+        ARM_COMPUTE_RETURN_ERROR_ON(biases->dimension(0) != input->dimension(2));
+        ARM_COMPUTE_RETURN_ERROR_ON(biases->num_dimensions() > 1);
+    }
+
+    return Status{};
+}
 } // namespace
 
 NEDepthwiseWeightsReshapeKernel::NEDepthwiseWeightsReshapeKernel()
@@ -86,18 +104,9 @@ NEDepthwiseWeightsReshapeKernel::NEDepthwiseWeightsReshapeKernel()
 
 void NEDepthwiseWeightsReshapeKernel::configure(const ITensor *input, ITensor *output, const ITensor *biases)
 {
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::F16, DataType::F32);
-    ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
-    ARM_COMPUTE_ERROR_ON(is_data_type_quantized_asymmetric(input->info()->data_type()) && (biases != nullptr));
-    ARM_COMPUTE_ERROR_ON(input->info()->dimension(2) != output->info()->dimension(1));
-    ARM_COMPUTE_ERROR_ON(output->info()->dimension(0) != (input->info()->dimension(0) * input->info()->dimension(1) + ((biases != nullptr) ? 1 : 0)));
+    ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
 
-    if(biases != nullptr)
-    {
-        ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, biases);
-        ARM_COMPUTE_ERROR_ON(biases->info()->dimension(0) != input->info()->dimension(2));
-        ARM_COMPUTE_ERROR_ON(biases->info()->num_dimensions() > 1);
-    }
+    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), (biases != nullptr) ? biases->info() : nullptr));
 
     _input  = input;
     _output = output;
@@ -135,6 +144,13 @@ void NEDepthwiseWeightsReshapeKernel::configure(const ITensor *input, ITensor *o
     INEKernel::configure(win);
 }
 
+Status NEDepthwiseWeightsReshapeKernel::validate(const ITensorInfo *input, const ITensorInfo *output, const ITensorInfo *biases)
+{
+    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, biases));
+    return Status{};
+}
+
 void NEDepthwiseWeightsReshapeKernel::run(const Window &window, const ThreadInfo &info)
 {
     ARM_COMPUTE_UNUSED(info);
diff --git a/src/core/NEON/kernels/NEGEMMMatrixVectorMultiplyKernel.cpp b/src/core/NEON/kernels/NEGEMMMatrixVectorMultiplyKernel.cpp
index 8588f43edf..238786953b 100644
--- a/src/core/NEON/kernels/NEGEMMMatrixVectorMultiplyKernel.cpp
+++ b/src/core/NEON/kernels/NEGEMMMatrixVectorMultiplyKernel.cpp
@@ -39,6 +39,43 @@
 
 using namespace arm_compute;
 
+namespace
+{
+Status validate_arguments(const ITensorInfo *input0, const ITensorInfo *input1, const ITensorInfo *output)
+{
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input0, 1, DataType::QASYMM8, DataType::F32);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_NOT_IN(output, DataType::S32, DataType::F32);
+    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input0, input1);
+    ARM_COMPUTE_RETURN_ERROR_ON(is_data_type_quantized_asymmetric(input0->data_type()) && (output->data_type() != DataType::S32));
+    ARM_COMPUTE_RETURN_ERROR_ON(is_data_type_float(input0->data_type()) && (output->data_type() != DataType::F32));
+
+    ARM_COMPUTE_RETURN_ERROR_ON(input0->num_dimensions() == input1->num_dimensions());
+    ARM_COMPUTE_RETURN_ERROR_ON(input0->dimension(2) != input1->dimension(1));
+    ARM_COMPUTE_RETURN_ERROR_ON(input0->dimension(DataLayoutDimension::HEIGHT) != output->dimension(DataLayoutDimension::HEIGHT));
+    ARM_COMPUTE_RETURN_ERROR_ON(input1->dimension(DataLayoutDimension::WIDTH) != output->dimension(DataLayoutDimension::WIDTH));
+
+    return Status{};
+}
+
+std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input0, ITensorInfo *input1, ITensorInfo *output)
+{
+    const unsigned int num_elems_read_per_iteration = 16 / input0->element_size();
+
+    Window win = calculate_max_window(*input0, Steps(num_elems_read_per_iteration));
+
+    AccessWindowHorizontal input0_access(input0, 0, num_elems_read_per_iteration);
+    AccessWindowHorizontal input1_access(input1, 0, num_elems_read_per_iteration);
+    AccessWindowStatic     output_access(output, 0, 0, output->dimension(0), output->dimension(1));
+
+    bool window_changed = update_window_and_padding(win, input0_access, input1_access, output_access);
+
+    output->set_valid_region(ValidRegion(Coordinates(), output->tensor_shape()));
+
+    Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
+    return std::make_pair(err, win);
+}
+} // namespace
+
 template <typename I0, typename I1, typename O>
 void NEGEMMMatrixVectorMultiplyKernel::matrix_vector_multiply(const Window &window_in, const Window &window_w, const Window &window_out)
 {
@@ -175,10 +212,9 @@ BorderSize NEGEMMMatrixVectorMultiplyKernel::border_size() const
 
 void NEGEMMMatrixVectorMultiplyKernel::configure(const ITensor *input0, const ITensor *input1, ITensor *output)
 {
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input0, 1, DataType::QASYMM8, DataType::F32);
-    ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input0, input1);
-    ARM_COMPUTE_ERROR_ON(is_data_type_quantized_asymmetric(input0->info()->data_type()) && (output->info()->data_type() != DataType::S32));
-    ARM_COMPUTE_ERROR_ON(input0->info()->dimension(2) != input1->info()->dimension(1));
+    ARM_COMPUTE_ERROR_ON_NULLPTR(input0, input1, output);
+
+    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input0->info(), input1->info(), output->info()));
 
     _input0 = input0;
     _input1 = input1;
@@ -203,17 +239,17 @@ void NEGEMMMatrixVectorMultiplyKernel::configure(const ITensor *input0, const IT
     const unsigned int border_x = ceil_to_multiple(input0->info()->dimension(0), num_elems_read_per_iteration) - input0->info()->dimension(0);
     _border_size                = BorderSize(0, border_x);
 
-    Window win = calculate_max_window(*input0->info(), Steps(num_elems_read_per_iteration));
-
-    AccessWindowHorizontal input0_access(input0->info(), 0, num_elems_read_per_iteration);
-    AccessWindowHorizontal input1_access(input1->info(), 0, num_elems_read_per_iteration);
-    AccessWindowStatic     output_access(output->info(), 0, 0, output->info()->dimension(0), output->info()->dimension(1));
-
-    update_window_and_padding(win, input0_access, input1_access, output_access);
-
-    _output->info()->set_valid_region(ValidRegion(Coordinates(), _output->info()->tensor_shape()));
+    auto win_config = validate_and_configure_window(input0->info(), input1->info(), output->info());
+    ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
+    INEKernel::configure(win_config.second);
+}
 
-    INEKernel::configure(win);
+Status NEGEMMMatrixVectorMultiplyKernel::validate(const ITensorInfo *input0, const ITensorInfo *input1, const ITensorInfo *output)
+{
+    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input0, input1, output);
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input0, input1, output));
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input0->clone().get(), input1->clone().get(), output->clone().get()).first);
+    return Status{};
 }
 
 void NEGEMMMatrixVectorMultiplyKernel::run(const Window &window, const ThreadInfo &info)
diff --git a/src/runtime/NEON/functions/NEDepthwiseConvolutionLayer.cpp b/src/runtime/NEON/functions/NEDepthwiseConvolutionLayer.cpp
index 1d65dde2a6..3b54ed62c7 100644
--- a/src/runtime/NEON/functions/NEDepthwiseConvolutionLayer.cpp
+++ b/src/runtime/NEON/functions/NEDepthwiseConvolutionLayer.cpp
@@ -123,6 +123,16 @@ void NEDepthwiseConvolutionLayer3x3::configure(ITensor *input, const ITensor *we
     }
 }
 
+Status NEDepthwiseConvolutionLayer3x3::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info,
+                                                unsigned int depth_multiplier)
+{
+    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weights, output);
+    ARM_COMPUTE_UNUSED(biases);
+    ARM_COMPUTE_RETURN_ERROR_ON(input->data_layout() != DataLayout::NCHW && input->data_layout() != DataLayout::NHWC);
+
+    return NEDepthwiseConvolutionLayer3x3Kernel::validate(input, weights, output, conv_info, depth_multiplier);
+}
+
 void NEDepthwiseConvolutionLayer3x3::run()
 {
     if(_is_first_run && _is_optimized)
@@ -263,6 +273,58 @@ void NEDepthwiseConvolutionLayer::configure(ITensor *input, const ITensor *weigh
     _v2mm_output.allocator()->allocate();
 }
 
+Status NEDepthwiseConvolutionLayer::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info,
+                                             unsigned int depth_multiplier)
+{
+    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weights, output);
+    ARM_COMPUTE_RETURN_ERROR_ON(input->data_layout() != DataLayout::NCHW && input->data_layout() != DataLayout::NHWC);
+
+    const bool         is_quantized = is_data_type_quantized_asymmetric(input->data_type());
+    const bool         append_bias  = (biases != nullptr) && !is_quantized;
+    const TensorShape  output_shape = shape_calculator::compute_depthwise_convolution_shape(*input, *weights, conv_info, depth_multiplier);
+    const size_t       weights_w    = weights->dimension(0);
+    const size_t       weights_h    = weights->dimension(1);
+    const size_t       weights_z    = weights->dimension(2);
+    const unsigned int conv_w       = output_shape.x();
+    const unsigned int conv_h       = output_shape.y();
+    const size_t       patch_size   = weights_w * weights_h + (append_bias ? 1 : 0);
+    const size_t       conv_size    = conv_w * conv_h;
+
+    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output->tensor_shape(), output_shape);
+
+    // Im2Col configuration
+    TensorShape shape_im2col = input->tensor_shape();
+    shape_im2col.set(0, patch_size);
+    shape_im2col.set(1, conv_size);
+    shape_im2col.set(2, weights_z);
+    TensorInfo input_reshaped(input->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(shape_im2col));
+    ARM_COMPUTE_RETURN_ON_ERROR(NEDepthwiseIm2ColKernel::validate(input, &input_reshaped, Size2D(weights_w, weights_h), conv_info, append_bias, depth_multiplier));
+
+    // Weights reshape configuration
+    const TensorShape shape_weights_reshape(patch_size, weights_z);
+    TensorInfo        weights_reshaped(weights->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(shape_weights_reshape));
+    ARM_COMPUTE_RETURN_ON_ERROR(NEDepthwiseWeightsReshapeKernel::validate(weights, &weights_reshaped, append_bias ? biases : nullptr));
+
+    // GEMV configuration
+    DataType    v2mm_dt        = (input->data_type() == DataType::QASYMM8) ? DataType::S32 : input->data_type();
+    TensorShape shape_v2mm_out = input->tensor_shape();
+    shape_v2mm_out.set(0, conv_size * weights_z);
+    shape_v2mm_out.set(1, 1);
+    shape_v2mm_out.set(2, 1);
+    TensorInfo v2mm_output(input->clone()->set_is_resizable(true).reset_padding().set_data_type(v2mm_dt).set_tensor_shape(shape_v2mm_out));
+    ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMMatrixVectorMultiplyKernel::validate(&input_reshaped, &weights_reshaped, &v2mm_output));
+
+    TensorInfo output_reshaped(v2mm_output.clone()->set_is_resizable(true).reset_padding().set_tensor_shape(output_shape));
+    ARM_COMPUTE_RETURN_ON_ERROR(NEDepthwiseVectorToTensorKernel::validate(&v2mm_output, (is_quantized) ? &output_reshaped : output, conv_w, conv_h));
+
+    if(is_quantized)
+    {
+        ARM_COMPUTE_RETURN_ON_ERROR(NEDirectConvolutionLayerOutputStageKernel::validate(&output_reshaped, biases, output));
+    }
+
+    return Status{};
+}
+
 void NEDepthwiseConvolutionLayer::run()
 {
     prepare();
-- 
cgit v1.2.1