From a3b1b469276b10484cd45901ae3a4b48b506caa9 Mon Sep 17 00:00:00 2001 From: Georgios Pinitas Date: Thu, 16 Nov 2017 19:24:39 +0000 Subject: COMPMID-667: Add validation static method to NEON GEMMlowp Change-Id: I8a470cc1351593ad8eeaf4ec92e04865e83d4f3c Reviewed-on: http://mpd-gerrit.cambridge.arm.com/96147 Tested-by: Kaizen Reviewed-by: Anthony Barbier --- arm_compute/core/ITensorInfo.h | 4 +- .../core/NEON/kernels/NEGEMMInterleave4x4Kernel.h | 12 +- .../NEON/kernels/NEGEMMInterleaveBlockedKernel.h | 11 ++ .../NEON/kernels/NEGEMMLowpMatrixMultiplyKernel.h | 11 +- .../kernels/NEGEMMLowpOffsetContributionKernel.h | 11 ++ ...NEGEMMLowpQuantizeDownInt32ToUint8ScaleKernel.h | 11 ++ .../core/NEON/kernels/NEGEMMLowpReductionKernel.h | 20 +++ .../core/NEON/kernels/NEGEMMTranspose1xWKernel.h | 10 +- .../kernels/arm64/NEGEMMLowpAArch64V8P4Kernel.h | 10 ++ arm_compute/core/SubTensorInfo.h | 3 +- arm_compute/core/TensorInfo.h | 3 +- .../NEON/functions/NEGEMMLowpMatrixMultiplyCore.h | 9 ++ .../runtime/NEON/functions/NEGEMMLowpOutputStage.h | 11 ++ .../NEON/kernels/NEGEMMInterleave4x4Kernel.cpp | 76 ++++++++--- .../NEON/kernels/NEGEMMInterleaveBlockedKernel.cpp | 92 +++++++++---- .../kernels/NEGEMMLowpMatrixMultiplyKernel.cpp | 79 +++++++---- .../kernels/NEGEMMLowpOffsetContributionKernel.cpp | 151 ++++++++++++--------- ...GEMMLowpQuantizeDownInt32ToUint8ScaleKernel.cpp | 101 +++++++++----- .../NEON/kernels/NEGEMMLowpReductionKernel.cpp | 107 +++++++++++---- src/core/NEON/kernels/NEGEMMTranspose1xWKernel.cpp | 92 +++++++++---- .../kernels/arm64/NEGEMMLowpAArch64V8P4Kernel.cpp | 60 ++++++-- .../functions/NEGEMMLowpMatrixMultiplyCore.cpp | 88 +++++++++++- .../NEON/functions/NEGEMMLowpOutputStage.cpp | 5 + tests/validation/NEON/GEMMLowp.cpp | 33 +++++ 24 files changed, 765 insertions(+), 245 deletions(-) diff --git a/arm_compute/core/ITensorInfo.h b/arm_compute/core/ITensorInfo.h index 1bc0a80bac..4f69442b48 100644 --- a/arm_compute/core/ITensorInfo.h +++ b/arm_compute/core/ITensorInfo.h @@ -201,8 +201,10 @@ public: /** Set the flag whether the tensor size can be changed. * * @param[in] is_resizable Flag that marks the tensor if it can be changed or not. + * + * @return Reference to this ITensorInfo object */ - virtual void set_is_resizable(bool is_resizable) = 0; + virtual ITensorInfo &set_is_resizable(bool is_resizable) = 0; /** Valid region of the tensor. All elements in the valid region have defined values, i.e. are not undefined. * * @return The valid region. diff --git a/arm_compute/core/NEON/kernels/NEGEMMInterleave4x4Kernel.h b/arm_compute/core/NEON/kernels/NEGEMMInterleave4x4Kernel.h index 1c0d85c27b..db719caccb 100644 --- a/arm_compute/core/NEON/kernels/NEGEMMInterleave4x4Kernel.h +++ b/arm_compute/core/NEON/kernels/NEGEMMInterleave4x4Kernel.h @@ -56,10 +56,18 @@ public: NEGEMMInterleave4x4Kernel(); /** Initialise the kernel's input and output. * - * @param[in] input Input tensor. Data types supported: U8/S8/QS8/QS16/U16/S16/F16/U32/S32/F32 + * @param[in] input Input tensor. Data types supported: U8/S8/QS8/QASYMM8/QS16/U16/S16/F16/U32/S32/F32 * @param[out] output Output tensor which stores the interleaved matrix. Data type supported: same as @p input. */ void configure(const ITensor *input, ITensor *output); + /** Static function to check if given info will lead to a valid configuration of @ref NEGEMMInterleave4x4Kernel + * + * @param[in] input Input tensor info. 
Data types supported: U8/S8/QS8/QASYMM8/QS16/U16/S16/F16/U32/S32/F32
+     * @param[in] output Output tensor info which stores the interleaved matrix. Data type supported: same as @p input.
+     *
+     * @return an error status
+     */
+    static Error validate(const ITensorInfo *input, const ITensorInfo *output);
 
     // Inherited methods overridden:
     void run(const Window &window, const ThreadInfo &info) override;
@@ -67,7 +75,7 @@ public:
 private:
     /** Common signature for all the transpose functions
      *
-     * @param[in]  input  An input tensor. Data types supported: U8/S8/QS8/U16/S16/QS16/F16/U32/S32/F32
+     * @param[in]  input  An input tensor. Data types supported: U8/S8/QS8/QASYMM8/U16/S16/QS16/F16/U32/S32/F32
      * @param[out] output The output tensor. Data type supported: same as @p input
      * @param[in]  window Region on which to execute the kernel.
      */
diff --git a/arm_compute/core/NEON/kernels/NEGEMMInterleaveBlockedKernel.h b/arm_compute/core/NEON/kernels/NEGEMMInterleaveBlockedKernel.h
index cdeb11d606..1a5b0fb863 100644
--- a/arm_compute/core/NEON/kernels/NEGEMMInterleaveBlockedKernel.h
+++ b/arm_compute/core/NEON/kernels/NEGEMMInterleaveBlockedKernel.h
@@ -50,6 +50,17 @@ public:
      * @param[in]  transpose    True if transpose operation must be performed, false otherwise.
      */
     void configure(const ITensor *input, ITensor *output, unsigned int block_height, unsigned int block_width, bool transpose);
+    /** Static function to check if given info will lead to a valid configuration of @ref NEGEMMInterleaveBlockedKernel
+     *
+     * @param[in] input        Input tensor. Data types supported: U8
+     * @param[in] output       Output tensor which stores the interleaved matrix. Data type supported: same as @p input.
+     * @param[in] block_height The height of the blocks to be interleaved.
+     * @param[in] block_width  The width of the blocks to be interleaved.
+     * @param[in] transpose    True if transpose operation must be performed, false otherwise.
+     *
+     * @return an error status
+     */
+    static Error validate(const ITensorInfo *input, const ITensorInfo *output, unsigned int block_height, unsigned int block_width, bool transpose);
 
     // Inherited methods overridden:
     void run(const Window &window, const ThreadInfo &info) override;
diff --git a/arm_compute/core/NEON/kernels/NEGEMMLowpMatrixMultiplyKernel.h b/arm_compute/core/NEON/kernels/NEGEMMLowpMatrixMultiplyKernel.h
index e9bfe4ea07..d9986b6cdd 100644
--- a/arm_compute/core/NEON/kernels/NEGEMMLowpMatrixMultiplyKernel.h
+++ b/arm_compute/core/NEON/kernels/NEGEMMLowpMatrixMultiplyKernel.h
@@ -58,11 +58,20 @@ public:
      * The input matrices @p input0 and @p input1 must be the output of the kernels: @ref NEGEMMInterleave4x4Kernel and @ref NEGEMMTranspose1xWKernel. These two
      * kernels change the layout of the original matrices to be more cache-friendly.
      *
-     * @param[in]  input0 Input tensor containing the interleaved Matrix A. Data type supported: ASYMM8
+     * @param[in]  input0 Input tensor containing the interleaved Matrix A. Data type supported: QASYMM8
      * @param[in]  input1 Input tensor containing the transposed1xW Matrix B. Data type supported: same as @p input0
      * @param[out] output Output tensor to store the result of matrix multiplication. Data type supported: S32
      */
     void configure(const ITensor *input0, const ITensor *input1, ITensor *output);
+    /** Static function to check if given info will lead to a valid configuration of @ref NEGEMMLowpMatrixMultiplyKernel
+     *
+     * @param[in] input0 Input tensor info containing the interleaved Matrix A. Data type supported: QASYMM8
+     * @param[in] input1 Input tensor info containing the transposed Matrix B. Data type supported: same as @p input0
+     * @param[in] output Output tensor info to store the result of matrix multiplication. Data type supported: S32
+     *
+     * @return an error status
+     */
+    static Error validate(const ITensorInfo *input0, const ITensorInfo *input1, const ITensorInfo *output);
 
     // Inherited methods overridden:
     void run(const Window &window, const ThreadInfo &info) override;
diff --git a/arm_compute/core/NEON/kernels/NEGEMMLowpOffsetContributionKernel.h b/arm_compute/core/NEON/kernels/NEGEMMLowpOffsetContributionKernel.h
index 8c1bae9396..27cb3f2c1c 100644
--- a/arm_compute/core/NEON/kernels/NEGEMMLowpOffsetContributionKernel.h
+++ b/arm_compute/core/NEON/kernels/NEGEMMLowpOffsetContributionKernel.h
@@ -68,6 +68,19 @@ public:
      * @param[in]      b_offset       Offset to be added to each element of the matrix B.
      */
     void configure(ITensor *mm_result, const ITensor *vector_sum_col, const ITensor *vector_sum_row, int32_t k, int32_t a_offset, int32_t b_offset);
+    /** Static function to check if given info will lead to a valid configuration of @ref NEGEMMLowpOffsetContributionKernel
+     *
+     * @param[in] mm_result      Input tensor containing the result of @ref NEGEMMLowpMatrixMultiplyKernel. Data type supported: S32
+     * @param[in] vector_sum_col Input row-vector of sums of all the entries in each column of matrix B.
+     *                           Note: vector_sum_col can be a nullptr in case a_offset = 0. Data type supported: same as @p mm_result
+     * @param[in] vector_sum_row Input row-vector of sums of all the entries in each row of matrix A.
+     *                           Note: vector_sum_row can be a nullptr in case b_offset = 0. Data type supported: same as @p mm_result
+     * @param[in] a_offset       Offset to be added to each element of the matrix A.
+     * @param[in] b_offset       Offset to be added to each element of the matrix B.
+     *
+     * @return an error status
+     */
+    static Error validate(const ITensorInfo *mm_result, const ITensorInfo *vector_sum_col, const ITensorInfo *vector_sum_row, int32_t a_offset, int32_t b_offset);
 
     // Inherited methods overridden:
     void run(const Window &window, const ThreadInfo &info) override;
diff --git a/arm_compute/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ToUint8ScaleKernel.h b/arm_compute/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ToUint8ScaleKernel.h
index 4ec0e9df93..654dee21af 100644
--- a/arm_compute/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ToUint8ScaleKernel.h
+++ b/arm_compute/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ToUint8ScaleKernel.h
@@ -70,6 +70,19 @@ public:
      *                    Along with @p min, this value can be used to implement "rectified linear unit" activation functions
      */
     void configure(const ITensor *input, const ITensor *bias, ITensor *output, int result_offset, int result_mult_int, int result_shift, int min = 0, int max = 0);
+    /** Static function to check if given info will lead to a valid configuration of @ref NEGEMMLowpQuantizeDownInt32ToUint8ScaleKernel
+     *
+     * @param[in] input  Input tensor. Data type supported: S32
+     * @param[in] bias   Biases tensor. Only shared biases supported and it can be a nullptr if the biases addition is not required.
+     *                   Biases are 1D tensor with dimensions [OFM]. Data type supported: Same as @p input.
+     * @param[in] output Output tensor. Data type supported: QASYMM8
+     * @param[in] min    (Optional) Min value used to saturate down the output result before converting back to QASYMM8
+     * @param[in] max    (Optional) Max value used to saturate up the output result before converting back to QASYMM8,
+     *                   Along with @p min, this value can be used to implement "rectified linear unit" activation functions
+     *
+     * @return an error status
+     */
+    static Error validate(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output, int min = 0, int max = 0);
 
     // Inherited methods overridden:
     void run(const Window &window, const ThreadInfo &info) override;
diff --git a/arm_compute/core/NEON/kernels/NEGEMMLowpReductionKernel.h b/arm_compute/core/NEON/kernels/NEGEMMLowpReductionKernel.h
index 6eee54a9f0..9ca5cdf828 100644
--- a/arm_compute/core/NEON/kernels/NEGEMMLowpReductionKernel.h
+++ b/arm_compute/core/NEON/kernels/NEGEMMLowpReductionKernel.h
@@ -77,6 +77,16 @@ public:
      * @param[in]  is_interleaved4x4 True if the matrix A has been interleaved4x4
      */
     void configure(const ITensor *mtx_a, ITensor *vector_sum_row, int32_t num_mtx_a_cols, bool is_interleaved4x4) override;
+    /** Static function to check if given info will lead to a valid configuration of @ref NEGEMMLowpMatrixAReductionKernel
+     *
+     * @param[in] mtx_a             Input tensor. Data type supported: QASYMM8
+     * @param[in] vector_sum_row    Output row-vector of sums of all the entries in each row of mtx_a. Data type supported: S32
+     * @param[in] num_mtx_a_cols    Number of matrix A columns
+     * @param[in] is_interleaved4x4 True if the matrix A has been interleaved4x4
+     *
+     * @return an error status
+     */
+    static Error validate(const ITensorInfo *mtx_a, const ITensorInfo *vector_sum_row, int32_t num_mtx_a_cols, bool is_interleaved4x4);
 
     // Inherited methods overridden:
     void run(const Window &window, const ThreadInfo &info) override;
@@ -98,6 +108,16 @@ public:
      * @param[in]  is_transposed1xW True if the input tensor is transposed 1xW
      */
     void configure(const ITensor *mtx_b, ITensor *vector_sum_col, int32_t num_mtx_b_rows, bool is_transposed1xW) override;
+    /** Static function to check if given info will lead to a valid configuration of @ref NEGEMMLowpMatrixBReductionKernel
+     *
+     * @param[in] mtx_b            Input tensor. Data type supported: QASYMM8
+     * @param[in] vector_sum_col   Output row-vector of sums of all the entries in each column of mtx_b. Data type supported: S32
+     * @param[in] num_mtx_b_rows   Number of matrix B rows
+     * @param[in] is_transposed1xW True if the input tensor is transposed 1xW
+     *
+     * @return an error status
+     */
+    static Error validate(const ITensorInfo *mtx_b, const ITensorInfo *vector_sum_col, int32_t num_mtx_b_rows, bool is_transposed1xW);
 
     // Inherited methods overridden:
     void run(const Window &window, const ThreadInfo &info) override;
diff --git a/arm_compute/core/NEON/kernels/NEGEMMTranspose1xWKernel.h b/arm_compute/core/NEON/kernels/NEGEMMTranspose1xWKernel.h
index 4d0bb2a484..4436d1fdb0 100644
--- a/arm_compute/core/NEON/kernels/NEGEMMTranspose1xWKernel.h
+++ b/arm_compute/core/NEON/kernels/NEGEMMTranspose1xWKernel.h
@@ -70,10 +70,18 @@ class NEGEMMTranspose1xWKernel : public INESimpleKernel
 public:
     /** Initialise the kernel's input and output.
      *
-     * @param[in]  input  Input tensor. Data types supported: U8/S8/QS8/U16/S16/QS16/F16/U32/S32/F32
+     * @param[in]  input  Input tensor. Data types supported: U8/S8/QS8/QASYMM8/U16/S16/QS16/F16/U32/S32/F32
      * @param[out] output Output tensor. Data type supported: same as @p input.
      */
     void configure(const ITensor *input, ITensor *output);
+    /** Static function to check if given info will lead to a valid configuration of @ref NEGEMMTranspose1xWKernel
+     *
+     * @param[in] input  Input tensor info. Data types supported: U8/S8/QS8/QASYMM8/U16/S16/QS16/F16/U32/S32/F32
+     * @param[in] output Output tensor info. Data type supported: same as @p input.
+     *
+     * @return an error status
+     */
+    static Error validate(const ITensorInfo *input, const ITensorInfo *output);
 
     // Inherited methods overridden:
     void run(const Window &window, const ThreadInfo &info) override;
diff --git a/arm_compute/core/NEON/kernels/arm64/NEGEMMLowpAArch64V8P4Kernel.h b/arm_compute/core/NEON/kernels/arm64/NEGEMMLowpAArch64V8P4Kernel.h
index 26af626aaa..66684a1185 100644
--- a/arm_compute/core/NEON/kernels/arm64/NEGEMMLowpAArch64V8P4Kernel.h
+++ b/arm_compute/core/NEON/kernels/arm64/NEGEMMLowpAArch64V8P4Kernel.h
@@ -39,6 +39,18 @@ class NEGEMMLowpAArch64V8P4Kernel : public NEGEMMAssemblyBaseKernel
 public:
     // Inherited methods overridden:
     void run(const Window &window, const ThreadInfo &info) override;
+    /** Static function to check if given info will lead to a valid configuration of @ref NEGEMMAssemblyBaseKernel
+     *
+     * The computed function is C = a * AxB + b * C.
+     *
+     * @param[in] input0 Input tensor info containing the Matrix A. Data types supported: QASYMM8
+     * @param[in] input1 Input tensor info containing the Matrix B. Data types supported: same as @p input0
+     * @param[in] output Output tensor info to store the result of matrix multiplication.
+     *                   If @p beta is not zero the values are multiplied by @p beta before the result is accumulated. Otherwise the values are overwritten by the result. Data types supported: S32
+     *
+     * @return an error status
+     */
+    static Error validate(const ITensorInfo *input0, const ITensorInfo *input1, const ITensorInfo *output);
 
 protected:
     void internal_configure(const ITensor *input0, const ITensor *input1, ITensor *output, ITensor *workspace, float alpha, float beta, bool transform_0, bool transform_1) override;
diff --git a/arm_compute/core/SubTensorInfo.h b/arm_compute/core/SubTensorInfo.h
index b8a36854dc..7c464c0b17 100644
--- a/arm_compute/core/SubTensorInfo.h
+++ b/arm_compute/core/SubTensorInfo.h
@@ -185,10 +185,11 @@ public:
         ARM_COMPUTE_ERROR_ON(_parent == nullptr);
         return _parent->is_resizable();
     }
-    void set_is_resizable(bool is_resizable) override
+    ITensorInfo &set_is_resizable(bool is_resizable) override
     {
         ARM_COMPUTE_ERROR_ON(_parent == nullptr);
         _parent->set_is_resizable(is_resizable);
+        return *this;
     }
     ValidRegion valid_region() const override
     {
diff --git a/arm_compute/core/TensorInfo.h b/arm_compute/core/TensorInfo.h
index 5fd6c47818..80ef7f8d5a 100644
--- a/arm_compute/core/TensorInfo.h
+++ b/arm_compute/core/TensorInfo.h
@@ -280,9 +280,10 @@ public:
     {
         return _is_resizable;
     }
-    void set_is_resizable(bool is_resizable) override
+    ITensorInfo &set_is_resizable(bool is_resizable) override
    {
         _is_resizable = is_resizable;
+        return *this;
     }
     ValidRegion valid_region() const override
     {
diff --git a/arm_compute/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.h b/arm_compute/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.h
index 0c441df4b9..598756e435 100644
--- a/arm_compute/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.h
+++ b/arm_compute/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.h
@@ -71,6 +71,15 @@ public:
      * @param[out] output Output tensor. Data type supported: S32
      */
     void configure(const ITensor *a, const ITensor *b, ITensor *output);
+    /** Static function to check if given info will lead to a valid configuration of @ref NEGEMMLowpMatrixMultiplyCore
+     *
+     * @param[in] a      First input tensor  (Matrix A). Data type supported: QASYMM8.
+     * @param[in] b      Second input tensor (Matrix B). Data type supported: same as @p a
+     * @param[in] output Output tensor. Data type supported: S32
+     *
+     * @return an error status
+     */
+    static Error validate(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *output);
 
     // Inherited methods overridden:
     void run() override;
diff --git a/arm_compute/runtime/NEON/functions/NEGEMMLowpOutputStage.h b/arm_compute/runtime/NEON/functions/NEGEMMLowpOutputStage.h
index a3db23aaee..9270d5581f 100644
--- a/arm_compute/runtime/NEON/functions/NEGEMMLowpOutputStage.h
+++ b/arm_compute/runtime/NEON/functions/NEGEMMLowpOutputStage.h
@@ -73,6 +73,19 @@ public:
      *                    Along with @p min, this value can be used to implement "rectified linear unit" activation functions
      */
     void configure(const ITensor *input, const ITensor *bias, ITensor *output, int result_offset, int result_mult_int, int result_shift, int min = 0, int max = 0);
+    /** Static function to check if given info will lead to a valid configuration of @ref NEGEMMLowpQuantizeDownInt32ToUint8Scale
+     *
+     * @param[in] input  Input tensor. It is the output of @ref NEGEMMLowpMatrixMultiplyCore function. Data type supported: S32
+     * @param[in] bias   Biases tensor. Only shared biases supported and it can be a nullptr if the addition of biases is not required.
+     *                   Biases are 1D tensor with dimensions [OFM]. Data type supported: Same as @p input.
+     * @param[in] output Output tensor. Data type supported: QASYMM8
+     * @param[in] min    (Optional) Min value used to saturate down the output result before converting back to QASYMM8
+     * @param[in] max    (Optional) Max value used to saturate up the output result before converting back to QASYMM8,
+     *                   Along with @p min, this value can be used to implement "rectified linear unit" activation functions
+     *
+     * @return an error status
+     */
+    static Error validate(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output, int min = 0, int max = 0);
 };
 }
 #endif /*__ARM_COMPUTE_NEGEMMLOWPOUTPUTSTAGE_H__ */
\ No newline at end of file
diff --git a/src/core/NEON/kernels/NEGEMMInterleave4x4Kernel.cpp b/src/core/NEON/kernels/NEGEMMInterleave4x4Kernel.cpp
index a29b661a00..1f4d9b176e 100644
--- a/src/core/NEON/kernels/NEGEMMInterleave4x4Kernel.cpp
+++ b/src/core/NEON/kernels/NEGEMMInterleave4x4Kernel.cpp
@@ -40,6 +40,48 @@ using namespace arm_compute;
 
 namespace
 {
+Error validate_arguments(const ITensorInfo *input, const ITensorInfo *output)
+{
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::QASYMM8, DataType::U8, DataType::S8,
+                                                         DataType::QS16, DataType::U16, DataType::S16, DataType::U32, DataType::S32,
+                                                         DataType::F16, DataType::F32);
+
+    if(output->total_size() != 0)
+    {
+        TensorShape output_shape = input->tensor_shape();
+        output_shape.set(0, input->dimension(0) * 4);
+        output_shape.set(1, std::ceil(input->dimension(1) / 4.0f));
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output->tensor_shape(), output_shape);
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT(input, output);
+    }
+
+    return Error{};
+}
+
+std::pair<Error, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *output)
+{
+    unsigned int           num_elems_processed_per_iteration_x = (input->element_size() == 1) ? 8 : 4;
+    constexpr unsigned int num_elems_processed_per_iteration_y = 4;
+    bool                   window_changed                      = false;
+
+    // Configure kernel window
+    Window win = calculate_max_window(*input, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y));
+    AccessWindowRectangle input_access(input, 0, 0, num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y);
+    window_changed = window_changed || update_window_and_padding(win, input_access);
+
+    // Configure window in case of configured output
+    if(output->total_size() != 0)
+    {
+        AccessWindowRectangle output_access(output, 0, 0, num_elems_processed_per_iteration_x * num_elems_processed_per_iteration_y, 1, 4.0f, 0.25f);
+        window_changed = window_changed || update_window_and_padding(win, output_access);
+        output_access.set_valid_region(win, input->valid_region());
+    }
+
+    Error err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Error{};
+    return std::make_pair(err, win);
+}
+
 void gemm_interleave_8bit_elements(const ITensor *input, ITensor *output, const Window &window)
 {
     const size_t in_stride = input->info()->strides_in_bytes()[1];
@@ -132,10 +176,7 @@ NEGEMMInterleave4x4Kernel::NEGEMMInterleave4x4Kernel()
 
 void NEGEMMInterleave4x4Kernel::configure(const ITensor *input, ITensor *output)
 {
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::QS8, DataType::QS16, DataType::U8, DataType::S8, DataType::U16, DataType::S16, DataType::U32, DataType::S32,
-                                                  DataType::F16,
-                                                  DataType::F32);
-    ARM_COMPUTE_ERROR_ON_NULLPTR(output);
+    ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
 
     TensorShape output_shape = input->info()->tensor_shape();
     output_shape.set(0, input->info()->dimension(0) * 4);
@@ -144,21 +185,16 @@ void NEGEMMInterleave4x4Kernel::configure(const ITensor *input, ITensor *output)
 
     // Output auto initialization if not yet initialized
     auto_init_if_empty(*output->info(), output_shape, 1, input->info()->data_type(), input->info()->fixed_point_position());
 
-    ARM_COMPUTE_ERROR_ON_MISMATCHING_DIMENSIONS(output->info()->tensor_shape(), output_shape);
-    ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
-    ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT(input, output);
+    // Perform validate step
+    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info()));
 
     _input  = input;
     _output = output;
 
-    unsigned int num_elems_processed_per_iteration_x = 4;
-    constexpr unsigned int num_elems_processed_per_iteration_y = 4;
-
     switch(input->info()->element_size())
     {
         case 1:
-            num_elems_processed_per_iteration_x = 8;
-            _func = &gemm_interleave_8bit_elements;
+            _func = &gemm_interleave_8bit_elements;
             break;
         case 2:
             _func = &gemm_interleave_16bit_elements;
@@ -172,15 +208,17 @@
     }
 
     // Configure kernel window
-    Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y));
-
-    AccessWindowRectangle output_access(output->info(), 0, 0, num_elems_processed_per_iteration_x * num_elems_processed_per_iteration_y, 1, 4.0f, 0.25f);
-    AccessWindowRectangle input_access(input->info(), 0, 0, num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y);
-    update_window_and_padding(win, output_access, input_access);
+    auto win_config = validate_and_configure_window(input->info(), output->info());
+    ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
+    INEKernel::configure(win_config.second);
+}
 
-    output_access.set_valid_region(win, input->info()->valid_region());
+Error NEGEMMInterleave4x4Kernel::validate(const ITensorInfo *input, const ITensorInfo *output)
+{
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output));
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(), output->clone().get()).first);
 
-    INEKernel::configure(win);
+    return Error{};
 }
 
 void NEGEMMInterleave4x4Kernel::run(const Window &window, const ThreadInfo &info)
diff --git a/src/core/NEON/kernels/NEGEMMInterleaveBlockedKernel.cpp b/src/core/NEON/kernels/NEGEMMInterleaveBlockedKernel.cpp
index 2a4a46e76c..e971dcba8e 100644
--- a/src/core/NEON/kernels/NEGEMMInterleaveBlockedKernel.cpp
+++ b/src/core/NEON/kernels/NEGEMMInterleaveBlockedKernel.cpp
@@ -40,6 +40,60 @@ using namespace arm_compute;
 
 namespace
 {
+TensorShape get_output_shape(const ITensorInfo *input, unsigned int block_height)
+{
+    TensorShape output_shape      = input->tensor_shape();
+    const float interleave_by_f32 = block_height;
+    output_shape.set(0, input->dimension(0) * interleave_by_f32);
+    output_shape.set(1, std::ceil(static_cast<float>(input->dimension(1)) / interleave_by_f32));
+    return output_shape;
+}
+
+Error validate_arguments(const ITensorInfo *input, const ITensorInfo *output, unsigned int block_width, unsigned int block_height)
+{
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
+    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(output);
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(block_height < 1, "Block height must be greater than 0");
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(block_width < 1, "Block width must be greater than 0");
+
+    if(output->total_size() != 0)
+    {
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output->tensor_shape(), get_output_shape(input, block_height));
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT(input, output);
+    }
+
+    return Error{};
+}
+
+std::pair<Error, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *output, unsigned int block_width, unsigned int block_height)
+{
+    const unsigned int num_elems_processed_per_iteration_x = block_width;
+    const unsigned int num_elems_processed_per_iteration_y = block_height;
+    bool               window_changed                      = false;
+
+    // Configure kernel window
+    Window      win           = calculate_max_window(*input, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y));
+    const float scaley_factor = 1.f / block_height;
+
+    AccessWindowRectangle input_access(input, 0, 0, num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y);
+    window_changed = window_changed || update_window_and_padding(win, input_access);
+
+    // Configure window in case of configured output
+    if(output->total_size() != 0)
+    {
+        AccessWindowRectangle output_access(output,
+                                            0, 0,
+                                            num_elems_processed_per_iteration_x * num_elems_processed_per_iteration_y,
+                                            1, num_elems_processed_per_iteration_y, scaley_factor);
+        window_changed = window_changed || update_window_and_padding(win, output_access);
+        output_access.set_valid_region(win, input->valid_region());
+    }
+
+    Error err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Error{};
+    return std::make_pair(err, win);
+}
+
 inline void gemm_interleave_blocked_transposed_8bit(const ITensor *input, ITensor *output, const Window &window, unsigned int block_width, unsigned int block_height)
 {
     const size_t in_stride = input->info()->strides_in_bytes()[1];
@@ -122,20 +176,13 @@ NEGEMMInterleaveBlockedKernel::NEGEMMInterleaveBlockedKernel()
 
 void NEGEMMInterleaveBlockedKernel::configure(const ITensor *input, ITensor *output, unsigned int block_height, unsigned int block_width, bool transpose)
 {
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
-    ARM_COMPUTE_ERROR_ON_NULLPTR(output);
-    ARM_COMPUTE_ERROR_ON_MSG(block_height < 1, "Block height must be greater than 0");
-    ARM_COMPUTE_ERROR_ON_MSG(block_width < 1, "Block window must be greater than 0");
+    ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
 
-    TensorShape output_shape = input->info()->tensor_shape();
-    const float interleave_by_f32 = block_height;
-    output_shape.set(0, input->info()->dimension(0) * interleave_by_f32);
-    output_shape.set(1, std::ceil(static_cast<float>(input->info()->dimension(1)) / interleave_by_f32));
     // Output auto initialization if not yet initialized
-    auto_init_if_empty(*output->info(), output_shape, 1, input->info()->data_type(), input->info()->fixed_point_position());
-    ARM_COMPUTE_ERROR_ON_MISMATCHING_DIMENSIONS(output->info()->tensor_shape(), output_shape);
-    ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
-    ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT(input, output);
+    auto_init_if_empty(*output->info(), get_output_shape(input->info(), block_height), 1, input->info()->data_type(), input->info()->fixed_point_position());
+
+    // Perform validation step
+    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), block_width, block_height));
 
     _input        = input;
     _output       = output;
@@ -143,20 +190,19 @@
     _block_width  = block_width;
     _transpose    = transpose;
 
-    const unsigned int num_elems_processed_per_iteration_x = block_width;
-    const unsigned int num_elems_processed_per_iteration_y = block_height;
-
     // Configure kernel window
-    Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y));
-    const float scaley_factor = 1.f / interleave_by_f32;
-
-    AccessWindowRectangle output_access(output->info(), 0, 0, num_elems_processed_per_iteration_x * num_elems_processed_per_iteration_y, 1, num_elems_processed_per_iteration_y, scaley_factor);
-    AccessWindowRectangle input_access(input->info(), 0, 0, num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y);
-    update_window_and_padding(win, output_access, input_access);
+    auto win_config = validate_and_configure_window(input->info(), output->info(), block_width, block_height);
+    ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
+    INEKernel::configure(win_config.second);
+}
 
-    output_access.set_valid_region(win, input->info()->valid_region());
+Error NEGEMMInterleaveBlockedKernel::validate(const ITensorInfo *input, const ITensorInfo *output, unsigned int block_height, unsigned int block_width, bool transpose)
+{
+    ARM_COMPUTE_UNUSED(transpose);
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, block_width, block_height));
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(), output->clone().get(), block_width, block_height).first);
 
-    INEKernel::configure(win);
+    return Error{};
 }
 
 void NEGEMMInterleaveBlockedKernel::run(const Window &window, const ThreadInfo &info)
diff --git a/src/core/NEON/kernels/NEGEMMLowpMatrixMultiplyKernel.cpp b/src/core/NEON/kernels/NEGEMMLowpMatrixMultiplyKernel.cpp
index 5f052f797d..2bc251e91f 100644
--- a/src/core/NEON/kernels/NEGEMMLowpMatrixMultiplyKernel.cpp
+++ b/src/core/NEON/kernels/NEGEMMLowpMatrixMultiplyKernel.cpp
@@ -45,6 +45,48 @@ namespace arm_compute
 class Coordinates;
 } // namespace arm_compute
 
+namespace
+{
+Error validate_arguments(const ITensorInfo *input0, const ITensorInfo *input1, const ITensorInfo *output)
+{
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input0, 1, DataType::QASYMM8, DataType::S8);
+    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input0, input1);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::S32);
+
+    TensorShape in0_shape = input0->tensor_shape();
+    TensorShape in1_shape = input1->tensor_shape();
+    TensorShape out_shape = output->tensor_shape();
+
+    in0_shape.collapse(2);
+    in1_shape.collapse(2);
+    out_shape.collapse(2);
+
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(in0_shape[2] != out_shape[2], "Output tensor must have the same number of batches of input0 tensor");
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(in1_shape[2] != 1 && in0_shape[2] != in1_shape[2], "Input1 tensor must have the same number of batches of input0 or the number of batches must be set to 1");
+
+    return Error{};
+}
+
+std::pair<Error, Window> validate_and_configure_window(ITensorInfo *input0, ITensorInfo *input1, ITensorInfo *output)
+{
+    constexpr unsigned int num_elems_processed_per_iteration_x = 16;
+    constexpr unsigned int num_elems_processed_per_iteration_y = 4;
+
+    Window win = calculate_max_window(*output, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y));
+
+    AccessWindowStatic     in0_access(input0, 0, 0, ceil_to_multiple(input0->dimension(0), 8), input0->dimension(1));
+    AccessWindowHorizontal in1_access(input1, 0, num_elems_processed_per_iteration_x);
+    AccessWindowRectangle  output_access(output, 0, 0, num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y);
+
+    bool window_changed = update_window_and_padding(win, in0_access, in1_access, output_access);
+
+    output_access.set_valid_region(win, ValidRegion(Coordinates(0, 0), output->tensor_shape()));
+
+    Error err = (window_changed) ? 
ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Error{}; + return std::make_pair(err, win); +} +} // namespace + NEGEMMLowpMatrixMultiplyKernel::NEGEMMLowpMatrixMultiplyKernel() : _input0(nullptr), _input1(nullptr), _output(nullptr), _slide_matrix_b(true) { @@ -52,42 +94,29 @@ NEGEMMLowpMatrixMultiplyKernel::NEGEMMLowpMatrixMultiplyKernel() void NEGEMMLowpMatrixMultiplyKernel::configure(const ITensor *input0, const ITensor *input1, ITensor *output) { - ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input0, 1, DataType::QASYMM8, DataType::S8); - ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input0, input1); - ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::S32); + ARM_COMPUTE_ERROR_ON_NULLPTR(input0, input1, output); + ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input0->info(), input1->info(), output->info())); - // Check if matrix B should be slidden or not - // Don't slide matrix B along the z dimension if matrix B has just 2 dimensions and matrix A more than 2 - // This scenario can happen when the the matrix multiplication is used to perform a convolution operation - TensorShape in0_shape = input0->info()->tensor_shape(); TensorShape in1_shape = input1->info()->tensor_shape(); - TensorShape out_shape = output->info()->tensor_shape(); - - in0_shape.collapse(2); in1_shape.collapse(2); - out_shape.collapse(2); - - ARM_COMPUTE_ERROR_ON_MSG(in0_shape[2] != out_shape[2], "Output tensor must have the same number of batches of input0 tensor"); - ARM_COMPUTE_ERROR_ON_MSG(in1_shape[2] != 1 && in0_shape[2] != in1_shape[2], "Input1 tensor must have the same number of batches of input0 or the number of batches must be set to 1"); _input0 = input0; _input1 = input1; _output = output; _slide_matrix_b = in1_shape[2] != 1; - constexpr unsigned int num_elems_processed_per_iteration_x = 16; - constexpr unsigned int num_elems_processed_per_iteration_y = 4; - - Window win = calculate_max_window(*output->info(), Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y)); - - AccessWindowStatic in0_access(input0->info(), 0, 0, ceil_to_multiple(input0->info()->dimension(0), 8), input0->info()->dimension(1)); - AccessWindowHorizontal in1_access(input1->info(), 0, num_elems_processed_per_iteration_x); - AccessWindowRectangle output_access(output->info(), 0, 0, num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y); + // Configure kernel window + auto win_config = validate_and_configure_window(input0->info(), input1->info(), output->info()); + ARM_COMPUTE_ERROR_THROW_ON(win_config.first); + INEKernel::configure(win_config.second); +} - update_window_and_padding(win, in0_access, in1_access, output_access); +Error NEGEMMLowpMatrixMultiplyKernel::validate(const ITensorInfo *input0, const ITensorInfo *input1, const ITensorInfo *output) +{ + ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input0, input1, output)); + ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input0->clone().get(), input1->clone().get(), output->clone().get()).first); - output_access.set_valid_region(win, ValidRegion(Coordinates(0, 0), output->info()->tensor_shape())); - INEKernel::configure(win); + return Error{}; } void inline matrix_multiply_u8(Iterator &ina, Iterator &inb, Iterator &out, int width_b, size_t out_stride, const Window &window) diff --git a/src/core/NEON/kernels/NEGEMMLowpOffsetContributionKernel.cpp b/src/core/NEON/kernels/NEGEMMLowpOffsetContributionKernel.cpp index bd550db54c..62f4014acb 100644 --- 
a/src/core/NEON/kernels/NEGEMMLowpOffsetContributionKernel.cpp +++ b/src/core/NEON/kernels/NEGEMMLowpOffsetContributionKernel.cpp @@ -44,106 +44,131 @@ namespace arm_compute class Coordinates; } // namespace arm_compute -NEGEMMLowpOffsetContributionKernel::NEGEMMLowpOffsetContributionKernel() - : _vector_sum_col(nullptr), _vector_sum_row(nullptr), _mm_result(nullptr), _a_offset(0), _b_offset(0), _k_offset(0), _slide_vector_sum_col(true) +namespace { -} - -void NEGEMMLowpOffsetContributionKernel::configure(ITensor *mm_result, const ITensor *vector_sum_col, const ITensor *vector_sum_row, int32_t k, int32_t a_offset, int32_t b_offset) +Error validate_arguments(const ITensorInfo *mm_result, const ITensorInfo *vector_sum_col, const ITensorInfo *vector_sum_row, + int32_t a_offset, int32_t b_offset) { - ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(mm_result, 1, DataType::S32); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(mm_result, 1, DataType::S32); // If a_offset == 0, vector_sum_col can be a nullptr if(a_offset != 0) { - ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(vector_sum_col, 1, DataType::S32); - ARM_COMPUTE_ERROR_ON(vector_sum_col->info()->dimension(0) != mm_result->info()->dimension(0)); - - TensorShape vector_sum_col_shape = vector_sum_col->info()->tensor_shape(); - vector_sum_col_shape.collapse(1); - - // Check if vector_sum_col_shape should be slidden or not - // Don't slide vector_sum_col_shape along the y dimension if vector_sum_col_shape has just 1 dimension and vector_sum_row_shape more than 1 - // This scenario can happen when the the matrix multiplication is used to perform a convolution operation - _slide_vector_sum_col = vector_sum_col_shape[1] != 1; + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(vector_sum_col, 1, DataType::S32); + ARM_COMPUTE_RETURN_ERROR_ON(vector_sum_col->dimension(0) != mm_result->dimension(0)); } // If b_offset == 0, vector_sum_row can be a nullptr if(b_offset != 0) { - ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(vector_sum_row, 1, DataType::S32); - ARM_COMPUTE_ERROR_ON(vector_sum_row->info()->dimension(0) != mm_result->info()->dimension(1)); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(vector_sum_row, 1, DataType::S32); + ARM_COMPUTE_RETURN_ERROR_ON(vector_sum_row->dimension(0) != mm_result->dimension(1)); - TensorShape output_shape = mm_result->info()->tensor_shape(); - TensorShape vector_sum_row_shape = vector_sum_row->info()->tensor_shape(); + TensorShape output_shape = mm_result->tensor_shape(); + TensorShape vector_sum_row_shape = vector_sum_row->tensor_shape(); vector_sum_row_shape.collapse(1); output_shape.collapse(2); - ARM_COMPUTE_ERROR_ON_MSG(vector_sum_row_shape[1] != output_shape[2], "mm_result tensor must have the same number of batches of output tensor"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(vector_sum_row_shape[1] != output_shape[2], + "mm_result tensor must have the same number of batches of output tensor"); if(a_offset != 0) { - TensorShape vector_sum_col_shape = vector_sum_col->info()->tensor_shape(); + TensorShape vector_sum_col_shape = vector_sum_col->tensor_shape(); vector_sum_col_shape.collapse(1); - ARM_COMPUTE_ERROR_ON_MSG(vector_sum_col_shape[1] != 1 - && vector_sum_col_shape[1] != vector_sum_row_shape[1], - "vector_sum_col tensor must have the same number of batches of vector_sum_row_shape or the number of batches must be set to 1"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(vector_sum_col_shape[1] != 1 && vector_sum_col_shape[1] != vector_sum_row_shape[1], + "vector_sum_col tensor must have the same 
number of batches of vector_sum_row_shape or the number of batches must be set to 1");
         }
     }
 
-    _vector_sum_col = vector_sum_col;
-    _vector_sum_row = vector_sum_row;
-    _mm_result      = mm_result;
-    _a_offset       = a_offset;
-    _b_offset       = b_offset;
-    _k_offset       = a_offset * b_offset * k;
+    return Error{};
+}
 
+std::pair<Error, Window> validate_and_configure_window(ITensorInfo *mm_result, ITensorInfo *vector_sum_col, ITensorInfo *vector_sum_row,
+                                                       int32_t a_offset, int32_t b_offset)
+{
     constexpr unsigned int num_elems_processed_per_iteration = 16;
+    bool                   window_changed                    = false;
 
     // Configure kernel window
-    Window win = calculate_max_window(*mm_result->info(), Steps(num_elems_processed_per_iteration));
+    Window win = calculate_max_window(*mm_result, Steps(num_elems_processed_per_iteration));
 
-    AccessWindowHorizontal mm_result_access(mm_result->info(), 0, num_elems_processed_per_iteration);
+    AccessWindowHorizontal mm_result_access(mm_result, 0, num_elems_processed_per_iteration);
+    window_changed = window_changed || update_window_and_padding(win,
+                                                                 mm_result_access);
 
-    // Accordingly with a_offset and b_offset, we can have 4 cases:
-    // a_offset != 0 && b_offset != 0
-    // a_offset = 0 && b_offset != 0
-    // a_offset != 0 && b_offset = 0
-    // a_offset = 0 && b_offset = 0
-    if(a_offset != 0 && b_offset != 0)
+    if(a_offset != 0)
     {
-        AccessWindowStatic     vector_sum_row_access(vector_sum_row->info(), 0, 0, vector_sum_row->info()->dimension(0), 0);
-        AccessWindowHorizontal vector_sum_col_access(vector_sum_col->info(), 0, num_elems_processed_per_iteration);
-
-        update_window_and_padding(win,
-                                  vector_sum_col_access,
-                                  vector_sum_row_access,
-                                  mm_result_access);
+        AccessWindowHorizontal vector_sum_col_access(vector_sum_col, 0, num_elems_processed_per_iteration);
+        window_changed = window_changed || update_window_and_padding(win,
+                                                                     vector_sum_col_access);
     }
-    else if(a_offset == 0 && b_offset != 0)
+    if(b_offset != 0)
     {
-        AccessWindowStatic vector_sum_row_access(vector_sum_row->info(), 0, 0, vector_sum_row->info()->dimension(0), 0);
-
-        update_window_and_padding(win,
-                                  vector_sum_row_access,
-                                  mm_result_access);
+        AccessWindowStatic vector_sum_row_access(vector_sum_row, 0, 0, vector_sum_row->dimension(0), 0); // NOLINT
+        window_changed = window_changed || update_window_and_padding(win,
+                                                                     vector_sum_row_access);
     }
-    else if(a_offset != 0 && b_offset == 0)
-    {
-        AccessWindowHorizontal vector_sum_col_access(vector_sum_col->info(), 0, num_elems_processed_per_iteration);
-        update_window_and_padding(win,
-                                  vector_sum_col_access,
-                                  mm_result_access);
-    }
-    else
+    Error err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Error{};
+    return std::make_pair(err, win);
+}
+} // namespace
+
+NEGEMMLowpOffsetContributionKernel::NEGEMMLowpOffsetContributionKernel()
+    : _vector_sum_col(nullptr), _vector_sum_row(nullptr), _mm_result(nullptr), _a_offset(0), _b_offset(0), _k_offset(0), _slide_vector_sum_col(true)
+{
+}
+
+void NEGEMMLowpOffsetContributionKernel::configure(ITensor *mm_result, const ITensor *vector_sum_col, const ITensor *vector_sum_row, int32_t k, int32_t a_offset, int32_t b_offset)
+{
+    // Perform validate step
+    ARM_COMPUTE_ERROR_ON_NULLPTR(mm_result);
+    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(mm_result->info(),
+                                                  vector_sum_col != nullptr ? vector_sum_col->info() : nullptr, // NOLINT
+                                                  vector_sum_row != nullptr ? vector_sum_row->info() : nullptr, // NOLINT
+                                                  a_offset, b_offset)); // NOLINT
+
+    _vector_sum_col = vector_sum_col;
+    _vector_sum_row = vector_sum_row;
+    _mm_result      = mm_result;
+    _a_offset       = a_offset;
+    _b_offset       = b_offset;
+    _k_offset       = a_offset * b_offset * k;
+
+    // If a_offset == 0, vector_sum_col can be a nullptr
+    if(a_offset != 0)
     {
-        update_window_and_padding(win,
-                                  mm_result_access);
+        TensorShape vector_sum_col_shape = vector_sum_col->info()->tensor_shape(); // NOLINT
+        vector_sum_col_shape.collapse(1);
+
+        // Check if vector_sum_col_shape should be slid or not
+        // Don't slide vector_sum_col_shape along the y dimension if vector_sum_col_shape has just 1 dimension and vector_sum_row_shape more than 1
+        // This scenario can happen when the matrix multiplication is used to perform a convolution operation
+        _slide_vector_sum_col = vector_sum_col_shape[1] != 1;
     }
 
-    INEKernel::configure(win);
+    // Configure kernel window
+    auto win_config = validate_and_configure_window(mm_result->info(),
+                                                    vector_sum_col != nullptr ? vector_sum_col->info() : nullptr, // NOLINT
+                                                    vector_sum_row != nullptr ? vector_sum_row->info() : nullptr, // NOLINT
+                                                    a_offset, b_offset);
+    ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
+    INEKernel::configure(win_config.second);
+}
+
+Error NEGEMMLowpOffsetContributionKernel::validate(const ITensorInfo *mm_result, const ITensorInfo *vector_sum_col, const ITensorInfo *vector_sum_row,
+                                                   int32_t a_offset, int32_t b_offset)
+{
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(mm_result, vector_sum_col, vector_sum_row, a_offset, b_offset));
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(mm_result->clone().get(),
+                                                              vector_sum_col != nullptr ? vector_sum_col->clone().get() : nullptr,
+                                                              vector_sum_row != nullptr ? vector_sum_row->clone().get() : nullptr,
+                                                              a_offset, b_offset)
+                                .first); // NOLINT
+
+    return Error{};
 }
 
 void NEGEMMLowpOffsetContributionKernel::run(const Window &window, const ThreadInfo &info)
diff --git a/src/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ToUint8ScaleKernel.cpp b/src/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ToUint8ScaleKernel.cpp
index 26aaa2a9d5..670b11fe67 100644
--- a/src/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ToUint8ScaleKernel.cpp
+++ b/src/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ToUint8ScaleKernel.cpp
@@ -40,6 +40,50 @@ using namespace arm_compute;
 
 namespace
 {
+Error validate_arguments(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output, int min, int max)
+{
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::S32);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::QASYMM8);
+    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output);
+    ARM_COMPUTE_RETURN_ERROR_ON(max > 255);
+    ARM_COMPUTE_RETURN_ERROR_ON(min < 0 || min > max);
+
+    // Check biases if exist
+    if(bias != nullptr)
+    {
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, bias);
+        ARM_COMPUTE_RETURN_ERROR_ON(bias->num_dimensions() > 1);
+        ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(0) != bias->dimension(0));
+    }
+    return Error{};
+}
+
+std::pair<Error, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *bias, ITensorInfo *output)
+{
+    constexpr unsigned int num_elems_processed_per_iteration = 16;
+
+    // Configure kernel window
+    Window win = calculate_max_window(*output, Steps(num_elems_processed_per_iteration));
+
+    AccessWindowHorizontal input_access(input, 0, num_elems_processed_per_iteration);
+    AccessWindowHorizontal output_result_access(output, 0, num_elems_processed_per_iteration);
+
+    bool window_changed = update_window_and_padding(win,
+                                                    input_access,
+                                                    output_result_access);
+
+    if(bias != nullptr)
+    {
+        AccessWindowStatic bias_access(bias, 0, 0, ceil_to_multiple(bias->dimension(0), num_elems_processed_per_iteration), bias->tensor_shape()[1]);
+        window_changed = window_changed || update_window_and_padding(win, bias_access);
+    }
+
+    output_result_access.set_valid_region(win, ValidRegion(Coordinates(0, 0), output->tensor_shape()));
+
+    Error err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Error{};
+    return std::make_pair(err, win);
+}
+
 inline void scale_input(int32x4x4_t &in_s32, int32x4_t result_offset_s32, int32_t result_mult_int)
 {
     // Add the offset terms to GEMM's result
@@ -185,17 +229,13 @@ NEGEMMLowpQuantizeDownInt32ToUint8ScaleKernel::NEGEMMLowpQuantizeDownInt32ToUint
 
 void NEGEMMLowpQuantizeDownInt32ToUint8ScaleKernel::configure(const ITensor *input, const ITensor *bias, ITensor *output, int result_offset, int result_mult_int, int result_shift, int min, int max)
 {
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::S32);
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::QASYMM8);
-    ARM_COMPUTE_ERROR_ON(max > 255);
-    ARM_COMPUTE_ERROR_ON(min < 0 || min > max);
-
-    if(bias != nullptr)
-    {
-        ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, bias);
-        ARM_COMPUTE_ERROR_ON(bias->info()->num_dimensions() > 1);
-        ARM_COMPUTE_ERROR_ON(input->info()->dimension(0) != bias->info()->dimension(0));
-    }
+    // Perform validate step
+    ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
+    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(),
+                                                  (bias != nullptr) ? bias->info() : nullptr,
+                                                  output->info(),
+                                                  min,
+                                                  max));
 
     _input = input;
     _bias  = bias;
@@ -206,34 +246,25 @@ void NEGEMMLowpQuantizeDownInt32ToUint8ScaleKernel::configure(const ITensor *inp
     _min = min;
     _max = max;
 
-    constexpr unsigned int num_elems_processed_per_iteration = 16;
-
     // Configure kernel window
-    Window win = calculate_max_window(*output->info(), Steps(num_elems_processed_per_iteration));
-
-    AccessWindowHorizontal input_access(input->info(), 0, num_elems_processed_per_iteration);
-    AccessWindowHorizontal output_result_access(output->info(), 0, num_elems_processed_per_iteration);
-
-    update_window_and_padding(win,
-                              input_access,
-                              output_result_access);
-
-    if(bias != nullptr)
-    {
-        AccessWindowStatic bias_access(bias->info(), 0, 0, ceil_to_multiple(bias->info()->dimension(0), num_elems_processed_per_iteration), bias->info()->tensor_shape()[1]);
-
-        update_window_and_padding(win,
-                                  bias_access);
-    }
-
-    output_result_access.set_valid_region(win, ValidRegion(Coordinates(0, 0), output->info()->tensor_shape()));
-
-    INEKernel::configure(win);
+    auto win_config = validate_and_configure_window(input->info(), (bias != nullptr) ? bias->info() : nullptr, output->info());
+    ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
+    INEKernel::configure(win_config.second);
 
+    // Check if we need to clamp the result using min and max
     const bool is_bounded_relu = ((min != max) && !(min == 0 && max == 255));
+    _func                      = is_bounded_relu ? &NEGEMMLowpQuantizeDownInt32ToUint8ScaleKernel::run<true> : &NEGEMMLowpQuantizeDownInt32ToUint8ScaleKernel::run<false>;
+}
 
-    // Check if we need to clamp the result using min and max
-    _func = is_bounded_relu ? &NEGEMMLowpQuantizeDownInt32ToUint8ScaleKernel::run<true> : &NEGEMMLowpQuantizeDownInt32ToUint8ScaleKernel::run<false>;
+Error NEGEMMLowpQuantizeDownInt32ToUint8ScaleKernel::validate(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output, int min, int max)
+{
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, bias, output, min, max));
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(),
+                                                              (bias != nullptr) ? bias->clone().get() : nullptr,
+                                                              output->clone().get())
+                                .first);
+
+    return Error{};
 }
 
 void NEGEMMLowpQuantizeDownInt32ToUint8ScaleKernel::run(const Window &window, const ThreadInfo &info)
diff --git a/src/core/NEON/kernels/NEGEMMLowpReductionKernel.cpp b/src/core/NEON/kernels/NEGEMMLowpReductionKernel.cpp
index 81d9b5bb81..a8395a15cb 100644
--- a/src/core/NEON/kernels/NEGEMMLowpReductionKernel.cpp
+++ b/src/core/NEON/kernels/NEGEMMLowpReductionKernel.cpp
@@ -44,6 +44,59 @@ namespace arm_compute
 class Coordinates;
 } // namespace arm_compute
 
+namespace
+{
+Error validate_arguments_matrix_a_reduction(const ITensorInfo *input, const ITensorInfo *output)
+{
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::S32);
+
+    return Error{};
+}
+std::pair<Error, Window> validate_and_configure_window_matrix_a_reduction(ITensorInfo *input, ITensorInfo *output, bool is_reshaped)
+{
+    const unsigned int num_elems_processed_per_iteration = is_reshaped ? 4 : 1;
+
+    Window win = calculate_max_window(*output, Steps(num_elems_processed_per_iteration));
+
+    AccessWindowStatic     input_access(input, 0, 0, ceil_to_multiple(input->dimension(0), 16), input->dimension(1));
+    AccessWindowHorizontal output_access(output, 0, num_elems_processed_per_iteration);
+
+    bool window_changed = update_window_and_padding(win, input_access, output_access);
+
+    output_access.set_valid_region(win, ValidRegion(Coordinates(0, 0), output->tensor_shape()));
+
+    Error err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Error{};
+    return std::make_pair(err, win);
+}
+
+Error validate_arguments_matrix_b_reduction(const ITensorInfo *input, const ITensorInfo *output)
+{
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::S32);
+
+    return Error{};
+}
+
+std::pair<Error, Window> validate_and_configure_window_matrix_b_reduction(ITensorInfo *input, ITensorInfo *output)
+{
+    constexpr unsigned int num_elems_processed_per_iteration = 16;
+
+    // Configure kernel window
+    Window win = calculate_max_window(*output, Steps(num_elems_processed_per_iteration));
+
+    AccessWindowStatic     input_access(input, 0, 0, ceil_to_multiple(input->dimension(0), 16), input->dimension(1));
+    AccessWindowHorizontal output_access(output, 0, num_elems_processed_per_iteration);
+
+    bool window_changed = update_window_and_padding(win, input_access, output_access);
+
+    output_access.set_valid_region(win, ValidRegion(Coordinates(0, 0), output->tensor_shape()));
+
+    Error err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Error{};
+    return std::make_pair(err, win);
+}
+} // namespace
+
 INEGEMMLowpReductionKernel::INEGEMMLowpReductionKernel()
     : _input(), _output(), _k(0), _is_reshaped(false)
 {
@@ -51,29 +104,28 @@ INEGEMMLowpReductionKernel::INEGEMMLowpReductionKernel()
 
 void NEGEMMLowpMatrixAReductionKernel::configure(const ITensor *mtx_a, ITensor *vector_sum_row, int32_t num_mtx_a_cols, bool is_interleaved4x4)
 {
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(mtx_a, 1, DataType::QASYMM8);
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(vector_sum_row, 1, DataType::S32);
+    // Perform validate step
+    ARM_COMPUTE_ERROR_ON_NULLPTR(mtx_a, vector_sum_row);
+    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments_matrix_a_reduction(mtx_a->info(), vector_sum_row->info()));
 
     _input       = mtx_a;
     _output      = vector_sum_row;
     _k           = num_mtx_a_cols;
     _is_reshaped = is_interleaved4x4;
 
-    const unsigned int num_elems_processed_per_iteration = _is_reshaped ? 
4 : 1; - // Configure kernel window - Window win = calculate_max_window(*_output->info(), Steps(num_elems_processed_per_iteration)); - - AccessWindowStatic input_access(_input->info(), 0, 0, ceil_to_multiple(_input->info()->dimension(0), 16), _input->info()->dimension(1)); - AccessWindowHorizontal output_access(_output->info(), 0, num_elems_processed_per_iteration); - - update_window_and_padding(win, - input_access, - output_access); + auto win_config = validate_and_configure_window_matrix_a_reduction(_input->info(), _output->info(), _is_reshaped); + ARM_COMPUTE_ERROR_THROW_ON(win_config.first); + INEKernel::configure(win_config.second); +} - output_access.set_valid_region(win, ValidRegion(Coordinates(0, 0), _output->info()->tensor_shape())); +Error NEGEMMLowpMatrixAReductionKernel::validate(const ITensorInfo *mtx_a, const ITensorInfo *vector_sum_row, int32_t num_mtx_a_cols, bool is_interleaved4x4) +{ + ARM_COMPUTE_UNUSED(num_mtx_a_cols); + ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments_matrix_a_reduction(mtx_a, vector_sum_row)); + ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window_matrix_a_reduction(mtx_a->clone().get(), vector_sum_row->clone().get(), is_interleaved4x4).first); - INEKernel::configure(win); + return Error{}; } void NEGEMMLowpMatrixAReductionKernel::run(const Window &window, const ThreadInfo &info) @@ -200,29 +252,28 @@ void NEGEMMLowpMatrixAReductionKernel::run(const Window &window, const ThreadInf void NEGEMMLowpMatrixBReductionKernel::configure(const ITensor *mtx_b, ITensor *vector_sum_col, int32_t num_mtx_b_rows, bool is_transposed1xW) { - ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(mtx_b, 1, DataType::QASYMM8); - ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(vector_sum_col, 1, DataType::S32); + ARM_COMPUTE_ERROR_ON_NULLPTR(mtx_b, vector_sum_col); + ARM_COMPUTE_ERROR_THROW_ON(validate_arguments_matrix_b_reduction(mtx_b->info(), vector_sum_col->info())); _input = mtx_b; _output = vector_sum_col; _k = num_mtx_b_rows; _is_reshaped = is_transposed1xW; - constexpr unsigned int num_elems_processed_per_iteration = 16; - // Configure kernel window - Window win = calculate_max_window(*vector_sum_col->info(), Steps(num_elems_processed_per_iteration)); - - AccessWindowStatic input_access(_input->info(), 0, 0, ceil_to_multiple(_input->info()->dimension(0), 16), _input->info()->dimension(1)); - AccessWindowHorizontal output_access(_output->info(), 0, num_elems_processed_per_iteration); - - update_window_and_padding(win, - input_access, - output_access); + auto win_config = validate_and_configure_window_matrix_b_reduction(_input->info(), _output->info()); + ARM_COMPUTE_ERROR_THROW_ON(win_config.first); + INEKernel::configure(win_config.second); +} - output_access.set_valid_region(win, ValidRegion(Coordinates(0, 0), _output->info()->tensor_shape())); +Error NEGEMMLowpMatrixBReductionKernel::validate(const ITensorInfo *mtx_b, const ITensorInfo *vector_sum_col, int32_t num_mtx_b_rows, bool is_transposed1xW) +{ + ARM_COMPUTE_UNUSED(num_mtx_b_rows); + ARM_COMPUTE_UNUSED(is_transposed1xW); + ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments_matrix_b_reduction(mtx_b, vector_sum_col)); + ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window_matrix_b_reduction(mtx_b->clone().get(), vector_sum_col->clone().get()).first); - INEKernel::configure(win); + return Error{}; } void NEGEMMLowpMatrixBReductionKernel::run(const Window &window, const ThreadInfo &info) diff --git a/src/core/NEON/kernels/NEGEMMTranspose1xWKernel.cpp b/src/core/NEON/kernels/NEGEMMTranspose1xWKernel.cpp index 
diff --git a/src/core/NEON/kernels/NEGEMMTranspose1xWKernel.cpp b/src/core/NEON/kernels/NEGEMMTranspose1xWKernel.cpp
index 7f83144e12..7d79c664d1 100644
--- a/src/core/NEON/kernels/NEGEMMTranspose1xWKernel.cpp
+++ b/src/core/NEON/kernels/NEGEMMTranspose1xWKernel.cpp
@@ -41,45 +41,87 @@
 
 using namespace arm_compute;
 
-void NEGEMMTranspose1xWKernel::configure(const ITensor *input, ITensor *output)
+namespace
+{
+TensorShape get_output_shape(const ITensorInfo *input)
 {
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::QS8, DataType::QS16, DataType::U8, DataType::S8, DataType::U16, DataType::S16, DataType::U32, DataType::S32,
-                                                  DataType::F16,
-                                                  DataType::F32);
-    ARM_COMPUTE_ERROR_ON_NULLPTR(output);
+    TensorShape  output_shape{ input->tensor_shape() };
+    const size_t transpose_w = 16 / input->element_size();
+    output_shape.set(0, input->dimension(1) * transpose_w);
+    output_shape.set(1, static_cast<size_t>(std::ceil((input->dimension(0) / static_cast<float>(transpose_w)))));
+    return output_shape;
+}
 
-    TensorShape  output_shape{ input->info()->tensor_shape() };
-    const size_t transpose_w = 16 / input->info()->element_size();
-    output_shape.set(0, input->info()->dimension(1) * transpose_w);
-    output_shape.set(1, static_cast<size_t>(std::ceil((input->info()->dimension(0) / static_cast<float>(transpose_w)))));
+Error validate_arguments(const ITensorInfo *input, const ITensorInfo *output)
+{
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::QASYMM8, DataType::U8, DataType::S8,
+                                                         DataType::QS16, DataType::U16, DataType::S16, DataType::U32, DataType::S32,
+                                                         DataType::F16, DataType::F32);
+    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT(input, output);
 
-    // Output tensor auto inizialitation if not yet initialized
-    auto_init_if_empty(*output->info(), output_shape, 1, input->info()->data_type(), input->info()->fixed_point_position());
+    if(output->total_size() != 0)
+    {
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output->tensor_shape(), get_output_shape(input));
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT(input, output);
+    }
 
-    ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
-    ARM_COMPUTE_ERROR_ON_MISMATCHING_DIMENSIONS(output->info()->tensor_shape(), output_shape);
-    ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT(input, output);
+    return Error{};
+}
 
-    const unsigned int num_elems_processed_per_iteration = 16 / input->info()->element_size();
+std::pair<Error, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *output)
+{
+    const unsigned int num_elems_processed_per_iteration = 16 / input->element_size();
     const int          scale_x                           = num_elems_processed_per_iteration;
-
-    _input  = input;
-    _output = output;
+    bool               window_changed                    = false;
 
     // Configure kernel window
-    Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration));
+    Window win = calculate_max_window(*input, Steps(num_elems_processed_per_iteration));
 
     ARM_COMPUTE_ERROR_ON_MSG((win.x().end() / scale_x) == 0, "Transposed shape would be 0 in the second dimension");
 
-    AccessWindowTranspose output_access(output->info(), 0, 0, num_elems_processed_per_iteration, 1, scale_x, 1.f / scale_x);
+    AccessWindowHorizontal input_access(input, 0, num_elems_processed_per_iteration);
+    window_changed = window_changed || update_window_and_padding(win, input_access);
 
-    update_window_and_padding(win,
-                              AccessWindowHorizontal(input->info(), 0, num_elems_processed_per_iteration),
-                              output_access);
+    // Configure window in case of configured output
+    if(output->total_size() != 0)
+    {
+        AccessWindowTranspose output_access(output, 0, 0, num_elems_processed_per_iteration, 1, scale_x, 1.f / scale_x);
+        window_changed = window_changed || update_window_and_padding(win, output_access);
+        output_access.set_valid_region(win, ValidRegion(Coordinates(0, 0), input->tensor_shape()));
+    }
 
-    output_access.set_valid_region(win, ValidRegion(Coordinates(0, 0), input->info()->tensor_shape()));
+    Error err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Error{};
+    return std::make_pair(err, win);
+}
+} // namespace
+
+void NEGEMMTranspose1xWKernel::configure(const ITensor *input, ITensor *output)
+{
+    ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
+
+    // Output tensor auto initialization if not yet initialized
+    auto_init_if_empty(*output->info(), get_output_shape(input->info()), 1, input->info()->data_type(), input->info()->fixed_point_position());
+
+    // Perform validate step
+    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info()));
+
+    _input  = input;
+    _output = output;
+
+    // Configure kernel window
+    auto win_config = validate_and_configure_window(input->info(), output->info());
+    ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
+    INEKernel::configure(win_config.second);
+}
+
+Error NEGEMMTranspose1xWKernel::validate(const ITensorInfo *input, const ITensorInfo *output)
+{
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output));
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(), output->clone().get()).first);
 
-    INEKernel::configure(win);
+    return Error{};
 }
 
 void NEGEMMTranspose1xWKernel::run(const Window &window, const ThreadInfo &info)
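To make the shape rule in get_output_shape() concrete (made-up sizes): transpose_w = 16 / element_size, so for U8 data transpose_w is 16 and a (width, height) = (8, 4) input maps to (4 * 16, ceil(8 / 16.0)) = (64, 1):

    TensorInfo input(TensorShape(8U, 4U), 1, DataType::U8);
    TensorInfo output(TensorShape(64U, 1U), 1, DataType::U8); // equals get_output_shape(input)
    Error err = NEGEMMTranspose1xWKernel::validate(&input, &output); // expected to pass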
diff --git a/src/core/NEON/kernels/arm64/NEGEMMLowpAArch64V8P4Kernel.cpp b/src/core/NEON/kernels/arm64/NEGEMMLowpAArch64V8P4Kernel.cpp
index ae711af89a..a025cac82d 100644
--- a/src/core/NEON/kernels/arm64/NEGEMMLowpAArch64V8P4Kernel.cpp
+++ b/src/core/NEON/kernels/arm64/NEGEMMLowpAArch64V8P4Kernel.cpp
@@ -48,13 +48,47 @@ namespace arm_compute
 // Enable only if compiled for AArch64-V8.2-A targets
 #ifdef ARM_COMPUTE_AARCH64_V8_2
 
+namespace
+{
+using namespace arm_compute;
+
+Error validate_arguments(const ITensorInfo *input0, const ITensorInfo *input1, const ITensorInfo *output)
+{
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input0, 1, DataType::QASYMM8);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::S32);
+    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input1);
+    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input0, input1);
+
+    return Error{};
+}
+
+std::pair<Error, Window> validate_and_configure_window(ITensorInfo *input0, ITensorInfo *input1, ITensorInfo *output)
+{
+    // Configure kernel window
+    Window win = calculate_max_window(*output);
+
+    AccessWindowRectangle output_access(output, 0, 0, 12, 8);
+
+    const int input0_access_end = ceil_to_multiple(input0->tensor_shape().x(), 8);
+    const int input1_access_end = ceil_to_multiple(input1->tensor_shape().x(), 12);
+
+    bool window_changed = update_window_and_padding(win,
+                                                    AccessWindowStatic(input0, 0, 0, input0_access_end, input0->tensor_shape().y()),
+                                                    AccessWindowStatic(input1, 0, 0, input1_access_end, input1->tensor_shape().y()),
+                                                    output_access);
+
+    Error err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Error{};
+    return std::make_pair(err, win);
+}
+} // namespace
+
 namespace arm_compute
 {
 void NEGEMMLowpAArch64V8P4Kernel::internal_configure(const ITensor *input0, const ITensor *input1, ITensor *output, ITensor *workspace, float alpha, float beta, bool transform_0, bool transform_1)
 {
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input0, 1, DataType::QASYMM8);
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::S32);
-    ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input0, input1);
+    // Perform validate step
+    ARM_COMPUTE_ERROR_ON_NULLPTR(input0, input1, output);
+    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input0->info(), input1->info(), output->info()));
 
     _input0 = input0;
     _input1 = input1;
@@ -66,19 +100,17 @@ void NEGEMMLowpAArch64V8P4Kernel::internal_configure(const ITensor *input0, cons
     _transform_1 = transform_1;
 
     // Configure kernel window
-    Window win = calculate_max_window(*output->info());
-
-    AccessWindowRectangle output_access(output->info(), 0, 0, 12, 8);
-
-    const int input0_access_end = ceil_to_multiple(input0->info()->tensor_shape().x(), 8);
-    const int input1_access_end = ceil_to_multiple(input1->info()->tensor_shape().x(), 12);
+    auto win_config = validate_and_configure_window(input0->info(), input1->info(), output->info());
+    ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
+    INEKernel::configure(win_config.second);
+}
 
-    update_window_and_padding(win,
-                              AccessWindowStatic(input0->info(), 0, 0, input0_access_end, input0->info()->tensor_shape().y()),
-                              AccessWindowStatic(input1->info(), 0, 0, input1_access_end, input1->info()->tensor_shape().y()),
-                              output_access);
+Error NEGEMMLowpAArch64V8P4Kernel::validate(const ITensorInfo *input0, const ITensorInfo *input1, const ITensorInfo *output)
+{
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input0, input1, output));
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input0->clone().get(), input1->clone().get(), output->clone().get()).first);
 
-    INEKernel::configure(win);
+    return Error{};
 }
 
 void NEGEMMLowpAArch64V8P4Kernel::run(const Window &window, const ThreadInfo &info)
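The window above walks the output in 12x8 blocks and pads each input row up to the block multiples, which is exactly what the cloned-info dry run in validate() has to account for. Worked numbers (illustrative row width of 100 elements, not from the patch):

    const int input0_access_end = ceil_to_multiple(100, 8);  // = 104: matrix A rows are read in blocks of 8
    const int input1_access_end = ceil_to_multiple(100, 12); // = 108: matrix B rows are read in blocks of 12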
diff --git a/src/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.cpp b/src/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.cpp
index 0fff6c9ca1..92c911c370 100644
--- a/src/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.cpp
+++ b/src/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.cpp
@@ -54,12 +54,8 @@ NEGEMMLowpMatrixMultiplyCore::NEGEMMLowpMatrixMultiplyCore(std::shared_ptr<IMem
 }
 
 void NEGEMMLowpMatrixMultiplyCore::configure(const ITensor *a, const ITensor *b, ITensor *output)
 {
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(a, 1, DataType::QASYMM8);
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::S32);
-    ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(a, b);
-    ARM_COMPUTE_ERROR_ON_MSG((a)->info()->dimension(0) != (b)->info()->dimension(1), "The product AB is defined only if the number of columns in A is equal to the number of rows in B");
-    ARM_COMPUTE_ERROR_ON_MSG((a)->info()->dimension(1) != (output)->info()->dimension(1), "The output matrix must have the same number of rows as the matrix A");
-    ARM_COMPUTE_ERROR_ON_MSG((b)->info()->dimension(0) != (output)->info()->dimension(0), "The output matrix must have the same number of columns as the matrix B");
+    ARM_COMPUTE_ERROR_ON_NULLPTR(a, b, output);
+    ARM_COMPUTE_ERROR_THROW_ON(NEGEMMLowpMatrixMultiplyCore::validate(a->info(), b->info(), output->info()));
 
     bool dot_product_path = false;
@@ -185,6 +181,86 @@ void NEGEMMLowpMatrixMultiplyCore::configure(const ITensor *a, const ITensor *b,
     }
 }
 
+Error NEGEMMLowpMatrixMultiplyCore::validate(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *output)
+{
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(a, 1, DataType::QASYMM8);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::S32);
+    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(a, b);
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG((a)->dimension(0) != (b)->dimension(1),
+                                    "The product AB is defined only if the number of columns in A is equal to the number of rows in B");
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG((a)->dimension(1) != (output)->dimension(1),
+                                    "The output matrix must have the same number of rows as the matrix A");
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG((b)->dimension(0) != (output)->dimension(0),
+                                    "The output matrix must have the same number of columns as the matrix B");
+
+    int32_t a_offset = a->quantization_info().offset;
+    int32_t b_offset = b->quantization_info().offset;
+
+#ifdef ARM_COMPUTE_AARCH64_V8_2
+    // Check for DOT product instruction
+    const struct CPUInfo ci              = NEScheduler::get().cpu_info();
+    const int            cpu_has_dotprod = static_cast<int>(ci.CPU) & static_cast<int>(CPUTarget::DOT);
+
+    if(cpu_has_dotprod != 0)
+    {
+        // Validate matrix multiply kernel
+        ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMLowpAArch64V8P4Kernel::validate(a, b, output));
+    }
+    else
+#endif /* ARM_COMPUTE_AARCH64_V8_2 */
+    {
+        // The interleaved output matrix will have the following shape: [ a_height * 4, ceil(a_width / 4.0f) ]
+        TensorShape shape_tmp_a = a->tensor_shape();
+        shape_tmp_a.set(0, a->dimension(0) * 4);
+        shape_tmp_a.set(1, std::ceil(a->dimension(1) / 4.f));
+
+        // The transpose1xW output matrix will have the following shape: [ b_height * 16, ceil(b_width / 16.0f) ]
+        TensorShape shape_tmp_b = b->tensor_shape();
+        shape_tmp_b.set(0, b->dimension(1) * 16);
+        shape_tmp_b.set(1, std::ceil(b->dimension(0) / 16.f));
+
+        TensorInfo info_a(shape_tmp_a, 1, a->data_type());
+        TensorInfo info_b(shape_tmp_b, 1, b->data_type());
+
+        ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMInterleave4x4Kernel::validate(a, &info_a));
+        ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMTranspose1xWKernel::validate(b, &info_b));
+        ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMLowpMatrixMultiplyKernel::validate(&info_a, &info_b, output));
+    }
+
+    TensorInfo info_vector_sum_col, info_vector_sum_row;
+
+    // Validate matrix B reduction kernel only if _a_offset is not equal to 0
+    if(a_offset != 0)
+    {
+        TensorShape shape_vector_sum_col = b->tensor_shape();
+        shape_vector_sum_col.remove_dimension(1);
+        info_vector_sum_col = TensorInfo(shape_vector_sum_col, 1, DataType::S32);
+
+        // Configure Matrix B reduction kernel
+        ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMLowpMatrixBReductionKernel::validate(b, &info_vector_sum_col, a->dimension(0), false));
+    }
+
+    // Validate Matrix A reduction kernel only if _b_offset is not equal to 0
+    if(b_offset != 0)
+    {
+        TensorShape shape_vector_sum_row = a->tensor_shape();
+        shape_vector_sum_row.set(Window::DimX, a->dimension(1));
+        shape_vector_sum_row.remove_dimension(1);
+        info_vector_sum_row = TensorInfo(shape_vector_sum_row, 1, DataType::S32);
+
+        // Configure matrix A reduction kernel
+        ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMLowpMatrixAReductionKernel::validate(a, &info_vector_sum_row, a->dimension(0), false));
+    }
+
+    // Validate offset contribution kernel
+    ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMLowpOffsetContributionKernel::validate(output,
+                                                                             a_offset == 0 ? nullptr : &info_vector_sum_col,
+                                                                             b_offset == 0 ? nullptr : &info_vector_sum_row,
+                                                                             a_offset, b_offset));
+
+    return Error{};
+}
+
 void NEGEMMLowpMatrixMultiplyCore::run()
 {
     _memory_group.acquire();
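Worked example of the temporary shapes derived on the non-DOT path, using the sizes from the test below: for a = (21, 13) and b = (33, 21), the interleaved A info becomes (21 * 4, ceil(13 / 4.0)) = (84, 4) and the transpose1xW B info becomes (21 * 16, ceil(33 / 16.0)) = (336, 3):

    // Illustrative check of the shape arithmetic in validate() above.
    const TensorShape shape_tmp_a(84U, 4U);  // a = (21, 13): 21 * 4 = 84,   ceil(13 / 4.f)  = 4
    const TensorShape shape_tmp_b(336U, 3U); // b = (33, 21): 21 * 16 = 336, ceil(33 / 16.f) = 3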
diff --git a/src/runtime/NEON/functions/NEGEMMLowpOutputStage.cpp b/src/runtime/NEON/functions/NEGEMMLowpOutputStage.cpp
index 66cdf58634..ed51291e95 100644
--- a/src/runtime/NEON/functions/NEGEMMLowpOutputStage.cpp
+++ b/src/runtime/NEON/functions/NEGEMMLowpOutputStage.cpp
@@ -34,4 +34,9 @@ void NEGEMMLowpQuantizeDownInt32ToUint8Scale::configure(const ITensor *input, co
     auto k = arm_compute::support::cpp14::make_unique<NEGEMMLowpQuantizeDownInt32ToUint8ScaleKernel>();
     k->configure(input, bias, output, result_offset, result_mult_int, result_shift, min, max);
     _kernel = std::move(k);
+}
+
+Error NEGEMMLowpQuantizeDownInt32ToUint8Scale::validate(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output, int min, int max)
+{
+    return NEGEMMLowpQuantizeDownInt32ToUint8ScaleKernel::validate(input, bias, output, min, max);
 }
\ No newline at end of file
diff --git a/tests/validation/NEON/GEMMLowp.cpp b/tests/validation/NEON/GEMMLowp.cpp
index 078096a0dd..1418578a51 100644
--- a/tests/validation/NEON/GEMMLowp.cpp
+++ b/tests/validation/NEON/GEMMLowp.cpp
@@ -113,6 +113,39 @@ DATA_TEST_CASE(Configuration, framework::DatasetMode::ALL, framework::dataset::c
     gemmlowp_mm.configure(&a, &b, &c);
 }
 
+// *INDENT-OFF*
+// clang-format off
+DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip(
+    framework::dataset::make("InputAInfo", { TensorInfo(TensorShape(21U, 13U), 1, DataType::QASYMM8, QuantizationInfo(1.f/255, 10)), // Input not a multiple of 4
+                                             TensorInfo(TensorShape(21U, 13U), 1, DataType::QS8, 2),                                 // Mismatching data type
+                                             TensorInfo(TensorShape(20U, 13U), 1, DataType::QASYMM8, QuantizationInfo(1.f/255, 10)), // Invalid dimensions
+                                             TensorInfo(TensorShape(21U, 13U), 1, DataType::QASYMM8, QuantizationInfo(1.f/255, 10)), // Invalid dimensions
+                                             TensorInfo(TensorShape(16U, 32U), 1, DataType::QASYMM8, QuantizationInfo(1.f/255, 10)),
+                                           }),
+    framework::dataset::make("InputBInfo", { TensorInfo(TensorShape(33U, 21U), 1, DataType::QASYMM8, QuantizationInfo(1.f/256, 10)),
+                                             TensorInfo(TensorShape(33U, 21U), 1, DataType::QASYMM8, QuantizationInfo(1.f/256, 10)),
+                                             TensorInfo(TensorShape(33U, 21U), 1, DataType::QASYMM8, QuantizationInfo(1.f/256, 10)),
+                                             TensorInfo(TensorShape(33U, 21U), 1, DataType::QASYMM8, QuantizationInfo(1.f/256, 10)),
+                                             TensorInfo(TensorShape(64U, 16U), 1, DataType::QASYMM8, QuantizationInfo(1.f/256, 10)),
+                                           })),
+    framework::dataset::make("OutputInfo", { TensorInfo(TensorShape(33U, 13U), 1, DataType::S32),
+                                             TensorInfo(TensorShape(33U, 13U), 1, DataType::S32),
+                                             TensorInfo(TensorShape(33U, 13U), 1, DataType::S32),
+                                             TensorInfo(TensorShape(8U, 11U), 1, DataType::S32),
+                                             TensorInfo(TensorShape(64U, 32U), 1, DataType::S32),
+                                           })),
+    framework::dataset::make("Expected", { true, true, true, true, false })),
+    a_info, b_info, output_info, expected)
+{
+    // Lock tensors
+    Error error = NEGEMMLowpMatrixMultiplyCore::validate(&a_info.clone()->set_is_resizable(false),
+                                                         &b_info.clone()->set_is_resizable(false),
+                                                         &output_info.clone()->set_is_resizable(false));
+    ARM_COMPUTE_EXPECT(bool(error) == expected, framework::LogLevel::ERRORS);
+}
+// clang-format on
+// *INDENT-ON*
+
 FIXTURE_DATA_TEST_CASE(RunSmall, NEGEMMLowpMatrixMultiplyCoreFixture, framework::DatasetMode::ALL, datasets::SmallGEMMLowpDataset())
 {
     // Validate output
-- 
cgit v1.2.1