From 358ca205c9e41f523517ffa55a9057308b736040 Mon Sep 17 00:00:00 2001
From: Georgios Pinitas
Date: Thu, 7 Dec 2017 16:47:52 +0000
Subject: COMPMID-617: Adds CLFullyConnectedLayer validation support

Change-Id: I4d2eb9872a3165fdcaa7784596e441cbe563dbc2
Reviewed-on: https://eu-gerrit-1.euhpc.arm.com/112577
Tested-by: Jenkins
Reviewed-by: Ioan-Cristian Szabo
Reviewed-by: Anthony Barbier
---
 .../core/CL/kernels/CLGEMMInterleave4x4Kernel.h      |   8 ++
 .../CL/kernels/CLGEMMLowpMatrixMultiplyKernel.h      |  10 ++
 .../kernels/CLGEMMLowpOffsetContributionKernel.h     |  13 +++
 .../core/CL/kernels/CLGEMMLowpReductionKernel.h      |  16 ++++
 .../kernels/CLGEMMMatrixAccumulateBiasesKernel.h     |   9 ++
 .../core/CL/kernels/CLGEMMMatrixMultiplyKernel.h     |  12 +++
 .../core/CL/kernels/CLGEMMTranspose1xWKernel.h       |   8 ++
 arm_compute/core/CL/kernels/CLIm2ColKernel.h         |  15 ++-
 arm_compute/core/utils/misc/ShapeCalculator.h        | 104 +++++++++++++++++++++
 .../runtime/CL/functions/CLFullyConnectedLayer.h     |  20 ++++
 .../CL/functions/CLGEMMLowpMatrixMultiplyCore.h      |  11 +++
 .../NEON/functions/NEGEMMLowpMatrixMultiplyCore.h    |  10 +-
 12 files changed, 230 insertions(+), 6 deletions(-)
 create mode 100644 arm_compute/core/utils/misc/ShapeCalculator.h

diff --git a/arm_compute/core/CL/kernels/CLGEMMInterleave4x4Kernel.h b/arm_compute/core/CL/kernels/CLGEMMInterleave4x4Kernel.h
index c87fb2cd66..2520eff5de 100644
--- a/arm_compute/core/CL/kernels/CLGEMMInterleave4x4Kernel.h
+++ b/arm_compute/core/CL/kernels/CLGEMMInterleave4x4Kernel.h
@@ -68,6 +68,14 @@ public:
      * @param[out] output Output tensor. Data type supported: same as @p input
      */
     void configure(const ICLTensor *input, ICLTensor *output);
+    /** Static function to check if given info will lead to a valid configuration of @ref CLGEMMInterleave4x4Kernel
+     *
+     * @param[in] input  Input tensor info. Data types supported: U8/S8/QS8/QASYMM8/U16/S16/QS16/F16/U32/S32/F32
+     * @param[in] output Output tensor info which stores the interleaved matrix. Data type supported: same as @p input.
+     *
+     * @return a status
+     */
+    static Status validate(const ITensorInfo *input, const ITensorInfo *output);

     // Inherited methods overridden
     void run(const Window &window, cl::CommandQueue &queue) override;

diff --git a/arm_compute/core/CL/kernels/CLGEMMLowpMatrixMultiplyKernel.h b/arm_compute/core/CL/kernels/CLGEMMLowpMatrixMultiplyKernel.h
index b60b80618c..3ad3ced003 100644
--- a/arm_compute/core/CL/kernels/CLGEMMLowpMatrixMultiplyKernel.h
+++ b/arm_compute/core/CL/kernels/CLGEMMLowpMatrixMultiplyKernel.h
@@ -61,6 +61,16 @@ public:
      * @param[in] is_interleaved_transposed (Optional) True if input0 and input1 have been reshaped respectively using @ref CLGEMMInterleave4x4Kernel and @ref CLGEMMTranspose1xWKernel
      */
     void configure(const ICLTensor *input0, const ICLTensor *input1, ICLTensor *output, bool is_interleaved_transposed = true);
+    /** Static function to check if given info will lead to a valid configuration of @ref CLGEMMLowpMatrixMultiplyKernel
+     *
+     * @param[in] input0                    Input tensor info containing the interleaved Matrix A. Data type supported: QASYMM8
+     * @param[in] input1                    Input tensor info containing the transposed Matrix B. Data type supported: same as @p input0
+     * @param[in] output                    Output tensor info to store the result of matrix multiplication. Data type supported: S32
+     * @param[in] is_interleaved_transposed (Optional) True if input0 and input1 have been reshaped respectively using @ref CLGEMMInterleave4x4Kernel and @ref CLGEMMTranspose1xWKernel
+     *
+     * @return a status
+     */
+    static Status validate(const ITensorInfo *input0, const ITensorInfo *input1, const ITensorInfo *output, bool is_interleaved_transposed = true);

     // Inherited methods overridden:
     void run(const Window &window, cl::CommandQueue &queue) override;

diff --git a/arm_compute/core/CL/kernels/CLGEMMLowpOffsetContributionKernel.h b/arm_compute/core/CL/kernels/CLGEMMLowpOffsetContributionKernel.h
index 5f2e025687..871b97c1d7 100644
--- a/arm_compute/core/CL/kernels/CLGEMMLowpOffsetContributionKernel.h
+++ b/arm_compute/core/CL/kernels/CLGEMMLowpOffsetContributionKernel.h
@@ -68,6 +68,19 @@ public:
      * @param[in] b_offset Offset to be added to each element of the matrix B.
      */
     void configure(ICLTensor *mm_result, const ICLTensor *vector_sum_col, const ICLTensor *vector_sum_row, int32_t k, int32_t a_offset, int32_t b_offset);
+    /** Static function to check if given info will lead to a valid configuration of @ref CLGEMMLowpOffsetContributionKernel
+     *
+     * @param[in] mm_result      Input tensor containing the result of @ref CLGEMMLowpMatrixMultiplyKernel. Data type supported: S32
+     * @param[in] vector_sum_col Input row-vector of sums of all the entries in each column of matrix B.
+     *                           Note: vector_sum_col can be a nullptr in case a_offset = 0. Data type supported: same as @p mm_result
+     * @param[in] vector_sum_row Input row-vector of sums of all the entries in each row of matrix A.
+     *                           Note: vector_sum_row can be a nullptr in case b_offset = 0. Data type supported: same as @p mm_result
+     * @param[in] a_offset       Offset to be added to each element of the matrix A.
+     * @param[in] b_offset       Offset to be added to each element of the matrix B.
+     *
+     * @return a status
+     */
+    static Status validate(const ITensorInfo *mm_result, const ITensorInfo *vector_sum_col, const ITensorInfo *vector_sum_row, int32_t a_offset, int32_t b_offset);

     // Inherited methods overridden:
     void run(const Window &window, cl::CommandQueue &queue) override;

diff --git a/arm_compute/core/CL/kernels/CLGEMMLowpReductionKernel.h b/arm_compute/core/CL/kernels/CLGEMMLowpReductionKernel.h
index aa0583fe81..12c12ef99a 100644
--- a/arm_compute/core/CL/kernels/CLGEMMLowpReductionKernel.h
+++ b/arm_compute/core/CL/kernels/CLGEMMLowpReductionKernel.h
@@ -71,6 +71,14 @@ public:
      * @param[out] vector_sum_row Output row-vector of sums of all the entries in each row of mtx_a. Data type supported: S32
      */
     void configure(const ICLTensor *mtx_a, ICLTensor *vector_sum_row) override;
+    /** Static function to check if given info will lead to a valid configuration of @ref CLGEMMLowpMatrixAReductionKernel
+     *
+     * @param[in] mtx_a          Input tensor. Data type supported: QASYMM8
+     * @param[in] vector_sum_row Output row-vector of sums of all the entries in each row of mtx_a. Data type supported: S32
+     *
+     * @return a status
+     */
+    static Status validate(const ITensorInfo *mtx_a, const ITensorInfo *vector_sum_row);

     // Inherited methods overridden:
     void run(const Window &window, cl::CommandQueue &queue) override;

@@ -90,6 +98,14 @@ public:
      * @param[out] vector_sum_col Output row-vector of sums of all the entries in each column of mtx_b. Data type supported: S32
      */
     void configure(const ICLTensor *mtx_b, ICLTensor *vector_sum_col) override;
+    /** Static function to check if given info will lead to a valid configuration of @ref CLGEMMLowpMatrixBReductionKernel
+     *
+     * @param[in] mtx_b          Input tensor. Data type supported: QASYMM8
+     * @param[in] vector_sum_col Output row-vector of sums of all the entries in each column of mtx_b. Data type supported: S32
+     *
+     * @return a status
+     */
+    static Status validate(const ITensorInfo *mtx_b, const ITensorInfo *vector_sum_col);

     // Inherited methods overridden:
     void run(const Window &window, cl::CommandQueue &queue) override;

diff --git a/arm_compute/core/CL/kernels/CLGEMMMatrixAccumulateBiasesKernel.h b/arm_compute/core/CL/kernels/CLGEMMMatrixAccumulateBiasesKernel.h
index 9348ff8ca8..2956f93cdc 100644
--- a/arm_compute/core/CL/kernels/CLGEMMMatrixAccumulateBiasesKernel.h
+++ b/arm_compute/core/CL/kernels/CLGEMMMatrixAccumulateBiasesKernel.h
@@ -50,6 +50,15 @@ public:
      * @param[in] biases The shared biases tensor to append. It must be 1D tensor. Data types supported: Same as @p input
      */
     void configure(ICLTensor *accum, const ICLTensor *biases);
+    /** Static function to check if given info will lead to a valid configuration of @ref CLGEMMMatrixAccumulateBiasesKernel
+     *
+     * @param[in] accum      The accumulate tensor to convert. Data types supported: QS8/QS16/F16/F32
+     * @param[in] biases     The shared biases tensor to append. It must be a 1D tensor. Data types supported: Same as @p accum
+     * @param[in] gpu_target GPU target
+     *
+     * @return a status
+     */
+    static Status validate(const ITensorInfo *accum, const ITensorInfo *biases, GPUTarget gpu_target);

     // Inherited methods overridden:
     void run(const Window &window, cl::CommandQueue &queue) override;

diff --git a/arm_compute/core/CL/kernels/CLGEMMMatrixMultiplyKernel.h b/arm_compute/core/CL/kernels/CLGEMMMatrixMultiplyKernel.h
index 5af9091416..4e73d7eb13 100644
--- a/arm_compute/core/CL/kernels/CLGEMMMatrixMultiplyKernel.h
+++ b/arm_compute/core/CL/kernels/CLGEMMMatrixMultiplyKernel.h
@@ -60,6 +60,18 @@ public:
      * @param[in] is_interleaved_transposed (Optional) True if input0 and input1 have been reshaped respectively using @ref CLGEMMInterleave4x4Kernel and @ref CLGEMMTranspose1xWKernel
      */
     void configure(const ICLTensor *input0, const ICLTensor *input1, ICLTensor *output, float alpha, bool is_interleaved_transposed = true);
+    /** Static function to check if given info will lead to a valid configuration of @ref CLGEMMMatrixMultiplyKernel
+     *
+     * @param[in] input0                    Input tensor containing the Matrix A. Data types supported: QS8/QS16/F16/F32
+     * @param[in] input1                    Input tensor containing the Matrix B. Data type supported: same as @p input0
+     * @param[in] output                    Output tensor to store the result of matrix multiplication. Data type supported: same as @p input0
+     * @param[in] alpha                     Weight of the matrix product
+     * @param[in] is_interleaved_transposed True if input0 and input1 have been reshaped respectively using @ref CLGEMMInterleave4x4Kernel and @ref CLGEMMTranspose1xWKernel
+     * @param[in] gpu_target                GPU Target
+     *
+     * @return a status
+     */
+    static Status validate(const ITensorInfo *input0, const ITensorInfo *input1, const ITensorInfo *output, float alpha, bool is_interleaved_transposed, GPUTarget gpu_target);

     // Inherited methods overridden:
     void run(const Window &window, cl::CommandQueue &queue) override;

diff --git a/arm_compute/core/CL/kernels/CLGEMMTranspose1xWKernel.h b/arm_compute/core/CL/kernels/CLGEMMTranspose1xWKernel.h
index 8a37720462..8721643c1e 100644
--- a/arm_compute/core/CL/kernels/CLGEMMTranspose1xWKernel.h
+++ b/arm_compute/core/CL/kernels/CLGEMMTranspose1xWKernel.h
@@ -74,6 +74,14 @@ public:
      * @param[out] output Output tensor. Data type supported: same as @p input
      */
     void configure(const ICLTensor *input, ICLTensor *output);
+    /** Static function to check if given info will lead to a valid configuration of @ref CLGEMMTranspose1xWKernel
+     *
+     * @param[in] input  Input tensor. Data types supported: U8/S8/QS8/QASYMM8/U16/S16/QS16/F16/U32/S32/F32
+     * @param[in] output Output tensor. Data type supported: same as @p input.
+     *
+     * @return a status
+     */
+    static Status validate(const ITensorInfo *input, const ITensorInfo *output);

     // Inherited methods overridden:
     void run(const Window &window, cl::CommandQueue &queue) override;

diff --git a/arm_compute/core/CL/kernels/CLIm2ColKernel.h b/arm_compute/core/CL/kernels/CLIm2ColKernel.h
index 1d8b5500c1..88de1ba002 100644
--- a/arm_compute/core/CL/kernels/CLIm2ColKernel.h
+++ b/arm_compute/core/CL/kernels/CLIm2ColKernel.h
@@ -69,7 +69,7 @@ public:
     /** Set the input and output of the kernel.
      *
      * @param[in]  input       The input tensor to convert. 3 lower dimensions represent a single input [width, height, IFM],
-     *                         while every optional dimension from 4 and above represent a batch of inputs. Data types supported: QS8/QS16/QASYMM8/F16/F32
+     *                         while every optional dimension from 4 and above represent a batch of inputs. Data types supported: QS8/QASYMM8/QS16/F16/F32
      * @param[out] output      The output tensor. First 2 lower dimensions represent a transform of each 3D input,
      *                         while every dimension above represents a batch. Data types supported: Same as @p input
      * @param[in]  kernel_dims The kernel dimensions (width and height).
@@ -80,6 +80,19 @@ public:

     // Inherited methods overridden:
     void run(const Window &window, cl::CommandQueue &queue) override;
+    /** Static function to check if given info will lead to a valid configuration of @ref CLIm2ColKernel
+     *
+     * @param[in] input       The input tensor to convert. 3 lower dimensions represent a single input [width, height, IFM],
+     *                        while every optional dimension from 4 and above represent a batch of inputs. Data types supported: QS8/QASYMM8/QS16/F16/F32
+     * @param[in] output      The output tensor. First 2 lower dimensions represent a transform of each 3D input,
+     *                        while every dimension above represents a batch. Data types supported: Same as @p input
+     * @param[in] kernel_dims The kernel dimensions (width and height).
+     * @param[in] conv_info   Contains padding and stride information described in @ref PadStrideInfo.
+     * @param[in] has_bias    In case biases are provided, expands the matrix with 1.
+     *
+     * @return a status
+     */
+    static Status validate(const ITensorInfo *input, const ITensorInfo *output, const Size2D &kernel_dims, const PadStrideInfo &conv_info, bool has_bias);

 private:
     /** Run the reshape kernel optimised for the special case (stride is 1, padding is 0 and kernel's low 3 dimensions are same as input)

diff --git a/arm_compute/core/utils/misc/ShapeCalculator.h b/arm_compute/core/utils/misc/ShapeCalculator.h
new file mode 100644
index 0000000000..52773faa3a
--- /dev/null
+++ b/arm_compute/core/utils/misc/ShapeCalculator.h
@@ -0,0 +1,104 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_MISC_SHAPE_CALCULATOR_H__
+#define __ARM_COMPUTE_MISC_SHAPE_CALCULATOR_H__
+
+#include "arm_compute/core/ITensorInfo.h"
+
+namespace arm_compute
+{
+namespace misc
+{
+namespace shape_calculator
+{
+inline TensorShape compute_interleaved_shape(const ITensorInfo &a)
+{
+    // The interleaved output matrix will have the following shape: [ a_height * 4, ceil(a_width / 4.0f) ]
+    TensorShape shape_interleaved_a{ a.tensor_shape() };
+    shape_interleaved_a.set(0, a.dimension(0) * 4);
+    shape_interleaved_a.set(1, std::ceil(a.dimension(1) / 4.f));
+
+    return shape_interleaved_a;
+}
+inline TensorShape compute_transpose1xW_shape(const ITensorInfo &b)
+{
+    // The transpose1xW output matrix will have the following shape: [ b_height * 16, ceil(b_width / 16.0f) ]
+    TensorShape shape_transposed1xW_b{ b.tensor_shape() };
+    shape_transposed1xW_b.set(0, b.dimension(1) * 16);
+    shape_transposed1xW_b.set(1, std::ceil(b.dimension(0) / 16.f));
+
+    return shape_transposed1xW_b;
+}
+inline TensorShape compute_transpose1xW_with_element_size_shape(const ITensorInfo &b)
+{
+    // The transpose1xW output matrix will have the following shape:
+    // [ b_height * (16 / element_size), ceil(b_width / (16.0f / element_size)) ]
+    TensorShape  shape_transposed1xW_b{ b.tensor_shape() };
+    const size_t transpose_width = 16 / b.element_size();
+    shape_transposed1xW_b.set(0, b.dimension(1) * transpose_width);
+    shape_transposed1xW_b.set(1, static_cast<size_t>(std::ceil(b.dimension(0) / static_cast<float>(transpose_width))));
+
+    return shape_transposed1xW_b;
+}
+inline TensorShape compute_reductionA_shape(const ITensorInfo &b)
+{
+    TensorShape shape_vector_sum_col{ b.tensor_shape() };
+    if(shape_vector_sum_col.num_dimensions() > 1)
+    {
+        shape_vector_sum_col.remove_dimension(1);
+    }
+
+    return shape_vector_sum_col;
+}
+inline TensorShape compute_reductionB_shape(const ITensorInfo &a)
+{
+    TensorShape shape_vector_sum_row{ a.tensor_shape() };
+    shape_vector_sum_row.set(Window::DimX, a.dimension(1));
+    if(a.num_dimensions() > 1)
+    {
+        shape_vector_sum_row.remove_dimension(1);
+    }
+
+    return shape_vector_sum_row;
+}
+inline TensorShape compute_im2col_shape(const ITensorInfo &input)
+{
+    TensorShape shape_im2col{ input.tensor_shape() };
+    shape_im2col.collapse(3);
+
+    return shape_im2col;
+}
+inline TensorShape compute_transposed_shape(const ITensorInfo &input)
+{
+    TensorShape shape_transposed{ input.tensor_shape() };
+
+    shape_transposed.set(0, input.dimension(1));
+    shape_transposed.set(1, input.dimension(0));
+
+    return shape_transposed;
+}
+} // namespace shape_calculator
+} // namespace misc
+} // namespace arm_compute
+#endif /* __ARM_COMPUTE_MISC_SHAPE_CALCULATOR_H__ */

diff --git a/arm_compute/runtime/CL/functions/CLFullyConnectedLayer.h b/arm_compute/runtime/CL/functions/CLFullyConnectedLayer.h
index 2cac06c1c9..1e9ee492ad 100644
--- a/arm_compute/runtime/CL/functions/CLFullyConnectedLayer.h
+++ b/arm_compute/runtime/CL/functions/CLFullyConnectedLayer.h
@@ -52,6 +52,14 @@ public:
      * @param[out] output Destination tensor which stores the transposed input tensor. Data type supported: Same as @p input.
      */
     void configure(const ICLTensor *input, ICLTensor *output);
+    /** Static function to check if given info will lead to a valid configuration of @ref CLFullyConnectedLayerReshapeWeights
+     *
+     * @param[in] input  Weights tensor. The weights must be 2 dimensional. Data types supported: QS8/QASYMM8/QS16/F16/F32.
+     * @param[in] output Destination tensor which stores the transposed input tensor. Data type supported: Same as @p input.
+     *
+     * @return a status
+     */
+    static Status validate(const ITensorInfo *input, const ITensorInfo *output);
 };

 /** Basic function to compute a Fully Connected layer on OpenCL. This function calls the following OpenCL kernels:
@@ -78,6 +86,18 @@ public:
      * @param[in] are_weights_reshaped (Optional) Reshape the weights tensor if false. Defaults to false.
      */
     void configure(const ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, bool transpose_weights = true, bool are_weights_reshaped = false);
+    /** Static function to check if given info will lead to a valid configuration of @ref CLFullyConnectedLayer
+     *
+     * @param[in] input                Source tensor. Data type supported: QS8/QASYMM8/QS16/F16/F32.
+     * @param[in] weights              Weights tensor. The weights must be 2 dimensional. Data type supported: Same as @p input
+     * @param[in] biases               Bias tensor. It can be nullptr. Data type supported: Same as @p input.
+     * @param[in] output               Destination tensor. Data type supported: Same as @p input.
+     * @param[in] transpose_weights    (Optional) Transpose weights if true. Defaults to true.
+     * @param[in] are_weights_reshaped (Optional) Reshape the weights tensor if false. Defaults to false.
+     *
+     * @return a status
+     */
+    static Status validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, bool transpose_weights = true, bool are_weights_reshaped = false);

     //Inherited methods override
     void run() override;

diff --git a/arm_compute/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.h b/arm_compute/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.h
index e316144548..3976704907 100644
--- a/arm_compute/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.h
+++ b/arm_compute/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.h
@@ -69,6 +69,17 @@ public:
      *                       if the reshape of matrix B should be executed only for the first run
      */
     void configure(const ICLTensor *a, const ICLTensor *b, ICLTensor *output, const GEMMInfo &gemm_info = GEMMInfo());
+    /** Static function to check if given info will lead to a valid configuration of @ref CLGEMMLowpMatrixMultiplyCore
+     *
+     * @param[in] a         First input tensor (Matrix A). Data type supported: QASYMM8.
+     * @param[in] b         Second input tensor (Matrix B). Data type supported: same as @p a
+     * @param[in] output    Output tensor. Data type supported: S32
+     * @param[in] gemm_info (Optional) Specifies if the matrix A and/or matrix B have been reshaped and
+     *                      if the reshape of matrix B should be executed only for the first run
+     *
+     * @return a status
+     */
+    static Status validate(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *output, const GEMMInfo &gemm_info = GEMMInfo());

     // Inherited methods overridden:
     void run() override;

diff --git a/arm_compute/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.h b/arm_compute/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.h
index 46e6b494f8..eddb3a26b7 100644
--- a/arm_compute/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.h
+++ b/arm_compute/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.h
@@ -75,11 +75,11 @@ public:
     void configure(const ITensor *a, const ITensor *b, ITensor *output, const GEMMInfo &gemm_info = GEMMInfo());
     /** Static function to check if given info will lead to a valid configuration of @ref NEGEMMLowpMatrixMultiplyCore
      *
-     * @param[in]  a         First input tensor (Matrix A). Data type supported: QASYMM8.
-     * @param[in]  b         Second input tensor (Matrix B). Data type supported: same as @p a
-     * @param[out] output    Output tensor. Data type supported: Data type supported: S32
-     * @param[in]  gemm_info (Optional) Specifies if the matrix A and/or matrix B have been reshaped and
-     *                       if the reshape of matrix B should be executed only for the first run
+     * @param[in] a         First input tensor (Matrix A). Data type supported: QASYMM8.
+     * @param[in] b         Second input tensor (Matrix B). Data type supported: same as @p a
+     * @param[in] output    Output tensor. Data type supported: S32
+     * @param[in] gemm_info (Optional) Specifies if the matrix A and/or matrix B have been reshaped and
+     *                      if the reshape of matrix B should be executed only for the first run
      *
      * @return a status
      */
-- 
cgit v1.2.1
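
The validate() entry points added by this patch mirror the matching configure() signatures but take ITensorInfo descriptors, so a configuration can be checked before any OpenCL buffer is allocated. Below is a minimal sketch of that pattern for CLFullyConnectedLayer; the tensor shapes, the skipped bias, and the weights orientation are illustrative assumptions, not taken from the patch:

#include <iostream>

#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
#include "arm_compute/runtime/CL/functions/CLFullyConnectedLayer.h"

using namespace arm_compute;

bool fc_config_is_valid()
{
    // Initialise the CL scheduler so validation can query the GPU target.
    CLScheduler::get().default_init();

    // Hypothetical 64-input, 128-output fully connected layer, F32, no bias.
    const TensorInfo input(TensorShape(64U), 1, DataType::F32);
    const TensorInfo weights(TensorShape(64U, 128U), 1, DataType::F32); // 2D weights; orientation assumes the default transpose_weights = true
    const TensorInfo output(TensorShape(128U), 1, DataType::F32);

    // Same arguments as configure(), but tensor infos instead of tensors.
    const Status status = CLFullyConnectedLayer::validate(&input, &weights, nullptr /* biases */, &output);
    if(!bool(status))
    {
        std::cerr << status.error_description() << std::endl;
    }
    return bool(status);
}

A caller would typically run this check first and only allocate tensors and call configure() when the returned Status converts to true.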
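
The new shape_calculator helpers are pure shape arithmetic, so their results are easy to verify by hand. A short sketch, assuming an F32 matrix B of 20 columns by 12 rows (element_size = 4, hence transpose_width = 16 / 4 = 4):

#include <iostream>

#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Window.h" // Window::DimX is used inside ShapeCalculator.h
#include "arm_compute/core/utils/misc/ShapeCalculator.h"

using namespace arm_compute;
using namespace arm_compute::misc::shape_calculator;

int main()
{
    const TensorInfo b(TensorShape(20U, 12U), 1, DataType::F32);

    // [ 12 * (16 / 4), ceil(20 / (16.0 / 4)) ] = [48, 5]
    const TensorShape tr1xw = compute_transpose1xW_with_element_size_shape(b);

    // [ 20 * 4, ceil(12 / 4.0) ] = [80, 3]
    const TensorShape interleaved = compute_interleaved_shape(b);

    std::cout << tr1xw[0] << "x" << tr1xw[1] << std::endl;             // prints 48x5
    std::cout << interleaved[0] << "x" << interleaved[1] << std::endl; // prints 80x3
    return 0;
}

These are exactly the shapes produced by CLGEMMTranspose1xWKernel and CLGEMMInterleave4x4Kernel, which is what lets the new validate() functions build the intermediate tensor infos without configuring any kernel.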