From eb65f6da695ac0d3e495817145cceb1c4de4f048 Mon Sep 17 00:00:00 2001
From: Gian Marco Iodice <gianmarco.iodice@arm.com>
Date: Wed, 15 Apr 2020 11:42:15 +0100
Subject: COMPMID-3304: Update OpenCL GEMM heuristic for Int8

Change-Id: I6b7ff678d8d0437a1639db2ff602ea1cdb155464
Signed-off-by: Gian Marco Iodice <gianmarco.iodice@arm.com>
Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/3056
Tested-by: Arm Jenkins <bsgcomp@arm.com>
Reviewed-by: Georgios Pinitas <georgios.pinitas@arm.com>
Comments-Addressed: Arm Jenkins <bsgcomp@arm.com>
---
 .../CL/kernels/CLGEMMLowpMatrixMultiplyKernel.h    | 101 ---------------------
 .../kernels/CLGEMMLowpOffsetContributionKernel.h   |   8 +-
 ...CLGEMMLowpOffsetContributionOutputStageKernel.h |   8 +-
 ...CLGEMMLowpQuantizeDownInt32ScaleByFloatKernel.h |   2 +-
 .../CLGEMMLowpQuantizeDownInt32ScaleKernel.h       |   2 +-
 ...antizeDownInt32ToInt16ScaleByFixedPointKernel.h |   2 +-
 ...uantizeDownInt32ToInt8ScaleByFixedPointKernel.h |   2 +-
 ...antizeDownInt32ToUint8ScaleByFixedPointKernel.h |   2 +-
 8 files changed, 13 insertions(+), 114 deletions(-)
 delete mode 100644 arm_compute/core/CL/kernels/CLGEMMLowpMatrixMultiplyKernel.h

(limited to 'arm_compute/core/CL/kernels')

diff --git a/arm_compute/core/CL/kernels/CLGEMMLowpMatrixMultiplyKernel.h b/arm_compute/core/CL/kernels/CLGEMMLowpMatrixMultiplyKernel.h
deleted file mode 100644
index e926f5ed36..0000000000
--- a/arm_compute/core/CL/kernels/CLGEMMLowpMatrixMultiplyKernel.h
+++ /dev/null
@@ -1,101 +0,0 @@
-/*
- * Copyright (c) 2017-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CLGEMMLOWPMATRIXMULTIPLYKERNEL_H
-#define ARM_COMPUTE_CLGEMMLOWPMATRIXMULTIPLYKERNEL_H
-
-#include "arm_compute/core/CL/ICLKernel.h"
-
-namespace arm_compute
-{
-class ICLTensor;
-
-/** OpenCL kernel to multiply matrices
- *
- * @note This kernel should be used ONLY for Midgard architectures
- *
- * This kernel performs the following computation:
- *
- *  -# Convert a values from int8 to int32
- *  -# Convert b values from int8 to int32
- *  -# Compute the int32 matrix product of the resulting a * b and store the result as int32
- *
- */
-class CLGEMMLowpMatrixMultiplyKernel : public ICLKernel
-{
-public:
-    /** Default Constructor */
-    CLGEMMLowpMatrixMultiplyKernel();
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    CLGEMMLowpMatrixMultiplyKernel(const CLGEMMLowpMatrixMultiplyKernel &) = delete;
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    CLGEMMLowpMatrixMultiplyKernel &operator=(const CLGEMMLowpMatrixMultiplyKernel &) = delete;
-    /** Allow instances of this class to be moved */
-    CLGEMMLowpMatrixMultiplyKernel(CLGEMMLowpMatrixMultiplyKernel &&) = default;
-    /** Allow instances of this class to be moved */
-    CLGEMMLowpMatrixMultiplyKernel &operator=(CLGEMMLowpMatrixMultiplyKernel &&) = default;
-    /** Initialise the kernel's input and output.
-     *
-     * @note This kernel should be used ONLY for Midgard architectures
-     *
-     * @param[in]  input0    Input tensor containing the LHS matrix. Data type supported: QASYMM8
-     * @param[in]  input1    Input tensor containing the RHS matrix. Data type supported: same as @p input0
-     * @param[out] output    Output tensor to store the result of matrix multiplication. Data type supported: S32
-     * @param[in]  gemm_info (Optional) GEMM information used to retrieve the original dimensions of the input matrices
-     */
-    void configure(const ICLTensor *input0, const ICLTensor *input1, ICLTensor *output, const GEMMReshapeInfo &gemm_info = GEMMReshapeInfo());
-    /** Initialise the kernel's input and output.
-     *
-     * @note This kernel should be used ONLY for Midgard architectures
-     *
-     * @param[in]  compile_context The compile context to be used.
-     * @param[in]  input0          Input tensor containing the LHS matrix. Data type supported: QASYMM8
-     * @param[in]  input1          Input tensor containing the RHS matrix. Data type supported: same as @p input0
-     * @param[out] output          Output tensor to store the result of matrix multiplication. Data type supported: S32
-     * @param[in]  gemm_info       (Optional) GEMM information used to retrieve the original dimensions of the input matrices
-     */
-    void configure(CLCompileContext &compile_context, const ICLTensor *input0, const ICLTensor *input1, ICLTensor *output, const GEMMReshapeInfo &gemm_info = GEMMReshapeInfo());
-    /** Static function to check if given info will lead to a valid configuration of @ref CLGEMMLowpMatrixMultiplyKernel
-     *
-     * @param[in] input0    Input tensor containing the LHS matrix. Data type supported: QASYMM8
-     * @param[in] input1    Input tensor containing the RHS matrix. Data type supported: same as @p input0
-     * @param[in] output    Output tensor to store the result of matrix multiplication. Data type supported: S32
-     * @param[in] gemm_info (Optional) GEMM information used to retrieve the original dimensions of the input matrices
-     *
-     * @return a status
-     */
-    static Status validate(const ITensorInfo *input0, const ITensorInfo *input1, const ITensorInfo *output, const GEMMReshapeInfo &gemm_info = GEMMReshapeInfo());
-
-    // Inherited methods overridden:
-    void run(const Window &window, cl::CommandQueue &queue) override;
-
-private:
-    const ICLTensor *_input0;
-    const ICLTensor *_input1;
-    ICLTensor       *_output;
-    bool             _slide_matrix_b;
-    bool             _reinterpret_input_as_3d;
-    bool             _reinterpret_output_as_3d;
-};
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_CLGEMMLOWPMATRIXMULTIPLYKERNEL_H*/
diff --git a/arm_compute/core/CL/kernels/CLGEMMLowpOffsetContributionKernel.h b/arm_compute/core/CL/kernels/CLGEMMLowpOffsetContributionKernel.h
index d7266b2805..f9ec558d85 100644
--- a/arm_compute/core/CL/kernels/CLGEMMLowpOffsetContributionKernel.h
+++ b/arm_compute/core/CL/kernels/CLGEMMLowpOffsetContributionKernel.h
@@ -30,9 +30,9 @@ namespace arm_compute
 {
 class ICLTensor;
 
-/** OpenCL kernel used to add the offset contribution after @ref CLGEMMLowpMatrixMultiplyKernel. The computation is performed in-place
+/** OpenCL kernel used to add the offset contribution after the matrix multiplication. The computation is performed in-place
  *
- * This kernel takes a final int32 accumulator value (the output of @ref CLGEMMLowpMatrixMultiplyKernel),
+ * This kernel takes a final int32 accumulator value (the output of the matrix multiplication),
  * and adds to it the offset contribution of matrix A and matrix B in-place.
  *
  * The final result is:
@@ -58,7 +58,7 @@ public:
     CLGEMMLowpOffsetContributionKernel &operator=(CLGEMMLowpOffsetContributionKernel &&) = default;
     /** Initialise the kernel's input and output.
      *
-     * @param[in, out] mm_result      Input tensor containing the result of @ref CLGEMMLowpMatrixMultiplyKernel. Data type supported: S32
+     * @param[in, out] mm_result      Input tensor containing the result of the matrix multiplication. Data type supported: S32
      * @param[in]      vector_sum_col Input row-vector of sums of all the entries in each column of matrix B.
      *                                Note: vector_sum_col can be a nullptr in case a_offset = 0. Data type supported: same as @p mm_result
      * @param[in]      vector_sum_row Input row-vector of sums of all the entries in each row of matrix A.
@@ -73,7 +73,7 @@ public:
     /** Initialise the kernel's input and output.
      *
      * @param[in]      compile_context The compile context to be used.
-     * @param[in, out] mm_result       Input tensor containing the result of @ref CLGEMMLowpMatrixMultiplyKernel. Data type supported: S32
+     * @param[in, out] mm_result       Input tensor containing the result of the matrix multiplication. Data type supported: S32
      * @param[in]      vector_sum_col  Input row-vector of sums of all the entries in each column of matrix B.
      *                                 Note: vector_sum_col can be a nullptr in case a_offset = 0. Data type supported: same as @p mm_result
      * @param[in]      vector_sum_row  Input row-vector of sums of all the entries in each row of matrix A.
diff --git a/arm_compute/core/CL/kernels/CLGEMMLowpOffsetContributionOutputStageKernel.h b/arm_compute/core/CL/kernels/CLGEMMLowpOffsetContributionOutputStageKernel.h
index 02ed20e5af..032539b699 100644
--- a/arm_compute/core/CL/kernels/CLGEMMLowpOffsetContributionOutputStageKernel.h
+++ b/arm_compute/core/CL/kernels/CLGEMMLowpOffsetContributionOutputStageKernel.h
@@ -30,9 +30,9 @@ namespace arm_compute
 {
 class ICLTensor;
 
-/** OpenCL kernel used to add the offset contribution after @ref CLGEMMLowpMatrixMultiplyKernel and perform the output stage.
+/** OpenCL kernel used to add the offset contribution after the matrix multiplication and perform the output stage.
  *
- * This kernel takes a final int32 accumulator value (the output of @ref CLGEMMLowpMatrixMultiplyKernel), adds to it the offset contribution
+ * This kernel takes a final int32 accumulator value (the output of the matrix multiplication), adds to it the offset contribution
  * of matrix A and matrix B and performs the output stage defined by the output_stage argument
  *
  * @note For quantized computations the output data type for auto-initialization must be passed as part of the @ref GEMMLowpOutputStageInfo.
@@ -52,7 +52,7 @@ public:
     CLGEMMLowpOffsetContributionOutputStageKernel &operator=(CLGEMMLowpOffsetContributionOutputStageKernel &&) = default;
     /** Initialise the kernel's input and output.
      *
-     * @param[in]  mm_result          Input tensor containing the result of @ref CLGEMMLowpMatrixMultiplyKernel. Data type supported: S32
+     * @param[in]  mm_result          Input tensor containing the result of the matrix multiplication. Data type supported: S32
      * @param[in]  vector_sum_col     Input row-vector of sums of all the entries in each column of matrix B.
      *                                Note: vector_sum_col can be a nullptr in case a_offset = 0. Data type supported: same as @p mm_result
      * @param[in]  vector_sum_row     Input row-vector of sums of all the entries in each row of matrix A.
@@ -74,7 +74,7 @@ public:
     /** Initialise the kernel's input and output.
      *
      * @param[in]  compile_context    The compile context to be used.
-     * @param[in]  mm_result          Input tensor containing the result of @ref CLGEMMLowpMatrixMultiplyKernel. Data type supported: S32
+     * @param[in]  mm_result          Input tensor containing the result of the matrix multiplication. Data type supported: S32
      * @param[in]  vector_sum_col     Input row-vector of sums of all the entries in each column of matrix B.
      *                                Note: vector_sum_col can be a nullptr in case a_offset = 0. Data type supported: same as @p mm_result
      * @param[in]  vector_sum_row     Input row-vector of sums of all the entries in each row of matrix A.
diff --git a/arm_compute/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ScaleByFloatKernel.h b/arm_compute/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ScaleByFloatKernel.h
index 0b5b22cafc..dd85d8a97c 100644
--- a/arm_compute/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ScaleByFloatKernel.h
+++ b/arm_compute/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ScaleByFloatKernel.h
@@ -33,7 +33,7 @@ class ICLTensor;
 
 /** OpenCL kernel used to quantize down the int32 accumulator values of GEMMLowp to QASYMM8/QASYMM8_SIGNED
  *
- * This kernel takes a final int32 accumulator value (the output of @ref CLGEMMLowpMatrixMultiplyKernel), and processes it to obtain the final QASYMM8/QASYMM8_SIGNED value.
+ * This kernel takes a final int32 accumulator value (the output of the matrix multiplication), and processes it to obtain the final QASYMM8/QASYMM8_SIGNED value.
  * The following computations will be performed by the kernel:
  *
  *  -# Compute fixed point multiplication between each entry of input by result_fixedpoint_multiplier
diff --git a/arm_compute/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ScaleKernel.h b/arm_compute/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ScaleKernel.h
index 0d7d1c3390..f36076dfa2 100644
--- a/arm_compute/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ScaleKernel.h
+++ b/arm_compute/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ScaleKernel.h
@@ -32,7 +32,7 @@ class ICLTensor;
 
 /** OpenCL kernel used to quantize down the int32 accumulator values of GEMMLowp to QASYMM8/QASYMM8_SIGNED
  *
- * This kernel takes a final int32 accumulator value (the output of @ref CLGEMMLowpMatrixMultiplyKernel), and processes it to obtain the final QASYMM8/QASYMM8_SIGNED value.
+ * This kernel takes a final int32 accumulator value (the output of the matrix multiplication), and processes it to obtain the final QASYMM8/QASYMM8_SIGNED value.
  * The following computations will be performed by the kernel:
  *
  *  -# Add offset terms to final result
diff --git a/arm_compute/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel.h b/arm_compute/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel.h
index 2845d9259e..36cd7bf693 100644
--- a/arm_compute/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel.h
+++ b/arm_compute/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel.h
@@ -32,7 +32,7 @@ class ICLTensor;
 
 /** CL kernel used to quantize down the int32 accumulator values of GEMMLowp to QSYMM16
  *
- * This kernel takes a final int32 accumulator value (the output of @ref CLGEMMLowpMatrixMultiplyKernel), and processes it to obtain the final QSYMM16 value.
+ * This kernel takes a final int32 accumulator value (the output of the matrix multiplication), and processes it to obtain the final QSYMM16 value.
  * The following computations will be performed by the kernel:
  *
  *  -# Compute fixed point multiplication between each entry of input by result_fixedpoint_multiplier
diff --git a/arm_compute/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel.h b/arm_compute/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel.h
index a768b6fba0..fd95e00d5d 100644
--- a/arm_compute/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel.h
+++ b/arm_compute/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel.h
@@ -32,7 +32,7 @@ class ICLTensor;
 
 /** OpenCL kernel used to quantize down the int32 accumulator values of GEMMLowp to QASYMM8_SIGNED
  *
- * This kernel takes a final int32 accumulator value (the output of @ref CLGEMMLowpMatrixMultiplyKernel), and processes it to obtain the final QASYMM8_SIGNED value.
+ * This kernel takes a final int32 accumulator value (the output of the matrix multiplication), and processes it to obtain the final QASYMM8_SIGNED value.
  * The following computations will be performed by the kernel:
  *
  *  -# Compute fixed point multiplication between each entry of input by result_fixedpoint_multiplier
diff --git a/arm_compute/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel.h b/arm_compute/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel.h
index e319c32c78..1714a02f76 100644
--- a/arm_compute/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel.h
+++ b/arm_compute/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel.h
@@ -32,7 +32,7 @@ class ICLTensor;
 
 /** OpenCL kernel used to quantize down the int32 accumulator values of GEMMLowp to QASYMM8
  *
- * This kernel takes a final int32 accumulator value (the output of @ref CLGEMMLowpMatrixMultiplyKernel), and processes it to obtain the final QASYMM8 value.
+ * This kernel takes a final int32 accumulator value (the output of the matrix multiplication), and processes it to obtain the final QASYMM8 value.
  * The following computations will be performed by the kernel:
  *
  *  -# Compute fixed point multiplication between each entry of input by result_fixedpoint_multiplier
-- 
cgit v1.2.1