author     Gian Marco Iodice <gianmarco.iodice@arm.com>    2020-04-15 11:42:15 +0100
committer  Gian Marco Iodice <gianmarco.iodice@arm.com>    2020-04-20 13:04:42 +0000
commit     eb65f6da695ac0d3e495817145cceb1c4de4f048 (patch)
tree       1e4980ba6d6ce2d738670c2ebadf4e24ebd172ce /arm_compute
parent     47a899017e67556ffffef78571c9be61dd7bc3f0 (diff)
download   ComputeLibrary-eb65f6da695ac0d3e495817145cceb1c4de4f048.tar.gz
COMPMID-3304: Update OpenCL GEMM heuristic for Int8
Change-Id: I6b7ff678d8d0437a1639db2ff602ea1cdb155464
Signed-off-by: Gian Marco Iodice <gianmarco.iodice@arm.com>
Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/3056
Tested-by: Arm Jenkins <bsgcomp@arm.com>
Reviewed-by: Georgios Pinitas <georgios.pinitas@arm.com>
Comments-Addressed: Arm Jenkins <bsgcomp@arm.com>
Diffstat (limited to 'arm_compute')
-rw-r--r--  arm_compute/core/CL/CLKernels.h                                                               |   1
-rw-r--r--  arm_compute/core/CL/gemm/native/CLGEMMNativeKernelConfiguration.h                            |   4
-rw-r--r--  arm_compute/core/CL/gemm/native/CLGEMMNativeKernelConfigurationBifrost.h                     |   8
-rw-r--r--  arm_compute/core/CL/gemm/native/CLGEMMNativeKernelConfigurationMidgard.h                     |  51
-rw-r--r--  arm_compute/core/CL/gemm/native/CLGEMMNativeKernelConfigurationValhall.h                     |   8
-rw-r--r--  arm_compute/core/CL/gemm/reshaped/CLGEMMReshapedKernelConfiguration.h                        |   2
-rw-r--r--  arm_compute/core/CL/gemm/reshaped/CLGEMMReshapedKernelConfigurationBifrost.h                 |   8
-rw-r--r--  arm_compute/core/CL/gemm/reshaped/CLGEMMReshapedKernelConfigurationValhall.h                 |   8
-rw-r--r--  arm_compute/core/CL/gemm/reshaped_only_rhs/CLGEMMReshapedOnlyRHSKernelConfiguration.h        |   2
-rw-r--r--  arm_compute/core/CL/gemm/reshaped_only_rhs/CLGEMMReshapedOnlyRHSKernelConfigurationBifrost.h |   8
-rw-r--r--  arm_compute/core/CL/gemm/reshaped_only_rhs/CLGEMMReshapedOnlyRHSKernelConfigurationValhall.h |   8
-rw-r--r--  arm_compute/core/CL/kernels/CLGEMMLowpMatrixMultiplyKernel.h                                 | 101
-rw-r--r--  arm_compute/core/CL/kernels/CLGEMMLowpOffsetContributionKernel.h                             |   8
-rw-r--r--  arm_compute/core/CL/kernels/CLGEMMLowpOffsetContributionOutputStageKernel.h                  |   8
-rw-r--r--  arm_compute/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ScaleByFloatKernel.h                  |   2
-rw-r--r--  arm_compute/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ScaleKernel.h                         |   2
-rw-r--r--  arm_compute/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel.h      |   2
-rw-r--r--  arm_compute/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel.h       |   2
-rw-r--r--  arm_compute/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel.h      |   2
-rw-r--r--  arm_compute/runtime/CL/CLTypes.h                                                              |  12
-rw-r--r--  arm_compute/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.h                              |  16
21 files changed, 74 insertions, 189 deletions
diff --git a/arm_compute/core/CL/CLKernels.h b/arm_compute/core/CL/CLKernels.h
index 583cf270e2..cd26399390 100644
--- a/arm_compute/core/CL/CLKernels.h
+++ b/arm_compute/core/CL/CLKernels.h
@@ -73,7 +73,6 @@
#include "arm_compute/core/CL/kernels/CLFlattenLayerKernel.h"
#include "arm_compute/core/CL/kernels/CLFloorKernel.h"
#include "arm_compute/core/CL/kernels/CLFuseBatchNormalizationKernel.h"
-#include "arm_compute/core/CL/kernels/CLGEMMLowpMatrixMultiplyKernel.h"
#include "arm_compute/core/CL/kernels/CLGEMMLowpMatrixMultiplyNativeKernel.h"
#include "arm_compute/core/CL/kernels/CLGEMMLowpMatrixMultiplyReshapedKernel.h"
#include "arm_compute/core/CL/kernels/CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel.h"
diff --git a/arm_compute/core/CL/gemm/native/CLGEMMNativeKernelConfiguration.h b/arm_compute/core/CL/gemm/native/CLGEMMNativeKernelConfiguration.h
index fced41b261..a6341e5094 100644
--- a/arm_compute/core/CL/gemm/native/CLGEMMNativeKernelConfiguration.h
+++ b/arm_compute/core/CL/gemm/native/CLGEMMNativeKernelConfiguration.h
@@ -26,6 +26,7 @@
#include "arm_compute/core/CL/ICLGEMMKernelConfiguration.h"
#include "arm_compute/core/CL/gemm/native/CLGEMMNativeKernelConfigurationBifrost.h"
+#include "arm_compute/core/CL/gemm/native/CLGEMMNativeKernelConfigurationMidgard.h"
#include "arm_compute/core/CL/gemm/native/CLGEMMNativeKernelConfigurationValhall.h"
#include <memory>
@@ -49,12 +50,11 @@ public:
switch(get_arch_from_target(gpu))
{
case GPUTarget::MIDGARD:
+ return support::cpp14::make_unique<CLGEMMNativeKernelConfigurationMidgard>(gpu);
case GPUTarget::BIFROST:
return support::cpp14::make_unique<CLGEMMNativeKernelConfigurationBifrost>(gpu);
- break;
case GPUTarget::VALHALL:
return support::cpp14::make_unique<CLGEMMNativeKernelConfigurationValhall>(gpu);
- break;
default:
ARM_COMPUTE_ERROR("Not supported GPU target");
}
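
The factory above dispatches purely on the GPU architecture, so Midgard devices now get their own native-GEMM heuristic instead of sharing the Bifrost one. As a minimal usage sketch (not part of this commit; it assumes the static create() factory wrapping the switch shown here, and the configure() signature declared in these headers):

// Hedged sketch: pick the native GEMM block-size configuration for a Midgard GPU
// and a quantized int8 problem. Only the dispatch and configure() signature come
// from this change; the surrounding code is illustrative.
#include "arm_compute/core/CL/gemm/native/CLGEMMNativeKernelConfiguration.h"

using namespace arm_compute;
using namespace arm_compute::cl_gemm;

std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> pick_native_config()
{
    // MIDGARD now maps to the dedicated CLGEMMNativeKernelConfigurationMidgard heuristic.
    auto config = CLGEMMNativeKernelConfiguration::create(GPUTarget::MIDGARD);

    // m, n, k, batch size and data type drive the chosen LHS/RHS block shapes.
    return config->configure(/* m */ 64, /* n */ 64, /* k */ 128, /* b */ 1, DataType::QASYMM8);
}
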
diff --git a/arm_compute/core/CL/gemm/native/CLGEMMNativeKernelConfigurationBifrost.h b/arm_compute/core/CL/gemm/native/CLGEMMNativeKernelConfigurationBifrost.h
index 29b8e08a80..5b2abe6f0f 100644
--- a/arm_compute/core/CL/gemm/native/CLGEMMNativeKernelConfigurationBifrost.h
+++ b/arm_compute/core/CL/gemm/native/CLGEMMNativeKernelConfigurationBifrost.h
@@ -39,14 +39,6 @@ public:
* @param[in] gpu GPU target
*/
CLGEMMNativeKernelConfigurationBifrost(GPUTarget gpu);
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- CLGEMMNativeKernelConfigurationBifrost(const CLGEMMNativeKernelConfigurationBifrost &) = delete;
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- CLGEMMNativeKernelConfigurationBifrost &operator=(const CLGEMMNativeKernelConfigurationBifrost &) = delete;
- /** Default Move Constructor. */
- CLGEMMNativeKernelConfigurationBifrost(CLGEMMNativeKernelConfigurationBifrost &&) = default;
- /** Default move assignment operator */
- CLGEMMNativeKernelConfigurationBifrost &operator=(CLGEMMNativeKernelConfigurationBifrost &&) = default;
// Inherited overridden method
std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure(unsigned int m, unsigned int n, unsigned int k, unsigned int b, DataType data_type) override;
diff --git a/arm_compute/core/CL/gemm/native/CLGEMMNativeKernelConfigurationMidgard.h b/arm_compute/core/CL/gemm/native/CLGEMMNativeKernelConfigurationMidgard.h
new file mode 100644
index 0000000000..0e95a15613
--- /dev/null
+++ b/arm_compute/core/CL/gemm/native/CLGEMMNativeKernelConfigurationMidgard.h
@@ -0,0 +1,51 @@
+/*
+ * Copyright (c) 2020 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CLGEMMNATIVEKERNELCONFIGURATIONMIDGARD_H
+#define ARM_COMPUTE_CLGEMMNATIVEKERNELCONFIGURATIONMIDGARD_H
+
+#include "arm_compute/core/CL/ICLGEMMKernelConfiguration.h"
+
+namespace arm_compute
+{
+namespace cl_gemm
+{
+/** Midgard based OpenCL GEMMNative configuration */
+class CLGEMMNativeKernelConfigurationMidgard final : public ICLGEMMKernelConfiguration
+{
+public:
+ /** Constructor
+ *
+ * @param[in] gpu GPU target
+ */
+ CLGEMMNativeKernelConfigurationMidgard(GPUTarget gpu);
+
+ // Inherited overridden method
+ std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure(unsigned int m, unsigned int n, unsigned int k, unsigned int b, DataType data_type) override;
+
+private:
+ std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> default_q8(unsigned int m, unsigned int n, unsigned int k, unsigned int b);
+};
+} // namespace cl_gemm
+} // namespace arm_compute
+#endif /*ARM_COMPUTE_CLGEMMNATIVEKERNELCONFIGURATIONMIDGARD_H */
diff --git a/arm_compute/core/CL/gemm/native/CLGEMMNativeKernelConfigurationValhall.h b/arm_compute/core/CL/gemm/native/CLGEMMNativeKernelConfigurationValhall.h
index f6a61a24b8..e739997b3a 100644
--- a/arm_compute/core/CL/gemm/native/CLGEMMNativeKernelConfigurationValhall.h
+++ b/arm_compute/core/CL/gemm/native/CLGEMMNativeKernelConfigurationValhall.h
@@ -39,14 +39,6 @@ public:
* @param[in] gpu GPU target
*/
CLGEMMNativeKernelConfigurationValhall(GPUTarget gpu);
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- CLGEMMNativeKernelConfigurationValhall(const CLGEMMNativeKernelConfigurationValhall &) = delete;
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- CLGEMMNativeKernelConfigurationValhall &operator=(const CLGEMMNativeKernelConfigurationValhall &) = delete;
- /** Default Move Constructor. */
- CLGEMMNativeKernelConfigurationValhall(CLGEMMNativeKernelConfigurationValhall &&) = default;
- /** Default move assignment operator */
- CLGEMMNativeKernelConfigurationValhall &operator=(CLGEMMNativeKernelConfigurationValhall &&) = default;
// Inherited overridden method
std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure(unsigned int m, unsigned int n, unsigned int k, unsigned int b, DataType data_type) override;
diff --git a/arm_compute/core/CL/gemm/reshaped/CLGEMMReshapedKernelConfiguration.h b/arm_compute/core/CL/gemm/reshaped/CLGEMMReshapedKernelConfiguration.h
index e960d64964..10dc9aefdb 100644
--- a/arm_compute/core/CL/gemm/reshaped/CLGEMMReshapedKernelConfiguration.h
+++ b/arm_compute/core/CL/gemm/reshaped/CLGEMMReshapedKernelConfiguration.h
@@ -51,10 +51,8 @@ public:
case GPUTarget::MIDGARD:
case GPUTarget::BIFROST:
return support::cpp14::make_unique<CLGEMMReshapedKernelConfigurationBifrost>(gpu);
- break;
case GPUTarget::VALHALL:
return support::cpp14::make_unique<CLGEMMReshapedKernelConfigurationValhall>(gpu);
- break;
default:
ARM_COMPUTE_ERROR("Not supported GPU target");
}
diff --git a/arm_compute/core/CL/gemm/reshaped/CLGEMMReshapedKernelConfigurationBifrost.h b/arm_compute/core/CL/gemm/reshaped/CLGEMMReshapedKernelConfigurationBifrost.h
index c6ece758b9..55742e3e56 100644
--- a/arm_compute/core/CL/gemm/reshaped/CLGEMMReshapedKernelConfigurationBifrost.h
+++ b/arm_compute/core/CL/gemm/reshaped/CLGEMMReshapedKernelConfigurationBifrost.h
@@ -39,14 +39,6 @@ public:
* @param[in] gpu GPU target
*/
CLGEMMReshapedKernelConfigurationBifrost(GPUTarget gpu);
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- CLGEMMReshapedKernelConfigurationBifrost(const CLGEMMReshapedKernelConfigurationBifrost &) = delete;
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- CLGEMMReshapedKernelConfigurationBifrost &operator=(const CLGEMMReshapedKernelConfigurationBifrost &) = delete;
- /** Default Move Constructor. */
- CLGEMMReshapedKernelConfigurationBifrost(CLGEMMReshapedKernelConfigurationBifrost &&) = default;
- /** Default move assignment operator */
- CLGEMMReshapedKernelConfigurationBifrost &operator=(CLGEMMReshapedKernelConfigurationBifrost &&) = default;
// Inherited overridden method
std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure(unsigned int m, unsigned int n, unsigned int k, unsigned int b, DataType data_type) override;
diff --git a/arm_compute/core/CL/gemm/reshaped/CLGEMMReshapedKernelConfigurationValhall.h b/arm_compute/core/CL/gemm/reshaped/CLGEMMReshapedKernelConfigurationValhall.h
index 0dd2a2c38f..e65974144d 100644
--- a/arm_compute/core/CL/gemm/reshaped/CLGEMMReshapedKernelConfigurationValhall.h
+++ b/arm_compute/core/CL/gemm/reshaped/CLGEMMReshapedKernelConfigurationValhall.h
@@ -39,14 +39,6 @@ public:
* @param[in] gpu GPU target
*/
CLGEMMReshapedKernelConfigurationValhall(GPUTarget gpu);
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- CLGEMMReshapedKernelConfigurationValhall(const CLGEMMReshapedKernelConfigurationValhall &) = delete;
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- CLGEMMReshapedKernelConfigurationValhall &operator=(const CLGEMMReshapedKernelConfigurationValhall &) = delete;
- /** Default Move Constructor. */
- CLGEMMReshapedKernelConfigurationValhall(CLGEMMReshapedKernelConfigurationValhall &&) = default;
- /** Default move assignment operator */
- CLGEMMReshapedKernelConfigurationValhall &operator=(CLGEMMReshapedKernelConfigurationValhall &&) = default;
// Inherited overridden method
std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure(unsigned int m, unsigned int n, unsigned int k, unsigned int b, DataType data_type) override;
diff --git a/arm_compute/core/CL/gemm/reshaped_only_rhs/CLGEMMReshapedOnlyRHSKernelConfiguration.h b/arm_compute/core/CL/gemm/reshaped_only_rhs/CLGEMMReshapedOnlyRHSKernelConfiguration.h
index 683e39f3c1..7909726164 100644
--- a/arm_compute/core/CL/gemm/reshaped_only_rhs/CLGEMMReshapedOnlyRHSKernelConfiguration.h
+++ b/arm_compute/core/CL/gemm/reshaped_only_rhs/CLGEMMReshapedOnlyRHSKernelConfiguration.h
@@ -51,10 +51,8 @@ public:
case GPUTarget::MIDGARD:
case GPUTarget::BIFROST:
return support::cpp14::make_unique<CLGEMMReshapedOnlyRHSKernelConfigurationBifrost>(gpu);
- break;
case GPUTarget::VALHALL:
return support::cpp14::make_unique<CLGEMMReshapedOnlyRHSKernelConfigurationValhall>(gpu);
- break;
default:
ARM_COMPUTE_ERROR("Not supported GPU target");
}
diff --git a/arm_compute/core/CL/gemm/reshaped_only_rhs/CLGEMMReshapedOnlyRHSKernelConfigurationBifrost.h b/arm_compute/core/CL/gemm/reshaped_only_rhs/CLGEMMReshapedOnlyRHSKernelConfigurationBifrost.h
index ff351b6a06..044bdc7b18 100644
--- a/arm_compute/core/CL/gemm/reshaped_only_rhs/CLGEMMReshapedOnlyRHSKernelConfigurationBifrost.h
+++ b/arm_compute/core/CL/gemm/reshaped_only_rhs/CLGEMMReshapedOnlyRHSKernelConfigurationBifrost.h
@@ -39,14 +39,6 @@ public:
* @param[in] gpu GPU target
*/
CLGEMMReshapedOnlyRHSKernelConfigurationBifrost(GPUTarget gpu);
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- CLGEMMReshapedOnlyRHSKernelConfigurationBifrost(const CLGEMMReshapedOnlyRHSKernelConfigurationBifrost &) = delete;
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- CLGEMMReshapedOnlyRHSKernelConfigurationBifrost &operator=(const CLGEMMReshapedOnlyRHSKernelConfigurationBifrost &) = delete;
- /** Default Move Constructor. */
- CLGEMMReshapedOnlyRHSKernelConfigurationBifrost(CLGEMMReshapedOnlyRHSKernelConfigurationBifrost &&) = default;
- /** Default move assignment operator */
- CLGEMMReshapedOnlyRHSKernelConfigurationBifrost &operator=(CLGEMMReshapedOnlyRHSKernelConfigurationBifrost &&) = default;
// Inherited overridden method
std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure(unsigned int m, unsigned int n, unsigned int k, unsigned int b, DataType data_type) override;
diff --git a/arm_compute/core/CL/gemm/reshaped_only_rhs/CLGEMMReshapedOnlyRHSKernelConfigurationValhall.h b/arm_compute/core/CL/gemm/reshaped_only_rhs/CLGEMMReshapedOnlyRHSKernelConfigurationValhall.h
index 7541802776..6dba6fdb00 100644
--- a/arm_compute/core/CL/gemm/reshaped_only_rhs/CLGEMMReshapedOnlyRHSKernelConfigurationValhall.h
+++ b/arm_compute/core/CL/gemm/reshaped_only_rhs/CLGEMMReshapedOnlyRHSKernelConfigurationValhall.h
@@ -39,14 +39,6 @@ public:
* @param[in] gpu GPU target
*/
CLGEMMReshapedOnlyRHSKernelConfigurationValhall(GPUTarget gpu);
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- CLGEMMReshapedOnlyRHSKernelConfigurationValhall(const CLGEMMReshapedOnlyRHSKernelConfigurationValhall &) = delete;
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- CLGEMMReshapedOnlyRHSKernelConfigurationValhall &operator=(const CLGEMMReshapedOnlyRHSKernelConfigurationValhall &) = delete;
- /** Default Move Constructor. */
- CLGEMMReshapedOnlyRHSKernelConfigurationValhall(CLGEMMReshapedOnlyRHSKernelConfigurationValhall &&) = default;
- /** Default move assignment operator */
- CLGEMMReshapedOnlyRHSKernelConfigurationValhall &operator=(CLGEMMReshapedOnlyRHSKernelConfigurationValhall &&) = default;
// Inherited overridden method
std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure(unsigned int m, unsigned int n, unsigned int k, unsigned int b, DataType data_type) override;
diff --git a/arm_compute/core/CL/kernels/CLGEMMLowpMatrixMultiplyKernel.h b/arm_compute/core/CL/kernels/CLGEMMLowpMatrixMultiplyKernel.h
deleted file mode 100644
index e926f5ed36..0000000000
--- a/arm_compute/core/CL/kernels/CLGEMMLowpMatrixMultiplyKernel.h
+++ /dev/null
@@ -1,101 +0,0 @@
-/*
- * Copyright (c) 2017-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CLGEMMLOWPMATRIXMULTIPLYKERNEL_H
-#define ARM_COMPUTE_CLGEMMLOWPMATRIXMULTIPLYKERNEL_H
-
-#include "arm_compute/core/CL/ICLKernel.h"
-
-namespace arm_compute
-{
-class ICLTensor;
-
-/** OpenCL kernel to multiply matrices
- *
- * @note This kernel should be used ONLY for Midgard architectures
- *
- * This kernel performs the following computation:
- *
- * -# Convert a values from int8 to int32
- * -# Convert b values from int8 to int32
- * -# Compute the int32 matrix product of the resulting a * b and store the result as int32
- *
- */
-class CLGEMMLowpMatrixMultiplyKernel : public ICLKernel
-{
-public:
- /** Default Constructor */
- CLGEMMLowpMatrixMultiplyKernel();
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- CLGEMMLowpMatrixMultiplyKernel(const CLGEMMLowpMatrixMultiplyKernel &) = delete;
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- CLGEMMLowpMatrixMultiplyKernel &operator=(const CLGEMMLowpMatrixMultiplyKernel &) = delete;
- /** Allow instances of this class to be moved */
- CLGEMMLowpMatrixMultiplyKernel(CLGEMMLowpMatrixMultiplyKernel &&) = default;
- /** Allow instances of this class to be moved */
- CLGEMMLowpMatrixMultiplyKernel &operator=(CLGEMMLowpMatrixMultiplyKernel &&) = default;
- /** Initialise the kernel's input and output.
- *
- * @note This kernel should be used ONLY for Midgard architectures
- *
- * @param[in] input0 Input tensor containing the LHS matrix. Data type supported: QASYMM8
- * @param[in] input1 Input tensor containing the RHS matrix. Data type supported: same as @p input0
- * @param[out] output Output tensor to store the result of matrix multiplication. Data type supported: S32
- * @param[in] gemm_info (Optional) GEMM information used to retrieve the original dimensions of the input matrices
- */
- void configure(const ICLTensor *input0, const ICLTensor *input1, ICLTensor *output, const GEMMReshapeInfo &gemm_info = GEMMReshapeInfo());
- /** Initialise the kernel's input and output.
- *
- * @note This kernel should be used ONLY for Midgard architectures
- *
- * @param[in] compile_context The compile context to be used.
- * @param[in] input0 Input tensor containing the LHS matrix. Data type supported: QASYMM8
- * @param[in] input1 Input tensor containing the RHS matrix. Data type supported: same as @p input0
- * @param[out] output Output tensor to store the result of matrix multiplication. Data type supported: S32
- * @param[in] gemm_info (Optional) GEMM information used to retrieve the original dimensions of the input matrices
- */
- void configure(CLCompileContext &compile_context, const ICLTensor *input0, const ICLTensor *input1, ICLTensor *output, const GEMMReshapeInfo &gemm_info = GEMMReshapeInfo());
- /** Static function to check if given info will lead to a valid configuration of @ref CLGEMMLowpMatrixMultiplyKernel
- *
- * @param[in] input0 Input tensor containing the LHS matrix. Data type supported: QASYMM8
- * @param[in] input1 Input tensor containing the RHS matrix. Data type supported: same as @p input0
- * @param[in] output Output tensor to store the result of matrix multiplication. Data type supported: S32
- * @param[in] gemm_info (Optional) GEMM information used to retrieve the original dimensions of the input matrices
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *input0, const ITensorInfo *input1, const ITensorInfo *output, const GEMMReshapeInfo &gemm_info = GEMMReshapeInfo());
-
- // Inherited methods overridden:
- void run(const Window &window, cl::CommandQueue &queue) override;
-
-private:
- const ICLTensor *_input0;
- const ICLTensor *_input1;
- ICLTensor *_output;
- bool _slide_matrix_b;
- bool _reinterpret_input_as_3d;
- bool _reinterpret_output_as_3d;
-};
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_CLGEMMLOWPMATRIXMULTIPLYKERNEL_H*/
diff --git a/arm_compute/core/CL/kernels/CLGEMMLowpOffsetContributionKernel.h b/arm_compute/core/CL/kernels/CLGEMMLowpOffsetContributionKernel.h
index d7266b2805..f9ec558d85 100644
--- a/arm_compute/core/CL/kernels/CLGEMMLowpOffsetContributionKernel.h
+++ b/arm_compute/core/CL/kernels/CLGEMMLowpOffsetContributionKernel.h
@@ -30,9 +30,9 @@ namespace arm_compute
{
class ICLTensor;
-/** OpenCL kernel used to add the offset contribution after @ref CLGEMMLowpMatrixMultiplyKernel. The computation is performed in-place
+/** OpenCL kernel used to add the offset contribution after the matrix multiplication. The computation is performed in-place
*
- * This kernel takes a final int32 accumulator value (the output of @ref CLGEMMLowpMatrixMultiplyKernel),
+ * This kernel takes a final int32 accumulator value (the output of the matrix multiplication),
* and adds to it the offset contribution of matrix A and matrix B in-place.
*
* The final result is:
@@ -58,7 +58,7 @@ public:
CLGEMMLowpOffsetContributionKernel &operator=(CLGEMMLowpOffsetContributionKernel &&) = default;
/** Initialise the kernel's input and output.
*
- * @param[in, out] mm_result Input tensor containing the result of @ref CLGEMMLowpMatrixMultiplyKernel. Data type supported: S32
+ * @param[in, out] mm_result Input tensor containing the result of the matrix multiplication. Data type supported: S32
* @param[in] vector_sum_col Input row-vector of sums of all the entries in each column of matrix B.
* Note: vector_sum_col can be a nullptr in case a_offset = 0. Data type supported: same as @p mm_result
* @param[in] vector_sum_row Input row-vector of sums of all the entries in each row of matrix A.
@@ -73,7 +73,7 @@ public:
/** Initialise the kernel's input and output.
*
* @param[in] compile_context The compile context to be used.
- * @param[in, out] mm_result Input tensor containing the result of @ref CLGEMMLowpMatrixMultiplyKernel. Data type supported: S32
+ * @param[in, out] mm_result Input tensor containing the result of the matrix multiplication. Data type supported: S32
* @param[in] vector_sum_col Input row-vector of sums of all the entries in each column of matrix B.
* Note: vector_sum_col can be a nullptr in case a_offset = 0. Data type supported: same as @p mm_result
* @param[in] vector_sum_row Input row-vector of sums of all the entries in each row of matrix A.
diff --git a/arm_compute/core/CL/kernels/CLGEMMLowpOffsetContributionOutputStageKernel.h b/arm_compute/core/CL/kernels/CLGEMMLowpOffsetContributionOutputStageKernel.h
index 02ed20e5af..032539b699 100644
--- a/arm_compute/core/CL/kernels/CLGEMMLowpOffsetContributionOutputStageKernel.h
+++ b/arm_compute/core/CL/kernels/CLGEMMLowpOffsetContributionOutputStageKernel.h
@@ -30,9 +30,9 @@ namespace arm_compute
{
class ICLTensor;
-/** OpenCL kernel used to add the offset contribution after @ref CLGEMMLowpMatrixMultiplyKernel and perform the output stage.
+/** OpenCL kernel used to add the offset contribution after the matrix multiplication and perform the output stage.
*
- * This kernel takes a final int32 accumulator value (the output of @ref CLGEMMLowpMatrixMultiplyKernel), adds to it the offset contribution
+ * This kernel takes a final int32 accumulator value (the output of the matrix multiplication), adds to it the offset contribution
* of matrix A and matrix B and performs the output stage defined by the output_stage argument
*
* @note For quantized computations the output data type for auto-initialization must be passed as part of the @ref GEMMLowpOutputStageInfo.
@@ -52,7 +52,7 @@ public:
CLGEMMLowpOffsetContributionOutputStageKernel &operator=(CLGEMMLowpOffsetContributionOutputStageKernel &&) = default;
/** Initialise the kernel's input and output.
*
- * @param[in] mm_result Input tensor containing the result of @ref CLGEMMLowpMatrixMultiplyKernel. Data type supported: S32
+ * @param[in] mm_result Input tensor containing the result of the matrix multiplication. Data type supported: S32
* @param[in] vector_sum_col Input row-vector of sums of all the entries in each column of matrix B.
* Note: vector_sum_col can be a nullptr in case a_offset = 0. Data type supported: same as @p mm_result
* @param[in] vector_sum_row Input row-vector of sums of all the entries in each row of matrix A.
@@ -74,7 +74,7 @@ public:
/** Initialise the kernel's input and output.
*
* @param[in] compile_context The compile context to be used.
- * @param[in] mm_result Input tensor containing the result of @ref CLGEMMLowpMatrixMultiplyKernel. Data type supported: S32
+ * @param[in] mm_result Input tensor containing the result of the matrix multiplication. Data type supported: S32
* @param[in] vector_sum_col Input row-vector of sums of all the entries in each column of matrix B.
* Note: vector_sum_col can be a nullptr in case a_offset = 0. Data type supported: same as @p mm_result
* @param[in] vector_sum_row Input row-vector of sums of all the entries in each row of matrix A.
diff --git a/arm_compute/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ScaleByFloatKernel.h b/arm_compute/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ScaleByFloatKernel.h
index 0b5b22cafc..dd85d8a97c 100644
--- a/arm_compute/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ScaleByFloatKernel.h
+++ b/arm_compute/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ScaleByFloatKernel.h
@@ -33,7 +33,7 @@ class ICLTensor;
/** OpenCL kernel used to quantize down the int32 accumulator values of GEMMLowp to QASYMM8/QASYMM8_SIGNED
*
- * This kernel takes a final int32 accumulator value (the output of @ref CLGEMMLowpMatrixMultiplyKernel), and processes it to obtain the final QASYMM8/QASYMM8_SIGNED value.
+ * This kernel takes a final int32 accumulator value (the output of the matrix multiplication), and processes it to obtain the final QASYMM8/QASYMM8_SIGNED value.
* The following computations will be performed by the kernel:
*
* -# Compute fixed point multiplication between each entry of input by result_fixedpoint_multiplier
diff --git a/arm_compute/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ScaleKernel.h b/arm_compute/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ScaleKernel.h
index 0d7d1c3390..f36076dfa2 100644
--- a/arm_compute/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ScaleKernel.h
+++ b/arm_compute/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ScaleKernel.h
@@ -32,7 +32,7 @@ class ICLTensor;
/** OpenCL kernel used to quantize down the int32 accumulator values of GEMMLowp to QASYMM8/QASYMM8_SIGNED
*
- * This kernel takes a final int32 accumulator value (the output of @ref CLGEMMLowpMatrixMultiplyKernel), and processes it to obtain the final QASYMM8/QASYMM8_SIGNED value.
+ * This kernel takes a final int32 accumulator value (the output of the matrix multiplication), and processes it to obtain the final QASYMM8/QASYMM8_SIGNED value.
* The following computations will be performed by the kernel:
*
* -# Add offset terms to final result
diff --git a/arm_compute/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel.h b/arm_compute/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel.h
index 2845d9259e..36cd7bf693 100644
--- a/arm_compute/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel.h
+++ b/arm_compute/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel.h
@@ -32,7 +32,7 @@ class ICLTensor;
/** CL kernel used to quantize down the int32 accumulator values of GEMMLowp to QSYMM16
*
- * This kernel takes a final int32 accumulator value (the output of @ref CLGEMMLowpMatrixMultiplyKernel), and processes it to obtain the final QSYMM16 value.
+ * This kernel takes a final int32 accumulator value (the output of the matrix multiplication), and processes it to obtain the final QSYMM16 value.
* The following computations will be performed by the kernel:
*
* -# Compute fixed point multiplication between each entry of input by result_fixedpoint_multiplier
diff --git a/arm_compute/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel.h b/arm_compute/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel.h
index a768b6fba0..fd95e00d5d 100644
--- a/arm_compute/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel.h
+++ b/arm_compute/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel.h
@@ -32,7 +32,7 @@ class ICLTensor;
/** OpenCL kernel used to quantize down the int32 accumulator values of GEMMLowp to QASYMM8_SIGNED
*
- * This kernel takes a final int32 accumulator value (the output of @ref CLGEMMLowpMatrixMultiplyKernel), and processes it to obtain the final QASYMM8_SIGNED value.
+ * This kernel takes a final int32 accumulator value (the output of the matrix multiplication), and processes it to obtain the final QASYMM8_SIGNED value.
* The following computations will be performed by the kernel:
*
* -# Compute fixed point multiplication between each entry of input by result_fixedpoint_multiplier
diff --git a/arm_compute/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel.h b/arm_compute/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel.h
index e319c32c78..1714a02f76 100644
--- a/arm_compute/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel.h
+++ b/arm_compute/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel.h
@@ -32,7 +32,7 @@ class ICLTensor;
/** OpenCL kernel used to quantize down the int32 accumulator values of GEMMLowp to QASYMM8
*
- * This kernel takes a final int32 accumulator value (the output of @ref CLGEMMLowpMatrixMultiplyKernel), and processes it to obtain the final QASYMM8 value.
+ * This kernel takes a final int32 accumulator value (the output of the matrix multiplication), and processes it to obtain the final QASYMM8 value.
* The following computations will be performed by the kernel:
*
* -# Compute fixed point multiplication between each entry of input by result_fixedpoint_multiplier
diff --git a/arm_compute/runtime/CL/CLTypes.h b/arm_compute/runtime/CL/CLTypes.h
index f7b4ebd9b4..48697af35f 100644
--- a/arm_compute/runtime/CL/CLTypes.h
+++ b/arm_compute/runtime/CL/CLTypes.h
@@ -34,6 +34,8 @@ enum class CLGEMMKernelType
* @note This variant will be deprecated in favor of a new and configurable NATIVE variant
*/
NATIVE_V1,
+ /** Native GEMM kernel with configurable block size.*/
+ NATIVE,
/** Reshaped GEMM kernel where both lhs and rhs matrices are reshaped. Fixed block size.
* @note Temporary variant to keep compatibility with the old implementation.
* @note This variant will be deprecated in favor of RESHAPED
@@ -48,11 +50,11 @@ enum class CLGEMMKernelType
/** OpenCL GEMM kernel selection parameters. These information are retrieved to select the GEMM kernel on OpenCL */
struct CLGEMMKernelSelectionParams
{
- unsigned int m{ 0 }; /**< Number of rows for the lhs matrix. Lhs matrix NOT transposed */
- unsigned int n{ 0 }; /**< Number of columns for the rhs matrix. Rhs matrix NOT transposed */
- unsigned int k{ 0 }; /**< Number of rows for the rhs matrix. Rhs matrix NOT transposed */
- bool is_rhs_constant{ false }; /**< True if the content of the rhs matrix is constant */
- DataType data_type{DataType::UNKNOWN}; /**< Data type */
+ unsigned int m{ 0 }; /**< Number of rows for the lhs matrix. Lhs matrix NOT transposed */
+ unsigned int n{ 0 }; /**< Number of columns for the rhs matrix. Rhs matrix NOT transposed */
+ unsigned int k{ 0 }; /**< Number of rows for the rhs matrix. Rhs matrix NOT transposed */
+ bool is_rhs_constant{ false }; /**< True if the content of the rhs matrix is constant */
+ DataType data_type{ DataType::UNKNOWN }; /**< Data type */
};
} // namespace arm_compute
#endif /* ARM_COMPUTE_RUNTIME_CLTYPES_H */
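
The new NATIVE entry and the selection parameters above are what the updated int8 heuristic consumes. As an illustration only (the field values are made up; just the struct layout comes from this header), a caller could describe a quantized GEMM problem like this:

// Hedged sketch: populate the kernel-selection parameters for a quantized GEMM.
#include "arm_compute/runtime/CL/CLTypes.h"

using namespace arm_compute;

CLGEMMKernelSelectionParams params{};
params.m               = 256;               // rows of the non-transposed LHS matrix
params.n               = 512;               // columns of the non-transposed RHS matrix
params.k               = 128;               // rows of the non-transposed RHS matrix
params.is_rhs_constant = true;              // e.g. constant weights reused across runs
params.data_type       = DataType::QASYMM8; // int8 path covered by the updated heuristic
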
diff --git a/arm_compute/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.h b/arm_compute/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.h
index c9b1b70c54..b147001820 100644
--- a/arm_compute/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.h
+++ b/arm_compute/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.h
@@ -25,7 +25,6 @@
#define ARM_COMPUTE_CLGEMMLOWPMATRIXMULTIPLYCORE_H
#include "arm_compute/core/CL/kernels/CLDepthConvertLayerKernel.h"
-#include "arm_compute/core/CL/kernels/CLGEMMLowpMatrixMultiplyKernel.h"
#include "arm_compute/core/CL/kernels/CLGEMMLowpMatrixMultiplyNativeKernel.h"
#include "arm_compute/core/CL/kernels/CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel.h"
#include "arm_compute/core/CL/kernels/CLGEMMLowpOffsetContributionKernel.h"
@@ -41,18 +40,7 @@ namespace arm_compute
class IMemoryManager;
class ICLTensor;
-/** Basic function to execute GEMMLowpMatrixMultiplyCore on OpenCL. This function calls the following OpenCL kernels:
- *
- * -# @ref CLGEMMReshapeRHSMatrixKernel (if the output tensor is a matrix)
- * -# @ref CLGEMMLowpMatrixMultiplyKernel (if the parameter "reshape_b_only_on_first_run" of GEMMInfo is FALSE)
- * -# @ref CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel (if the parameter "reshape_b_only_on_first_run" of GEMMInfo is TRUE)
- * -# @ref CLGEMMLowpMatrixAReductionKernel (if the offset of matrix B is not 0)
- * -# @ref CLGEMMLowpMatrixBReductionKernel (if the offset of matrix A is not 0)
- * -# @ref CLGEMMLowpOffsetContributionKernel (if gemm_info.gemmlowp_output_stage == NONE)
- * -# @ref CLGEMMLowpOffsetContributionOutputStageKernel (if gemm_info.gemmlowp_output_stage != NONE)
- * -# @ref CLDepthConvertLayerKernel
- *
-*/
+/** Basic function to execute GEMMLowpMatrixMultiplyCore on OpenCL. */
class CLGEMMLowpMatrixMultiplyCore : public IFunction
{
public:
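
The simplified Doxygen comment above no longer enumerates the individual kernels; the function now picks them at configure time through the heuristics touched by this patch. A rough usage fragment follows, illustrative only and assuming the function's usual configure()/run() API and pre-allocated CL tensors, none of which appear in this hunk:

// Hedged sketch: quantized GEMM on OpenCL (tensor allocation and GEMMInfo omitted).
CLGEMMLowpMatrixMultiplyCore gemmlowp;
gemmlowp.configure(&a_qasymm8, &b_qasymm8, nullptr, &dst_s32); // a, b, optional bias, S32 output
gemmlowp.run();                                                // kernel choice follows the int8 heuristic
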
@@ -106,7 +94,6 @@ private:
// Kernels used
CLDepthConvertLayerKernel _weights_to_qasymm8;
- CLGEMMLowpMatrixMultiplyKernel _mm_midgard_kernel;
CLGEMMLowpMatrixMultiplyNativeKernel _mm_native_kernel;
CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel _mm_reshaped_only_rhs_kernel;
CLGEMMReshapeRHSMatrixKernel _mtx_b_reshape_kernel;
@@ -132,7 +119,6 @@ private:
int32_t _a_offset;
int32_t _b_offset;
bool _is_gemm_reshaped;
- bool _is_midgard;
bool _reshape_b_only_on_first_run;
bool _is_prepared;
bool _run_output_stage;