author     Gian Marco Iodice <gianmarco.iodice@arm.com>    2020-04-15 11:42:15 +0100
committer  Gian Marco Iodice <gianmarco.iodice@arm.com>    2020-04-20 13:04:42 +0000
commit     eb65f6da695ac0d3e495817145cceb1c4de4f048 (patch)
tree       1e4980ba6d6ce2d738670c2ebadf4e24ebd172ce /arm_compute
parent     47a899017e67556ffffef78571c9be61dd7bc3f0 (diff)
download   ComputeLibrary-eb65f6da695ac0d3e495817145cceb1c4de4f048.tar.gz
COMPMID-3304: Update OpenCL GEMM heuristic for Int8
Change-Id: I6b7ff678d8d0437a1639db2ff602ea1cdb155464
Signed-off-by: Gian Marco Iodice <gianmarco.iodice@arm.com>
Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/3056
Tested-by: Arm Jenkins <bsgcomp@arm.com>
Reviewed-by: Georgios Pinitas <georgios.pinitas@arm.com>
Comments-Addressed: Arm Jenkins <bsgcomp@arm.com>
Diffstat (limited to 'arm_compute')
-rw-r--r--  arm_compute/core/CL/CLKernels.h                                                               |   1
-rw-r--r--  arm_compute/core/CL/gemm/native/CLGEMMNativeKernelConfiguration.h                            |   4
-rw-r--r--  arm_compute/core/CL/gemm/native/CLGEMMNativeKernelConfigurationBifrost.h                     |   8
-rw-r--r--  arm_compute/core/CL/gemm/native/CLGEMMNativeKernelConfigurationMidgard.h                     |  51
-rw-r--r--  arm_compute/core/CL/gemm/native/CLGEMMNativeKernelConfigurationValhall.h                     |   8
-rw-r--r--  arm_compute/core/CL/gemm/reshaped/CLGEMMReshapedKernelConfiguration.h                        |   2
-rw-r--r--  arm_compute/core/CL/gemm/reshaped/CLGEMMReshapedKernelConfigurationBifrost.h                 |   8
-rw-r--r--  arm_compute/core/CL/gemm/reshaped/CLGEMMReshapedKernelConfigurationValhall.h                 |   8
-rw-r--r--  arm_compute/core/CL/gemm/reshaped_only_rhs/CLGEMMReshapedOnlyRHSKernelConfiguration.h        |   2
-rw-r--r--  arm_compute/core/CL/gemm/reshaped_only_rhs/CLGEMMReshapedOnlyRHSKernelConfigurationBifrost.h |   8
-rw-r--r--  arm_compute/core/CL/gemm/reshaped_only_rhs/CLGEMMReshapedOnlyRHSKernelConfigurationValhall.h |   8
-rw-r--r--  arm_compute/core/CL/kernels/CLGEMMLowpMatrixMultiplyKernel.h                                 | 101
-rw-r--r--  arm_compute/core/CL/kernels/CLGEMMLowpOffsetContributionKernel.h                             |   8
-rw-r--r--  arm_compute/core/CL/kernels/CLGEMMLowpOffsetContributionOutputStageKernel.h                  |   8
-rw-r--r--  arm_compute/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ScaleByFloatKernel.h                  |   2
-rw-r--r--  arm_compute/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ScaleKernel.h                         |   2
-rw-r--r--  arm_compute/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel.h      |   2
-rw-r--r--  arm_compute/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel.h       |   2
-rw-r--r--  arm_compute/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel.h      |   2
-rw-r--r--  arm_compute/runtime/CL/CLTypes.h                                                              |  12
-rw-r--r--  arm_compute/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.h                              |  16
21 files changed, 74 insertions, 189 deletions
diff --git a/arm_compute/core/CL/CLKernels.h b/arm_compute/core/CL/CLKernels.h
index 583cf270e2..cd26399390 100644
--- a/arm_compute/core/CL/CLKernels.h
+++ b/arm_compute/core/CL/CLKernels.h
@@ -73,7 +73,6 @@
#include "arm_compute/core/CL/kernels/CLFlattenLayerKernel.h"
#include "arm_compute/core/CL/kernels/CLFloorKernel.h"
#include "arm_compute/core/CL/kernels/CLFuseBatchNormalizationKernel.h"
-#include "arm_compute/core/CL/kernels/CLGEMMLowpMatrixMultiplyKernel.h"
#include "arm_compute/core/CL/kernels/CLGEMMLowpMatrixMultiplyNativeKernel.h"
#include "arm_compute/core/CL/kernels/CLGEMMLowpMatrixMultiplyReshapedKernel.h"
#include "arm_compute/core/CL/kernels/CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel.h"
diff --git a/arm_compute/core/CL/gemm/native/CLGEMMNativeKernelConfiguration.h b/arm_compute/core/CL/gemm/native/CLGEMMNativeKernelConfiguration.h
index fced41b261..a6341e5094 100644
--- a/arm_compute/core/CL/gemm/native/CLGEMMNativeKernelConfiguration.h
+++ b/arm_compute/core/CL/gemm/native/CLGEMMNativeKernelConfiguration.h
@@ -26,6 +26,7 @@
#include "arm_compute/core/CL/ICLGEMMKernelConfiguration.h"
#include "arm_compute/core/CL/gemm/native/CLGEMMNativeKernelConfigurationBifrost.h"
+#include "arm_compute/core/CL/gemm/native/CLGEMMNativeKernelConfigurationMidgard.h"
#include "arm_compute/core/CL/gemm/native/CLGEMMNativeKernelConfigurationValhall.h"
#include <memory>
@@ -49,12 +50,11 @@ public:
switch(get_arch_from_target(gpu))
{
case GPUTarget::MIDGARD:
+ return support::cpp14::make_unique<CLGEMMNativeKernelConfigurationMidgard>(gpu);
case GPUTarget::BIFROST:
return support::cpp14::make_unique<CLGEMMNativeKernelConfigurationBifrost>(gpu);
- break;
case GPUTarget::VALHALL:
return support::cpp14::make_unique<CLGEMMNativeKernelConfigurationValhall>(gpu);
- break;
default:
ARM_COMPUTE_ERROR("Not supported GPU target");
}
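
The factory above dispatches purely on the GPU architecture, so Midgard devices now get their own native-GEMM heuristic instead of sharing the Bifrost one. As a minimal usage sketch (not part of this commit; it assumes the static create() factory wrapping the switch shown here, and the configure() signature declared in these headers):

// Hedged sketch: pick the native GEMM block-size configuration for a Midgard GPU
// and a quantized int8 problem. Only the dispatch and configure() signature come
// from this change; the surrounding code is illustrative.
#include "arm_compute/core/CL/gemm/native/CLGEMMNativeKernelConfiguration.h"

using namespace arm_compute;
using namespace arm_compute::cl_gemm;

std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> pick_native_config()
{
    // MIDGARD now maps to the dedicated CLGEMMNativeKernelConfigurationMidgard heuristic.
    auto config = CLGEMMNativeKernelConfiguration::create(GPUTarget::MIDGARD);

    // m, n, k, batch size and data type drive the chosen LHS/RHS block shapes.
    return config->configure(/* m */ 64, /* n */ 64, /* k */ 128, /* b */ 1, DataType::QASYMM8);
}
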
diff --git a/arm_compute/core/CL/gemm/native/CLGEMMNativeKernelConfigurationBifrost.h b/arm_compute/core/CL/gemm/native/CLGEMMNativeKernelConfigurationBifrost.h
index 29b8e08a80..5b2abe6f0f 100644
--- a/arm_compute/core/CL/gemm/native/CLGEMMNativeKernelConfigurationBifrost.h
+++ b/arm_compute/core/CL/gemm/native/CLGEMMNativeKernelConfigurationBifrost.h
@@ -39,14 +39,6 @@ public:
* @param[in] gpu GPU target
*/
CLGEMMNativeKernelConfigurationBifrost(GPUTarget gpu);
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- CLGEMMNativeKernelConfigurationBifrost(const CLGEMMNativeKernelConfigurationBifrost &) = delete;
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- CLGEMMNativeKernelConfigurationBifrost &operator=(const CLGEMMNativeKernelConfigurationBifrost &) = delete;
- /** Default Move Constructor. */
- CLGEMMNativeKernelConfigurationBifrost(CLGEMMNativeKernelConfigurationBifrost &&) = default;
- /** Default move assignment operator */
- CLGEMMNativeKernelConfigurationBifrost &operator=(CLGEMMNativeKernelConfigurationBifrost &&) = default;
// Inherited overridden method
std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure(unsigned int m, unsigned int n, unsigned int k, unsigned int b, DataType data_type) override;
diff --git a/arm_compute/core/CL/gemm/native/CLGEMMNativeKernelConfigurationMidgard.h b/arm_compute/core/CL/gemm/native/CLGEMMNativeKernelConfigurationMidgard.h
new file mode 100644
index 0000000000..0e95a15613
--- /dev/null
+++ b/arm_compute/core/CL/gemm/native/CLGEMMNativeKernelConfigurationMidgard.h
@@ -0,0 +1,51 @@
+/*
+ * Copyright (c) 2020 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CLGEMMNATIVEKERNELCONFIGURATIONMIDGARD_H
+#define ARM_COMPUTE_CLGEMMNATIVEKERNELCONFIGURATIONMIDGARD_H
+
+#include "arm_compute/core/CL/ICLGEMMKernelConfiguration.h"
+
+namespace arm_compute
+{
+namespace cl_gemm
+{
+/** Midgard based OpenCL GEMMNative configuration */
+class CLGEMMNativeKernelConfigurationMidgard final : public ICLGEMMKernelConfiguration
+{
+public:
+ /** Constructor
+ *
+ * @param[in] gpu GPU target
+ */
+ CLGEMMNativeKernelConfigurationMidgard(GPUTarget gpu);
+
+ // Inherited overridden method
+ std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure(unsigned int m, unsigned int n, unsigned int k, unsigned int b, DataType data_type) override;
+
+private:
+ std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> default_q8(unsigned int m, unsigned int n, unsigned int k, unsigned int b);
+};
+} // namespace cl_gemm
+} // namespace arm_compute
+#endif /*ARM_COMPUTE_CLGEMMNATIVEKERNELCONFIGURATIONMIDGARD_H */
diff --git a/arm_compute/core/CL/gemm/native/CLGEMMNativeKernelConfigurationValhall.h b/arm_compute/core/CL/gemm/native/CLGEMMNativeKernelConfigurationValhall.h
index f6a61a24b8..e739997b3a 100644
--- a/arm_compute/core/CL/gemm/native/CLGEMMNativeKernelConfigurationValhall.h
+++ b/arm_compute/core/CL/gemm/native/CLGEMMNativeKernelConfigurationValhall.h
@@ -39,14 +39,6 @@ public:
* @param[in] gpu GPU target
*/
CLGEMMNativeKernelConfigurationValhall(GPUTarget gpu);
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- CLGEMMNativeKernelConfigurationValhall(const CLGEMMNativeKernelConfigurationValhall &) = delete;
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- CLGEMMNativeKernelConfigurationValhall &operator=(const CLGEMMNativeKernelConfigurationValhall &) = delete;
- /** Default Move Constructor. */
- CLGEMMNativeKernelConfigurationValhall(CLGEMMNativeKernelConfigurationValhall &&) = default;
- /** Default move assignment operator */
- CLGEMMNativeKernelConfigurationValhall &operator=(CLGEMMNativeKernelConfigurationValhall &&) = default;
// Inherited overridden method
std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure(unsigned int m, unsigned int n, unsigned int k, unsigned int b, DataType data_type) override;
diff --git a/arm_compute/core/CL/gemm/reshaped/CLGEMMReshapedKernelConfiguration.h b/arm_compute/core/CL/gemm/reshaped/CLGEMMReshapedKernelConfiguration.h
index e960d64964..10dc9aefdb 100644
--- a/arm_compute/core/CL/gemm/reshaped/CLGEMMReshapedKernelConfiguration.h
+++ b/arm_compute/core/CL/gemm/reshaped/CLGEMMReshapedKernelConfiguration.h
@@ -51,10 +51,8 @@ public:
case GPUTarget::MIDGARD:
case GPUTarget::BIFROST:
return support::cpp14::make_unique<CLGEMMReshapedKernelConfigurationBifrost>(gpu);
- break;
case GPUTarget::VALHALL:
return support::cpp14::make_unique<CLGEMMReshapedKernelConfigurationValhall>(gpu);
- break;
default:
ARM_COMPUTE_ERROR("Not supported GPU target");
}
diff --git a/arm_compute/core/CL/gemm/reshaped/CLGEMMReshapedKernelConfigurationBifrost.h b/arm_compute/core/CL/gemm/reshaped/CLGEMMReshapedKernelConfigurationBifrost.h
index c6ece758b9..55742e3e56 100644
--- a/arm_compute/core/CL/gemm/reshaped/CLGEMMReshapedKernelConfigurationBifrost.h
+++ b/arm_compute/core/CL/gemm/reshaped/CLGEMMReshapedKernelConfigurationBifrost.h
@@ -39,14 +39,6 @@ public:
* @param[in] gpu GPU target
*/
CLGEMMReshapedKernelConfigurationBifrost(GPUTarget gpu);
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- CLGEMMReshapedKernelConfigurationBifrost(const CLGEMMReshapedKernelConfigurationBifrost &) = delete;
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- CLGEMMReshapedKernelConfigurationBifrost &operator=(const CLGEMMReshapedKernelConfigurationBifrost &) = delete;
- /** Default Move Constructor. */
- CLGEMMReshapedKernelConfigurationBifrost(CLGEMMReshapedKernelConfigurationBifrost &&) = default;
- /** Default move assignment operator */
- CLGEMMReshapedKernelConfigurationBifrost &operator=(CLGEMMReshapedKernelConfigurationBifrost &&) = default;
// Inherited overridden method
std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure(unsigned int m, unsigned int n, unsigned int k, unsigned int b, DataType data_type) override;
diff --git a/arm_compute/core/CL/gemm/reshaped/CLGEMMReshapedKernelConfigurationValhall.h b/arm_compute/core/CL/gemm/reshaped/CLGEMMReshapedKernelConfigurationValhall.h
index 0dd2a2c38f..e65974144d 100644
--- a/arm_compute/core/CL/gemm/reshaped/CLGEMMReshapedKernelConfigurationValhall.h
+++ b/arm_compute/core/CL/gemm/reshaped/CLGEMMReshapedKernelConfigurationValhall.h
@@ -39,14 +39,6 @@ public:
* @param[in] gpu GPU target
*/
CLGEMMReshapedKernelConfigurationValhall(GPUTarget gpu);
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- CLGEMMReshapedKernelConfigurationValhall(const CLGEMMReshapedKernelConfigurationValhall &) = delete;
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- CLGEMMReshapedKernelConfigurationValhall &operator=(const CLGEMMReshapedKernelConfigurationValhall &) = delete;
- /** Default Move Constructor. */
- CLGEMMReshapedKernelConfigurationValhall(CLGEMMReshapedKernelConfigurationValhall &&) = default;
- /** Default move assignment operator */
- CLGEMMReshapedKernelConfigurationValhall &operator=(CLGEMMReshapedKernelConfigurationValhall &&) = default;
// Inherited overridden method
std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure(unsigned int m, unsigned int n, unsigned int k, unsigned int b, DataType data_type) override;
diff --git a/arm_compute/core/CL/gemm/reshaped_only_rhs/CLGEMMReshapedOnlyRHSKernelConfiguration.h b/arm_compute/core/CL/gemm/reshaped_only_rhs/CLGEMMReshapedOnlyRHSKernelConfiguration.h
index 683e39f3c1..7909726164 100644
--- a/arm_compute/core/CL/gemm/reshaped_only_rhs/CLGEMMReshapedOnlyRHSKernelConfiguration.h
+++ b/arm_compute/core/CL/gemm/reshaped_only_rhs/CLGEMMReshapedOnlyRHSKernelConfiguration.h
@@ -51,10 +51,8 @@ public:
case GPUTarget::MIDGARD:
case GPUTarget::BIFROST:
return support::cpp14::make_unique<CLGEMMReshapedOnlyRHSKernelConfigurationBifrost>(gpu);
- break;
case GPUTarget::VALHALL:
return support::cpp14::make_unique<CLGEMMReshapedOnlyRHSKernelConfigurationValhall>(gpu);
- break;
default:
ARM_COMPUTE_ERROR("Not supported GPU target");
}
diff --git a/arm_compute/core/CL/gemm/reshaped_only_rhs/CLGEMMReshapedOnlyRHSKernelConfigurationBifrost.h b/arm_compute/core/CL/gemm/reshaped_only_rhs/CLGEMMReshapedOnlyRHSKernelConfigurationBifrost.h
index ff351b6a06..044bdc7b18 100644
--- a/arm_compute/core/CL/gemm/reshaped_only_rhs/CLGEMMReshapedOnlyRHSKernelConfigurationBifrost.h
+++ b/arm_compute/core/CL/gemm/reshaped_only_rhs/CLGEMMReshapedOnlyRHSKernelConfigurationBifrost.h
@@ -39,14 +39,6 @@ public:
* @param[in] gpu GPU target
*/
CLGEMMReshapedOnlyRHSKernelConfigurationBifrost(GPUTarget gpu);
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- CLGEMMReshapedOnlyRHSKernelConfigurationBifrost(const CLGEMMReshapedOnlyRHSKernelConfigurationBifrost &) = delete;
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- CLGEMMReshapedOnlyRHSKernelConfigurationBifrost &operator=(const CLGEMMReshapedOnlyRHSKernelConfigurationBifrost &) = delete;
- /** Default Move Constructor. */
- CLGEMMReshapedOnlyRHSKernelConfigurationBifrost(CLGEMMReshapedOnlyRHSKernelConfigurationBifrost &&) = default;
- /** Default move assignment operator */
- CLGEMMReshapedOnlyRHSKernelConfigurationBifrost &operator=(CLGEMMReshapedOnlyRHSKernelConfigurationBifrost &&) = default;
// Inherited overridden method
std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure(unsigned int m, unsigned int n, unsigned int k, unsigned int b, DataType data_type) override;
diff --git a/arm_compute/core/CL/gemm/reshaped_only_rhs/CLGEMMReshapedOnlyRHSKernelConfigurationValhall.h b/arm_compute/core/CL/gemm/reshaped_only_rhs/CLGEMMReshapedOnlyRHSKernelConfigurationValhall.h
index 7541802776..6dba6fdb00 100644
--- a/arm_compute/core/CL/gemm/reshaped_only_rhs/CLGEMMReshapedOnlyRHSKernelConfigurationValhall.h
+++ b/arm_compute/core/CL/gemm/reshaped_only_rhs/CLGEMMReshapedOnlyRHSKernelConfigurationValhall.h
@@ -39,14 +39,6 @@ public:
* @param[in] gpu GPU target
*/
CLGEMMReshapedOnlyRHSKernelConfigurationValhall(GPUTarget gpu);
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- CLGEMMReshapedOnlyRHSKernelConfigurationValhall(const CLGEMMReshapedOnlyRHSKernelConfigurationValhall &) = delete;
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- CLGEMMReshapedOnlyRHSKernelConfigurationValhall &operator=(const CLGEMMReshapedOnlyRHSKernelConfigurationValhall &) = delete;
- /** Default Move Constructor. */
- CLGEMMReshapedOnlyRHSKernelConfigurationValhall(CLGEMMReshapedOnlyRHSKernelConfigurationValhall &&) = default;
- /** Default move assignment operator */
- CLGEMMReshapedOnlyRHSKernelConfigurationValhall &operator=(CLGEMMReshapedOnlyRHSKernelConfigurationValhall &&) = default;
// Inherited overridden method
std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure(unsigned int m, unsigned int n, unsigned int k, unsigned int b, DataType data_type) override;
diff --git a/arm_compute/core/CL/kernels/CLGEMMLowpMatrixMultiplyKernel.h b/arm_compute/core/CL/kernels/CLGEMMLowpMatrixMultiplyKernel.h
deleted file mode 100644
index e926f5ed36..0000000000
--- a/arm_compute/core/CL/kernels/CLGEMMLowpMatrixMultiplyKernel.h
+++ /dev/null
@@ -1,101 +0,0 @@
-/*
- * Copyright (c) 2017-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CLGEMMLOWPMATRIXMULTIPLYKERNEL_H
-#define ARM_COMPUTE_CLGEMMLOWPMATRIXMULTIPLYKERNEL_H
-
-#include "arm_compute/core/CL/ICLKernel.h"
-
-namespace arm_compute
-{
-class ICLTensor;
-
-/** OpenCL kernel to multiply matrices
- *
- * @note This kernel should be used ONLY for Midgard architectures
- *
- * This kernel performs the following computation:
- *
- * -# Convert a values from int8 to int32
- * -# Convert b values from int8 to int32
- * -# Compute the int32 matrix product of the resulting a * b and store the result as int32
- *
- */
-class CLGEMMLowpMatrixMultiplyKernel : public ICLKernel
-{
-public:
- /** Default Constructor */
- CLGEMMLowpMatrixMultiplyKernel();
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- CLGEMMLowpMatrixMultiplyKernel(const CLGEMMLowpMatrixMultiplyKernel &) = delete;
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- CLGEMMLowpMatrixMultiplyKernel &operator=(const CLGEMMLowpMatrixMultiplyKernel &) = delete;
- /** Allow instances of this class to be moved */
- CLGEMMLowpMatrixMultiplyKernel(CLGEMMLowpMatrixMultiplyKernel &&) = default;
- /** Allow instances of this class to be moved */
- CLGEMMLowpMatrixMultiplyKernel &operator=(CLGEMMLowpMatrixMultiplyKernel &&) = default;
- /** Initialise the kernel's input and output.
- *
- * @note This kernel should be used ONLY for Midgard architectures
- *
- * @param[in] input0 Input tensor containing the LHS matrix. Data type supported: QASYMM8
- * @param[in] input1 Input tensor containing the RHS matrix. Data type supported: same as @p input0
- * @param[out] output Output tensor to store the result of matrix multiplication. Data type supported: S32
- * @param[in] gemm_info (Optional) GEMM information used to retrieve the original dimensions of the input matrices
- */
- void configure(const ICLTensor *input0, const ICLTensor *input1, ICLTensor *output, const GEMMReshapeInfo &gemm_info = GEMMReshapeInfo());
- /** Initialise the kernel's input and output.
- *
- * @note This kernel should be used ONLY for Midgard architectures
- *
- * @param[in] compile_context The compile context to be used.
- * @param[in] input0 Input tensor containing the LHS matrix. Data type supported: QASYMM8
- * @param[in] input1 Input tensor containing the RHS matrix. Data type supported: same as @p input0
- * @param[out] output Output tensor to store the result of matrix multiplication. Data type supported: S32
- * @param[in] gemm_info (Optional) GEMM information used to retrieve the original dimensions of the input matrices
- */
- void configure(CLCompileContext &compile_context, const ICLTensor *input0, const ICLTensor *input1, ICLTensor *output, const GEMMReshapeInfo &gemm_info = GEMMReshapeInfo());
- /** Static function to check if given info will lead to a valid configuration of @ref CLGEMMLowpMatrixMultiplyKernel
- *
- * @param[in] input0 Input tensor containing the LHS matrix. Data type supported: QASYMM8
- * @param[in] input1 Input tensor containing the RHS matrix. Data type supported: same as @p input0
- * @param[in] output Output tensor to store the result of matrix multiplication. Data type supported: S32
- * @param[in] gemm_info (Optional) GEMM information used to retrieve the original dimensions of the input matrices
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *input0, const ITensorInfo *input1, const ITensorInfo *output, const GEMMReshapeInfo &gemm_info = GEMMReshapeInfo());
-
- // Inherited methods overridden:
- void run(const Window &window, cl::CommandQueue &queue) override;
-
-private:
- const ICLTensor *_input0;
- const ICLTensor *_input1;
- ICLTensor *_output;
- bool _slide_matrix_b;
- bool _reinterpret_input_as_3d;
- bool _reinterpret_output_as_3d;
-};
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_CLGEMMLOWPMATRIXMULTIPLYKERNEL_H*/
diff --git a/arm_compute/core/CL/kernels/CLGEMMLowpOffsetContributionKernel.h b/arm_compute/core/CL/kernels/CLGEMMLowpOffsetContributionKernel.h
index d7266b2805..f9ec558d85 100644
--- a/arm_compute/core/CL/kernels/CLGEMMLowpOffsetContributionKernel.h
+++ b/arm_compute/core/CL/kernels/CLGEMMLowpOffsetContributionKernel.h
@@ -30,9 +30,9 @@ namespace arm_compute
{
class ICLTensor;
-/** OpenCL kernel used to add the offset contribution after @ref CLGEMMLowpMatrixMultiplyKernel. The computation is performed in-place
+/** OpenCL kernel used to add the offset contribution after the matrix multiplication. The computation is performed in-place
*
- * This kernel takes a final int32 accumulator value (the output of @ref CLGEMMLowpMatrixMultiplyKernel),
+ * This kernel takes a final int32 accumulator value (the output of the matrix multiplication),
* and adds to it the offset contribution of matrix A and matrix B in-place.
*
* The final result is:
@@ -58,7 +58,7 @@ public:
CLGEMMLowpOffsetContributionKernel &operator=(CLGEMMLowpOffsetContributionKernel &&) = default;
/** Initialise the kernel's input and output.
*
- * @param[in, out] mm_result Input tensor containing the result of @ref CLGEMMLowpMatrixMultiplyKernel. Data type supported: S32
+ * @param[in, out] mm_result Input tensor containing the result of the matrix multiplication. Data type supported: S32
* @param[in] vector_sum_col Input row-vector of sums of all the entries in each column of matrix B.
* Note: vector_sum_col can be a nullptr in case a_offset = 0. Data type supported: same as @p mm_result
* @param[in] vector_sum_row Input row-vector of sums of all the entries in each row of matrix A.
@@ -73,7 +73,7 @@ public:
/** Initialise the kernel's input and output.
*
* @param[in] compile_context The compile context to be used.
- * @param[in, out] mm_result Input tensor containing the result of @ref CLGEMMLowpMatrixMultiplyKernel. Data type supported: S32
+ * @param[in, out] mm_result Input tensor containing the result of the matrix multiplication. Data type supported: S32
* @param[in] vector_sum_col Input row-vector of sums of all the entries in each column of matrix B.
* Note: vector_sum_col can be a nullptr in case a_offset = 0. Data type supported: same as @p mm_result
* @param[in] vector_sum_row Input row-vector of sums of all the entries in each row of matrix A.
diff --git a/arm_compute/core/CL/kernels/CLGEMMLowpOffsetContributionOutputStageKernel.h b/arm_compute/core/CL/kernels/CLGEMMLowpOffsetContributionOutputStageKernel.h
index 02ed20e5af..032539b699 100644
--- a/arm_compute/core/CL/kernels/CLGEMMLowpOffsetContributionOutputStageKernel.h
+++ b/arm_compute/core/CL/kernels/CLGEMMLowpOffsetContributionOutputStageKernel.h
@@ -30,9 +30,9 @@ namespace arm_compute
{
class ICLTensor;
-/** OpenCL kernel used to add the offset contribution after @ref CLGEMMLowpMatrixMultiplyKernel and perform the output stage.
+/** OpenCL kernel used to add the offset contribution after the matrix multiplication and perform the output stage.
*
- * This kernel takes a final int32 accumulator value (the output of @ref CLGEMMLowpMatrixMultiplyKernel), adds to it the offset contribution
+ * This kernel takes a final int32 accumulator value (the output of the matrix multiplication), adds to it the offset contribution
* of matrix A and matrix B and performs the output stage defined by the output_stage argument
*
* @note For quantized computations the output data type for auto-initialization must be passed as part of the @ref GEMMLowpOutputStageInfo.
@@ -52,7 +52,7 @@ public:
CLGEMMLowpOffsetContributionOutputStageKernel &operator=(CLGEMMLowpOffsetContributionOutputStageKernel &&) = default;
/** Initialise the kernel's input and output.
*
- * @param[in] mm_result Input tensor containing the result of @ref CLGEMMLowpMatrixMultiplyKernel. Data type supported: S32
+ * @param[in] mm_result Input tensor containing the result of the matrix multiplication. Data type supported: S32
* @param[in] vector_sum_col Input row-vector of sums of all the entries in each column of matrix B.
* Note: vector_sum_col can be a nullptr in case a_offset = 0. Data type supported: same as @p mm_result
* @param[in] vector_sum_row Input row-vector of sums of all the entries in each row of matrix A.
@@ -74,7 +74,7 @@ public:
/** Initialise the kernel's input and output.
*
* @param[in] compile_context The compile context to be used.
- * @param[in] mm_result Input tensor containing the result of @ref CLGEMMLowpMatrixMultiplyKernel. Data type supported: S32
+ * @param[in] mm_result Input tensor containing the result of the matrix multiplication. Data type supported: S32
* @param[in] vector_sum_col Input row-vector of sums of all the entries in each column of matrix B.
* Note: vector_sum_col can be a nullptr in case a_offset = 0. Data type supported: same as @p mm_result
* @param[in] vector_sum_row Input row-vector of sums of all the entries in each row of matrix A.
diff --git a/arm_compute/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ScaleByFloatKernel.h b/arm_compute/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ScaleByFloatKernel.h
index 0b5b22cafc..dd85d8a97c 100644
--- a/arm_compute/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ScaleByFloatKernel.h
+++ b/arm_compute/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ScaleByFloatKernel.h
@@ -33,7 +33,7 @@ class ICLTensor;
/** OpenCL kernel used to quantize down the int32 accumulator values of GEMMLowp to QASYMM8/QASYMM8_SIGNED
*
- * This kernel takes a final int32 accumulator value (the output of @ref CLGEMMLowpMatrixMultiplyKernel), and processes it to obtain the final QASYMM8/QASYMM8_SIGNED value.
+ * This kernel takes a final int32 accumulator value (the output of the matrix multiplication), and processes it to obtain the final QASYMM8/QASYMM8_SIGNED value.
* The following computations will be performed by the kernel:
*
* -# Compute fixed point multiplication between each entry of input by result_fixedpoint_multiplier
diff --git a/arm_compute/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ScaleKernel.h b/arm_compute/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ScaleKernel.h
index 0d7d1c3390..f36076dfa2 100644
--- a/arm_compute/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ScaleKernel.h
+++ b/arm_compute/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ScaleKernel.h
@@ -32,7 +32,7 @@ class ICLTensor;
/** OpenCL kernel used to quantize down the int32 accumulator values of GEMMLowp to QASYMM8/QASYMM8_SIGNED
*
- * This kernel takes a final int32 accumulator value (the output of @ref CLGEMMLowpMatrixMultiplyKernel), and processes it to obtain the final QASYMM8/QASYMM8_SIGNED value.
+ * This kernel takes a final int32 accumulator value (the output of the matrix multiplication), and processes it to obtain the final QASYMM8/QASYMM8_SIGNED value.
* The following computations will be performed by the kernel:
*
* -# Add offset terms to final result
diff --git a/arm_compute/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel.h b/arm_compute/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel.h
index 2845d9259e..36cd7bf693 100644
--- a/arm_compute/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel.h
+++ b/arm_compute/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel.h
@@ -32,7 +32,7 @@ class ICLTensor;
/** CL kernel used to quantize down the int32 accumulator values of GEMMLowp to QSYMM16
*
- * This kernel takes a final int32 accumulator value (the output of @ref CLGEMMLowpMatrixMultiplyKernel), and processes it to obtain the final QSYMM16 value.
+ * This kernel takes a final int32 accumulator value (the output of the matrix multiplication), and processes it to obtain the final QSYMM16 value.
* The following computations will be performed by the kernel:
*
* -# Compute fixed point multiplication between each entry of input by result_fixedpoint_multiplier
diff --git a/arm_compute/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel.h b/arm_compute/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel.h
index a768b6fba0..fd95e00d5d 100644
--- a/arm_compute/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel.h
+++ b/arm_compute/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel.h
@@ -32,7 +32,7 @@ class ICLTensor;
/** OpenCL kernel used to quantize down the int32 accumulator values of GEMMLowp to QASYMM8_SIGNED
*
- * This kernel takes a final int32 accumulator value (the output of @ref CLGEMMLowpMatrixMultiplyKernel), and processes it to obtain the final QASYMM8_SIGNED value.
+ * This kernel takes a final int32 accumulator value (the output of the matrix multiplication), and processes it to obtain the final QASYMM8_SIGNED value.
* The following computations will be performed by the kernel:
*
* -# Compute fixed point multiplication between each entry of input by result_fixedpoint_multiplier
diff --git a/arm_compute/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel.h b/arm_compute/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel.h
index e319c32c78..1714a02f76 100644
--- a/arm_compute/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel.h
+++ b/arm_compute/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel.h
@@ -32,7 +32,7 @@ class ICLTensor;
/** OpenCL kernel used to quantize down the int32 accumulator values of GEMMLowp to QASYMM8
*
- * This kernel takes a final int32 accumulator value (the output of @ref CLGEMMLowpMatrixMultiplyKernel), and processes it to obtain the final QASYMM8 value.
+ * This kernel takes a final int32 accumulator value (the output of the matrix multiplication), and processes it to obtain the final QASYMM8 value.
* The following computations will be performed by the kernel:
*
* -# Compute fixed point multiplication between each entry of input by result_fixedpoint_multiplier
diff --git a/arm_compute/runtime/CL/CLTypes.h b/arm_compute/runtime/CL/CLTypes.h
index f7b4ebd9b4..48697af35f 100644
--- a/arm_compute/runtime/CL/CLTypes.h
+++ b/arm_compute/runtime/CL/CLTypes.h
@@ -34,6 +34,8 @@ enum class CLGEMMKernelType
* @note This variant will be deprecated in favor of a new and configurable NATIVE variant
*/
NATIVE_V1,
+ /** Native GEMM kernel with configurable block size.*/
+ NATIVE,
/** Reshaped GEMM kernel where both lhs and rhs matrices are reshaped. Fixed block size.
* @note Temporary variant to keep compatibility with the old implementation.
* @note This variant will be deprecated in favor of RESHAPED
@@ -48,11 +50,11 @@ enum class CLGEMMKernelType
/** OpenCL GEMM kernel selection parameters. These information are retrieved to select the GEMM kernel on OpenCL */
struct CLGEMMKernelSelectionParams
{
- unsigned int m{ 0 }; /**< Number of rows for the lhs matrix. Lhs matrix NOT transposed */
- unsigned int n{ 0 }; /**< Number of columns for the rhs matrix. Rhs matrix NOT transposed */
- unsigned int k{ 0 }; /**< Number of rows for the rhs matrix. Rhs matrix NOT transposed */
- bool is_rhs_constant{ false }; /**< True if the content of the rhs matrix is constant */
- DataType data_type{DataType::UNKNOWN}; /**< Data type */
+ unsigned int m{ 0 }; /**< Number of rows for the lhs matrix. Lhs matrix NOT transposed */
+ unsigned int n{ 0 }; /**< Number of columns for the rhs matrix. Rhs matrix NOT transposed */
+ unsigned int k{ 0 }; /**< Number of rows for the rhs matrix. Rhs matrix NOT transposed */
+ bool is_rhs_constant{ false }; /**< True if the content of the rhs matrix is constant */
+ DataType data_type{ DataType::UNKNOWN }; /**< Data type */
};
} // namespace arm_compute
#endif /* ARM_COMPUTE_RUNTIME_CLTYPES_H */
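
The new NATIVE entry and the selection parameters above are what the updated int8 heuristic consumes. As an illustration only (the field values are made up; just the struct layout comes from this header), a caller could describe a quantized GEMM problem like this:

// Hedged sketch: populate the kernel-selection parameters for a quantized GEMM.
#include "arm_compute/runtime/CL/CLTypes.h"

using namespace arm_compute;

CLGEMMKernelSelectionParams params{};
params.m               = 256;               // rows of the non-transposed LHS matrix
params.n               = 512;               // columns of the non-transposed RHS matrix
params.k               = 128;               // rows of the non-transposed RHS matrix
params.is_rhs_constant = true;              // e.g. constant weights reused across runs
params.data_type       = DataType::QASYMM8; // int8 path covered by the updated heuristic
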
diff --git a/arm_compute/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.h b/arm_compute/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.h
index c9b1b70c54..b147001820 100644
--- a/arm_compute/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.h
+++ b/arm_compute/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.h
@@ -25,7 +25,6 @@
#define ARM_COMPUTE_CLGEMMLOWPMATRIXMULTIPLYCORE_H
#include "arm_compute/core/CL/kernels/CLDepthConvertLayerKernel.h"
-#include "arm_compute/core/CL/kernels/CLGEMMLowpMatrixMultiplyKernel.h"
#include "arm_compute/core/CL/kernels/CLGEMMLowpMatrixMultiplyNativeKernel.h"
#include "arm_compute/core/CL/kernels/CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel.h"
#include "arm_compute/core/CL/kernels/CLGEMMLowpOffsetContributionKernel.h"
@@ -41,18 +40,7 @@ namespace arm_compute
class IMemoryManager;
class ICLTensor;
-/** Basic function to execute GEMMLowpMatrixMultiplyCore on OpenCL. This function calls the following OpenCL kernels:
- *
- * -# @ref CLGEMMReshapeRHSMatrixKernel (if the output tensor is a matrix)
- * -# @ref CLGEMMLowpMatrixMultiplyKernel (if the parameter "reshape_b_only_on_first_run" of GEMMInfo is FALSE)
- * -# @ref CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel (if the parameter "reshape_b_only_on_first_run" of GEMMInfo is TRUE)
- * -# @ref CLGEMMLowpMatrixAReductionKernel (if the offset of matrix B is not 0)
- * -# @ref CLGEMMLowpMatrixBReductionKernel (if the offset of matrix A is not 0)
- * -# @ref CLGEMMLowpOffsetContributionKernel (if gemm_info.gemmlowp_output_stage == NONE)
- * -# @ref CLGEMMLowpOffsetContributionOutputStageKernel (if gemm_info.gemmlowp_output_stage != NONE)
- * -# @ref CLDepthConvertLayerKernel
- *
-*/
+/** Basic function to execute GEMMLowpMatrixMultiplyCore on OpenCL. */
class CLGEMMLowpMatrixMultiplyCore : public IFunction
{
public:
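
The simplified Doxygen comment above no longer enumerates the individual kernels; the function now picks them at configure time through the heuristics touched by this patch. A rough usage fragment follows, illustrative only and assuming the function's usual configure()/run() API and pre-allocated CL tensors, none of which appear in this hunk:

// Hedged sketch: quantized GEMM on OpenCL (tensor allocation and GEMMInfo omitted).
CLGEMMLowpMatrixMultiplyCore gemmlowp;
gemmlowp.configure(&a_qasymm8, &b_qasymm8, nullptr, &dst_s32); // a, b, optional bias, S32 output
gemmlowp.run();                                                // kernel choice follows the int8 heuristic
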
@@ -106,7 +94,6 @@ private:
// Kernels used
CLDepthConvertLayerKernel _weights_to_qasymm8;
- CLGEMMLowpMatrixMultiplyKernel _mm_midgard_kernel;
CLGEMMLowpMatrixMultiplyNativeKernel _mm_native_kernel;
CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel _mm_reshaped_only_rhs_kernel;
CLGEMMReshapeRHSMatrixKernel _mtx_b_reshape_kernel;
@@ -132,7 +119,6 @@ private:
int32_t _a_offset;
int32_t _b_offset;
bool _is_gemm_reshaped;
- bool _is_midgard;
bool _reshape_b_only_on_first_run;
bool _is_prepared;
bool _run_output_stage;