author      Gian Marco Iodice <gianmarco.iodice@arm.com>    2020-04-15 11:42:15 +0100
committer   Gian Marco Iodice <gianmarco.iodice@arm.com>    2020-04-20 13:04:42 +0000
commit      eb65f6da695ac0d3e495817145cceb1c4de4f048 (patch)
tree        1e4980ba6d6ce2d738670c2ebadf4e24ebd172ce
parent      47a899017e67556ffffef78571c9be61dd7bc3f0 (diff)
download    ComputeLibrary-eb65f6da695ac0d3e495817145cceb1c4de4f048.tar.gz
COMPMID-3304: Update OpenCL GEMM heuristic for Int8
Change-Id: I6b7ff678d8d0437a1639db2ff602ea1cdb155464
Signed-off-by: Gian Marco Iodice <gianmarco.iodice@arm.com>
Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/3056
Tested-by: Arm Jenkins <bsgcomp@arm.com>
Reviewed-by: Georgios Pinitas <georgios.pinitas@arm.com>
Comments-Addressed: Arm Jenkins <bsgcomp@arm.com>
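For context, the int8 path this heuristic update affects is driven through CLGEMMLowpMatrixMultiplyCore, which picks the GEMM kernel variant internally. A minimal sketch of a caller exercising that path; the shapes and quantization values below are illustrative, not taken from the patch:

#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
#include "arm_compute/runtime/CL/CLTensor.h"
#include "arm_compute/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.h"

using namespace arm_compute;

int main()
{
    CLScheduler::get().default_init();

    // Illustrative problem: M=64, N=128, K=256; QASYMM8 inputs, S32 accumulators.
    CLTensor a, b, dst;
    a.allocator()->init(TensorInfo(TensorShape(256U, 64U), 1, DataType::QASYMM8, QuantizationInfo(0.5f, 10)));
    b.allocator()->init(TensorInfo(TensorShape(128U, 256U), 1, DataType::QASYMM8, QuantizationInfo(0.25f, 5)));
    dst.allocator()->init(TensorInfo(TensorShape(128U, 64U), 1, DataType::S32));

    // The kernel type (e.g. NATIVE vs RESHAPED_ONLY_RHS) is selected internally
    // by the per-architecture heuristics updated in this patch.
    CLGEMMLowpMatrixMultiplyCore gemmlowp;
    gemmlowp.configure(&a, &b, nullptr, &dst);

    a.allocator()->allocate();
    b.allocator()->allocate();
    dst.allocator()->allocate();

    gemmlowp.run();
    CLScheduler::get().sync();
    return 0;
}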
-rw-r--r--  Android.bp | 2
-rw-r--r--  arm_compute/core/CL/CLKernels.h | 1
-rw-r--r--  arm_compute/core/CL/gemm/native/CLGEMMNativeKernelConfiguration.h | 4
-rw-r--r--  arm_compute/core/CL/gemm/native/CLGEMMNativeKernelConfigurationBifrost.h | 8
-rw-r--r--  arm_compute/core/CL/gemm/native/CLGEMMNativeKernelConfigurationMidgard.h | 51
-rw-r--r--  arm_compute/core/CL/gemm/native/CLGEMMNativeKernelConfigurationValhall.h | 8
-rw-r--r--  arm_compute/core/CL/gemm/reshaped/CLGEMMReshapedKernelConfiguration.h | 2
-rw-r--r--  arm_compute/core/CL/gemm/reshaped/CLGEMMReshapedKernelConfigurationBifrost.h | 8
-rw-r--r--  arm_compute/core/CL/gemm/reshaped/CLGEMMReshapedKernelConfigurationValhall.h | 8
-rw-r--r--  arm_compute/core/CL/gemm/reshaped_only_rhs/CLGEMMReshapedOnlyRHSKernelConfiguration.h | 2
-rw-r--r--  arm_compute/core/CL/gemm/reshaped_only_rhs/CLGEMMReshapedOnlyRHSKernelConfigurationBifrost.h | 8
-rw-r--r--  arm_compute/core/CL/gemm/reshaped_only_rhs/CLGEMMReshapedOnlyRHSKernelConfigurationValhall.h | 8
-rw-r--r--  arm_compute/core/CL/kernels/CLGEMMLowpMatrixMultiplyKernel.h | 101
-rw-r--r--  arm_compute/core/CL/kernels/CLGEMMLowpOffsetContributionKernel.h | 8
-rw-r--r--  arm_compute/core/CL/kernels/CLGEMMLowpOffsetContributionOutputStageKernel.h | 8
-rw-r--r--  arm_compute/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ScaleByFloatKernel.h | 2
-rw-r--r--  arm_compute/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ScaleKernel.h | 2
-rw-r--r--  arm_compute/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel.h | 2
-rw-r--r--  arm_compute/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel.h | 2
-rw-r--r--  arm_compute/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel.h | 2
-rw-r--r--  arm_compute/runtime/CL/CLTypes.h | 12
-rw-r--r--  arm_compute/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.h | 16
-rw-r--r--  docs/00_introduction.dox | 2
-rw-r--r--  src/core/CL/CLKernelLibrary.cpp | 1
-rw-r--r--  src/core/CL/cl_kernels/gemmlowp.cl | 285
-rw-r--r--  src/core/CL/gemm/native/CLGEMMNativeKernelConfigurationBifrost.cpp | 45
-rw-r--r--  src/core/CL/gemm/native/CLGEMMNativeKernelConfigurationMidgard.cpp | 75
-rw-r--r--  src/core/CL/gemm/native/CLGEMMNativeKernelConfigurationValhall.cpp | 7
-rw-r--r--  src/core/CL/gemm/reshaped/CLGEMMReshapedKernelConfigurationBifrost.cpp | 10
-rw-r--r--  src/core/CL/gemm/reshaped/CLGEMMReshapedKernelConfigurationValhall.cpp | 5
-rw-r--r--  src/core/CL/gemm/reshaped_only_rhs/CLGEMMReshapedOnlyRHSKernelConfigurationBifrost.cpp | 20
-rw-r--r--  src/core/CL/gemm/reshaped_only_rhs/CLGEMMReshapedOnlyRHSKernelConfigurationValhall.cpp | 15
-rw-r--r--  src/core/CL/kernels/CLGEMMLowpMatrixMultiplyKernel.cpp | 307
-rw-r--r--  src/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.cpp | 110
-rw-r--r--  src/runtime/CL/gemm/CLGEMMKernelSelectionBifrost.cpp | 20
-rw-r--r--  src/runtime/CL/gemm/CLGEMMKernelSelectionMidgard.cpp | 5
-rw-r--r--  src/runtime/CL/gemm/CLGEMMKernelSelectionValhall.cpp | 13
37 files changed, 283 insertions(+), 902 deletions(-)
diff --git a/Android.bp b/Android.bp
index 7a04eec29f..b53c46482a 100644
--- a/Android.bp
+++ b/Android.bp
@@ -68,6 +68,7 @@ cc_library_static {
"src/core/CL/OpenCL.cpp",
"src/core/CL/gemm/CLGEMMHelpers.cpp",
"src/core/CL/gemm/native/CLGEMMNativeKernelConfigurationBifrost.cpp",
+ "src/core/CL/gemm/native/CLGEMMNativeKernelConfigurationMidgard.cpp",
"src/core/CL/gemm/native/CLGEMMNativeKernelConfigurationValhall.cpp",
"src/core/CL/gemm/reshaped/CLGEMMReshapedKernelConfigurationBifrost.cpp",
"src/core/CL/gemm/reshaped/CLGEMMReshapedKernelConfigurationValhall.cpp",
@@ -121,7 +122,6 @@ cc_library_static {
"src/core/CL/kernels/CLFlattenLayerKernel.cpp",
"src/core/CL/kernels/CLFloorKernel.cpp",
"src/core/CL/kernels/CLFuseBatchNormalizationKernel.cpp",
- "src/core/CL/kernels/CLGEMMLowpMatrixMultiplyKernel.cpp",
"src/core/CL/kernels/CLGEMMLowpMatrixMultiplyNativeKernel.cpp",
"src/core/CL/kernels/CLGEMMLowpMatrixMultiplyReshapedKernel.cpp",
"src/core/CL/kernels/CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel.cpp",
diff --git a/arm_compute/core/CL/CLKernels.h b/arm_compute/core/CL/CLKernels.h
index 583cf270e2..cd26399390 100644
--- a/arm_compute/core/CL/CLKernels.h
+++ b/arm_compute/core/CL/CLKernels.h
@@ -73,7 +73,6 @@
#include "arm_compute/core/CL/kernels/CLFlattenLayerKernel.h"
#include "arm_compute/core/CL/kernels/CLFloorKernel.h"
#include "arm_compute/core/CL/kernels/CLFuseBatchNormalizationKernel.h"
-#include "arm_compute/core/CL/kernels/CLGEMMLowpMatrixMultiplyKernel.h"
#include "arm_compute/core/CL/kernels/CLGEMMLowpMatrixMultiplyNativeKernel.h"
#include "arm_compute/core/CL/kernels/CLGEMMLowpMatrixMultiplyReshapedKernel.h"
#include "arm_compute/core/CL/kernels/CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel.h"
diff --git a/arm_compute/core/CL/gemm/native/CLGEMMNativeKernelConfiguration.h b/arm_compute/core/CL/gemm/native/CLGEMMNativeKernelConfiguration.h
index fced41b261..a6341e5094 100644
--- a/arm_compute/core/CL/gemm/native/CLGEMMNativeKernelConfiguration.h
+++ b/arm_compute/core/CL/gemm/native/CLGEMMNativeKernelConfiguration.h
@@ -26,6 +26,7 @@
#include "arm_compute/core/CL/ICLGEMMKernelConfiguration.h"
#include "arm_compute/core/CL/gemm/native/CLGEMMNativeKernelConfigurationBifrost.h"
+#include "arm_compute/core/CL/gemm/native/CLGEMMNativeKernelConfigurationMidgard.h"
#include "arm_compute/core/CL/gemm/native/CLGEMMNativeKernelConfigurationValhall.h"
#include <memory>
@@ -49,12 +50,11 @@ public:
switch(get_arch_from_target(gpu))
{
case GPUTarget::MIDGARD:
+ return support::cpp14::make_unique<CLGEMMNativeKernelConfigurationMidgard>(gpu);
case GPUTarget::BIFROST:
return support::cpp14::make_unique<CLGEMMNativeKernelConfigurationBifrost>(gpu);
- break;
case GPUTarget::VALHALL:
return support::cpp14::make_unique<CLGEMMNativeKernelConfigurationValhall>(gpu);
- break;
default:
ARM_COMPUTE_ERROR("Not supported GPU target");
}
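
For reference, this factory is typically consumed as shown below; a small sketch, assuming the create() entry point from this header and an illustrative Midgard target (the T800 choice and problem size are examples, not from the patch):

// Select the per-architecture native-GEMM configuration, then query the
// LHS/RHS block descriptors for an illustrative QASYMM8 problem size.
auto config = CLGEMMNativeKernelConfiguration::create(GPUTarget::T800);
auto blocks = config->configure(64 /*m*/, 128 /*n*/, 256 /*k*/, 1 /*batches*/, DataType::QASYMM8);
const GEMMLHSMatrixInfo lhs_info = blocks.first;
const GEMMRHSMatrixInfo rhs_info = blocks.second;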
diff --git a/arm_compute/core/CL/gemm/native/CLGEMMNativeKernelConfigurationBifrost.h b/arm_compute/core/CL/gemm/native/CLGEMMNativeKernelConfigurationBifrost.h
index 29b8e08a80..5b2abe6f0f 100644
--- a/arm_compute/core/CL/gemm/native/CLGEMMNativeKernelConfigurationBifrost.h
+++ b/arm_compute/core/CL/gemm/native/CLGEMMNativeKernelConfigurationBifrost.h
@@ -39,14 +39,6 @@ public:
* @param[in] gpu GPU target
*/
CLGEMMNativeKernelConfigurationBifrost(GPUTarget gpu);
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- CLGEMMNativeKernelConfigurationBifrost(const CLGEMMNativeKernelConfigurationBifrost &) = delete;
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- CLGEMMNativeKernelConfigurationBifrost &operator=(const CLGEMMNativeKernelConfigurationBifrost &) = delete;
- /** Default Move Constructor. */
- CLGEMMNativeKernelConfigurationBifrost(CLGEMMNativeKernelConfigurationBifrost &&) = default;
- /** Default move assignment operator */
- CLGEMMNativeKernelConfigurationBifrost &operator=(CLGEMMNativeKernelConfigurationBifrost &&) = default;
// Inherited overridden method
std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure(unsigned int m, unsigned int n, unsigned int k, unsigned int b, DataType data_type) override;
diff --git a/arm_compute/core/CL/gemm/native/CLGEMMNativeKernelConfigurationMidgard.h b/arm_compute/core/CL/gemm/native/CLGEMMNativeKernelConfigurationMidgard.h
new file mode 100644
index 0000000000..0e95a15613
--- /dev/null
+++ b/arm_compute/core/CL/gemm/native/CLGEMMNativeKernelConfigurationMidgard.h
@@ -0,0 +1,51 @@
+/*
+ * Copyright (c) 2020 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CLGEMMNATIVEKERNELCONFIGURATIONMIDGARD_H
+#define ARM_COMPUTE_CLGEMMNATIVEKERNELCONFIGURATIONMIDGARD_H
+
+#include "arm_compute/core/CL/ICLGEMMKernelConfiguration.h"
+
+namespace arm_compute
+{
+namespace cl_gemm
+{
+/** Midgard based OpenCL GEMMNative configuration */
+class CLGEMMNativeKernelConfigurationMidgard final : public ICLGEMMKernelConfiguration
+{
+public:
+ /** Constructor
+ *
+ * @param[in] gpu GPU target
+ */
+ CLGEMMNativeKernelConfigurationMidgard(GPUTarget gpu);
+
+ // Inherited overridden method
+ std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure(unsigned int m, unsigned int n, unsigned int k, unsigned int b, DataType data_type) override;
+
+private:
+ std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> default_q8(unsigned int m, unsigned int n, unsigned int k, unsigned int b);
+};
+} // namespace cl_gemm
+} // namespace arm_compute
+#endif /*ARM_COMPUTE_CLGEMMNATIVEKERNELCONFIGURATIONMIDGARD_H */
diff --git a/arm_compute/core/CL/gemm/native/CLGEMMNativeKernelConfigurationValhall.h b/arm_compute/core/CL/gemm/native/CLGEMMNativeKernelConfigurationValhall.h
index f6a61a24b8..e739997b3a 100644
--- a/arm_compute/core/CL/gemm/native/CLGEMMNativeKernelConfigurationValhall.h
+++ b/arm_compute/core/CL/gemm/native/CLGEMMNativeKernelConfigurationValhall.h
@@ -39,14 +39,6 @@ public:
* @param[in] gpu GPU target
*/
CLGEMMNativeKernelConfigurationValhall(GPUTarget gpu);
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- CLGEMMNativeKernelConfigurationValhall(const CLGEMMNativeKernelConfigurationValhall &) = delete;
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- CLGEMMNativeKernelConfigurationValhall &operator=(const CLGEMMNativeKernelConfigurationValhall &) = delete;
- /** Default Move Constructor. */
- CLGEMMNativeKernelConfigurationValhall(CLGEMMNativeKernelConfigurationValhall &&) = default;
- /** Default move assignment operator */
- CLGEMMNativeKernelConfigurationValhall &operator=(CLGEMMNativeKernelConfigurationValhall &&) = default;
// Inherited overridden method
std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure(unsigned int m, unsigned int n, unsigned int k, unsigned int b, DataType data_type) override;
diff --git a/arm_compute/core/CL/gemm/reshaped/CLGEMMReshapedKernelConfiguration.h b/arm_compute/core/CL/gemm/reshaped/CLGEMMReshapedKernelConfiguration.h
index e960d64964..10dc9aefdb 100644
--- a/arm_compute/core/CL/gemm/reshaped/CLGEMMReshapedKernelConfiguration.h
+++ b/arm_compute/core/CL/gemm/reshaped/CLGEMMReshapedKernelConfiguration.h
@@ -51,10 +51,8 @@ public:
case GPUTarget::MIDGARD:
case GPUTarget::BIFROST:
return support::cpp14::make_unique<CLGEMMReshapedKernelConfigurationBifrost>(gpu);
- break;
case GPUTarget::VALHALL:
return support::cpp14::make_unique<CLGEMMReshapedKernelConfigurationValhall>(gpu);
- break;
default:
ARM_COMPUTE_ERROR("Not supported GPU target");
}
diff --git a/arm_compute/core/CL/gemm/reshaped/CLGEMMReshapedKernelConfigurationBifrost.h b/arm_compute/core/CL/gemm/reshaped/CLGEMMReshapedKernelConfigurationBifrost.h
index c6ece758b9..55742e3e56 100644
--- a/arm_compute/core/CL/gemm/reshaped/CLGEMMReshapedKernelConfigurationBifrost.h
+++ b/arm_compute/core/CL/gemm/reshaped/CLGEMMReshapedKernelConfigurationBifrost.h
@@ -39,14 +39,6 @@ public:
* @param[in] gpu GPU target
*/
CLGEMMReshapedKernelConfigurationBifrost(GPUTarget gpu);
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- CLGEMMReshapedKernelConfigurationBifrost(const CLGEMMReshapedKernelConfigurationBifrost &) = delete;
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- CLGEMMReshapedKernelConfigurationBifrost &operator=(const CLGEMMReshapedKernelConfigurationBifrost &) = delete;
- /** Default Move Constructor. */
- CLGEMMReshapedKernelConfigurationBifrost(CLGEMMReshapedKernelConfigurationBifrost &&) = default;
- /** Default move assignment operator */
- CLGEMMReshapedKernelConfigurationBifrost &operator=(CLGEMMReshapedKernelConfigurationBifrost &&) = default;
// Inherited overridden method
std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure(unsigned int m, unsigned int n, unsigned int k, unsigned int b, DataType data_type) override;
diff --git a/arm_compute/core/CL/gemm/reshaped/CLGEMMReshapedKernelConfigurationValhall.h b/arm_compute/core/CL/gemm/reshaped/CLGEMMReshapedKernelConfigurationValhall.h
index 0dd2a2c38f..e65974144d 100644
--- a/arm_compute/core/CL/gemm/reshaped/CLGEMMReshapedKernelConfigurationValhall.h
+++ b/arm_compute/core/CL/gemm/reshaped/CLGEMMReshapedKernelConfigurationValhall.h
@@ -39,14 +39,6 @@ public:
* @param[in] gpu GPU target
*/
CLGEMMReshapedKernelConfigurationValhall(GPUTarget gpu);
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- CLGEMMReshapedKernelConfigurationValhall(const CLGEMMReshapedKernelConfigurationValhall &) = delete;
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- CLGEMMReshapedKernelConfigurationValhall &operator=(const CLGEMMReshapedKernelConfigurationValhall &) = delete;
- /** Default Move Constructor. */
- CLGEMMReshapedKernelConfigurationValhall(CLGEMMReshapedKernelConfigurationValhall &&) = default;
- /** Default move assignment operator */
- CLGEMMReshapedKernelConfigurationValhall &operator=(CLGEMMReshapedKernelConfigurationValhall &&) = default;
// Inherited overridden method
std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure(unsigned int m, unsigned int n, unsigned int k, unsigned int b, DataType data_type) override;
diff --git a/arm_compute/core/CL/gemm/reshaped_only_rhs/CLGEMMReshapedOnlyRHSKernelConfiguration.h b/arm_compute/core/CL/gemm/reshaped_only_rhs/CLGEMMReshapedOnlyRHSKernelConfiguration.h
index 683e39f3c1..7909726164 100644
--- a/arm_compute/core/CL/gemm/reshaped_only_rhs/CLGEMMReshapedOnlyRHSKernelConfiguration.h
+++ b/arm_compute/core/CL/gemm/reshaped_only_rhs/CLGEMMReshapedOnlyRHSKernelConfiguration.h
@@ -51,10 +51,8 @@ public:
case GPUTarget::MIDGARD:
case GPUTarget::BIFROST:
return support::cpp14::make_unique<CLGEMMReshapedOnlyRHSKernelConfigurationBifrost>(gpu);
- break;
case GPUTarget::VALHALL:
return support::cpp14::make_unique<CLGEMMReshapedOnlyRHSKernelConfigurationValhall>(gpu);
- break;
default:
ARM_COMPUTE_ERROR("Not supported GPU target");
}
diff --git a/arm_compute/core/CL/gemm/reshaped_only_rhs/CLGEMMReshapedOnlyRHSKernelConfigurationBifrost.h b/arm_compute/core/CL/gemm/reshaped_only_rhs/CLGEMMReshapedOnlyRHSKernelConfigurationBifrost.h
index ff351b6a06..044bdc7b18 100644
--- a/arm_compute/core/CL/gemm/reshaped_only_rhs/CLGEMMReshapedOnlyRHSKernelConfigurationBifrost.h
+++ b/arm_compute/core/CL/gemm/reshaped_only_rhs/CLGEMMReshapedOnlyRHSKernelConfigurationBifrost.h
@@ -39,14 +39,6 @@ public:
* @param[in] gpu GPU target
*/
CLGEMMReshapedOnlyRHSKernelConfigurationBifrost(GPUTarget gpu);
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- CLGEMMReshapedOnlyRHSKernelConfigurationBifrost(const CLGEMMReshapedOnlyRHSKernelConfigurationBifrost &) = delete;
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- CLGEMMReshapedOnlyRHSKernelConfigurationBifrost &operator=(const CLGEMMReshapedOnlyRHSKernelConfigurationBifrost &) = delete;
- /** Default Move Constructor. */
- CLGEMMReshapedOnlyRHSKernelConfigurationBifrost(CLGEMMReshapedOnlyRHSKernelConfigurationBifrost &&) = default;
- /** Default move assignment operator */
- CLGEMMReshapedOnlyRHSKernelConfigurationBifrost &operator=(CLGEMMReshapedOnlyRHSKernelConfigurationBifrost &&) = default;
// Inherited overridden method
std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure(unsigned int m, unsigned int n, unsigned int k, unsigned int b, DataType data_type) override;
diff --git a/arm_compute/core/CL/gemm/reshaped_only_rhs/CLGEMMReshapedOnlyRHSKernelConfigurationValhall.h b/arm_compute/core/CL/gemm/reshaped_only_rhs/CLGEMMReshapedOnlyRHSKernelConfigurationValhall.h
index 7541802776..6dba6fdb00 100644
--- a/arm_compute/core/CL/gemm/reshaped_only_rhs/CLGEMMReshapedOnlyRHSKernelConfigurationValhall.h
+++ b/arm_compute/core/CL/gemm/reshaped_only_rhs/CLGEMMReshapedOnlyRHSKernelConfigurationValhall.h
@@ -39,14 +39,6 @@ public:
* @param[in] gpu GPU target
*/
CLGEMMReshapedOnlyRHSKernelConfigurationValhall(GPUTarget gpu);
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- CLGEMMReshapedOnlyRHSKernelConfigurationValhall(const CLGEMMReshapedOnlyRHSKernelConfigurationValhall &) = delete;
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- CLGEMMReshapedOnlyRHSKernelConfigurationValhall &operator=(const CLGEMMReshapedOnlyRHSKernelConfigurationValhall &) = delete;
- /** Default Move Constructor. */
- CLGEMMReshapedOnlyRHSKernelConfigurationValhall(CLGEMMReshapedOnlyRHSKernelConfigurationValhall &&) = default;
- /** Default move assignment operator */
- CLGEMMReshapedOnlyRHSKernelConfigurationValhall &operator=(CLGEMMReshapedOnlyRHSKernelConfigurationValhall &&) = default;
// Inherited overridden method
std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure(unsigned int m, unsigned int n, unsigned int k, unsigned int b, DataType data_type) override;
diff --git a/arm_compute/core/CL/kernels/CLGEMMLowpMatrixMultiplyKernel.h b/arm_compute/core/CL/kernels/CLGEMMLowpMatrixMultiplyKernel.h
deleted file mode 100644
index e926f5ed36..0000000000
--- a/arm_compute/core/CL/kernels/CLGEMMLowpMatrixMultiplyKernel.h
+++ /dev/null
@@ -1,101 +0,0 @@
-/*
- * Copyright (c) 2017-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_CLGEMMLOWPMATRIXMULTIPLYKERNEL_H
-#define ARM_COMPUTE_CLGEMMLOWPMATRIXMULTIPLYKERNEL_H
-
-#include "arm_compute/core/CL/ICLKernel.h"
-
-namespace arm_compute
-{
-class ICLTensor;
-
-/** OpenCL kernel to multiply matrices
- *
- * @note This kernel should be used ONLY for Midgard architectures
- *
- * This kernel performs the following computation:
- *
- * -# Convert a values from int8 to int32
- * -# Convert b values from int8 to int32
- * -# Compute the int32 matrix product of the resulting a * b and store the result as int32
- *
- */
-class CLGEMMLowpMatrixMultiplyKernel : public ICLKernel
-{
-public:
- /** Default Constructor */
- CLGEMMLowpMatrixMultiplyKernel();
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- CLGEMMLowpMatrixMultiplyKernel(const CLGEMMLowpMatrixMultiplyKernel &) = delete;
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- CLGEMMLowpMatrixMultiplyKernel &operator=(const CLGEMMLowpMatrixMultiplyKernel &) = delete;
- /** Allow instances of this class to be moved */
- CLGEMMLowpMatrixMultiplyKernel(CLGEMMLowpMatrixMultiplyKernel &&) = default;
- /** Allow instances of this class to be moved */
- CLGEMMLowpMatrixMultiplyKernel &operator=(CLGEMMLowpMatrixMultiplyKernel &&) = default;
- /** Initialise the kernel's input and output.
- *
- * @note This kernel should be used ONLY for Midgard architectures
- *
- * @param[in] input0 Input tensor containing the LHS matrix. Data type supported: QASYMM8
- * @param[in] input1 Input tensor containing the RHS matrix. Data type supported: same as @p input0
- * @param[out] output Output tensor to store the result of matrix multiplication. Data type supported: S32
- * @param[in] gemm_info (Optional) GEMM information used to retrieve the original dimensions of the input matrices
- */
- void configure(const ICLTensor *input0, const ICLTensor *input1, ICLTensor *output, const GEMMReshapeInfo &gemm_info = GEMMReshapeInfo());
- /** Initialise the kernel's input and output.
- *
- * @note This kernel should be used ONLY for Midgard architectures
- *
- * @param[in] compile_context The compile context to be used.
- * @param[in] input0 Input tensor containing the LHS matrix. Data type supported: QASYMM8
- * @param[in] input1 Input tensor containing the RHS matrix. Data type supported: same as @p input0
- * @param[out] output Output tensor to store the result of matrix multiplication. Data type supported: S32
- * @param[in] gemm_info (Optional) GEMM information used to retrieve the original dimensions of the input matrices
- */
- void configure(CLCompileContext &compile_context, const ICLTensor *input0, const ICLTensor *input1, ICLTensor *output, const GEMMReshapeInfo &gemm_info = GEMMReshapeInfo());
- /** Static function to check if given info will lead to a valid configuration of @ref CLGEMMLowpMatrixMultiplyKernel
- *
- * @param[in] input0 Input tensor containing the LHS matrix. Data type supported: QASYMM8
- * @param[in] input1 Input tensor containing the RHS matrix. Data type supported: same as @p input0
- * @param[in] output Output tensor to store the result of matrix multiplication. Data type supported: S32
- * @param[in] gemm_info (Optional) GEMM information used to retrieve the original dimensions of the input matrices
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *input0, const ITensorInfo *input1, const ITensorInfo *output, const GEMMReshapeInfo &gemm_info = GEMMReshapeInfo());
-
- // Inherited methods overridden:
- void run(const Window &window, cl::CommandQueue &queue) override;
-
-private:
- const ICLTensor *_input0;
- const ICLTensor *_input1;
- ICLTensor *_output;
- bool _slide_matrix_b;
- bool _reinterpret_input_as_3d;
- bool _reinterpret_output_as_3d;
-};
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_CLGEMMLOWPMATRIXMULTIPLYKERNEL_H*/
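
With this header deleted, the quantized multiply is wired through CLGEMMLowpMatrixMultiplyNativeKernel instead. A hedged sketch of that replacement wiring, assuming the native kernel's configure() accepts the block descriptors plus a GEMMReshapeInfo; lhs, rhs, dst, m, n, k and batches are placeholders:

// Assumed wiring, not library-verbatim: lhs/rhs are QASYMM8 CLTensors, dst is S32.
auto cfg = CLGEMMNativeKernelConfiguration::create(CLScheduler::get().target())
               ->configure(m, n, k, batches, DataType::QASYMM8);

CLGEMMLowpMatrixMultiplyNativeKernel mm_kernel;
mm_kernel.configure(&lhs, &rhs, &dst, cfg.first, cfg.second, GEMMReshapeInfo(m, n, k));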
diff --git a/arm_compute/core/CL/kernels/CLGEMMLowpOffsetContributionKernel.h b/arm_compute/core/CL/kernels/CLGEMMLowpOffsetContributionKernel.h
index d7266b2805..f9ec558d85 100644
--- a/arm_compute/core/CL/kernels/CLGEMMLowpOffsetContributionKernel.h
+++ b/arm_compute/core/CL/kernels/CLGEMMLowpOffsetContributionKernel.h
@@ -30,9 +30,9 @@ namespace arm_compute
{
class ICLTensor;
-/** OpenCL kernel used to add the offset contribution after @ref CLGEMMLowpMatrixMultiplyKernel. The computation is performed in-place
+/** OpenCL kernel used to add the offset contribution after the matrix multiplication. The computation is performed in-place
*
- * This kernel takes a final int32 accumulator value (the output of @ref CLGEMMLowpMatrixMultiplyKernel),
+ * This kernel takes a final int32 accumulator value (the output of the matrix multiplication),
* and adds to it the offset contribution of matrix A and matrix B in-place.
*
* The final result is:
@@ -58,7 +58,7 @@ public:
CLGEMMLowpOffsetContributionKernel &operator=(CLGEMMLowpOffsetContributionKernel &&) = default;
/** Initialise the kernel's input and output.
*
- * @param[in, out] mm_result Input tensor containing the result of @ref CLGEMMLowpMatrixMultiplyKernel. Data type supported: S32
+ * @param[in, out] mm_result Input tensor containing the result of the matrix multiplication. Data type supported: S32
* @param[in] vector_sum_col Input row-vector of sums of all the entries in each column of matrix B.
* Note: vector_sum_col can be a nullptr in case a_offset = 0. Data type supported: same as @p mm_result
* @param[in] vector_sum_row Input row-vector of sums of all the entries in each row of matrix A.
@@ -73,7 +73,7 @@ public:
/** Initialise the kernel's input and output.
*
* @param[in] compile_context The compile context to be used.
- * @param[in, out] mm_result Input tensor containing the result of @ref CLGEMMLowpMatrixMultiplyKernel. Data type supported: S32
+ * @param[in, out] mm_result Input tensor containing the result of the matrix multiplication. Data type supported: S32
* @param[in] vector_sum_col Input row-vector of sums of all the entries in each column of matrix B.
* Note: vector_sum_col can be a nullptr in case a_offset = 0. Data type supported: same as @p mm_result
* @param[in] vector_sum_row Input row-vector of sums of all the entries in each row of matrix A.
diff --git a/arm_compute/core/CL/kernels/CLGEMMLowpOffsetContributionOutputStageKernel.h b/arm_compute/core/CL/kernels/CLGEMMLowpOffsetContributionOutputStageKernel.h
index 02ed20e5af..032539b699 100644
--- a/arm_compute/core/CL/kernels/CLGEMMLowpOffsetContributionOutputStageKernel.h
+++ b/arm_compute/core/CL/kernels/CLGEMMLowpOffsetContributionOutputStageKernel.h
@@ -30,9 +30,9 @@ namespace arm_compute
{
class ICLTensor;
-/** OpenCL kernel used to add the offset contribution after @ref CLGEMMLowpMatrixMultiplyKernel and perform the output stage.
+/** OpenCL kernel used to add the offset contribution after the matrix multiplication and perform the output stage.
*
- * This kernel takes a final int32 accumulator value (the output of @ref CLGEMMLowpMatrixMultiplyKernel), adds to it the offset contribution
+ * This kernel takes a final int32 accumulator value (the output of the matrix multiplication), adds to it the offset contribution
* of matrix A and matrix B and performs the output stage defined by the output_stage argument
*
* @note For quantized computations the output data type for auto-initialization must be passed as part of the @ref GEMMLowpOutputStageInfo.
@@ -52,7 +52,7 @@ public:
CLGEMMLowpOffsetContributionOutputStageKernel &operator=(CLGEMMLowpOffsetContributionOutputStageKernel &&) = default;
/** Initialise the kernel's input and output.
*
- * @param[in] mm_result Input tensor containing the result of @ref CLGEMMLowpMatrixMultiplyKernel. Data type supported: S32
+ * @param[in] mm_result Input tensor containing the result of the matrix multiplication. Data type supported: S32
* @param[in] vector_sum_col Input row-vector of sums of all the entries in each column of matrix B.
* Note: vector_sum_col can be a nullptr in case a_offset = 0. Data type supported: same as @p mm_result
* @param[in] vector_sum_row Input row-vector of sums of all the entries in each row of matrix A.
@@ -74,7 +74,7 @@ public:
/** Initialise the kernel's input and output.
*
* @param[in] compile_context The compile context to be used.
- * @param[in] mm_result Input tensor containing the result of @ref CLGEMMLowpMatrixMultiplyKernel. Data type supported: S32
+ * @param[in] mm_result Input tensor containing the result of the matrix multiplication. Data type supported: S32
* @param[in] vector_sum_col Input row-vector of sums of all the entries in each column of matrix B.
* Note: vector_sum_col can be a nullptr in case a_offset = 0. Data type supported: same as @p mm_result
* @param[in] vector_sum_row Input row-vector of sums of all the entries in each row of matrix A.
diff --git a/arm_compute/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ScaleByFloatKernel.h b/arm_compute/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ScaleByFloatKernel.h
index 0b5b22cafc..dd85d8a97c 100644
--- a/arm_compute/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ScaleByFloatKernel.h
+++ b/arm_compute/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ScaleByFloatKernel.h
@@ -33,7 +33,7 @@ class ICLTensor;
/** OpenCL kernel used to quantize down the int32 accumulator values of GEMMLowp to QASYMM8/QASYMM8_SIGNED
*
- * This kernel takes a final int32 accumulator value (the output of @ref CLGEMMLowpMatrixMultiplyKernel), and processes it to obtain the final QASYMM8/QASYMM8_SIGNED value.
+ * This kernel takes a final int32 accumulator value (the output of the matrix multiplication), and processes it to obtain the final QASYMM8/QASYMM8_SIGNED value.
* The following computations will be performed by the kernel:
*
* -# Compute fixed point multiplication between each entry of input by result_fixedpoint_multiplier
diff --git a/arm_compute/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ScaleKernel.h b/arm_compute/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ScaleKernel.h
index 0d7d1c3390..f36076dfa2 100644
--- a/arm_compute/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ScaleKernel.h
+++ b/arm_compute/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ScaleKernel.h
@@ -32,7 +32,7 @@ class ICLTensor;
/** OpenCL kernel used to quantize down the int32 accumulator values of GEMMLowp to QASYMM8/QASYMM8_SIGNED
*
- * This kernel takes a final int32 accumulator value (the output of @ref CLGEMMLowpMatrixMultiplyKernel), and processes it to obtain the final QASYMM8/QASYMM8_SIGNED value.
+ * This kernel takes a final int32 accumulator value (the output of the matrix multiplication), and processes it to obtain the final QASYMM8/QASYMM8_SIGNED value.
* The following computations will be performed by the kernel:
*
* -# Add offset terms to final result
diff --git a/arm_compute/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel.h b/arm_compute/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel.h
index 2845d9259e..36cd7bf693 100644
--- a/arm_compute/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel.h
+++ b/arm_compute/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel.h
@@ -32,7 +32,7 @@ class ICLTensor;
/** CL kernel used to quantize down the int32 accumulator values of GEMMLowp to QSYMM16
*
- * This kernel takes a final int32 accumulator value (the output of @ref CLGEMMLowpMatrixMultiplyKernel), and processes it to obtain the final QSYMM16 value.
+ * This kernel takes a final int32 accumulator value (the output of the matrix multiplication), and processes it to obtain the final QSYMM16 value.
* The following computations will be performed by the kernel:
*
* -# Compute fixed point multiplication between each entry of input by result_fixedpoint_multiplier
diff --git a/arm_compute/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel.h b/arm_compute/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel.h
index a768b6fba0..fd95e00d5d 100644
--- a/arm_compute/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel.h
+++ b/arm_compute/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel.h
@@ -32,7 +32,7 @@ class ICLTensor;
/** OpenCL kernel used to quantize down the int32 accumulator values of GEMMLowp to QASYMM8_SIGNED
*
- * This kernel takes a final int32 accumulator value (the output of @ref CLGEMMLowpMatrixMultiplyKernel), and processes it to obtain the final QASYMM8_SIGNED value.
+ * This kernel takes a final int32 accumulator value (the output of the matrix multiplication), and processes it to obtain the final QASYMM8_SIGNED value.
* The following computations will be performed by the kernel:
*
* -# Compute fixed point multiplication between each entry of input by result_fixedpoint_multiplier
diff --git a/arm_compute/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel.h b/arm_compute/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel.h
index e319c32c78..1714a02f76 100644
--- a/arm_compute/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel.h
+++ b/arm_compute/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel.h
@@ -32,7 +32,7 @@ class ICLTensor;
/** OpenCL kernel used to quantize down the int32 accumulator values of GEMMLowp to QASYMM8
*
- * This kernel takes a final int32 accumulator value (the output of @ref CLGEMMLowpMatrixMultiplyKernel), and processes it to obtain the final QASYMM8 value.
+ * This kernel takes a final int32 accumulator value (the output of the matrix multiplication), and processes it to obtain the final QASYMM8 value.
* The following computations will be performed by the kernel:
*
* -# Compute fixed point multiplication between each entry of input by result_fixedpoint_multiplier
diff --git a/arm_compute/runtime/CL/CLTypes.h b/arm_compute/runtime/CL/CLTypes.h
index f7b4ebd9b4..48697af35f 100644
--- a/arm_compute/runtime/CL/CLTypes.h
+++ b/arm_compute/runtime/CL/CLTypes.h
@@ -34,6 +34,8 @@ enum class CLGEMMKernelType
* @note This variant will be deprecated in favor of a new and configurable NATIVE variant
*/
NATIVE_V1,
+ /** Native GEMM kernel with configurable block size.*/
+ NATIVE,
/** Reshaped GEMM kernel where both lhs and rhs matrices are reshaped. Fixed block size.
* @note Temporary variant to keep compatibility with the old implementation.
* @note This variant will be deprecated in favor of RESHAPED
@@ -48,11 +50,11 @@ enum class CLGEMMKernelType
/** OpenCL GEMM kernel selection parameters. This information is used to select the GEMM kernel on OpenCL */
struct CLGEMMKernelSelectionParams
{
- unsigned int m{ 0 }; /**< Number of rows for the lhs matrix. Lhs matrix NOT transposed */
- unsigned int n{ 0 }; /**< Number of columns for the rhs matrix. Rhs matrix NOT transposed */
- unsigned int k{ 0 }; /**< Number of rows for the rhs matrix. Rhs matrix NOT transposed */
- bool is_rhs_constant{ false }; /**< True if the content of the rhs matrix is constant */
- DataType data_type{DataType::UNKNOWN}; /**< Data type */
+ unsigned int m{ 0 }; /**< Number of rows for the lhs matrix. Lhs matrix NOT transposed */
+ unsigned int n{ 0 }; /**< Number of columns for the rhs matrix. Rhs matrix NOT transposed */
+ unsigned int k{ 0 }; /**< Number of rows for the rhs matrix. Rhs matrix NOT transposed */
+ bool is_rhs_constant{ false }; /**< True if the content of the rhs matrix is constant */
+ DataType data_type{ DataType::UNKNOWN }; /**< Data type */
};
} // namespace arm_compute
#endif /* ARM_COMPUTE_RUNTIME_CLTYPES_H */
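
The added NATIVE variant and the new data_type field feed the kernel-type heuristics. A small sketch of filling the selection parameters for an int8 problem; the values are illustrative:

// Describe an int8 GEMM problem for kernel selection.
CLGEMMKernelSelectionParams params;
params.m               = 64;   // rows of the lhs matrix
params.n               = 128;  // columns of the rhs matrix
params.k               = 256;  // inner (accumulation) dimension
params.is_rhs_constant = true; // e.g. constant inference weights
params.data_type       = DataType::QASYMM8_SIGNED;
// A per-architecture selector maps these to a CLGEMMKernelType such as
// NATIVE or RESHAPED_ONLY_RHS.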
diff --git a/arm_compute/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.h b/arm_compute/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.h
index c9b1b70c54..b147001820 100644
--- a/arm_compute/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.h
+++ b/arm_compute/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.h
@@ -25,7 +25,6 @@
#define ARM_COMPUTE_CLGEMMLOWPMATRIXMULTIPLYCORE_H
#include "arm_compute/core/CL/kernels/CLDepthConvertLayerKernel.h"
-#include "arm_compute/core/CL/kernels/CLGEMMLowpMatrixMultiplyKernel.h"
#include "arm_compute/core/CL/kernels/CLGEMMLowpMatrixMultiplyNativeKernel.h"
#include "arm_compute/core/CL/kernels/CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel.h"
#include "arm_compute/core/CL/kernels/CLGEMMLowpOffsetContributionKernel.h"
@@ -41,18 +40,7 @@ namespace arm_compute
class IMemoryManager;
class ICLTensor;
-/** Basic function to execute GEMMLowpMatrixMultiplyCore on OpenCL. This function calls the following OpenCL kernels:
- *
- * -# @ref CLGEMMReshapeRHSMatrixKernel (if the output tensor is a matrix)
- * -# @ref CLGEMMLowpMatrixMultiplyKernel (if the parameter "reshape_b_only_on_first_run" of GEMMInfo is FALSE)
- * -# @ref CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel (if the parameter "reshape_b_only_on_first_run" of GEMMInfo is TRUE)
- * -# @ref CLGEMMLowpMatrixAReductionKernel (if the offset of matrix B is not 0)
- * -# @ref CLGEMMLowpMatrixBReductionKernel (if the offset of matrix A is not 0)
- * -# @ref CLGEMMLowpOffsetContributionKernel (if gemm_info.gemmlowp_output_stage == NONE)
- * -# @ref CLGEMMLowpOffsetContributionOutputStageKernel (if gemm_info.gemmlowp_output_stage != NONE)
- * -# @ref CLDepthConvertLayerKernel
- *
-*/
+/** Basic function to execute GEMMLowpMatrixMultiplyCore on OpenCL. */
class CLGEMMLowpMatrixMultiplyCore : public IFunction
{
public:
@@ -106,7 +94,6 @@ private:
// Kernels used
CLDepthConvertLayerKernel _weights_to_qasymm8;
- CLGEMMLowpMatrixMultiplyKernel _mm_midgard_kernel;
CLGEMMLowpMatrixMultiplyNativeKernel _mm_native_kernel;
CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel _mm_reshaped_only_rhs_kernel;
CLGEMMReshapeRHSMatrixKernel _mtx_b_reshape_kernel;
@@ -132,7 +119,6 @@ private:
int32_t _a_offset;
int32_t _b_offset;
bool _is_gemm_reshaped;
- bool _is_midgard;
bool _reshape_b_only_on_first_run;
bool _is_prepared;
bool _run_output_stage;
diff --git a/docs/00_introduction.dox b/docs/00_introduction.dox
index 55c5fcb2b8..5585678863 100644
--- a/docs/00_introduction.dox
+++ b/docs/00_introduction.dox
@@ -997,7 +997,7 @@ v17.03.1 First Major public release of the sources
- New CPP target introduced for C++ kernels shared between NEON and CL functions.
- New padding calculation interface introduced and ported most kernels / functions to use it.
- New OpenCL kernels / functions:
- - @ref CLGEMMLowpMatrixMultiplyKernel / CLGEMMLowp
+ - CLGEMMLowpMatrixMultiplyKernel / CLGEMMLowp
- New NEON kernels / functions:
- @ref NENormalizationLayerKernel / @ref NENormalizationLayer
- @ref NETransposeKernel / @ref NETranspose
diff --git a/src/core/CL/CLKernelLibrary.cpp b/src/core/CL/CLKernelLibrary.cpp
index 00e7b2bc5c..d4073c6f30 100644
--- a/src/core/CL/CLKernelLibrary.cpp
+++ b/src/core/CL/CLKernelLibrary.cpp
@@ -231,7 +231,6 @@ const std::map<std::string, std::string> CLKernelLibrary::_kernel_program_map =
{ "gemmlowp_matrix_a_reduction", "gemmlowp.cl" },
{ "gemmlowp_matrix_a_reduction_dot8", "gemmlowp.cl" },
{ "gemmlowp_matrix_b_reduction", "gemmlowp.cl" },
- { "gemmlowp_mm_midgard", "gemmlowp.cl" },
{ "gemmlowp_mm_native", "gemmlowp.cl" },
{ "gemmlowp_mm_reshaped_lhs_nt_rhs_t", "gemmlowp.cl" },
{ "gemmlowp_mm_reshaped_only_rhs_t", "gemmlowp.cl" },
diff --git a/src/core/CL/cl_kernels/gemmlowp.cl b/src/core/CL/cl_kernels/gemmlowp.cl
index 127df063f6..d9625e7117 100644
--- a/src/core/CL/cl_kernels/gemmlowp.cl
+++ b/src/core/CL/cl_kernels/gemmlowp.cl
@@ -195,291 +195,6 @@
(n0, k0, a, b, c); \
})
-#if defined(NUM_ELEMS_PROCESSED_PER_THREAD_X) && defined(NUM_ELEMS_PROCESSED_PER_THREAD_Y) && defined(COLS_A)
-#define VECTOR_TYPE VEC_DATA_TYPE(DATA_TYPE, NUM_ELEMS_PROCESSED_PER_THREAD_X)
-#define VECTOR_ACC_TYPE VEC_DATA_TYPE(ACC_DATA_TYPE, NUM_ELEMS_PROCESSED_PER_THREAD_X)
-#define VECTOR_INT VEC_DATA_TYPE(int, NUM_ELEMS_PROCESSED_PER_THREAD_X)
-/** This OpenCL kernel computes the matrix multiplication between matrix A (src0) and matrix B (src1) in case both matrices have not been reshaped
- *
- * @attention The number of matrix A columns needs to be passed at compile time using -DCOLS_A
- *
- * @note The input data type must be passed at compile time using -DDATA_TYPE (i.e. -DDATA_TYPE=uchar)
- * @note The accumulator data type must be passed at compile time using -DACC_DATA_TYPE (i.e. -DACC_DATA_TYPE=uint)
- * @note In case the input or output have to be reinterpreted as a 3D tensor, the following information must be passed at compile time:
- * -# REINTERPRET_INPUT_AS_3D: To reinterpret the input as 3D
- * -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D
- * -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D tensor.
- * -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor
- * (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns matrix A NOT reshaped
- *
- * @param[in] src0_ptr Pointer to the source matrix. Supported data type: QASYMM8
- * @param[in] src0_stride_x Stride of the source matrix in X dimension (in bytes)
- * @param[in] src0_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src0_stride_y Stride of the source matrix in Y dimension (in bytes)
- * @param[in] src0_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src0_offset_first_element_in_bytes The offset of the first element in the source matrix
- * @param[in] src1_ptr Pointer to the source matrix. Supported data type: same as @p src0_ptr
- * @param[in] src1_stride_x Stride of the source matrix in X dimension (in bytes)
- * @param[in] src1_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src1_stride_y Stride of the source matrix in Y dimension (in bytes)
- * @param[in] src1_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src1_offset_first_element_in_bytes The offset of the first element in the source matrix
- * @param[out] dst_ptr Pointer to the destination matrix Supported data type: S32
- * @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)
- * @param[in] dst_step_x dst_gx_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)
- * @param[in] dst_step_y dst_gx_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix
- * @param[in] src0_stride_z Stride of the source matrix in Z dimension (in bytes)
- * @param[in] src1_stride_z Stride of the source matrix in Z dimension (in bytes)
- * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
- * @param[in] src_cross_plane_pad (Optional) Bottom paddings in unit of elements for the input tensor (only if defined REINTERPRET_INPUT_AS_3D)
- * @param[in] dst_cross_plane_pad (Optional) Bottom paddings in unit of elements for the output tensor (only if defined REINTERPRET_OUTPUT_AS_3D)
- */
-__kernel void gemmlowp_mm_midgard(IMAGE_DECLARATION(src0),
- IMAGE_DECLARATION(src1),
- IMAGE_DECLARATION(dst),
- uint src0_stride_z,
- uint src1_stride_z,
- uint dst_stride_z
-#if defined(REINTERPRET_INPUT_AS_3D)
- ,
- uint src_cross_plane_pad
-#endif // REINTERPRET_INPUT_AS_3D
-#if defined(REINTERPRET_OUTPUT_AS_3D)
- ,
- uint dst_cross_plane_pad
-#endif // REINTERPRET_OUTPUT_AS_3D
- )
-{
- int idx = get_global_id(0) * NUM_ELEMS_PROCESSED_PER_THREAD_X;
-
- // Compute starting address for matrix A and Matrix B
- int2 src_addr = ((int2)(src0_offset_first_element_in_bytes, src1_offset_first_element_in_bytes));
-
- // Update address for the matrix A
- src_addr.s0 += get_global_id(1) * src0_stride_y * NUM_ELEMS_PROCESSED_PER_THREAD_Y;
-
- // Update address for the matrix B
- src_addr.s1 += idx;
-
-#if defined(REINTERPRET_INPUT_AS_3D)
- // Since we load a 2D input tile from a 3D tensor, we need to check when the plane changes across the z dimension
- // in order to take into account the presence of possible cross plane paddings
- //
- // | |
- // | plane0 |
- // | |
- // |__________________|
- // |******************|
- // | cross_plane_pad |
- // |******************|
- // | |
- // | plane1 |
- // | |
- // |__________________|
-
- // The plane (zin) is calculated dividing M (get_global_id(1) * NUM_ELEMS_PROCESSED_PER_THREAD_Y) by HEIGHT_GEMM3D
- uint4 zin = ((uint4)(0, 1, 2, 3) + (uint4)(get_global_id(1) * NUM_ELEMS_PROCESSED_PER_THREAD_Y)) / (uint4)HEIGHT_GEMM3D;
- zin = min(DEPTH_GEMM3D - 1, zin);
-
- // Add offset due to the cross plane paddings
- zin *= (src_cross_plane_pad * src0_stride_y);
-
- // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
- // multiply src0_stride_z by DEPTH_GEMM3D
- src_addr.s0 += get_global_id(2) * src0_stride_z * DEPTH_GEMM3D;
-
-#else // defined(REINTERPRET_INPUT_AS_3D)
-
- // Add offset for batched GEMM
- src_addr.s0 += get_global_id(2) * src0_stride_z;
-
-#endif // defined(REINTERPRET_INPUT_AS_3D)
-
-#if defined(MATRIX_B_DEPTH)
- // Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3
- src_addr.s1 += (get_global_id(2) % MATRIX_B_DEPTH) * src1_stride_z;
-#else // defined(MATRIX_B_DEPTH)
- src_addr.s1 += get_global_id(2) * src1_stride_z;
-#endif // defined(MATRIX_B_DEPTH)
-
- int end_row_vec_a = src_addr.s0 + COLS_A;
-
- VECTOR_ACC_TYPE acc0 = 0;
-#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
- VECTOR_ACC_TYPE acc1 = 0;
-#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
-#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
- VECTOR_ACC_TYPE acc2 = 0;
-#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
-#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
- VECTOR_ACC_TYPE acc3 = 0;
-#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
-#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 4
- VECTOR_ACC_TYPE acc4 = 0;
-#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 4
-
- for(; src_addr.s0 <= (end_row_vec_a - 2); src_addr += (int2)(2, 2 * src1_stride_y))
- {
- // Load values from matrix A
- VEC_DATA_TYPE(DATA_TYPE, 2)
- a0 = vload2(0, (__global DATA_TYPE *)(src0_ptr + src_addr.s0 + 0 * src0_stride_y));
-#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
- VEC_DATA_TYPE(DATA_TYPE, 2)
- a1 = vload2(0, (__global DATA_TYPE *)(src0_ptr + src_addr.s0 + 1 * src0_stride_y));
-#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
-#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
- VEC_DATA_TYPE(DATA_TYPE, 2)
- a2 = vload2(0, (__global DATA_TYPE *)(src0_ptr + src_addr.s0 + 2 * src0_stride_y));
-#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
-#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
- VEC_DATA_TYPE(DATA_TYPE, 2)
- a3 = vload2(0, (__global DATA_TYPE *)(src0_ptr + src_addr.s0 + 3 * src0_stride_y));
-#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
-#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 4
- VEC_DATA_TYPE(DATA_TYPE, 2)
- a4 = vload2(0, (__global DATA_TYPE *)(src0_ptr + src_addr.s0 + 4 * src0_stride_y));
-#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 4
- // Load values from matrix B
- VECTOR_TYPE b0 = VLOAD(NUM_ELEMS_PROCESSED_PER_THREAD_X)(0, (__global DATA_TYPE *)(src1_ptr + src_addr.s1));
- VECTOR_TYPE b1 = VLOAD(NUM_ELEMS_PROCESSED_PER_THREAD_X)(0, (__global DATA_TYPE *)(src1_ptr + src_addr.s1 + src1_stride_y));
-
- // Accumulate
- acc0 += CONVERT(b0, VECTOR_ACC_TYPE) * (VECTOR_ACC_TYPE)a0.s0;
- acc0 += CONVERT(b1, VECTOR_ACC_TYPE) * (VECTOR_ACC_TYPE)a0.s1;
-#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
- acc1 += CONVERT(b0, VECTOR_ACC_TYPE) * (VECTOR_ACC_TYPE)a1.s0;
- acc1 += CONVERT(b1, VECTOR_ACC_TYPE) * (VECTOR_ACC_TYPE)a1.s1;
-#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
-#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
- acc2 += CONVERT(b0, VECTOR_ACC_TYPE) * (VECTOR_ACC_TYPE)a2.s0;
- acc2 += CONVERT(b1, VECTOR_ACC_TYPE) * (VECTOR_ACC_TYPE)a2.s1;
-#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
-#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
- acc3 += CONVERT(b0, VECTOR_ACC_TYPE) * (VECTOR_ACC_TYPE)a3.s0;
- acc3 += CONVERT(b1, VECTOR_ACC_TYPE) * (VECTOR_ACC_TYPE)a3.s1;
-#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
-#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 4
- acc4 += CONVERT(b0, VECTOR_ACC_TYPE) * (VECTOR_ACC_TYPE)a4.s0;
- acc4 += CONVERT(b1, VECTOR_ACC_TYPE) * (VECTOR_ACC_TYPE)a4.s1;
-#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 4
- }
-
- for(; src_addr.s0 < end_row_vec_a; src_addr += (int2)(1, src1_stride_y))
- {
- // Load values from matrix A
- DATA_TYPE a0 = *((__global DATA_TYPE *)(src0_ptr + src_addr.s0 + 0 * src0_stride_y));
-#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
- DATA_TYPE a1 = *((__global DATA_TYPE *)(src0_ptr + src_addr.s0 + 1 * src0_stride_y));
-#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
-#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
- DATA_TYPE a2 = *((__global DATA_TYPE *)(src0_ptr + src_addr.s0 + 2 * src0_stride_y));
-#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
-#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
- DATA_TYPE a3 = *((__global DATA_TYPE *)(src0_ptr + src_addr.s0 + 3 * src0_stride_y));
-#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
-#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 4
- DATA_TYPE a4 = *((__global DATA_TYPE *)(src0_ptr + src_addr.s0 + 4 * src0_stride_y));
-#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 4
- // Load values from matrix B
- VECTOR_TYPE b0 = VLOAD(NUM_ELEMS_PROCESSED_PER_THREAD_X)(0, (__global DATA_TYPE *)(src1_ptr + src_addr.s1));
-
- // Accumulate
- acc0 += CONVERT(b0, VECTOR_ACC_TYPE) * (VECTOR_ACC_TYPE)a0;
-#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
- acc1 += CONVERT(b0, VECTOR_ACC_TYPE) * (VECTOR_ACC_TYPE)a1;
-#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
-#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
- acc2 += CONVERT(b0, VECTOR_ACC_TYPE) * (VECTOR_ACC_TYPE)a2;
-#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
-#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
- acc3 += CONVERT(b0, VECTOR_ACC_TYPE) * (VECTOR_ACC_TYPE)a3;
-#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
-#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 4
- acc4 += CONVERT(b0, VECTOR_ACC_TYPE) * (VECTOR_ACC_TYPE)a4;
-#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 4
- }
-
- const int z = get_global_id(2);
-
- // Compute destination address
- Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
-
-#if defined(REINTERPRET_OUTPUT_AS_3D)
- // Since we store a 2D output tile in a 3D tensor, we need to check when the plane changes across the z dimension
- // in order to take into account the presence of possible cross plane paddings
- //
- // | |
- // | plane0 |
- // | |
- // |__________________|
- // |******************|
- // | cross_plane_pad |
- // |******************|
- // | |
- // | plane1 |
- // | |
- // |__________________|
-
- // The plane (zout) is calculated dividing M (get_global_id(1) * NUM_ELEMS_PROCESSED_PER_THREAD_Y) by HEIGHT_GEMM3D
- uint8 zout = ((uint8)(0, 1, 2, 3, 4, 5, 6, 7) + (uint8)(get_global_id(1) * NUM_ELEMS_PROCESSED_PER_THREAD_Y)) / (uint8)HEIGHT_GEMM3D;
- zout = min(DEPTH_GEMM3D - 1, zout);
-
- // Add offset due to the cross plane paddings
- zout *= (dst_cross_plane_pad * dst_stride_y);
-
- // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
- // multiply dst_stride_z by DEPTH_GEMM3D
- dst.ptr += z * dst_stride_z * DEPTH_GEMM3D;
-
- // Store the result
- VSTORE(NUM_ELEMS_PROCESSED_PER_THREAD_X)
- (CONVERT(acc0, VECTOR_INT), 0, (__global int *)(dst.ptr + 0 * dst_stride_y + zout.s0));
-#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
- VSTORE(NUM_ELEMS_PROCESSED_PER_THREAD_X)
- (CONVERT(acc1, VECTOR_INT), 0, (__global int *)(dst.ptr + 1 * dst_stride_y + zout.s1));
-#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
-#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
- VSTORE(NUM_ELEMS_PROCESSED_PER_THREAD_X)
- (CONVERT(acc2, VECTOR_INT), 0, (__global int *)(dst.ptr + 2 * dst_stride_y + zout.s2));
-#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
-#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
- VSTORE(NUM_ELEMS_PROCESSED_PER_THREAD_X)
- (CONVERT(acc3, VECTOR_INT), 0, (__global int *)(dst.ptr + 3 * dst_stride_y + zout.s3));
-#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
-#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 4
- VSTORE(NUM_ELEMS_PROCESSED_PER_THREAD_X)
- (CONVERT(acc4, VECTOR_INT), 0, (__global int *)(dst.ptr + 4 * dst_stride_y + zout.s4));
-#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 4
-
-#else // defined(REINTERPRET_OUTPUT_AS_3D)
- // Add offset for batched GEMM
- dst.ptr += z * dst_stride_z;
-
- // Store the result
- VSTORE(NUM_ELEMS_PROCESSED_PER_THREAD_X)
- (CONVERT(acc0, VECTOR_INT), 0, (__global int *)(dst.ptr + 0 * dst_stride_y));
-#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
- VSTORE(NUM_ELEMS_PROCESSED_PER_THREAD_X)
- (CONVERT(acc1, VECTOR_INT), 0, (__global int *)(dst.ptr + 1 * dst_stride_y));
-#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
-#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
- VSTORE(NUM_ELEMS_PROCESSED_PER_THREAD_X)
- (CONVERT(acc2, VECTOR_INT), 0, (__global int *)(dst.ptr + 2 * dst_stride_y));
-#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
-#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
- VSTORE(NUM_ELEMS_PROCESSED_PER_THREAD_X)
- (CONVERT(acc3, VECTOR_INT), 0, (__global int *)(dst.ptr + 3 * dst_stride_y));
-#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
-#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 4
- VSTORE(NUM_ELEMS_PROCESSED_PER_THREAD_X)
- (CONVERT(acc4, VECTOR_INT), 0, (__global int *)(dst.ptr + 4 * dst_stride_y));
-#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 4
-#endif // defined(REINTERPRET_OUTPUT_AS_3D)
-}
-#endif // defined(NUM_ELEMS_PROCESSED_PER_THREAD_X) && defined(NUM_ELEMS_PROCESSED_PER_THREAD_Y) && defined(COLS_A)
-
#if defined(M0) && defined(N0) && defined(K0) && defined(V0) && defined(H0) && defined(M) && defined(N)
/** This OpenCL kernel computes the matrix multiplication between 2 matrices with QASYMM/QASYMM_SIGNED data type.
* The LHS matrix must be reshaped with @ref CLGEMMReshapeLHSMatrixKernel and the M0xK0 must be NOT transposed
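
The deleted gemmlowp_mm_midgard kernel (like the native kernel that now covers its cases) performs a widening 8-bit multiply-accumulate into int32. As a plain C++ reference for the arithmetic only, not library code:

#include <cstdint>

// Reference semantics of the lowp matrix multiply: widen the 8-bit inputs to
// 32-bit and accumulate in int32. Row-major LHS (MxK) and RHS (KxN).
void gemmlowp_reference(const uint8_t *lhs, const uint8_t *rhs, int32_t *dst,
                        int M, int N, int K)
{
    for(int y = 0; y < M; ++y)
    {
        for(int x = 0; x < N; ++x)
        {
            int32_t acc = 0;
            for(int k = 0; k < K; ++k)
            {
                acc += static_cast<int32_t>(lhs[y * K + k]) * static_cast<int32_t>(rhs[k * N + x]);
            }
            dst[y * N + x] = acc;
        }
    }
}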
diff --git a/src/core/CL/gemm/native/CLGEMMNativeKernelConfigurationBifrost.cpp b/src/core/CL/gemm/native/CLGEMMNativeKernelConfigurationBifrost.cpp
index c4a9ccd703..c6b51c698a 100644
--- a/src/core/CL/gemm/native/CLGEMMNativeKernelConfigurationBifrost.cpp
+++ b/src/core/CL/gemm/native/CLGEMMNativeKernelConfigurationBifrost.cpp
@@ -42,9 +42,6 @@ CLGEMMNativeKernelConfigurationBifrost::CLGEMMNativeKernelConfigurationBifrost(G
std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> CLGEMMNativeKernelConfigurationBifrost::configure(unsigned int m, unsigned int n, unsigned int k, unsigned int b, DataType data_type)
{
- ARM_COMPUTE_ERROR_ON(data_type != DataType::F32 && data_type != DataType::QASYMM8);
- ARM_COMPUTE_UNUSED(data_type);
-
using ConfigurationFunctionExecutorPtr = std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> (CLGEMMNativeKernelConfigurationBifrost::*)(unsigned int m, unsigned int n, unsigned int k,
unsigned int b);
@@ -52,31 +49,61 @@ std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> CLGEMMNativeKernelConfigurationB
static std::map<DataType, ConfigurationFunctionExecutorPtr> gemm_configs_G71 =
{
{ DataType::F32, &CLGEMMNativeKernelConfigurationBifrost::configure_G71_f32 },
- { DataType::QASYMM8, &CLGEMMNativeKernelConfigurationBifrost::configure_G71_u8 }
+ { DataType::QASYMM8, &CLGEMMNativeKernelConfigurationBifrost::configure_G71_u8 },
+ { DataType::QSYMM8, &CLGEMMNativeKernelConfigurationBifrost::configure_G71_u8 },
+ { DataType::QASYMM8_SIGNED, &CLGEMMNativeKernelConfigurationBifrost::configure_G71_u8 },
+ { DataType::QSYMM8_PER_CHANNEL, &CLGEMMNativeKernelConfigurationBifrost::configure_G71_u8 }
};
// Configurations for Mali-G76
static std::map<DataType, ConfigurationFunctionExecutorPtr> gemm_configs_G76 =
{
{ DataType::F32, &CLGEMMNativeKernelConfigurationBifrost::configure_G76_f32 },
- { DataType::QASYMM8, &CLGEMMNativeKernelConfigurationBifrost::configure_G76_u8 }
+ { DataType::QASYMM8, &CLGEMMNativeKernelConfigurationBifrost::configure_G76_u8 },
+ { DataType::QSYMM8, &CLGEMMNativeKernelConfigurationBifrost::configure_G76_u8 },
+ { DataType::QASYMM8_SIGNED, &CLGEMMNativeKernelConfigurationBifrost::configure_G76_u8 },
+ { DataType::QSYMM8_PER_CHANNEL, &CLGEMMNativeKernelConfigurationBifrost::configure_G76_u8 }
};
// Default configurations
static std::map<DataType, ConfigurationFunctionExecutorPtr> gemm_configs_default =
{
{ DataType::F32, &CLGEMMNativeKernelConfigurationBifrost::configure_default_f32 },
- { DataType::QASYMM8, &CLGEMMNativeKernelConfigurationBifrost::configure_default_u8 }
+ { DataType::QASYMM8, &CLGEMMNativeKernelConfigurationBifrost::configure_default_u8 },
+ { DataType::QSYMM8, &CLGEMMNativeKernelConfigurationBifrost::configure_default_u8 },
+ { DataType::QASYMM8_SIGNED, &CLGEMMNativeKernelConfigurationBifrost::configure_default_u8 },
+ { DataType::QSYMM8_PER_CHANNEL, &CLGEMMNativeKernelConfigurationBifrost::configure_default_u8 }
};
switch(_target)
{
case GPUTarget::G71:
- return (this->*gemm_configs_G71[data_type])(m, n, k, b);
+ if(gemm_configs_G71.find(data_type) != gemm_configs_G71.end())
+ {
+ return (this->*gemm_configs_G71[data_type])(m, n, k, b);
+ }
+ else
+ {
+ ARM_COMPUTE_ERROR("Not supported data type");
+ }
case GPUTarget::G76:
- return (this->*gemm_configs_G76[data_type])(m, n, k, b);
+ if(gemm_configs_G76.find(data_type) != gemm_configs_G76.end())
+ {
+ return (this->*gemm_configs_G76[data_type])(m, n, k, b);
+ }
+ else
+ {
+ ARM_COMPUTE_ERROR("Not supported data type");
+ }
default:
- return (this->*gemm_configs_default[data_type])(m, n, k, b);
+ if(gemm_configs_default.find(data_type) != gemm_configs_default.end())
+ {
+ return (this->*gemm_configs_default[data_type])(m, n, k, b);
+ }
+ else
+ {
+ ARM_COMPUTE_ERROR("Not supported data type");
+ }
}
}
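
Each case in the switch above repeats the same find-or-error lookup against a different map. A possible refactor, sketched here as an assumption rather than part of the patch, folds that into one helper in the same translation unit:

    // Sketch (not in this patch): shared dispatch for the per-target maps above.
    // MapT is one of the std::map<DataType, ConfigurationFunctionExecutorPtr>
    // tables; SelfT is the configuration class whose members the map points at.
    template <typename MapT, typename SelfT>
    std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> dispatch_config(SelfT *self, MapT &configs, DataType data_type,
                                                                    unsigned int m, unsigned int n, unsigned int k, unsigned int b)
    {
        auto it = configs.find(data_type);
        if(it == configs.end())
        {
            ARM_COMPUTE_ERROR("Not supported data type");
        }
        return (self->*(it->second))(m, n, k, b);
    }
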
diff --git a/src/core/CL/gemm/native/CLGEMMNativeKernelConfigurationMidgard.cpp b/src/core/CL/gemm/native/CLGEMMNativeKernelConfigurationMidgard.cpp
new file mode 100644
index 0000000000..86c056ffc2
--- /dev/null
+++ b/src/core/CL/gemm/native/CLGEMMNativeKernelConfigurationMidgard.cpp
@@ -0,0 +1,75 @@
+/*
+ * Copyright (c) 2020 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/CL/gemm/native/CLGEMMNativeKernelConfigurationMidgard.h"
+
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/gemm/CLGEMMHelpers.h"
+#include "arm_compute/core/GPUTarget.h"
+
+#include <map>
+#include <utility>
+
+namespace arm_compute
+{
+namespace cl_gemm
+{
+CLGEMMNativeKernelConfigurationMidgard::CLGEMMNativeKernelConfigurationMidgard(GPUTarget gpu)
+ : ICLGEMMKernelConfiguration(gpu)
+{
+}
+
+std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> CLGEMMNativeKernelConfigurationMidgard::configure(unsigned int m, unsigned int n, unsigned int k, unsigned int b, DataType data_type)
+{
+ using ConfigurationFunctionExecutorPtr = std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> (CLGEMMNativeKernelConfigurationMidgard::*)(unsigned int m, unsigned int n, unsigned int k,
+ unsigned int b);
+
+ // Configurations for Midgard architectures
+ static std::map<DataType, ConfigurationFunctionExecutorPtr> default_configs =
+ {
+ { DataType::QASYMM8, &CLGEMMNativeKernelConfigurationMidgard::default_q8 },
+ { DataType::QASYMM8_SIGNED, &CLGEMMNativeKernelConfigurationMidgard::default_q8 },
+ { DataType::QSYMM8, &CLGEMMNativeKernelConfigurationMidgard::default_q8 },
+ { DataType::QSYMM8_PER_CHANNEL, &CLGEMMNativeKernelConfigurationMidgard::default_q8 }
+ };
+
+ if(default_configs.find(data_type) != default_configs.end())
+ {
+ return (this->*default_configs[data_type])(m, n, k, b);
+ }
+ ARM_COMPUTE_ERROR("Not supported data type");
+}
+
+std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> CLGEMMNativeKernelConfigurationMidgard::default_q8(unsigned int m, unsigned int n, unsigned int k, unsigned int b)
+{
+ ARM_COMPUTE_UNUSED(k);
+ ARM_COMPUTE_UNUSED(b);
+
+ const unsigned int m0 = std::min(m, static_cast<unsigned int>(4));
+ const unsigned int n0 = std::min(n, static_cast<unsigned int>(4));
+
+ return configure_lhs_rhs_info(m, n, m0, n0, 2, 1, 1, false, false, false, false);
+}
+} // namespace cl_gemm
+} // namespace arm_compute
\ No newline at end of file
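
The new Midgard configuration clamps the thread tile to the problem size, so a small GEMM never gets a tile larger than itself. A standalone restatement of that clamping with example values (illustrative, not library code):

    #include <algorithm>
    #include <cstdio>

    int main()
    {
        // Mirror of default_q8's tile clamping above: the thread tile never
        // exceeds the GEMM dimensions themselves.
        const unsigned int m = 2, n = 3; // example GEMM sizes
        const unsigned int m0 = std::min(m, 4u);
        const unsigned int n0 = std::min(n, 4u);
        std::printf("m0 = %u, n0 = %u\n", m0, n0); // prints "m0 = 2, n0 = 3"
    }
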
diff --git a/src/core/CL/gemm/native/CLGEMMNativeKernelConfigurationValhall.cpp b/src/core/CL/gemm/native/CLGEMMNativeKernelConfigurationValhall.cpp
index 7cf0f0e1a8..c25cdac81a 100644
--- a/src/core/CL/gemm/native/CLGEMMNativeKernelConfigurationValhall.cpp
+++ b/src/core/CL/gemm/native/CLGEMMNativeKernelConfigurationValhall.cpp
@@ -45,12 +45,15 @@ std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> CLGEMMNativeKernelConfigurationV
using ConfigurationFunctionExecutorPtr = std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> (CLGEMMNativeKernelConfigurationValhall::*)(unsigned int m, unsigned int n, unsigned int k,
unsigned int b);
- // Configurations for Mali-G71
+ // Configurations for Mali-G77
static std::map<DataType, ConfigurationFunctionExecutorPtr> gemm_configs_G77 =
{
{ DataType::F32, &CLGEMMNativeKernelConfigurationValhall::configure_G77_f32 },
{ DataType::F16, &CLGEMMNativeKernelConfigurationValhall::configure_G77_f16 },
- { DataType::QASYMM8, &CLGEMMNativeKernelConfigurationValhall::configure_G77_u8 }
+ { DataType::QASYMM8, &CLGEMMNativeKernelConfigurationValhall::configure_G77_u8 },
+ { DataType::QSYMM8, &CLGEMMNativeKernelConfigurationValhall::configure_G77_u8 },
+ { DataType::QASYMM8_SIGNED, &CLGEMMNativeKernelConfigurationValhall::configure_G77_u8 },
+ { DataType::QSYMM8_PER_CHANNEL, &CLGEMMNativeKernelConfigurationValhall::configure_G77_u8 }
};
switch(_target)
diff --git a/src/core/CL/gemm/reshaped/CLGEMMReshapedKernelConfigurationBifrost.cpp b/src/core/CL/gemm/reshaped/CLGEMMReshapedKernelConfigurationBifrost.cpp
index 144c23a798..990cc72eb0 100644
--- a/src/core/CL/gemm/reshaped/CLGEMMReshapedKernelConfigurationBifrost.cpp
+++ b/src/core/CL/gemm/reshaped/CLGEMMReshapedKernelConfigurationBifrost.cpp
@@ -49,7 +49,10 @@ std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> CLGEMMReshapedKernelConfiguratio
{
{ DataType::F32, &CLGEMMReshapedKernelConfigurationBifrost::configure_G76_f32 },
{ DataType::F16, &CLGEMMReshapedKernelConfigurationBifrost::configure_G76_f16 },
- { DataType::QASYMM8, &CLGEMMReshapedKernelConfigurationBifrost::configure_G76_u8 }
+ { DataType::QASYMM8, &CLGEMMReshapedKernelConfigurationBifrost::configure_G76_u8 },
+ { DataType::QSYMM8, &CLGEMMReshapedKernelConfigurationBifrost::configure_G76_u8 },
+ { DataType::QASYMM8_SIGNED, &CLGEMMReshapedKernelConfigurationBifrost::configure_G76_u8 },
+ { DataType::QSYMM8_PER_CHANNEL, &CLGEMMReshapedKernelConfigurationBifrost::configure_G76_u8 }
};
// Configurations for Mali-G7x
@@ -57,7 +60,10 @@ std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> CLGEMMReshapedKernelConfiguratio
{
{ DataType::F32, &CLGEMMReshapedKernelConfigurationBifrost::configure_G7x_f32 },
{ DataType::F16, &CLGEMMReshapedKernelConfigurationBifrost::configure_G7x_f16 },
- { DataType::QASYMM8, &CLGEMMReshapedKernelConfigurationBifrost::configure_G7x_u8 }
+ { DataType::QASYMM8, &CLGEMMReshapedKernelConfigurationBifrost::configure_G7x_u8 },
+ { DataType::QSYMM8, &CLGEMMReshapedKernelConfigurationBifrost::configure_G7x_u8 },
+ { DataType::QASYMM8_SIGNED, &CLGEMMReshapedKernelConfigurationBifrost::configure_G7x_u8 },
+ { DataType::QSYMM8_PER_CHANNEL, &CLGEMMReshapedKernelConfigurationBifrost::configure_G7x_u8 }
};
switch(_target)
diff --git a/src/core/CL/gemm/reshaped/CLGEMMReshapedKernelConfigurationValhall.cpp b/src/core/CL/gemm/reshaped/CLGEMMReshapedKernelConfigurationValhall.cpp
index 20fa3d65bf..b96dc96e87 100644
--- a/src/core/CL/gemm/reshaped/CLGEMMReshapedKernelConfigurationValhall.cpp
+++ b/src/core/CL/gemm/reshaped/CLGEMMReshapedKernelConfigurationValhall.cpp
@@ -49,7 +49,10 @@ std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> CLGEMMReshapedKernelConfiguratio
{
{ DataType::F32, &CLGEMMReshapedKernelConfigurationValhall::configure_G77_f32 },
{ DataType::F16, &CLGEMMReshapedKernelConfigurationValhall::configure_G77_f16 },
- { DataType::QASYMM8, &CLGEMMReshapedKernelConfigurationValhall::configure_G77_u8 }
+ { DataType::QASYMM8, &CLGEMMReshapedKernelConfigurationValhall::configure_G77_u8 },
+ { DataType::QSYMM8, &CLGEMMReshapedKernelConfigurationValhall::configure_G77_u8 },
+ { DataType::QASYMM8_SIGNED, &CLGEMMReshapedKernelConfigurationValhall::configure_G77_u8 },
+ { DataType::QSYMM8_PER_CHANNEL, &CLGEMMReshapedKernelConfigurationValhall::configure_G77_u8 }
};
switch(_target)
diff --git a/src/core/CL/gemm/reshaped_only_rhs/CLGEMMReshapedOnlyRHSKernelConfigurationBifrost.cpp b/src/core/CL/gemm/reshaped_only_rhs/CLGEMMReshapedOnlyRHSKernelConfigurationBifrost.cpp
index 8e798116bf..8826cca11b 100644
--- a/src/core/CL/gemm/reshaped_only_rhs/CLGEMMReshapedOnlyRHSKernelConfigurationBifrost.cpp
+++ b/src/core/CL/gemm/reshaped_only_rhs/CLGEMMReshapedOnlyRHSKernelConfigurationBifrost.cpp
@@ -50,7 +50,10 @@ std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> CLGEMMReshapedOnlyRHSKernelConfi
{
{ DataType::F32, &CLGEMMReshapedOnlyRHSKernelConfigurationBifrost::configure_G51_f32 },
{ DataType::F16, &CLGEMMReshapedOnlyRHSKernelConfigurationBifrost::configure_G51_f16 },
- { DataType::QASYMM8, &CLGEMMReshapedOnlyRHSKernelConfigurationBifrost::configure_G51_u8 }
+ { DataType::QASYMM8, &CLGEMMReshapedOnlyRHSKernelConfigurationBifrost::configure_G51_u8 },
+ { DataType::QSYMM8, &CLGEMMReshapedOnlyRHSKernelConfigurationBifrost::configure_G51_u8 },
+ { DataType::QASYMM8_SIGNED, &CLGEMMReshapedOnlyRHSKernelConfigurationBifrost::configure_G51_u8 },
+ { DataType::QSYMM8_PER_CHANNEL, &CLGEMMReshapedOnlyRHSKernelConfigurationBifrost::configure_G51_u8 }
};
// Configurations for Mali-G76
@@ -58,7 +61,10 @@ std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> CLGEMMReshapedOnlyRHSKernelConfi
{
{ DataType::F32, &CLGEMMReshapedOnlyRHSKernelConfigurationBifrost::configure_G76_f32 },
{ DataType::F16, &CLGEMMReshapedOnlyRHSKernelConfigurationBifrost::configure_G76_f16 },
- { DataType::QASYMM8, &CLGEMMReshapedOnlyRHSKernelConfigurationBifrost::configure_G76_u8 }
+ { DataType::QASYMM8, &CLGEMMReshapedOnlyRHSKernelConfigurationBifrost::configure_G76_u8 },
+ { DataType::QSYMM8, &CLGEMMReshapedOnlyRHSKernelConfigurationBifrost::configure_G76_u8 },
+ { DataType::QASYMM8_SIGNED, &CLGEMMReshapedOnlyRHSKernelConfigurationBifrost::configure_G76_u8 },
+ { DataType::QSYMM8_PER_CHANNEL, &CLGEMMReshapedOnlyRHSKernelConfigurationBifrost::configure_G76_u8 }
};
// Configurations for Mali-G7x
@@ -66,7 +72,10 @@ std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> CLGEMMReshapedOnlyRHSKernelConfi
{
{ DataType::F32, &CLGEMMReshapedOnlyRHSKernelConfigurationBifrost::configure_G7x_f32 },
{ DataType::F16, &CLGEMMReshapedOnlyRHSKernelConfigurationBifrost::configure_G7x_f16 },
- { DataType::QASYMM8, &CLGEMMReshapedOnlyRHSKernelConfigurationBifrost::configure_G7x_u8 }
+ { DataType::QASYMM8, &CLGEMMReshapedOnlyRHSKernelConfigurationBifrost::configure_G7x_u8 },
+ { DataType::QSYMM8, &CLGEMMReshapedOnlyRHSKernelConfigurationBifrost::configure_G7x_u8 },
+ { DataType::QASYMM8_SIGNED, &CLGEMMReshapedOnlyRHSKernelConfigurationBifrost::configure_G7x_u8 },
+ { DataType::QSYMM8_PER_CHANNEL, &CLGEMMReshapedOnlyRHSKernelConfigurationBifrost::configure_G7x_u8 }
};
switch(_target)
@@ -235,15 +244,14 @@ std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> CLGEMMReshapedOnlyRHSKernelConfi
}
else
{
+ const int h0 = std::max(std::min(static_cast<int>(n / 2), static_cast<int>(128)), static_cast<int>(1));
if(m == 1)
{
- const unsigned int h0 = std::max(n / 2, 1U);
return configure_lhs_rhs_info(m, n, 1, 2, 4, 1, h0, false, true, false, true);
}
else
{
- const unsigned int h0 = std::max(n / 4, 1U);
- return configure_lhs_rhs_info(m, n, 2, 2, 16, 1, h0, false, true, false, true);
+ return configure_lhs_rhs_info(m, n, 4, 2, 16, 1, h0, false, true, false, true);
}
}
}
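
The Bifrost change above replaces two per-branch h0 formulas with a single value clamped to [1, 128]. Restated in isolation, with example inputs:

    #include <algorithm>

    // Illustrative restatement of the new h0 computation: the number of
    // horizontal blocks in the reshaped RHS is derived from n once, then
    // bounded instead of being left to grow with n.
    int compute_h0_g7x_u8(int n)
    {
        return std::max(std::min(n / 2, 128), 1);
    }
    // compute_h0_g7x_u8(8)    == 4
    // compute_h0_g7x_u8(1024) == 128
    // compute_h0_g7x_u8(1)    == 1
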
diff --git a/src/core/CL/gemm/reshaped_only_rhs/CLGEMMReshapedOnlyRHSKernelConfigurationValhall.cpp b/src/core/CL/gemm/reshaped_only_rhs/CLGEMMReshapedOnlyRHSKernelConfigurationValhall.cpp
index 951447e1a0..783d0fe91b 100644
--- a/src/core/CL/gemm/reshaped_only_rhs/CLGEMMReshapedOnlyRHSKernelConfigurationValhall.cpp
+++ b/src/core/CL/gemm/reshaped_only_rhs/CLGEMMReshapedOnlyRHSKernelConfigurationValhall.cpp
@@ -50,7 +50,10 @@ std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> CLGEMMReshapedOnlyRHSKernelConfi
{
{ DataType::F32, &CLGEMMReshapedOnlyRHSKernelConfigurationValhall::configure_G77_f32 },
{ DataType::F16, &CLGEMMReshapedOnlyRHSKernelConfigurationValhall::configure_G77_f16 },
- { DataType::QASYMM8, &CLGEMMReshapedOnlyRHSKernelConfigurationValhall::configure_G77_u8 }
+ { DataType::QASYMM8, &CLGEMMReshapedOnlyRHSKernelConfigurationValhall::configure_G77_u8 },
+ { DataType::QSYMM8, &CLGEMMReshapedOnlyRHSKernelConfigurationValhall::configure_G77_u8 },
+ { DataType::QASYMM8_SIGNED, &CLGEMMReshapedOnlyRHSKernelConfigurationValhall::configure_G77_u8 },
+ { DataType::QSYMM8_PER_CHANNEL, &CLGEMMReshapedOnlyRHSKernelConfigurationValhall::configure_G77_u8 }
};
switch(_target)
@@ -135,7 +138,15 @@ std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> CLGEMMReshapedOnlyRHSKernelConfi
}
else
{
- return configure_lhs_rhs_info(m, n, 4, 4, 16, 1, 4, false, true, false, true);
+ const int h0 = std::max(std::min(static_cast<int>(n / 4), static_cast<int>(256)), static_cast<int>(1));
+ if(m >= 28)
+ {
+ return configure_lhs_rhs_info(m, n, 4, 4, 16, 1, h0, false, true, false, true);
+ }
+ else
+ {
+ return configure_lhs_rhs_info(m, n, 2, 4, 16, 1, h0, false, true, false, true);
+ }
}
}
} // namespace cl_gemm
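
On Valhall, the u8 reshaped-only-RHS path now likewise clamps h0 and picks the LHS tile height from m. Sketched in isolation (the cut-off value 28 is the tuned threshold introduced by this patch; the helper name is ours):

    #include <algorithm>
    #include <utility>

    // Returns (m0, h0) as chosen by the branch above: GEMMs with m >= 28
    // amortise a 4-row LHS tile, shorter ones fall back to 2 rows.
    std::pair<int, int> valhall_u8_m0_h0(int m, int n)
    {
        const int h0 = std::max(std::min(n / 4, 256), 1);
        const int m0 = (m >= 28) ? 4 : 2;
        return { m0, h0 };
    }
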
diff --git a/src/core/CL/kernels/CLGEMMLowpMatrixMultiplyKernel.cpp b/src/core/CL/kernels/CLGEMMLowpMatrixMultiplyKernel.cpp
deleted file mode 100644
index 0d4bbba0d4..0000000000
--- a/src/core/CL/kernels/CLGEMMLowpMatrixMultiplyKernel.cpp
+++ /dev/null
@@ -1,307 +0,0 @@
-/*
- * Copyright (c) 2017-2020 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/core/CL/kernels/CLGEMMLowpMatrixMultiplyKernel.h"
-
-#include "arm_compute/core/AccessWindowStatic.h"
-#include "arm_compute/core/AccessWindowTranspose.h"
-#include "arm_compute/core/CL/CLHelpers.h"
-#include "arm_compute/core/CL/CLKernelLibrary.h"
-#include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/CL/OpenCL.h"
-#include "arm_compute/core/Error.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Types.h"
-#include "arm_compute/core/Utils.h"
-#include "arm_compute/core/Validate.h"
-#include "arm_compute/core/Window.h"
-#include "arm_compute/core/utils/misc/ShapeCalculator.h"
-#include "support/StringSupport.h"
-
-#include <cstddef>
-#include <cstdint>
-#include <tuple>
-
-using namespace arm_compute;
-using namespace arm_compute::misc::shape_calculator;
-
-namespace arm_compute
-{
-class Coordinates;
-} // namespace arm_compute
-
-namespace
-{
-using ElementsProcessed = Steps;
-
-Status validate_arguments(const ITensorInfo *input0, const ITensorInfo *input1, const ITensorInfo *output, const GEMMReshapeInfo &gemm_info)
-{
- ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input0, input1, output);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input0, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input0, input1);
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(input0->num_dimensions() > 4, "The number of dimensions for the matrix A must be <= 4");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(input1->num_dimensions() > 3, "The number of dimensions for the matrix B must be <= 3");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(input1->num_dimensions() > 2 && gemm_info.reinterpret_input_as_3d(), "The input1 tensor cannot have more than 2 dimensions if input0 has to be reinterpreted as 3D");
-
- const int m = gemm_info.m();
- const int n = gemm_info.n();
- const int k = gemm_info.k();
-
- ARM_COMPUTE_UNUSED(m);
- ARM_COMPUTE_UNUSED(n);
- ARM_COMPUTE_UNUSED(k);
-
- ARM_COMPUTE_RETURN_ERROR_ON(input0->dimension(0) != static_cast<unsigned int>(k));
- ARM_COMPUTE_RETURN_ERROR_ON(input1->dimension(0) != static_cast<unsigned int>(n));
- ARM_COMPUTE_RETURN_ERROR_ON(input1->dimension(1) != static_cast<unsigned int>(k));
- if(gemm_info.reinterpret_input_as_3d())
- {
- ARM_COMPUTE_RETURN_ERROR_ON(input0->dimension(1) * input0->dimension(2) != static_cast<unsigned int>(m));
- }
- else
- {
- ARM_COMPUTE_RETURN_ERROR_ON(input0->dimension(1) != static_cast<unsigned int>(m));
- }
-
- if(output->total_size() != 0)
- {
- const TensorInfo tensor_info_output = output->clone()->set_tensor_shape(compute_mm_shape(*input0, *input1, false, gemm_info));
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(output, &tensor_info_output);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::S32);
- }
-
- return Status{};
-}
-
-std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input0, ITensorInfo *input1, ITensorInfo *output, const GEMMReshapeInfo &gemm_info, ElementsProcessed &num_elements_processed)
-{
- unsigned int &num_elems_processed_per_iteration_x = num_elements_processed[0];
- unsigned int &num_elems_processed_per_iteration_y = num_elements_processed[1];
- bool reinterpret_input_as_3d = gemm_info.reinterpret_input_as_3d();
- bool reinterpret_output_as_3d = (gemm_info.depth_output_gemm3d() != 0);
-
- Window win{};
- Window win_out{};
- bool window_changed = false;
-
- // In case both input and output have to be reinterpreted as 3D tensors,
- // force reinterpret_input_as_3d and reinterpret_output_as_3d to be false.
- if(reinterpret_input_as_3d == reinterpret_output_as_3d)
- {
- reinterpret_input_as_3d = false;
- reinterpret_output_as_3d = false;
- }
-
-    // Output tensor auto initialization if not yet initialized
- auto_init_if_empty(*output, input0->clone()->set_tensor_shape(compute_mm_shape(*input0, *input1, false, gemm_info)).set_data_type(DataType::S32));
-
- TensorInfo tmp_info(*output);
-
- if(reinterpret_output_as_3d)
- {
-        // Since the output tensor has to be reinterpreted as 3D and the execution window is based on a 2D GEMM,
- // the window needs to be constructed on the 2D collapsed version of the tensor
- TensorShape tmp_shape(output->tensor_shape());
- tmp_shape.collapse(2U, 1U);
- tmp_info.set_tensor_shape(tmp_shape);
- }
-
-    // Special case for 1xN, 2xN, 3xN and 4xN input0 tensor: num_elems_processed_per_iteration_x is fixed to 4
- // Note: if the dot product instruction is available, the 8x2 tile has to be used
- num_elems_processed_per_iteration_x = 4;
- num_elems_processed_per_iteration_y = std::min(static_cast<int>(output->dimension(1)), 4);
-
- // Note: bottom paddings are calculated manually as the output can be reinterpreted as 3D tensor
-    // The only way to set the paddings properly is to set them explicitly through the AccessWindowStatic
- const int m = reinterpret_input_as_3d ? input0->tensor_shape()[1] * input0->tensor_shape()[2] : input0->tensor_shape()[1];
- const int bottom_pad = (num_elems_processed_per_iteration_y - (m % num_elems_processed_per_iteration_y)) % num_elems_processed_per_iteration_y;
-
- // Configure window
- win = calculate_max_window(tmp_info, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y));
- win_out = calculate_max_window(*output, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y));
-
- AccessWindowStatic input0_access(input0, 0, 0, input0->dimension(0), input0->dimension(1) + bottom_pad);
- AccessWindowStatic input1_access(input1, 0, 0, ceil_to_multiple(input1->dimension(0), num_elems_processed_per_iteration_x), input1->dimension(1));
- AccessWindowStatic output_access(output, 0, 0,
- ceil_to_multiple(output->dimension(0), num_elems_processed_per_iteration_x),
- output->dimension(1) + bottom_pad);
-
- window_changed = update_window_and_padding(win, input0_access, input1_access) || // window used by the execute_window_loop
- update_window_and_padding(win_out, output_access); // window used to update the padding requirements of output tensor
-
- Coordinates coord;
- coord.set_num_dimensions(output->num_dimensions());
- output_access.set_valid_region(win_out, ValidRegion(coord, output->tensor_shape()));
-
- // Collapse along the Z direction
- // This collapse needs to be here in order to tune the Z dimension of LWS
- Window collapsed = win;
- const unsigned int dimension_to_collapse = std::min(static_cast<unsigned int>(output->num_dimensions()), 2u);
- collapsed = win.collapse(win, dimension_to_collapse);
-
- Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
- return std::make_pair(err, collapsed);
-}
-} // namespace
-
-CLGEMMLowpMatrixMultiplyKernel::CLGEMMLowpMatrixMultiplyKernel()
- : _input0(nullptr), _input1(nullptr), _output(nullptr), _slide_matrix_b(true), _reinterpret_input_as_3d(false), _reinterpret_output_as_3d(false)
-{
-}
-
-void CLGEMMLowpMatrixMultiplyKernel::configure(const ICLTensor *input0, const ICLTensor *input1, ICLTensor *output, const GEMMReshapeInfo &gemm_info)
-{
- configure(CLKernelLibrary::get().get_compile_context(), input0, input1, output, gemm_info);
-}
-
-void CLGEMMLowpMatrixMultiplyKernel::configure(CLCompileContext &compile_context, const ICLTensor *input0, const ICLTensor *input1, ICLTensor *output, const GEMMReshapeInfo &gemm_info)
-{
- ARM_COMPUTE_ERROR_ON_NULLPTR(input0, input1, output);
-
- ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input0->info(), input1->info(), output->info(), gemm_info));
-
- _input0 = input0;
- _input1 = input1;
- _output = output;
- _reinterpret_input_as_3d = gemm_info.reinterpret_input_as_3d();
- _reinterpret_output_as_3d = (gemm_info.depth_output_gemm3d() != 0);
-
- // In case both input and output have to be reinterpreted as 3D tensors,
- // force reinterpret_input_as_3d and reinterpret_output_as_3d to be false.
- if(_reinterpret_input_as_3d == _reinterpret_output_as_3d)
- {
- _reinterpret_input_as_3d = false;
- _reinterpret_output_as_3d = false;
- }
-
- // Check if we need to slide the matrix B
- const unsigned int num_dimensions_input0 = _reinterpret_input_as_3d ? _input0->info()->num_dimensions() - 1 : _input0->info()->num_dimensions();
- _slide_matrix_b = (_input1->info()->num_dimensions() >= num_dimensions_input0);
-
- ElementsProcessed num_elements_processed{};
-
- // Configure kernel window
- auto win_config = validate_and_configure_window(input0->info(), input1->info(), output->info(), gemm_info, num_elements_processed);
- ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
- ICLKernel::configure_internal(win_config.second);
-
- // Create build options
- std::string kernel_name(" ");
- CLBuildOptions build_opts;
- build_opts.add_option_if(_reinterpret_input_as_3d, "-DREINTERPRET_INPUT_AS_3D");
- build_opts.add_option_if(_reinterpret_output_as_3d, "-DREINTERPRET_OUTPUT_AS_3D");
- build_opts.add_option_if(_reinterpret_input_as_3d || _reinterpret_output_as_3d, "-DHEIGHT_GEMM3D=" + support::cpp11::to_string(output->info()->dimension(1)));
- build_opts.add_option_if(_reinterpret_input_as_3d || _reinterpret_output_as_3d, "-DDEPTH_GEMM3D=" + support::cpp11::to_string(output->info()->dimension(2)));
- build_opts.add_option_if(!_slide_matrix_b, "-DMATRIX_B_DEPTH=" + support::cpp11::to_string(input1->info()->dimension(2)));
- build_opts.add_option("-DCOLS_A=" + support::cpp11::to_string(input0->info()->dimension(0)));
- build_opts.add_option("-DNUM_ELEMS_PROCESSED_PER_THREAD_X=" + support::cpp11::to_string(num_elements_processed.x()));
- build_opts.add_option("-DNUM_ELEMS_PROCESSED_PER_THREAD_Y=" + support::cpp11::to_string(num_elements_processed.y()));
- build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(input0->info()->data_type()));
- build_opts.add_option("-DACC_DATA_TYPE=" + get_cl_dot8_acc_type_from_data_type(input0->info()->data_type()));
-
- kernel_name = "gemmlowp_mm_midgard";
-
- // Create kernel
- _kernel = create_kernel(compile_context, kernel_name, build_opts.options());
-
- // Set config_id for enabling LWS tuning
- _config_id = kernel_name;
- _config_id += "_";
- _config_id += (_reinterpret_input_as_3d ? "3di_" : "");
- _config_id += (_reinterpret_output_as_3d ? "3do_" : "");
- _config_id += lower_string(string_from_data_type(input0->info()->data_type()));
- _config_id += "_";
- _config_id += support::cpp11::to_string(output->info()->dimension(1));
- _config_id += "_";
- _config_id += support::cpp11::to_string(output->info()->dimension(0));
-}
-
-Status CLGEMMLowpMatrixMultiplyKernel::validate(const ITensorInfo *input0, const ITensorInfo *input1, const ITensorInfo *output, const GEMMReshapeInfo &gemm_info)
-{
- ElementsProcessed num_elements_processed{};
- ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input0, input1, output, gemm_info));
- ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input0->clone().get(),
- input1->clone().get(),
- output->clone().get(),
- gemm_info,
- num_elements_processed)
- .first);
-
- return Status{};
-}
-
-void CLGEMMLowpMatrixMultiplyKernel::run(const Window &window, cl::CommandQueue &queue)
-{
- ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
- ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
-
- if(_input1->info()->num_dimensions() < 3)
- {
- // The stride_z for matrix B must be zero if we do not slice
- ARM_COMPUTE_ERROR_ON(_input1->info()->strides_in_bytes()[3] != 0);
- }
-
- Window slice = window.first_slice_window_3D();
- Window slice_matrix_b = slice;
-
- slice_matrix_b.set(Window::DimX, Window::Dimension(0, 1, 1));
- slice_matrix_b.set(Window::DimY, Window::Dimension(0, 1, 1));
-
- if(_reinterpret_input_as_3d)
- {
- // Pass bottom paddings to the kernel if the input has to be reinterpreted as 3D tensor
- const unsigned int idx0 = 3 * num_arguments_per_2D_tensor() + 3;
- const unsigned int total_cross_plane_pad = _input0->info()->padding().top + _input0->info()->padding().bottom;
- _kernel.setArg<cl_uint>(idx0, static_cast<unsigned int>(total_cross_plane_pad));
- }
-
- if(_reinterpret_output_as_3d)
- {
- // Pass bottom paddings to the kernel if the output has to be reinterpreted as 3D tensor
- const unsigned int idx0 = 3 * num_arguments_per_2D_tensor() + 3 + (_reinterpret_input_as_3d ? 1 : 0);
- const unsigned int total_cross_plane_pad = _output->info()->padding().top + _output->info()->padding().bottom;
- _kernel.setArg<cl_uint>(idx0, static_cast<unsigned int>(total_cross_plane_pad));
- }
-
- do
- {
- Window slice_b = slice;
- // Don't slice matrix B along the z dimension if matrix B has just 2 dimensions and matrix A more than 2
- // This scenario can happen when the matrix multiplication is used to perform a convolution operation
- if(!_slide_matrix_b)
- {
- slice_b = slice_matrix_b;
- }
-
- unsigned int idx = 0;
- add_2D_tensor_argument(idx, _input0, slice);
- add_2D_tensor_argument(idx, _input1, slice_b);
- add_2D_tensor_argument(idx, _output, slice);
- _kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(_input0->info()->strides_in_bytes()[2]));
- _kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(_input1->info()->strides_in_bytes()[2]));
- _kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(_output->info()->strides_in_bytes()[2]));
- enqueue(queue, *this, slice, lws_hint());
- }
- while(window.slide_window_slice_3D(slice));
-}
diff --git a/src/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.cpp b/src/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.cpp
index 90e5698fd8..ef17f110d0 100644
--- a/src/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.cpp
+++ b/src/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.cpp
@@ -35,6 +35,7 @@
#include "arm_compute/core/utils/misc/ShapeCalculator.h"
#include "arm_compute/core/utils/quantization/AsymmHelpers.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
+#include "arm_compute/runtime/CL/gemm/CLGEMMKernelSelection.h"
namespace arm_compute
{
@@ -43,16 +44,33 @@ using namespace arm_compute::cl_gemm;
namespace
{
-inline bool is_gemm_reshaped(bool reshape_b_only_on_first_run, GPUTarget gpu_target)
+inline bool is_gemm_reshaped(unsigned int m, unsigned int n, unsigned int k, DataType data_type, bool reshape_b_only_on_first_run)
{
- return (get_arch_from_target(gpu_target) != GPUTarget::MIDGARD) && (reshape_b_only_on_first_run);
+ std::unique_ptr<ICLGEMMKernelSelection> gemm_kernel = CLGEMMKernelSelectionFactory::create(CLScheduler::get().target());
+ ARM_COMPUTE_ERROR_ON_NULLPTR(gemm_kernel.get());
+
+ CLGEMMKernelSelectionParams params;
+ params.m = m;
+ params.n = n;
+ params.k = k;
+ params.is_rhs_constant = reshape_b_only_on_first_run;
+ params.data_type = data_type;
+
+ switch(gemm_kernel->select_kernel(params))
+ {
+ case CLGEMMKernelType::NATIVE:
+ return false;
+ case CLGEMMKernelType::RESHAPED_ONLY_RHS:
+ return true;
+ default:
+ ARM_COMPUTE_ERROR("Not supported gemmlowp kernel!");
+ }
}
} // namespace
CLGEMMLowpMatrixMultiplyCore::CLGEMMLowpMatrixMultiplyCore(std::shared_ptr<IMemoryManager> memory_manager)
: _memory_group(std::move(memory_manager)),
_weights_to_qasymm8(),
- _mm_midgard_kernel(),
_mm_native_kernel(),
_mm_reshaped_only_rhs_kernel(),
_mtx_b_reshape_kernel(),
@@ -73,7 +91,6 @@ CLGEMMLowpMatrixMultiplyCore::CLGEMMLowpMatrixMultiplyCore(std::shared_ptr<IMemo
_a_offset(0),
_b_offset(0),
_is_gemm_reshaped(true),
- _is_midgard(false),
_reshape_b_only_on_first_run(false),
_is_prepared(false),
_run_output_stage(false),
@@ -102,7 +119,6 @@ void CLGEMMLowpMatrixMultiplyCore::configure(const ICLTensor *a, const ICLTensor
const GPUTarget gpu_target = CLScheduler::get().target();
// Set the target for the kernels
- _mm_midgard_kernel.set_target(gpu_target);
_mm_native_kernel.set_target(gpu_target);
_mm_reshaped_only_rhs_kernel.set_target(gpu_target);
@@ -120,8 +136,7 @@ void CLGEMMLowpMatrixMultiplyCore::configure(const ICLTensor *a, const ICLTensor
const int depth_output_gemm3d = gemm_info.depth_output_gemm3d();
// Check if we need to reshape the matrix A and matrix B
- _is_gemm_reshaped = is_gemm_reshaped(_reshape_b_only_on_first_run, gpu_target);
- _is_midgard = gpu_target == GPUTarget::MIDGARD;
+ _is_gemm_reshaped = is_gemm_reshaped(m, n, k, a->info()->data_type(), _reshape_b_only_on_first_run);
if(_convert_to_qasymm8)
{
@@ -220,19 +235,12 @@ void CLGEMMLowpMatrixMultiplyCore::configure(const ICLTensor *a, const ICLTensor
}
else
{
- if(_is_midgard)
- {
- // Configure matrix multiply kernel
- _mm_midgard_kernel.configure(_matrix_a, matrix_b, &_mm_result_s32, GEMMReshapeInfo(m, n, k, 1, 1, depth_output_gemm3d, reinterpret_input_as_3d));
- }
- else
- {
- // Pick up the GEMM configuration
- std::tie(lhs_info, rhs_info) = CLGEMMNativeKernelConfigurationFactory::create(gpu_target)->configure(m, n, k, batch_size, DataType::QASYMM8);
-
- // Configure matrix multiply kernel
- _mm_native_kernel.configure(_matrix_a, matrix_b, &_mm_result_s32, lhs_info, rhs_info, GEMMReshapeInfo(m, n, k, 1, 1, depth_output_gemm3d, reinterpret_input_as_3d));
- }
+ // Pick up the GEMM configuration
+ std::tie(lhs_info, rhs_info) = CLGEMMNativeKernelConfigurationFactory::create(gpu_target)->configure(m, n, k, batch_size, DataType::QASYMM8);
+
+ // Configure matrix multiply kernel
+ _mm_native_kernel.configure(_matrix_a, matrix_b, &_mm_result_s32, lhs_info, rhs_info, GEMMReshapeInfo(m, n, k, 1, 1, depth_output_gemm3d, reinterpret_input_as_3d));
+
_offset_contribution_output_stage_kernel.configure(&_mm_result_s32, _a_offset == 0 ? nullptr : &_vector_sum_col, _b_offset == 0 ? nullptr : &_vector_sum_row, c, output, a->info()->dimension(0),
_a_offset, _b_offset, gemmlowp_output_stage, &_gemm_output_stage_multipliers, &_gemm_output_stage_shifts);
@@ -260,19 +268,11 @@ void CLGEMMLowpMatrixMultiplyCore::configure(const ICLTensor *a, const ICLTensor
}
else
{
- if(_is_midgard)
- {
- // Configure matrix multiply kernel
- _mm_midgard_kernel.configure(_matrix_a, matrix_b, output, GEMMReshapeInfo(m, n, k, 1, 1, depth_output_gemm3d, reinterpret_input_as_3d));
- }
- else
- {
- // Pick up the GEMM configuration
- std::tie(lhs_info, rhs_info) = CLGEMMNativeKernelConfigurationFactory::create(gpu_target)->configure(m, n, k, batch_size, DataType::QASYMM8);
+ // Pick up the GEMM configuration
+ std::tie(lhs_info, rhs_info) = CLGEMMNativeKernelConfigurationFactory::create(gpu_target)->configure(m, n, k, batch_size, DataType::QASYMM8);
- // Configure matrix multiply kernel
- _mm_native_kernel.configure(_matrix_a, matrix_b, output, lhs_info, rhs_info, GEMMReshapeInfo(m, n, k, 1, 1, depth_output_gemm3d, reinterpret_input_as_3d));
- }
+ // Configure matrix multiply kernel
+ _mm_native_kernel.configure(_matrix_a, matrix_b, output, lhs_info, rhs_info, GEMMReshapeInfo(m, n, k, 1, 1, depth_output_gemm3d, reinterpret_input_as_3d));
}
// Configure offset contribution kernel
@@ -329,9 +329,8 @@ Status CLGEMMLowpMatrixMultiplyCore::validate(const ITensorInfo *a, const ITenso
const unsigned int k = a->dimension(0);
const unsigned int batch_size = reinterpret_input_as_3d ? a->dimension(3) : a->dimension(2);
const int depth_output_gemm3d = gemm_info.depth_output_gemm3d();
- const bool is_midgard = gpu_target == GPUTarget::MIDGARD;
- bool reshape_matrix_b = is_gemm_reshaped(gemm_info.reshape_b_only_on_first_run(), CLScheduler::get().target());
+ bool reshape_matrix_b = is_gemm_reshaped(m, n, k, a->data_type(), gemm_info.reshape_b_only_on_first_run());
const GEMMReshapeInfo reshape_info = GEMMReshapeInfo(m, n, k, 1, 1, depth_output_gemm3d, reinterpret_input_as_3d);
@@ -425,19 +424,11 @@ Status CLGEMMLowpMatrixMultiplyCore::validate(const ITensorInfo *a, const ITenso
    // Output tensor auto initialization if not yet initialized
auto_init_if_empty(mm_result_s32_info, a->clone()->set_tensor_shape(compute_mm_shape(*matrix_a_info, *matrix_b_info, false, reshape_info)).set_data_type(DataType::S32));
- if(is_midgard)
- {
- // Validate matrix multiply
- ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpMatrixMultiplyKernel::validate(matrix_a_info, matrix_b_info, &mm_result_s32_info, reshape_info));
- }
- else
- {
- // Pick up the GEMM configuration
- std::tie(lhs_info, rhs_info) = CLGEMMNativeKernelConfigurationFactory::create(gpu_target)->configure(m, n, k, batch_size, DataType::QASYMM8);
-
- // Validate matrix multiply
- ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpMatrixMultiplyNativeKernel::validate(matrix_a_info, matrix_b_info, &mm_result_s32_info, lhs_info, rhs_info, reshape_info));
- }
+ // Pick up the GEMM configuration
+ std::tie(lhs_info, rhs_info) = CLGEMMNativeKernelConfigurationFactory::create(gpu_target)->configure(m, n, k, batch_size, DataType::QASYMM8);
+
+ // Validate matrix multiply
+ ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpMatrixMultiplyNativeKernel::validate(matrix_a_info, matrix_b_info, &mm_result_s32_info, lhs_info, rhs_info, reshape_info));
}
// Validate offset contribution kernel
@@ -461,19 +452,11 @@ Status CLGEMMLowpMatrixMultiplyCore::validate(const ITensorInfo *a, const ITenso
}
else
{
- if(is_midgard)
- {
- // Validate matrix multiply
- ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpMatrixMultiplyKernel::validate(matrix_a_info, matrix_b_info, output, reshape_info));
- }
- else
- {
- // Pick up the GEMM configuration
- std::tie(lhs_info, rhs_info) = CLGEMMNativeKernelConfigurationFactory::create(gpu_target)->configure(m, n, k, batch_size, DataType::QASYMM8);
+ // Pick up the GEMM configuration
+ std::tie(lhs_info, rhs_info) = CLGEMMNativeKernelConfigurationFactory::create(gpu_target)->configure(m, n, k, batch_size, DataType::QASYMM8);
- // Validate matrix multiply
- ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpMatrixMultiplyNativeKernel::validate(matrix_a_info, matrix_b_info, output, lhs_info, rhs_info, reshape_info));
- }
+ // Validate matrix multiply
+ ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpMatrixMultiplyNativeKernel::validate(matrix_a_info, matrix_b_info, output, lhs_info, rhs_info, reshape_info));
}
if(output->total_size() != 0)
@@ -524,14 +507,7 @@ void CLGEMMLowpMatrixMultiplyCore::run()
}
else
{
- if(_is_midgard)
- {
- CLScheduler::get().enqueue(_mm_midgard_kernel, false);
- }
- else
- {
- CLScheduler::get().enqueue(_mm_native_kernel, false);
- }
+ CLScheduler::get().enqueue(_mm_native_kernel, false);
}
if(_run_output_stage)
{
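
The reworked is_gemm_reshaped() above defers the NATIVE versus RESHAPED_ONLY_RHS decision to the runtime kernel-selection heuristic. A hedged sketch of driving that heuristic directly, using the types shown in this diff and illustrative parameter values:

    #include "arm_compute/runtime/CL/CLScheduler.h"
    #include "arm_compute/runtime/CL/gemm/CLGEMMKernelSelection.h"

    using namespace arm_compute;
    using namespace arm_compute::cl_gemm;

    CLGEMMKernelType select_lowp_kernel()
    {
        auto selector = CLGEMMKernelSelectionFactory::create(CLScheduler::get().target());

        CLGEMMKernelSelectionParams params;
        params.m               = 1;    // illustrative values only
        params.n               = 1024;
        params.k               = 256;
        params.is_rhs_constant = true; // weights reshaped once up front
        params.data_type       = DataType::QASYMM8;

        // For gemmlowp this yields NATIVE or RESHAPED_ONLY_RHS (see the switch above)
        return selector->select_kernel(params);
    }
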
diff --git a/src/runtime/CL/gemm/CLGEMMKernelSelectionBifrost.cpp b/src/runtime/CL/gemm/CLGEMMKernelSelectionBifrost.cpp
index d30eaa9edc..041e7d6cb4 100644
--- a/src/runtime/CL/gemm/CLGEMMKernelSelectionBifrost.cpp
+++ b/src/runtime/CL/gemm/CLGEMMKernelSelectionBifrost.cpp
@@ -165,27 +165,15 @@ CLGEMMKernelType CLGEMMKernelSelectionBifrost::default_f16(unsigned int m, unsig
CLGEMMKernelType CLGEMMKernelSelectionBifrost::default_q8(unsigned int m, unsigned int n, unsigned int k, bool is_rhs_constant)
{
+ ARM_COMPUTE_UNUSED(m, n, k);
+
if(is_rhs_constant)
{
- if(m == 1)
- {
- if((n > k) && gpu_target_is_in(_target, GPUTarget::G71))
- {
- return CLGEMMKernelType::NATIVE_V1;
- }
- else
- {
- return CLGEMMKernelType::RESHAPED_ONLY_RHS;
- }
- }
- else
- {
- return CLGEMMKernelType::RESHAPED;
- }
+ return CLGEMMKernelType::RESHAPED_ONLY_RHS;
}
else
{
- return CLGEMMKernelType::NATIVE_V1;
+ return CLGEMMKernelType::NATIVE;
}
}
diff --git a/src/runtime/CL/gemm/CLGEMMKernelSelectionMidgard.cpp b/src/runtime/CL/gemm/CLGEMMKernelSelectionMidgard.cpp
index b7bb720175..a94a392553 100644
--- a/src/runtime/CL/gemm/CLGEMMKernelSelectionMidgard.cpp
+++ b/src/runtime/CL/gemm/CLGEMMKernelSelectionMidgard.cpp
@@ -86,10 +86,9 @@ CLGEMMKernelType CLGEMMKernelSelectionMidgard::default_f16(unsigned int m, unsig
CLGEMMKernelType CLGEMMKernelSelectionMidgard::default_q8(unsigned int m, unsigned int n, unsigned int k, bool is_rhs_constant)
{
- ARM_COMPUTE_UNUSED(n, k);
+ ARM_COMPUTE_UNUSED(m, n, k, is_rhs_constant);
- // We reshape the matrices only if we do not have the vector-by-matrix case and we reshape the matrix B only once
- return ((m != 1) && is_rhs_constant) ? CLGEMMKernelType::RESHAPED_V1 : CLGEMMKernelType::NATIVE_V1;
+ return CLGEMMKernelType::NATIVE;
}
} // namespace cl_gemm
} // namespace arm_compute
diff --git a/src/runtime/CL/gemm/CLGEMMKernelSelectionValhall.cpp b/src/runtime/CL/gemm/CLGEMMKernelSelectionValhall.cpp
index 8016417eb9..775bb9bffd 100644
--- a/src/runtime/CL/gemm/CLGEMMKernelSelectionValhall.cpp
+++ b/src/runtime/CL/gemm/CLGEMMKernelSelectionValhall.cpp
@@ -83,22 +83,15 @@ CLGEMMKernelType CLGEMMKernelSelectionValhall::default_f16(unsigned int m, unsig
CLGEMMKernelType CLGEMMKernelSelectionValhall::default_q8(unsigned int m, unsigned int n, unsigned int k, bool is_rhs_constant)
{
- ARM_COMPUTE_UNUSED(n, k);
+ ARM_COMPUTE_UNUSED(m, n, k);
if(is_rhs_constant)
{
- if(m == 1)
- {
- return CLGEMMKernelType::RESHAPED_ONLY_RHS;
- }
- else
- {
- return CLGEMMKernelType::RESHAPED;
- }
+ return CLGEMMKernelType::RESHAPED_ONLY_RHS;
}
else
{
- return CLGEMMKernelType::NATIVE_V1;
+ return CLGEMMKernelType::NATIVE;
}
}
} // namespace cl_gemm