author      Gunes Bayir <gunes.bayir@arm.com>    2023-10-10 17:41:56 +0100
committer   Gunes Bayir <gunes.bayir@arm.com>    2023-10-13 15:39:49 +0000
commit      c1204c76d40dcaf754fd7d725c432f19a2f368a4 (patch)
tree        50c5213b18af21fac2ce32157edc51262e6ff6fa
parent      d8a397e40e59a6c1251a774966360240fd172fca (diff)
download    ComputeLibrary-c1204c76d40dcaf754fd7d725c432f19a2f368a4.tar.gz
Connect MatMul MMUL kernels to ClMatMul operator
Resolves: COMPMID-6478

Change-Id: I5bc220c3bd00a316776fe14454438cc0dc9049b3
Signed-off-by: Gunes Bayir <gunes.bayir@arm.com>
Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/10469
Reviewed-by: Gian Marco Iodice <gianmarco.iodice@arm.com>
Tested-by: Arm Jenkins <bsgcomp@arm.com>
Benchmark: Arm Jenkins <bsgcomp@arm.com>
-rw-r--r--  src/gpu/cl/operators/ClFullyConnected.h                                        8
-rw-r--r--  src/gpu/cl/operators/ClMatMul.cpp                                             146
-rw-r--r--  src/gpu/cl/operators/ClMatMul.h                                                14
-rw-r--r--  src/runtime/heuristics/matmul_native/ClMatMulNativeDefaultConfigValhall.cpp    28
-rw-r--r--  src/runtime/heuristics/matmul_native/ClMatMulNativeDefaultConfigValhall.h      12
5 files changed, 159 insertions, 49 deletions
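
At the heart of this patch is a four-way kernel choice: native vs. MMUL, floating-point vs. quantized. The MMUL variants require the arm_matrix_multiply OpenCL extension, no fused activation, and a reduction dimension K (taken from the LHS shape: y() when adj_lhs, x() otherwise) that is a multiple of 4 for FP types or 16 for quantized types. The standalone C++ sketch below distills that decision; KernelType and select_kernel are simplified illustrative stand-ins, not ACL API.

// Distilled sketch of the kernel choice introduced in ClMatMul.cpp.
// KernelType and select_kernel are illustrative stand-ins, not ACL API.
#include <cstdio>

enum class KernelType
{
    NativeFp,
    NativeQuantized,
    NativeMmulFp,
    NativeMmulQuantized
};

KernelType select_kernel(bool is_quantized, bool mmul_supported, bool act_enabled, int k)
{
    // MMUL needs the extension, no fused activation, and K a multiple
    // of 16 (quantized) or 4 (floating-point).
    const int multiple = is_quantized ? 16 : 4;
    if (mmul_supported && !act_enabled && (k % multiple == 0))
    {
        return is_quantized ? KernelType::NativeMmulQuantized : KernelType::NativeMmulFp;
    }
    return is_quantized ? KernelType::NativeQuantized : KernelType::NativeFp;
}

int main()
{
    // K = 48 is a multiple of 16: a quantized matmul with no fused activation
    // on MMUL-capable hardware selects the MMUL quantized kernel.
    const KernelType t = select_kernel(true, true, false, 48);
    std::printf("selected kernel: %d\n", static_cast<int>(t));
    return 0;
}

Falling back to the plain native kernels whenever any MMUL precondition fails keeps the operator functional on all targets.
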
diff --git a/src/gpu/cl/operators/ClFullyConnected.h b/src/gpu/cl/operators/ClFullyConnected.h
index 0621238ab5..72884ff7ad 100644
--- a/src/gpu/cl/operators/ClFullyConnected.h
+++ b/src/gpu/cl/operators/ClFullyConnected.h
@@ -21,14 +21,16 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
-#ifndef ARM_COMPUTE_CL_FULLY_CONNECTED_H
-#define ARM_COMPUTE_CL_FULLY_CONNECTED_H
+#ifndef ACL_SRC_GPU_CL_OPERATORS_CLFULLYCONNECTED_H
+#define ACL_SRC_GPU_CL_OPERATORS_CLFULLYCONNECTED_H
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/function_info/FullyConnectedLayerInfo.h"
#include "src/gpu/cl/ClCompileContext.h"
#include "src/gpu/cl/IClOperator.h"
+#include "src/gpu/cl/kernels/ClMatMulLowpNativeKernel.h"
+#include "src/gpu/cl/kernels/ClMatMulNativeKernel.h"
#include <memory>
@@ -174,4 +176,4 @@ private:
};
} // namespace opencl
} // namespace arm_compute
-#endif /* ARM_COMPUTE_CL_FULLY_CONNECTED_H */
+#endif // ACL_SRC_GPU_CL_OPERATORS_CLFULLYCONNECTED_H
diff --git a/src/gpu/cl/operators/ClMatMul.cpp b/src/gpu/cl/operators/ClMatMul.cpp
index c14b1f2992..9962ee550a 100644
--- a/src/gpu/cl/operators/ClMatMul.cpp
+++ b/src/gpu/cl/operators/ClMatMul.cpp
@@ -28,7 +28,10 @@
#include "arm_compute/runtime/CL/CLScheduler.h"
#include "src/common/utils/Log.h"
+#include "src/gpu/cl/kernels/ClMatMulLowpNativeKernel.h"
+#include "src/gpu/cl/kernels/ClMatMulLowpNativeMMULKernel.h"
#include "src/gpu/cl/kernels/ClMatMulNativeKernel.h"
+#include "src/gpu/cl/kernels/ClMatMulNativeMMULKernel.h"
#include "src/runtime/heuristics/matmul_native/ClMatMulNativeDefaultConfigValhall.h"
#include "src/runtime/heuristics/matmul_native/ClMatMulNativeKernelConfig.h"
#include "src/runtime/heuristics/matmul_native/IClMatMulNativeKernelConfig.h"
@@ -39,11 +42,62 @@ namespace arm_compute
{
namespace opencl
{
+namespace
+{
+enum class MatMulKernelType
+{
+ /** Native matrix multiplication for FP types */
+ NATIVE_FP,
+
+ /** Native matrix multiplication for quantized types */
+ NATIVE_QUANTIZED,
+
+ /** Native matrix multiplication using MMUL extension for FP types */
+ NATIVE_MMUL_FP,
+
+ /** Native matrix multiplication using MMUL extension for quantized types */
+ NATIVE_MMUL_QUANTIZED
+};
+
+MatMulKernelType get_matmul_kernel(const ITensorInfo *lhs,
+ const ITensorInfo *rhs,
+ const MatMulInfo &matmul_info,
+ const ActivationLayerInfo &act_info)
+{
+ ARM_COMPUTE_UNUSED(lhs, rhs, matmul_info, act_info);
+
+ const bool is_quantized = is_data_type_quantized_asymmetric(lhs->data_type());
+ const bool is_mmul_supported = arm_matrix_multiply_supported(CLKernelLibrary::get().get_device());
+
+ const int k = matmul_info.adj_lhs() ? lhs->tensor_shape().y() : lhs->tensor_shape().x();
+
+ if (is_quantized)
+ {
+ // MMUL kernel works only when K is a multiple of 16
+ if (is_mmul_supported && !act_info.enabled() && k % 16 == 0)
+ {
+ return MatMulKernelType::NATIVE_MMUL_QUANTIZED;
+ }
+
+ return MatMulKernelType::NATIVE_QUANTIZED;
+ }
+ else
+ {
+ // MMUL kernel works only when K is a multiple of 4
+ if (is_mmul_supported && !act_info.enabled() && k % 4 == 0)
+ {
+ return MatMulKernelType::NATIVE_MMUL_FP;
+ }
+
+ return MatMulKernelType::NATIVE_FP;
+ }
+}
+} // namespace
using namespace arm_compute::opencl::kernels;
ClMatMul::ClMatMul()
- : _matmul_native_kernel(std::make_unique<ClMatMulNativeKernel>()),
- _matmul_lowp_native_kernel(std::make_unique<ClMatMulLowpNativeKernel>())
{
}
@@ -65,10 +119,19 @@ Status ClMatMul::validate(const ITensorInfo *lhs,
const MatMulKernelInfo kernel_info = t->configure(lhs, rhs, matmul_info);
- const bool is_quantized = is_data_type_quantized_asymmetric(lhs->data_type());
-
- return is_quantized ? ClMatMulLowpNativeKernel::validate(lhs, rhs, nullptr /* bias */, dst, kernel_info, act_info)
- : ClMatMulNativeKernel::validate(lhs, rhs, nullptr /* bias */, dst, kernel_info, act_info);
+ switch (get_matmul_kernel(lhs, rhs, matmul_info, act_info))
+ {
+ case MatMulKernelType::NATIVE_FP:
+ return ClMatMulNativeKernel::validate(lhs, rhs, nullptr /* bias */, dst, kernel_info, act_info);
+ case MatMulKernelType::NATIVE_MMUL_FP:
+ return ClMatMulNativeMMULKernel::validate(lhs, rhs, nullptr /* bias */, dst, kernel_info);
+ case MatMulKernelType::NATIVE_QUANTIZED:
+ return ClMatMulLowpNativeKernel::validate(lhs, rhs, nullptr /* bias */, dst, kernel_info, act_info);
+ case MatMulKernelType::NATIVE_MMUL_QUANTIZED:
+ return ClMatMulLowpNativeMMULKernel::validate(lhs, rhs, nullptr /* bias */, dst, kernel_info, act_info);
+ default:
+ ARM_COMPUTE_ERROR("Unsupported MatMul Kernel!");
+ }
}
void ClMatMul::configure(const CLCompileContext &compile_context,
@@ -84,41 +147,56 @@ void ClMatMul::configure(const CLCompileContext &compile_context,
// Perform validation step
ARM_COMPUTE_ERROR_THROW_ON(validate(lhs, rhs, dst, matmul_info));
- _is_quantized = is_data_type_quantized_asymmetric(lhs->data_type());
-
- const GPUTarget gpu_target = CLScheduler::get().target();
-
- std::unique_ptr<IClMatMulNativeKernelConfig> t = ClMatMulNativeKernelConfigurationFactory::create(gpu_target);
-
- MatMulKernelInfo kernel_info = t->configure(lhs, rhs, matmul_info);
-
- if (_is_quantized)
- {
- _matmul_lowp_native_kernel->set_target(gpu_target);
+ const GPUTarget gpu_target = CLScheduler::get().target();
+ const auto kernel_config = ClMatMulNativeKernelConfigurationFactory::create(gpu_target);
+ const MatMulKernelInfo kernel_info = kernel_config->configure(lhs, rhs, matmul_info);
- // Configure the low-precision native matrix multiply kernel
- _matmul_lowp_native_kernel->configure(compile_context, lhs, rhs, nullptr /* bias */, dst, kernel_info,
- act_info);
- }
- else
+ switch (get_matmul_kernel(lhs, rhs, matmul_info, act_info))
{
- _matmul_native_kernel->set_target(gpu_target);
-
- // Configure the native matrix multiply kernel
- _matmul_native_kernel->configure(compile_context, lhs, rhs, nullptr /* bias */, dst, kernel_info, act_info);
+ case MatMulKernelType::NATIVE_FP:
+ {
+ auto kernel = std::make_unique<ClMatMulNativeKernel>();
+ kernel->set_target(gpu_target);
+
+ kernel->configure(compile_context, lhs, rhs, nullptr /* bias */, dst, kernel_info, act_info);
+ _matmul_kernel = std::move(kernel);
+ }
+ break;
+ case MatMulKernelType::NATIVE_MMUL_FP:
+ {
+ auto kernel = std::make_unique<ClMatMulNativeMMULKernel>();
+ kernel->set_target(gpu_target);
+
+ kernel->configure(compile_context, lhs, rhs, nullptr /* bias */, dst, kernel_info);
+ _matmul_kernel = std::move(kernel);
+ }
+ break;
+ case MatMulKernelType::NATIVE_QUANTIZED:
+ {
+ auto kernel = std::make_unique<ClMatMulLowpNativeKernel>();
+ kernel->set_target(gpu_target);
+
+ kernel->configure(compile_context, lhs, rhs, nullptr /* bias */, dst, kernel_info, act_info);
+ _matmul_kernel = std::move(kernel);
+ }
+ break;
+ case MatMulKernelType::NATIVE_MMUL_QUANTIZED:
+ {
+ auto kernel = std::make_unique<ClMatMulLowpNativeMMULKernel>();
+ kernel->set_target(gpu_target);
+
+ kernel->configure(compile_context, lhs, rhs, nullptr /* bias */, dst, kernel_info, act_info);
+ _matmul_kernel = std::move(kernel);
+ }
+ break;
+ default:
+ ARM_COMPUTE_ERROR("Unsupported MatMul Kernel!");
}
}
void ClMatMul::run(ITensorPack &tensors)
{
- if (_is_quantized)
- {
- CLScheduler::get().enqueue_op(*_matmul_lowp_native_kernel, tensors, true);
- }
- else
- {
- CLScheduler::get().enqueue_op(*_matmul_native_kernel, tensors, true);
- }
+ CLScheduler::get().enqueue_op(*_matmul_kernel, tensors, /* flush */ true);
}
} // namespace opencl
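
With four candidate kernels instead of two, the previous layout of one member pointer per kernel plus an _is_quantized flag no longer scales. The patch instead stores whichever kernel configure() builds behind the common IClKernel interface, so run() collapses to a single unconditional enqueue. A minimal sketch of that type-erasure pattern, with illustrative class names rather than ACL types:

// Minimal sketch of the pattern ClMatMul switches to: configure() picks a
// concrete kernel, stores it through the base class, and run() never branches.
// IKernel, NativeKernel, and MmulKernel are illustrative, not ACL types.
#include <cstdio>
#include <memory>

struct IKernel
{
    virtual ~IKernel() = default;
    virtual void run() = 0;
};

struct NativeKernel final : IKernel
{
    void run() override { std::puts("running native kernel"); }
};

struct MmulKernel final : IKernel
{
    void run() override { std::puts("running MMUL kernel"); }
};

class Operator
{
public:
    void configure(bool use_mmul)
    {
        // All branching happens once, at configuration time.
        if (use_mmul)
            _kernel = std::make_unique<MmulKernel>();
        else
            _kernel = std::make_unique<NativeKernel>();
    }

    void run()
    {
        // Mirrors ClMatMul::run(): one dispatch point, no per-type branching.
        _kernel->run();
    }

private:
    std::unique_ptr<IKernel> _kernel{nullptr};
};

int main()
{
    Operator op;
    op.configure(/* use_mmul */ true);
    op.run();
    return 0;
}

A side effect visible in the header diff below: ClMatMul.h no longer needs to include the concrete kernel headers, only IClKernel.h.
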
diff --git a/src/gpu/cl/operators/ClMatMul.h b/src/gpu/cl/operators/ClMatMul.h
index 64dcf217bd..1733def21c 100644
--- a/src/gpu/cl/operators/ClMatMul.h
+++ b/src/gpu/cl/operators/ClMatMul.h
@@ -21,15 +21,14 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
-#ifndef ACL_SRC_GPU_CL_OPERATORS_CLMATMUL
-#define ACL_SRC_GPU_CL_OPERATORS_CLMATMUL
+#ifndef ACL_SRC_GPU_CL_OPERATORS_CLMATMUL_H
+#define ACL_SRC_GPU_CL_OPERATORS_CLMATMUL_H
#include "arm_compute/function_info/ActivationLayerInfo.h"
#include "arm_compute/function_info/MatMulInfo.h"
+#include "src/gpu/cl/IClKernel.h"
#include "src/gpu/cl/IClOperator.h"
-#include "src/gpu/cl/kernels/ClMatMulLowpNativeKernel.h"
-#include "src/gpu/cl/kernels/ClMatMulNativeKernel.h"
#include <memory>
@@ -95,11 +94,8 @@ public:
void run(ITensorPack &tensors) override;
private:
- std::unique_ptr<kernels::ClMatMulNativeKernel> _matmul_native_kernel{nullptr};
- std::unique_ptr<kernels::ClMatMulLowpNativeKernel> _matmul_lowp_native_kernel{nullptr};
-
- bool _is_quantized{false};
+ std::unique_ptr<opencl::IClKernel> _matmul_kernel{nullptr};
};
} // namespace opencl
} // namespace arm_compute
-#endif /* ACL_SRC_GPU_CL_OPERATORS_CLMATMUL */
+#endif // ACL_SRC_GPU_CL_OPERATORS_CLMATMUL_H
diff --git a/src/runtime/heuristics/matmul_native/ClMatMulNativeDefaultConfigValhall.cpp b/src/runtime/heuristics/matmul_native/ClMatMulNativeDefaultConfigValhall.cpp
index b3c8d891dc..6b641413ce 100644
--- a/src/runtime/heuristics/matmul_native/ClMatMulNativeDefaultConfigValhall.cpp
+++ b/src/runtime/heuristics/matmul_native/ClMatMulNativeDefaultConfigValhall.cpp
@@ -53,9 +53,17 @@ ClMatMulNativeDefaultConfigValhall::configure(const ITensorInfo *lhs, const ITen
&ClMatMulNativeDefaultConfigValhall::configure_G710_f16,
&ClMatMulNativeDefaultConfigValhall::configure_G710_u8);
+ ClMatMulNativeConfigArray<ConfigurationFunctionExecutorPtr> configs_G715(
+ &ClMatMulNativeDefaultConfigValhall::configure_G715_f32,
+ &ClMatMulNativeDefaultConfigValhall::configure_G715_f16,
+ &ClMatMulNativeDefaultConfigValhall::configure_G715_u8);
+
ConfigurationFunctionExecutorPtr func = nullptr;
switch (_target)
{
+ case GPUTarget::G715:
+ func = configs_G715.get_function(lhs->data_type());
+ break;
case GPUTarget::G710:
default:
func = configs_G710.get_function(lhs->data_type());
@@ -84,6 +92,26 @@ ClMatMulNativeDefaultConfigValhall::configure(const ITensorInfo *lhs, const ITen
return (this->*func)(m, n, k, b, rhs->lock_paddings(), info);
}
+MatMulKernelInfo ClMatMulNativeDefaultConfigValhall::configure_G715_f32(
+ unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool rhs_lock_padding, const MatMulInfo &info)
+{
+ ARM_COMPUTE_UNUSED(m, n, k, b, rhs_lock_padding);
+ return {info.adj_lhs(), info.adj_rhs(), /* m0 */ 1, /* n0 */ 4, /* k0 */ 1, /* export_to_cl_image */ false};
+}
+
+MatMulKernelInfo ClMatMulNativeDefaultConfigValhall::configure_G715_f16(
+ unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool rhs_lock_padding, const MatMulInfo &info)
+{
+ return configure_G715_f32(m, n, k, b, rhs_lock_padding, info);
+}
+
+MatMulKernelInfo ClMatMulNativeDefaultConfigValhall::configure_G715_u8(
+ unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool rhs_lock_padding, const MatMulInfo &info)
+{
+ ARM_COMPUTE_UNUSED(m, n, k, b, rhs_lock_padding);
+ return {info.adj_lhs(), info.adj_rhs(), /* m0 */ 1, /* n0 */ 16, /* k0 */ 4, /* export_to_cl_image */ false};
+}
+
MatMulKernelInfo ClMatMulNativeDefaultConfigValhall::configure_G710_f32(
unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool rhs_lock_padding, const MatMulInfo &info)
{
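
The new G715 entries are shape-independent: floating-point types get {m0 = 1, n0 = 4, k0 = 1} (f16 simply delegates to f32) and u8 gets {m0 = 1, n0 = 16, k0 = 4}, both without export to CL image. A toy sketch of this lookup, using a simplified struct in place of MatMulKernelInfo:

// Toy sketch of the G715 configuration added above: the returned block sizes
// depend only on the data type, not on the matrix shapes. BlockConfig is a
// simplified stand-in for MatMulKernelInfo.
#include <cstdio>

struct BlockConfig
{
    bool adj_lhs;
    bool adj_rhs;
    int  m0;
    int  n0;
    int  k0;
    bool export_to_cl_image;
};

BlockConfig configure_g715(bool adj_lhs, bool adj_rhs, bool is_u8)
{
    // Values taken from this patch: FP uses {1, 4, 1}, u8 uses {1, 16, 4}.
    if (is_u8)
        return {adj_lhs, adj_rhs, 1, 16, 4, false};
    return {adj_lhs, adj_rhs, 1, 4, 1, false};
}

int main()
{
    const BlockConfig c = configure_g715(false, false, /* is_u8 */ true);
    std::printf("m0=%d n0=%d k0=%d\n", c.m0, c.n0, c.k0);
    return 0;
}
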
diff --git a/src/runtime/heuristics/matmul_native/ClMatMulNativeDefaultConfigValhall.h b/src/runtime/heuristics/matmul_native/ClMatMulNativeDefaultConfigValhall.h
index 6b39db6a3f..5279871057 100644
--- a/src/runtime/heuristics/matmul_native/ClMatMulNativeDefaultConfigValhall.h
+++ b/src/runtime/heuristics/matmul_native/ClMatMulNativeDefaultConfigValhall.h
@@ -21,8 +21,8 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
-#ifndef SRC_RUNTIME_HEURISTICS_MATMUL_NATIVE_CLMATMULNATIVEDEFAULTCONFIGVALHALL
-#define SRC_RUNTIME_HEURISTICS_MATMUL_NATIVE_CLMATMULNATIVEDEFAULTCONFIGVALHALL
+#ifndef ACL_SRC_RUNTIME_HEURISTICS_MATMUL_NATIVE_CLMATMULNATIVEDEFAULTCONFIGVALHALL_H
+#define ACL_SRC_RUNTIME_HEURISTICS_MATMUL_NATIVE_CLMATMULNATIVEDEFAULTCONFIGVALHALL_H
#include "src/runtime/heuristics/matmul_native/IClMatMulNativeKernelConfig.h"
@@ -50,7 +50,13 @@ private:
unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool rhs_lock_padding, const MatMulInfo &info);
MatMulKernelInfo configure_G710_u8(
unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool rhs_lock_padding, const MatMulInfo &info);
+ MatMulKernelInfo configure_G715_f32(
+ unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool rhs_lock_padding, const MatMulInfo &info);
+ MatMulKernelInfo configure_G715_f16(
+ unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool rhs_lock_padding, const MatMulInfo &info);
+ MatMulKernelInfo configure_G715_u8(
+ unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool rhs_lock_padding, const MatMulInfo &info);
};
} // namespace cl_matmul
} // namespace arm_compute
-#endif /* SRC_RUNTIME_HEURISTICS_MATMUL_NATIVE_CLMATMULNATIVEDEFAULTCONFIGVALHALL */
+#endif // ACL_SRC_RUNTIME_HEURISTICS_MATMUL_NATIVE_CLMATMULNATIVEDEFAULTCONFIGVALHALL_H