From c1204c76d40dcaf754fd7d725c432f19a2f368a4 Mon Sep 17 00:00:00 2001
From: Gunes Bayir
Date: Tue, 10 Oct 2023 17:41:56 +0100
Subject: Connect MatMul MMUL kernels to ClMatMul operator

Resolves: COMPMID-6478
Change-Id: I5bc220c3bd00a316776fe14454438cc0dc9049b3
Signed-off-by: Gunes Bayir
Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/10469
Reviewed-by: Gian Marco Iodice
Tested-by: Arm Jenkins
Benchmark: Arm Jenkins
---
 src/gpu/cl/operators/ClFullyConnected.h        |   8 +-
 src/gpu/cl/operators/ClMatMul.cpp              | 146 ++++++++++++++++-----
 src/gpu/cl/operators/ClMatMul.h                |  14 +-
 .../ClMatMulNativeDefaultConfigValhall.cpp     |  28 ++++
 .../ClMatMulNativeDefaultConfigValhall.h       |  12 +-
 5 files changed, 159 insertions(+), 49 deletions(-)

diff --git a/src/gpu/cl/operators/ClFullyConnected.h b/src/gpu/cl/operators/ClFullyConnected.h
index 0621238ab5..72884ff7ad 100644
--- a/src/gpu/cl/operators/ClFullyConnected.h
+++ b/src/gpu/cl/operators/ClFullyConnected.h
@@ -21,14 +21,16 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#ifndef ARM_COMPUTE_CL_FULLY_CONNECTED_H
-#define ARM_COMPUTE_CL_FULLY_CONNECTED_H
+#ifndef ACL_SRC_GPU_CL_OPERATORS_CLFULLYCONNECTED_H
+#define ACL_SRC_GPU_CL_OPERATORS_CLFULLYCONNECTED_H
 
 #include "arm_compute/core/TensorInfo.h"
 #include "arm_compute/function_info/FullyConnectedLayerInfo.h"
 
 #include "src/gpu/cl/ClCompileContext.h"
 #include "src/gpu/cl/IClOperator.h"
+#include "src/gpu/cl/kernels/ClMatMulLowpNativeKernel.h"
+#include "src/gpu/cl/kernels/ClMatMulNativeKernel.h"
 
 #include <memory>
 
@@ -174,4 +176,4 @@ private:
 };
 } // namespace opencl
 } // namespace arm_compute
-#endif /* ARM_COMPUTE_CL_FULLY_CONNECTED_H */
+#endif // ACL_SRC_GPU_CL_OPERATORS_CLFULLYCONNECTED_H
diff --git a/src/gpu/cl/operators/ClMatMul.cpp b/src/gpu/cl/operators/ClMatMul.cpp
index c14b1f2992..9962ee550a 100644
--- a/src/gpu/cl/operators/ClMatMul.cpp
+++ b/src/gpu/cl/operators/ClMatMul.cpp
@@ -28,7 +28,10 @@
 #include "arm_compute/runtime/CL/CLScheduler.h"
 
 #include "src/common/utils/Log.h"
+#include "src/gpu/cl/kernels/ClMatMulLowpNativeKernel.h"
+#include "src/gpu/cl/kernels/ClMatMulLowpNativeMMULKernel.h"
 #include "src/gpu/cl/kernels/ClMatMulNativeKernel.h"
+#include "src/gpu/cl/kernels/ClMatMulNativeMMULKernel.h"
 #include "src/runtime/heuristics/matmul_native/ClMatMulNativeDefaultConfigValhall.h"
 #include "src/runtime/heuristics/matmul_native/ClMatMulNativeKernelConfig.h"
 #include "src/runtime/heuristics/matmul_native/IClMatMulNativeKernelConfig.h"
@@ -39,11 +42,62 @@ namespace arm_compute
 {
 namespace opencl
 {
+namespace
+{
+enum class MatMulKernelType
+{
+    /** Native matrix multiplication for FP types */
+    NATIVE_FP,
+
+    /** Native matrix multiplication for quantized types */
+    NATIVE_QUANTIZED,
+
+    /** Native matrix multiplication using MMUL extension for FP types */
+    NATIVE_MMUL_FP,
+
+    /** Native matrix multiplication using MMUL extension for Quantized types */
+    NATIVE_MMUL_QUANTIZED
+};
+
+MatMulKernelType get_matmul_kernel(const ITensorInfo         *lhs,
+                                   const ITensorInfo         *rhs,
+                                   const MatMulInfo          &matmul_info,
+                                   const ActivationLayerInfo &act_info)
+{
+    ARM_COMPUTE_UNUSED(lhs, rhs, matmul_info, act_info);
+
+    const bool is_quantized      = is_data_type_quantized_asymmetric(lhs->data_type());
+    const bool is_mmul_supported = arm_matrix_multiply_supported(CLKernelLibrary::get().get_device());
+
+    const int k = matmul_info.adj_lhs() ? lhs->tensor_shape().y() : lhs->tensor_shape().x();
+
+    if (is_quantized)
+    {
+        // MMUL kernel works only when K is a multiple of 16
+        if (is_mmul_supported && !act_info.enabled() && k % 16 == 0)
+        {
+            return MatMulKernelType::NATIVE_MMUL_QUANTIZED;
+        }
+
+        return MatMulKernelType::NATIVE_QUANTIZED;
+    }
+    else
+    {
+        // MMUL kernel works only when K is a multiple of 4
+        if (is_mmul_supported && !act_info.enabled() && k % 4 == 0)
+        {
+            return MatMulKernelType::NATIVE_MMUL_FP;
+        }
+
+        return MatMulKernelType::NATIVE_FP;
+    }
+
+    return is_quantized ? MatMulKernelType::NATIVE_QUANTIZED : MatMulKernelType::NATIVE_FP;
+}
+} // namespace
 using namespace arm_compute::opencl::kernels;
 
 ClMatMul::ClMatMul()
-    : _matmul_native_kernel(std::make_unique<ClMatMulNativeKernel>()),
-      _matmul_lowp_native_kernel(std::make_unique<ClMatMulLowpNativeKernel>())
 {
 }
 
@@ -65,10 +119,19 @@ Status ClMatMul::validate(const ITensorInfo *lhs,
 
     const MatMulKernelInfo kernel_info = t->configure(lhs, rhs, matmul_info);
 
-    const bool is_quantized = is_data_type_quantized_asymmetric(lhs->data_type());
-
-    return is_quantized ? ClMatMulLowpNativeKernel::validate(lhs, rhs, nullptr /* bias */, dst, kernel_info, act_info)
-                        : ClMatMulNativeKernel::validate(lhs, rhs, nullptr /* bias */, dst, kernel_info, act_info);
+    switch (get_matmul_kernel(lhs, rhs, matmul_info, act_info))
+    {
+        case MatMulKernelType::NATIVE_FP:
+            return ClMatMulNativeKernel::validate(lhs, rhs, nullptr /* bias */, dst, kernel_info, act_info);
+        case MatMulKernelType::NATIVE_MMUL_FP:
+            return ClMatMulNativeMMULKernel::validate(lhs, rhs, nullptr /* bias */, dst, kernel_info);
+        case MatMulKernelType::NATIVE_QUANTIZED:
+            return ClMatMulLowpNativeKernel::validate(lhs, rhs, nullptr /* bias */, dst, kernel_info, act_info);
+        case MatMulKernelType::NATIVE_MMUL_QUANTIZED:
+            return ClMatMulLowpNativeMMULKernel::validate(lhs, rhs, nullptr /* bias */, dst, kernel_info, act_info);
+        default:
+            ARM_COMPUTE_ERROR("Unsupported MatMul Kernel!");
+    }
 }
 
 void ClMatMul::configure(const CLCompileContext &compile_context,
@@ -84,41 +147,56 @@ void ClMatMul::configure(const CLCompileContext &compile_context,
     // Perform validation step
     ARM_COMPUTE_ERROR_THROW_ON(validate(lhs, rhs, dst, matmul_info));
 
-    _is_quantized = is_data_type_quantized_asymmetric(lhs->data_type());
-
-    const GPUTarget gpu_target = CLScheduler::get().target();
-
-    std::unique_ptr<IClMatMulNativeKernelConfig> t = ClMatMulNativeKernelConfigurationFactory::create(gpu_target);
-
-    MatMulKernelInfo kernel_info = t->configure(lhs, rhs, matmul_info);
-
-    if (_is_quantized)
-    {
-        _matmul_lowp_native_kernel->set_target(gpu_target);
+    const GPUTarget        gpu_target    = CLScheduler::get().target();
+    const auto             kernel_config = ClMatMulNativeKernelConfigurationFactory::create(gpu_target);
+    const MatMulKernelInfo kernel_info   = kernel_config->configure(lhs, rhs, matmul_info);
 
-        // Configure the low-precision native matrix multiply kernel
-        _matmul_lowp_native_kernel->configure(compile_context, lhs, rhs, nullptr /* bias */, dst, kernel_info,
-                                              act_info);
-    }
-    else
+    switch (get_matmul_kernel(lhs, rhs, matmul_info, act_info))
     {
-        _matmul_native_kernel->set_target(gpu_target);
-
-        // Configure the native matrix multiply kernel
-        _matmul_native_kernel->configure(compile_context, lhs, rhs, nullptr /* bias */, dst, kernel_info, act_info);
+        case MatMulKernelType::NATIVE_FP:
+        {
+            auto kernel = std::make_unique<ClMatMulNativeKernel>();
+            kernel->set_target(gpu_target);
+
+            kernel->configure(compile_context, lhs, rhs, nullptr /* bias */, dst, kernel_info, act_info);
+            _matmul_kernel = std::move(kernel);
+        }
+        break;
+        case MatMulKernelType::NATIVE_MMUL_FP:
+        {
+            auto kernel = std::make_unique<ClMatMulNativeMMULKernel>();
+            kernel->set_target(gpu_target);
+
+            kernel->configure(compile_context, lhs, rhs, nullptr /* bias */, dst, kernel_info);
+            _matmul_kernel = std::move(kernel);
+        }
+        break;
+        case MatMulKernelType::NATIVE_QUANTIZED:
+        {
+            auto kernel = std::make_unique<ClMatMulLowpNativeKernel>();
+            kernel->set_target(gpu_target);
+
+            kernel->configure(compile_context, lhs, rhs, nullptr /* bias */, dst, kernel_info, act_info);
+            _matmul_kernel = std::move(kernel);
+        }
+        break;
+        case MatMulKernelType::NATIVE_MMUL_QUANTIZED:
+        {
+            auto kernel = std::make_unique<ClMatMulLowpNativeMMULKernel>();
+            kernel->set_target(gpu_target);
+
+            kernel->configure(compile_context, lhs, rhs, nullptr /* bias */, dst, kernel_info, act_info);
+            _matmul_kernel = std::move(kernel);
+        }
+        break;
+        default:
+            ARM_COMPUTE_ERROR("Unsupported MatMul Kernel!");
     }
 }
 
 void ClMatMul::run(ITensorPack &tensors)
 {
-    if (_is_quantized)
-    {
-        CLScheduler::get().enqueue_op(*_matmul_lowp_native_kernel, tensors, true);
-    }
-    else
-    {
-        CLScheduler::get().enqueue_op(*_matmul_native_kernel, tensors, true);
-    }
+    CLScheduler::get().enqueue_op(*_matmul_kernel, tensors, /* flush */ true);
 }
 } // namespace opencl
 } // namespace arm_compute
diff --git a/src/gpu/cl/operators/ClMatMul.h b/src/gpu/cl/operators/ClMatMul.h
index 64dcf217bd..1733def21c 100644
--- a/src/gpu/cl/operators/ClMatMul.h
+++ b/src/gpu/cl/operators/ClMatMul.h
@@ -21,15 +21,14 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#ifndef ACL_SRC_GPU_CL_OPERATORS_CLMATMUL
-#define ACL_SRC_GPU_CL_OPERATORS_CLMATMUL
+#ifndef ACL_SRC_GPU_CL_OPERATORS_CLMATMUL_H
+#define ACL_SRC_GPU_CL_OPERATORS_CLMATMUL_H
 
 #include "arm_compute/function_info/ActivationLayerInfo.h"
 #include "arm_compute/function_info/MatMulInfo.h"
 
+#include "src/gpu/cl/IClKernel.h"
 #include "src/gpu/cl/IClOperator.h"
-#include "src/gpu/cl/kernels/ClMatMulLowpNativeKernel.h"
-#include "src/gpu/cl/kernels/ClMatMulNativeKernel.h"
 
 #include <memory>
 
@@ -95,11 +94,8 @@ public:
     void run(ITensorPack &tensors) override;
 
 private:
-    std::unique_ptr<kernels::ClMatMulNativeKernel>     _matmul_native_kernel{nullptr};
-    std::unique_ptr<kernels::ClMatMulLowpNativeKernel> _matmul_lowp_native_kernel{nullptr};
-
-    bool _is_quantized{false};
+    std::unique_ptr<opencl::IClKernel> _matmul_kernel{nullptr};
 };
 } // namespace opencl
 } // namespace arm_compute
-#endif /* ACL_SRC_GPU_CL_OPERATORS_CLMATMUL */
+#endif // ACL_SRC_GPU_CL_OPERATORS_CLMATMUL_H
diff --git a/src/runtime/heuristics/matmul_native/ClMatMulNativeDefaultConfigValhall.cpp b/src/runtime/heuristics/matmul_native/ClMatMulNativeDefaultConfigValhall.cpp
index b3c8d891dc..6b641413ce 100644
--- a/src/runtime/heuristics/matmul_native/ClMatMulNativeDefaultConfigValhall.cpp
+++ b/src/runtime/heuristics/matmul_native/ClMatMulNativeDefaultConfigValhall.cpp
@@ -53,9 +53,17 @@ ClMatMulNativeDefaultConfigValhall::configure(const ITensorInfo *lhs, const ITen
         &ClMatMulNativeDefaultConfigValhall::configure_G710_f16,
         &ClMatMulNativeDefaultConfigValhall::configure_G710_u8);
 
+    ClMatMulNativeConfigArray<ConfigurationFunctionExecutorPtr> configs_G715(
+        &ClMatMulNativeDefaultConfigValhall::configure_G715_f32,
+        &ClMatMulNativeDefaultConfigValhall::configure_G715_f16,
+        &ClMatMulNativeDefaultConfigValhall::configure_G715_u8);
+
     ConfigurationFunctionExecutorPtr func = nullptr;
     switch (_target)
     {
+        case GPUTarget::G715:
+            func = configs_G715.get_function(lhs->data_type());
+            break;
        case GPUTarget::G710:
        default:
            func = configs_G710.get_function(lhs->data_type());
@@ -84,6 +92,26 @@ ClMatMulNativeDefaultConfigValhall::configure(const ITensorInfo *lhs, const ITen
 
     return (this->*func)(m, n, k, b, rhs->lock_paddings(), info);
 }
 
+MatMulKernelInfo ClMatMulNativeDefaultConfigValhall::configure_G715_f32(
+    unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool rhs_lock_padding, const MatMulInfo &info)
+{
+    ARM_COMPUTE_UNUSED(m, n, k, b, rhs_lock_padding);
+    return {info.adj_lhs(), info.adj_rhs(), /* m0 */ 1, /* n0 */ 4, /* k0 */ 1, /* export_to_cl_image */ false};
+}
+
+MatMulKernelInfo ClMatMulNativeDefaultConfigValhall::configure_G715_f16(
+    unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool rhs_lock_padding, const MatMulInfo &info)
+{
+    return configure_G715_f32(m, n, k, b, rhs_lock_padding, info);
+}
+
+MatMulKernelInfo ClMatMulNativeDefaultConfigValhall::configure_G715_u8(
+    unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool rhs_lock_padding, const MatMulInfo &info)
+{
+    ARM_COMPUTE_UNUSED(m, n, k, b, rhs_lock_padding);
+    return {info.adj_lhs(), info.adj_rhs(), /* m0 */ 1, /* n0 */ 16, /* k0 */ 4, /* export_to_cl_image */ false};
+}
+
 MatMulKernelInfo ClMatMulNativeDefaultConfigValhall::configure_G710_f32(
     unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool rhs_lock_padding, const MatMulInfo &info)
 {
diff --git a/src/runtime/heuristics/matmul_native/ClMatMulNativeDefaultConfigValhall.h b/src/runtime/heuristics/matmul_native/ClMatMulNativeDefaultConfigValhall.h
index 6b39db6a3f..5279871057 100644
--- a/src/runtime/heuristics/matmul_native/ClMatMulNativeDefaultConfigValhall.h
+++ b/src/runtime/heuristics/matmul_native/ClMatMulNativeDefaultConfigValhall.h
@@ -21,8 +21,8 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#ifndef SRC_RUNTIME_HEURISTICS_MATMUL_NATIVE_CLMATMULNATIVEDEFAULTCONFIGVALHALL
-#define SRC_RUNTIME_HEURISTICS_MATMUL_NATIVE_CLMATMULNATIVEDEFAULTCONFIGVALHALL
+#ifndef ACL_SRC_RUNTIME_HEURISTICS_MATMUL_NATIVE_CLMATMULNATIVEDEFAULTCONFIGVALHALL_H
+#define ACL_SRC_RUNTIME_HEURISTICS_MATMUL_NATIVE_CLMATMULNATIVEDEFAULTCONFIGVALHALL_H
 
 #include "src/runtime/heuristics/matmul_native/IClMatMulNativeKernelConfig.h"
 
@@ -50,7 +50,13 @@ private:
         unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool rhs_lock_padding, const MatMulInfo &info);
     MatMulKernelInfo configure_G710_u8(
         unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool rhs_lock_padding, const MatMulInfo &info);
+    MatMulKernelInfo configure_G715_f32(
+        unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool rhs_lock_padding, const MatMulInfo &info);
+    MatMulKernelInfo configure_G715_f16(
+        unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool rhs_lock_padding, const MatMulInfo &info);
+    MatMulKernelInfo configure_G715_u8(
+        unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool rhs_lock_padding, const MatMulInfo &info);
 };
 } // namespace cl_matmul
 } // namespace arm_compute
-#endif /* SRC_RUNTIME_HEURISTICS_MATMUL_NATIVE_CLMATMULNATIVEDEFAULTCONFIGVALHALL */
+#endif // ACL_SRC_RUNTIME_HEURISTICS_MATMUL_NATIVE_CLMATMULNATIVEDEFAULTCONFIGVALHALL_H
-- 
cgit v1.2.1
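
A standalone sketch of the selection rule this patch adds in ClMatMul.cpp (illustrative only, not part of the patch or of Compute Library): get_matmul_kernel() picks an MMUL variant only when the device exposes the cl_arm_matrix_multiply OpenCL extension, no activation is fused, and the reduction dimension K is a multiple of the MMUL block size (16 for quantized, 4 for floating point); otherwise it falls back to the plain native kernels. The enum values below mirror the patch; select_kernel() and its boolean parameters are hypothetical stand-ins for the ITensorInfo/MatMulInfo queries.

#include <iostream>

// Mirrors the enum added in ClMatMul.cpp.
enum class MatMulKernelType
{
    NATIVE_FP,
    NATIVE_QUANTIZED,
    NATIVE_MMUL_FP,
    NATIVE_MMUL_QUANTIZED
};

// Illustrative stand-in for get_matmul_kernel(): tensor/device queries are
// reduced to plain flags so the dispatch rule can be exercised in isolation.
MatMulKernelType select_kernel(bool is_quantized, bool mmul_supported, bool act_enabled, int k)
{
    // The MMUL kernels consume K in fixed blocks: 16 values when quantized, 4 for FP.
    const int block = is_quantized ? 16 : 4;
    if (mmul_supported && !act_enabled && k % block == 0)
    {
        return is_quantized ? MatMulKernelType::NATIVE_MMUL_QUANTIZED : MatMulKernelType::NATIVE_MMUL_FP;
    }
    return is_quantized ? MatMulKernelType::NATIVE_QUANTIZED : MatMulKernelType::NATIVE_FP;
}

int main()
{
    // K = 64 is a multiple of both 4 and 16, so an MMUL variant is chosen.
    std::cout << (select_kernel(false, true, false, 64) == MatMulKernelType::NATIVE_MMUL_FP) << '\n';  // 1
    // A fused activation forces the fallback to the plain native kernel.
    std::cout << (select_kernel(false, true, true, 64) == MatMulKernelType::NATIVE_FP) << '\n';        // 1
    // Quantized with K = 20 (not a multiple of 16) uses the lowp native kernel.
    std::cout << (select_kernel(true, true, false, 20) == MatMulKernelType::NATIVE_QUANTIZED) << '\n'; // 1
}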