From a50f19346c5b79e2743f882ce0c691c07076f207 Mon Sep 17 00:00:00 2001 From: Pablo Marquez Tello Date: Mon, 8 Mar 2021 17:27:05 +0000 Subject: Updated cpu detection * Added the case in the cpu detection code for Klein cores * Added has_sve() and set_sve() methods in CpuInfo * Detection code checks for presence of SVE via HWCAP_SVE * Updated the heuristic in sve kernels to check for the absence of Klein * Resolves: COMPMID-4085 Change-Id: I0b8c72ff19dc5a3a81628d121a1afa836e724b4f Signed-off-by: Pablo Marquez Tello Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/5257 Reviewed-by: Georgios Pinitas Tested-by: Arm Jenkins Comments-Addressed: Arm Jenkins --- arm_compute/core/CPP/CPPTypes.h | 18 ++++++- scripts/clang_tidy_rules.py | 1 + src/core/CPP/CPPTypes.cpp | 11 +++++ src/core/NEON/kernels/arm_gemm/gemm_bf16.cpp | 12 ++--- src/core/NEON/kernels/arm_gemm/gemm_fp16.cpp | 8 ++-- src/core/NEON/kernels/arm_gemm/gemm_fp32.cpp | 24 +++++----- src/core/NEON/kernels/arm_gemm/gemm_int8.cpp | 16 +++---- src/core/NEON/kernels/arm_gemm/gemm_qint8.cpp | 24 +++++----- src/core/NEON/kernels/arm_gemm/gemm_quint8.cpp | 22 ++++----- src/core/NEON/kernels/arm_gemm/gemm_uint8.cpp | 16 +++---- src/runtime/CPUUtils.cpp | 65 +++++++++++++++++++++++--- 11 files changed, 148 insertions(+), 69 deletions(-) diff --git a/arm_compute/core/CPP/CPPTypes.h b/arm_compute/core/CPP/CPPTypes.h index fd6bfc3907..2de73acaa2 100644 --- a/arm_compute/core/CPP/CPPTypes.h +++ b/arm_compute/core/CPP/CPPTypes.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2020 Arm Limited. + * Copyright (c) 2017-2021 Arm Limited. 
* * SPDX-License-Identifier: MIT * @@ -45,6 +45,7 @@ enum class CPUModel A53, A55r0, A55r1, + KLEIN, X1, A73 }; @@ -76,6 +77,10 @@ inline std::string cpu_model_to_string(CPUModel val) { return std::string("GENERIC"); } + case CPUModel::KLEIN: + { + return std::string("KLEIN"); + } case CPUModel::GENERIC_FP16: { return std::string("GENERIC_FP16"); @@ -136,6 +141,11 @@ public: * @return true of the cpu supports dot product, false otherwise */ bool has_dotprod() const; + /** Checks if the cpu model supports sve. + * + * @return true if the cpu supports sve, false otherwise + */ + bool has_sve() const; /** Gets the cpu model for a given cpuid. * * @param[in] cpuid the id of the cpu core to be retrieved, @@ -178,6 +188,11 @@ public: * @param[in] dotprod whether the cpu supports dot product. */ void set_dotprod(const bool dotprod); + /** Set sve support + * + * @param[in] sve whether the cpu supports sve. + */ + void set_sve(const bool sve); /** Set the cpumodel for a given cpu core * * @param[in] cpuid the id of the core to be set. 
@@ -200,6 +215,7 @@ private: std::vector _percpu = {}; bool _fp16 = false; bool _dotprod = false; + bool _sve = false; unsigned int _L1_cache_size = 32768; unsigned int _L2_cache_size = 262144; }; diff --git a/scripts/clang_tidy_rules.py b/scripts/clang_tidy_rules.py index 0ccf0b2910..2e72c824f0 100755 --- a/scripts/clang_tidy_rules.py +++ b/scripts/clang_tidy_rules.py @@ -66,6 +66,7 @@ def filter_clang_tidy_lines( lines ): ("Utils.h" in line and "no member named 'unmap' in 'arm_compute::Tensor'" in line) or ("Utils.h" in line and "no member named 'map' in 'arm_compute::Tensor'" in line) or ("CPUUtils.cpp" in line and "'asm/hwcap.h' file not found" in line) or + ("CPUUtils.cpp" in line and "use of undeclared identifier 'HWCAP_SVE'" in line) or ("'arm_compute_version.embed' file not found" in line) ): print_context=False continue diff --git a/src/core/CPP/CPPTypes.cpp b/src/core/CPP/CPPTypes.cpp index 139e106ca6..0850df29fd 100644 --- a/src/core/CPP/CPPTypes.cpp +++ b/src/core/CPP/CPPTypes.cpp @@ -42,6 +42,11 @@ void CPUInfo::set_dotprod(const bool dotprod) _dotprod = dotprod; } +void CPUInfo::set_sve(const bool sve) +{ + _sve = sve; +} + void CPUInfo::set_cpu_model(unsigned int cpuid, CPUModel model) { ARM_COMPUTE_ERROR_ON(cpuid >= _percpu.size()); @@ -55,6 +60,12 @@ unsigned int CPUInfo::get_cpu_num() const { return _percpu.size(); } + +bool CPUInfo::has_sve() const +{ + return _sve; +} + bool CPUInfo::has_fp16() const { return _fp16; diff --git a/src/core/NEON/kernels/arm_gemm/gemm_bf16.cpp b/src/core/NEON/kernels/arm_gemm/gemm_bf16.cpp index 96b9734221..d8134c4bb5 100644 --- a/src/core/NEON/kernels/arm_gemm/gemm_bf16.cpp +++ b/src/core/NEON/kernels/arm_gemm/gemm_bf16.cpp @@ -49,22 +49,22 @@ static const GemmImplementation gemm_bf16_methods[] = { // gemm_bf16_interleaved GemmMethod::GEMM_INTERLEAVED, "sve_interleaved_bf16fp32_mmla_8x3VL", - [](const GemmArgs &args) { return (args._Ksize>4); }, - nullptr, + [](const GemmArgs &args) { return args._ci->has_sve() && 
(args._Ksize>4); }, + [](const GemmArgs &args) { return args._ci->get_cpu_model() != CPUModel::KLEIN; }, [](const GemmArgs &args) { return new GemmInterleaved(args); } }, { GemmMethod::GEMM_HYBRID, "sve_hybrid_bf16fp32_dot_6x4VL", - nullptr, - [](const GemmArgs &args) { return ((args._Ksize <= 128) && (args._Nsize <= 128)); }, + [](const GemmArgs &args) { return args._ci->has_sve(); }, + [](const GemmArgs &args) { return args._ci->get_cpu_model() != CPUModel::KLEIN && ((args._Ksize <= 128) && (args._Nsize <= 128)); }, [](const GemmArgs &args) { return new GemmHybridIndirect(args); } }, { // gemm_bf16_interleaved GemmMethod::GEMM_INTERLEAVED, "sve_interleaved_bf16fp32_dot_8x3VL", - [](const GemmArgs &args) { return (args._Ksize>2); }, - nullptr, + [](const GemmArgs &args) { return args._ci->has_sve() && (args._Ksize>2); }, + [](const GemmArgs &args) { return args._ci->get_cpu_model() != CPUModel::KLEIN; }, [](const GemmArgs &args) { return new GemmInterleaved(args); } }, # endif // SVE diff --git a/src/core/NEON/kernels/arm_gemm/gemm_fp16.cpp b/src/core/NEON/kernels/arm_gemm/gemm_fp16.cpp index 93563a63d0..8e355c8f2c 100644 --- a/src/core/NEON/kernels/arm_gemm/gemm_fp16.cpp +++ b/src/core/NEON/kernels/arm_gemm/gemm_fp16.cpp @@ -47,15 +47,15 @@ static const GemmImplementation<__fp16, __fp16> gemm_fp16_methods[] = { { GemmMethod::GEMM_HYBRID, "sve_hybrid_fp16_mla_6x4VL", - nullptr, - [](const GemmArgs &args) { return ((args._Ksize <= 256) && (args._Nsize <= 256)) || ((args._nmulti > 1) && ((args._Msize / args._maxthreads) < 8)); }, + [](const GemmArgs &args) { return args._ci->has_sve(); }, + [](const GemmArgs &args) { return args._ci->get_cpu_model() != CPUModel::KLEIN && (((args._Ksize <= 256) && (args._Nsize <= 256)) || ((args._nmulti > 1) && ((args._Msize / args._maxthreads) < 8))); }, [](const GemmArgs &args) { return new GemmHybridIndirect(args); } }, { GemmMethod::GEMM_INTERLEAVED, "sve_interleaved_fp16_mla_8x3VL", - [](const GemmArgs &args) { return 
(args._Ksize > 4); }, - nullptr, + [](const GemmArgs &args) { return args._ci->has_sve() && (args._Ksize > 4); }, + [](const GemmArgs &args) { return args._ci->get_cpu_model() != CPUModel::KLEIN; }, [](const GemmArgs &args) { return new GemmInterleaved(args); } }, #endif diff --git a/src/core/NEON/kernels/arm_gemm/gemm_fp32.cpp b/src/core/NEON/kernels/arm_gemm/gemm_fp32.cpp index b0e912d188..5c894c01c8 100644 --- a/src/core/NEON/kernels/arm_gemm/gemm_fp32.cpp +++ b/src/core/NEON/kernels/arm_gemm/gemm_fp32.cpp @@ -62,8 +62,8 @@ static const GemmImplementation gemm_fp32_methods[] = { GemmMethod::GEMM_HYBRID, "sve_gemv_fp32_mla_8VL", - [](const GemmArgs &args) { return args._Msize==1 && args._nbatches==1 && !args._indirect_input; }, - nullptr, + [](const GemmArgs &args) { return args._ci->has_sve() && args._Msize==1 && args._nbatches==1 && !args._indirect_input; }, + [](const GemmArgs &args) { return args._ci->get_cpu_model() != CPUModel::KLEIN; }, [](const GemmArgs &args) { return new GemvPretransposed(args); } }, #endif @@ -80,8 +80,8 @@ static const GemmImplementation gemm_fp32_methods[] = { GemmMethod::GEMM_INTERLEAVED, "sve_interleaved_fp32_mmla_8x3VL", - [](const GemmArgs &args) { return (args._Ksize>4); }, - nullptr, + [](const GemmArgs &args) { return args._ci->has_sve() && (args._Ksize>4); }, + [](const GemmArgs &args) { return args._ci->get_cpu_model() != CPUModel::KLEIN; }, [](const GemmArgs &args) { return new GemmInterleaved(args); } }, #endif // __ARM_FEATURE_SVE && MMLA_FP32 @@ -91,22 +91,22 @@ static const GemmImplementation gemm_fp32_methods[] = { GemmMethod::GEMM_HYBRID, "sve_smallK_hybrid_fp32_mla_8x1VL", - [](const GemmArgs &args) { return args._Ksize <= 24 && !args._indirect_input; }, - nullptr, + [](const GemmArgs &args) { return args._ci->has_sve() && args._Ksize <= 24 && !args._indirect_input; }, + [](const GemmArgs &args) { return args._ci->get_cpu_model() != CPUModel::KLEIN; }, [](const GemmArgs &args) { return new GemmHybrid(args); } }, { 
GemmMethod::GEMM_HYBRID, "sve_hybrid_fp32_mla_8x1VL", - nullptr, - [](const GemmArgs &args) { return (args._Nsize < 12); }, + [](const GemmArgs &args) { return args._ci->has_sve(); }, + [](const GemmArgs &args) { return args._ci->get_cpu_model() != CPUModel::KLEIN && (args._Nsize < 12); }, [](const GemmArgs &args) { return new GemmHybridIndirect(args); } }, { GemmMethod::GEMM_HYBRID, "sve_hybrid_fp32_mla_6x4VL", - nullptr, - [](const GemmArgs &args) { return ((args._Ksize <= 256) && (args._Nsize <= 256)) || ((args._nmulti > 1) && ((args._Msize / args._maxthreads) < 8)); }, + [](const GemmArgs &args) { return args._ci->has_sve(); }, + [](const GemmArgs &args) { return args._ci->get_cpu_model() != CPUModel::KLEIN && (((args._Ksize <= 256) && (args._Nsize <= 256)) || ((args._nmulti > 1) && ((args._Msize / args._maxthreads) < 8))); }, [](const GemmArgs &args) { return new GemmHybridIndirect(args); } }, #endif // __ARM_FEATURE_SVE @@ -144,8 +144,8 @@ GemmImplementation::with_estimate( { GemmMethod::GEMM_INTERLEAVED, "sve_interleaved_fp32_mla_8x3VL", - [](const GemmArgs &args) { return (args._Ksize>4); }, - nullptr, + [](const GemmArgs &args) { return args._ci->has_sve() && (args._Ksize>4); }, + [](const GemmArgs &args) { return args._ci->get_cpu_model() != CPUModel::KLEIN; }, [](const GemmArgs &args) { return new GemmInterleaved(args); } }, #endif // __ARM_FEATURE_SVE diff --git a/src/core/NEON/kernels/arm_gemm/gemm_int8.cpp b/src/core/NEON/kernels/arm_gemm/gemm_int8.cpp index f081558c40..60cf82f9c6 100644 --- a/src/core/NEON/kernels/arm_gemm/gemm_int8.cpp +++ b/src/core/NEON/kernels/arm_gemm/gemm_int8.cpp @@ -51,30 +51,30 @@ static const GemmImplementation gemm_s8_methods[] = { { GemmMethod::GEMM_INTERLEAVED, "sve_interleaved_s8s32_mmla_8x3VL", - [](const GemmArgs &args) { return (args._Ksize>8); }, - nullptr, + [](const GemmArgs &args) { return args._ci->has_sve() && (args._Ksize>8); }, + [](const GemmArgs &args) { return args._ci->get_cpu_model() != CPUModel::KLEIN; 
}, [](const GemmArgs &args) { return new GemmInterleaved(args); } }, #endif { GemmMethod::GEMM_HYBRID, "sve_smallK_hybrid_s8s32_dot_8x1VL", - [](const GemmArgs &args) { return args._Ksize<=64 && !args._indirect_input; }, - nullptr, + [](const GemmArgs &args) { return args._ci->has_sve() && args._Ksize<=64 && !args._indirect_input; }, + [](const GemmArgs &args) { return args._ci->get_cpu_model() != CPUModel::KLEIN; }, [](const GemmArgs &args) { return new GemmHybrid(args); } }, { GemmMethod::GEMM_HYBRID, "sve_hybrid_s8s32_dot_6x4VL", - [](const GemmArgs &args) { return args._Ksize>=16; }, - [](const GemmArgs &args) { return ((args._Ksize <= 128) && (args._Nsize <= 128)) || ((args._nmulti > 1) && ((args._Msize / args._maxthreads) < 8)); }, + [](const GemmArgs &args) { return args._ci->has_sve() && args._Ksize>=16; }, + [](const GemmArgs &args) { return args._ci->get_cpu_model() != CPUModel::KLEIN && (((args._Ksize <= 128) && (args._Nsize <= 128)) || ((args._nmulti > 1) && ((args._Msize / args._maxthreads) < 8))); }, [](const GemmArgs &args) { return new GemmHybridIndirect(args); } }, { GemmMethod::GEMM_INTERLEAVED, "sve_interleaved_s8s32_dot_8x3VL", - [](const GemmArgs &args) { return (args._Ksize>4); }, - nullptr, + [](const GemmArgs &args) { return args._ci->has_sve() && (args._Ksize>4); }, + [](const GemmArgs &args) { return args._ci->get_cpu_model() != CPUModel::KLEIN; }, [](const GemmArgs &args) { return new GemmInterleaved(args); } }, #endif // SVE diff --git a/src/core/NEON/kernels/arm_gemm/gemm_qint8.cpp b/src/core/NEON/kernels/arm_gemm/gemm_qint8.cpp index d3a55eba6b..094b6fdff4 100644 --- a/src/core/NEON/kernels/arm_gemm/gemm_qint8.cpp +++ b/src/core/NEON/kernels/arm_gemm/gemm_qint8.cpp @@ -58,46 +58,46 @@ static const GemmImplementation gemm_qint8_methods { GemmMethod::GEMM_INTERLEAVED, "sve_interleaved_s8s32_mmla_8x3VL", - [](const GemmArgs &args, const Requantize32 &) { return (args._Ksize>8); }, - nullptr, + [](const GemmArgs &args, const Requantize32 
&) { return args._ci->has_sve() && (args._Ksize>8); }, + [](const GemmArgs &args, const Requantize32 &) { return args._ci->get_cpu_model() != CPUModel::KLEIN; }, [](const GemmArgs &args, const Requantize32 &qp) { return new GemmInterleavedQuantized(args, qp); } }, #endif { GemmMethod::GEMM_HYBRID_QUANTIZED, "sve_smallK_hybrid_s8s32_dot_8x1VL", - [](const GemmArgs &args, const Requantize32 &) { return args._Ksize<=64 && !args._indirect_input; }, - nullptr, + [](const GemmArgs &args, const Requantize32 &) { return args._ci->has_sve() && args._Ksize<=64 && !args._indirect_input; }, + [](const GemmArgs &args, const Requantize32 &) { return args._ci->get_cpu_model() != CPUModel::KLEIN; }, [](const GemmArgs &args, const Requantize32 &qp) { return new GemmHybridQuantized(args, qp); } }, #ifdef SVE2 { GemmMethod::GEMM_HYBRID, "sve_hybrid_s8qs_dot_6x4VL", - [](const GemmArgs &, const Requantize32 &qp) { return quant_hybrid_symmetric(qp); }, - nullptr, + [](const GemmArgs &args, const Requantize32 &qp) { return args._ci->has_sve() && quant_hybrid_symmetric(qp); }, + [](const GemmArgs &args, const Requantize32 &) { return args._ci->get_cpu_model() != CPUModel::KLEIN; }, [](const GemmArgs &args, const Requantize32 &qp) { return new GemmHybridIndirect(args, qp); } }, { GemmMethod::GEMM_HYBRID, "sve_hybrid_s8qa_dot_4x4VL", - [](const GemmArgs &, const Requantize32 &qp) { return quant_hybrid_asymmetric(qp); }, - nullptr, + [](const GemmArgs &args, const Requantize32 &qp) { return args._ci->has_sve() && quant_hybrid_asymmetric(qp); }, + [](const GemmArgs &args, const Requantize32 &) { return args._ci->get_cpu_model() != CPUModel::KLEIN; }, [](const GemmArgs &args, const Requantize32 &qp) { return new GemmHybridIndirect(args, qp); } }, #endif { GemmMethod::GEMM_HYBRID, "sve_hybrid_s8s32_dot_6x4VL", - nullptr, - nullptr, + [](const GemmArgs &args, const Requantize32 &) { return args._ci->has_sve(); }, + [](const GemmArgs &args, const Requantize32 &) { return 
args._ci->get_cpu_model() != CPUModel::KLEIN; }, [](const GemmArgs &args, const Requantize32 &qp) { return new GemmHybridIndirect(args, qp); } }, { GemmMethod::GEMM_INTERLEAVED, "sve_interleaved_s8s32_dot_8x3VL", - [](const GemmArgs &args, const Requantize32 &) { return (args._Ksize>4); }, - nullptr, + [](const GemmArgs &args, const Requantize32 &) { return args._ci->has_sve() && (args._Ksize>4); }, + [](const GemmArgs &args, const Requantize32 &) { return args._ci->get_cpu_model() != CPUModel::KLEIN; }, [](const GemmArgs &args, const Requantize32 &qp) { return new GemmInterleavedQuantized(args, qp); } }, #endif // SVE diff --git a/src/core/NEON/kernels/arm_gemm/gemm_quint8.cpp b/src/core/NEON/kernels/arm_gemm/gemm_quint8.cpp index 9720c7d06e..be27b3a117 100644 --- a/src/core/NEON/kernels/arm_gemm/gemm_quint8.cpp +++ b/src/core/NEON/kernels/arm_gemm/gemm_quint8.cpp @@ -55,39 +55,39 @@ static const GemmImplementation gemm_quint8_meth { GemmMethod::GEMM_INTERLEAVED, "sve_interleaved_u8u32_mmla_8x3VL", - [](const GemmArgs &args, const Requantize32 &) { return (args._Ksize>8); }, - nullptr, + [](const GemmArgs &args, const Requantize32 &) { return args._ci->has_sve() && (args._Ksize>8); }, + [](const GemmArgs &args, const Requantize32 &) { return args._ci->get_cpu_model() != CPUModel::KLEIN; }, [](const GemmArgs &args, const Requantize32 &qp) { return new GemmInterleavedQuantized(args, qp); } }, #endif { GemmMethod::GEMM_HYBRID_QUANTIZED, "sve_smallK_hybrid_u8u32_dot_8x1VL", - [](const GemmArgs &args, const Requantize32 &) { return args._Ksize<=64 && !args._indirect_input; }, - nullptr, + [](const GemmArgs &args, const Requantize32 &) { return args._ci->has_sve() && args._Ksize<=64 && !args._indirect_input; }, + [](const GemmArgs &args, const Requantize32 &) { return args._ci->get_cpu_model() != CPUModel::KLEIN; }, [](const GemmArgs &args, const Requantize32 &qp) { return new GemmHybridQuantized(args, qp); } }, #ifdef SVE2 // Requantizing kernels include some SVE2 only 
instructions (SQRDMULH, SRSHL) { GemmMethod::GEMM_HYBRID, "sve_hybrid_u8qa_dot_4x4VL", - [](const GemmArgs &, const Requantize32 &qp) { return quant_hybrid_asymmetric(qp); }, - nullptr, + [](const GemmArgs &args, const Requantize32 &qp) { return args._ci->has_sve() && quant_hybrid_asymmetric(qp); }, + [](const GemmArgs &args, const Requantize32 &) { return args._ci->get_cpu_model() != CPUModel::KLEIN; }, [](const GemmArgs &args, const Requantize32 &qp) { return new GemmHybridIndirect(args, qp); } }, #endif { GemmMethod::GEMM_HYBRID, "sve_hybrid_u8u32_dot_6x4VL", - nullptr, - nullptr, + [](const GemmArgs &args, const Requantize32 &) { return args._ci->has_sve(); }, + [](const GemmArgs &args, const Requantize32 &) { return args._ci->get_cpu_model() != CPUModel::KLEIN; }, [](const GemmArgs &args, const Requantize32 &qp) { return new GemmHybridIndirect(args, qp); } }, { GemmMethod::GEMM_INTERLEAVED, "sve_interleaved_u8u32_dot_8x3VL", - [](const GemmArgs &args, const Requantize32 &) { return (args._Ksize>4); }, - nullptr, + [](const GemmArgs &args, const Requantize32 &) { return args._ci->has_sve() && (args._Ksize>4); }, + [](const GemmArgs &args, const Requantize32 &) { return args._ci->get_cpu_model() != CPUModel::KLEIN; }, [](const GemmArgs &args, const Requantize32 &qp) { return new GemmInterleavedQuantized(args, qp); } }, #endif @@ -96,7 +96,7 @@ static const GemmImplementation gemm_quint8_meth GemmMethod::GEMM_INTERLEAVED, "a64_interleaved_u8u32_mmla_8x12", [](const GemmArgs &args, const Requantize32 &) { return (args._Ksize>8); }, - nullptr, + [](const GemmArgs &args, const Requantize32 &) { return args._ci->get_cpu_model() != CPUModel::KLEIN; }, [](const GemmArgs &args, const Requantize32 &qp) { return new GemmInterleavedQuantized(args, qp); } }, #endif diff --git a/src/core/NEON/kernels/arm_gemm/gemm_uint8.cpp b/src/core/NEON/kernels/arm_gemm/gemm_uint8.cpp index d2ebe00f3b..fb41a9fc09 100644 --- a/src/core/NEON/kernels/arm_gemm/gemm_uint8.cpp +++ 
b/src/core/NEON/kernels/arm_gemm/gemm_uint8.cpp @@ -51,30 +51,30 @@ static const GemmImplementation gemm_u8_methods[] = { { GemmMethod::GEMM_INTERLEAVED, "sve_interleaved_u8u32_mmla_8x3VL", - [](const GemmArgs &args) { return (args._Ksize>8); }, - nullptr, + [](const GemmArgs &args) { return args._ci->has_sve() && (args._Ksize>8); }, + [](const GemmArgs &args) { return args._ci->get_cpu_model() != CPUModel::KLEIN; }, [](const GemmArgs &args) { return new GemmInterleaved(args); } }, #endif { GemmMethod::GEMM_HYBRID, "smallK_hybrid_u8u32_dot_8x1VL", - [](const GemmArgs &args) { return args._Ksize<=64 && !args._indirect_input; }, - nullptr, + [](const GemmArgs &args) { return args._ci->has_sve() && args._Ksize<=64 && !args._indirect_input; }, + [](const GemmArgs &args) { return args._ci->get_cpu_model() != CPUModel::KLEIN; }, [](const GemmArgs &args) { return new GemmHybrid(args); } }, { GemmMethod::GEMM_HYBRID, "sve_hybrid_u8u32_dot_6x4VL", - nullptr, - [](const GemmArgs &args) { return ((args._Ksize <= 128) && (args._Nsize <= 128)) || ((args._nmulti > 1) && ((args._Msize / args._maxthreads) < 8)); }, + [](const GemmArgs &args) { return args._ci->has_sve(); }, + [](const GemmArgs &args) { return args._ci->get_cpu_model() != CPUModel::KLEIN && (((args._Ksize <= 128) && (args._Nsize <= 128)) || ((args._nmulti > 1) && ((args._Msize / args._maxthreads) < 8))); }, [](const GemmArgs &args) { return new GemmHybridIndirect(args); } }, { GemmMethod::GEMM_INTERLEAVED, "sve_interleaved_u8u32_dot_8x3VL", - [](const GemmArgs &args) { return (args._Ksize>4); }, - nullptr, + [](const GemmArgs &args) { return args._ci->has_sve() && (args._Ksize>4); }, + [](const GemmArgs &args) { return args._ci->get_cpu_model() != CPUModel::KLEIN; }, [](const GemmArgs &args) { return new GemmInterleaved(args); } }, #endif diff --git a/src/runtime/CPUUtils.cpp b/src/runtime/CPUUtils.cpp index 63c9a8639c..82b42336e6 100644 --- a/src/runtime/CPUUtils.cpp +++ b/src/runtime/CPUUtils.cpp @@ -62,12 +62,27 
@@ #define HWCAP_ASIMDDP (1 << 20) // NOLINT #endif /* HWCAP_ASIMDDP */ +#ifndef HWCAP_SVE +#define HWCAP_SVE (1 << 22) // NOLINT +#endif /* HWCAP_SVE */ + namespace { using namespace arm_compute; #if !defined(BARE_METAL) && !defined(__APPLE__) && (defined(__arm__) || defined(__aarch64__)) +bool model_supports_sve(CPUModel model) +{ + switch(model) + { + case CPUModel::KLEIN: + return true; + default: + return false; + } +} + bool model_supports_dot(CPUModel model) { switch(model) @@ -75,6 +90,7 @@ bool model_supports_dot(CPUModel model) case CPUModel::GENERIC_FP16_DOT: case CPUModel::A55r1: case CPUModel::X1: + case CPUModel::KLEIN: return true; default: return false; @@ -89,6 +105,7 @@ bool model_supports_fp16(CPUModel model) case CPUModel::GENERIC_FP16_DOT: case CPUModel::A55r1: case CPUModel::X1: + case CPUModel::KLEIN: return true; default: return false; @@ -146,6 +163,9 @@ CPUModel midr_to_model(const unsigned int midr) case 0xd0d: model = CPUModel::GENERIC_FP16_DOT; break; + case 0xd46: + model = CPUModel::KLEIN; + break; default: model = CPUModel::GENERIC; break; @@ -369,11 +389,11 @@ namespace cpu void get_cpu_configuration(CPUInfo &cpuinfo) { #if !defined(BARE_METAL) && !defined(__APPLE__) && (defined(__arm__) || defined(__aarch64__)) - bool cpuid = false; - bool hwcaps_fp16_support = false; - bool hwcaps_dot_support = false; - - const uint32_t hwcaps = getauxval(AT_HWCAP); + bool cpuid = false; + bool hwcaps_fp16_support = false; + bool hwcaps_dot_support = false; + bool hwcaps_sve = false; + const uint32_t hwcaps = getauxval(AT_HWCAP); if((hwcaps & HWCAP_CPUID) != 0) { @@ -390,6 +410,11 @@ void get_cpu_configuration(CPUInfo &cpuinfo) { hwcaps_dot_support = true; } + + if((hwcaps & HWCAP_SVE) != 0) + { + hwcaps_sve = true; + } #endif /* defined(__aarch64__) */ const unsigned int max_cpus = get_max_cpus(); @@ -408,17 +433,43 @@ void get_cpu_configuration(CPUInfo &cpuinfo) // We assume that the system does not have mixed architectures bool one_supports_dot 
= false; bool one_supports_fp16 = false; + bool one_supports_sve = false; for(const auto &v : percpu) { one_supports_dot = one_supports_dot || model_supports_dot(v); one_supports_fp16 = one_supports_fp16 || model_supports_fp16(v); + one_supports_sve = one_supports_sve || model_supports_sve(v); cpuinfo.set_cpu_model(j++, v); } cpuinfo.set_dotprod(one_supports_dot || hwcaps_dot_support); cpuinfo.set_fp16(one_supports_fp16 || hwcaps_fp16_support); -#else /* !defined(BARE_METAL) && !defined(__APPLE__) && (defined(__arm__) || defined(__aarch64__)) */ + cpuinfo.set_sve(one_supports_sve || hwcaps_sve); +#elif defined(BARE_METAL) && defined(__aarch64__) /* !defined(BARE_METAL) && !defined(__APPLE__) && (defined(__arm__) || defined(__aarch64__)) */ + cpuinfo.set_cpu_num(1); + const CPUModel cpumodel{ CPUModel::GENERIC }; + cpuinfo.set_cpu_model(0, cpumodel); + // Assume single CPU in bare metal mode. Just read the ID register and feature bits directly. + uint64_t fr0, pfr0, midr; + __asm __volatile( + "MRS %0, ID_AA64ISAR0_EL1\n" + "MRS %1, ID_AA64PFR0_EL1\n" + "MRS %2, midr_el1" + : "=r"(fr0), "=r"(pfr0), "=r"(midr)); + if((fr0 >> 44) & 0xf) + { + cpuinfo.set_dotprod(true); + } + if((pfr0 >> 16) & 0xf) + { + cpuinfo.set_fp16(true); + } + if((pfr0 >> 32) & 0xf) + { + cpuinfo.set_sve(true); + } +#else /* #elif defined(BARE_METAL) && defined(__aarch64__) */ ARM_COMPUTE_UNUSED(cpuinfo); -#endif /* !defined(BARE_METAL) && !defined(__APPLE__) && (defined(__arm__) || defined(__aarch64__)) */ +#endif /* !defined(BARE_METAL) && !defined(__APPLE__) && (defined(__arm__) || defined(__aarch64__)) */ } unsigned int get_threads_hint() -- cgit v1.2.1