diff options
author | Pablo Marquez Tello <pablo.tello@arm.com> | 2021-03-08 17:27:05 +0000 |
---|---|---|
committer | Pablo Marquez Tello <pablo.tello@arm.com> | 2021-03-17 12:45:26 +0000 |
commit | a50f19346c5b79e2743f882ce0c691c07076f207 (patch) | |
tree | 40141711eae786bc65738f04baa4e17cd6a20d97 | |
parent | d0c9cb808f674ce8bbfbdf0e66c5b8451f6af0f2 (diff) | |
download | ComputeLibrary-a50f19346c5b79e2743f882ce0c691c07076f207.tar.gz |
Updated cpu detection
* Added the case in the cpu detection code for Klein cores
* Added has_sve() and set_sve() methods in CpuInfo
* Detection code checks for presence of SVE via HWCAP_SVE
* Updated the heuristic in sve kernels to check for the absence of Klein
* Resolves: COMPMID-4085
Change-Id: I0b8c72ff19dc5a3a81628d121a1afa836e724b4f
Signed-off-by: Pablo Marquez Tello <pablo.tello@arm.com>
Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/5257
Reviewed-by: Georgios Pinitas <georgios.pinitas@arm.com>
Tested-by: Arm Jenkins <bsgcomp@arm.com>
Comments-Addressed: Arm Jenkins <bsgcomp@arm.com>
-rw-r--r-- | arm_compute/core/CPP/CPPTypes.h | 18 | ||||
-rwxr-xr-x | scripts/clang_tidy_rules.py | 1 | ||||
-rw-r--r-- | src/core/CPP/CPPTypes.cpp | 11 | ||||
-rw-r--r-- | src/core/NEON/kernels/arm_gemm/gemm_bf16.cpp | 12 | ||||
-rw-r--r-- | src/core/NEON/kernels/arm_gemm/gemm_fp16.cpp | 8 | ||||
-rw-r--r-- | src/core/NEON/kernels/arm_gemm/gemm_fp32.cpp | 24 | ||||
-rw-r--r-- | src/core/NEON/kernels/arm_gemm/gemm_int8.cpp | 16 | ||||
-rw-r--r-- | src/core/NEON/kernels/arm_gemm/gemm_qint8.cpp | 24 | ||||
-rw-r--r-- | src/core/NEON/kernels/arm_gemm/gemm_quint8.cpp | 22 | ||||
-rw-r--r-- | src/core/NEON/kernels/arm_gemm/gemm_uint8.cpp | 16 | ||||
-rw-r--r-- | src/runtime/CPUUtils.cpp | 65 |
11 files changed, 148 insertions, 69 deletions
diff --git a/arm_compute/core/CPP/CPPTypes.h b/arm_compute/core/CPP/CPPTypes.h index fd6bfc3907..2de73acaa2 100644 --- a/arm_compute/core/CPP/CPPTypes.h +++ b/arm_compute/core/CPP/CPPTypes.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2020 Arm Limited. + * Copyright (c) 2017-2021 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -45,6 +45,7 @@ enum class CPUModel A53, A55r0, A55r1, + KLEIN, X1, A73 }; @@ -76,6 +77,10 @@ inline std::string cpu_model_to_string(CPUModel val) { return std::string("GENERIC"); } + case CPUModel::KLEIN: + { + return std::string("KLEIN"); + } case CPUModel::GENERIC_FP16: { return std::string("GENERIC_FP16"); @@ -136,6 +141,11 @@ public: * @return true of the cpu supports dot product, false otherwise */ bool has_dotprod() const; + /** Checks if the cpu model supports sve. + * + * @return true of the cpu supports sve, false otherwise + */ + bool has_sve() const; /** Gets the cpu model for a given cpuid. * * @param[in] cpuid the id of the cpu core to be retrieved, @@ -178,6 +188,11 @@ public: * @param[in] dotprod whether the cpu supports dot product. */ void set_dotprod(const bool dotprod); + /** Set sve support + * + * @param[in] sve whether the cpu supports sve. + */ + void set_sve(const bool sve); /** Set the cpumodel for a given cpu core * * @param[in] cpuid the id of the core to be set. 
@@ -200,6 +215,7 @@ private: std::vector<CPUModel> _percpu = {}; bool _fp16 = false; bool _dotprod = false; + bool _sve = false; unsigned int _L1_cache_size = 32768; unsigned int _L2_cache_size = 262144; }; diff --git a/scripts/clang_tidy_rules.py b/scripts/clang_tidy_rules.py index 0ccf0b2910..2e72c824f0 100755 --- a/scripts/clang_tidy_rules.py +++ b/scripts/clang_tidy_rules.py @@ -66,6 +66,7 @@ def filter_clang_tidy_lines( lines ): ("Utils.h" in line and "no member named 'unmap' in 'arm_compute::Tensor'" in line) or ("Utils.h" in line and "no member named 'map' in 'arm_compute::Tensor'" in line) or ("CPUUtils.cpp" in line and "'asm/hwcap.h' file not found" in line) or + ("CPUUtils.cpp" in line and "use of undeclared identifier 'HWCAP_SVE'" in line) or ("'arm_compute_version.embed' file not found" in line) ): print_context=False continue diff --git a/src/core/CPP/CPPTypes.cpp b/src/core/CPP/CPPTypes.cpp index 139e106ca6..0850df29fd 100644 --- a/src/core/CPP/CPPTypes.cpp +++ b/src/core/CPP/CPPTypes.cpp @@ -42,6 +42,11 @@ void CPUInfo::set_dotprod(const bool dotprod) _dotprod = dotprod; } +void CPUInfo::set_sve(const bool sve) +{ + _sve = sve; +} + void CPUInfo::set_cpu_model(unsigned int cpuid, CPUModel model) { ARM_COMPUTE_ERROR_ON(cpuid >= _percpu.size()); @@ -55,6 +60,12 @@ unsigned int CPUInfo::get_cpu_num() const { return _percpu.size(); } + +bool CPUInfo::has_sve() const +{ + return _sve; +} + bool CPUInfo::has_fp16() const { return _fp16; diff --git a/src/core/NEON/kernels/arm_gemm/gemm_bf16.cpp b/src/core/NEON/kernels/arm_gemm/gemm_bf16.cpp index 96b9734221..d8134c4bb5 100644 --- a/src/core/NEON/kernels/arm_gemm/gemm_bf16.cpp +++ b/src/core/NEON/kernels/arm_gemm/gemm_bf16.cpp @@ -49,22 +49,22 @@ static const GemmImplementation<bfloat16, float> gemm_bf16_methods[] = { // gemm_bf16_interleaved GemmMethod::GEMM_INTERLEAVED, "sve_interleaved_bf16fp32_mmla_8x3VL", - [](const GemmArgs &args) { return (args._Ksize>4); }, - nullptr, + [](const GemmArgs &args) { 
return args._ci->has_sve() && (args._Ksize>4); }, + [](const GemmArgs &args) { return args._ci->get_cpu_model() != CPUModel::KLEIN; }, [](const GemmArgs &args) { return new GemmInterleaved<cls_sve_interleaved_bf16fp32_mmla_8x3VL, bfloat16, float>(args); } }, { GemmMethod::GEMM_HYBRID, "sve_hybrid_bf16fp32_dot_6x4VL", - nullptr, - [](const GemmArgs &args) { return ((args._Ksize <= 128) && (args._Nsize <= 128)); }, + [](const GemmArgs &args) { return args._ci->has_sve(); }, + [](const GemmArgs &args) { return args._ci->get_cpu_model() != CPUModel::KLEIN && ((args._Ksize <= 128) && (args._Nsize <= 128)); }, [](const GemmArgs &args) { return new GemmHybridIndirect<cls_sve_hybrid_bf16fp32_dot_6x4VL, bfloat16, float>(args); } }, { // gemm_bf16_interleaved GemmMethod::GEMM_INTERLEAVED, "sve_interleaved_bf16fp32_dot_8x3VL", - [](const GemmArgs &args) { return (args._Ksize>2); }, - nullptr, + [](const GemmArgs &args) { return args._ci->has_sve() && (args._Ksize>2); }, + [](const GemmArgs &args) { return args._ci->get_cpu_model() != CPUModel::KLEIN; }, [](const GemmArgs &args) { return new GemmInterleaved<cls_sve_interleaved_bf16fp32_dot_8x3VL, bfloat16, float>(args); } }, # endif // SVE diff --git a/src/core/NEON/kernels/arm_gemm/gemm_fp16.cpp b/src/core/NEON/kernels/arm_gemm/gemm_fp16.cpp index 93563a63d0..8e355c8f2c 100644 --- a/src/core/NEON/kernels/arm_gemm/gemm_fp16.cpp +++ b/src/core/NEON/kernels/arm_gemm/gemm_fp16.cpp @@ -47,15 +47,15 @@ static const GemmImplementation<__fp16, __fp16> gemm_fp16_methods[] = { { GemmMethod::GEMM_HYBRID, "sve_hybrid_fp16_mla_6x4VL", - nullptr, - [](const GemmArgs &args) { return ((args._Ksize <= 256) && (args._Nsize <= 256)) || ((args._nmulti > 1) && ((args._Msize / args._maxthreads) < 8)); }, + [](const GemmArgs &args) { return args._ci->has_sve(); }, + [](const GemmArgs &args) { return args._ci->get_cpu_model() != CPUModel::KLEIN && (((args._Ksize <= 256) && (args._Nsize <= 256)) || ((args._nmulti > 1) && ((args._Msize / 
args._maxthreads) < 8))); }, [](const GemmArgs &args) { return new GemmHybridIndirect<cls_a64_hybrid_fp16_mla_6x32, __fp16, __fp16>(args); } }, { GemmMethod::GEMM_INTERLEAVED, "sve_interleaved_fp16_mla_8x3VL", - [](const GemmArgs &args) { return (args._Ksize > 4); }, - nullptr, + [](const GemmArgs &args) { return args._ci->has_sve() && (args._Ksize > 4); }, + [](const GemmArgs &args) { return args._ci->get_cpu_model() != CPUModel::KLEIN; }, [](const GemmArgs &args) { return new GemmInterleaved<cls_sve_interleaved_fp16_mla_8x3VL, __fp16, __fp16>(args); } }, #endif diff --git a/src/core/NEON/kernels/arm_gemm/gemm_fp32.cpp b/src/core/NEON/kernels/arm_gemm/gemm_fp32.cpp index b0e912d188..5c894c01c8 100644 --- a/src/core/NEON/kernels/arm_gemm/gemm_fp32.cpp +++ b/src/core/NEON/kernels/arm_gemm/gemm_fp32.cpp @@ -62,8 +62,8 @@ static const GemmImplementation<float, float> gemm_fp32_methods[] = { GemmMethod::GEMM_HYBRID, "sve_gemv_fp32_mla_8VL", - [](const GemmArgs &args) { return args._Msize==1 && args._nbatches==1 && !args._indirect_input; }, - nullptr, + [](const GemmArgs &args) { return args._ci->has_sve() && args._Msize==1 && args._nbatches==1 && !args._indirect_input; }, + [](const GemmArgs &args) { return args._ci->get_cpu_model() != CPUModel::KLEIN; }, [](const GemmArgs &args) { return new GemvPretransposed<cls_sve_gemv_fp32_mla_8VL, float, float>(args); } }, #endif @@ -80,8 +80,8 @@ static const GemmImplementation<float, float> gemm_fp32_methods[] = { GemmMethod::GEMM_INTERLEAVED, "sve_interleaved_fp32_mmla_8x3VL", - [](const GemmArgs &args) { return (args._Ksize>4); }, - nullptr, + [](const GemmArgs &args) { return args._ci->has_sve() && (args._Ksize>4); }, + [](const GemmArgs &args) { return args._ci->get_cpu_model() != CPUModel::KLEIN; }, [](const GemmArgs &args) { return new GemmInterleaved<cls_sve_interleaved_fp32_mmla_8x3VL, float, float>(args); } }, #endif // __ARM_FEATURE_SVE && MMLA_FP32 @@ -91,22 +91,22 @@ static const GemmImplementation<float, float> 
gemm_fp32_methods[] = { GemmMethod::GEMM_HYBRID, "sve_smallK_hybrid_fp32_mla_8x1VL", - [](const GemmArgs &args) { return args._Ksize <= 24 && !args._indirect_input; }, - nullptr, + [](const GemmArgs &args) { return args._ci->has_sve() && args._Ksize <= 24 && !args._indirect_input; }, + [](const GemmArgs &args) { return args._ci->get_cpu_model() != CPUModel::KLEIN; }, [](const GemmArgs &args) { return new GemmHybrid<cls_sve_smallK_hybrid_fp32_mla_8x1VL, float, float>(args); } }, { GemmMethod::GEMM_HYBRID, "sve_hybrid_fp32_mla_8x1VL", - nullptr, - [](const GemmArgs &args) { return (args._Nsize < 12); }, + [](const GemmArgs &args) { return args._ci->has_sve(); }, + [](const GemmArgs &args) { return args._ci->get_cpu_model() != CPUModel::KLEIN && (args._Nsize < 12); }, [](const GemmArgs &args) { return new GemmHybridIndirect<cls_sve_hybrid_fp32_mla_8x1VL, float, float>(args); } }, { GemmMethod::GEMM_HYBRID, "sve_hybrid_fp32_mla_6x4VL", - nullptr, - [](const GemmArgs &args) { return ((args._Ksize <= 256) && (args._Nsize <= 256)) || ((args._nmulti > 1) && ((args._Msize / args._maxthreads) < 8)); }, + [](const GemmArgs &args) { return args._ci->has_sve(); }, + [](const GemmArgs &args) { return args._ci->get_cpu_model() != CPUModel::KLEIN && (((args._Ksize <= 256) && (args._Nsize <= 256)) || ((args._nmulti > 1) && ((args._Msize / args._maxthreads) < 8))); }, [](const GemmArgs &args) { return new GemmHybridIndirect<cls_sve_hybrid_fp32_mla_6x4VL, float, float>(args); } }, #endif // __ARM_FEATURE_SVE @@ -144,8 +144,8 @@ GemmImplementation<float, float>::with_estimate( { GemmMethod::GEMM_INTERLEAVED, "sve_interleaved_fp32_mla_8x3VL", - [](const GemmArgs &args) { return (args._Ksize>4); }, - nullptr, + [](const GemmArgs &args) { return args._ci->has_sve() && (args._Ksize>4); }, + [](const GemmArgs &args) { return args._ci->get_cpu_model() != CPUModel::KLEIN; }, [](const GemmArgs &args) { return new GemmInterleaved<cls_sve_interleaved_fp32_mla_8x3VL, float, float>(args); } }, 
#endif // __ARM_FEATURE_SVE diff --git a/src/core/NEON/kernels/arm_gemm/gemm_int8.cpp b/src/core/NEON/kernels/arm_gemm/gemm_int8.cpp index f081558c40..60cf82f9c6 100644 --- a/src/core/NEON/kernels/arm_gemm/gemm_int8.cpp +++ b/src/core/NEON/kernels/arm_gemm/gemm_int8.cpp @@ -51,30 +51,30 @@ static const GemmImplementation<int8_t, int32_t> gemm_s8_methods[] = { { GemmMethod::GEMM_INTERLEAVED, "sve_interleaved_s8s32_mmla_8x3VL", - [](const GemmArgs &args) { return (args._Ksize>8); }, - nullptr, + [](const GemmArgs &args) { return args._ci->has_sve() && (args._Ksize>8); }, + [](const GemmArgs &args) { return args._ci->get_cpu_model() != CPUModel::KLEIN; }, [](const GemmArgs &args) { return new GemmInterleaved<cls_sve_interleaved_s8s32_mmla_8x3VL, int8_t, int32_t>(args); } }, #endif { GemmMethod::GEMM_HYBRID, "sve_smallK_hybrid_s8s32_dot_8x1VL", - [](const GemmArgs &args) { return args._Ksize<=64 && !args._indirect_input; }, - nullptr, + [](const GemmArgs &args) { return args._ci->has_sve() && args._Ksize<=64 && !args._indirect_input; }, + [](const GemmArgs &args) { return args._ci->get_cpu_model() != CPUModel::KLEIN; }, [](const GemmArgs &args) { return new GemmHybrid<cls_sve_smallK_hybrid_s8s32_dot_8x1VL, int8_t, int32_t>(args); } }, { GemmMethod::GEMM_HYBRID, "sve_hybrid_s8s32_dot_6x4VL", - [](const GemmArgs &args) { return args._Ksize>=16; }, - [](const GemmArgs &args) { return ((args._Ksize <= 128) && (args._Nsize <= 128)) || ((args._nmulti > 1) && ((args._Msize / args._maxthreads) < 8)); }, + [](const GemmArgs &args) { return args._ci->has_sve() && args._Ksize>=16; }, + [](const GemmArgs &args) { return args._ci->get_cpu_model() != CPUModel::KLEIN && (((args._Ksize <= 128) && (args._Nsize <= 128)) || ((args._nmulti > 1) && ((args._Msize / args._maxthreads) < 8))); }, [](const GemmArgs &args) { return new GemmHybridIndirect<cls_sve_hybrid_s8s32_dot_6x4VL, int8_t, int32_t>(args); } }, { GemmMethod::GEMM_INTERLEAVED, "sve_interleaved_s8s32_dot_8x3VL", - [](const 
GemmArgs &args) { return (args._Ksize>4); }, - nullptr, + [](const GemmArgs &args) { return args._ci->has_sve() && (args._Ksize>4); }, + [](const GemmArgs &args) { return args._ci->get_cpu_model() != CPUModel::KLEIN; }, [](const GemmArgs &args) { return new GemmInterleaved<cls_sve_interleaved_s8s32_dot_8x3VL, int8_t, int32_t>(args); } }, #endif // SVE diff --git a/src/core/NEON/kernels/arm_gemm/gemm_qint8.cpp b/src/core/NEON/kernels/arm_gemm/gemm_qint8.cpp index d3a55eba6b..094b6fdff4 100644 --- a/src/core/NEON/kernels/arm_gemm/gemm_qint8.cpp +++ b/src/core/NEON/kernels/arm_gemm/gemm_qint8.cpp @@ -58,46 +58,46 @@ static const GemmImplementation<int8_t, int8_t, Requantize32> gemm_qint8_methods { GemmMethod::GEMM_INTERLEAVED, "sve_interleaved_s8s32_mmla_8x3VL", - [](const GemmArgs &args, const Requantize32 &) { return (args._Ksize>8); }, - nullptr, + [](const GemmArgs &args, const Requantize32 &) { return args._ci->has_sve() && (args._Ksize>8); }, + [](const GemmArgs &args, const Requantize32 &) { return args._ci->get_cpu_model() != CPUModel::KLEIN; }, [](const GemmArgs &args, const Requantize32 &qp) { return new GemmInterleavedQuantized<cls_sve_interleaved_s8s32_mmla_8x3VL, int8_t, int8_t>(args, qp); } }, #endif { GemmMethod::GEMM_HYBRID_QUANTIZED, "sve_smallK_hybrid_s8s32_dot_8x1VL", - [](const GemmArgs &args, const Requantize32 &) { return args._Ksize<=64 && !args._indirect_input; }, - nullptr, + [](const GemmArgs &args, const Requantize32 &) { return args._ci->has_sve() && args._Ksize<=64 && !args._indirect_input; }, + [](const GemmArgs &args, const Requantize32 &) { return args._ci->get_cpu_model() != CPUModel::KLEIN; }, [](const GemmArgs &args, const Requantize32 &qp) { return new GemmHybridQuantized<cls_sve_smallK_hybrid_s8s32_dot_8x1VL, int8_t, int8_t>(args, qp); } }, #ifdef SVE2 { GemmMethod::GEMM_HYBRID, "sve_hybrid_s8qs_dot_6x4VL", - [](const GemmArgs &, const Requantize32 &qp) { return quant_hybrid_symmetric(qp); }, - nullptr, + [](const GemmArgs &args, 
const Requantize32 &qp) { return args._ci->has_sve() && quant_hybrid_symmetric(qp); }, + [](const GemmArgs &args, const Requantize32 &) { return args._ci->get_cpu_model() != CPUModel::KLEIN; }, [](const GemmArgs &args, const Requantize32 &qp) { return new GemmHybridIndirect<cls_sve_hybrid_s8qs_dot_6x4VL, int8_t, int8_t, Requantize32>(args, qp); } }, { GemmMethod::GEMM_HYBRID, "sve_hybrid_s8qa_dot_4x4VL", - [](const GemmArgs &, const Requantize32 &qp) { return quant_hybrid_asymmetric(qp); }, - nullptr, + [](const GemmArgs &args, const Requantize32 &qp) { return args._ci->has_sve() && quant_hybrid_asymmetric(qp); }, + [](const GemmArgs &args, const Requantize32 &) { return args._ci->get_cpu_model() != CPUModel::KLEIN; }, [](const GemmArgs &args, const Requantize32 &qp) { return new GemmHybridIndirect<cls_sve_hybrid_s8qa_dot_4x4VL, int8_t, int8_t, Requantize32>(args, qp); } }, #endif { GemmMethod::GEMM_HYBRID, "sve_hybrid_s8s32_dot_6x4VL", - nullptr, - nullptr, + [](const GemmArgs &args, const Requantize32 &) { return args._ci->has_sve(); }, + [](const GemmArgs &args, const Requantize32 &) { return args._ci->get_cpu_model() != CPUModel::KLEIN; }, [](const GemmArgs &args, const Requantize32 &qp) { return new GemmHybridIndirect<cls_sve_hybrid_s8s32_dot_6x4VL, int8_t, int8_t, Requantize32, true>(args, qp); } }, { GemmMethod::GEMM_INTERLEAVED, "sve_interleaved_s8s32_dot_8x3VL", - [](const GemmArgs &args, const Requantize32 &) { return (args._Ksize>4); }, - nullptr, + [](const GemmArgs &args, const Requantize32 &) { return args._ci->has_sve() && (args._Ksize>4); }, + [](const GemmArgs &args, const Requantize32 &) { return args._ci->get_cpu_model() != CPUModel::KLEIN; }, [](const GemmArgs &args, const Requantize32 &qp) { return new GemmInterleavedQuantized<cls_sve_interleaved_s8s32_dot_8x3VL, int8_t, int8_t>(args, qp); } }, #endif // SVE diff --git a/src/core/NEON/kernels/arm_gemm/gemm_quint8.cpp b/src/core/NEON/kernels/arm_gemm/gemm_quint8.cpp index 9720c7d06e..be27b3a117 
100644 --- a/src/core/NEON/kernels/arm_gemm/gemm_quint8.cpp +++ b/src/core/NEON/kernels/arm_gemm/gemm_quint8.cpp @@ -55,39 +55,39 @@ static const GemmImplementation<uint8_t, uint8_t, Requantize32> gemm_quint8_meth { GemmMethod::GEMM_INTERLEAVED, "sve_interleaved_u8u32_mmla_8x3VL", - [](const GemmArgs &args, const Requantize32 &) { return (args._Ksize>8); }, - nullptr, + [](const GemmArgs &args, const Requantize32 &) { return args._ci->has_sve() && (args._Ksize>8); }, + [](const GemmArgs &args, const Requantize32 &) { return args._ci->get_cpu_model() != CPUModel::KLEIN; }, [](const GemmArgs &args, const Requantize32 &qp) { return new GemmInterleavedQuantized<cls_sve_interleaved_u8u32_mmla_8x3VL, uint8_t, uint8_t>(args, qp); } }, #endif { GemmMethod::GEMM_HYBRID_QUANTIZED, "sve_smallK_hybrid_u8u32_dot_8x1VL", - [](const GemmArgs &args, const Requantize32 &) { return args._Ksize<=64 && !args._indirect_input; }, - nullptr, + [](const GemmArgs &args, const Requantize32 &) { return args._ci->has_sve() && args._Ksize<=64 && !args._indirect_input; }, + [](const GemmArgs &args, const Requantize32 &) { return args._ci->get_cpu_model() != CPUModel::KLEIN; }, [](const GemmArgs &args, const Requantize32 &qp) { return new GemmHybridQuantized<cls_sve_smallK_hybrid_u8u32_dot_8x1VL, uint8_t, uint8_t>(args, qp); } }, #ifdef SVE2 // Requantizing kernels include some SVE2 only instructions (SQRDMULH, SRSHL) { GemmMethod::GEMM_HYBRID, "sve_hybrid_u8qa_dot_4x4VL", - [](const GemmArgs &, const Requantize32 &qp) { return quant_hybrid_asymmetric(qp); }, - nullptr, + [](const GemmArgs &args, const Requantize32 &qp) { return args._ci->has_sve() && quant_hybrid_asymmetric(qp); }, + [](const GemmArgs &args, const Requantize32 &) { return args._ci->get_cpu_model() != CPUModel::KLEIN; }, [](const GemmArgs &args, const Requantize32 &qp) { return new GemmHybridIndirect<cls_sve_hybrid_u8qa_dot_4x4VL, uint8_t, uint8_t, Requantize32>(args, qp); } }, #endif { GemmMethod::GEMM_HYBRID, 
"sve_hybrid_u8u32_dot_6x4VL", - nullptr, - nullptr, + [](const GemmArgs &args, const Requantize32 &) { return args._ci->has_sve(); }, + [](const GemmArgs &args, const Requantize32 &) { return args._ci->get_cpu_model() != CPUModel::KLEIN; }, [](const GemmArgs &args, const Requantize32 &qp) { return new GemmHybridIndirect<cls_sve_hybrid_u8u32_dot_6x4VL, uint8_t, uint8_t, Requantize32, true>(args, qp); } }, { GemmMethod::GEMM_INTERLEAVED, "sve_interleaved_u8u32_dot_8x3VL", - [](const GemmArgs &args, const Requantize32 &) { return (args._Ksize>4); }, - nullptr, + [](const GemmArgs &args, const Requantize32 &) { return args._ci->has_sve() && (args._Ksize>4); }, + [](const GemmArgs &args, const Requantize32 &) { return args._ci->get_cpu_model() != CPUModel::KLEIN; }, [](const GemmArgs &args, const Requantize32 &qp) { return new GemmInterleavedQuantized<cls_sve_interleaved_u8u32_dot_8x3VL, uint8_t, uint8_t>(args, qp); } }, #endif @@ -96,7 +96,7 @@ static const GemmImplementation<uint8_t, uint8_t, Requantize32> gemm_quint8_meth GemmMethod::GEMM_INTERLEAVED, "a64_interleaved_u8u32_mmla_8x12", [](const GemmArgs &args, const Requantize32 &) { return (args._Ksize>8); }, - nullptr, + [](const GemmArgs &args, const Requantize32 &) { return args._ci->get_cpu_model() != CPUModel::KLEIN; }, [](const GemmArgs &args, const Requantize32 &qp) { return new GemmInterleavedQuantized<cls_a64_interleaved_u8u32_mmla_8x12, uint8_t, uint8_t>(args, qp); } }, #endif diff --git a/src/core/NEON/kernels/arm_gemm/gemm_uint8.cpp b/src/core/NEON/kernels/arm_gemm/gemm_uint8.cpp index d2ebe00f3b..fb41a9fc09 100644 --- a/src/core/NEON/kernels/arm_gemm/gemm_uint8.cpp +++ b/src/core/NEON/kernels/arm_gemm/gemm_uint8.cpp @@ -51,30 +51,30 @@ static const GemmImplementation<uint8_t, uint32_t> gemm_u8_methods[] = { { GemmMethod::GEMM_INTERLEAVED, "sve_interleaved_u8u32_mmla_8x3VL", - [](const GemmArgs &args) { return (args._Ksize>8); }, - nullptr, + [](const GemmArgs &args) { return args._ci->has_sve() &&
(args._Ksize>8); }, + [](const GemmArgs &args) { return args._ci->get_cpu_model() != CPUModel::KLEIN; }, [](const GemmArgs &args) { return new GemmInterleaved<cls_sve_interleaved_u8u32_mmla_8x3VL, uint8_t, uint32_t>(args); } }, #endif { GemmMethod::GEMM_HYBRID, "smallK_hybrid_u8u32_dot_8x1VL", - [](const GemmArgs &args) { return args._Ksize<=64 && !args._indirect_input; }, - nullptr, + [](const GemmArgs &args) { return args._ci->has_sve() && args._Ksize<=64 && !args._indirect_input; }, + [](const GemmArgs &args) { return args._ci->get_cpu_model() != CPUModel::KLEIN; }, [](const GemmArgs &args) { return new GemmHybrid<cls_sve_smallK_hybrid_u8u32_dot_8x1VL, uint8_t, uint32_t>(args); } }, { GemmMethod::GEMM_HYBRID, "sve_hybrid_u8u32_dot_6x4VL", - nullptr, - [](const GemmArgs &args) { return ((args._Ksize <= 128) && (args._Nsize <= 128)) || ((args._nmulti > 1) && ((args._Msize / args._maxthreads) < 8)); }, + [](const GemmArgs &args) { return args._ci->has_sve(); }, + [](const GemmArgs &args) { return args._ci->get_cpu_model() != CPUModel::KLEIN && (((args._Ksize <= 128) && (args._Nsize <= 128)) || ((args._nmulti > 1) && ((args._Msize / args._maxthreads) < 8))); }, [](const GemmArgs &args) { return new GemmHybridIndirect<cls_sve_hybrid_u8u32_dot_6x4VL, uint8_t, uint32_t>(args); } }, { GemmMethod::GEMM_INTERLEAVED, "sve_interleaved_u8u32_dot_8x3VL", - [](const GemmArgs &args) { return (args._Ksize>4); }, - nullptr, + [](const GemmArgs &args) { return args._ci->has_sve() && (args._Ksize>4); }, + [](const GemmArgs &args) { return args._ci->get_cpu_model() != CPUModel::KLEIN; }, [](const GemmArgs &args) { return new GemmInterleaved<cls_sve_interleaved_u8u32_dot_8x3VL, uint8_t, uint32_t>(args); } }, #endif diff --git a/src/runtime/CPUUtils.cpp b/src/runtime/CPUUtils.cpp index 63c9a8639c..82b42336e6 100644 --- a/src/runtime/CPUUtils.cpp +++ b/src/runtime/CPUUtils.cpp @@ -62,12 +62,27 @@ #define HWCAP_ASIMDDP (1 << 20) // NOLINT #endif /* HWCAP_ASIMDDP */ +#ifndef HWCAP_SVE 
+#define HWCAP_SVE (1 << 22) // NOLINT +#endif /* HWCAP_SVE */ + namespace { using namespace arm_compute; #if !defined(BARE_METAL) && !defined(__APPLE__) && (defined(__arm__) || defined(__aarch64__)) +bool model_supports_sve(CPUModel model) +{ + switch(model) + { + case CPUModel::KLEIN: + return true; + default: + return false; + } +} + bool model_supports_dot(CPUModel model) { switch(model) @@ -75,6 +90,7 @@ bool model_supports_dot(CPUModel model) case CPUModel::GENERIC_FP16_DOT: case CPUModel::A55r1: case CPUModel::X1: + case CPUModel::KLEIN: return true; default: return false; @@ -89,6 +105,7 @@ bool model_supports_fp16(CPUModel model) case CPUModel::GENERIC_FP16_DOT: case CPUModel::A55r1: case CPUModel::X1: + case CPUModel::KLEIN: return true; default: return false; @@ -146,6 +163,9 @@ CPUModel midr_to_model(const unsigned int midr) case 0xd0d: model = CPUModel::GENERIC_FP16_DOT; break; + case 0xd46: + model = CPUModel::KLEIN; + break; default: model = CPUModel::GENERIC; break; @@ -369,11 +389,11 @@ namespace cpu void get_cpu_configuration(CPUInfo &cpuinfo) { #if !defined(BARE_METAL) && !defined(__APPLE__) && (defined(__arm__) || defined(__aarch64__)) - bool cpuid = false; - bool hwcaps_fp16_support = false; - bool hwcaps_dot_support = false; - - const uint32_t hwcaps = getauxval(AT_HWCAP); + bool cpuid = false; + bool hwcaps_fp16_support = false; + bool hwcaps_dot_support = false; + bool hwcaps_sve = false; + const uint32_t hwcaps = getauxval(AT_HWCAP); if((hwcaps & HWCAP_CPUID) != 0) { @@ -390,6 +410,11 @@ void get_cpu_configuration(CPUInfo &cpuinfo) { hwcaps_dot_support = true; } + + if((hwcaps & HWCAP_SVE) != 0) + { + hwcaps_sve = true; + } #endif /* defined(__aarch64__) */ const unsigned int max_cpus = get_max_cpus(); @@ -408,17 +433,43 @@ void get_cpu_configuration(CPUInfo &cpuinfo) // We assume that the system does not have mixed architectures bool one_supports_dot = false; bool one_supports_fp16 = false; + bool one_supports_sve = false; for(const auto 
&v : percpu) { one_supports_dot = one_supports_dot || model_supports_dot(v); one_supports_fp16 = one_supports_fp16 || model_supports_fp16(v); + one_supports_sve = one_supports_sve || model_supports_sve(v); cpuinfo.set_cpu_model(j++, v); } cpuinfo.set_dotprod(one_supports_dot || hwcaps_dot_support); cpuinfo.set_fp16(one_supports_fp16 || hwcaps_fp16_support); -#else /* !defined(BARE_METAL) && !defined(__APPLE__) && (defined(__arm__) || defined(__aarch64__)) */ + cpuinfo.set_sve(one_supports_sve || hwcaps_sve); +#elif(BARE_METAL) && defined(__aarch64__) /* !defined(BARE_METAL) && !defined(__APPLE__) && (defined(__arm__) || defined(__aarch64__)) */ + cpuinfo.set_cpu_num(1); + const CPUModel cpumodel{ CPUModel::GENERIC }; + cpuinfo.set_cpu_model(0, cpumodel); + // Assume single CPU in bare metal mode. Just read the ID register and feature bits directly. + uint64_t fr0, pfr0, midr; + __asm __volatile( + "MRS %0, ID_AA64ISAR0_EL1\n" + "MRS %1, ID_AA64PFR0_EL1\n" + "MRS %2, midr_el1" + : "=r"(fr0), "=r"(pfr0), "=r"(midr)); + if((fr0 >> 44) & 0xf) + { + cpuinfo.set_dotprod(true); + } + if((pfr0 >> 16) & 0xf) + { + cpuinfo.set_fp16(true); + } + if((pfr0 >> 32) & 0xf) + { + cpuinfo.set_sve(true); + } +#else /* #elif(BARE_METAL) && defined(__aarch64__) */ ARM_COMPUTE_UNUSED(cpuinfo); -#endif /* !defined(BARE_METAL) && !defined(__APPLE__) && (defined(__arm__) || defined(__aarch64__)) */ +#endif /* !defined(BARE_METAL) && !defined(__APPLE__) && (defined(__arm__) || defined(__aarch64__)) */ } unsigned int get_threads_hint() |