From 4ee8b1599dbaf7634d25607fa5ac96ba3dc6b0f2 Mon Sep 17 00:00:00 2001 From: Georgios Pinitas Date: Fri, 16 Jul 2021 16:16:43 +0100 Subject: Update GEMM assembly kernels - Introduce Fp32 kernels with internal calculations in Bfloat16 when fast_mode is enabled - Improve kernel selection heuristics Signed-off-by: Georgios Pinitas Change-Id: I68a9e7e862b6fd2721b46e0d7cc791091c4ab279 Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/5965 Tested-by: Arm Jenkins Comments-Addressed: Arm Jenkins --- src/runtime/cpu/operators/CpuGemm.cpp | 1 + src/runtime/cpu/operators/CpuGemmConvolution.cpp | 9 +++++---- src/runtime/cpu/operators/CpuGemmDirectConv2d.cpp | 1 + src/runtime/cpu/operators/CpuGemmLowpMatrixMultiplyCore.cpp | 1 + src/runtime/cpu/operators/internal/CpuGemmAssemblyDispatch.cpp | 10 +++++----- src/runtime/cpu/operators/internal/CpuGemmAssemblyDispatch.h | 1 + 6 files changed, 14 insertions(+), 9 deletions(-) (limited to 'src/runtime/cpu/operators') diff --git a/src/runtime/cpu/operators/CpuGemm.cpp b/src/runtime/cpu/operators/CpuGemm.cpp index 9a4d171ce6..c6abe1f893 100644 --- a/src/runtime/cpu/operators/CpuGemm.cpp +++ b/src/runtime/cpu/operators/CpuGemm.cpp @@ -48,6 +48,7 @@ cpu::AsmGemmInfo init_assembly_metadata(const GEMMInfo &info) asm_info.reinterpret_input_as_3d = info.reinterpret_input_as_3d(); asm_info.depth_output_gemm3d = info.depth_output_gemm3d(); asm_info.activation_info = info.activation_info(); + asm_info.fast_mode = info.fast_math(); return asm_info; } diff --git a/src/runtime/cpu/operators/CpuGemmConvolution.cpp b/src/runtime/cpu/operators/CpuGemmConvolution.cpp index a0424b1c63..fcdf8aa8f6 100644 --- a/src/runtime/cpu/operators/CpuGemmConvolution.cpp +++ b/src/runtime/cpu/operators/CpuGemmConvolution.cpp @@ -66,7 +66,7 @@ void CpuGemmConvolution::configure_mm(const ITensorInfo *src, const ITensorInfo // Create GEMMInfo structure const GEMMInfo &gemm_info = GEMMInfo(false, false, true /* Reshape weights only for the first run */, gemm_3d_depth, _skip_im2col /* Reinterpret the input as 3D if im2col is skipped */, - false, GEMMLowpOutputStageInfo(), false, false, act_info); + false, GEMMLowpOutputStageInfo(), false, false, false, act_info); // Supported activations in GEMM const std::set supported_acts = { ActivationLayerInfo::ActivationFunction::RELU, @@ -115,7 +115,7 @@ void CpuGemmConvolution::configure_mm(const ITensorInfo *src, const ITensorInfo quantization::calculate_quantized_multipliers(iqinfo, wqinfo, oqinfo, output_info); _mm_gemmlowp = std::make_unique(); - _mm_gemmlowp->configure(&tmp_src, &tmp_weights, biases, dst, GEMMInfo(false, false, true, gemm_3d_depth, _skip_im2col, false, output_info, false, false, act_info)); + _mm_gemmlowp->configure(&tmp_src, &tmp_weights, biases, dst, GEMMInfo(false, false, true, gemm_3d_depth, _skip_im2col, false, output_info, false, false, false, act_info)); auto mm_mem_req = _mm_gemmlowp->workspace(); for(unsigned int cont = 0; cont < mm_mem_req.size(); ++cont) @@ -146,7 +146,7 @@ Status CpuGemmConvolution::validate_mm(const ITensorInfo *src, const ITensorInfo // Create GEMMInfo structure const GEMMInfo gemm_info = GEMMInfo(false, false, true /* Reshape weights only for the first run */, gemm_3d_depth, skip_im2col /* Reinterpret the input as 3D if im2col is skipped */, - false, GEMMLowpOutputStageInfo(), false, false, act_info); + false, GEMMLowpOutputStageInfo(), false, false, false, act_info); if(is_quantized) { @@ -186,7 +186,8 @@ Status CpuGemmConvolution::validate_mm(const ITensorInfo *src, const ITensorInfo std::unique_ptr weights_qa = weights->clone(); input_qa->set_quantization_info(QuantizationInfo(iqinfo.uniform().scale, -iqinfo.uniform().offset)); weights_qa->set_quantization_info(QuantizationInfo(wqinfo.uniform().scale, -wqinfo.uniform().offset)); - return CpuGemmLowpMatrixMultiplyCore::validate(input_qa.get(), weights_qa.get(), biases, dst, GEMMInfo(false, false, true, gemm_3d_depth, skip_im2col, false, output_info, false, false, act_info)); + return CpuGemmLowpMatrixMultiplyCore::validate(input_qa.get(), weights_qa.get(), biases, dst, GEMMInfo(false, false, true, gemm_3d_depth, skip_im2col, false, output_info, false, false, false, + act_info)); } else { diff --git a/src/runtime/cpu/operators/CpuGemmDirectConv2d.cpp b/src/runtime/cpu/operators/CpuGemmDirectConv2d.cpp index c2e9f24ff6..10eece99eb 100644 --- a/src/runtime/cpu/operators/CpuGemmDirectConv2d.cpp +++ b/src/runtime/cpu/operators/CpuGemmDirectConv2d.cpp @@ -86,6 +86,7 @@ cpu::AsmGemmInfo init_assembly_metadata(const Conv2dInfo &info, bool is_indirect asm_info.padding_left = info.conv_info.pad_left(); asm_info.padding_value = 0.f; asm_info.negated_offsets = false; + asm_info.fast_mode = info.enable_fast_math; return asm_info; } } // namespace diff --git a/src/runtime/cpu/operators/CpuGemmLowpMatrixMultiplyCore.cpp b/src/runtime/cpu/operators/CpuGemmLowpMatrixMultiplyCore.cpp index 651ce436a0..56eb4fbb87 100644 --- a/src/runtime/cpu/operators/CpuGemmLowpMatrixMultiplyCore.cpp +++ b/src/runtime/cpu/operators/CpuGemmLowpMatrixMultiplyCore.cpp @@ -63,6 +63,7 @@ cpu::AsmGemmInfo init_assembly_metadata(const GEMMInfo &info) asm_info.depth_output_gemm3d = info.depth_output_gemm3d(); asm_info.activation_info = info.activation_info(); asm_info.output_stage = info.gemmlowp_output_stage(); + asm_info.fast_mode = info.fast_math(); return asm_info; } diff --git a/src/runtime/cpu/operators/internal/CpuGemmAssemblyDispatch.cpp b/src/runtime/cpu/operators/internal/CpuGemmAssemblyDispatch.cpp index 79ea1cb5a7..bbbd5ac458 100644 --- a/src/runtime/cpu/operators/internal/CpuGemmAssemblyDispatch.cpp +++ b/src/runtime/cpu/operators/internal/CpuGemmAssemblyDispatch.cpp @@ -542,7 +542,7 @@ void create_arm_gemm(std::unique_ptr &arm_ge const CPUInfo &ci = NEScheduler::get().cpu_info(); unsigned int num_threads = NEScheduler::get().num_threads(); - arm_gemm::GemmArgs args(&ci, p.M, p.N, p.K, p.sections, p.batches, p.multis, p.indirect, activation, num_threads); + arm_gemm::GemmArgs args(&ci, p.M, p.N, p.K, p.sections, p.batches, p.multis, p.indirect, activation, num_threads, info.fast_mode); // Create arm_gemm fallback auto fallback = std::make_unique>(); @@ -556,11 +556,11 @@ void create_arm_gemm_quant(std::unique_ptr & arm_gemm::Activation activation, const AsmGemmInfo &info) { ARM_COMPUTE_UNUSED(activation); - Params p = extract_parameters(a, b, d, info); - const CPUInfo &ci = NEScheduler::get().cpu_info(); - unsigned int num_threads = NEScheduler::get().num_threads(); + Params p = extract_parameters(a, b, d, info); + const CPUInfo &ci = NEScheduler::get().cpu_info(); + const unsigned int num_threads = NEScheduler::get().num_threads(); - arm_gemm::GemmArgs args(&ci, p.M, p.N, p.K, p.sections, p.batches, p.multis, p.indirect, activation, num_threads); + arm_gemm::GemmArgs args(&ci, p.M, p.N, p.K, p.sections, p.batches, p.multis, p.indirect, activation, num_threads, info.fast_mode); // Create arm_gemm fallback auto fallback = std::make_unique>(); diff --git a/src/runtime/cpu/operators/internal/CpuGemmAssemblyDispatch.h b/src/runtime/cpu/operators/internal/CpuGemmAssemblyDispatch.h index 355273adeb..88cfed002a 100644 --- a/src/runtime/cpu/operators/internal/CpuGemmAssemblyDispatch.h +++ b/src/runtime/cpu/operators/internal/CpuGemmAssemblyDispatch.h @@ -51,6 +51,7 @@ struct AsmGemmInfo int64_t padding_top{ 0 }; int64_t padding_left{ 0 }; float padding_value{ 0.f }; + bool fast_mode{ false }; }; /** Assembly kernel glue */ -- cgit v1.2.1