diff options
Diffstat (limited to 'src/cpu/operators/internal')
-rw-r--r-- | src/cpu/operators/internal/CpuGemmAssemblyDispatch.cpp | 70 | ||||
-rw-r--r-- | src/cpu/operators/internal/CpuGemmAssemblyDispatch.h | 3 |
2 files changed, 64 insertions, 9 deletions
diff --git a/src/cpu/operators/internal/CpuGemmAssemblyDispatch.cpp b/src/cpu/operators/internal/CpuGemmAssemblyDispatch.cpp index efe2a7a67e..a4c856bb8f 100644 --- a/src/cpu/operators/internal/CpuGemmAssemblyDispatch.cpp +++ b/src/cpu/operators/internal/CpuGemmAssemblyDispatch.cpp @@ -540,6 +540,13 @@ void Fallback<TypeInput, TypeOutput, OutputStage>::configure(const ITensorInfo * { configure_indirect(a, b, d, gemm_info); } + + if (std::is_same<OutputStage, arm_gemm::DequantizeFloat>::value) + { + // Output dequantization is just the two src scales multiplied together + _gemm_kernel_asm->set_dequantize_scale(a->quantization_info().uniform().scale * + b->quantization_info().uniform().scale); + } } template <typename TypeInput, typename TypeOutput, class OutputStage> @@ -630,6 +637,15 @@ void Fallback<TypeInput, TypeOutput, OutputStage>::run(ITensorPack &tensors) auto d = tensors.get_tensor(TensorType::ACL_DST); ARM_COMPUTE_ERROR_ON_NULLPTR(a, d); + // Only update at runtime if the src quantization is dynamic + if (std::is_same<OutputStage, arm_gemm::DequantizeFloat>::value && + (a->info()->quantization_info().is_dynamic() || b->info()->quantization_info().is_dynamic())) + { + // Output dequantization is just the two src scales multiplied together + _gemm_kernel_asm->set_dequantize_scale(a->info()->quantization_info().uniform().scale * + b->info()->quantization_info().uniform().scale); + } + int lda = a->info()->strides_in_bytes().y() / a->info()->element_size(); int ldb = 0; const int ldd = d->info()->strides_in_bytes().y() / d->info()->element_size(); @@ -775,7 +791,7 @@ void create_arm_gemm(std::unique_ptr<CpuGemmAssemblyDispatch::IFallback> &arm_ge arm_gemm::GemmConfig cfg; cfg.weight_format = assembly_utils::map_to_arm_gemm_weight_format(info.weight_format); arm_gemm::GemmArgs args(&ci, p.M, p.N, p.K, p.sections, p.batches, p.multis, p.indirect, activation, num_threads, - info.fixed_format, info.fast_mode, &cfg); + info.fixed_format, info.fast_mode, info.accumulate, &cfg); // Create arm_gemm fallback auto fallback = std::make_unique<Fallback<TypeInput, TypeOutput>>(); @@ -784,6 +800,39 @@ void create_arm_gemm(std::unique_ptr<CpuGemmAssemblyDispatch::IFallback> &arm_ge } template <typename TypeInput, typename TypeOutput> +void create_arm_gemm_dequant(std::unique_ptr<CpuGemmAssemblyDispatch::IFallback> &arm_gemm, + const ITensorInfo *a, + const ITensorInfo *b, + const ITensorInfo *c, + ITensorInfo *d, + arm_gemm::Activation activation, + const AsmGemmInfo &info) +{ + ARM_COMPUTE_UNUSED(activation); + + Params p = extract_parameters(a, b, d, info); + const CPUInfo &ci = NEScheduler::get().cpu_info(); + const unsigned int num_threads = NEScheduler::get().num_threads(); + + arm_gemm::GemmConfig cfg; + cfg.weight_format = assembly_utils::map_to_arm_gemm_weight_format(info.weight_format); + arm_gemm::GemmArgs args(&ci, p.M, p.N, p.K, p.sections, p.batches, p.multis, p.indirect, activation, num_threads, + info.fixed_format, info.fast_mode, info.accumulate, &cfg); + + // Create arm_gemm fallback + auto fallback = std::make_unique<Fallback<TypeInput, TypeOutput, arm_gemm::DequantizeFloat>>(); + + // Configure requantization info + const GEMMLowpOutputStageInfo os_info = info.output_stage; + + arm_gemm::DequantizeFloat gemm_dequant_info{}; + gemm_dequant_info = arm_gemm::DequantizeFloat(d->quantization_info().uniform().scale); + + fallback->configure(a, b, c, d, args, info, gemm_dequant_info); + arm_gemm = std::move(fallback); +} + +template <typename TypeInput, typename TypeOutput> void create_arm_gemm_quant(std::unique_ptr<CpuGemmAssemblyDispatch::IFallback> &arm_gemm, const ITensorInfo *a, const ITensorInfo *b, @@ -800,7 +849,7 @@ void create_arm_gemm_quant(std::unique_ptr<CpuGemmAssemblyDispatch::IFallback> & arm_gemm::GemmConfig cfg; cfg.weight_format = assembly_utils::map_to_arm_gemm_weight_format(info.weight_format); arm_gemm::GemmArgs args(&ci, p.M, p.N, p.K, p.sections, p.batches, p.multis, p.indirect, activation, num_threads, - info.fixed_format, info.fast_mode, &cfg); + info.fixed_format, info.fast_mode, info.accumulate, &cfg); // Create arm_gemm fallback auto fallback = std::make_unique<Fallback<TypeInput, TypeOutput, arm_gemm::Requantize32>>(); @@ -855,8 +904,7 @@ Status CpuGemmAssemblyDispatch::has_opt_impl(arm_compute::WeightFormat &expected cfg.weight_format = assembly_utils::map_to_arm_gemm_weight_format(info.weight_format); arm_gemm::WeightFormat arm_gemm_expected_wf = assembly_utils::map_to_arm_gemm_weight_format(expected_weight_format); arm_gemm::GemmArgs args(&ci, p.M, p.N, p.K, p.sections, p.batches, p.multis, p.indirect, act, num_threads, - info.fixed_format, info.fast_mode, &cfg); - + info.fixed_format, info.fast_mode, info.accumulate, &cfg); // TODO: Incorporate info.transpose_b COMPMID-6595 switch (a->data_type()) { @@ -897,6 +945,7 @@ Status CpuGemmAssemblyDispatch::has_opt_impl(arm_compute::WeightFormat &expected } break; #endif /* __aarch64__ */ + #if defined(ARM_COMPUTE_ENABLE_BF16) case DataType::BFLOAT16: { @@ -915,13 +964,14 @@ Status CpuGemmAssemblyDispatch::has_opt_impl(arm_compute::WeightFormat &expected break; } #endif /* defined(ARM_COMPUTE_ENABLE_BF16) */ -#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC + +#if defined(ENABLE_FP16_KERNELS) case DataType::F16: ARM_COMPUTE_RETURN_ERROR_ON_MSG( !(arm_gemm::has_opt_gemm<float16_t, float16_t, arm_gemm::Nothing>(arm_gemm_expected_wf, args, {})), "We could not find an optimized kernel for F16 input and F16 output"); break; -#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */ +#endif /* ENABLE_FP16_KERNELS */ default: ARM_COMPUTE_RETURN_ERROR_ON_MSG(true, "Usupported type. Could not find a kernel"); break; @@ -1032,6 +1082,10 @@ void CpuGemmAssemblyDispatch::configure( { create_arm_gemm<int8_t, int32_t>(_arm_gemm, a, b, c, d, act, info); } + else if (d->data_type() == DataType::F32) + { + create_arm_gemm_dequant<int8_t, float>(_arm_gemm, a, b, c, d, act, info); + } else { create_arm_gemm_quant<int8_t, int8_t>(_arm_gemm, a, b, c, d, act, info); @@ -1050,11 +1104,11 @@ void CpuGemmAssemblyDispatch::configure( } break; #endif /* defined(ARM_COMPUTE_ENABLE_BF16) */ -#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC +#ifdef ENABLE_FP16_KERNELS case DataType::F16: create_arm_gemm<float16_t, float16_t>(_arm_gemm, a, b, c, d, act, info); break; -#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */ +#endif /* ENABLE_FP16_KERNELS */ default: break; } diff --git a/src/cpu/operators/internal/CpuGemmAssemblyDispatch.h b/src/cpu/operators/internal/CpuGemmAssemblyDispatch.h index 671a222fed..44c5c189a5 100644 --- a/src/cpu/operators/internal/CpuGemmAssemblyDispatch.h +++ b/src/cpu/operators/internal/CpuGemmAssemblyDispatch.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2023 Arm Limited. + * Copyright (c) 2018-2024 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -57,6 +57,7 @@ struct AsmGemmInfo bool fixed_format{false}; arm_compute::WeightFormat weight_format{arm_compute::WeightFormat::UNSPECIFIED}; bool reshape_b_only_on_first_run{true}; + bool accumulate{false}; /** Whether we want to perform an additional transpose of b before passing it to gemm or pretranspose_B_array * @note This transpose b operation is also considered a form of "reshape" or "transform", so should be counted for * by the reshape_b_only_on_first_run flag |