aboutsummaryrefslogtreecommitdiff
path: root/src/cpu/operators/internal
diff options
context:
space:
mode:
Diffstat (limited to 'src/cpu/operators/internal')
-rw-r--r--src/cpu/operators/internal/CpuGemmAssemblyDispatch.cpp70
-rw-r--r--src/cpu/operators/internal/CpuGemmAssemblyDispatch.h3
2 files changed, 64 insertions, 9 deletions
diff --git a/src/cpu/operators/internal/CpuGemmAssemblyDispatch.cpp b/src/cpu/operators/internal/CpuGemmAssemblyDispatch.cpp
index efe2a7a67e..a4c856bb8f 100644
--- a/src/cpu/operators/internal/CpuGemmAssemblyDispatch.cpp
+++ b/src/cpu/operators/internal/CpuGemmAssemblyDispatch.cpp
@@ -540,6 +540,13 @@ void Fallback<TypeInput, TypeOutput, OutputStage>::configure(const ITensorInfo *
{
configure_indirect(a, b, d, gemm_info);
}
+
+ if (std::is_same<OutputStage, arm_gemm::DequantizeFloat>::value)
+ {
+ // Output dequantization is just the two src scales multiplied together
+ _gemm_kernel_asm->set_dequantize_scale(a->quantization_info().uniform().scale *
+ b->quantization_info().uniform().scale);
+ }
}
template <typename TypeInput, typename TypeOutput, class OutputStage>
@@ -630,6 +637,15 @@ void Fallback<TypeInput, TypeOutput, OutputStage>::run(ITensorPack &tensors)
auto d = tensors.get_tensor(TensorType::ACL_DST);
ARM_COMPUTE_ERROR_ON_NULLPTR(a, d);
+ // Only update at runtime if the src quantization is dynamic
+ if (std::is_same<OutputStage, arm_gemm::DequantizeFloat>::value &&
+ (a->info()->quantization_info().is_dynamic() || b->info()->quantization_info().is_dynamic()))
+ {
+ // Output dequantization is just the two src scales multiplied together
+ _gemm_kernel_asm->set_dequantize_scale(a->info()->quantization_info().uniform().scale *
+ b->info()->quantization_info().uniform().scale);
+ }
+
int lda = a->info()->strides_in_bytes().y() / a->info()->element_size();
int ldb = 0;
const int ldd = d->info()->strides_in_bytes().y() / d->info()->element_size();
@@ -775,7 +791,7 @@ void create_arm_gemm(std::unique_ptr<CpuGemmAssemblyDispatch::IFallback> &arm_ge
arm_gemm::GemmConfig cfg;
cfg.weight_format = assembly_utils::map_to_arm_gemm_weight_format(info.weight_format);
arm_gemm::GemmArgs args(&ci, p.M, p.N, p.K, p.sections, p.batches, p.multis, p.indirect, activation, num_threads,
- info.fixed_format, info.fast_mode, &cfg);
+ info.fixed_format, info.fast_mode, info.accumulate, &cfg);
// Create arm_gemm fallback
auto fallback = std::make_unique<Fallback<TypeInput, TypeOutput>>();
@@ -784,6 +800,39 @@ void create_arm_gemm(std::unique_ptr<CpuGemmAssemblyDispatch::IFallback> &arm_ge
}
template <typename TypeInput, typename TypeOutput>
+void create_arm_gemm_dequant(std::unique_ptr<CpuGemmAssemblyDispatch::IFallback> &arm_gemm,
+ const ITensorInfo *a,
+ const ITensorInfo *b,
+ const ITensorInfo *c,
+ ITensorInfo *d,
+ arm_gemm::Activation activation,
+ const AsmGemmInfo &info)
+{
+ ARM_COMPUTE_UNUSED(activation);
+
+ Params p = extract_parameters(a, b, d, info);
+ const CPUInfo &ci = NEScheduler::get().cpu_info();
+ const unsigned int num_threads = NEScheduler::get().num_threads();
+
+ arm_gemm::GemmConfig cfg;
+ cfg.weight_format = assembly_utils::map_to_arm_gemm_weight_format(info.weight_format);
+ arm_gemm::GemmArgs args(&ci, p.M, p.N, p.K, p.sections, p.batches, p.multis, p.indirect, activation, num_threads,
+ info.fixed_format, info.fast_mode, info.accumulate, &cfg);
+
+ // Create arm_gemm fallback
+ auto fallback = std::make_unique<Fallback<TypeInput, TypeOutput, arm_gemm::DequantizeFloat>>();
+
+ // Configure requantization info
+ const GEMMLowpOutputStageInfo os_info = info.output_stage;
+
+ arm_gemm::DequantizeFloat gemm_dequant_info{};
+ gemm_dequant_info = arm_gemm::DequantizeFloat(d->quantization_info().uniform().scale);
+
+ fallback->configure(a, b, c, d, args, info, gemm_dequant_info);
+ arm_gemm = std::move(fallback);
+}
+
+template <typename TypeInput, typename TypeOutput>
void create_arm_gemm_quant(std::unique_ptr<CpuGemmAssemblyDispatch::IFallback> &arm_gemm,
const ITensorInfo *a,
const ITensorInfo *b,
@@ -800,7 +849,7 @@ void create_arm_gemm_quant(std::unique_ptr<CpuGemmAssemblyDispatch::IFallback> &
arm_gemm::GemmConfig cfg;
cfg.weight_format = assembly_utils::map_to_arm_gemm_weight_format(info.weight_format);
arm_gemm::GemmArgs args(&ci, p.M, p.N, p.K, p.sections, p.batches, p.multis, p.indirect, activation, num_threads,
- info.fixed_format, info.fast_mode, &cfg);
+ info.fixed_format, info.fast_mode, info.accumulate, &cfg);
// Create arm_gemm fallback
auto fallback = std::make_unique<Fallback<TypeInput, TypeOutput, arm_gemm::Requantize32>>();
@@ -855,8 +904,7 @@ Status CpuGemmAssemblyDispatch::has_opt_impl(arm_compute::WeightFormat &expected
cfg.weight_format = assembly_utils::map_to_arm_gemm_weight_format(info.weight_format);
arm_gemm::WeightFormat arm_gemm_expected_wf = assembly_utils::map_to_arm_gemm_weight_format(expected_weight_format);
arm_gemm::GemmArgs args(&ci, p.M, p.N, p.K, p.sections, p.batches, p.multis, p.indirect, act, num_threads,
- info.fixed_format, info.fast_mode, &cfg);
-
+ info.fixed_format, info.fast_mode, info.accumulate, &cfg);
// TODO: Incorporate info.transpose_b COMPMID-6595
switch (a->data_type())
{
@@ -897,6 +945,7 @@ Status CpuGemmAssemblyDispatch::has_opt_impl(arm_compute::WeightFormat &expected
}
break;
#endif /* __aarch64__ */
+
#if defined(ARM_COMPUTE_ENABLE_BF16)
case DataType::BFLOAT16:
{
@@ -915,13 +964,14 @@ Status CpuGemmAssemblyDispatch::has_opt_impl(arm_compute::WeightFormat &expected
break;
}
#endif /* defined(ARM_COMPUTE_ENABLE_BF16) */
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+
+#if defined(ENABLE_FP16_KERNELS)
case DataType::F16:
ARM_COMPUTE_RETURN_ERROR_ON_MSG(
!(arm_gemm::has_opt_gemm<float16_t, float16_t, arm_gemm::Nothing>(arm_gemm_expected_wf, args, {})),
"We could not find an optimized kernel for F16 input and F16 output");
break;
-#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
+#endif /* ENABLE_FP16_KERNELS */
default:
ARM_COMPUTE_RETURN_ERROR_ON_MSG(true, "Usupported type. Could not find a kernel");
break;
@@ -1032,6 +1082,10 @@ void CpuGemmAssemblyDispatch::configure(
{
create_arm_gemm<int8_t, int32_t>(_arm_gemm, a, b, c, d, act, info);
}
+ else if (d->data_type() == DataType::F32)
+ {
+ create_arm_gemm_dequant<int8_t, float>(_arm_gemm, a, b, c, d, act, info);
+ }
else
{
create_arm_gemm_quant<int8_t, int8_t>(_arm_gemm, a, b, c, d, act, info);
@@ -1050,11 +1104,11 @@ void CpuGemmAssemblyDispatch::configure(
}
break;
#endif /* defined(ARM_COMPUTE_ENABLE_BF16) */
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+#ifdef ENABLE_FP16_KERNELS
case DataType::F16:
create_arm_gemm<float16_t, float16_t>(_arm_gemm, a, b, c, d, act, info);
break;
-#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
+#endif /* ENABLE_FP16_KERNELS */
default:
break;
}
diff --git a/src/cpu/operators/internal/CpuGemmAssemblyDispatch.h b/src/cpu/operators/internal/CpuGemmAssemblyDispatch.h
index 671a222fed..44c5c189a5 100644
--- a/src/cpu/operators/internal/CpuGemmAssemblyDispatch.h
+++ b/src/cpu/operators/internal/CpuGemmAssemblyDispatch.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018-2023 Arm Limited.
+ * Copyright (c) 2018-2024 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -57,6 +57,7 @@ struct AsmGemmInfo
bool fixed_format{false};
arm_compute::WeightFormat weight_format{arm_compute::WeightFormat::UNSPECIFIED};
bool reshape_b_only_on_first_run{true};
+ bool accumulate{false};
/** Whether we want to perform an additional transpose of b before passing it to gemm or pretranspose_B_array
* @note This transpose b operation is also considered a form of "reshape" or "transform", so should be counted for
* by the reshape_b_only_on_first_run flag