aboutsummaryrefslogtreecommitdiff
path: root/src/cpu/operators/internal/CpuGemmAssemblyDispatch.cpp
diff options
context:
space:
mode:
author Jonathan Deakin <jonathan.deakin@arm.com> 2024-01-24 09:15:38 +0000
committer Radu Salavat <radu.salavat@arm.com> 2024-04-15 13:52:31 +0000
commit a668f9f8a4eab405df0fe8dd58e7d9425bcf9640 (patch)
tree db16e6af9289897557a58755b88d2c337dcb8650 /src/cpu/operators/internal/CpuGemmAssemblyDispatch.cpp
parent 34bdffb288d6367cb6dca652ebed60c450854039 (diff)
download ComputeLibrary-a668f9f8a4eab405df0fe8dd58e7d9425bcf9640.tar.gz
Add s8f32 kernels and dynamic QuantizationInfo
- Add support for QASYMM8_SIGNED*QASYMM8_SIGNED->F32 in CpuGemmLowpMatrixMultiplyCore - Add s8f32 kernel using existing s8->s32 kernels with a new DequantizeFloat OutputStage, the structure is similar to Requantize32 but the opposite way around. - Add SME s8f32 kernels with integrated support for DequantizeFloat. - Add scale to CpuGemmLowpOffsetContributionKernel. - Add virtual dequantize scale to gemm_common, only implemented for gemm_interleaved. - Update year to 2024 in generate_build_files. - Add dynamic flag to QuantizationInfo which signals to operators that it can change after configuration - Add support for dynamic quantization in NEGEMMLowpMatrixMultiplyCore - Add dynamic quantization fixture by extending GEMMLowpGenericMatrixMultiplyCoreValidationFixture - Add GEMMLowpDequantizedMatrixMultiplyValidationFixture - Store k (number of cols of A) rather than k_offset in the offset contribution kernels so that we can recompute it when the other offsets change relates to: ONCPUML-1444 MLINFSW-439 Co-authored-by: Milos Puzovic <Milos.Puzovic@arm.com> Co-authored-by: David Mansell <David.Mansell@arm.com> Change-Id: I58a3acf2c09289a303e52eea6b336a696a5bc8da Signed-off-by: Jonathan Deakin <jonathan.deakin@arm.com> Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/11022 Reviewed-by: Gunes Bayir <gunes.bayir@arm.com> Comments-Addressed: Arm Jenkins <bsgcomp@arm.com> Tested-by: Arm Jenkins <bsgcomp@arm.com> Benchmark: Arm Jenkins <bsgcomp@arm.com>
Diffstat (limited to 'src/cpu/operators/internal/CpuGemmAssemblyDispatch.cpp')
-rw-r--r--src/cpu/operators/internal/CpuGemmAssemblyDispatch.cpp53
1 files changed, 53 insertions, 0 deletions
diff --git a/src/cpu/operators/internal/CpuGemmAssemblyDispatch.cpp b/src/cpu/operators/internal/CpuGemmAssemblyDispatch.cpp
index 01a74a5a56..7d85885654 100644
--- a/src/cpu/operators/internal/CpuGemmAssemblyDispatch.cpp
+++ b/src/cpu/operators/internal/CpuGemmAssemblyDispatch.cpp
@@ -540,6 +540,13 @@ void Fallback<TypeInput, TypeOutput, OutputStage>::configure(const ITensorInfo *
{
configure_indirect(a, b, d, gemm_info);
}
+
+ if (std::is_same<OutputStage, arm_gemm::DequantizeFloat>::value)
+ {
+ // Output dequantization is just the two src scales multiplied together
+ _gemm_kernel_asm->set_dequantize_scale(a->quantization_info().uniform().scale *
+ b->quantization_info().uniform().scale);
+ }
}
template <typename TypeInput, typename TypeOutput, class OutputStage>
@@ -630,6 +637,15 @@ void Fallback<TypeInput, TypeOutput, OutputStage>::run(ITensorPack &tensors)
auto d = tensors.get_tensor(TensorType::ACL_DST);
ARM_COMPUTE_ERROR_ON_NULLPTR(a, d);
+ // Only update at runtime if the src quantization is dynamic
+ if (std::is_same<OutputStage, arm_gemm::DequantizeFloat>::value &&
+ (a->info()->quantization_info().is_dynamic() || b->info()->quantization_info().is_dynamic()))
+ {
+ // Output dequantization is just the two src scales multiplied together
+ _gemm_kernel_asm->set_dequantize_scale(a->info()->quantization_info().uniform().scale *
+ b->info()->quantization_info().uniform().scale);
+ }
+
int lda = a->info()->strides_in_bytes().y() / a->info()->element_size();
int ldb = 0;
const int ldd = d->info()->strides_in_bytes().y() / d->info()->element_size();
@@ -784,6 +800,39 @@ void create_arm_gemm(std::unique_ptr<CpuGemmAssemblyDispatch::IFallback> &arm_ge
}
+/** Create an assembly GEMM fallback whose output stage is arm_gemm::DequantizeFloat.
+ *
+ * Builds the arm_gemm arguments from the tensor infos and @p info, configures a
+ * Fallback<TypeInput, TypeOutput, arm_gemm::DequantizeFloat> with them and hands
+ * ownership of the result to @p arm_gemm.
+ *
+ * @tparam TypeInput  Input element type forwarded to Fallback.
+ * @tparam TypeOutput Output element type forwarded to Fallback.
+ *
+ * @param[out]    arm_gemm   Receives the configured fallback; any previously held object is replaced.
+ * @param[in]     a          Tensor info used to extract the GEMM parameters and forwarded to Fallback::configure().
+ * @param[in]     b          Tensor info used to extract the GEMM parameters and forwarded to Fallback::configure().
+ * @param[in]     c          Tensor info forwarded to Fallback::configure().
+ * @param[in,out] d          Tensor info used to extract the GEMM parameters and forwarded (as a non-const pointer)
+ *                           to Fallback::configure(); its uniform quantization scale initialises the output stage.
+ * @param[in]     activation Activation passed on to arm_gemm::GemmArgs.
+ * @param[in]     info       Assembly GEMM settings (weight format, fixed format, fast mode, accumulate).
+ */
template <typename TypeInput, typename TypeOutput>
+void create_arm_gemm_dequant(std::unique_ptr<CpuGemmAssemblyDispatch::IFallback> &arm_gemm,
+                             const ITensorInfo                                   *a,
+                             const ITensorInfo                                   *b,
+                             const ITensorInfo                                   *c,
+                             ITensorInfo                                         *d,
+                             arm_gemm::Activation                                 activation,
+                             const AsmGemmInfo                                   &info)
+{
+    Params             p           = extract_parameters(a, b, d, info);
+    const CPUInfo     &ci          = NEScheduler::get().cpu_info();
+    const unsigned int num_threads = NEScheduler::get().num_threads();
+
+    arm_gemm::GemmConfig cfg;
+    cfg.weight_format = assembly_utils::map_to_arm_gemm_weight_format(info.weight_format);
+    arm_gemm::GemmArgs args(&ci, p.M, p.N, p.K, p.sections, p.batches, p.multis, p.indirect, activation, num_threads,
+                            info.fixed_format, info.fast_mode, info.accumulate, &cfg);
+
+    // Create arm_gemm fallback
+    auto fallback = std::make_unique<Fallback<TypeInput, TypeOutput, arm_gemm::DequantizeFloat>>();
+
+    // Configure dequantization info.
+    // NOTE(review): the stage is seeded with d's scale, while Fallback::configure() calls
+    // set_dequantize_scale(a.scale * b.scale) for this output stage - presumably the latter is the value
+    // the kernel ends up using; confirm.
+    arm_gemm::DequantizeFloat gemm_dequant_info(d->quantization_info().uniform().scale);
+
+    fallback->configure(a, b, c, d, args, info, gemm_dequant_info);
+    arm_gemm = std::move(fallback);
+}
+
+template <typename TypeInput, typename TypeOutput>
void create_arm_gemm_quant(std::unique_ptr<CpuGemmAssemblyDispatch::IFallback> &arm_gemm,
const ITensorInfo *a,
const ITensorInfo *b,
@@ -1031,6 +1080,10 @@ void CpuGemmAssemblyDispatch::configure(
{
create_arm_gemm<int8_t, int32_t>(_arm_gemm, a, b, c, d, act, info);
}
+ else if (d->data_type() == DataType::F32)
+ {
+ create_arm_gemm_dequant<int8_t, float>(_arm_gemm, a, b, c, d, act, info);
+ }
else
{
create_arm_gemm_quant<int8_t, int8_t>(_arm_gemm, a, b, c, d, act, info);