From f1f1f87132690a8061801ef1a4638d637c780df7 Mon Sep 17 00:00:00 2001 From: Radu Salavat Date: Tue, 27 Feb 2024 18:32:26 +0000 Subject: Add in place summation to CPU GEMM kernels Instead of dispatching the sum postop for GEMM kernels to a separate kernel + add, that requires an extra destination sized allocation, plus 3 extra load/stores per element, just do it in the GEMM kernel. Resolves: ONCPUML-1442 Signed-off-by: Radu Salavat Co-authored-by: Milos Puzovic Change-Id: I7a1f2da3300875fa1ac88b705a34390969518077 Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/11298 Reviewed-by: Gunes Bayir Tested-by: Arm Jenkins Comments-Addressed: Arm Jenkins Benchmark: Arm Jenkins --- tests/validation/fixtures/GEMMFixture.h | 60 ++++++++++--- tests/validation/fixtures/GEMMLowpFixture.h | 127 +++++++++++++++++++++------- 2 files changed, 144 insertions(+), 43 deletions(-) (limited to 'tests/validation/fixtures') diff --git a/tests/validation/fixtures/GEMMFixture.h b/tests/validation/fixtures/GEMMFixture.h index afde3d8067..94bedc83e1 100644 --- a/tests/validation/fixtures/GEMMFixture.h +++ b/tests/validation/fixtures/GEMMFixture.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2023 Arm Limited. + * Copyright (c) 2017-2024 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -46,14 +46,14 @@ namespace test namespace validation { template -class GEMMValidationFixture : public framework::Fixture +class GEMMGenericValidationFixture : public framework::Fixture { public: - void setup(TensorShape shape_a, TensorShape shape_b, TensorShape shape_c, TensorShape output_shape, float alpha, float beta, bool pretranspose, DataType data_type) + void setup(TensorShape shape_a, TensorShape shape_b, TensorShape shape_c, TensorShape output_shape, float alpha, float beta, bool pretranspose, DataType data_type, bool accumulate=false) { ARM_COMPUTE_UNUSED(pretranspose); - _target = compute_target(shape_a, shape_b, shape_c, output_shape, alpha, beta, data_type); - _reference = compute_reference(shape_a, shape_b, output_shape, alpha, beta, data_type); + _target = compute_target(shape_a, shape_b, shape_c, output_shape, alpha, beta, data_type, accumulate); + _reference = compute_reference(shape_a, shape_b, output_shape, alpha, beta, data_type, accumulate); } protected: @@ -80,7 +80,7 @@ protected: } TensorType compute_target(const TensorShape &shape_a, const TensorShape &shape_b, const TensorShape &shape_c, const TensorShape &output_shape, float alpha, float beta, - DataType data_type) + DataType data_type, bool accumulate=false) { // Create tensors TensorType a = create_tensor(shape_a, data_type, 1); @@ -99,7 +99,7 @@ protected: &dst, alpha, beta, GEMMInfo(false, false, false, (reinterpret_output_as_3d ? output_shape[2] : 0), reinterpret_input_as_3d, false, GEMMLowpOutputStageInfo(), false, false, (reinterpret_input_as_3d - || reinterpret_output_as_3d))); + || reinterpret_output_as_3d), arm_compute::ActivationLayerInfo(), false /* fixed_format */, arm_compute::WeightFormat::UNSPECIFIED, false /* pretranspose_B */, accumulate)); ARM_COMPUTE_ASSERT(a.info()->is_resizable()); ARM_COMPUTE_ASSERT(b.info()->is_resizable()); ARM_COMPUTE_ASSERT(c.info()->is_resizable()); @@ -121,11 +121,14 @@ protected: // Fill tensors fill(AccessorType(a), 0); fill(AccessorType(b), 1); + if (accumulate) + { + fill(AccessorType(dst), 6); + } if(!disable_c) { fill(AccessorType(c), 2); } - // Run with variable inputs. 
if(run_twice) { @@ -145,7 +148,7 @@ protected: } SimpleTensor compute_reference(const TensorShape &shape_a, const TensorShape &shape_b, const TensorShape &output_shape, float alpha, float beta, - DataType data_type) + DataType data_type, bool accumulate=false) { TensorShape shape_a_to_use = shape_a; if(reinterpret_input_as_3d) @@ -158,6 +161,7 @@ protected: SimpleTensor a{ shape_a_to_use, data_type, 1 }; SimpleTensor b{ shape_b, data_type, 1 }; SimpleTensor c{ output_shape, data_type, 1 }; + SimpleTensor dst{ output_shape, data_type, 1 }; // Fill reference fill(a, 0); @@ -211,17 +215,51 @@ protected: fill(c, 5); } + // Do in place summation + if (accumulate) + { + fill(dst, 6); + } + // Setting beta to 0 will effectively disable C for the // computation of the reference: alpha * A * B + 0 * C // Use transposed tensors if boolean enabled else use original tensors - auto r = reference::gemm((pretranspose_a) ? a_transposed : a, (pretranspose_b) ? b_transposed : b, c, alpha, disable_c ? 0.f : beta); - return r; + if (accumulate) + { + reference::gemm_accumulate((pretranspose_a) ? a_transposed : a, (pretranspose_b) ? b_transposed : b, c, alpha, disable_c ? 0.f : beta, dst); + return dst; + } + else + { + return reference::gemm((pretranspose_a) ? a_transposed : a, (pretranspose_b) ? b_transposed : b, c, alpha, disable_c ? 0.f : beta); + } } TensorType _target{}; SimpleTensor _reference{}; }; +template +class GEMMValidationFixture : protected GEMMGenericValidationFixture +{ +public: + void setup(TensorShape shape_a, TensorShape shape_b, TensorShape shape_c, TensorShape output_shape, float alpha, float beta, bool pretranspose, DataType data_type) + { + GEMMGenericValidationFixture::setup(shape_a, shape_b, shape_c, output_shape, alpha, beta, pretranspose, data_type, false /*accumulate*/); + } +}; + +template +class GEMMAccumulateValidationFixture : protected GEMMGenericValidationFixture +{ +public: + void setup(TensorShape shape_a, TensorShape shape_b, TensorShape shape_c, TensorShape output_shape, float alpha, float beta, bool pretranspose, DataType data_type) + { + bool accumulate = true; + GEMMGenericValidationFixture::setup(shape_a, shape_b, shape_c, output_shape, alpha, beta, pretranspose, data_type, accumulate); + } +}; + template class GEMMMatrixMultiplyValidationFixture : public framework::Fixture { diff --git a/tests/validation/fixtures/GEMMLowpFixture.h b/tests/validation/fixtures/GEMMLowpFixture.h index a65a1e6bd8..0f6908a468 100644 --- a/tests/validation/fixtures/GEMMLowpFixture.h +++ b/tests/validation/fixtures/GEMMLowpFixture.h @@ -30,6 +30,7 @@ #include "tests/framework/Fixture.h" #include "tests/validation/Validation.h" #include "tests/validation/reference/GEMMLowp.h" +#include "tests/validation/reference/ArithmeticOperations.h" #include #include @@ -42,16 +43,21 @@ namespace validation { namespace { - template void fill(U &&tensor, int i) +{ + library->fill_tensor_uniform(tensor, i); +} + +template +void fill_quantized(U &&tensor, int i) { ARM_COMPUTE_ASSERT(is_data_type_quantized(tensor.data_type())); library->fill_tensor_uniform(tensor, i); } template -void fill_bias_s32(U &&tensor, int i, int32_t min, int32_t max) +void fill_s32(U &&tensor, int i, int32_t min, int32_t max) { ARM_COMPUTE_ASSERT(tensor.data_type() == DataType::S32); std::uniform_int_distribution distribution(min, max); @@ -64,6 +70,11 @@ struct TensorFillInfo // Bias fill range. Default values are arbitrary int32_t min_bias {-20000}; int32_t max_bias {20000}; + + // Output fill range. 
Default values are arbitrary + int32_t min_output {-20000}; + int32_t max_output {20000}; + // Optional extra hash to randomize tensor filling int32_t hash {0}; }; @@ -71,7 +82,8 @@ struct TensorFillInfo template TensorType compute_gemmlowp_target(const TensorShape &shape_a, const TensorShape &shape_b, const TensorShape &shape_output, const QuantizationInfo& a_qinfo, const QuantizationInfo& b_qinfo, const QuantizationInfo& output_qinfo, DataType data_type_a = DataType::QASYMM8, DataType data_type_b = DataType::QASYMM8, - GEMMLowpOutputStageInfo output_stage = GEMMLowpOutputStageInfo(), bool reshape_b_only_on_first_run = false, const TensorFillInfo& finfo = TensorFillInfo() ) + GEMMLowpOutputStageInfo output_stage = GEMMLowpOutputStageInfo(), bool reshape_b_only_on_first_run = false, const TensorFillInfo& finfo = TensorFillInfo(), + bool accumulate = false) { ARM_COMPUTE_ASSERT(is_data_type_quantized_asymmetric(data_type_a)); ARM_COMPUTE_ASSERT(data_type_a == data_type_b); @@ -93,7 +105,9 @@ TensorType compute_gemmlowp_target(const TensorShape &shape_a, const TensorShape // The GEMMinfo includes the values of the depth in case of reinterpreted 3d input/output FunctionType gemmlowp; gemmlowp.configure(&a, &b, is_fused ? &bias : nullptr, &output, GEMMInfo(false, false, reshape_b_only_on_first_run, (reinterpret_output_as_3d ? shape_output[2] : 0), reinterpret_input_as_3d, false, - output_stage)); + output_stage, false /*fp_mixed_precision*/, false /*fast_math*/, false /*broadcast_bias*/, + arm_compute::ActivationLayerInfo(), false /* fixed_format */, arm_compute::WeightFormat::UNSPECIFIED, + false /* pretranspose_B */, accumulate)); ARM_COMPUTE_ASSERT(a.info()->is_resizable()); ARM_COMPUTE_ASSERT(b.info()->is_resizable()); @@ -111,26 +125,32 @@ TensorType compute_gemmlowp_target(const TensorShape &shape_a, const TensorShape ARM_COMPUTE_ASSERT(!output.info()->is_resizable()); // Fill tensors - fill(AccessorType(a), 0 + finfo.hash); - fill(AccessorType(b), 1 + finfo.hash); + fill_quantized(AccessorType(a), 0 + finfo.hash); + fill_quantized(AccessorType(b), 1 + finfo.hash); + + if (accumulate) + { + ARM_COMPUTE_ASSERT(accumulate != run_twice); + fill_s32(AccessorType(output), 6 + finfo.hash, finfo.min_output, finfo.max_output); + } if(is_fused) { ARM_COMPUTE_ASSERT(bias.info()->is_resizable()); bias.allocator()->allocate(); ARM_COMPUTE_ASSERT(!bias.info()->is_resizable()); - fill_bias_s32(AccessorType(bias), 2 + finfo.hash, finfo.min_bias, finfo.max_bias); + fill_s32(AccessorType(bias), 2 + finfo.hash, finfo.min_bias, finfo.max_bias); } // Run with variable inputs. 
if(run_twice) { gemmlowp.run(); - fill(AccessorType(a), 3 + finfo.hash); // Fill tensors with new seed after run - fill(AccessorType(b), 4 + finfo.hash); + fill_quantized(AccessorType(a), 3 + finfo.hash); // Fill tensors with new seed after run + fill_quantized(AccessorType(b), 4 + finfo.hash); if(is_fused) { - fill_bias_s32(AccessorType(bias), 5 + finfo.hash, finfo.min_bias, finfo.max_bias); + fill_s32(AccessorType(bias), 5 + finfo.hash, finfo.min_bias, finfo.max_bias); } } @@ -168,8 +188,8 @@ SimpleTensor compute_gemmlowp_reference(const TensorShape &shape_a, con SimpleTensor b_transposed{ shape_b_transposed, data_type_b, 1, b_qinfo }; // Fill reference - fill(a, 0 + finfo.hash); - fill(b, 1 + finfo.hash); + fill_quantized(a, 0 + finfo.hash); + fill_quantized(b, 1 + finfo.hash); // Transpose reference if required /* Note: Assuming the usual batch matmul dimensions A = (B x M x K), B = (B x K x N), if pretranspose_A is set to true, then A is assumed to be (B x K x M), @@ -189,11 +209,12 @@ SimpleTensor compute_gemmlowp_reference(const TensorShape &shape_a, con // Run with variable inputs. const int32_t a_offset = a_qinfo.uniform().offset; const int32_t b_offset = b_qinfo.uniform().offset; + if(run_twice) { reference::gemmlowp_matrix_multiply_core((pretranspose_A ? a_transposed : a), (pretranspose_B ? b_transposed : b), shape_output, a_offset, b_offset); - fill((pretranspose_A) ? a_transposed : a, 3 + finfo.hash); - fill((pretranspose_B) ? b_transposed : b, 4 + finfo.hash); + fill_quantized((pretranspose_A) ? a_transposed : a, 3 + finfo.hash); + fill_quantized((pretranspose_B) ? b_transposed : b, 4 + finfo.hash); } return reference::gemmlowp_matrix_multiply_core((pretranspose_A ? a_transposed : a), (pretranspose_B ? b_transposed : b), shape_output, a_offset, b_offset); @@ -201,35 +222,68 @@ SimpleTensor compute_gemmlowp_reference(const TensorShape &shape_a, con } // namespace template -class GEMMLowpMatrixMultiplyCoreValidationFixture : public framework::Fixture +class GEMMLowpGenericMatrixMultiplyCoreValidationFixture : public framework::Fixture { public: - void setup(TensorShape shape_a, TensorShape shape_b, TensorShape shape_output, int32_t a_offset, int32_t b_offset) + void setup(TensorShape shape_a, TensorShape shape_b, TensorShape shape_output, int32_t a_offset, int32_t b_offset, bool accumulate=false) { const auto a_qinfo = QuantizationInfo(1.0f / 255, a_offset); const auto b_qinfo = QuantizationInfo(1.0f / 255, b_offset); - _target = compute_target(shape_a, shape_b, shape_output, a_qinfo, b_qinfo); - _reference = compute_reference(shape_a, shape_b, shape_output, a_qinfo, b_qinfo); + TensorFillInfo finfo; + _target = compute_target(shape_a, shape_b, shape_output, a_qinfo, b_qinfo, finfo, accumulate); + _reference = compute_reference(shape_a, shape_b, shape_output, a_qinfo, b_qinfo, finfo, accumulate); } protected: - TensorType compute_target(const TensorShape &shape_a, const TensorShape &shape_b, const TensorShape &shape_output, const QuantizationInfo& a_qinfo, const QuantizationInfo& b_qinfo) + TensorType compute_target(const TensorShape &shape_a, const TensorShape &shape_b, const TensorShape &shape_output, const QuantizationInfo& a_qinfo, const QuantizationInfo& b_qinfo, const TensorFillInfo& finfo, const bool accumulate) { const auto output_qinfo = QuantizationInfo(); // No output stage - return compute_gemmlowp_target(shape_a, shape_b, shape_output, a_qinfo, b_qinfo, output_qinfo); + return compute_gemmlowp_target(shape_a, shape_b, shape_output, a_qinfo, b_qinfo, output_qinfo, + 
DataType::QASYMM8, DataType::QASYMM8, GEMMLowpOutputStageInfo(), false, finfo, accumulate); } - SimpleTensor compute_reference(const TensorShape &shape_a, const TensorShape &shape_b, const TensorShape &shape_output, const QuantizationInfo& a_qinfo, const QuantizationInfo& b_qinfo) + SimpleTensor compute_reference(const TensorShape &shape_a, const TensorShape &shape_b, const TensorShape &shape_output, const QuantizationInfo& a_qinfo, const QuantizationInfo& b_qinfo, const TensorFillInfo& finfo, bool accumulate) { - return compute_gemmlowp_reference(shape_a, shape_b, shape_output, a_qinfo, b_qinfo); + SimpleTensor ref_output = compute_gemmlowp_reference(shape_a, shape_b, shape_output, a_qinfo, b_qinfo, + DataType::QASYMM8, DataType::QASYMM8, finfo); + + if (accumulate) + { + SimpleTensor output{ shape_output, DataType::S32, 1 }; + fill_s32(output, 6 + finfo.hash, finfo.min_output, finfo.max_output); + reference::arithmetic_operation(reference::ArithmeticOperation::ADD, output, ref_output, output, ConvertPolicy::SATURATE); + return output; + } + + return ref_output; } TensorType _target{}; SimpleTensor _reference{}; }; +template +class GEMMLowpMatrixMultiplyCoreValidationFixture : protected GEMMLowpGenericMatrixMultiplyCoreValidationFixture +{ +public: + void setup(TensorShape shape_a, TensorShape shape_b, TensorShape shape_output, int32_t a_offset, int32_t b_offset) + { + GEMMLowpGenericMatrixMultiplyCoreValidationFixture::setup(shape_a, shape_b, shape_output, a_offset, b_offset, false /* accumulate */); + } +}; + +template +class GEMMLowpMatrixMultiplyAccumulateValidationFixture : protected GEMMLowpGenericMatrixMultiplyCoreValidationFixture +{ +public: + void setup(TensorShape shape_a, TensorShape shape_b, TensorShape shape_output, int32_t a_offset, int32_t b_offset) + { + GEMMLowpGenericMatrixMultiplyCoreValidationFixture::setup(shape_a, shape_b, shape_output, a_offset, b_offset, true /* accumulate */); + } +}; + template -class GEMMLowpMatrixMultiplyCoreFusedOffsetOutputGenericValidationFixture : public framework::Fixture +class GEMMLowpGenericMatrixMultiplyCoreFusedOffsetOutputValidationFixture : public framework::Fixture { public: /** Dynamically initialize the quantization info with saturation awareness @@ -363,16 +417,16 @@ protected: TensorShape bias_shape(shape_b[0]); SimpleTensor bias{ bias_shape, DataType::S32, 1 }; - (run_twice) ? fill_bias_s32(bias, 5 + finfo.hash, finfo.min_bias, finfo.max_bias) : fill_bias_s32(bias, 2 + finfo.hash, finfo.min_bias, finfo.max_bias); // Fill bias with same seed as last run of gemmlowp_target + (run_twice) ? 
fill_s32(bias, 5 + finfo.hash, finfo.min_bias, finfo.max_bias) : fill_s32(bias, 2 + finfo.hash, finfo.min_bias, finfo.max_bias); // Fill bias with same seed as last run of gemmlowp_target switch(output_stage.type) { case GEMMLowpOutputStageType::QUANTIZE_DOWN: - return reference::gemmlowp_quantize_down_scale(output, bias, + return reference::gemmlowp_quantize_down_scale(output, bias, output_stage.gemmlowp_offset, output_stage.gemmlowp_multipliers, output_stage.gemmlowp_shifts, output_stage.gemmlowp_min_bound, output_stage.gemmlowp_max_bound); break; case GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT: - return reference::gemmlowp_quantize_down_scale_by_fixedpoint(output, bias, + return reference::gemmlowp_quantize_down_scale_by_fixedpoint(output, bias, output_stage.gemmlowp_multipliers, output_stage.gemmlowp_shifts, output_stage.gemmlowp_offset, output_stage.gemmlowp_min_bound, output_stage.gemmlowp_max_bound); break; default: @@ -384,15 +438,24 @@ protected: SimpleTensor _reference{}; }; -template -class GEMMLowpMatrixMultiplyCoreFusedOffsetOutputValidationFixture : public - GEMMLowpMatrixMultiplyCoreFusedOffsetOutputGenericValidationFixture +template +class GEMMLowpMatrixMultiplyCoreFusedOffsetOutputValidationFixture : public GEMMLowpGenericMatrixMultiplyCoreFusedOffsetOutputValidationFixture +{ +public: + void setup(TensorShape shape_a, TensorShape shape_b, TensorShape shape_output, GEMMLowpOutputStageType output_stage_type, DataType data_type, bool reshape_b_only_on_first_run) + { + GEMMLowpGenericMatrixMultiplyCoreFusedOffsetOutputValidationFixture::setup(shape_a, shape_b, + shape_output, output_stage_type, data_type, reshape_b_only_on_first_run); + } +}; + +template +class GEMMLowpBatchedMatrixMultiplyCoreFusedOffsetOutputFixture : public GEMMLowpGenericMatrixMultiplyCoreFusedOffsetOutputValidationFixture { public: - void setup(TensorShape shape_a, TensorShape shape_b, TensorShape shape_output, GEMMLowpOutputStageType output_stage_type, DataType data_type) + void setup(TensorShape shape_a, TensorShape shape_b, TensorShape shape_output, GEMMLowpOutputStageType output_stage_type, DataType data_type, bool reshape_b_only_on_first_run) { - GEMMLowpMatrixMultiplyCoreFusedOffsetOutputGenericValidationFixture::setup(shape_a, shape_b, - shape_output, output_stage_type, data_type, false /* reshape_b_only_on_first_run */); + GEMMLowpGenericMatrixMultiplyCoreFusedOffsetOutputValidationFixture::setup(shape_a, shape_b, shape_output, output_stage_type, data_type, reshape_b_only_on_first_run); } }; -- cgit v1.2.1
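
For context on what the new floating-point fixtures check: when accumulate is true, GEMMGenericValidationFixture pre-fills the destination (fill seed 6), forwards the flag through the trailing GEMMInfo argument, and validates against reference::gemm_accumulate, i.e. the destination keeps its previous contents and the GEMM result is summed into it in place. The following is a minimal scalar sketch of that expected semantics only; the helper name and row-major layout are assumptions for illustration, and this is not the library's kernel or reference code.

#include <cstddef>
#include <vector>

// Illustrative model of the accumulate semantics exercised by
// GEMMAccumulateValidationFixture: dst already holds data and the result of
// alpha * A * B + beta * C is summed into it in place.
// A is M x K, B is K x N, C and dst are M x N, all row-major.
void gemm_accumulate_sketch(const std::vector<float> &a, const std::vector<float> &b,
                            const std::vector<float> &c, float alpha, float beta,
                            std::vector<float> &dst, std::size_t M, std::size_t N, std::size_t K)
{
    for (std::size_t m = 0; m < M; ++m)
    {
        for (std::size_t n = 0; n < N; ++n)
        {
            float acc = 0.f;
            for (std::size_t k = 0; k < K; ++k)
            {
                acc += a[m * K + k] * b[k * N + n];
            }
            // In-place summation: add onto the existing destination value
            // instead of overwriting it.
            dst[m * N + n] += alpha * acc + beta * c[m * N + n];
        }
    }
}

Doing this inside the GEMM kernel is what removes the separate destination-sized temporary and the extra add pass described in the commit message.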
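
The quantized fixtures take a slightly different route on the reference side: GEMMLowpGenericMatrixMultiplyCoreValidationFixture::compute_reference computes the usual S32 gemmlowp result, fills a second S32 tensor with the same seed used for the target's destination (6 + finfo.hash), and combines the two with reference::arithmetic_operation(ArithmeticOperation::ADD, ..., ConvertPolicy::SATURATE). Below is a small stand-alone sketch of that saturating combine, with an illustrative function name; it only mirrors the check, not the library implementation.

#include <algorithm>
#include <cstddef>
#include <cstdint>
#include <limits>
#include <vector>

// Fold the int32 gemmlowp result into the pre-filled S32 destination with
// saturation, mirroring the ADD + ConvertPolicy::SATURATE step used by the
// accumulate variant of the GEMMLowp reference check.
void accumulate_s32_saturate(const std::vector<int32_t> &gemmlowp_result, std::vector<int32_t> &dst)
{
    const int64_t lo = std::numeric_limits<int32_t>::min();
    const int64_t hi = std::numeric_limits<int32_t>::max();
    for (std::size_t i = 0; i < dst.size(); ++i)
    {
        const int64_t sum = static_cast<int64_t>(dst[i]) + static_cast<int64_t>(gemmlowp_result[i]);
        dst[i] = static_cast<int32_t>(std::min(std::max(sum, lo), hi));
    }
}

Note that the target side also asserts that accumulate and run_twice are not enabled together in these fixtures.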