diff options
author | SiCong Li <sicong.li@arm.com> | 2023-10-17 17:38:57 +0100 |
---|---|---|
committer | SiCong Li <sicong.li@arm.com> | 2023-11-08 09:49:56 +0000 |
commit | c5ab4df0c11dc66db47f2070edc719923af3367e (patch) | |
tree | c04bdac32528e628b2a9b9a1c1653e300328fc1b /tests | |
parent | 4a9dbedfbfa66c2612c7461e60cd867b8aea825b (diff) | |
download | ComputeLibrary-c5ab4df0c11dc66db47f2070edc719923af3367e.tar.gz |
Optimize CpuGemmConv2d start-up time
When weight has no holes, we can replace CpuWeightsReshapeKernel with:
- Collapse by reinterpreting weight's 3 spatial dimensions
- Perform CpuTranspose
For more details see the documentation in
src/cpu/operators/CpuGemmConv2d.cpp
This is one optimization since the CpuTranspose is better performing
than CpuWeightsReshapeKernel
A second optimization is to fuse this transpose with other weight
transformations (e.g. pretranspose_B_array in CpuGemmAssemblyDispatch)
However this second optimization depends on how the underlying gemm
methods (the fall back path: CpuGemmMatrixMultiplyKernel or the assembly
path: CpuGemmAssemblyDispatch) chooses to fuse the transpose.
Therefore, this patch moves the transpose down from CpuGemmConv2d, to
the individual gemm operators where the fusion decision needs to be
made, by passing an extra "transpose_b" flag to CpuGemm
New transpose_b flag in different scopes (they are all the same, but
with different names because pretranspose_b has a different meaning in
GemmAssemblyDispatch):
GEMMInfo::pretranspose_B -> AsmGemmInfo::transpose_b
New auxilliary tensors holding the transposed b result:
- CpuGemm optimized path: CpuGemmAssemblyDispatch::PrePretransposedB
- CpuGemm fallback path: CpuGemm::PreTransposedRHS
Note that this patch does not yet have the second optimization
(COMPMID-6595), but it prepares for it.
Relates to COMPMID-6595
Resolves COMPMID-6499
Change-Id: I999a2da9da4b2b15369a3cc06d7872c86e0190ea
Signed-off-by: SiCong Li <sicong.li@arm.com>
Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/10526
Tested-by: Arm Jenkins <bsgcomp@arm.com>
Reviewed-by: Anitha Raj <Anitha.Raj@arm.com>
Reviewed-by: Gunes Bayir <gunes.bayir@arm.com>
Comments-Addressed: Arm Jenkins <bsgcomp@arm.com>
Benchmark: Arm Jenkins <bsgcomp@arm.com>
Diffstat (limited to 'tests')
-rw-r--r-- | tests/validation/NEON/ConvolutionLayer.cpp | 18 | ||||
-rw-r--r-- | tests/validation/fixtures/ConvolutionLayerFixture.h | 29 |
2 files changed, 42 insertions, 5 deletions
diff --git a/tests/validation/NEON/ConvolutionLayer.cpp b/tests/validation/NEON/ConvolutionLayer.cpp index 7a274906a6..98a5be5484 100644 --- a/tests/validation/NEON/ConvolutionLayer.cpp +++ b/tests/validation/NEON/ConvolutionLayer.cpp @@ -1032,6 +1032,8 @@ TEST_SUITE(GEMMConvolutionLayer) template <typename T> using NEGEMMConvolutionLayerFixture = ConvolutionValidationFixture<Tensor, Accessor, NEConvolutionLayer, T>; template <typename T> +using NEGEMMConvolutionLayerPaddedWeightsFixture = ConvolutionValidationPaddedWeightsFixture<Tensor, Accessor, NEConvolutionLayer, T>; +template <typename T> using NEGEMMConvolutionLayerMixedDataLayoutFixture = ConvolutionValidationFixture<Tensor, Accessor, NEConvolutionLayer, T, true>; /** Test case for memory injection in @ref cpu::CpuGemmConv2d. @@ -1184,9 +1186,25 @@ FIXTURE_DATA_TEST_CASE(RunMixedDataLayout, NEGEMMConvolutionLayerMixedDataLayout // Validate output validate(Accessor(_target), _reference, rel_tolerance_f32, 0.f, float(abs_tolerance_f32)); } +/** Padded weights + * CpuGemmConv2d uses two different paths for reshaping the weights based on if the weight tensor has holes (a common + * way to have "holes" in tensor is via extended paddings) + * + * We only need to test the padded weight path here on a single floating data type and a single layout, because the fallback path is agnostic of them + */ +FIXTURE_DATA_TEST_CASE(RunPaddedWeights, NEGEMMConvolutionLayerPaddedWeightsFixture<float>, framework::DatasetMode::ALL, combine(datasets::SmallConvolutionLayerDataset(), + framework::dataset::make("ReshapeWeights", { true }), + framework::dataset::make("DataType", DataType::F32), + framework::dataset::make("DataLayout", { DataLayout::NHWC }) + )) +{ + // Validate output + validate(Accessor(_target), _reference, rel_tolerance_f32, 0.f, float(abs_tolerance_f32)); +} TEST_SUITE_END() // FP32 TEST_SUITE_END() // Float +// TODO: COMPMID-6596 Extend quantized tests with at least one suite where the weight is padded (the legacy case, see floating point's RunPaddedWeights) template <typename T> using NEGEMMConvolutionLayerQuantizedFixture = ConvolutionValidationQuantizedFixture<Tensor, Accessor, NEConvolutionLayer, T>; template <typename T> diff --git a/tests/validation/fixtures/ConvolutionLayerFixture.h b/tests/validation/fixtures/ConvolutionLayerFixture.h index 2051add225..0622e5e6f0 100644 --- a/tests/validation/fixtures/ConvolutionLayerFixture.h +++ b/tests/validation/fixtures/ConvolutionLayerFixture.h @@ -123,7 +123,7 @@ public: public: void setup(TensorShape input_shape, TensorShape weights_shape, TensorShape bias_shape, TensorShape output_shape, PadStrideInfo info, Size2D dilation, bool reshape_weights, DataType data_type, DataType weights_data_type, DataLayout data_layout, QuantizationInfo quantization_info, QuantizationInfo weight_quantization_info, ActivationLayerInfo act_info, - bool mixed_layout = false, PaddingList pre_pad_layer = PaddingList({})) + bool mixed_layout = false, PaddingList pre_pad_layer = PaddingList({}), bool padded_weights = false) { // This hash is used by random generators. There may be hash collisions but // this is intentional as it's a very easy way to make the the current @@ -151,7 +151,7 @@ public: _use_dynamic_output_quant = true; } - _target = compute_target(input_shape, weights_shape, bias_shape, output_shape, info, reshape_weights, dilation, act_info, pre_pad_layer); + _target = compute_target(input_shape, weights_shape, bias_shape, output_shape, info, reshape_weights, dilation, act_info, pre_pad_layer, padded_weights); _reference = compute_reference(input_shape, weights_shape, bias_shape, output_shape, info, dilation, act_info, pre_pad_layer); } @@ -267,7 +267,7 @@ protected: // given input is IN nchw format TensorType compute_target(TensorShape input_shape, TensorShape weights_shape, const TensorShape &bias_shape, TensorShape output_shape, const PadStrideInfo &info, - bool reshape_weights, const Size2D &dilation, const ActivationLayerInfo act_info, PaddingList pre_pad_layer = PaddingList({})) + bool reshape_weights, const Size2D &dilation, const ActivationLayerInfo act_info, PaddingList pre_pad_layer = PaddingList({}), bool padded_weights = false) { ARM_COMPUTE_ERROR_ON((input_shape[2] % weights_shape[2]) != 0); @@ -335,8 +335,13 @@ protected: ARM_COMPUTE_ASSERT(weights.info()->is_resizable()); ARM_COMPUTE_ASSERT(bias.info()->is_resizable()); ARM_COMPUTE_ASSERT(dst.info()->is_resizable()); - - add_padding_x({ &src, &weights, &bias, &dst }, _data_layout); + // Test "add padding after configure" behavior. This behavior should not affect the correctness + add_padding_x({ &src, &bias, &dst }, _data_layout); + // Padding weights may affect code path in some backends + if (padded_weights) + { + add_padding_x({ &weights }, _data_layout); + } // Allocate tensors src.allocator()->allocate(); @@ -437,6 +442,19 @@ public: }; template <typename TensorType, typename AccessorType, typename FunctionType, typename T, bool mixed_layout = false> +class ConvolutionValidationPaddedWeightsFixture : public ConvolutionValidationGenericFixture<TensorType, AccessorType, FunctionType, T, T> +{ +public: + void setup(TensorShape input_shape, TensorShape weights_shape, TensorShape bias_shape, TensorShape output_shape, PadStrideInfo info, Size2D dilation, bool reshape_weights, DataType data_type, + DataLayout data_layout) + { + ConvolutionValidationGenericFixture<TensorType, AccessorType, FunctionType, T, T>::setup(input_shape, weights_shape, bias_shape, output_shape, info, dilation, reshape_weights, + data_type, data_type, data_layout, + QuantizationInfo(), QuantizationInfo(), ActivationLayerInfo(), mixed_layout, PaddingList({}), true); + } +}; + +template <typename TensorType, typename AccessorType, typename FunctionType, typename T, bool mixed_layout = false> class ConvolutionValidationWithPaddingFixture : public ConvolutionValidationGenericFixture<TensorType, AccessorType, FunctionType, T, T> { public: @@ -481,6 +499,7 @@ public: } }; + #ifdef ARM_COMPUTE_ENABLE_FIXED_FORMAT_KERNELS inline TensorInfo prepare_weights(const TensorInfo tensor_info, const arm_compute::WeightFormat weight_format) { |