diff options
author | Gunes Bayir <gunes.bayir@arm.com> | 2024-03-06 09:58:40 +0000 |
---|---|---|
committer | Gunes Bayir <gunes.bayir@arm.com> | 2024-03-11 10:02:41 +0000 |
commit | 9167c9cd1c684218f76a3c0ec97574dd6f381b98 (patch) | |
tree | 7a9608f1f6861ad164697a0bbdc784be92a8d3e5 | |
parent | e77736fe4150648d2fd0649cf61c1bade928d69d (diff) | |
download | ComputeLibrary-9167c9cd1c684218f76a3c0ec97574dd6f381b98.tar.gz |
Prefer indirect GEMM over direct convolution if supported
Indirect GEMM uses the optimized assembly path, while Direct Conv uses the fallback ACL kernel for convolution.
In certain cases, where the input tensor is large and the filter size is greater than 7 (e.g. 9x9 filters), the heuristics fall back to the Direct Conv algorithm, even though they could still prefer the assembly path when the data layout is NHWC. This matters even more when SME2 kernels are present.
Resolves: COMPMID-6900
Change-Id: Ia611c975eee0423615113fcaeaa8f9eef0421456
Signed-off-by: Gunes Bayir <gunes.bayir@arm.com>
Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/11254
Benchmark: Arm Jenkins <bsgcomp@arm.com>
Tested-by: Arm Jenkins <bsgcomp@arm.com>
Reviewed-by: Anitha Raj <Anitha.Raj@arm.com>
Comments-Addressed: Arm Jenkins <bsgcomp@arm.com>
-rw-r--r-- | docs/user_guide/release_version_and_change_log.dox | 1 | ||||
-rw-r--r-- | src/cpu/operators/CpuConv2d.cpp | 22 | ||||
-rw-r--r-- | tests/datasets/LargeConvolutionLayerDataset.h | 12 | ||||
-rw-r--r-- | tests/validation/NEON/ConvolutionLayer.cpp | 19 |
4 files changed, 48 insertions, 6 deletions
diff --git a/docs/user_guide/release_version_and_change_log.dox b/docs/user_guide/release_version_and_change_log.dox index b788957dda..21a5a368ad 100644 --- a/docs/user_guide/release_version_and_change_log.dox +++ b/docs/user_guide/release_version_and_change_log.dox @@ -43,6 +43,7 @@ If there is more than one release in a month then an extra sequential number is v24.04 Public major release - Optimize start-up time of @ref NEConvolutionLayer for some input configurations where GeMM is selected as the convolution algorithm + - Optimize @ref NEConvolutionLayer for input tensor size > 1e7 bytes and weight tensor height > 7 v24.02 Public major release - Replace template writer with compute kernel writer in dynamic fusion. diff --git a/src/cpu/operators/CpuConv2d.cpp b/src/cpu/operators/CpuConv2d.cpp index 19311733db..26ca2ee783 100644 --- a/src/cpu/operators/CpuConv2d.cpp +++ b/src/cpu/operators/CpuConv2d.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2021, 2023 Arm Limited. + * Copyright (c) 2017-2021, 2023-2024 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -209,12 +209,24 @@ ConvolutionMethod CpuConv2d::get_convolution_method(const ITensorInfo *i } else { + const bool gemmDirectConv2d_validates = + bool(CpuGemmDirectConv2d::validate(input, weights, nullptr, output, info)); + // SRGAN // Output might not be initialized when it is an internal tensor of the layer using the convolution - if (input->total_size() > 1e7 && (weights->dimension(idx_h) > 7) && - (CpuDirectConv2d::validate(input, weights, nullptr, output, conv_info, act_info))) + if (input->total_size() > 1e7 && weights->dimension(idx_h) > 7) { - return ConvolutionMethod::DIRECT; + // This configuration is memory demanding for GEMM method. GEMM_CONV2D which uses indirect convolution + // kernels underneath is the best option. 
+ if (gemmDirectConv2d_validates) + { + return ConvolutionMethod::GEMM_CONV2D; + } + else if (bool(CpuDirectConv2d::validate(input, weights, nullptr, output, conv_info, act_info))) + { + // NCHW data layout is not supported by GEMM_CONV2D + return ConvolutionMethod::DIRECT; + } } if (input->dimension(idx_c) < 16) { @@ -270,7 +282,7 @@ ConvolutionMethod CpuConv2d::get_convolution_method(const ITensorInfo *i { return ConvolutionMethod::WINOGRAD; } - if (bool(CpuGemmDirectConv2d::validate(input, weights, nullptr, output, info))) + if (gemmDirectConv2d_validates) { return ConvolutionMethod::GEMM_CONV2D; } diff --git a/tests/datasets/LargeConvolutionLayerDataset.h b/tests/datasets/LargeConvolutionLayerDataset.h index 72f73ba6d9..c299f2460b 100644 --- a/tests/datasets/LargeConvolutionLayerDataset.h +++ b/tests/datasets/LargeConvolutionLayerDataset.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2020, 2023 Arm Limited. + * Copyright (c) 2017-2020, 2023-2024 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -294,6 +294,16 @@ public: } }; +class VeryLargeConvolutionLayerDataset final : public ConvolutionLayerDataset +{ +public: + VeryLargeConvolutionLayerDataset() + { + // Tensor size > 1e7 bytes && weight dimensions > 7 + add_config(TensorShape(336U, 336U, 32U), TensorShape(9U, 9U, 32U, 64U), TensorShape(64U), TensorShape(168U, 168U, 64U), PadStrideInfo(2, 2, 4, 4)); + } +}; + class LargeGroupedConvolutionLayerDataset final : public ConvolutionLayerDataset { public: diff --git a/tests/validation/NEON/ConvolutionLayer.cpp b/tests/validation/NEON/ConvolutionLayer.cpp index 62690c053e..7a9230d37a 100644 --- a/tests/validation/NEON/ConvolutionLayer.cpp +++ b/tests/validation/NEON/ConvolutionLayer.cpp @@ -109,6 +109,11 @@ const auto ActivationFunctionsDataset = make("ActivationInfo", ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::BOUNDED_RELU, 0.5f) }); +const auto NoActivation = make("ActivationInfo", +{ + ActivationLayerInfo(), +}); + const auto 
ActivationFunctionsDatasetNightly = make("ActivationInfo", { ActivationLayerInfo(), @@ -1201,6 +1206,20 @@ FIXTURE_DATA_TEST_CASE(RunPaddedWeights, NEGEMMConvolutionLayerPaddedWeightsFixt // Validate output validate(Accessor(_target), _reference, rel_tolerance_f32, 0.f, float(abs_tolerance_f32)); } + +// This very large shape test is required to test heuristic paths where the tensor size is > 1e7 bytes +// and weight dimensions larger than 7 +FIXTURE_DATA_TEST_CASE(RunVeryLarge, NEGEMMConvolutionLayerFixture<float>, framework::DatasetMode::NIGHTLY, + combine(datasets::VeryLargeConvolutionLayerDataset(), + framework::dataset::make("ReshapeWeights", { true }), + framework::dataset::make("DataType", DataType::F32), + framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC }), + NoActivation)) +{ + // Validate output + validate(Accessor(_target), _reference, rel_tolerance_f32, 0.f, float(abs_tolerance_f32)); +} + TEST_SUITE_END() // FP32 TEST_SUITE_END() // Float |