From 5e281814c5110724d99fe8ee64bdf42ef2c31bce Mon Sep 17 00:00:00 2001 From: Gian Marco Iodice Date: Tue, 6 Jul 2021 13:19:41 +0100 Subject: Fix manual LOOP_UNROLLING The issue is caused by the number of iterations passed to LOOP_UNROLLING. When we use the manual LOOP_UNROLLING, the number of iterations must be less than or equal to 128. To overcome this problem, we create a utility function to check if any of the critical iterations (kernel dimensions) are beyond that limit. If so, the utility function disables the manual loop unrolling. Resolves COMPMID-4609 Change-Id: I7221c967609e462a5abd1cbb74e2a120f344fcb3 Signed-off-by: Gian Marco Iodice Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/5913 Reviewed-by: Georgios Pinitas Tested-by: Arm Jenkins Comments-Addressed: Arm Jenkins --- arm_compute/core/CL/CLHelpers.h | 13 +++++++++++ src/core/CL/CLHelpers.cpp | 12 ++++++++++ .../CLDepthwiseConvolutionLayerNativeKernel.cpp | 11 +++++++-- tests/datasets/DepthwiseConvolutionLayerDataset.h | 10 +++++++++ tests/validation/CL/DepthwiseConvolutionLayer.cpp | 13 +++++++++++ .../fixtures/DepthwiseConvolutionLayerFixture.h | 26 ++++++++++++---------- 6 files changed, 71 insertions(+), 14 deletions(-) diff --git a/arm_compute/core/CL/CLHelpers.h b/arm_compute/core/CL/CLHelpers.h index 180211c558..ce7e675b56 100644 --- a/arm_compute/core/CL/CLHelpers.h +++ b/arm_compute/core/CL/CLHelpers.h @@ -46,6 +46,9 @@ enum class DataType; /** Max vector width of an OpenCL vector */ static constexpr unsigned int max_cl_vector_width = 16; +/** Max number of manual loop unrolling */ +static constexpr int max_manual_loop_unrolling = 128; + /** Translates a tensor data type to the appropriate OpenCL type. * * @param[in] dt @ref DataType to be translated to OpenCL type. 
@@ -244,5 +247,15 @@ void set_wbsm(cl::Kernel &kernel, cl_int wbsm_hint); */ bool export_weights_to_cl_image(const ITensorInfo *tensor); +/* Helper function to force unroll with pragma when any of the input values (iterations) are greater than @ref max_manual_loop_unrolling + * + * This function passes UNROLL_WITH_PRAGMA at compile time when any of the input values are greater than @ref max_manual_loop_unrolling + * + * @param[in] built_opts OpenCL kernel build options + * @param[in] values Input values (iterations) + * + */ +void set_unroll_with_pragma(CLBuildOptions &built_opts, std::initializer_list<int> values); + } // namespace arm_compute #endif /* ARM_COMPUTE_CLHELPERS_H */ diff --git a/src/core/CL/CLHelpers.cpp b/src/core/CL/CLHelpers.cpp index 3323929742..5c53455eeb 100644 --- a/src/core/CL/CLHelpers.cpp +++ b/src/core/CL/CLHelpers.cpp @@ -466,4 +466,16 @@ bool export_weights_to_cl_image(const ITensorInfo *tensor) return true; } +void set_unroll_with_pragma(CLBuildOptions &built_opts, std::initializer_list<int> values) +{ + for(const int value : values) + { + if(value > max_manual_loop_unrolling) + { + built_opts.add_option("-DUNROLL_WITH_PRAGMA"); + return; + } + } +} + } // namespace arm_compute diff --git a/src/core/CL/kernels/CLDepthwiseConvolutionLayerNativeKernel.cpp b/src/core/CL/kernels/CLDepthwiseConvolutionLayerNativeKernel.cpp index 4bde303f1e..1437b5bebb 100644 --- a/src/core/CL/kernels/CLDepthwiseConvolutionLayerNativeKernel.cpp +++ b/src/core/CL/kernels/CLDepthwiseConvolutionLayerNativeKernel.cpp @@ -83,7 +83,7 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *weights, } const ConvolutionInfo info{ conv_info.pad_stride_info, conv_info.depth_multiplier, ActivationLayerInfo(), conv_info.dilation }; - const TensorShape output_shape = arm_compute::misc::shape_calculator::compute_depthwise_convolution_shape(*input, *weights, conv_info); + const TensorShape output_shape = 
arm_compute::misc::shape_calculator::compute_depthwise_convolution_shape(*input, *weights, conv_info); const bool is_quantized = is_data_type_quantized(input->data_type()); @@ -237,9 +237,16 @@ void CLDepthwiseConvolutionLayerNativeKernel::configure(const CLCompileContext & build_opts.add_option("-DDILATION_Y=" + support::cpp11::to_string(conv_info.dilation.y())); build_opts.add_option("-DN0=" + support::cpp11::to_string(n0)); build_opts.add_option("-DM0=" + support::cpp11::to_string(m0)); - build_opts.add_option("-DM0_A=" + support::cpp11::to_string(weights->info()->dimension(1) + m0 - 1)); + build_opts.add_option("-DM0_A=" + support::cpp11::to_string(_weights->info()->dimension(1) + m0 - 1)); build_opts.add_option("-DPARTIAL_N0=" + support::cpp11::to_string(_input->info()->dimension(0) % n0)); build_opts.add_option_if(_input->info()->num_dimensions() > 3, "-DBATCHED_EXECUTION"); + + // Force unroll with pragma when any of the following values exceed the maximum number of manual unroll + set_unroll_with_pragma(build_opts, { static_cast<int>(_weights->info()->dimension(1) + m0 - 1), + static_cast<int>(_weights->info()->dimension(1)), + static_cast<int>(_weights->info()->dimension(2)) + }); + if(biases != nullptr) { build_opts.add_option(std::string("-DHAS_BIAS")); diff --git a/tests/datasets/DepthwiseConvolutionLayerDataset.h b/tests/datasets/DepthwiseConvolutionLayerDataset.h index 3b17910eac..82ea40ff52 100644 --- a/tests/datasets/DepthwiseConvolutionLayerDataset.h +++ b/tests/datasets/DepthwiseConvolutionLayerDataset.h @@ -155,6 +155,16 @@ public: } }; +/** Dataset containing large kernel size for generic depthwise convolution. */ +class LargeKernelSizeDepthwiseConvolutionLayerNHWCDataset final : public DepthwiseConvolutionLayerDataset +{ +public: + LargeKernelSizeDepthwiseConvolutionLayerNHWCDataset() + { + add_config(TensorShape(6U, 210U, 8U), Size2D(4U, 194U), PadStrideInfo(1, 1, 0, 0)); + } +}; + /** Dataset containing small, 3x3 depthwise convolution shapes. 
*/ class SmallDepthwiseConvolutionLayerDataset3x3 final : public DepthwiseConvolutionLayerDataset { diff --git a/tests/validation/CL/DepthwiseConvolutionLayer.cpp b/tests/validation/CL/DepthwiseConvolutionLayer.cpp index 79a2678b44..b2cff2b792 100644 --- a/tests/validation/CL/DepthwiseConvolutionLayer.cpp +++ b/tests/validation/CL/DepthwiseConvolutionLayer.cpp @@ -382,6 +382,7 @@ FIXTURE_DATA_TEST_CASE_NEW(RunLarge, CLDepthwiseConvolutionLayerFixture, { validate(CLAccessor(_target), _reference, tolerance_f32); } + TEST_SUITE(Dilation) FIXTURE_DATA_TEST_CASE_NEW(RunSmall, CLDepthwiseConvolutionLayerFixture, framework::DatasetMode::ALL, @@ -418,6 +419,7 @@ FIXTURE_DATA_TEST_CASE_NEW(RunSmall, CLDepthwiseConvolutionLayerFixture, { validate(CLAccessor(_target), _reference, tolerance_f32); } + FIXTURE_DATA_TEST_CASE_NEW(RunLarge, CLDepthwiseConvolutionLayerFixture, framework::DatasetMode::NIGHTLY, combine(combine(combine(combine(datasets::LargeDepthwiseConvolutionLayerDataset(), large_depth_multipliers), framework::dataset::make("DataType", @@ -428,6 +430,17 @@ FIXTURE_DATA_TEST_CASE_NEW(RunLarge, CLDepthwiseConvolutionLayerFixture, validate(CLAccessor(_target), _reference, tolerance_f32); } +FIXTURE_DATA_TEST_CASE_NEW(RunLargeKernelSize, CLDepthwiseConvolutionLayerFixture, framework::DatasetMode::ALL, + combine(combine(combine(combine(datasets::LargeKernelSizeDepthwiseConvolutionLayerNHWCDataset(), + framework::dataset::make("DepthMultiplier", { 1 })), + framework::dataset::make("DataType", + DataType::F32)), + framework::dataset::make("DataLayout", { DataLayout::NHWC })), + ActivationFunctionsDataset)) +{ + validate(CLAccessor(_target), _reference, tolerance_f32); +} + TEST_SUITE(Dilation) FIXTURE_DATA_TEST_CASE_NEW(RunSmall, CLDepthwiseConvolutionLayerFixture, framework::DatasetMode::ALL, combine(combine(combine(combine(datasets::SmallDepthwiseDilatedConvolutionLayerDataset(), depth_multipliers), diff --git 
a/tests/validation/fixtures/DepthwiseConvolutionLayerFixture.h b/tests/validation/fixtures/DepthwiseConvolutionLayerFixture.h index 0e02ae28ca..ddbab7fe13 100644 --- a/tests/validation/fixtures/DepthwiseConvolutionLayerFixture.h +++ b/tests/validation/fixtures/DepthwiseConvolutionLayerFixture.h @@ -113,6 +113,13 @@ public: target_to_use = &_target; } + add_padding_x({ &_src, &_biases }, _data_layout); + add_padding_x({ &_weights }, _data_layout, true); + if(!_in_place) + { + add_padding_x({ &_target }, _data_layout); + } + // Create Depthwise Convolution configure function _dwc.configure(&_src, &_weights, &_biases, target_to_use, _pad_stride_info, _depth_multiplier, _act_info, _dilation); @@ -124,12 +131,6 @@ public: void allocate_and_run_target() { - add_padding_x({ &_src, &_weights, &_biases }, _data_layout); - if(!_in_place) - { - add_padding_x({ &_target }, _data_layout); - } - // Allocate tensors _src.allocator()->allocate(); _weights.allocator()->allocate(); @@ -317,6 +318,10 @@ public: _biases = create_tensor(_biases_shape, _data_type, 1, QuantizationInfo(), _data_layout); _target = create_tensor(TensorShape(), _data_type, 1, QuantizationInfo(), _data_layout); + add_padding_x({ &_src, &_biases, &_target }, _data_layout); + add_padding_x({ &_weights }, _data_layout, true); + add_padding_y({ &_src, &_target }, _data_layout); + // Create Depthwise Convolution configure function const ConvolutionInfo info { @@ -332,9 +337,6 @@ public: void allocate_and_run_target() { - add_padding_x({ &_src, &_weights, &_biases, &_target }, _data_layout); - add_padding_y({ &_src, &_target }, _data_layout); - // Allocate tensors _src.allocator()->allocate(); _weights.allocator()->allocate(); @@ -482,6 +484,9 @@ public: _conv_info, _depth_multiplier, _act_info, _dilation }; + add_padding_x({ &_src, &_biases, &_target }, _data_layout); + add_padding_x({ &_weights }, _data_layout, _export_to_cl_image); // Don't add left padding if cl image will be used + // Create Depthwise 
Convolution configure function _dwc.configure(&_src, &_weights, &_biases, target_to_use, dwc_info, conv_kernel_info); @@ -493,9 +498,6 @@ public: void allocate_and_run_target() { - add_padding_x({ &_src, &_biases, &_target }, _data_layout); - add_padding_x({ &_weights }, _data_layout, _export_to_cl_image); // Don't add left padding if cl image will be used - // Allocate tensors _src.allocator()->allocate(); _weights.allocator()->allocate(); -- cgit v1.2.1