author    Gian Marco Iodice <gianmarco.iodice@arm.com>  2021-07-06 13:19:41 +0100
committer Georgios Pinitas <georgios.pinitas@arm.com>   2021-07-06 16:38:19 +0000
commit    5e281814c5110724d99fe8ee64bdf42ef2c31bce
tree      008a57c80f5b846265b0339f6e3a9f7876fa8922
parent    900289936c458eff95499e0a0eaba989a27aaa4d
Fix manual LOOP_UNROLLING
The issue is caused by the number of iterations passed to LOOP_UNROLLING. When the manual LOOP_UNROLLING is used, the number of iterations must be less than or equal to 128. To overcome this problem, we add a utility function that checks whether any of the critical iteration counts (the kernel dimensions) exceed that limit. If so, the utility function disables the manual loop unrolling.

Resolves COMPMID-4609

Change-Id: I7221c967609e462a5abd1cbb74e2a120f344fcb3
Signed-off-by: Gian Marco Iodice <gianmarco.iodice@arm.com>
Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/5913
Reviewed-by: Georgios Pinitas <georgios.pinitas@arm.com>
Tested-by: Arm Jenkins <bsgcomp@arm.com>
Comments-Addressed: Arm Jenkins <bsgcomp@arm.com>
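For context, the sketch below illustrates the kernel-side switch that the new build option drives. The macro shape is an assumption for illustration only, not the library's actual OpenCL helper: it only shows how -DUNROLL_WITH_PRAGMA can select compiler-driven unrolling, which has no preprocessor limit, instead of the manual expansion that is capped at 128 iterations.

// Illustrative sketch only (assumed macro shape, not the library's actual
// OpenCL kernel helper). With -DUNROLL_WITH_PRAGMA the compiler is asked to
// unroll the loop, so there is no preprocessor limit on the trip count; the
// manual variant replicates the body via macro recursion and is therefore
// capped at 128 iterations.
#if defined(UNROLL_WITH_PRAGMA)
#define LOOP_UNROLLING(DT, VAR, START_IDX, STEP, N0) \
    _Pragma("unroll")                                \
    for(DT VAR = START_IDX; VAR < (N0); VAR += (STEP))
#else // defined(UNROLL_WITH_PRAGMA)
#define LOOP_UNROLLING(DT, VAR, START_IDX, STEP, N0) \
    for(DT VAR = START_IDX; VAR < (N0); VAR += (STEP)) // placeholder for the manual (recursive) expansion
#endif // defined(UNROLL_WITH_PRAGMA)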
-rw-r--r--arm_compute/core/CL/CLHelpers.h13
-rw-r--r--src/core/CL/CLHelpers.cpp12
-rw-r--r--src/core/CL/kernels/CLDepthwiseConvolutionLayerNativeKernel.cpp11
-rw-r--r--tests/datasets/DepthwiseConvolutionLayerDataset.h10
-rw-r--r--tests/validation/CL/DepthwiseConvolutionLayer.cpp13
-rw-r--r--tests/validation/fixtures/DepthwiseConvolutionLayerFixture.h26
6 files changed, 71 insertions, 14 deletions
diff --git a/arm_compute/core/CL/CLHelpers.h b/arm_compute/core/CL/CLHelpers.h
index 180211c558..ce7e675b56 100644
--- a/arm_compute/core/CL/CLHelpers.h
+++ b/arm_compute/core/CL/CLHelpers.h
@@ -46,6 +46,9 @@ enum class DataType;
/** Max vector width of an OpenCL vector */
static constexpr unsigned int max_cl_vector_width = 16;
+/** Maximum number of iterations supported by the manual loop unrolling */
+static constexpr int max_manual_loop_unrolling = 128;
+
/** Translates a tensor data type to the appropriate OpenCL type.
*
* @param[in] dt @ref DataType to be translated to OpenCL type.
@@ -244,5 +247,15 @@ void set_wbsm(cl::Kernel &kernel, cl_int wbsm_hint);
*/
bool export_weights_to_cl_image(const ITensorInfo *tensor);
+/** Helper function to force unrolling with pragma when any of the input values (iterations) is greater than @ref max_manual_loop_unrolling
+ *
+ * This function adds the UNROLL_WITH_PRAGMA build option at compile time when any of the input values is greater than @ref max_manual_loop_unrolling
+ *
+ * @param[in] built_opts OpenCL kernel build options
+ * @param[in] values Input values (iterations)
+ *
+ */
+void set_unroll_with_pragma(CLBuildOptions &built_opts, std::initializer_list<int> values);
+
} // namespace arm_compute
#endif /* ARM_COMPUTE_CLHELPERS_H */
diff --git a/src/core/CL/CLHelpers.cpp b/src/core/CL/CLHelpers.cpp
index 3323929742..5c53455eeb 100644
--- a/src/core/CL/CLHelpers.cpp
+++ b/src/core/CL/CLHelpers.cpp
@@ -466,4 +466,16 @@ bool export_weights_to_cl_image(const ITensorInfo *tensor)
return true;
}
+void set_unroll_with_pragma(CLBuildOptions &built_opts, std::initializer_list<int> values)
+{
+ for(const int value : values)
+ {
+ if(value > max_manual_loop_unrolling)
+ {
+ built_opts.add_option("-DUNROLL_WITH_PRAGMA");
+ return;
+ }
+ }
+}
+
} // namespace arm_compute
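A minimal host-side usage sketch of the helper defined above; the dimension values and the wrapper function are made-up examples for illustration, not taken from this patch:

#include "arm_compute/core/CL/CLHelpers.h" // declares set_unroll_with_pragma (see hunk above)

// Hypothetical usage sketch: kernel_h exceeds max_manual_loop_unrolling (128),
// so the helper adds -DUNROLL_WITH_PRAGMA to the kernel build options and the
// OpenCL kernel falls back to compiler-driven unrolling.
void example_build_options() // illustrative wrapper, not part of the library
{
    arm_compute::CLBuildOptions build_opts;
    const int kernel_w = 4;   // assumed example value
    const int kernel_h = 194; // assumed example value, greater than 128
    arm_compute::set_unroll_with_pragma(build_opts, { kernel_w, kernel_h });
}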
diff --git a/src/core/CL/kernels/CLDepthwiseConvolutionLayerNativeKernel.cpp b/src/core/CL/kernels/CLDepthwiseConvolutionLayerNativeKernel.cpp
index 4bde303f1e..1437b5bebb 100644
--- a/src/core/CL/kernels/CLDepthwiseConvolutionLayerNativeKernel.cpp
+++ b/src/core/CL/kernels/CLDepthwiseConvolutionLayerNativeKernel.cpp
@@ -83,7 +83,7 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *weights,
}
const ConvolutionInfo info{ conv_info.pad_stride_info, conv_info.depth_multiplier, ActivationLayerInfo(), conv_info.dilation };
- const TensorShape output_shape = arm_compute::misc::shape_calculator::compute_depthwise_convolution_shape(*input, *weights, conv_info);
+ const TensorShape output_shape = arm_compute::misc::shape_calculator::compute_depthwise_convolution_shape(*input, *weights, conv_info);
const bool is_quantized = is_data_type_quantized(input->data_type());
@@ -237,9 +237,16 @@ void CLDepthwiseConvolutionLayerNativeKernel::configure(const CLCompileContext &
build_opts.add_option("-DDILATION_Y=" + support::cpp11::to_string(conv_info.dilation.y()));
build_opts.add_option("-DN0=" + support::cpp11::to_string(n0));
build_opts.add_option("-DM0=" + support::cpp11::to_string(m0));
- build_opts.add_option("-DM0_A=" + support::cpp11::to_string(weights->info()->dimension(1) + m0 - 1));
+ build_opts.add_option("-DM0_A=" + support::cpp11::to_string(_weights->info()->dimension(1) + m0 - 1));
build_opts.add_option("-DPARTIAL_N0=" + support::cpp11::to_string(_input->info()->dimension(0) % n0));
build_opts.add_option_if(_input->info()->num_dimensions() > 3, "-DBATCHED_EXECUTION");
+
+ // Force unrolling with pragma when any of the following values exceeds the maximum number of iterations supported by the manual unrolling
+ set_unroll_with_pragma(build_opts, { static_cast<int>(_weights->info()->dimension(1) + m0 - 1),
+ static_cast<int>(_weights->info()->dimension(1)),
+ static_cast<int>(_weights->info()->dimension(2))
+ });
+
if(biases != nullptr)
{
build_opts.add_option(std::string("-DHAS_BIAS"));
diff --git a/tests/datasets/DepthwiseConvolutionLayerDataset.h b/tests/datasets/DepthwiseConvolutionLayerDataset.h
index 3b17910eac..82ea40ff52 100644
--- a/tests/datasets/DepthwiseConvolutionLayerDataset.h
+++ b/tests/datasets/DepthwiseConvolutionLayerDataset.h
@@ -155,6 +155,16 @@ public:
}
};
+/** Dataset containing large kernel size for generic depthwise convolution. */
+class LargeKernelSizeDepthwiseConvolutionLayerNHWCDataset final : public DepthwiseConvolutionLayerDataset
+{
+public:
+ LargeKernelSizeDepthwiseConvolutionLayerNHWCDataset()
+ {
+ add_config(TensorShape(6U, 210U, 8U), Size2D(4U, 194U), PadStrideInfo(1, 1, 0, 0));
+ }
+};
+
/** Dataset containing small, 3x3 depthwise convolution shapes. */
class SmallDepthwiseConvolutionLayerDataset3x3 final : public DepthwiseConvolutionLayerDataset
{
diff --git a/tests/validation/CL/DepthwiseConvolutionLayer.cpp b/tests/validation/CL/DepthwiseConvolutionLayer.cpp
index 79a2678b44..b2cff2b792 100644
--- a/tests/validation/CL/DepthwiseConvolutionLayer.cpp
+++ b/tests/validation/CL/DepthwiseConvolutionLayer.cpp
@@ -382,6 +382,7 @@ FIXTURE_DATA_TEST_CASE_NEW(RunLarge, CLDepthwiseConvolutionLayerFixture<float>,
{
validate(CLAccessor(_target), _reference, tolerance_f32);
}
+
TEST_SUITE(Dilation)
FIXTURE_DATA_TEST_CASE_NEW(RunSmall, CLDepthwiseConvolutionLayerFixture<float>, framework::DatasetMode::ALL,
@@ -418,6 +419,7 @@ FIXTURE_DATA_TEST_CASE_NEW(RunSmall, CLDepthwiseConvolutionLayerFixture<float>,
{
validate(CLAccessor(_target), _reference, tolerance_f32);
}
+
FIXTURE_DATA_TEST_CASE_NEW(RunLarge, CLDepthwiseConvolutionLayerFixture<float>, framework::DatasetMode::NIGHTLY, combine(combine(combine(combine(datasets::LargeDepthwiseConvolutionLayerDataset(),
large_depth_multipliers),
framework::dataset::make("DataType",
@@ -428,6 +430,17 @@ FIXTURE_DATA_TEST_CASE_NEW(RunLarge, CLDepthwiseConvolutionLayerFixture<float>,
validate(CLAccessor(_target), _reference, tolerance_f32);
}
+FIXTURE_DATA_TEST_CASE_NEW(RunLargeKernelSize, CLDepthwiseConvolutionLayerFixture<float>, framework::DatasetMode::ALL,
+ combine(combine(combine(combine(datasets::LargeKernelSizeDepthwiseConvolutionLayerNHWCDataset(),
+ framework::dataset::make("DepthMultiplier", { 1 })),
+ framework::dataset::make("DataType",
+ DataType::F32)),
+ framework::dataset::make("DataLayout", { DataLayout::NHWC })),
+ ActivationFunctionsDataset))
+{
+ validate(CLAccessor(_target), _reference, tolerance_f32);
+}
+
TEST_SUITE(Dilation)
FIXTURE_DATA_TEST_CASE_NEW(RunSmall, CLDepthwiseConvolutionLayerFixture<float>, framework::DatasetMode::ALL, combine(combine(combine(combine(datasets::SmallDepthwiseDilatedConvolutionLayerDataset(),
depth_multipliers),
diff --git a/tests/validation/fixtures/DepthwiseConvolutionLayerFixture.h b/tests/validation/fixtures/DepthwiseConvolutionLayerFixture.h
index 0e02ae28ca..ddbab7fe13 100644
--- a/tests/validation/fixtures/DepthwiseConvolutionLayerFixture.h
+++ b/tests/validation/fixtures/DepthwiseConvolutionLayerFixture.h
@@ -113,6 +113,13 @@ public:
target_to_use = &_target;
}
+ add_padding_x({ &_src, &_biases }, _data_layout);
+ add_padding_x({ &_weights }, _data_layout, true);
+ if(!_in_place)
+ {
+ add_padding_x({ &_target }, _data_layout);
+ }
+
// Create Depthwise Convolution configure function
_dwc.configure(&_src, &_weights, &_biases, target_to_use, _pad_stride_info, _depth_multiplier, _act_info, _dilation);
@@ -124,12 +131,6 @@ public:
void allocate_and_run_target()
{
- add_padding_x({ &_src, &_weights, &_biases }, _data_layout);
- if(!_in_place)
- {
- add_padding_x({ &_target }, _data_layout);
- }
-
// Allocate tensors
_src.allocator()->allocate();
_weights.allocator()->allocate();
@@ -317,6 +318,10 @@ public:
_biases = create_tensor<TensorType>(_biases_shape, _data_type, 1, QuantizationInfo(), _data_layout);
_target = create_tensor<TensorType>(TensorShape(), _data_type, 1, QuantizationInfo(), _data_layout);
+ add_padding_x({ &_src, &_biases, &_target }, _data_layout);
+ add_padding_x({ &_weights }, _data_layout, true);
+ add_padding_y({ &_src, &_target }, _data_layout);
+
// Create Depthwise Convolution configure function
const ConvolutionInfo info
{
@@ -332,9 +337,6 @@ public:
void allocate_and_run_target()
{
- add_padding_x({ &_src, &_weights, &_biases, &_target }, _data_layout);
- add_padding_y({ &_src, &_target }, _data_layout);
-
// Allocate tensors
_src.allocator()->allocate();
_weights.allocator()->allocate();
@@ -482,6 +484,9 @@ public:
_conv_info, _depth_multiplier, _act_info, _dilation
};
+ add_padding_x({ &_src, &_biases, &_target }, _data_layout);
+ add_padding_x({ &_weights }, _data_layout, _export_to_cl_image); // Don't add left padding if cl image will be used
+
// Create Depthwise Convolution configure function
_dwc.configure(&_src, &_weights, &_biases, target_to_use, dwc_info, conv_kernel_info);
@@ -493,9 +498,6 @@ public:
void allocate_and_run_target()
{
- add_padding_x({ &_src, &_biases, &_target }, _data_layout);
- add_padding_x({ &_weights }, _data_layout, _export_to_cl_image); // Don't add left padding if cl image will be used
-
// Allocate tensors
_src.allocator()->allocate();
_weights.allocator()->allocate();