From 211a55d8218764c0a20d69d4cbdaea1906291c6b Mon Sep 17 00:00:00 2001 From: Gian Marco Iodice Date: Wed, 31 Aug 2022 11:47:08 +0100 Subject: Optimize depthwise convolution on OpenCL The optimization concerns the case where the depth multiplier is > 1. The depth multiplier for loop has been removed from the OpenCL kernel and the GWS has been mapped to the output shape. In this way, we can still perform a tile with N0 columns and improve the performance of depthwise conv over 80% when depth multiplier is > 1. Resolves COMPMID-5568 Change-Id: I604e287d4eeb31c54b9cc6c3072a698cd0e3e136 Signed-off-by: Gian Marco Iodice Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/8184 Tested-by: Arm Jenkins Reviewed-by: Gunes Bayir Benchmark: Arm Jenkins --- .../CL/DepthwiseConvolutionLayerNative.cpp | 123 ++++++++++++++++++++- 1 file changed, 118 insertions(+), 5 deletions(-) (limited to 'tests/validation/CL/DepthwiseConvolutionLayerNative.cpp') diff --git a/tests/validation/CL/DepthwiseConvolutionLayerNative.cpp b/tests/validation/CL/DepthwiseConvolutionLayerNative.cpp index f565255719..45047cab44 100644 --- a/tests/validation/CL/DepthwiseConvolutionLayerNative.cpp +++ b/tests/validation/CL/DepthwiseConvolutionLayerNative.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2021 Arm Limited. + * Copyright (c) 2019-2022 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -242,6 +242,7 @@ FIXTURE_DATA_TEST_CASE_NEW(RunLarge, CLDepthwiseConvolutionLayerNativeFixture, framework::DatasetMode::ALL, combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine( @@ -347,6 +349,7 @@ FIXTURE_DATA_TEST_CASE_NEW(RunLarge, CLDepthwiseConvolutionLayerNativeFixture, framework::DatasetMode::ALL, combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine( - width_values_precommit, + framework::dataset::make("width", { 33U } ), height_values_precommit), channel_values_precommit), batch_values_precommit), @@ -376,7 +379,7 @@ FIXTURE_DATA_TEST_CASE_NEW(RunSmall, CLDepthwiseConvolutionLayerNativeFixture, framework::DatasetMode::NIGHTLY, combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine( - width_values_nightly, + framework::dataset::make("width", { 53U } ), height_values_nightly), channel_values_nightly), batch_values_nightly), @@ -394,12 +397,67 @@ FIXTURE_DATA_TEST_CASE_NEW(RunLarge, CLDepthwiseConvolutionLayerNativeFixture, framework::DatasetMode::ALL, + combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine( + framework::dataset::make("width", { 33U } ), + height_values_precommit), + channel_values_precommit), + batch_values_precommit), + kernel_sz_values_precommit), + framework::dataset::make("depth_multiplier", 2)), + dilation_values), + stride_values), + padding_valid_values), + framework::dataset::make("DataType", DataType::F32)), + data_layout_values), + act_values), + framework::dataset::make("N0", {2})), + framework::dataset::make("ExportToCLImage", false))) +{ + // Validate output + validate(CLAccessor(_target), _reference, rel_tolerance_f32, 0.f, abs_tolerance_f32); +} + +TEST_SUITE(ExportWeightsToCLImage) +FIXTURE_DATA_TEST_CASE_NEW(RunSmall, CLDepthwiseConvolutionLayerNativeFixture, framework::DatasetMode::ALL, + combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine( + framework::dataset::make("width", { 33U } ), + height_values_precommit), + channel_values_precommit), + batch_values_precommit), + kernel_sz_values_precommit), + framework::dataset::make("depth_multiplier", 4)), + dilation_values), + stride_values), + padding_valid_values), + framework::dataset::make("DataType", DataType::F32)), + data_layout_values), + act_values), + framework::dataset::make("N0", {4})), + framework::dataset::make("ExportToCLImage", true))) +{ + // Validate output + if(_validate_output) + { + // Validate output + validate(CLAccessor(_target), _reference, rel_tolerance_f32, 0.f, abs_tolerance_f32); + } + else + { + ARM_COMPUTE_TEST_INFO("cl_khr_image2d_from_buffer not supported. TEST skipped"); + framework::ARM_COMPUTE_PRINT_INFO(); + } +} +TEST_SUITE_END() // ExportWeightsToCLImage +TEST_SUITE_END() // DepthMultiplierMultipleOfOutputChannels TEST_SUITE_END() // FP32 TEST_SUITE(FP16) FIXTURE_DATA_TEST_CASE_NEW(RunSmall, CLDepthwiseConvolutionLayerNativeFixture, framework::DatasetMode::ALL, combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine( - width_values_precommit, + framework::dataset::make("width", { 33U } ), height_values_precommit), channel_values_precommit), batch_values_precommit), @@ -420,7 +478,7 @@ FIXTURE_DATA_TEST_CASE_NEW(RunSmall, CLDepthwiseConvolutionLayerNativeFixture, framework::DatasetMode::NIGHTLY, combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine( - width_values_nightly, + framework::dataset::make("width", { 53U } ), height_values_nightly), channel_values_nightly), batch_values_nightly), @@ -438,6 +496,61 @@ FIXTURE_DATA_TEST_CASE_NEW(RunLarge, CLDepthwiseConvolutionLayerNativeFixture, framework::DatasetMode::ALL, + combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine( + framework::dataset::make("width", { 33U } ), + height_values_precommit), + channel_values_precommit), + batch_values_precommit), + kernel_sz_values_precommit), + framework::dataset::make("depth_multiplier", 2)), + dilation_values), + stride_values), + padding_valid_values), + framework::dataset::make("DataType", DataType::F16)), + data_layout_values), + act_values), + framework::dataset::make("N0", {2})), + framework::dataset::make("ExportToCLImage", false))) +{ + // Validate output + validate(CLAccessor(_target), _reference, rel_tolerance_f16, 0.f, abs_tolerance_f16); +} + +TEST_SUITE(ExportWeightsToCLImage) +FIXTURE_DATA_TEST_CASE_NEW(RunSmall, CLDepthwiseConvolutionLayerNativeFixture, framework::DatasetMode::ALL, + combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine( + framework::dataset::make("width", { 33U } ), + height_values_precommit), + channel_values_precommit), + batch_values_precommit), + kernel_sz_values_precommit), + framework::dataset::make("depth_multiplier", 4)), + dilation_values), + stride_values), + padding_valid_values), + framework::dataset::make("DataType", DataType::F16)), + data_layout_values), + act_values), + framework::dataset::make("N0", {4})), + framework::dataset::make("ExportToCLImage", true))) +{ + // Validate output + if(_validate_output) + { + // Validate output + validate(CLAccessor(_target), _reference, rel_tolerance_f16, 0.f, abs_tolerance_f16); + } + else + { + ARM_COMPUTE_TEST_INFO("cl_khr_image2d_from_buffer not supported. TEST skipped"); + framework::ARM_COMPUTE_PRINT_INFO(); + } +} +TEST_SUITE_END() // ExportWeightsToCLImage +TEST_SUITE_END() // DepthMultiplierMultipleOfOutputChannels TEST_SUITE_END() // FP16 TEST_SUITE_END() // Float TEST_SUITE_END() // DepthMultiplier -- cgit v1.2.1