author     Gian Marco Iodice <gianmarco.iodice@arm.com>   2023-04-14 12:20:58 +0100
committer  Gian Marco Iodice <gianmarco.iodice@arm.com>   2023-04-26 11:08:40 +0000
commit     905a3c1a8883d988edf5bdc749844a4565fe5623 (patch)
tree       2a9a98a572cac20ac161a8f8a2003c4bd7e7c6e3 /src/gpu
parent     b2758f35da97319fd15722485e9b4ba7b35c8cfa (diff)
download   ComputeLibrary-905a3c1a8883d988edf5bdc749844a4565fe5623.tar.gz
Improve Winograd performance on OpenCL
- Performs more output elements per work-item in the case of Fp16 computation
  in Winograd Input/Output transform

Resolves COMPMID-6018

Signed-off-by: Gian Marco Iodice <gianmarco.iodice@arm.com>
Change-Id: If5e6f5182eff8c1f05a3505c437d0a997490f0bd
Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/9447
Comments-Addressed: Arm Jenkins <bsgcomp@arm.com>
Tested-by: Arm Jenkins <bsgcomp@arm.com>
Reviewed-by: Jakub Sujak <jakub.sujak@arm.com>
Reviewed-by: Viet-Hoa Do <viet-hoa.do@arm.com>
Benchmark: Arm Jenkins <bsgcomp@arm.com>
Diffstat (limited to 'src/gpu')
-rw-r--r--   src/gpu/cl/kernels/ClWinogradInputTransformKernel.cpp    45
-rw-r--r--   src/gpu/cl/kernels/ClWinogradOutputTransformKernel.cpp   22
2 files changed, 54 insertions, 13 deletions
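In essence, the patch widens the execution window along dimension 0 (the output feature maps in NHWC) when the computation is FP16 and the OFM count is even, so each work-item processes two elements instead of one. The sketch below condenses that selection logic into a standalone function; select_window_step, the enums and the main driver are illustrative stand-ins rather than ComputeLibrary API, and the kernel-area restriction shown applies to the input transform only.

    #include <cstddef>

    // Hypothetical stand-ins for arm_compute::DataType and arm_compute::DataLayout.
    enum class DataType   { F16, F32 };
    enum class DataLayout { NHWC, NCHW };

    // Illustrative helper (not a library function): how many elements each work-item
    // should process along dimension 0, mirroring the condition added by this patch.
    static int select_window_step(DataType dt, DataLayout layout, std::size_t dim0, std::size_t kernel_area)
    {
        int step = 1;
        if(layout == DataLayout::NHWC && dt == DataType::F16 && (dim0 % 2) == 0)
        {
            // Only the 3x3-family input transforms (kernel area 3 or 9) support N0 != 1,
            // so the wider step is limited to those kernel sizes.
            if(kernel_area == 3 || kernel_area == 9)
            {
                step = 2;
            }
        }
        return step;
    }

    int main()
    {
        // FP16, NHWC, 64 output feature maps, 3x3 kernel -> two elements per work-item.
        return select_window_step(DataType::F16, DataLayout::NHWC, 64, 9) == 2 ? 0 : 1;
    }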
diff --git a/src/gpu/cl/kernels/ClWinogradInputTransformKernel.cpp b/src/gpu/cl/kernels/ClWinogradInputTransformKernel.cpp
index d6b038f0f8..48d806dc7c 100644
--- a/src/gpu/cl/kernels/ClWinogradInputTransformKernel.cpp
+++ b/src/gpu/cl/kernels/ClWinogradInputTransformKernel.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018-2022 Arm Limited.
+ * Copyright (c) 2018-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -79,8 +79,30 @@ std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITen
ARM_COMPUTE_UNUSED(output);
ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
- bool window_changed = false;
- Window win = calculate_max_window(*input, Steps(1, 1));
+ bool window_changed = false;
+ int num_elems_processed_per_iteration = 1;
+
+ if(input->data_layout() == DataLayout::NHWC)
+ {
+ // In the case of FP16 computation, we can perform more
+ // output feature maps in a single work-item.
+ // From experiments, num_elems_processed_per_iteration = 2 looks good for fp16 to
+ // improve the performance. However, in order to make the implementation simpler,
+ // we set num_elems_processed_per_iteration = 2 only when the OFMs are multiple of 2.
+ // Note: At the moment, only Winograd Input Transform 3x3 can support N0 != 1
+ const DataType dt = input->data_type();
+ const size_t dim0 = input->dimension(0);
+ const size_t k_sz = winograd_info.kernel_size.area();
+ const bool cond = dt == DataType::F16 && ((dim0 % 2) == 0);
+ if(cond)
+ {
+ if(k_sz == 3 || k_sz == 9)
+ {
+ num_elems_processed_per_iteration = 2;
+ }
+ }
+ }
+ Window win = calculate_max_window(*input, Steps(num_elems_processed_per_iteration));
if(input->data_layout() == DataLayout::NCHW)
{
@@ -143,12 +165,19 @@ void ClWinogradInputTransformKernel::configure(const ClCompileContext &compile_c
ARM_COMPUTE_ERROR_ON(_num_tiles_x * _num_tiles_y != static_cast<int>(dst->dimension(1)));
const size_t total_batches = src->tensor_shape().total_size_upper(3);
+ // Create window and update padding
+ auto win_config = validate_and_configure_window(src, dst, winograd_info);
+ ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
+ IClKernel::configure_internal(win_config.second, cl::NDRange(1, 1, 8));
+
+ _src_width = src->dimension(idx_w);
+ _src_height = src->dimension(idx_h);
+
CLBuildOptions build_opts;
if(_data_layout == DataLayout::NHWC)
{
build_opts.add_option("-DNHWC");
- _src_width = src->dimension(idx_w);
- _src_height = src->dimension(idx_h);
+ build_opts.add_option("-DN0=" + support::cpp11::to_string(win_config.second.x().step()));
build_opts.add_option("-DPAD_LEFT=" + support::cpp11::to_string(conv_info.pad_left()));
build_opts.add_option("-DPAD_TOP=" + support::cpp11::to_string(conv_info.pad_top()));
build_opts.add_option("-DOUTPUT_TILE_W=" + support::cpp11::to_string(output_tile_size.width));
@@ -156,6 +185,7 @@ void ClWinogradInputTransformKernel::configure(const ClCompileContext &compile_c
build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(src->data_type()));
build_opts.add_option_if(winograd_info.kernel_size.height == 1, "-DWINOGRAD_INPUT_TRANSFORM_HORIZONTAL");
build_opts.add_option_if(winograd_info.kernel_size.width == 1, "-DWINOGRAD_INPUT_TRANSFORM_VERTICAL");
+ build_opts.add_option_if(total_batches > 1, "-DIS_BATCHED");
}
else
{
@@ -191,11 +221,6 @@ void ClWinogradInputTransformKernel::configure(const ClCompileContext &compile_c
build_opts.add_option("-D" + upper_string(kernel_name));
_kernel = create_kernel(compile_context, kernel_name, build_opts.options());
- // Create window and update padding
- auto win_config = validate_and_configure_window(src, dst, winograd_info);
- ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
- IClKernel::configure_internal(win_config.second, cl::NDRange(1, 1, 8));
-
_border_size = BorderSize(src->padding());
ARM_COMPUTE_ERROR_ON((src->data_layout() == DataLayout::NHWC) && has_padding_changed(padding_info));
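Note that configure() now builds the execution window before assembling the kernel build options: the x-step chosen by validate_and_configure_window() is exported to the OpenCL kernel as -DN0, so it has to be known before the options are put together. A reduced sketch of that ordering follows; Window and BuildOptions here are simplified stand-ins for the library's Window and CLBuildOptions classes, not their real interfaces.

    #include <iostream>
    #include <string>
    #include <vector>

    // Simplified stand-ins for the library's Window and CLBuildOptions types.
    struct Window
    {
        int x_step{1};
    };

    struct BuildOptions
    {
        std::vector<std::string> opts;
        void add_option(const std::string &o) { opts.push_back(o); }
    };

    int main()
    {
        // 1) Create and configure the execution window first (FP16 fast path: step = 2).
        Window win;
        win.x_step = 2;

        // 2) Only then assemble the build options, so the chosen step can be forwarded
        //    to the OpenCL kernel as the N0 compile-time parameter.
        BuildOptions build_opts;
        build_opts.add_option("-DNHWC");
        build_opts.add_option("-DN0=" + std::to_string(win.x_step));

        for(const auto &o : build_opts.opts)
        {
            std::cout << o << "\n";
        }
        return 0;
    }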
diff --git a/src/gpu/cl/kernels/ClWinogradOutputTransformKernel.cpp b/src/gpu/cl/kernels/ClWinogradOutputTransformKernel.cpp
index 9eb249a66a..c5c24886bd 100644
--- a/src/gpu/cl/kernels/ClWinogradOutputTransformKernel.cpp
+++ b/src/gpu/cl/kernels/ClWinogradOutputTransformKernel.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018-2022 Arm Limited.
+ * Copyright (c) 2018-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -102,7 +102,23 @@ std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITen
ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
ARM_COMPUTE_UNUSED(bias);
- constexpr unsigned int num_elems_processed_per_iteration = 1;
+ unsigned int num_elems_processed_per_iteration = 1;
+
+ if(input->data_layout() == DataLayout::NHWC)
+ {
+ // In the case of FP16 computation, we can perform more
+ // output feature maps in a single work-item.
+ // From experiments, num_elems_processed_per_iteration = 2 looks good for fp16 to
+ // improve the performance. However, in order to make the implementation simpler,
+ // we set num_elems_processed_per_iteration = 2 only when the OFMs are multiple of 2.
+ const DataType dt = input->data_type();
+ const size_t dim0 = input->dimension(0);
+ const bool cond = dt == DataType::F16 && ((dim0 % 2) == 0);
+ if(cond)
+ {
+ num_elems_processed_per_iteration = 2;
+ }
+ }
Window win = calculate_max_window(*input, Steps(num_elems_processed_per_iteration));
bool window_changed = false;
@@ -203,7 +219,7 @@ void ClWinogradOutputTransformKernel::configure(const ClCompileContext &compile_
build_opts.add_option("-DOUTPUT_TILE_W=" + support::cpp11::to_string(output_tile_size.width));
build_opts.add_option("-DOUTPUT_TILE_H=" + support::cpp11::to_string(output_tile_size.height));
build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(src_data_type));
- build_opts.add_option_if(total_batches > 1, "-DSRC_DEPTH=" + support::cpp11::to_string(src->dimension(2)));
+ build_opts.add_option_if(total_batches > 1, "-DIS_BATCHED");
build_opts.add_option_if(winograd_info.kernel_size.height == 1, "-DWINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL");
build_opts.add_option_if(winograd_info.kernel_size.width == 1, "-DWINOGRAD_OUTPUT_TRANSFORM_VERTICAL");
build_opts.add_option("-DNUM_TILES_X=" + support::cpp11::to_string(_num_tiles_x));