aboutsummaryrefslogtreecommitdiff
path: root/src/gpu/cl/kernels/ClWinogradOutputTransformKernel.cpp
diff options
context:
space:
mode:
authorGian Marco Iodice <gianmarco.iodice@arm.com>2023-04-14 12:20:58 +0100
committerGian Marco Iodice <gianmarco.iodice@arm.com>2023-04-26 11:08:40 +0000
commit905a3c1a8883d988edf5bdc749844a4565fe5623 (patch)
tree2a9a98a572cac20ac161a8f8a2003c4bd7e7c6e3 /src/gpu/cl/kernels/ClWinogradOutputTransformKernel.cpp
parentb2758f35da97319fd15722485e9b4ba7b35c8cfa (diff)
downloadComputeLibrary-905a3c1a8883d988edf5bdc749844a4565fe5623.tar.gz
Improve Winograd performance on OpenCL
- Computes more output elements per work-item in the case of FP16 computation in the Winograd Input/Output transforms. Resolves COMPMID-6018 Signed-off-by: Gian Marco Iodice <gianmarco.iodice@arm.com> Change-Id: If5e6f5182eff8c1f05a3505c437d0a997490f0bd Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/9447 Comments-Addressed: Arm Jenkins <bsgcomp@arm.com> Tested-by: Arm Jenkins <bsgcomp@arm.com> Reviewed-by: Jakub Sujak <jakub.sujak@arm.com> Reviewed-by: Viet-Hoa Do <viet-hoa.do@arm.com> Benchmark: Arm Jenkins <bsgcomp@arm.com>
Diffstat (limited to 'src/gpu/cl/kernels/ClWinogradOutputTransformKernel.cpp')
-rw-r--r--src/gpu/cl/kernels/ClWinogradOutputTransformKernel.cpp22
1 files changed, 19 insertions, 3 deletions
diff --git a/src/gpu/cl/kernels/ClWinogradOutputTransformKernel.cpp b/src/gpu/cl/kernels/ClWinogradOutputTransformKernel.cpp
index 9eb249a66a..c5c24886bd 100644
--- a/src/gpu/cl/kernels/ClWinogradOutputTransformKernel.cpp
+++ b/src/gpu/cl/kernels/ClWinogradOutputTransformKernel.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018-2022 Arm Limited.
+ * Copyright (c) 2018-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -102,7 +102,23 @@ std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITen
ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
ARM_COMPUTE_UNUSED(bias);
- constexpr unsigned int num_elems_processed_per_iteration = 1;
+ unsigned int num_elems_processed_per_iteration = 1;
+
+ if(input->data_layout() == DataLayout::NHWC)
+ {
+ // In the case of FP16 computation, a single work-item can compute
+ // more than one output feature map (OFM).
+ // From experiments, num_elems_processed_per_iteration = 2 improves
+ // performance for FP16. However, to keep the implementation simple,
+ // we set num_elems_processed_per_iteration = 2 only when the number of OFMs is a multiple of 2.
+ const DataType dt = input->data_type();
+ const size_t dim0 = input->dimension(0);
+ const bool cond = dt == DataType::F16 && ((dim0 % 2) == 0);
+ if(cond)
+ {
+ num_elems_processed_per_iteration = 2;
+ }
+ }
Window win = calculate_max_window(*input, Steps(num_elems_processed_per_iteration));
bool window_changed = false;
@@ -203,7 +219,7 @@ void ClWinogradOutputTransformKernel::configure(const ClCompileContext &compile_
build_opts.add_option("-DOUTPUT_TILE_W=" + support::cpp11::to_string(output_tile_size.width));
build_opts.add_option("-DOUTPUT_TILE_H=" + support::cpp11::to_string(output_tile_size.height));
build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(src_data_type));
- build_opts.add_option_if(total_batches > 1, "-DSRC_DEPTH=" + support::cpp11::to_string(src->dimension(2)));
+ build_opts.add_option_if(total_batches > 1, "-DIS_BATCHED");
build_opts.add_option_if(winograd_info.kernel_size.height == 1, "-DWINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL");
build_opts.add_option_if(winograd_info.kernel_size.width == 1, "-DWINOGRAD_OUTPUT_TRANSFORM_VERTICAL");
build_opts.add_option("-DNUM_TILES_X=" + support::cpp11::to_string(_num_tiles_x));