diff options
author | Gian Marco Iodice <gianmarco.iodice@arm.com> | 2023-04-14 12:20:58 +0100 |
---|---|---|
committer | Gian Marco Iodice <gianmarco.iodice@arm.com> | 2023-04-26 11:08:40 +0000 |
commit | 905a3c1a8883d988edf5bdc749844a4565fe5623 (patch) | |
tree | 2a9a98a572cac20ac161a8f8a2003c4bd7e7c6e3 /src/gpu/cl/kernels/ClWinogradOutputTransformKernel.cpp | |
parent | b2758f35da97319fd15722485e9b4ba7b35c8cfa (diff) | |
download | ComputeLibrary-905a3c1a8883d988edf5bdc749844a4565fe5623.tar.gz |
Improve Winograd performance on OpenCL
- Processes more output elements per work-item in the case of FP16
computation in the Winograd input/output transforms
Resolves COMPMID-6018
Signed-off-by: Gian Marco Iodice <gianmarco.iodice@arm.com>
Change-Id: If5e6f5182eff8c1f05a3505c437d0a997490f0bd
Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/9447
Comments-Addressed: Arm Jenkins <bsgcomp@arm.com>
Tested-by: Arm Jenkins <bsgcomp@arm.com>
Reviewed-by: Jakub Sujak <jakub.sujak@arm.com>
Reviewed-by: Viet-Hoa Do <viet-hoa.do@arm.com>
Benchmark: Arm Jenkins <bsgcomp@arm.com>
Diffstat (limited to 'src/gpu/cl/kernels/ClWinogradOutputTransformKernel.cpp')
-rw-r--r-- | src/gpu/cl/kernels/ClWinogradOutputTransformKernel.cpp | 22 |
1 files changed, 19 insertions, 3 deletions
diff --git a/src/gpu/cl/kernels/ClWinogradOutputTransformKernel.cpp b/src/gpu/cl/kernels/ClWinogradOutputTransformKernel.cpp index 9eb249a66a..c5c24886bd 100644 --- a/src/gpu/cl/kernels/ClWinogradOutputTransformKernel.cpp +++ b/src/gpu/cl/kernels/ClWinogradOutputTransformKernel.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2022 Arm Limited. + * Copyright (c) 2018-2023 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -102,7 +102,23 @@ std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITen ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); ARM_COMPUTE_UNUSED(bias); - constexpr unsigned int num_elems_processed_per_iteration = 1; + unsigned int num_elems_processed_per_iteration = 1; + + if(input->data_layout() == DataLayout::NHWC) + { + // In the case of FP16 computation, we can perform more + // output feature maps in a single work-item. + // From experiments, num_elems_processed_per_iteration = 2 looks good for fp16 to + // improve the performance. However, in order to make the implementation simpler, + // we set num_elems_processed_per_iteration = 2 only when the OFMs are multiple of 2. 
+ const DataType dt = input->data_type(); + const size_t dim0 = input->dimension(0); + const bool cond = dt == DataType::F16 && ((dim0 % 2) == 0); + if(cond) + { + num_elems_processed_per_iteration = 2; + } + } Window win = calculate_max_window(*input, Steps(num_elems_processed_per_iteration)); bool window_changed = false; @@ -203,7 +219,7 @@ void ClWinogradOutputTransformKernel::configure(const ClCompileContext &compile_ build_opts.add_option("-DOUTPUT_TILE_W=" + support::cpp11::to_string(output_tile_size.width)); build_opts.add_option("-DOUTPUT_TILE_H=" + support::cpp11::to_string(output_tile_size.height)); build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(src_data_type)); - build_opts.add_option_if(total_batches > 1, "-DSRC_DEPTH=" + support::cpp11::to_string(src->dimension(2))); + build_opts.add_option_if(total_batches > 1, "-DIS_BATCHED"); build_opts.add_option_if(winograd_info.kernel_size.height == 1, "-DWINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL"); build_opts.add_option_if(winograd_info.kernel_size.width == 1, "-DWINOGRAD_OUTPUT_TRANSFORM_VERTICAL"); build_opts.add_option("-DNUM_TILES_X=" + support::cpp11::to_string(_num_tiles_x)); |