From 905a3c1a8883d988edf5bdc749844a4565fe5623 Mon Sep 17 00:00:00 2001 From: Gian Marco Iodice Date: Fri, 14 Apr 2023 12:20:58 +0100 Subject: Improve Winograd performance on OpenCL - Processes more output elements per work-item for FP16 computation in the Winograd Input/Output transforms Resolves COMPMID-6018 Signed-off-by: Gian Marco Iodice Change-Id: If5e6f5182eff8c1f05a3505c437d0a997490f0bd Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/9447 Comments-Addressed: Arm Jenkins Tested-by: Arm Jenkins Reviewed-by: Jakub Sujak Reviewed-by: Viet-Hoa Do Benchmark: Arm Jenkins --- .../cl/kernels/ClWinogradInputTransformKernel.cpp | 45 +++++++++++++++++----- .../cl/kernels/ClWinogradOutputTransformKernel.cpp | 22 +++++++++-- 2 files changed, 54 insertions(+), 13 deletions(-) (limited to 'src/gpu') diff --git a/src/gpu/cl/kernels/ClWinogradInputTransformKernel.cpp b/src/gpu/cl/kernels/ClWinogradInputTransformKernel.cpp index d6b038f0f8..48d806dc7c 100644 --- a/src/gpu/cl/kernels/ClWinogradInputTransformKernel.cpp +++ b/src/gpu/cl/kernels/ClWinogradInputTransformKernel.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2022 Arm Limited. + * Copyright (c) 2018-2023 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -79,8 +79,30 @@ std::pair validate_and_configure_window(ITensorInfo *input, ITen ARM_COMPUTE_UNUSED(output); ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); - bool window_changed = false; - Window win = calculate_max_window(*input, Steps(1, 1)); + bool window_changed = false; + int num_elems_processed_per_iteration = 1; + + if(input->data_layout() == DataLayout::NHWC) + { + // In the case of FP16 computation, we can perform more + // output feature maps in a single work-item. + // From experiments, num_elems_processed_per_iteration = 2 looks good for fp16 to + // improve the performance. However, in order to make the implementation simpler, + // we set num_elems_processed_per_iteration = 2 only when the OFMs are multiple of 2. 
+ // Note: At the moment, only Winograd Input Transform 3x3 can support N0 != 1 + const DataType dt = input->data_type(); + const size_t dim0 = input->dimension(0); + const size_t k_sz = winograd_info.kernel_size.area(); + const bool cond = dt == DataType::F16 && ((dim0 % 2) == 0); + if(cond) + { + if(k_sz == 3 || k_sz == 9) + { + num_elems_processed_per_iteration = 2; + } + } + } + Window win = calculate_max_window(*input, Steps(num_elems_processed_per_iteration)); if(input->data_layout() == DataLayout::NCHW) { @@ -143,12 +165,19 @@ void ClWinogradInputTransformKernel::configure(const ClCompileContext &compile_c ARM_COMPUTE_ERROR_ON(_num_tiles_x * _num_tiles_y != static_cast(dst->dimension(1))); const size_t total_batches = src->tensor_shape().total_size_upper(3); + // Create window and update padding + auto win_config = validate_and_configure_window(src, dst, winograd_info); + ARM_COMPUTE_ERROR_THROW_ON(win_config.first); + IClKernel::configure_internal(win_config.second, cl::NDRange(1, 1, 8)); + + _src_width = src->dimension(idx_w); + _src_height = src->dimension(idx_h); + CLBuildOptions build_opts; if(_data_layout == DataLayout::NHWC) { build_opts.add_option("-DNHWC"); - _src_width = src->dimension(idx_w); - _src_height = src->dimension(idx_h); + build_opts.add_option("-DN0=" + support::cpp11::to_string(win_config.second.x().step())); build_opts.add_option("-DPAD_LEFT=" + support::cpp11::to_string(conv_info.pad_left())); build_opts.add_option("-DPAD_TOP=" + support::cpp11::to_string(conv_info.pad_top())); build_opts.add_option("-DOUTPUT_TILE_W=" + support::cpp11::to_string(output_tile_size.width)); @@ -156,6 +185,7 @@ void ClWinogradInputTransformKernel::configure(const ClCompileContext &compile_c build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(src->data_type())); build_opts.add_option_if(winograd_info.kernel_size.height == 1, "-DWINOGRAD_INPUT_TRANSFORM_HORIZONTAL"); build_opts.add_option_if(winograd_info.kernel_size.width == 1, 
"-DWINOGRAD_INPUT_TRANSFORM_VERTICAL"); + build_opts.add_option_if(total_batches > 1, "-DIS_BATCHED"); } else { @@ -191,11 +221,6 @@ void ClWinogradInputTransformKernel::configure(const ClCompileContext &compile_c build_opts.add_option("-D" + upper_string(kernel_name)); _kernel = create_kernel(compile_context, kernel_name, build_opts.options()); - // Create window and update padding - auto win_config = validate_and_configure_window(src, dst, winograd_info); - ARM_COMPUTE_ERROR_THROW_ON(win_config.first); - IClKernel::configure_internal(win_config.second, cl::NDRange(1, 1, 8)); - _border_size = BorderSize(src->padding()); ARM_COMPUTE_ERROR_ON((src->data_layout() == DataLayout::NHWC) && has_padding_changed(padding_info)); diff --git a/src/gpu/cl/kernels/ClWinogradOutputTransformKernel.cpp b/src/gpu/cl/kernels/ClWinogradOutputTransformKernel.cpp index 9eb249a66a..c5c24886bd 100644 --- a/src/gpu/cl/kernels/ClWinogradOutputTransformKernel.cpp +++ b/src/gpu/cl/kernels/ClWinogradOutputTransformKernel.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2022 Arm Limited. + * Copyright (c) 2018-2023 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -102,7 +102,23 @@ std::pair validate_and_configure_window(ITensorInfo *input, ITen ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); ARM_COMPUTE_UNUSED(bias); - constexpr unsigned int num_elems_processed_per_iteration = 1; + unsigned int num_elems_processed_per_iteration = 1; + + if(input->data_layout() == DataLayout::NHWC) + { + // In the case of FP16 computation, we can perform more + // output feature maps in a single work-item. + // From experiments, num_elems_processed_per_iteration = 2 looks good for fp16 to + // improve the performance. However, in order to make the implementation simpler, + // we set num_elems_processed_per_iteration = 2 only when the OFMs are multiple of 2. 
+ const DataType dt = input->data_type(); + const size_t dim0 = input->dimension(0); + const bool cond = dt == DataType::F16 && ((dim0 % 2) == 0); + if(cond) + { + num_elems_processed_per_iteration = 2; + } + } Window win = calculate_max_window(*input, Steps(num_elems_processed_per_iteration)); bool window_changed = false; @@ -203,7 +219,7 @@ void ClWinogradOutputTransformKernel::configure(const ClCompileContext &compile_ build_opts.add_option("-DOUTPUT_TILE_W=" + support::cpp11::to_string(output_tile_size.width)); build_opts.add_option("-DOUTPUT_TILE_H=" + support::cpp11::to_string(output_tile_size.height)); build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(src_data_type)); - build_opts.add_option_if(total_batches > 1, "-DSRC_DEPTH=" + support::cpp11::to_string(src->dimension(2))); + build_opts.add_option_if(total_batches > 1, "-DIS_BATCHED"); build_opts.add_option_if(winograd_info.kernel_size.height == 1, "-DWINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL"); build_opts.add_option_if(winograd_info.kernel_size.width == 1, "-DWINOGRAD_OUTPUT_TRANSFORM_VERTICAL"); build_opts.add_option("-DNUM_TILES_X=" + support::cpp11::to_string(_num_tiles_x)); -- cgit v1.2.1