diff options
author | ramelg01 <ramy.elgammal@arm.com> | 2022-02-08 09:38:17 +0000 |
---|---|---|
committer | Ramy Elgammal <ramy.elgammal@arm.com> | 2022-02-10 10:40:51 +0000 |
commit | bb6877ad4542943e718ac48727f238600fb8257c (patch) | |
tree | d4e4de9ebf9db10648a7fe772821610d2d74de8f /src/gpu/cl/kernels/ClWinogradOutputTransformKernel.cpp | |
parent | 6863fa061d34cb9d418872ff720c0e4ef4f8dbbb (diff) | |
download | ComputeLibrary-bb6877ad4542943e718ac48727f238600fb8257c.tar.gz |
Improve start-up time for winograd_output_transform_*_nhwc
- pass tensor's dimensions at runtime rather than compile time
- Add guard macro to compile only kernel of internest
Resolves: COMPMID-5120
Signed-off-by: Ramy Elgammal <ramy.elgammal@arm.com>
Change-Id: I87c3b56ce0cd3c97ffdeabdd9c5d433f361bb005
Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/7101
Tested-by: Arm Jenkins <bsgcomp@arm.com>
Reviewed-by: Gian Marco Iodice <gianmarco.iodice@arm.com>
Comments-Addressed: Arm Jenkins <bsgcomp@arm.com>
Diffstat (limited to 'src/gpu/cl/kernels/ClWinogradOutputTransformKernel.cpp')
-rw-r--r-- | src/gpu/cl/kernels/ClWinogradOutputTransformKernel.cpp | 60 |
1 files changed, 44 insertions, 16 deletions
diff --git a/src/gpu/cl/kernels/ClWinogradOutputTransformKernel.cpp b/src/gpu/cl/kernels/ClWinogradOutputTransformKernel.cpp index a8cf8234ad..ff57c83959 100644 --- a/src/gpu/cl/kernels/ClWinogradOutputTransformKernel.cpp +++ b/src/gpu/cl/kernels/ClWinogradOutputTransformKernel.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2021 Arm Limited. + * Copyright (c) 2018-2022 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -176,23 +176,47 @@ void ClWinogradOutputTransformKernel::configure(const ClCompileContext &compile_ build_opts.add_option("-DVEC_SIZE=4"); } - build_opts.add_option_if(bias != nullptr, std::string("-DHAS_BIAS")); - build_opts.add_option("-cl-fast-relaxed-math"); - build_opts.add_option("-DN0=" + support::cpp11::to_string(win_config.second.x().step())); - build_opts.add_option("-DNUM_TILES_X=" + support::cpp11::to_string(num_tiles.width)); - build_opts.add_option("-DOUTPUT_TILE_W=" + support::cpp11::to_string(output_tile_size.width)); - build_opts.add_option("-DOUTPUT_TILE_H=" + support::cpp11::to_string(output_tile_size.height)); - build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(src->data_type())); - build_opts.add_option("-DSRC_HEIGHT=" + support::cpp11::to_string(src->dimension(1))); - build_opts.add_option("-DDST_WIDTH=" + support::cpp11::to_string(dst->dimension(idx_width))); - build_opts.add_option("-DDST_HEIGHT=" + support::cpp11::to_string(dst->dimension(idx_height))); - build_opts.add_option_if(total_batches > 1, "-DSRC_DEPTH=" + support::cpp11::to_string(src->dimension(2))); - build_opts.add_option_if(winograd_info.kernel_size.height == 1, "-DWINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL"); - build_opts.add_option_if(winograd_info.kernel_size.width == 1, "-DWINOGRAD_OUTPUT_TRANSFORM_VERTICAL"); + if(_is_nhwc) + { + build_opts.add_option_if(bias != nullptr, std::string("-DHAS_BIAS")); + build_opts.add_option("-cl-fast-relaxed-math"); + build_opts.add_option("-DN0=" + support::cpp11::to_string(win_config.second.x().step())); + build_opts.add_option("-DOUTPUT_TILE_W=" + support::cpp11::to_string(output_tile_size.width)); + build_opts.add_option("-DOUTPUT_TILE_H=" + support::cpp11::to_string(output_tile_size.height)); + build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(src->data_type())); + build_opts.add_option_if(total_batches > 1, "-DSRC_DEPTH=" + support::cpp11::to_string(src->dimension(2))); + build_opts.add_option_if(winograd_info.kernel_size.height == 1, "-DWINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL"); + build_opts.add_option_if(winograd_info.kernel_size.width == 1, "-DWINOGRAD_OUTPUT_TRANSFORM_VERTICAL"); + } + else + { + build_opts.add_option_if(bias != nullptr, std::string("-DHAS_BIAS")); + build_opts.add_option("-cl-fast-relaxed-math"); + build_opts.add_option("-DN0=" + support::cpp11::to_string(win_config.second.x().step())); + build_opts.add_option("-DNUM_TILES_X=" + support::cpp11::to_string(num_tiles.width)); + build_opts.add_option("-DOUTPUT_TILE_W=" + support::cpp11::to_string(output_tile_size.width)); + build_opts.add_option("-DOUTPUT_TILE_H=" + support::cpp11::to_string(output_tile_size.height)); + build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(src->data_type())); + build_opts.add_option("-DSRC_HEIGHT=" + support::cpp11::to_string(src->dimension(1))); + build_opts.add_option("-DDST_WIDTH=" + support::cpp11::to_string(dst->dimension(idx_width))); + build_opts.add_option("-DDST_HEIGHT=" + support::cpp11::to_string(dst->dimension(idx_height))); + build_opts.add_option_if(total_batches > 1, "-DSRC_DEPTH=" + support::cpp11::to_string(src->dimension(2))); + build_opts.add_option_if(winograd_info.kernel_size.height == 1, "-DWINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL"); + build_opts.add_option_if(winograd_info.kernel_size.width == 1, "-DWINOGRAD_OUTPUT_TRANSFORM_VERTICAL"); + } + + // Storing tensor dimensions to be sent later as kernel arguments + _src_height = src->dimension(1); + _dst_width = dst->dimension(idx_width); + _dst_height = dst->dimension(idx_height); + _num_tiles_x = num_tiles.width; // Create kernel std::string kernel_name = "winograd_output_transform_" + output_tile_size.to_string() + "_" + kernel_size.to_string() + "_" + lower_string(string_from_data_layout(winograd_info.output_data_layout)); - _kernel = create_kernel(compile_context, kernel_name, build_opts.options()); + + // A macro guard to compile ONLY the kernel of interest + build_opts.add_option("-D" + upper_string(kernel_name)); + _kernel = create_kernel(compile_context, kernel_name, build_opts.options()); // Set config_id for enabling LWS tuning _config_id = kernel_name; @@ -251,7 +275,11 @@ void ClWinogradOutputTransformKernel::run_op(ITensorPack &tensors, const Window if(_is_nhwc) { unsigned int idx2 = 2 * num_arguments_per_4D_tensor() + ((bias != nullptr) ? num_arguments_per_1D_tensor() : 0); - _kernel.setArg(idx2, static_cast<int>(dst->info()->total_size() - dst->info()->strides_in_bytes().y())); + _kernel.setArg(idx2++, static_cast<int>(dst->info()->total_size() - dst->info()->strides_in_bytes().y())); + _kernel.setArg<cl_int>(idx2++, _src_height); + _kernel.setArg<cl_int>(idx2++, _dst_width); + _kernel.setArg<cl_int>(idx2++, _dst_height); + _kernel.setArg<cl_int>(idx2++, _num_tiles_x); } do |