diff options
Diffstat (limited to 'src/gpu/cl/kernels/ClWinogradOutputTransformKernel.cpp')
-rw-r--r-- | src/gpu/cl/kernels/ClWinogradOutputTransformKernel.cpp | 110 |
1 files changed, 68 insertions, 42 deletions
diff --git a/src/gpu/cl/kernels/ClWinogradOutputTransformKernel.cpp b/src/gpu/cl/kernels/ClWinogradOutputTransformKernel.cpp index bf974d30d8..89c80c55ef 100644 --- a/src/gpu/cl/kernels/ClWinogradOutputTransformKernel.cpp +++ b/src/gpu/cl/kernels/ClWinogradOutputTransformKernel.cpp @@ -23,7 +23,6 @@ */ #include "src/gpu/cl/kernels/ClWinogradOutputTransformKernel.h" -#include "arm_compute/core/utils/ActivationFunctionUtils.h" #include "arm_compute/core/CL/CLHelpers.h" #include "arm_compute/core/CL/CLKernelLibrary.h" #include "arm_compute/core/CL/ICLTensor.h" @@ -31,10 +30,12 @@ #include "arm_compute/core/IAccessWindow.h" #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Types.h" -#include "arm_compute/core/Validate.h" -#include "arm_compute/core/Window.h" +#include "arm_compute/core/utils/ActivationFunctionUtils.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" #include "arm_compute/core/utils/StringUtils.h" +#include "arm_compute/core/Validate.h" +#include "arm_compute/core/Window.h" + #include "src/core/AccessWindowStatic.h" #include "src/core/CL/CLValidate.h" #include "src/core/helpers/AutoConfiguration.h" @@ -54,7 +55,11 @@ namespace kernels { namespace { -Status validate_arguments(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output, const WinogradInfo &winograd_info, const ActivationLayerInfo &act_info) +Status validate_arguments(const ITensorInfo *input, + const ITensorInfo *bias, + const ITensorInfo *output, + const WinogradInfo &winograd_info, + const ActivationLayerInfo &act_info) { ARM_COMPUTE_UNUSED(act_info); ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32, DataType::F16); @@ -66,30 +71,32 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *bias, con const Size2D output_tile_size = winograd_info.output_tile_size; const Size2D kernel_size = winograd_info.kernel_size; const Size2D input_dimensions = winograd_info.input_dimensions; - const unsigned int num_channels = (winograd_info.kernel_size.width + winograd_info.output_tile_size.width - 1) * (winograd_info.kernel_size.height + winograd_info.output_tile_size.height - 1); + const unsigned int num_channels = (winograd_info.kernel_size.width + winograd_info.output_tile_size.width - 1) * + (winograd_info.kernel_size.height + winograd_info.output_tile_size.height - 1); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(!cl_winograd_convolution_layer_supported(output_tile_size, kernel_size, winograd_info.output_data_layout), "Winograd output transform not supported"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG( + !cl_winograd_convolution_layer_supported(output_tile_size, kernel_size, winograd_info.output_data_layout), + "Winograd output transform not supported"); ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->dimension(2) != num_channels, "Wrong number of channels"); // Compute number of elements to process in the X and Y direction // Compute the number of output tiles along the x and y direction of size "output_tile_size" - const Size2D num_tiles = compute_winograd_convolution_tiles(input_dimensions, - kernel_size, - output_tile_size, - conv_info); + const Size2D num_tiles = + compute_winograd_convolution_tiles(input_dimensions, kernel_size, output_tile_size, conv_info); ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(1) != static_cast<unsigned int>((num_tiles.area()))); - if(bias != nullptr) + if (bias != nullptr) { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, bias); ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(0) != bias->dimension(0)); } // Checks performed when output is configured - if(output->total_size() != 0) + if (output->total_size() != 0) { - const TensorInfo tensor_info_output = input->clone()->set_tensor_shape(compute_winograd_output_transform_shape(*input, winograd_info)); + const TensorInfo tensor_info_output = + input->clone()->set_tensor_shape(compute_winograd_output_transform_shape(*input, winograd_info)); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(output, &tensor_info_output); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); @@ -98,14 +105,17 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *bias, con return Status{}; } -std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *bias, ITensorInfo *output, const Size2D &output_tile_size) +std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, + ITensorInfo *bias, + ITensorInfo *output, + const Size2D &output_tile_size) { ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); ARM_COMPUTE_UNUSED(bias); unsigned int num_elems_processed_per_iteration = 1; - if(input->data_layout() == DataLayout::NHWC) + if (input->data_layout() == DataLayout::NHWC) { // In the case of FP16 computation, we can perform more // output feature maps in a single work-item. @@ -115,7 +125,7 @@ std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITen const DataType dt = input->data_type(); const size_t dim0 = input->dimension(0); const bool cond = dt == DataType::F16 && ((dim0 % 2) == 0); - if(cond) + if (cond) { num_elems_processed_per_iteration = 2; } @@ -124,17 +134,19 @@ std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITen Window win = calculate_max_window(*input, Steps(num_elems_processed_per_iteration)); bool window_changed = false; - if(output->data_layout() == DataLayout::NCHW) + if (output->data_layout() == DataLayout::NCHW) { const int output_static_window_end_x = ceil_to_multiple(output->dimension(0), output_tile_size.width); const int output_static_window_end_y = ceil_to_multiple(output->dimension(1), output_tile_size.height); - AccessWindowRectangle input_access(input, 0, 0, num_elems_processed_per_iteration, num_elems_processed_per_iteration); + AccessWindowRectangle input_access(input, 0, 0, num_elems_processed_per_iteration, + num_elems_processed_per_iteration); AccessWindowStatic output_access(output, 0, 0, output_static_window_end_x, output_static_window_end_y); window_changed = update_window_and_padding(win, input_access, output_access); } - Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{}; + Status err = + (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{}; return std::make_pair(err, win); } } // namespace @@ -144,13 +156,18 @@ ClWinogradOutputTransformKernel::ClWinogradOutputTransformKernel() _type = CLKernelType::WINOGRAD; } -void ClWinogradOutputTransformKernel::configure(const ClCompileContext &compile_context, ITensorInfo *src, ITensorInfo *bias, ITensorInfo *dst, const WinogradInfo &winograd_info, +void ClWinogradOutputTransformKernel::configure(const ClCompileContext &compile_context, + ITensorInfo *src, + ITensorInfo *bias, + ITensorInfo *dst, + const WinogradInfo &winograd_info, const ActivationLayerInfo &act_info) { ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst); // Output tensor auto initialization if not yet initialized - auto_init_if_empty(*dst, src->clone()->set_tensor_shape(compute_winograd_output_transform_shape(*src, winograd_info))); + auto_init_if_empty(*dst, + src->clone()->set_tensor_shape(compute_winograd_output_transform_shape(*src, winograd_info))); ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, bias, dst, winograd_info, act_info)); @@ -159,7 +176,7 @@ void ClWinogradOutputTransformKernel::configure(const ClCompileContext &compile_ ARM_COMPUTE_ERROR_THROW_ON(win_config.first); IClKernel::configure_internal(win_config.second); - auto padding_info = get_padding_info({ src, bias, dst }); + auto padding_info = get_padding_info({src, bias, dst}); _is_nhwc = winograd_info.output_data_layout == DataLayout::NHWC; @@ -168,14 +185,13 @@ void ClWinogradOutputTransformKernel::configure(const ClCompileContext &compile_ const Size2D kernel_size = winograd_info.kernel_size; const Size2D output_tile_size = winograd_info.output_tile_size; const PadStrideInfo conv_info = winograd_info.convolution_info; - const int idx_width = get_data_layout_dimension_index(winograd_info.output_data_layout, DataLayoutDimension::WIDTH); - const int idx_height = get_data_layout_dimension_index(winograd_info.output_data_layout, DataLayoutDimension::HEIGHT); + const int idx_width = get_data_layout_dimension_index(winograd_info.output_data_layout, DataLayoutDimension::WIDTH); + const int idx_height = + get_data_layout_dimension_index(winograd_info.output_data_layout, DataLayoutDimension::HEIGHT); // Compute the number of output tiles along the x and y direction of size "output_tile_size" - const Size2D num_tiles = compute_winograd_convolution_tiles(input_dimensions, - kernel_size, - output_tile_size, - conv_info); + const Size2D num_tiles = + compute_winograd_convolution_tiles(input_dimensions, kernel_size, output_tile_size, conv_info); const size_t total_batches = dst->tensor_shape().total_size_upper(3); // Set build options @@ -184,11 +200,11 @@ void ClWinogradOutputTransformKernel::configure(const ClCompileContext &compile_ build_opts.add_option_if(act_info.enabled(), "-DA_VAL=" + float_to_string_with_full_precision(act_info.a())); build_opts.add_option_if(act_info.enabled(), "-DB_VAL=" + float_to_string_with_full_precision(act_info.b())); - if((output_tile_size.x() == 2) || (output_tile_size.x() == 1 && output_tile_size.y() == 2)) + if ((output_tile_size.x() == 2) || (output_tile_size.x() == 1 && output_tile_size.y() == 2)) { build_opts.add_option("-DVEC_SIZE=2"); } - else if((output_tile_size.x() == 4) || (output_tile_size.x() == 1 && output_tile_size.y() == 4)) + else if ((output_tile_size.x() == 4) || (output_tile_size.x() == 1 && output_tile_size.y() == 4)) { build_opts.add_option("-DVEC_SIZE=4"); } @@ -200,9 +216,10 @@ void ClWinogradOutputTransformKernel::configure(const ClCompileContext &compile_ const auto act_function = act_info.activation(); const auto src_data_type = src->data_type(); - if((gpu_target != GPUTarget::G71 && (gpu_target & GPUTarget::GPU_ARCH_MASK) == GPUTarget::BIFROST) - && (act_function == ActivationLayerInfo::ActivationFunction::BOUNDED_RELU || act_function == ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU) - && (src_data_type == DataType::F32 || src_data_type == DataType::F16)) + if ((gpu_target != GPUTarget::G71 && (gpu_target & GPUTarget::GPU_ARCH_MASK) == GPUTarget::BIFROST) && + (act_function == ActivationLayerInfo::ActivationFunction::BOUNDED_RELU || + act_function == ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU) && + (src_data_type == DataType::F32 || src_data_type == DataType::F16)) { // -cl-fast-relaxed-math also sets -cl-finite-math-only and -cl-unsafe-math-optimizations // to disable -cl-finite-math-only, we only include -cl-unsafe-math-optimizations @@ -213,7 +230,7 @@ void ClWinogradOutputTransformKernel::configure(const ClCompileContext &compile_ build_opts.add_option("-cl-fast-relaxed-math"); } - if(_is_nhwc) + if (_is_nhwc) { build_opts.add_option_if(bias != nullptr, std::string("-DHAS_BIAS")); build_opts.add_option("-DN0=" + support::cpp11::to_string(win_config.second.x().step())); @@ -247,7 +264,9 @@ void ClWinogradOutputTransformKernel::configure(const ClCompileContext &compile_ _dst_height = dst->dimension(idx_height); // Create kernel - std::string kernel_name = "winograd_output_transform_" + output_tile_size.to_string() + "_" + kernel_size.to_string() + "_" + lower_string(string_from_data_layout(winograd_info.output_data_layout)); + std::string kernel_name = "winograd_output_transform_" + output_tile_size.to_string() + "_" + + kernel_size.to_string() + "_" + + lower_string(string_from_data_layout(winograd_info.output_data_layout)); // A macro guard to compile ONLY the kernel of interest build_opts.add_option("-D" + upper_string(kernel_name)); @@ -271,10 +290,18 @@ void ClWinogradOutputTransformKernel::configure(const ClCompileContext &compile_ ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info) && _is_nhwc); } -Status ClWinogradOutputTransformKernel::validate(const ITensorInfo *src, const ITensorInfo *bias, const ITensorInfo *dst, const WinogradInfo &winograd_info, const ActivationLayerInfo &act_info) +Status ClWinogradOutputTransformKernel::validate(const ITensorInfo *src, + const ITensorInfo *bias, + const ITensorInfo *dst, + const WinogradInfo &winograd_info, + const ActivationLayerInfo &act_info) { - ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, (bias != nullptr ? bias->clone().get() : nullptr), dst, winograd_info, act_info)); - ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(src->clone().get(), (bias != nullptr ? bias->clone().get() : nullptr), dst->clone().get(), winograd_info.output_tile_size).first); + ARM_COMPUTE_RETURN_ON_ERROR( + validate_arguments(src, (bias != nullptr ? bias->clone().get() : nullptr), dst, winograd_info, act_info)); + ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(src->clone().get(), + (bias != nullptr ? bias->clone().get() : nullptr), + dst->clone().get(), winograd_info.output_tile_size) + .first); return Status{}; } @@ -299,7 +326,7 @@ void ClWinogradOutputTransformKernel::run_op(ITensorPack &tensors, const Window slice_out.set(Window::DimX, Window::Dimension(0, 0, 0)); slice_out.set(Window::DimY, Window::Dimension(0, 0, 0)); - if(bias != nullptr) + if (bias != nullptr) { unsigned int idx1 = 2 * num_arguments_per_4D_tensor(); Window slice_biases; @@ -307,7 +334,7 @@ void ClWinogradOutputTransformKernel::run_op(ITensorPack &tensors, const Window add_1D_tensor_argument(idx1, bias, slice_biases); } - if(_is_nhwc) + if (_is_nhwc) { unsigned int idx2 = 2 * num_arguments_per_4D_tensor() + ((bias != nullptr) ? num_arguments_per_1D_tensor() : 0); _kernel.setArg(idx2++, static_cast<int>(dst->info()->total_size() - dst->info()->strides_in_bytes().y())); @@ -322,8 +349,7 @@ void ClWinogradOutputTransformKernel::run_op(ITensorPack &tensors, const Window add_4D_tensor_argument(idx, src, slice); add_4D_tensor_argument(idx, dst, slice_out); enqueue(queue, *this, slice, lws_hint()); - } - while(window.slide_window_slice_3D(slice) && window.slide_window_slice_3D(slice_out)); + } while (window.slide_window_slice_3D(slice) && window.slide_window_slice_3D(slice_out)); } } // namespace kernels } // namespace opencl |