diff options
Diffstat (limited to 'src/gpu/cl/kernels/ClWinogradInputTransformKernel.cpp')
-rw-r--r-- | src/gpu/cl/kernels/ClWinogradInputTransformKernel.cpp | 63 |
1 files changed, 38 insertions, 25 deletions
diff --git a/src/gpu/cl/kernels/ClWinogradInputTransformKernel.cpp b/src/gpu/cl/kernels/ClWinogradInputTransformKernel.cpp index fab6c36032..54c48986fc 100644 --- a/src/gpu/cl/kernels/ClWinogradInputTransformKernel.cpp +++ b/src/gpu/cl/kernels/ClWinogradInputTransformKernel.cpp @@ -32,6 +32,7 @@ #include "arm_compute/core/Types.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" #include "arm_compute/core/utils/StringUtils.h" + #include "src/core/AccessWindowStatic.h" #include "src/core/CL/CLValidate.h" #include "src/core/helpers/AutoConfiguration.h" @@ -55,17 +56,21 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, c const PadStrideInfo conv_info = winograd_info.convolution_info; const Size2D output_tile_size = winograd_info.output_tile_size; const Size2D kernel_size = winograd_info.kernel_size; - ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv_info.stride().first != 1 || conv_info.stride().second != 1, "Winograd input transform only supports unit strides"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(!cl_winograd_convolution_layer_supported(output_tile_size, kernel_size, input->data_layout()), "Winograd input transform not supported"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv_info.stride().first != 1 || conv_info.stride().second != 1, + "Winograd input transform only supports unit strides"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG( + !cl_winograd_convolution_layer_supported(output_tile_size, kernel_size, input->data_layout()), + "Winograd input transform not supported"); ARM_COMPUTE_UNUSED(conv_info); ARM_COMPUTE_UNUSED(output_tile_size); ARM_COMPUTE_UNUSED(kernel_size); // Validate configured output - if(output->total_size() != 0) + if (output->total_size() != 0) { - const TensorShape output_shape = misc::shape_calculator::compute_winograd_input_transform_shape(*input, winograd_info); + const TensorShape output_shape = + misc::shape_calculator::compute_winograd_input_transform_shape(*input, winograd_info); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output->tensor_shape(), output_shape); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); @@ -74,7 +79,8 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, c return Status{}; } -std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *output, const WinogradInfo &winograd_info) +std::pair<Status, Window> +validate_and_configure_window(ITensorInfo *input, ITensorInfo *output, const WinogradInfo &winograd_info) { ARM_COMPUTE_UNUSED(output); ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); @@ -82,7 +88,7 @@ std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITen bool window_changed = false; int num_elems_processed_per_iteration = 1; - if(input->data_layout() == DataLayout::NHWC) + if (input->data_layout() == DataLayout::NHWC) { // In the case of FP16 computation, we can perform more // output feature maps in a single work-item. @@ -94,9 +100,9 @@ std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITen const size_t dim0 = input->dimension(0); const size_t k_sz = winograd_info.kernel_size.area(); const bool cond = dt == DataType::F16 && ((dim0 % 2) == 0); - if(cond) + if (cond) { - if(k_sz == 3 || k_sz == 9) + if (k_sz == 3 || k_sz == 9) { num_elems_processed_per_iteration = 2; } @@ -104,7 +110,7 @@ std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITen } Window win = calculate_max_window(*input, Steps(num_elems_processed_per_iteration)); - if(input->data_layout() == DataLayout::NCHW) + if (input->data_layout() == DataLayout::NCHW) { const PadStrideInfo conv_info = winograd_info.convolution_info; const Size2D output_tile_size = winograd_info.output_tile_size; @@ -113,11 +119,13 @@ std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITen unsigned int num_elems_read_per_iteration_x = output_tile_size.width + kernel_size.width - 1; unsigned int num_elems_read_per_iteration_y = output_tile_size.height + kernel_size.height - 1; - AccessWindowRectangle input_access(input, -conv_info.pad_left(), -conv_info.pad_top(), num_elems_read_per_iteration_x, num_elems_read_per_iteration_y); + AccessWindowRectangle input_access(input, -conv_info.pad_left(), -conv_info.pad_top(), + num_elems_read_per_iteration_x, num_elems_read_per_iteration_y); window_changed = update_window_and_padding(win, input_access); } - Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{}; + Status err = + (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{}; return std::make_pair(err, win); } } // namespace @@ -132,12 +140,15 @@ BorderSize ClWinogradInputTransformKernel::border_size() const return _border_size; } -void ClWinogradInputTransformKernel::configure(const ClCompileContext &compile_context, ITensorInfo *src, ITensorInfo *dst, const WinogradInfo &winograd_info) +void ClWinogradInputTransformKernel::configure(const ClCompileContext &compile_context, + ITensorInfo *src, + ITensorInfo *dst, + const WinogradInfo &winograd_info) { ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst); ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, dst, winograd_info)); - auto padding_info = get_padding_info({ src, dst }); + auto padding_info = get_padding_info({src, dst}); const PadStrideInfo conv_info = winograd_info.convolution_info; const Size2D output_tile_size = winograd_info.output_tile_size; @@ -150,14 +161,13 @@ void ClWinogradInputTransformKernel::configure(const ClCompileContext &compile_c // Compute the number of output tiles along the x and y direction of size "output_tile_size" const Size2D num_tiles = compute_winograd_convolution_tiles(Size2D(src->dimension(idx_w), src->dimension(idx_h)), - kernel_size, - output_tile_size, - conv_info); + kernel_size, output_tile_size, conv_info); _num_tiles_x = num_tiles.width; _num_tiles_y = num_tiles.height; - const TensorShape output_shape = misc::shape_calculator::compute_winograd_input_transform_shape(*src, winograd_info); + const TensorShape output_shape = + misc::shape_calculator::compute_winograd_input_transform_shape(*src, winograd_info); // Output auto initialization if not yet initialized auto_init_if_empty(*dst, src->clone()->set_tensor_shape(output_shape)); @@ -174,7 +184,7 @@ void ClWinogradInputTransformKernel::configure(const ClCompileContext &compile_c _src_height = src->dimension(idx_h); CLBuildOptions build_opts; - if(_data_layout == DataLayout::NHWC) + if (_data_layout == DataLayout::NHWC) { build_opts.add_option("-DNHWC"); build_opts.add_option("-DN0=" + support::cpp11::to_string(win_config.second.x().step())); @@ -201,13 +211,14 @@ void ClWinogradInputTransformKernel::configure(const ClCompileContext &compile_c } // Create kernel - std::string kernel_name = "winograd_input_transform_" + output_tile_size.to_string() + "_" + kernel_size.to_string(); + std::string kernel_name = + "winograd_input_transform_" + output_tile_size.to_string() + "_" + kernel_size.to_string(); // Get the maximum dimension from the tile size const unsigned int tile_max_dim = std::max(output_tile_size.width, output_tile_size.height); // Check optimized kernel if output_dims == 2x2 - if((tile_max_dim == 2) && (_data_layout == DataLayout::NCHW)) + if ((tile_max_dim == 2) && (_data_layout == DataLayout::NCHW)) { _step_z = (src->dimension(2) % 2) != 0 ? 1 : 2; } @@ -239,11 +250,14 @@ void ClWinogradInputTransformKernel::configure(const ClCompileContext &compile_c _config_id += lower_string(string_from_data_layout(_data_layout)); } -Status ClWinogradInputTransformKernel::validate(const ITensorInfo *src, const ITensorInfo *dst, const WinogradInfo &winograd_info) +Status ClWinogradInputTransformKernel::validate(const ITensorInfo *src, + const ITensorInfo *dst, + const WinogradInfo &winograd_info) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst); ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, dst, winograd_info)); - ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(src->clone().get(), dst->clone().get(), winograd_info).first); + ARM_COMPUTE_RETURN_ON_ERROR( + validate_and_configure_window(src->clone().get(), dst->clone().get(), winograd_info).first); return Status{}; } @@ -263,7 +277,7 @@ void ClWinogradInputTransformKernel::run_op(ITensorPack &tensors, const Window & // Collapse window Window window_collapsed = window.collapse_if_possible(IClKernel::window(), Window::DimZ); - if(_data_layout == DataLayout::NHWC) + if (_data_layout == DataLayout::NHWC) { Window slice = window_collapsed.first_slice_window_3D(); slice.set(1, Window::Dimension(0, _num_tiles_x * _num_tiles_y, 1)); @@ -298,8 +312,7 @@ void ClWinogradInputTransformKernel::run_op(ITensorPack &tensors, const Window & add_3D_tensor_argument(idx, dst, slice); enqueue(queue, *this, slice, lws_hint()); - } - while(window_collapsed.slide_window_slice_3D(slice)); + } while (window_collapsed.slide_window_slice_3D(slice)); } } } // namespace kernels |