From cb0010b02281245c66d5c996fa9ef8b22f036a2d Mon Sep 17 00:00:00 2001
From: Vidhya Sudhan Loganathan
Date: Fri, 11 May 2018 16:23:53 +0100
Subject: COMPMID-1102 : Enable the use of 4x4 tile sizes in neon
 implementation of winograd conv.

Change-Id: Ibd2f2c6680b647a066255ea77d4a2a172ef76aa3
Reviewed-on: https://eu-gerrit-1.euhpc.arm.com/130418
Reviewed-by: Gian Marco Iodice
Tested-by: Jenkins
---
 .../NEON/functions/NEWinogradConvolutionLayer.cpp | 146 ++++++++++++---------
 1 file changed, 87 insertions(+), 59 deletions(-)

diff --git a/src/runtime/NEON/functions/NEWinogradConvolutionLayer.cpp b/src/runtime/NEON/functions/NEWinogradConvolutionLayer.cpp
index f4640fb0b6..d745f42f1a 100644
--- a/src/runtime/NEON/functions/NEWinogradConvolutionLayer.cpp
+++ b/src/runtime/NEON/functions/NEWinogradConvolutionLayer.cpp
@@ -36,45 +36,42 @@
 #include "arm_compute/core/NEON/kernels/convolution/winograd/winograd_gemm.hpp"
 
+namespace arm_compute
+{
 namespace
 {
 inline Tensor4DShape internal_get_input_shape(const arm_compute::ITensor *input)
 {
-    const int in_width    = input->info()->dimension(0);
-    const int in_height   = input->info()->dimension(1);
-    const int in_batches  = input->info()->dimension(3);
-    const int in_channels = input->info()->dimension(2);
+    const DataLayout data_layout = input->info()->data_layout();
+    const int        in_width    = input->info()->dimension(get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH));
+    const int        in_height   = input->info()->dimension(get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT));
+    const int        in_channels = input->info()->dimension(get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL));
+    const int        in_batches  = input->info()->dimension(3);
+
     return Tensor4DShape({ in_batches, in_height, in_width, in_channels });
 }
-} /* namespace */
 
-namespace arm_compute
-{
-namespace
-{
 Status validate_arguments(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info)
 {
-    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input);
-    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(weights);
-    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(output);
+    const DataLayout   data_layout = input->data_layout();
+    const unsigned int width_idx   = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
+    const unsigned int height_idx  = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
+
+    ARM_COMPUTE_UNUSED(output);
     ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32);
     ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights);
-    ARM_COMPUTE_RETURN_ERROR_ON_MSG(weights->dimension(0) != 3 && weights->dimension(0) != 5, "Only 3 and 5 kernels are supported");
+    ARM_COMPUTE_RETURN_ERROR_ON(data_layout != DataLayout::NCHW); // COMPMID-1162
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(weights->dimension(width_idx) != 3 && weights->dimension(height_idx) != 5, "Only 3 and 5 kernels are supported");
     ARM_COMPUTE_RETURN_ERROR_ON(weights->num_dimensions() > 4);
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv_info.stride().first != 1 || conv_info.stride().second != 1, "Winograd layer only supports unit strides.");
+
     if(biases != nullptr)
     {
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, biases);
         ARM_COMPUTE_RETURN_ERROR_ON(biases->num_dimensions() > 1);
     }
 
-    // Get parameters from conv_info
-    unsigned int stride_x = 0;
-    unsigned int stride_y = 0;
-    std::tie(stride_x, stride_y) = conv_info.stride();
-    ARM_COMPUTE_RETURN_ERROR_ON_MSG(stride_y != 1 || stride_x != 1, "Winograd layer only supports unit strides.");
-
-    ARM_COMPUTE_UNUSED(output);
     return Status{};
 }
 } //namespace
 
@@ -89,36 +86,51 @@ NEWinogradConvolutionLayer::NEWinogradConvolutionLayer(std::shared_ptr<IMemoryMa
     ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), weights->info(), (biases != nullptr) ? biases->info() : nullptr, output->info(), conv_info));
 
     _weights = weights;
     _input   = input;
     _output  = output;
 
+    // Get indices for the width and height
+    const DataLayout   data_layout = input->info()->data_layout();
+    const unsigned int width_idx   = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
+    const unsigned int height_idx  = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
+    const unsigned int channel_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL);
+
     std::unique_ptr<INEWinogradLayerTransformInputKernel<float>>   transform_input_kernel;
     std::unique_ptr<INEWinogradLayerTransformWeightsKernel<float>> transform_weights_kernel;
     std::unique_ptr<INEWinogradLayerTransformOutputKernel<float>>  transform_output_kernel;
 
-    const int weights_width  = weights->info()->dimension(0);
-    const int weights_height = weights->info()->dimension(1);
+    const int weights_width  = weights->info()->dimension(width_idx);
+    const int weights_height = weights->info()->dimension(height_idx);
 
-    int output_tile_rows = 0;
-    int output_tile_cols = 0;
-    int n_gemms          = 0;
-    int N_BLOCK          = 0; // Size of block used by GEMM.
+    Size2D output_tile{};
+    int    n_gemms = 0;
+    int    N_BLOCK = 0; // Size of block used by GEMM.
 
     switch(weights_width)
     {
         case 3:
         {
-            transform_input_kernel   = support::cpp14::make_unique<NEWinogradLayerTransformInputKernel<float, 2, 2, 3, 3>>();
-            transform_weights_kernel = support::cpp14::make_unique<NEWinogradLayerTransformWeightsKernel<float, 2, 2, 3, 3>>();
-            transform_output_kernel  = support::cpp14::make_unique<NEWinogradLayerTransformOutputKernel<float, 2, 2, 3, 3>>();
-            output_tile_rows         = 2;
-            output_tile_cols         = 2;
-            n_gemms                  = NEWinogradLayerBatchedGEMMKernel<float, float, 2, 2, 3, 3>::WinogradBase::N_GEMMS;
-            N_BLOCK                  = NEWinogradLayerBatchedGEMMKernel<float, float, 2, 2, 3, 3>::WinogradConv::N_BLOCK;
+            if(input->info()->dimension(width_idx) > 4 && input->info()->dimension(height_idx) > 4)
+            {
+                transform_input_kernel   = support::cpp14::make_unique<NEWinogradLayerTransformInputKernel<float, 4, 4, 3, 3>>();
+                transform_weights_kernel = support::cpp14::make_unique<NEWinogradLayerTransformWeightsKernel<float, 4, 4, 3, 3>>();
+                transform_output_kernel  = support::cpp14::make_unique<NEWinogradLayerTransformOutputKernel<float, 4, 4, 3, 3>>();
+                output_tile              = Size2D(4U, 4U);
+                n_gemms                  = NEWinogradLayerBatchedGEMMKernel<float, float, 4, 4, 3, 3>::WinogradBase::N_GEMMS;
+                N_BLOCK                  = NEWinogradLayerBatchedGEMMKernel<float, float, 4, 4, 3, 3>::WinogradConv::N_BLOCK;
+            }
+            else
+            {
+                transform_input_kernel   = support::cpp14::make_unique<NEWinogradLayerTransformInputKernel<float, 2, 2, 3, 3>>();
+                transform_weights_kernel = support::cpp14::make_unique<NEWinogradLayerTransformWeightsKernel<float, 2, 2, 3, 3>>();
+                transform_output_kernel  = support::cpp14::make_unique<NEWinogradLayerTransformOutputKernel<float, 2, 2, 3, 3>>();
+                output_tile              = Size2D(2U, 2U);
+                n_gemms                  = NEWinogradLayerBatchedGEMMKernel<float, float, 2, 2, 3, 3>::WinogradBase::N_GEMMS;
+                N_BLOCK                  = NEWinogradLayerBatchedGEMMKernel<float, float, 2, 2, 3, 3>::WinogradConv::N_BLOCK;
+            }
             break;
         }
         case 5:
@@ -126,8 +138,7 @@ void NEWinogradConvolutionLayer::configure(const ITensor *input, const ITensor *
             transform_input_kernel   = support::cpp14::make_unique<NEWinogradLayerTransformInputKernel<float, 2, 2, 5, 5>>();
             transform_weights_kernel = support::cpp14::make_unique<NEWinogradLayerTransformWeightsKernel<float, 2, 2, 5, 5>>();
             transform_output_kernel  = support::cpp14::make_unique<NEWinogradLayerTransformOutputKernel<float, 2, 2, 5, 5>>();
-            output_tile_rows         = 2;
-            output_tile_cols         = 2;
+            output_tile              = Size2D(2U, 2U);
             n_gemms                  = NEWinogradLayerBatchedGEMMKernel<float, float, 2, 2, 5, 5>::WinogradBase::N_GEMMS;
             N_BLOCK                  = NEWinogradLayerBatchedGEMMKernel<float, float, 2, 2, 5, 5>::WinogradConv::N_BLOCK;
             break;
@@ -142,15 +153,9 @@ void NEWinogradConvolutionLayer::configure(const ITensor *input, const ITensor *
     const PaddingType use_padding_type = (conv_info.pad_left() != 0u) ? PADDING_SAME : PADDING_VALID;
     const bool        use_same_padding = use_padding_type == PADDING_SAME;
 
-    // Get parameters from conv_info
-    unsigned int stride_x = 0;
-    unsigned int stride_y = 0;
-    std::tie(stride_x, stride_y) = conv_info.stride();
-    ARM_COMPUTE_ERROR_ON_MSG(stride_y != 1 || stride_x != 1, "Winograd layer only supports unit strides.");
-
     // Get convolved dimensions
-    const int in_channels  = input->info()->dimension(2);
-    const int out_channels = output->info()->dimension(2);
+    const int in_channels  = input->info()->dimension(channel_idx);
+    const int out_channels = output->info()->dimension(channel_idx);
 
     const Tensor4DShape in_shape(internal_get_input_shape(input));
     const size_t        data_type_size = input->info()->element_size();
@@ -205,8 +210,8 @@ void NEWinogradConvolutionLayer::configure(const ITensor *input, const ITensor *
                                       in_shape.n_batches, output_shape.n_rows, output_shape.n_cols, out_channels);
 
     // Configure GEMM
-    const int tile_rows = iceildiv(output_shape.n_rows, output_tile_rows);
-    const int tile_cols = iceildiv(output_shape.n_cols, output_tile_cols);
+    const int tile_rows = iceildiv(output_shape.n_rows, output_tile.height);
+    const int tile_cols = iceildiv(output_shape.n_cols, output_tile.width);
     const int m         = in_shape.n_batches * tile_rows * tile_cols;
     const int k         = in_shape.n_channels;
     const int n         = out_channels;
@@ -289,19 +294,24 @@ void NEWinogradConvolutionLayer::run()
 Status NEWinogradConvolutionLayer::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info,
                                             const ActivationLayerInfo &act_info)
 {
+    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weights, output);
     ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, weights, biases, output, conv_info));
 
     // Get indices for the width and height
     const size_t idx_width  = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::WIDTH);
     const size_t idx_height = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::HEIGHT);
 
     // Input shape
-    const TensorShape input_shape = input->tensor_shape();
+    const TensorShape  input_shape = input->tensor_shape();
+    const unsigned int input_w     = input_shape[idx_width];
+    const unsigned int input_h     = input_shape[idx_height];
 
     // Kernel size
     const unsigned int kernel_w = weights->tensor_shape()[idx_width];
     const unsigned int kernel_h = weights->tensor_shape()[idx_height];
 
-    const WinogradInfo winograd_info = WinogradInfo(Size2D(2, 2),
+    const Size2D output_tile = (Size2D(kernel_w, kernel_h) == Size2D(3U, 3U) && input_w > 4 && input_h > 4) ? Size2D(4U, 4U) : Size2D(2U, 2U);
+
+    const WinogradInfo winograd_info = WinogradInfo(output_tile,
                                                     Size2D(kernel_w, kernel_h),
                                                     Size2D(input_shape[idx_width], input_shape[idx_height]),
                                                     conv_info,
@@ -310,11 +320,18 @@ Status NEWinogradConvolutionLayer::validate(const ITensorInfo *input, const ITen
     // Validate input transform
     const TensorShape input0_shape = misc::shape_calculator::compute_winograd_input_transform_shape(*input, winograd_info);
     const TensorInfo  input0       = input->clone()->set_tensor_shape(input0_shape);
-    switch(weights->dimension(0))
+    switch(weights->dimension(idx_width))
    {
        case 3:
        {
-            ARM_COMPUTE_RETURN_ON_ERROR((NEWinogradLayerTransformInputKernel<float, 2, 2, 3, 3>::validate(input, &input0, winograd_info)));
+            if(input_w > 4 && input_h > 4)
+            {
+                ARM_COMPUTE_RETURN_ON_ERROR((NEWinogradLayerTransformInputKernel<float, 4, 4, 3, 3>::validate(input, &input0, winograd_info)));
+            }
+            else
+            {
+                ARM_COMPUTE_RETURN_ON_ERROR((NEWinogradLayerTransformInputKernel<float, 2, 2, 3, 3>::validate(input, &input0, winograd_info)));
+            }
            break;
        }
        case 5:
@@ -332,11 +349,18 @@ Status NEWinogradConvolutionLayer::validate(const ITensorInfo *input, const ITen
 
     const TensorShape input1_shape = misc::shape_calculator::compute_winograd_filter_transform_shape(*weights, winograd_info);
     const TensorInfo  input1       = weights->clone()->set_tensor_shape(input1_shape);
-    switch(weights->dimension(0))
+    switch(weights->dimension(idx_width))
    {
        case 3:
        {
-            ARM_COMPUTE_RETURN_ON_ERROR((NEWinogradLayerTransformWeightsKernel<float, 2, 2, 3, 3>::validate(weights, &input1, winograd_info)));
+            if(input_w > 4 && input_h > 4)
+            {
+                ARM_COMPUTE_RETURN_ON_ERROR((NEWinogradLayerTransformWeightsKernel<float, 4, 4, 3, 3>::validate(weights, &input1, winograd_info)));
+            }
+            else
+            {
+                ARM_COMPUTE_RETURN_ON_ERROR((NEWinogradLayerTransformWeightsKernel<float, 2, 2, 3, 3>::validate(weights, &input1, winograd_info)));
+            }
            break;
        }
        case 5:
@@ -354,20 +378,24 @@ Status NEWinogradConvolutionLayer::validate(const ITensorInfo *input, const ITen
     TensorShape batched_mm_output_shape = input0.tensor_shape();
     batched_mm_output_shape[0]          = input1.tensor_shape()[0];
     const TensorInfo batched_mm_output  = input0.clone()->set_tensor_shape(batched_mm_output_shape);
-    switch(weights->dimension(0))
+    switch(weights->dimension(idx_width))
    {
        case 3:
        {
-            ARM_COMPUTE_RETURN_ON_ERROR((NEWinogradLayerBatchedGEMMKernel<float, float, 2, 2, 3, 3>::validate(&input0, &input1, nullptr, &batched_mm_output, 1.0f, 0.0f, GEMMInfo(false, false,
-                                                                                                              true /* Reshape weights only for the first run*/))));
-            // Validate output transform
-            ARM_COMPUTE_RETURN_ON_ERROR((NEWinogradLayerTransformOutputKernel<float, 2, 2, 3, 3>::validate(&batched_mm_output, biases, output, winograd_info)));
+            if(input_w > 4 && input_h > 4)
+            {
+                // Validate output transform
+                ARM_COMPUTE_RETURN_ON_ERROR((NEWinogradLayerTransformOutputKernel<float, 4, 4, 3, 3>::validate(&batched_mm_output, biases, output, winograd_info)));
+            }
+            else
+            {
+                // Validate output transform
+                ARM_COMPUTE_RETURN_ON_ERROR((NEWinogradLayerTransformOutputKernel<float, 2, 2, 3, 3>::validate(&batched_mm_output, biases, output, winograd_info)));
+            }
            break;
        }
        case 5:
        {
-            ARM_COMPUTE_RETURN_ON_ERROR((NEWinogradLayerBatchedGEMMKernel<float, float, 2, 2, 5, 5>::validate(&input0, &input1, nullptr, &batched_mm_output, 1.0f, 0.0f, GEMMInfo(false, false,
-                                                                                                              true /* Reshape weights only for the first run*/))));
             // Validate output transform
             ARM_COMPUTE_RETURN_ON_ERROR((NEWinogradLayerTransformOutputKernel<float, 2, 2, 5, 5>::validate(&batched_mm_output, biases, output, winograd_info)));
             break;
-- cgit v1.2.1
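
The heuristic this patch introduces is compact enough to restate standalone. The sketch below mirrors the tile-selection condition and the GEMM-shape arithmetic from the hunks above; it is a minimal illustration under stated assumptions, not arm_compute API: the names TileSize and select_output_tile and the example dimensions are hypothetical. In the standard Winograd formulation, F(4x4, 3x3) operates on 6x6 input tiles (36 batched GEMMs) while F(2x2, 3x3) operates on 4x4 tiles (16 batched GEMMs); the larger output tile quarters the number of tiles per image, and with it the GEMM row count m, which is why it is preferred once the input exceeds 4 in both spatial dimensions.

// Minimal standalone sketch (not arm_compute API): the output-tile selection
// and GEMM-shape arithmetic introduced by this patch.
#include <cstdio>

namespace
{
struct TileSize // hypothetical stand-in for arm_compute::Size2D
{
    int width;
    int height;
};

// Ceiling division, as used by the patch via iceildiv().
inline int iceildiv(int a, int b)
{
    return (a + b - 1) / b;
}

// Mirrors the condition added in configure()/validate(): a 3x3 kernel uses
// the F(4x4, 3x3) Winograd variant only when both input spatial dimensions
// exceed 4; small inputs and 5x5 kernels fall back to a 2x2 output tile.
TileSize select_output_tile(int kernel_w, int kernel_h, int input_w, int input_h)
{
    if(kernel_w == 3 && kernel_h == 3 && input_w > 4 && input_h > 4)
    {
        return { 4, 4 };
    }
    return { 2, 2 };
}
} // namespace

int main()
{
    // Hypothetical example: 3x3 kernel, 56x56 input with SAME padding and
    // unit stride (so a 56x56 output), 64 input channels, 128 output channels.
    const TileSize tile      = select_output_tile(3, 3, 56, 56); // -> 4x4
    const int      tile_rows = iceildiv(56, tile.height);        // 14
    const int      tile_cols = iceildiv(56, tile.width);         // 14

    // GEMM shape as configured by the patch: m = n_batches * number of tiles.
    const int m = 1 * tile_rows * tile_cols; // 196
    const int k = 64;                        // input channels
    const int n = 128;                       // output channels
    std::printf("tile %dx%d -> GEMM m=%d k=%d n=%d\n", tile.width, tile.height, m, k, n);
    return 0;
}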