From afa5d817b1d083837cd7ea30d32f845d82620c12 Mon Sep 17 00:00:00 2001 From: Michalis Spyrou Date: Thu, 30 Nov 2017 14:25:57 +0000 Subject: COMPMID-617 Add validation methods to Kernels - NEActivationLayer - NESoftmax - NEDirectConvolutionLayer - NENormalizationLayer - NEPoolingLayer Change-Id: Ib279f1c1b7f9247679b0d6593aed7393da8fe87b Reviewed-on: https://eu-gerrit-1.euhpc.arm.com/111335 Tested-by: BSG Visual Compute Jenkins server to access repositories on http://mpd-gerrit.cambridge.arm.com Reviewed-by: Georgios Pinitas --- .../kernels/NEDirectConvolutionLayerKernel.cpp | 249 +++++++++++++-------- 1 file changed, 155 insertions(+), 94 deletions(-) (limited to 'src/core/NEON/kernels/NEDirectConvolutionLayerKernel.cpp') diff --git a/src/core/NEON/kernels/NEDirectConvolutionLayerKernel.cpp b/src/core/NEON/kernels/NEDirectConvolutionLayerKernel.cpp index 78afbc2c20..1ca213b04a 100644 --- a/src/core/NEON/kernels/NEDirectConvolutionLayerKernel.cpp +++ b/src/core/NEON/kernels/NEDirectConvolutionLayerKernel.cpp @@ -251,10 +251,15 @@ inline qint32x4x2_t internal_vmlal(const qint32x4x2_t &x, const qint16x8_t &y, c return r; } -constexpr int SmallTensorSizeOptim = 8; +constexpr int small_tensor_size_optim = 8; +inline bool run_optim_small_tensor_info(const ITensorInfo *t) +{ + return t->dimension(Window::DimX) <= small_tensor_size_optim && t->dimension(Window::DimY) <= small_tensor_size_optim; +} + inline bool run_optim_small_tensor(const ITensor *t) { - return t->info()->dimension(Window::DimX) <= SmallTensorSizeOptim && t->info()->dimension(Window::DimY) <= SmallTensorSizeOptim; + return run_optim_small_tensor_info(t->info()); } // Optimized convolver for 1x1 kernels used only where input width and height are both <= 8 @@ -266,8 +271,8 @@ class convolver_w1x1_i8x8_f32 public: static void convolve(const Window &window, const ITensor *input, const ITensor *weights, ITensor *output, const PadStrideInfo &conv_info) { - ARM_COMPUTE_ERROR_ON(input->info()->dimension(Window::DimX) > SmallTensorSizeOptim); - ARM_COMPUTE_ERROR_ON(input->info()->dimension(Window::DimY) > SmallTensorSizeOptim); + ARM_COMPUTE_ERROR_ON(input->info()->dimension(Window::DimX) > small_tensor_size_optim); + ARM_COMPUTE_ERROR_ON(input->info()->dimension(Window::DimY) > small_tensor_size_optim); const int input_stride_y = input->info()->strides_in_bytes().y(); const int input_stride_z = input->info()->strides_in_bytes().z(); @@ -302,12 +307,12 @@ public: execute_window_loop(window_out, [&](const Coordinates & id) { - const uint8_t *input_ptr = in.ptr(); - uint8_t *out_ptr = out.ptr(); - int ih = 0; - int oh = 0; - float32x4_t accum0[SmallTensorSizeOptim] = { vdupq_n_f32(0), vdupq_n_f32(0), vdupq_n_f32(0), vdupq_n_f32(0), vdupq_n_f32(0), vdupq_n_f32(0), vdupq_n_f32(0), vdupq_n_f32(0) }; - float32x4_t accum1[SmallTensorSizeOptim] = { vdupq_n_f32(0), vdupq_n_f32(0), vdupq_n_f32(0), vdupq_n_f32(0), vdupq_n_f32(0), vdupq_n_f32(0), vdupq_n_f32(0), vdupq_n_f32(0) }; + const uint8_t *input_ptr = in.ptr(); + uint8_t *out_ptr = out.ptr(); + int ih = 0; + int oh = 0; + float32x4_t accum0[small_tensor_size_optim] = { vdupq_n_f32(0), vdupq_n_f32(0), vdupq_n_f32(0), vdupq_n_f32(0), vdupq_n_f32(0), vdupq_n_f32(0), vdupq_n_f32(0), vdupq_n_f32(0) }; + float32x4_t accum1[small_tensor_size_optim] = { vdupq_n_f32(0), vdupq_n_f32(0), vdupq_n_f32(0), vdupq_n_f32(0), vdupq_n_f32(0), vdupq_n_f32(0), vdupq_n_f32(0), vdupq_n_f32(0) }; for(int oz = 0; oz < range_z; ++oz) { accum0[0] = accum0[1] = accum0[2] = accum0[3] = accum0[4] = accum0[5] = accum0[6] = accum0[7] = vdupq_n_f32(0.f); @@ -992,121 +997,118 @@ inline void convolve_5x5(const Window &window, unsigned int num_elems_read_per_i } } -} // namespace - -NEDirectConvolutionLayerKernel::NEDirectConvolutionLayerKernel() - : _input(nullptr), _weights(nullptr), _output(nullptr), _conv_info(), _border_size(0), _kernel_size(0), _num_weight_elems_read_per_row(0), _num_elems_read_per_iteration(0), - _num_elems_written_per_iteration(0) -{ -} - -BorderSize NEDirectConvolutionLayerKernel::border_size() const +inline TensorShape get_convolved_dimensions(const ITensorInfo *input, const ITensorInfo *weights, const int kernel_size, const PadStrideInfo &conv_info) { - return _border_size; -} - -void NEDirectConvolutionLayerKernel::configure(const ITensor *input, const ITensor *weights, ITensor *output, const PadStrideInfo &conv_info) -{ - ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::F16, DataType::QS16, DataType::F32); - ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights); - ARM_COMPUTE_ERROR_ON_MSG(weights->info()->dimension(0) == 1 && (std::get<0>(conv_info.pad()) || std::get<1>(conv_info.pad())), - "Pad > 0 not supported for 1x1 weights"); - ARM_COMPUTE_ERROR_ON_MSG(weights->info()->dimension(0) == 3 && (std::get<0>(conv_info.pad()) > 1 || std::get<1>(conv_info.pad()) > 1), - "Pad > 1 not supported for 3x3 weights"); - ARM_COMPUTE_ERROR_ON_MSG(weights->info()->dimension(0) == 5 && (std::get<0>(conv_info.pad()) > 2 || std::get<1>(conv_info.pad()) > 2), - "Pad > 2 not supported for 5x5 weights"); - - ARM_COMPUTE_ERROR_ON_MSG(std::get<0>(conv_info.stride()) > 3, "Strides larger than 3 not supported."); - ARM_COMPUTE_ERROR_ON(weights->info()->dimension(2) != input->info()->dimension(2)); - ARM_COMPUTE_ERROR_ON(weights->info()->dimension(0) != weights->info()->dimension(1)); - ARM_COMPUTE_ERROR_ON(weights->info()->num_dimensions() > 4); - - const unsigned int conv_stride_x = std::get<0>(conv_info.stride()); - const unsigned int conv_pad_x = std::get<0>(conv_info.pad()); - const unsigned int conv_pad_y = std::get<1>(conv_info.pad()); - - _input = input; - _weights = weights; - _output = output; - _conv_info = conv_info; - _kernel_size = weights->info()->dimension(0); - _border_size = BorderSize(conv_pad_y, conv_pad_x); - - const unsigned int kernel_size = weights->info()->dimension(0); - - // Get convolved dimensions unsigned int output_width = 0; unsigned int output_height = 0; - std::tie(output_width, output_height) = scaled_dimensions(input->info()->dimension(0), input->info()->dimension(1), kernel_size, kernel_size, conv_info); + std::tie(output_width, output_height) = scaled_dimensions(input->dimension(0), input->dimension(1), kernel_size, kernel_size, conv_info); - TensorShape output_shape = input->info()->tensor_shape(); + TensorShape output_shape = input->tensor_shape(); output_shape.set(0, output_width); output_shape.set(1, output_height); - output_shape.set(2, weights->info()->dimension(3)); + output_shape.set(2, weights->dimension(3)); - DataType data_type = input->info()->data_type(); + return output_shape; +} - if(is_data_type_fixed_point(data_type)) +Status validate_arguments(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *output, const PadStrideInfo &conv_info) +{ + ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weights, output); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::QS16, DataType::F16, DataType::F32); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(weights->dimension(0) == 1 && (std::get<0>(conv_info.pad()) || std::get<1>(conv_info.pad())), + "Pad > 0 not supported for 1x1 weights"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(weights->dimension(0) == 3 && (std::get<0>(conv_info.pad()) > 1 || std::get<1>(conv_info.pad()) > 1), + "Pad > 1 not supported for 3x3 weights"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(weights->dimension(0) == 5 && (std::get<0>(conv_info.pad()) > 2 || std::get<1>(conv_info.pad()) > 2), + "Pad > 2 not supported for 5x5 weights"); + + ARM_COMPUTE_RETURN_ERROR_ON_MSG(std::get<0>(conv_info.stride()) > 3, "Strides larger than 3 not supported."); + ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(2) != input->dimension(2)); + ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(0) != weights->dimension(1)); + ARM_COMPUTE_RETURN_ERROR_ON(weights->num_dimensions() > 4); + + // Checks performed when output is configured + if(output->total_size() != 0) { - // Promote data type in case of fixed point - data_type = ((data_type == DataType::QS8) ? DataType::QS16 : DataType::QS32); + TensorShape output_shape = get_convolved_dimensions(input, weights, weights->dimension(0), conv_info); + + DataType data_type = input->data_type(); + if(is_data_type_fixed_point(data_type)) + { + // Promote data type in case of fixed point + data_type = ((data_type == DataType::QS8) ? DataType::QS16 : DataType::QS32); + } + + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output->tensor_shape(), output_shape); + ARM_COMPUTE_RETURN_ERROR_ON(output->data_type() != data_type); } - // Output auto inizialitation if not yet initialized - auto_init_if_empty(*output->info(), output_shape, 1, data_type, input->info()->fixed_point_position()); + return Status{}; +} - ARM_COMPUTE_ERROR_ON_MISMATCHING_DIMENSIONS(output->info()->tensor_shape(), output_shape); - ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, output->info()->data_type()); +std::pair validate_and_configure_window(ITensorInfo *input, ITensorInfo *weights, ITensorInfo *output, const PadStrideInfo &conv_info, unsigned int &num_weight_elems_read_per_row, + unsigned int &num_elems_read_per_iteration, unsigned int &num_elems_written_per_iteration) +{ + // Calculate right and bottom border + unsigned int kernel_size = weights->dimension(0); + const unsigned int conv_pad_x = std::get<0>(conv_info.pad()); + const unsigned int conv_pad_y = std::get<1>(conv_info.pad()); + const unsigned int conv_stride_x = std::get<0>(conv_info.stride()); + const unsigned int conv_stride_y = std::get<1>(conv_info.stride()); + BorderSize border_size = BorderSize(conv_pad_y, conv_pad_x); + const int input_width = input->dimension(0); + const int input_height = input->dimension(1); - switch(_kernel_size) + switch(kernel_size) { case 1: { - switch(input->info()->data_type()) + switch(input->data_type()) { #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC case DataType::F16: #endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */ case DataType::QS8: case DataType::QS16: - _num_elems_written_per_iteration = 8; + num_elems_written_per_iteration = 8; break; case DataType::F32: - if(run_optim_small_tensor(input)) + if(run_optim_small_tensor_info(input)) { - _num_elems_written_per_iteration = 8; + num_elems_written_per_iteration = 8; } else { - _num_elems_written_per_iteration = 4; + num_elems_written_per_iteration = 4; } break; default: ARM_COMPUTE_ERROR("Data type not supported."); break; } - _num_weight_elems_read_per_row = kernel_size; - _num_elems_read_per_iteration = conv_stride_x * _num_elems_written_per_iteration; + num_weight_elems_read_per_row = kernel_size; + num_elems_read_per_iteration = conv_stride_x * num_elems_written_per_iteration; break; } case 3: case 5: { - switch(input->info()->data_type()) + switch(input->data_type()) { case DataType::F32: - _num_weight_elems_read_per_row = 4 + _kernel_size - 1; - _num_elems_read_per_iteration = 12; - _num_elems_written_per_iteration = 16 >> conv_stride_x; + num_weight_elems_read_per_row = 4 + kernel_size - 1; + num_elems_read_per_iteration = 12; + num_elems_written_per_iteration = 16 >> conv_stride_x; break; #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC case DataType::F16: #endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */ case DataType::QS8: case DataType::QS16: - _num_weight_elems_read_per_row = 8 + _kernel_size - 1; - _num_elems_read_per_iteration = 24; - _num_elems_written_per_iteration = 32 >> conv_stride_x; + num_weight_elems_read_per_row = 8 + kernel_size - 1; + num_elems_read_per_iteration = 24; + num_elems_written_per_iteration = 32 >> conv_stride_x; break; default: ARM_COMPUTE_ERROR("Data type not supported."); @@ -1121,22 +1123,81 @@ void NEDirectConvolutionLayerKernel::configure(const ITensor *input, const ITens } } - // Calculate right and bottom border - const unsigned int conv_stride_y = std::get<1>(_conv_info.stride()); - const int input_width = input->info()->dimension(0); - const int input_height = input->info()->dimension(1); - const int upper_bound_w = ceil_to_multiple(((output->info()->dimension(0) - 1) * conv_stride_x + _kernel_size), _num_elems_read_per_iteration) - conv_pad_x - input_width; - const int upper_bound_h = ((output->info()->dimension(1) - 1) * conv_stride_y - conv_pad_y + _kernel_size) - input_height; - _border_size.right = std::max(upper_bound_w, static_cast(_kernel_size)); - _border_size.bottom = std::max(upper_bound_h, static_cast(_kernel_size)); - Window win = calculate_max_window(*output->info(), Steps(_num_elems_written_per_iteration)); - AccessWindowStatic input_access(input->info(), -conv_pad_x, -conv_pad_y, input_width + _border_size.right, input_height + _border_size.bottom); - AccessWindowStatic weights_access(weights->info(), 0, 0, _num_weight_elems_read_per_row, _kernel_size); - AccessWindowHorizontal output_access(output->info(), 0, _num_elems_written_per_iteration); - update_window_and_padding(win, input_access, weights_access, output_access); - output_access.set_valid_region(win, ValidRegion(Coordinates(), output->info()->tensor_shape())); - - INEKernel::configure(win); + const int upper_bound_w = ceil_to_multiple(((output->dimension(0) - 1) * conv_stride_x + kernel_size), num_elems_read_per_iteration) - conv_pad_x - input_width; + const int upper_bound_h = ((output->dimension(1) - 1) * conv_stride_y - conv_pad_y + kernel_size) - input_height; + border_size.right = std::max(upper_bound_w, static_cast(kernel_size)); + border_size.bottom = std::max(upper_bound_h, static_cast(kernel_size)); + Window win = calculate_max_window(*output, Steps(num_elems_written_per_iteration)); + AccessWindowStatic input_access(input, -conv_pad_x, -conv_pad_y, input_width + border_size.right, input_height + border_size.bottom); + AccessWindowStatic weights_access(weights, 0, 0, num_weight_elems_read_per_row, kernel_size); + AccessWindowHorizontal output_access(output, 0, num_elems_written_per_iteration); + bool window_changed = update_window_and_padding(win, input_access, weights_access, output_access); + output_access.set_valid_region(win, ValidRegion(Coordinates(), output->tensor_shape())); + + Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{}; + return std::make_pair(err, win); +} +} // namespace + +NEDirectConvolutionLayerKernel::NEDirectConvolutionLayerKernel() + : _input(nullptr), _weights(nullptr), _output(nullptr), _conv_info(), _border_size(0), _kernel_size(0), _num_weight_elems_read_per_row(0), _num_elems_read_per_iteration(0), + _num_elems_written_per_iteration(0) +{ +} + +BorderSize NEDirectConvolutionLayerKernel::border_size() const +{ + return _border_size; +} + +void NEDirectConvolutionLayerKernel::configure(const ITensor *input, const ITensor *weights, ITensor *output, const PadStrideInfo &conv_info) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output); + const unsigned int conv_pad_x = std::get<0>(conv_info.pad()); + const unsigned int conv_pad_y = std::get<1>(conv_info.pad()); + + _input = input; + _weights = weights; + _output = output; + _conv_info = conv_info; + _kernel_size = weights->info()->dimension(0); + _border_size = BorderSize(conv_pad_y, conv_pad_x); + + // Get convolved dimensions + TensorShape output_shape = get_convolved_dimensions(input->info(), weights->info(), _kernel_size, conv_info); + + DataType data_type = input->info()->data_type(); + + if(is_data_type_fixed_point(data_type)) + { + // Promote data type in case of fixed point + data_type = ((data_type == DataType::QS8) ? DataType::QS16 : DataType::QS32); + } + + // Output auto inizialitation if not yet initialized + auto_init_if_empty(*output->info(), output_shape, 1, data_type, input->info()->fixed_point_position()); + + // Perform validation step + ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), weights->info(), output->info(), conv_info)); + + // Configure kernel window + auto win_config = validate_and_configure_window(input->info(), weights->info(), output->info(), conv_info, _num_weight_elems_read_per_row, + _num_elems_read_per_iteration, _num_elems_written_per_iteration); + ARM_COMPUTE_ERROR_THROW_ON(win_config.first); + INEKernel::configure(win_config.second); +} + +Status NEDirectConvolutionLayerKernel::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *output, const PadStrideInfo &conv_info) +{ + unsigned int num_weight_elems_read_per_row = 0; + unsigned int num_elems_read_per_iteration = 0; + unsigned int num_elems_written_per_iteration = 0; + ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, weights, output, conv_info)); + ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(), weights->clone().get(), output->clone().get(), conv_info, num_weight_elems_read_per_row, num_elems_read_per_iteration, + num_elems_written_per_iteration) + .first); + + return Status{}; } void NEDirectConvolutionLayerKernel::run(const Window &window, const ThreadInfo &info) -- cgit v1.2.1