From 041f36d4dc1b6473d9f7136659a384d611fab0b6 Mon Sep 17 00:00:00 2001
From: Georgios Pinitas
Date: Tue, 18 Sep 2018 18:38:37 +0100
Subject: COMPMID-1446 : Add support for 3D output in NEGEMMLowpOutputStage

Change-Id: I61e7d39d09a9936b1128ec04038fa2d8dfe6a2c8
Reviewed-on: https://eu-gerrit-1.euhpc.arm.com/149211
Reviewed-by: Isabella Gottardi
Reviewed-by: Anthony Barbier
Tested-by: bsgcomp
---
 ...tizeDownInt32ToUint8ScaleByFixedPointKernel.cpp | 99 ++++++++++++++--------
 .../NEON/functions/NEGEMMConvolutionLayer.cpp      | 61 ++++++-------
 .../NEON/functions/NEGEMMLowpOutputStage.cpp       | 10 +--
 3 files changed, 97 insertions(+), 73 deletions(-)
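The commit teaches the GEMMLowp output stage to write its quantized result straight into a 3D tensor: a new gemm_3d_depth parameter tells the kernel how to split the Y axis of the flattened 2D GEMM result into height and depth slices. The following standalone sketch, with hypothetical names rather than code from the patch, shows the row remapping the kernel performs when gemm_3d_depth != 1:

// Standalone sketch (hypothetical names, not code from the patch): how a row
// of the flattened 2D GEMM destination maps onto (y, z) of a 3D output when
// the Y axis is reinterpreted as height * depth.
#include <cassert>
#include <cstdio>

struct Coord3D
{
    int y; // row inside one depth slice
    int z; // depth slice index
};

Coord3D map_gemm_row_to_3d(int row, int gemm_rows, int gemm_3d_depth)
{
    assert(gemm_3d_depth > 0 && gemm_rows % gemm_3d_depth == 0);
    const int gemm_3d_height = gemm_rows / gemm_3d_depth;
    return { row % gemm_3d_height, row / gemm_3d_height };
}

int main()
{
    // 8 GEMM rows reinterpreted as depth 2 -> two slices of height 4.
    for(int row = 0; row < 8; ++row)
    {
        const Coord3D c = map_gemm_row_to_3d(row, 8, 2);
        std::printf("row %d -> (y=%d, z=%d)\n", row, c.y, c.z);
    }
    return 0;
}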
diff --git a/src/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel.cpp b/src/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel.cpp
index 5e14e1a95d..0196bacdcf 100644
--- a/src/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel.cpp
+++ b/src/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel.cpp
@@ -32,6 +32,7 @@
 #include "arm_compute/core/Utils.h"
 #include "arm_compute/core/Validate.h"
 #include "arm_compute/core/Window.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
 
 #include <arm_neon.h>
 #include <cstdint>
@@ -41,8 +42,10 @@ using namespace arm_compute;
 
 namespace
 {
-Status validate_arguments(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output, int min, int max)
+Status validate_arguments(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output,
+                          int min, int max, unsigned int gemm_3d_depth)
 {
+    ARM_COMPUTE_UNUSED(gemm_3d_depth);
     ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::S32);
     ARM_COMPUTE_RETURN_ERROR_ON(max > 255);
     ARM_COMPUTE_RETURN_ERROR_ON(min < 0 || min > max);
@@ -57,8 +60,21 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *bias, con
 
     if(output->total_size() != 0)
     {
+        const TensorShape ref_shape    = output->tensor_shape();
+        const TensorShape output_shape = arm_compute::misc::shape_calculator::compute_output_stage_shape(*input, gemm_3d_depth);
+        // Check in case of mismatching dimensions when permuting, usually in case of 1x1xC input shapes
+        if(output_shape.num_dimensions() != ref_shape.num_dimensions() && ref_shape.num_dimensions() < 4)
+        {
+            for(unsigned int i = output_shape.num_dimensions(); i < ref_shape.num_dimensions(); ++i)
+            {
+                ARM_COMPUTE_RETURN_ERROR_ON(ref_shape[i] != 1);
+            }
+        }
+        else
+        {
+            ARM_COMPUTE_RETURN_ERROR_ON(output->tensor_shape() != output_shape);
+        }
         ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::QASYMM8);
-        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output);
     }
 
     return Status{};
@@ -72,7 +88,7 @@ std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITen
     constexpr unsigned int num_elems_processed_per_iteration = 1;
 
     // Configure kernel window
-    Window win = calculate_max_window(*output, Steps(num_elems_processed_per_iteration));
+    Window win = calculate_max_window(*input, Steps(num_elems_processed_per_iteration));
 
     AccessWindowHorizontal input_access(input, 0, num_elems_processed_per_iteration);
@@ -81,10 +97,7 @@ std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITen
 
     if(output->total_size() != 0)
     {
-        AccessWindowHorizontal output_result_access(output, 0, num_elems_processed_per_iteration);
-        window_changed = window_changed || update_window_and_padding(win, output_result_access);
-
-        output_result_access.set_valid_region(win, ValidRegion(Coordinates(), output->tensor_shape()));
+        output->set_valid_region(ValidRegion(Coordinates(), output->tensor_shape()));
     }
 
     if(bias != nullptr)
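The relaxed shape check above accepts an output whose rank differs from the computed shape as long as the surplus trailing dimensions are degenerate (the 1x1xC case mentioned in the comment). A compilable sketch of that rule, using std::vector in place of arm_compute::TensorShape:

// Sketch of the relaxed output-shape check: shapes may differ in rank when
// every extra trailing dimension of the actual shape equals 1; otherwise
// exact equality is required. Simplified types, not library code.
#include <cstddef>
#include <vector>

bool output_shape_ok(const std::vector<std::size_t> &computed, const std::vector<std::size_t> &actual)
{
    if(computed.size() != actual.size() && actual.size() < 4)
    {
        // Allow the rank mismatch only if the surplus dimensions carry no extent
        for(std::size_t i = computed.size(); i < actual.size(); ++i)
        {
            if(actual[i] != 1)
            {
                return false;
            }
        }
        return true;
    }
    return computed == actual; // same rank: require exact equality
}

int main()
{
    const bool ok1 = output_shape_ok({ 32, 4, 3 }, { 32, 4, 3 }); // exact match
    const bool ok2 = output_shape_ok({ 32 }, { 32, 1, 1 });       // degenerate extras
    return (ok1 && ok2) ? 0 : 1;
}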
@@ -144,16 +157,15 @@ void NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel::run(const Window
     ARM_COMPUTE_UNUSED(min_u8);
     ARM_COMPUTE_UNUSED(max_u8);
 
-    const int  window_step_x  = 16;
-    const auto window_start_x = static_cast<int>(window.x().start());
-    const auto window_end_x   = static_cast<int>(window.x().end());
-
-    Window win(window);
-    win.set(Window::DimX, Window::Dimension(0, 1, 1));
+    const int          window_step_x  = 16;
+    const auto         window_start_x = static_cast<int>(window.x().start());
+    const auto         window_end_x   = static_cast<int>(window.x().end());
+    const unsigned int gemm_3d_height = _input->info()->tensor_shape().y() / _gemm_3d_depth;
 
-    Iterator in(_input, win);
-    Iterator out(_output, win);
+    Window win_collapsed = window.collapse_if_possible(window, Window::DimZ);
+    win_collapsed.set(Window::DimX, Window::Dimension(0, 1, 1));
 
+    Iterator in(_input, win_collapsed);
     if(_bias != nullptr)
     {
         Window win_biases;
@@ -161,8 +173,18 @@ void NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel::run(const Window
         win_biases.set(Window::DimY, Window::Dimension(0, 1, 1));
         Iterator bias(_bias, win_biases);
 
-        execute_window_loop(win, [&](const Coordinates & id)
+        execute_window_loop(win_collapsed, [&](const Coordinates & id)
         {
+            // Calculate output coordinates
+            Coordinates out_coords = id;
+            if(_gemm_3d_depth != 1)
+            {
+                out_coords.set(Window::DimY, id.y() % gemm_3d_height);
+                out_coords.set(Window::DimZ, id.y() / gemm_3d_height);
+                out_coords.set(3, id.z());
+            }
+            uint8_t *out_ptr = _output->ptr_to_element(out_coords);
+
             // Compute 16 elements per iteration
             int x = window_start_x;
             for(; x <= (window_end_x - window_step_x); x += window_step_x)
@@ -193,7 +215,7 @@ void NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel::run(const Window
                 in_s32.val[2] = vaddq_s32(in_s32.val[2], bias_s32.val[2]);
                 in_s32.val[3] = vaddq_s32(in_s32.val[3], bias_s32.val[3]);
 
-                vst1q_u8(out.ptr() + x, finalize_quantization<is_bounded_relu>(in_s32, _result_fixedpoint_multiplier, _result_shift, result_offset_after_shift_s32, min_u8, max_u8));
+                vst1q_u8(out_ptr + x, finalize_quantization<is_bounded_relu>(in_s32, _result_fixedpoint_multiplier, _result_shift, result_offset_after_shift_s32, min_u8, max_u8));
             }
 
             // Compute left-over elements
@@ -206,16 +228,26 @@ void NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel::run(const Window
                 in_value += bias_value;
 
                 // Finalize and store the result
-                *(out.ptr() + x) = finalize_quantization<is_bounded_relu>(vdupq_n_s32(in_value), _result_fixedpoint_multiplier, _result_shift, result_offset_after_shift_s32, static_cast<uint8_t>(_min),
-                                                                          static_cast<uint8_t>(_max));
+                *(out_ptr + x) = finalize_quantization<is_bounded_relu>(vdupq_n_s32(in_value), _result_fixedpoint_multiplier, _result_shift, result_offset_after_shift_s32, static_cast<uint8_t>(_min),
+                                                                        static_cast<uint8_t>(_max));
             }
         },
-        in, bias, out);
+        in, bias);
     }
     else
     {
-        execute_window_loop(win, [&](const Coordinates & id)
+        execute_window_loop(win_collapsed, [&](const Coordinates & id)
         {
+            // Calculate output coordinates
+            Coordinates out_coords = id;
+            if(_gemm_3d_depth != 1)
+            {
+                out_coords.set(Window::DimY, id.y() % gemm_3d_height);
+                out_coords.set(Window::DimZ, id.y() / gemm_3d_height);
+                out_coords.set(3, id.z());
+            }
+            uint8_t *out_ptr = _output->ptr_to_element(out_coords);
+
             // Compute 16 elements per iteration
             int x = window_start_x;
             for(; x <= (window_end_x - window_step_x); x += window_step_x)
@@ -230,7 +262,7 @@ void NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel::run(const Window
                 }
             };
 
-            vst1q_u8(out.ptr() + x, finalize_quantization<is_bounded_relu>(in_s32, _result_fixedpoint_multiplier, _result_shift, result_offset_after_shift_s32, min_u8, max_u8));
+            vst1q_u8(out_ptr + x, finalize_quantization<is_bounded_relu>(in_s32, _result_fixedpoint_multiplier, _result_shift, result_offset_after_shift_s32, min_u8, max_u8));
             }
 
             // Compute left-over elements
@@ -239,32 +271,30 @@ void NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel::run(const Window
                 const int32x4_t in_s32 = vld1q_dup_s32(reinterpret_cast<const int32_t *>(in.ptr()) + x);
 
                 // Finalize and store the result
-                *(out.ptr() + x) = finalize_quantization<is_bounded_relu>(in_s32, _result_fixedpoint_multiplier, _result_shift, result_offset_after_shift_s32, static_cast<uint8_t>(_min), static_cast<uint8_t>(_max));
+                *(out_ptr + x) = finalize_quantization<is_bounded_relu>(in_s32, _result_fixedpoint_multiplier, _result_shift, result_offset_after_shift_s32, static_cast<uint8_t>(_min), static_cast<uint8_t>(_max));
             }
         },
-        in, out);
+        in);
    }
 }
 
 NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel::NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel()
-    : _func(nullptr), _input(nullptr), _bias(nullptr), _output(nullptr), _result_fixedpoint_multiplier(0), _result_shift(0), _result_offset_after_shift(0), _min(0), _max(0)
+    : _func(nullptr), _input(nullptr), _bias(nullptr), _output(nullptr), _result_fixedpoint_multiplier(0), _result_shift(0), _result_offset_after_shift(0), _min(0), _max(0), _gemm_3d_depth(1)
 {
 }
 
 void NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel::configure(const ITensor *input, const ITensor *bias, ITensor *output, int result_fixedpoint_multiplier, int result_shift,
-                                                                          int result_offset_after_shift, int min, int max)
+                                                                          int result_offset_after_shift, int min, int max, unsigned int gemm_3d_depth)
 {
     // Perform validate step
     ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
 
     // Output auto inizialitation if not yet initialized
-    auto_init_if_empty(*output->info(), input->info()->clone()->set_data_type(DataType::QASYMM8));
+    const TensorShape output_shape = arm_compute::misc::shape_calculator::compute_output_stage_shape(*input->info(), gemm_3d_depth);
+    auto_init_if_empty(*output->info(), input->info()->clone()->set_data_type(DataType::QASYMM8).set_tensor_shape(output_shape));
 
-    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(),
-                                                  (bias != nullptr) ? bias->info() : nullptr,
-                                                  output->info(),
-                                                  min,
-                                                  max));
+    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), (bias != nullptr) ? bias->info() : nullptr, output->info(),
+                                                  min, max, gemm_3d_depth));
 
     _input = input;
     _bias  = bias;
@@ -274,6 +304,7 @@ void NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel::configure(const
     _result_offset_after_shift = result_offset_after_shift;
     _min                       = min;
     _max                       = max;
+    _gemm_3d_depth             = gemm_3d_depth;
 
     // Configure kernel window
     auto win_config = validate_and_configure_window(input->info(), (bias != nullptr) ? bias->info() : nullptr, output->info());
@@ -285,10 +316,10 @@ void NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel::configure(const
     _func = is_bounded_relu ? &NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel::run<true> : &NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel::run<false>;
 }
 
-Status NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel::validate(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output, int min, int max)
+Status NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel::validate(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output, int min, int max, unsigned int gemm_3d_depth)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
-    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, bias, output, min, max));
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, bias, output, min, max, gemm_3d_depth));
     ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(),
                                                               (bias != nullptr) ? bias->clone().get() : nullptr,
                                                               output->clone().get())
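The kernel continues to bind its worker at configure time through a member-function pointer instantiated on the bounded-ReLU decision, so the hot loop carries no per-element branch. A reduced sketch of that dispatch pattern, with an illustrative class and an assumed clamping condition:

// Reduced sketch of configure-time dispatch (illustrative, not library code):
// a member-function pointer picks a template instantiation once, so the
// per-element loop pays nothing for the bounded-ReLU decision when false.
#include <algorithm>
#include <cstdio>

class QuantizeKernel
{
public:
    void configure(int min, int max)
    {
        _min = min;
        _max = max;
        // Assumed condition: clamp unless [min, max] already spans [0, 255]
        const bool is_bounded_relu = (min != max) && !(min == 0 && max == 255);
        _func = is_bounded_relu ? &QuantizeKernel::run<true> : &QuantizeKernel::run<false>;
    }

    void run_kernel(int value)
    {
        (this->*_func)(value);
    }

private:
    template <bool is_bounded_relu>
    void run(int value)
    {
        if(is_bounded_relu) // resolved per template instantiation
        {
            value = std::max(_min, std::min(_max, value));
        }
        std::printf("quantized: %d\n", value);
    }

    void (QuantizeKernel::*_func)(int) = nullptr;
    int _min = 0;
    int _max = 0;
};

int main()
{
    QuantizeKernel kernel;
    kernel.configure(0, 100);
    kernel.run_kernel(250); // prints 100
    return 0;
}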
diff --git a/src/runtime/NEON/functions/NEGEMMConvolutionLayer.cpp b/src/runtime/NEON/functions/NEGEMMConvolutionLayer.cpp
index 60e885c9d2..55b70ff193 100644
--- a/src/runtime/NEON/functions/NEGEMMConvolutionLayer.cpp
+++ b/src/runtime/NEON/functions/NEGEMMConvolutionLayer.cpp
@@ -90,8 +90,8 @@ void NEConvolutionLayerReshapeWeights::run()
 
 NEGEMMConvolutionLayer::NEGEMMConvolutionLayer(const std::shared_ptr<IMemoryManager> &memory_manager)
     : _memory_group(memory_manager), _reshape_weights(), _im2col_kernel(), _mm_gemm(), _mm_gemmlowp(memory_manager), _gemmlowp_output_stage(), _col2im_kernel(), _activationlayer_function(),
-      _add_bias_kernel(), _reshape_layer(), _original_weights(nullptr), _im2col_output(), _weights_reshaped(), _gemm_output(), _tmp_output(), _data_layout(DataLayout::NCHW), _append_bias(false),
-      _skip_im2col(false), _skip_col2im(false), _is_quantized(false), _is_activationlayer_enabled(false), _is_prepared(false)
+      _add_bias_kernel(), _original_weights(nullptr), _im2col_output(), _weights_reshaped(), _gemm_output(), _tmp_output(), _data_layout(DataLayout::NCHW), _append_bias(false), _skip_im2col(false),
+      _skip_col2im(false), _is_quantized(false), _is_activationlayer_enabled(false), _is_prepared(false)
 {
 }
@@ -265,7 +265,7 @@ void NEGEMMConvolutionLayer::configure(const ITensor *input, const ITensor *weig
         const DataType gemm_data_type = _is_quantized ? DataType::S32 : data_type;
         // FIXME: input->clone() doesn't work with subtensors for grouped convolutions.
         TensorInfo info_gemm(shape_gemm, 1, gemm_data_type);
-        info_gemm.set_quantization_info(output->info()->quantization_info());
+        info_gemm.set_quantization_info(output->info()->quantization_info()).set_data_layout(input->info()->data_layout());
         _gemm_output.allocator()->init(info_gemm);
         _memory_group.manage(&_gemm_output);
@@ -284,33 +284,29 @@ void NEGEMMConvolutionLayer::configure(const ITensor *input, const ITensor *weig
     // Configure output stage for quantized case
     if(_is_quantized)
     {
+        const bool             skip_reshape      = data_layout == DataLayout::NHWC;
         const QuantizationInfo output_quant_info = (output->info()->total_size() == 0) ? input->info()->quantization_info() : output->info()->quantization_info();
 
         float multiplier = input->info()->quantization_info().scale * weights->info()->quantization_info().scale / output_quant_info.scale;
         int   output_multiplier, output_shift;
         quantization::calculate_quantized_multiplier_less_than_one(multiplier, &output_multiplier, &output_shift);
 
-        _memory_group.manage(&_tmp_output);
-        gemm_output_staged_to_use = &_tmp_output;
+        if(!skip_reshape)
+        {
+            _memory_group.manage(&_tmp_output);
+            gemm_output_staged_to_use = &_tmp_output;
+        }
 
-        _gemmlowp_output_stage.configure(gemm_output_to_use, biases, gemm_output_staged_to_use, output_multiplier, output_shift, output_quant_info.offset);
+        _gemmlowp_output_stage.configure(gemm_output_to_use, biases, gemm_output_staged_to_use, output_multiplier, output_shift, output_quant_info.offset, 0, 0, skip_reshape ? conv_h : 1);
     }
 
-    if(!_skip_col2im)
+    if(!_skip_col2im && _data_layout == DataLayout::NCHW)
     {
-        if(_data_layout == DataLayout::NCHW)
-        {
-            // Configure col2im
-            _col2im_kernel.configure(_is_quantized ? gemm_output_staged_to_use : gemm_output_to_use, output, Size2D(conv_w, conv_h));
-        }
-        else
-        {
-            // Configure reshape layer
-            _reshape_layer.configure(_is_quantized ? gemm_output_staged_to_use : gemm_output_to_use, output);
-        }
+        // Configure col2im
+        _col2im_kernel.configure(_is_quantized ? gemm_output_staged_to_use : gemm_output_to_use, output, Size2D(conv_w, conv_h));
    }
 
-    if(_is_quantized)
+    if(_is_quantized && data_layout == DataLayout::NCHW)
    {
        _tmp_output.allocator()->allocate();
    }
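calculate_quantized_multiplier_less_than_one splits the real requantization scale in_scale * w_scale / out_scale into a Q31 fixed-point multiplier and a right shift. A rough standalone approximation of that decomposition, assuming a multiplier in (0, 1) and not claiming the library's exact rounding behaviour:

// Rough sketch of decomposing a real multiplier in (0, 1) into a Q31
// fixed-point multiplier and a right shift, in the spirit of
// calculate_quantized_multiplier_less_than_one (approximation, not the
// library's exact code).
#include <cmath>
#include <cstdint>
#include <cstdio>

void quantize_multiplier(float multiplier, int32_t *quantized, int *right_shift)
{
    int exponent = 0;
    const double q = std::frexp(multiplier, &exponent); // multiplier = q * 2^exponent, q in [0.5, 1)
    *right_shift = -exponent;
    auto q_fixed = static_cast<int64_t>(std::llround(q * (1ll << 31)));
    if(q_fixed == (1ll << 31)) // q rounded up to 1.0: renormalize
    {
        q_fixed /= 2;
        --(*right_shift);
    }
    *quantized = static_cast<int32_t>(q_fixed);
}

int main()
{
    int32_t m = 0;
    int     s = 0;
    quantize_multiplier(0.25f, &m, &s); // expect m = 2^30, s = 1
    std::printf("multiplier=%d shift=%d\n", m, s);
    return 0;
}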
@@ -452,7 +448,7 @@ Status NEGEMMConvolutionLayer::validate(const ITensorInfo *input, const ITensorI
         const DataType gemm_data_type = is_quantized ? DataType::S32 : data_type;
         // GEMM output should be S32 for acquiring raw integer accumulator without quantized postprocessing for quantized asymmetric input.
         info_gemm = TensorInfo(shape_gemm, 1, gemm_data_type);
-        info_gemm.set_quantization_info(output->quantization_info());
+        info_gemm.set_quantization_info(output->quantization_info()).set_data_layout(input->data_layout());
         gemm_output_to_use = &info_gemm;
     }
@@ -461,16 +457,20 @@ Status NEGEMMConvolutionLayer::validate(const ITensorInfo *input, const ITensorI
 
     if(is_quantized)
     {
-        float multiplier = input->quantization_info().scale * weights_to_use->quantization_info().scale / output->quantization_info().scale;
-        int   output_multiplier, output_shift;
+        const bool  skip_reshape = data_layout == DataLayout::NHWC;
+        const float multiplier   = input->quantization_info().scale * weights_to_use->quantization_info().scale / output->quantization_info().scale;
+        int         output_multiplier, output_shift;
         quantization::calculate_quantized_multiplier_less_than_one(multiplier, &output_multiplier, &output_shift);
 
-        tmp_info = TensorInfo(gemm_output_to_use->tensor_shape(), 1, DataType::QASYMM8);
-        tmp_info.set_quantization_info(output->quantization_info());
-        gemm_output_staged_to_use = &tmp_info;
+        if(!skip_reshape)
+        {
+            tmp_info = TensorInfo(gemm_output_to_use->tensor_shape(), 1, DataType::QASYMM8);
+            tmp_info.set_quantization_info(output->quantization_info());
+            gemm_output_staged_to_use = &tmp_info;
+        }
 
         // Validate output stage for quantized case
-        NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPoint::validate(gemm_output_to_use, biases, gemm_output_staged_to_use, output->quantization_info().offset);
+        NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPoint::validate(gemm_output_to_use, biases, gemm_output_staged_to_use, 0, 0, skip_reshape ? conv_h : 1);
     }
 
     // Validate Col2Im/ReshapeLayer
@@ -524,16 +524,9 @@ void NEGEMMConvolutionLayer::run()
     }
 
     // Reshape output matrix
-    if(!_skip_col2im)
+    if(!_skip_col2im && _data_layout == DataLayout::NCHW)
     {
-        if(_data_layout == DataLayout::NCHW)
-        {
-            NEScheduler::get().schedule(&_col2im_kernel, Window::DimY);
-        }
-        else
-        {
-            _reshape_layer.run();
-        }
+        NEScheduler::get().schedule(&_col2im_kernel, Window::DimY);
     }
 
     if(_is_activationlayer_enabled)
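With NHWC now writing straight into the final tensor, only the NCHW path still runs col2im. The sketch below illustrates, with hypothetical buffers rather than the library's kernel, the scatter col2im performs and the NHWC path now avoids:

// Illustrative col2im index arithmetic for NCHW (assumed layout: GEMM result
// holds one spatial position per row and one output channel per column).
#include <cstddef>
#include <cstdio>
#include <vector>

int main()
{
    const int conv_w = 3, conv_h = 2, channels = 2;

    std::vector<int> gemm(static_cast<std::size_t>(conv_w) * conv_h * channels);
    for(std::size_t i = 0; i < gemm.size(); ++i)
    {
        gemm[i] = static_cast<int>(i); // fake GEMM result
    }

    // Scatter: dst[c][h][w] takes GEMM row (w + h * conv_w), column c
    std::vector<int> dst(gemm.size());
    for(int row = 0; row < conv_w * conv_h; ++row)
    {
        const int w = row % conv_w;
        const int h = row / conv_w;
        for(int c = 0; c < channels; ++c)
        {
            dst[(c * conv_h + h) * conv_w + w] = gemm[row * channels + c];
        }
    }
    std::printf("dst[0] = %d\n", dst[0]);
    return 0;
}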
diff --git a/src/runtime/NEON/functions/NEGEMMLowpOutputStage.cpp b/src/runtime/NEON/functions/NEGEMMLowpOutputStage.cpp
index 8c02436bec..cb7004992b 100644
--- a/src/runtime/NEON/functions/NEGEMMLowpOutputStage.cpp
+++ b/src/runtime/NEON/functions/NEGEMMLowpOutputStage.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -43,14 +43,14 @@ Status NEGEMMLowpQuantizeDownInt32ToUint8Scale::validate(const ITensorInfo *inpu
 }
 
 void NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPoint::configure(const ITensor *input, const ITensor *bias, ITensor *output, int result_fixedpoint_multiplier, int result_shift,
-                                                                    int result_offset_after_shift, int min, int max)
+                                                                    int result_offset_after_shift, int min, int max, unsigned int gemm_3d_depth)
 {
     auto k = arm_compute::support::cpp14::make_unique<NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel>();
-    k->configure(input, bias, output, result_fixedpoint_multiplier, result_shift, result_offset_after_shift, min, max);
+    k->configure(input, bias, output, result_fixedpoint_multiplier, result_shift, result_offset_after_shift, min, max, gemm_3d_depth);
     _kernel = std::move(k);
 }
 
-Status NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPoint::validate(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output, int min, int max)
+Status NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPoint::validate(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output, int min, int max, unsigned int gemm_3d_depth)
 {
-    return NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel::validate(input, bias, output, min, max);
+    return NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel::validate(input, bias, output, min, max, gemm_3d_depth);
 }
\ No newline at end of file
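A hypothetical end-to-end use of the extended function-level API, with placeholder shapes and quantization constants chosen only to exercise gemm_3d_depth (the multiplier, shift and offset would normally come from the layer's quantization parameters):

// Hypothetical usage sketch: quantize a 2D S32 GEMM result of shape (C, W*H)
// directly into a 3D QASYMM8 tensor of shape (C, W, H) via gemm_3d_depth = H.
#include "arm_compute/runtime/NEON/functions/NEGEMMLowpOutputStage.h"
#include "arm_compute/runtime/Tensor.h"

using namespace arm_compute;

int main()
{
    Tensor gemm_result, bias, output;
    gemm_result.allocator()->init(TensorInfo(TensorShape(32U, 12U), 1, DataType::S32));
    bias.allocator()->init(TensorInfo(TensorShape(32U), 1, DataType::S32));
    output.allocator()->init(TensorInfo(TensorShape(32U, 4U, 3U), 1, DataType::QASYMM8));

    NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPoint output_stage;
    // Placeholder quantization constants; 12 rows split into depth 3 -> height 4
    output_stage.configure(&gemm_result, &bias, &output,
                           1073741824 /* multiplier */, 1 /* shift */, 10 /* offset */,
                           0 /* min */, 255 /* max */, 3U /* gemm_3d_depth */);

    gemm_result.allocator()->allocate();
    bias.allocator()->allocate();
    output.allocator()->allocate();

    output_stage.run();
    return 0;
}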