From c28d42837b2aea09738a7df00653d623c3c53420 Mon Sep 17 00:00:00 2001 From: Michalis Spyrou Date: Wed, 4 Mar 2020 15:30:41 +0000 Subject: COMPMID-3156: Remove padding from NEDepthConcatenateLayerKernel Change-Id: I875a116a2527f19774c80e0da3153264564c960d Signed-off-by: Michalis Spyrou Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/2829 Tested-by: Arm Jenkins Reviewed-by: Michele Di Giorgio --- .../NEON/kernels/NEDepthConcatenateLayerKernel.h | 4 +- .../NEON/kernels/NEDepthConcatenateLayerKernel.cpp | 94 +++++++++++++--------- 2 files changed, 57 insertions(+), 41 deletions(-) diff --git a/arm_compute/core/NEON/kernels/NEDepthConcatenateLayerKernel.h b/arm_compute/core/NEON/kernels/NEDepthConcatenateLayerKernel.h index ddbd0983e4..6690ac2236 100644 --- a/arm_compute/core/NEON/kernels/NEDepthConcatenateLayerKernel.h +++ b/arm_compute/core/NEON/kernels/NEDepthConcatenateLayerKernel.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2019 ARM Limited. + * Copyright (c) 2017-2020 ARM Limited. * * SPDX-License-Identifier: MIT * @@ -79,7 +79,7 @@ public: void run(const Window &window, const ThreadInfo &info) override; private: - using DepthConcatFunction = void(const ITensor *in, ITensor *out, int depth_offset, const Window &window); + using DepthConcatFunction = void(const ITensor *in, ITensor *out, unsigned int depth_offset, const Window &window); private: DepthConcatFunction *_func; diff --git a/src/core/NEON/kernels/NEDepthConcatenateLayerKernel.cpp b/src/core/NEON/kernels/NEDepthConcatenateLayerKernel.cpp index 56ab11415c..3ac043ad42 100644 --- a/src/core/NEON/kernels/NEDepthConcatenateLayerKernel.cpp +++ b/src/core/NEON/kernels/NEDepthConcatenateLayerKernel.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2019 ARM Limited. + * Copyright (c) 2017-2020 ARM Limited. * * SPDX-License-Identifier: MIT * @@ -37,12 +37,12 @@ #include -using namespace arm_compute; - +namespace arm_compute +{ namespace { template -void depth_concat(const ITensor *in, ITensor *out, int depth_offset, const Window &window) +void depth_concat(const ITensor *in, ITensor *out, unsigned int depth_offset, const Window &window) { // Offset input uint8_t *input_ptr = in->buffer() + in->info()->offset_first_element_in_bytes(); @@ -50,64 +50,81 @@ void depth_concat(const ITensor *in, ITensor *out, int depth_offset, const Windo // Offset output uint8_t *output_ptr = out->buffer() + out->info()->offset_first_element_in_bytes() + depth_offset * out->info()->strides_in_bytes()[2]; - Iterator input(in, window); - Iterator output(out, window); + const auto window_start_x = static_cast(window.x().start()); + const auto window_end_x = static_cast(window.x().end()); + const int window_step_x = 16 / out->info()->element_size(); + + Window win{ window }; + win.set(Window::DimX, Window::Dimension(0, 1, 1)); + win.set(Window::DimZ, Window::Dimension(0, in->info()->tensor_shape().z(), 1)); + + Iterator input(in, win); + Iterator output(out, win); const DataType dt = in->info()->data_type(); const UniformQuantizationInfo input_qinfo = in->info()->quantization_info().uniform(); const UniformQuantizationInfo output_qinfo = out->info()->quantization_info().uniform(); if(dt == DataType::QASYMM8 && input_qinfo != output_qinfo) { - execute_window_loop(window, [&](const Coordinates &) + execute_window_loop(win, [&](const Coordinates &) { const auto in_ptr = reinterpret_cast(input_ptr + input.offset()); const auto out_ptr = reinterpret_cast(output_ptr + output.offset()); - vst1q_u8(out_ptr, vquantize(vdequantize(vld1q_u8(in_ptr), input_qinfo), output_qinfo)); + int x = window_start_x; + for(; x <= (window_end_x - window_step_x); x += window_step_x) + { + wrapper::vstore(out_ptr + x, vquantize(vdequantize(wrapper::vloadq(in_ptr + x), input_qinfo), output_qinfo)); + } + + // Compute left-over elements + for(; x < window_end_x; ++x) + { + *(out_ptr + x) = quantize_qasymm8(dequantize_qasymm8(*(in_ptr + x), input_qinfo), output_qinfo); + } }, input, output); } else if(dt == DataType::QASYMM8_SIGNED && input_qinfo != output_qinfo) { - execute_window_loop(window, [&](const Coordinates &) + execute_window_loop(win, [&](const Coordinates &) { const auto in_ptr = reinterpret_cast(input_ptr + input.offset()); const auto out_ptr = reinterpret_cast(output_ptr + output.offset()); - vst1q_s8(out_ptr, vquantize_signed(vdequantize(vld1q_s8(in_ptr), input_qinfo), output_qinfo)); + int x = window_start_x; + for(; x <= (window_end_x - window_step_x); x += window_step_x) + { + wrapper::vstore(out_ptr + x, vquantize_signed(vdequantize(wrapper::vloadq(in_ptr + x), input_qinfo), output_qinfo)); + } + + // Compute left-over elements + for(; x < window_end_x; ++x) + { + *(out_ptr + x) = quantize_qasymm8_signed(dequantize_qasymm8_signed(*(in_ptr + x), input_qinfo), output_qinfo); + } }, input, output); } else { - execute_window_loop(window, [&](const Coordinates &) + execute_window_loop(win, [&](const Coordinates &) { const auto in_ptr = reinterpret_cast(input_ptr + input.offset()); const auto out_ptr = reinterpret_cast(output_ptr + output.offset()); - - wrapper::vstore(out_ptr, wrapper::vloadq(in_ptr)); + int x = window_start_x; + for(; x <= (window_end_x - window_step_x); x += window_step_x) + { + wrapper::vstore(out_ptr + x, wrapper::vloadq(in_ptr + x)); + } + // Compute left-over elements + for(; x < window_end_x; ++x) + { + *(out_ptr + x) = *(in_ptr + x); + } }, input, output); } } -std::pair validate_and_configure_window(ITensorInfo *input, unsigned int depth_offset, ITensorInfo *output) -{ - ARM_COMPUTE_UNUSED(depth_offset); - - const unsigned int num_elems_processed_per_iteration = 16 / input->element_size(); - - // The window needs to be based on input as we copy all the depths of input - Window win = calculate_max_window(*output, Steps(num_elems_processed_per_iteration)); - win.set(Window::DimZ, Window::Dimension(0, input->tensor_shape().z(), 1)); - - AccessWindowHorizontal input_access(input, 0, num_elems_processed_per_iteration); - AccessWindowHorizontal output_access(output, 0, num_elems_processed_per_iteration); - bool window_changed = update_window_and_padding(win, input_access, output_access); - output_access.set_valid_region(win, ValidRegion(Coordinates(), output->tensor_shape())); - - Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{}; - return std::make_pair(err, win); -} - Status validate_arguments(const ITensorInfo *input, unsigned int depth_offset, const ITensorInfo *output) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output); @@ -158,13 +175,12 @@ void NEDepthConcatenateLayerKernel::configure(const ITensor *input, unsigned int } // Configure kernel window - auto win_config = validate_and_configure_window(input->info(), depth_offset, output->info()); - ARM_COMPUTE_ERROR_THROW_ON(std::get<0>(win_config)); - - INEKernel::configure(std::get<1>(win_config)); + Window win = calculate_max_window(*output->info(), Steps()); + Coordinates coord; + coord.set_num_dimensions(output->info()->num_dimensions()); - // Set output valid region - output->info()->set_valid_region(ValidRegion(Coordinates(), output->info()->tensor_shape())); + output->info()->set_valid_region(ValidRegion(coord, output->info()->tensor_shape())); + INEKernel::configure(win); } Status NEDepthConcatenateLayerKernel::validate(const arm_compute::ITensorInfo *input, @@ -172,7 +188,6 @@ Status NEDepthConcatenateLayerKernel::validate(const arm_compute::ITensorInfo *i const arm_compute::ITensorInfo *output) { ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, depth_offset, output)); - ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(), depth_offset, output->clone().get()).first); return Status{}; } @@ -185,3 +200,4 @@ void NEDepthConcatenateLayerKernel::run(const Window &window, const ThreadInfo & (*_func)(_input, _output, _depth_offset, window); } +} // namespace arm_compute -- cgit v1.2.1