From ac4e873dad6aa6291fc36aff62047a896db04f6a Mon Sep 17 00:00:00 2001 From: Georgios Pinitas Date: Wed, 5 Jul 2017 17:02:25 +0100 Subject: COMPMID-417: Port DepthConcatenate to QS8/QS16 for NEON/CL. Change-Id: I3dddae63043c7aa18d908a4fc8abacf3c64f98ca Reviewed-on: http://mpd-gerrit.cambridge.arm.com/80081 Tested-by: Kaizen Reviewed-by: Steven Niu --- src/core/NEON/kernels/NEDepthConcatenateKernel.cpp | 108 ++++++++++++++++----- 1 file changed, 86 insertions(+), 22 deletions(-) (limited to 'src/core/NEON') diff --git a/src/core/NEON/kernels/NEDepthConcatenateKernel.cpp b/src/core/NEON/kernels/NEDepthConcatenateKernel.cpp index 902490ec38..d58e4e0aa5 100644 --- a/src/core/NEON/kernels/NEDepthConcatenateKernel.cpp +++ b/src/core/NEON/kernels/NEDepthConcatenateKernel.cpp @@ -27,17 +27,76 @@ #include "arm_compute/core/Helpers.h" #include "arm_compute/core/IAccessWindow.h" #include "arm_compute/core/ITensor.h" +#include "arm_compute/core/NEON/NEFixedPoint.h" #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Utils.h" #include "arm_compute/core/Validate.h" #include "arm_compute/core/Window.h" #include +#include using namespace arm_compute; +namespace +{ +// Overloads of 128-bit vector loads +uint8x16_t loadq(const uint8_t *ptr) +{ + return vld1q_u8(ptr); +} +uint16x8_t loadq(const uint16_t *ptr) +{ + return vld1q_u16(ptr); +} +uint32x4_t loadq(const uint32_t *ptr) +{ + return vld1q_u32(ptr); +} +// Overloads of 128-bit vector stores +void storeq(uint8_t *ptr, uint8x16_t val) +{ + return vst1q_u8(ptr, val); +} +void storeq(uint16_t *ptr, uint16x8_t val) +{ + return vst1q_u16(ptr, val); +} +void storeq(uint32_t *ptr, uint32x4_t val) +{ + return vst1q_u32(ptr, val); +} + +template +void depth_concat(const ITensor *in, ITensor *out, std::pair start_xy, int depth_offset, const Window &window) +{ + const int start_x = start_xy.first; + const int start_y = start_xy.second; + + // Offset input + const int input_offset_to_first_elements_in_bytes = in->info()->offset_first_element_in_bytes() - start_x * in->info()->strides_in_bytes()[0] - start_y * in->info()->strides_in_bytes()[1]; + uint8_t *input_ptr = in->buffer() + input_offset_to_first_elements_in_bytes; + + // Offset output + const unsigned int output_offset_to_first_elements_in_bytes = out->info()->offset_first_element_in_bytes() + depth_offset * out->info()->strides_in_bytes()[2]; + uint8_t *output_ptr = out->buffer() + output_offset_to_first_elements_in_bytes; + + Iterator input(in, window); + Iterator output(out, window); + + execute_window_loop(window, [&](const Coordinates & id) + { + const auto in_ptr = reinterpret_cast(input_ptr + input.offset()); + const auto out_ptr = reinterpret_cast(output_ptr + output.offset()); + + storeq(out_ptr, loadq(in_ptr)); + }, + input, output); +} +} // namespace + NEDepthConcatenateKernel::NEDepthConcatenateKernel() - : _input(nullptr), _output(nullptr), _top_bottom(0), _left_right(0), _depth_offset(0) + : _func(nullptr), _input(nullptr), _output(nullptr), _top_bottom(0), _left_right(0), _depth_offset(0) { } @@ -48,8 +107,9 @@ BorderSize NEDepthConcatenateKernel::border_size() const void NEDepthConcatenateKernel::configure(const ITensor *input, unsigned int depth_offset, ITensor *output) { - ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32); - ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::F32); + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::QS16, DataType::F16, DataType::F32); + ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); + ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT_POSITION(input, output); ARM_COMPUTE_ERROR_ON(input->info()->dimension(2) + depth_offset > output->info()->dimension(2)); ARM_COMPUTE_ERROR_ON(input->info()->dimension(0) > output->info()->dimension(0)); ARM_COMPUTE_ERROR_ON(input->info()->dimension(1) > output->info()->dimension(1)); @@ -60,18 +120,36 @@ void NEDepthConcatenateKernel::configure(const ITensor *input, unsigned int dept ARM_COMPUTE_ERROR_ON((output->info()->dimension(0) - input->info()->dimension(0)) % 2); ARM_COMPUTE_ERROR_ON((output->info()->dimension(1) - input->info()->dimension(1)) % 2); + _func = nullptr; _input = input; _output = output; _depth_offset = depth_offset; _left_right = (output->info()->dimension(0) - input->info()->dimension(0)) / 2; _top_bottom = (output->info()->dimension(1) - input->info()->dimension(1)) / 2; - const unsigned int num_elems_processed_per_iteration = 4; - const unsigned int num_elems_read_per_iteration = 4; + switch(input->info()->data_type()) + { + case DataType::QS8: + _func = &depth_concat; + break; + case DataType::QS16: + case DataType::F16: + _func = &depth_concat; + break; + case DataType::F32: + _func = &depth_concat; + break; + default: + ARM_COMPUTE_ERROR("Unsupported data type."); + } + + const unsigned int num_elems_processed_per_iteration = 16 / input->info()->element_size(); + const unsigned int num_elems_read_per_iteration = 16 / input->info()->element_size(); const unsigned int num_rows_read_per_iteration = 1; // The window needs to be based on input as we copy all the depths of input - Window win = calculate_max_enlarged_window(*input->info(), Steps(num_elems_processed_per_iteration), border_size()); + Window win = calculate_max_window(*output->info(), Steps(num_elems_processed_per_iteration)); + win.set(Window::DimZ, Window::Dimension(0, input->info()->tensor_shape().z(), 1)); AccessWindowRectangle input_access(input->info(), -_left_right, -_top_bottom, num_elems_read_per_iteration, num_rows_read_per_iteration); AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration); @@ -85,21 +163,7 @@ void NEDepthConcatenateKernel::run(const Window &window) { ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window); + ARM_COMPUTE_ERROR_ON(_func == nullptr); - // Offset output - const unsigned int offset_to_first_elements_in_bytes = _output->info()->offset_first_element_in_bytes() + _left_right * _output->info()->strides_in_bytes()[0] + _top_bottom * - _output->info()->strides_in_bytes()[1] + _depth_offset * _output->info()->strides_in_bytes()[2]; - uint8_t *output_ptr = _output->buffer() + offset_to_first_elements_in_bytes; - - Iterator input(_input, window); - Iterator output(_output, window); - - execute_window_loop(window, [&](const Coordinates & id) - { - const auto in_ptr = reinterpret_cast(input.ptr()); - const auto out_ptr = reinterpret_cast(output_ptr + output.offset()); - - vst1q_f32(out_ptr, vld1q_f32(in_ptr)); - }, - input, output); + (*_func)(_input, _output, std::make_pair(_left_right, _top_bottom), _depth_offset, window); } -- cgit v1.2.1