diff options
Diffstat (limited to 'src/core/NEON')
4 files changed, 359 insertions, 140 deletions
diff --git a/src/core/NEON/kernels/NEDepthwiseIm2ColKernel.cpp b/src/core/NEON/kernels/NEDepthwiseIm2ColKernel.cpp index 2ceb39d217..b924d9f8bd 100644 --- a/src/core/NEON/kernels/NEDepthwiseIm2ColKernel.cpp +++ b/src/core/NEON/kernels/NEDepthwiseIm2ColKernel.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017 ARM Limited. + * Copyright (c) 2017-2018 ARM Limited. * * SPDX-License-Identifier: MIT * @@ -37,40 +37,9 @@ using namespace arm_compute; -NEDepthwiseIm2ColKernel::NEDepthwiseIm2ColKernel() - : _input(nullptr), _output(nullptr), _kernel_dims(), _conv_info(), _has_bias() -{ -} - -void NEDepthwiseIm2ColKernel::configure(const ITensor *input, ITensor *output, const Size2D &kernel_dims, const PadStrideInfo &conv_info, bool has_bias) -{ - ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32); - ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); - ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT(input, output); - ARM_COMPUTE_ERROR_ON(input->info()->dimension(2) != output->info()->dimension(2)); - ARM_COMPUTE_ERROR_ON(output->info()->dimension(0) != (kernel_dims.width * kernel_dims.height + ((has_bias) ? 
1 : 0))); - - _input = input; - _output = output; - _kernel_dims = kernel_dims; - _conv_info = conv_info; - _has_bias = has_bias; - - // Configure kernel window - Window win = calculate_max_window(*input->info(), Steps()); - - // The NEDepthwiseIm2ColKernel doesn't need padding so update_window_and_padding() can be skipped - output->info()->set_valid_region(ValidRegion(Coordinates(), output->info()->tensor_shape())); - - INEKernel::configure(win); -} - -void NEDepthwiseIm2ColKernel::run(const Window &window, const ThreadInfo &info) +template <typename T> +void NEDepthwiseIm2ColKernel::run_generic(const Window &window) { - ARM_COMPUTE_UNUSED(info); - ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); - - //const int kernel_depth = _input->info()->dimension(2); const int input_w = _input->info()->dimension(0); const int input_h = _input->info()->dimension(1); const int input_stride_x = _input->info()->strides_in_bytes().x(); @@ -101,6 +70,13 @@ void NEDepthwiseIm2ColKernel::run(const Window &window, const ThreadInfo &info) const int full_length = input_w + pad_left + pad_right; const int max_initial_x = stride_x * (((full_length - _kernel_dims.width) / stride_x) + 1); + // Define pad value + auto zero = static_cast<T>(0); + if(std::is_same<T, uint8_t>::value) + { + zero = _input->info()->quantization_info().offset; + } + execute_window_loop(window_out, [&](const Coordinates & id) { const int src_pixel_linear = id.y() * stride_x; @@ -110,7 +86,7 @@ void NEDepthwiseIm2ColKernel::run(const Window &window, const ThreadInfo &info) // Get pointers const uint8_t *const input_ptr = in.ptr() + id.z() * input_stride_z; - auto output_ptr = reinterpret_cast<float *>(out.ptr()); + auto output_ptr = reinterpret_cast<T *>(out.ptr()); const int height = src_y + _kernel_dims.height; const int width = src_x + _kernel_dims.width; @@ -120,19 +96,76 @@ void NEDepthwiseIm2ColKernel::run(const Window &window, const ThreadInfo &info) { if(x < 0 || x >= input_w || y < 0 || y >= input_h) { - 
*output_ptr = 0; + *output_ptr = zero; } else { - *output_ptr = *(reinterpret_cast<const float *>(input_ptr + x * input_stride_x + y * input_stride_y)); + *output_ptr = *(reinterpret_cast<const T *>(input_ptr + x * input_stride_x + y * input_stride_y)); } } } if(_has_bias) { - *output_ptr = static_cast<float>(1); + *output_ptr = static_cast<T>(1); } }, in, out); } + +NEDepthwiseIm2ColKernel::NEDepthwiseIm2ColKernel() + : _func(nullptr), _input(nullptr), _output(nullptr), _kernel_dims(), _conv_info(), _has_bias() +{ +} + +void NEDepthwiseIm2ColKernel::configure(const ITensor *input, ITensor *output, const Size2D &kernel_dims, const PadStrideInfo &conv_info, bool has_bias) +{ + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::F16, DataType::F32); + ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); + ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT(input, output); + ARM_COMPUTE_ERROR_ON(is_data_type_quantized_asymmetric(input->info()->data_type()) && has_bias); + ARM_COMPUTE_ERROR_ON(input->info()->dimension(2) != output->info()->dimension(2)); + ARM_COMPUTE_ERROR_ON(output->info()->dimension(0) != (kernel_dims.width * kernel_dims.height + ((has_bias) ? 
1 : 0))); + + _input = input; + _output = output; + _kernel_dims = kernel_dims; + _conv_info = conv_info; + _has_bias = has_bias; + + // Configure kernel window + Window win = calculate_max_window(*input->info(), Steps()); + + // Set appropriate function to run + switch(input->info()->data_type()) + { + case DataType::QASYMM8: + _func = &NEDepthwiseIm2ColKernel::run_generic<uint8_t>; + break; + case DataType::F16: + _func = &NEDepthwiseIm2ColKernel::run_generic<half>; + break; + case DataType::F32: + _func = &NEDepthwiseIm2ColKernel::run_generic<float>; + break; + default: + ARM_COMPUTE_ERROR("Unsupported data type"); + } + + // The NEDepthwiseIm2ColKernel doesn't need padding so update_window_and_padding() can be skipped + output->info()->set_valid_region(ValidRegion(Coordinates(), output->info()->tensor_shape())); + + INEKernel::configure(win); +} + +void NEDepthwiseIm2ColKernel::run(const Window &window, const ThreadInfo &info) +{ + ARM_COMPUTE_UNUSED(info); + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window); + + if(_func != nullptr) + { + (this->*_func)(window); + } +} diff --git a/src/core/NEON/kernels/NEDepthwiseVectorToTensorKernel.cpp b/src/core/NEON/kernels/NEDepthwiseVectorToTensorKernel.cpp index 9b36df3c39..8960d8a8af 100644 --- a/src/core/NEON/kernels/NEDepthwiseVectorToTensorKernel.cpp +++ b/src/core/NEON/kernels/NEDepthwiseVectorToTensorKernel.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017 ARM Limited. + * Copyright (c) 2017-2018 ARM Limited. 
* * SPDX-License-Identifier: MIT * @@ -37,14 +37,46 @@ using namespace arm_compute; +template <typename T> +void NEDepthwiseVectorToTensorKernel::vector_to_tensor(const Window &window) +{ + // const int input_w = _input->info()->dimension(0); + const int output_stride_x = _output->info()->strides_in_bytes().x(); + const int output_stride_y = _output->info()->strides_in_bytes().y(); + const int output_stride_z = _output->info()->strides_in_bytes().z(); + + // Setup output window + Window window_out(window); + window_out.set(Window::DimX, Window::Dimension(0, 0, 0)); + window_out.set(Window::DimY, Window::Dimension(0, 0, 0)); + window_out.set(Window::DimZ, Window::Dimension(0, 0, 0)); + + Iterator in(_input, window); + Iterator out(_output, window_out); + + const int patch_size = _conv_dims.first * _conv_dims.second; + + execute_window_loop(window, [&](const Coordinates & id) + { + const int z = id.x() / patch_size; + const int index2D = id.x() - z * patch_size; + + auto input_ptr = reinterpret_cast<T *>(in.ptr()); + auto output_ptr = reinterpret_cast<T *>(out.ptr() + index2D % _conv_dims.first * output_stride_x + index2D / _conv_dims.first * output_stride_y + z * output_stride_z); + + *output_ptr = *input_ptr; + }, + in, out); +} + NEDepthwiseVectorToTensorKernel::NEDepthwiseVectorToTensorKernel() - : _input(nullptr), _output(nullptr), _conv_dims() + : _func(nullptr), _input(nullptr), _output(nullptr), _conv_dims() { } void NEDepthwiseVectorToTensorKernel::configure(const ITensor *input, ITensor *output, size_t conv_w, size_t conv_h) { - ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32); + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::S32, DataType::F16, DataType::F32); ARM_COMPUTE_ERROR_ON_NULLPTR(output); TensorShape output_shape = input->info()->tensor_shape(); @@ -53,7 +85,7 @@ void NEDepthwiseVectorToTensorKernel::configure(const ITensor *input, ITensor *o output_shape.set(2, 
input->info()->tensor_shape()[0] / (conv_w * conv_h)); // Output auto inizialitation if not yet initialized - auto_init_if_empty(*output->info(), output_shape, 1, input->info()->data_type(), input->info()->fixed_point_position()); + auto_init_if_empty(*output->info(), input->info()->clone()->set_tensor_shape(output_shape)); ARM_COMPUTE_ERROR_ON_MISMATCHING_DIMENSIONS(output->info()->tensor_shape(), output_shape); ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); @@ -63,6 +95,25 @@ void NEDepthwiseVectorToTensorKernel::configure(const ITensor *input, ITensor *o _output = output; _conv_dims = std::pair<size_t, size_t>(conv_w, conv_h); + // Set appropriate function to run + switch(input->info()->data_type()) + { + case DataType::QASYMM8: + _func = &NEDepthwiseVectorToTensorKernel::vector_to_tensor<uint8_t>; + break; + case DataType::S32: + _func = &NEDepthwiseVectorToTensorKernel::vector_to_tensor<int32_t>; + break; + case DataType::F16: + _func = &NEDepthwiseVectorToTensorKernel::vector_to_tensor<half>; + break; + case DataType::F32: + _func = &NEDepthwiseVectorToTensorKernel::vector_to_tensor<float>; + break; + default: + ARM_COMPUTE_ERROR("Unsupported data type"); + } + // Configure kernel window Window win = calculate_max_window(*input->info(), Steps()); // The NEDepthwisevectorToTensorKernel doesn't need padding so update_window_and_padding() can be skipped @@ -75,32 +126,10 @@ void NEDepthwiseVectorToTensorKernel::run(const Window &window, const ThreadInfo { ARM_COMPUTE_UNUSED(info); ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window); - // const int input_w = _input->info()->dimension(0); - const int output_stride_x = _output->info()->strides_in_bytes().x(); - const int output_stride_y = _output->info()->strides_in_bytes().y(); - const int output_stride_z = _output->info()->strides_in_bytes().z(); - - // Setup output window - Window window_out(window); - window_out.set(Window::DimX, 
Window::Dimension(0, 0, 0)); - window_out.set(Window::DimY, Window::Dimension(0, 0, 0)); - window_out.set(Window::DimZ, Window::Dimension(0, 0, 0)); - - Iterator in(_input, window); - Iterator out(_output, window_out); - - const int patch_size = _conv_dims.first * _conv_dims.second; - - execute_window_loop(window, [&](const Coordinates & id) + if(_func != nullptr) { - const int z = id.x() / patch_size; - const int index2D = id.x() - z * patch_size; - - auto input_ptr = reinterpret_cast<float *>(in.ptr()); - auto output_ptr = reinterpret_cast<float *>(out.ptr() + index2D % _conv_dims.first * output_stride_x + index2D / _conv_dims.first * output_stride_y + z * output_stride_z); - - *output_ptr = *input_ptr; - }, - in, out); + (this->*_func)(window); + } } diff --git a/src/core/NEON/kernels/NEDepthwiseWeightsReshapeKernel.cpp b/src/core/NEON/kernels/NEDepthwiseWeightsReshapeKernel.cpp index 6585fdb8b8..36b17bfc4c 100644 --- a/src/core/NEON/kernels/NEDepthwiseWeightsReshapeKernel.cpp +++ b/src/core/NEON/kernels/NEDepthwiseWeightsReshapeKernel.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017 ARM Limited. + * Copyright (c) 2017-2018 ARM Limited. 
* * SPDX-License-Identifier: MIT * @@ -37,16 +37,59 @@ using namespace arm_compute; +namespace +{ +template <typename T> +void weights_reshape(const ITensor *input, const ITensor *bias, ITensor *output, const Window &window) +{ + const int input_w = input->info()->dimension(0); + const int output_stride_x = output->info()->strides_in_bytes().x(); + const int output_stride_y = output->info()->strides_in_bytes().y(); + + Window window_in(window); + // The first three dimensions of the input are increased by the inner loops + window_in.set(Window::DimX, Window::Dimension(0, input->info()->dimension(0), input->info()->dimension(0))); + window_in.set(Window::DimY, Window::Dimension(0, input->info()->dimension(1), 1)); + window_in.set(Window::DimZ, Window::Dimension(0, input->info()->dimension(2), 1)); + + // Setup output window + Window window_out; + window_out.set(Window::DimX, Window::Dimension(0, 0, 0)); + window_out.set(Window::DimY, Window::Dimension(0, 0, 0)); + + Iterator in(input, window_in); + Iterator out(output, window_out); + + execute_window_loop(window_in, [&](const Coordinates & id) + { + auto input_ptr = reinterpret_cast<T *>(in.ptr()); + auto output_ptr = reinterpret_cast<T *>(out.ptr() + id.y() * input_w * output_stride_x + id.z() * output_stride_y); + + for(int i = 0; i < input_w; ++i, ++input_ptr) + { + *(output_ptr + i) = *input_ptr; + } + + if(bias != nullptr) + { + *(output_ptr + input_w) = *(reinterpret_cast<T *>(bias->ptr_to_element(Coordinates(id.z())))); + } + }, + in, out); +} +} // namespace + NEDepthwiseWeightsReshapeKernel::NEDepthwiseWeightsReshapeKernel() - : _input(nullptr), _output(nullptr), _biases(nullptr) + : _func(nullptr), _input(nullptr), _output(nullptr), _biases(nullptr) { } void NEDepthwiseWeightsReshapeKernel::configure(const ITensor *input, ITensor *output, const ITensor *biases) { - ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32); + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 
1, DataType::QASYMM8, DataType::F16, DataType::F32); ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT(input, output); + ARM_COMPUTE_ERROR_ON(is_data_type_quantized_asymmetric(input->info()->data_type()) && (biases != nullptr)); ARM_COMPUTE_ERROR_ON(input->info()->dimension(2) != output->info()->dimension(1)); ARM_COMPUTE_ERROR_ON(output->info()->dimension(0) != (input->info()->dimension(0) * input->info()->dimension(1) + ((biases != nullptr) ? 1 : 0))); @@ -62,6 +105,30 @@ void NEDepthwiseWeightsReshapeKernel::configure(const ITensor *input, ITensor *o _output = output; _biases = biases; + switch(_input->info()->element_size()) + { + case 4: + { + _func = &weights_reshape<uint32_t>; + break; + } + case 2: + { + _func = &weights_reshape<uint16_t>; + break; + } + case 1: + { + _func = &weights_reshape<uint8_t>; + break; + } + default: + { + ARM_COMPUTE_ERROR("Element size not supported"); + break; + } + } + // Configure kernel window Window win = calculate_max_window(*input->info(), Steps()); // The NEDepthwiseWeightsReshapeKernel doesn't need padding so update_window_and_padding() can be skipped @@ -74,39 +141,10 @@ void NEDepthwiseWeightsReshapeKernel::run(const Window &window, const ThreadInfo { ARM_COMPUTE_UNUSED(info); ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window); - const int input_w = _input->info()->dimension(0); - const int output_stride_x = _output->info()->strides_in_bytes().x(); - const int output_stride_y = _output->info()->strides_in_bytes().y(); - - Window window_in(window); - // The first three dimensions of the input are increased by the inner loops - window_in.set(Window::DimX, Window::Dimension(0, _input->info()->dimension(0), _input->info()->dimension(0))); - window_in.set(Window::DimY, Window::Dimension(0, _input->info()->dimension(1), 1)); - window_in.set(Window::DimZ, Window::Dimension(0, 
_input->info()->dimension(2), 1)); - - // Setup output window - Window window_out; - window_out.set(Window::DimX, Window::Dimension(0, 0, 0)); - window_out.set(Window::DimY, Window::Dimension(0, 0, 0)); - - Iterator in(_input, window_in); - Iterator out(_output, window_out); - - execute_window_loop(window_in, [&](const Coordinates & id) + if(_func != nullptr) { - auto input_ptr = reinterpret_cast<float *>(in.ptr()); - auto output_ptr = reinterpret_cast<float *>(out.ptr() + id.y() * input_w * output_stride_x + id.z() * output_stride_y); - - for(int i = 0; i < input_w; ++i, ++input_ptr) - { - *(output_ptr + i) = *input_ptr; - } - - if(_biases != nullptr) - { - *(output_ptr + input_w) = *(reinterpret_cast<float *>(_biases->ptr_to_element(Coordinates(id.z())))); - } - }, - in, out); + (*_func)(_input, _biases, _output, window); + } } diff --git a/src/core/NEON/kernels/NEGEMMMatrixVectorMultiplyKernel.cpp b/src/core/NEON/kernels/NEGEMMMatrixVectorMultiplyKernel.cpp index fe79df2528..c1e975e77e 100644 --- a/src/core/NEON/kernels/NEGEMMMatrixVectorMultiplyKernel.cpp +++ b/src/core/NEON/kernels/NEGEMMMatrixVectorMultiplyKernel.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016, 2017 ARM Limited. + * Copyright (c) 2016-2018 ARM Limited. 
* * SPDX-License-Identifier: MIT * @@ -39,24 +39,170 @@ using namespace arm_compute; +template <typename I0, typename I1, typename O> +void NEGEMMMatrixVectorMultiplyKernel::matrix_vector_multiply(const Window &window_in, const Window &window_w, const Window &window_out) +{ + ARM_COMPUTE_ERROR("Unsupported data types"); + ARM_COMPUTE_UNUSED(window_in); + ARM_COMPUTE_UNUSED(window_w); + ARM_COMPUTE_UNUSED(window_out); +} + +namespace arm_compute +{ +template <> +void NEGEMMMatrixVectorMultiplyKernel::matrix_vector_multiply<float, float, float>(const Window &window_in, + const Window &window_w, + const Window &window_out) +{ + Iterator in(_input0, window_in); + Iterator in2(_input1, window_w); + Iterator out(_output, window_out); + + const int input_w = _input0->info()->dimension(0); + const int input_h = _input0->info()->dimension(1); + const int input_stride_x = _input0->info()->strides_in_bytes().x(); + const int weights_stride_x = _input1->info()->strides_in_bytes().x(); + const int weights_stride_y = _input1->info()->strides_in_bytes().y(); + const int output_stride_x = _output->info()->strides_in_bytes().x(); + + execute_window_loop(window_in, [&](const Coordinates & id) + { + // Get pointers + const uint8_t *const input_ptr = in.ptr(); + const uint8_t *const weights_ptr = in2.ptr() + id.z() * weights_stride_y; + auto output_ptr = reinterpret_cast<float *>(out.ptr() + (id.y() + id.z() * input_h) * output_stride_x); + + float32x4_t row_dot = vdupq_n_f32(0.f); + for(int i = 0; i < input_w; i += 4) + { + const auto input = vld1q_f32(reinterpret_cast<const float *>(input_ptr + i * input_stride_x)); + const auto weights = vld1q_f32(reinterpret_cast<const float *>(weights_ptr + i * weights_stride_x)); + row_dot = vaddq_f32(row_dot, vmulq_f32(input, weights)); + } + + auto temp = vadd_f32(vget_high_f32(row_dot), vget_low_f32(row_dot)); + temp = vpadd_f32(temp, temp); + + *output_ptr = vget_lane_f32(temp, 0); + }, + in, in2, out); +} + +template <> +void 
NEGEMMMatrixVectorMultiplyKernel::matrix_vector_multiply<uint8_t, uint8_t, int32_t>(const Window &window_in, + const Window &window_w, + const Window &window_out) +{ + Iterator in(_input0, window_in); + Iterator in2(_input1, window_w); + Iterator out(_output, window_out); + + const int input_offset = -_input0->info()->quantization_info().offset; + const int weights_offset = -_input1->info()->quantization_info().offset; + + const int input_w = _input0->info()->dimension(0); + const int input_h = _input0->info()->dimension(1); + const int input_stride_x = _input0->info()->strides_in_bytes().x(); + const int weights_stride_x = _input1->info()->strides_in_bytes().x(); + const int weights_stride_y = _input1->info()->strides_in_bytes().y(); + const int output_stride_x = _output->info()->strides_in_bytes().x(); + const int read_step = 16 / _input0->info()->element_size(); + + const int32x4_t v_input_offset = vdupq_n_s32(input_offset); + const int32x4_t v_weights_offset = vdupq_n_s32(weights_offset); + + execute_window_loop(window_in, [&](const Coordinates & id) + { + // Get pointers + const uint8_t *const input_ptr = in.ptr(); + const uint8_t *const weights_ptr = in2.ptr() + id.z() * weights_stride_y; + auto output_ptr = reinterpret_cast<int32_t *>(out.ptr() + (id.y() + id.z() * input_h) * output_stride_x); + + int32x4_t row_dot = vdupq_n_s32(0); + for(int i = 0; i < input_w; i += read_step) + { + // Read values + const auto input = vld1q_u8(reinterpret_cast<const uint8_t *>(input_ptr + i * input_stride_x)); + const auto weights = vld1q_u8(reinterpret_cast<const uint8_t *>(weights_ptr + i * weights_stride_x)); + + // Add offsets + const int32x4x4_t input_s32 = + { + { + vaddw_s16(v_input_offset, vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vget_low_u8(input))))), + vaddw_s16(v_input_offset, vreinterpret_s16_u16(vget_high_u16(vmovl_u8(vget_low_u8(input))))), + vaddw_s16(v_input_offset, vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vget_high_u8(input))))), + 
vaddw_s16(v_input_offset, vreinterpret_s16_u16(vget_high_u16(vmovl_u8(vget_high_u8(input))))) + } + }; + const int32x4x4_t weights_s32 = + { + { + vaddw_s16(v_weights_offset, vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vget_low_u8(weights))))), + vaddw_s16(v_weights_offset, vreinterpret_s16_u16(vget_high_u16(vmovl_u8(vget_low_u8(weights))))), + vaddw_s16(v_weights_offset, vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vget_high_u8(weights))))), + vaddw_s16(v_weights_offset, vreinterpret_s16_u16(vget_high_u16(vmovl_u8(vget_high_u8(weights))))) + } + }; + + // Dot + row_dot = vaddq_s32(row_dot, vmulq_s32(input_s32.val[0], weights_s32.val[0])); + row_dot = vaddq_s32(row_dot, vmulq_s32(input_s32.val[1], weights_s32.val[1])); + row_dot = vaddq_s32(row_dot, vmulq_s32(input_s32.val[2], weights_s32.val[2])); + row_dot = vaddq_s32(row_dot, vmulq_s32(input_s32.val[3], weights_s32.val[3])); + } + + // Reduction + auto temp = vadd_s32(vget_high_s32(row_dot), vget_low_s32(row_dot)); + temp = vpadd_s32(temp, temp); + + *output_ptr = vget_lane_s32(temp, 0); + }, + in, in2, out); +} +} //namespace arm_compute + NEGEMMMatrixVectorMultiplyKernel::NEGEMMMatrixVectorMultiplyKernel() - : _input0(nullptr), _input1(nullptr), _output(nullptr) + : _func(nullptr), _input0(nullptr), _input1(nullptr), _output(nullptr), _border_size(0) +{ +} + +BorderSize NEGEMMMatrixVectorMultiplyKernel::border_size() const { + return _border_size; } void NEGEMMMatrixVectorMultiplyKernel::configure(const ITensor *input0, const ITensor *input1, ITensor *output) { - ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input0, 1, DataType::F16, DataType::F32); - ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input0, input1, output); + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input0, 1, DataType::QASYMM8, DataType::F32); + ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input0, input1); ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT(input0, input1, output); + 
ARM_COMPUTE_ERROR_ON(is_data_type_quantized_asymmetric(input0->info()->data_type()) && (output->info()->data_type() != DataType::S32)); ARM_COMPUTE_ERROR_ON(input0->info()->dimension(2) != input1->info()->dimension(1)); _input0 = input0; _input1 = input1; _output = output; + // Set appropriate function to run + switch(input0->info()->data_type()) + { + case DataType::QASYMM8: + _func = &NEGEMMMatrixVectorMultiplyKernel::matrix_vector_multiply<uint8_t, uint8_t, int32_t>; + break; + case DataType::F32: + _func = &NEGEMMMatrixVectorMultiplyKernel::matrix_vector_multiply<float, float, float>; + break; + default: + ARM_COMPUTE_ERROR("Unsupported data type"); + } + // Configure kernel window - const unsigned int num_elems_read_per_iteration = 4; + const unsigned int num_elems_read_per_iteration = 16 / _input0->info()->element_size(); + + const unsigned int border_x = ceil_to_multiple(input0->info()->dimension(0), num_elems_read_per_iteration) - input0->info()->dimension(0); + _border_size = BorderSize(0, border_x); Window win = calculate_max_window(*input0->info(), Steps(num_elems_read_per_iteration)); @@ -75,6 +221,7 @@ void NEGEMMMatrixVectorMultiplyKernel::run(const Window &window, const ThreadInf { ARM_COMPUTE_UNUSED(info); ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window); Window window_slice = window.first_slice_window_3D(); @@ -96,36 +243,8 @@ void NEGEMMMatrixVectorMultiplyKernel::run(const Window &window, const ThreadInf window_out.set(Window::DimY, Window::Dimension(0, 0, 0)); window_out.set(Window::DimZ, Window::Dimension(0, 0, 0)); - Iterator in(_input0, window_in); - Iterator in2(_input1, window_weights); - Iterator out(_output, window_out); - - const int input_w = _input0->info()->dimension(0); - const int input_h = _input0->info()->dimension(1); - const int input_stride_x = _input0->info()->strides_in_bytes().x(); - const int weights_stride_x = _input1->info()->strides_in_bytes().x(); - 
const int weights_stride_y = _input1->info()->strides_in_bytes().y(); - const int output_stride_x = _output->info()->strides_in_bytes().x(); - - execute_window_loop(window_in, [&](const Coordinates & id) + if(_func != nullptr) { - // Get pointers - const uint8_t *const input_ptr = in.ptr(); - const uint8_t *const weights_ptr = in2.ptr() + id.z() * weights_stride_y; - auto output_ptr = reinterpret_cast<float *>(out.ptr() + (id.y() + id.z() * input_h) * output_stride_x); - - float32x4_t row_dot = vdupq_n_f32(0.f); - for(int i = 0; i < input_w; i += 4) - { - const auto input = vld1q_f32(reinterpret_cast<const float *>(input_ptr + i * input_stride_x)); - const auto weights = vld1q_f32(reinterpret_cast<const float *>(weights_ptr + i * weights_stride_x)); - row_dot = vaddq_f32(row_dot, vmulq_f32(input, weights)); - } - - auto temp = vadd_f32(vget_high_f32(row_dot), vget_low_f32(row_dot)); - temp = vpadd_f32(temp, temp); - - *output_ptr = vget_lane_f32(temp, 0); - }, - in, in2, out); + (this->*_func)(window_in, window_weights, window_out); + } } |