aboutsummaryrefslogtreecommitdiff
path: root/src/core/NEON/kernels
diff options
context:
space:
mode:
authorGeorgios Pinitas <georgios.pinitas@arm.com>2018-01-22 16:29:17 +0000
committerAnthony Barbier <anthony.barbier@arm.com>2018-11-02 16:45:00 +0000
commitd05dce46a14a7b67f322328ecd95bf96bdd30bae (patch)
tree6e001f539969a1a669241a72e78ff5a62998a984 /src/core/NEON/kernels
parent5d9d019b2c7ca3dc59bfbb44b3169ee5cd71dc79 (diff)
downloadComputeLibrary-d05dce46a14a7b67f322328ecd95bf96bdd30bae.tar.gz
COMPMID-791: Generic Depthwise Convolution Layer NEON QASYMM8
Change-Id: I33cf54e68f6c097ac58b6f16c3f9a720978f09cd Reviewed-on: https://eu-gerrit-1.euhpc.arm.com/117289 Tested-by: Jenkins <bsgcomp@arm.com> Reviewed-by: Michalis Spyrou <michalis.spyrou@arm.com> Reviewed-by: Anthony Barbier <anthony.barbier@arm.com>
Diffstat (limited to 'src/core/NEON/kernels')
-rw-r--r--src/core/NEON/kernels/NEDepthwiseIm2ColKernel.cpp109
-rw-r--r--src/core/NEON/kernels/NEDepthwiseVectorToTensorKernel.cpp89
-rw-r--r--src/core/NEON/kernels/NEDepthwiseWeightsReshapeKernel.cpp110
-rw-r--r--src/core/NEON/kernels/NEGEMMMatrixVectorMultiplyKernel.cpp191
4 files changed, 359 insertions, 140 deletions
diff --git a/src/core/NEON/kernels/NEDepthwiseIm2ColKernel.cpp b/src/core/NEON/kernels/NEDepthwiseIm2ColKernel.cpp
index 2ceb39d217..b924d9f8bd 100644
--- a/src/core/NEON/kernels/NEDepthwiseIm2ColKernel.cpp
+++ b/src/core/NEON/kernels/NEDepthwiseIm2ColKernel.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -37,40 +37,9 @@
using namespace arm_compute;
-NEDepthwiseIm2ColKernel::NEDepthwiseIm2ColKernel()
- : _input(nullptr), _output(nullptr), _kernel_dims(), _conv_info(), _has_bias()
-{
-}
-
-void NEDepthwiseIm2ColKernel::configure(const ITensor *input, ITensor *output, const Size2D &kernel_dims, const PadStrideInfo &conv_info, bool has_bias)
-{
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32);
- ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
- ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT(input, output);
- ARM_COMPUTE_ERROR_ON(input->info()->dimension(2) != output->info()->dimension(2));
- ARM_COMPUTE_ERROR_ON(output->info()->dimension(0) != (kernel_dims.width * kernel_dims.height + ((has_bias) ? 1 : 0)));
-
- _input = input;
- _output = output;
- _kernel_dims = kernel_dims;
- _conv_info = conv_info;
- _has_bias = has_bias;
-
- // Configure kernel window
- Window win = calculate_max_window(*input->info(), Steps());
-
- // The NEDepthwiseIm2ColKernel doesn't need padding so update_window_and_padding() can be skipped
- output->info()->set_valid_region(ValidRegion(Coordinates(), output->info()->tensor_shape()));
-
- INEKernel::configure(win);
-}
-
-void NEDepthwiseIm2ColKernel::run(const Window &window, const ThreadInfo &info)
+template <typename T>
+void NEDepthwiseIm2ColKernel::run_generic(const Window &window)
{
- ARM_COMPUTE_UNUSED(info);
- ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
-
- //const int kernel_depth = _input->info()->dimension(2);
const int input_w = _input->info()->dimension(0);
const int input_h = _input->info()->dimension(1);
const int input_stride_x = _input->info()->strides_in_bytes().x();
@@ -101,6 +70,13 @@ void NEDepthwiseIm2ColKernel::run(const Window &window, const ThreadInfo &info)
const int full_length = input_w + pad_left + pad_right;
const int max_initial_x = stride_x * (((full_length - _kernel_dims.width) / stride_x) + 1);
+ // Define pad value
+ auto zero = static_cast<T>(0);
+ if(std::is_same<T, uint8_t>::value)
+ {
+ zero = _input->info()->quantization_info().offset;
+ }
+
execute_window_loop(window_out, [&](const Coordinates & id)
{
const int src_pixel_linear = id.y() * stride_x;
@@ -110,7 +86,7 @@ void NEDepthwiseIm2ColKernel::run(const Window &window, const ThreadInfo &info)
// Get pointers
const uint8_t *const input_ptr = in.ptr() + id.z() * input_stride_z;
- auto output_ptr = reinterpret_cast<float *>(out.ptr());
+ auto output_ptr = reinterpret_cast<T *>(out.ptr());
const int height = src_y + _kernel_dims.height;
const int width = src_x + _kernel_dims.width;
@@ -120,19 +96,76 @@ void NEDepthwiseIm2ColKernel::run(const Window &window, const ThreadInfo &info)
{
if(x < 0 || x >= input_w || y < 0 || y >= input_h)
{
- *output_ptr = 0;
+ *output_ptr = zero;
}
else
{
- *output_ptr = *(reinterpret_cast<const float *>(input_ptr + x * input_stride_x + y * input_stride_y));
+ *output_ptr = *(reinterpret_cast<const T *>(input_ptr + x * input_stride_x + y * input_stride_y));
}
}
}
if(_has_bias)
{
- *output_ptr = static_cast<float>(1);
+ *output_ptr = static_cast<T>(1);
}
},
in, out);
}
+
+NEDepthwiseIm2ColKernel::NEDepthwiseIm2ColKernel()
+ : _func(nullptr), _input(nullptr), _output(nullptr), _kernel_dims(), _conv_info(), _has_bias()
+{
+}
+
+void NEDepthwiseIm2ColKernel::configure(const ITensor *input, ITensor *output, const Size2D &kernel_dims, const PadStrideInfo &conv_info, bool has_bias)
+{
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::F16, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT(input, output);
+ ARM_COMPUTE_ERROR_ON(is_data_type_quantized_asymmetric(input->info()->data_type()) && has_bias);
+ ARM_COMPUTE_ERROR_ON(input->info()->dimension(2) != output->info()->dimension(2));
+ ARM_COMPUTE_ERROR_ON(output->info()->dimension(0) != (kernel_dims.width * kernel_dims.height + ((has_bias) ? 1 : 0)));
+
+ _input = input;
+ _output = output;
+ _kernel_dims = kernel_dims;
+ _conv_info = conv_info;
+ _has_bias = has_bias;
+
+ // Configure kernel window
+ Window win = calculate_max_window(*input->info(), Steps());
+
+ // Set appropriate function to run
+ switch(input->info()->data_type())
+ {
+ case DataType::QASYMM8:
+ _func = &NEDepthwiseIm2ColKernel::run_generic<uint8_t>;
+ break;
+ case DataType::F16:
+ _func = &NEDepthwiseIm2ColKernel::run_generic<half>;
+ break;
+ case DataType::F32:
+ _func = &NEDepthwiseIm2ColKernel::run_generic<float>;
+ break;
+ default:
+ ARM_COMPUTE_ERROR("Unsupported data type");
+ }
+
+ // The NEDepthwiseIm2ColKernel doesn't need padding so update_window_and_padding() can be skipped
+ output->info()->set_valid_region(ValidRegion(Coordinates(), output->info()->tensor_shape()));
+
+ INEKernel::configure(win);
+}
+
+void NEDepthwiseIm2ColKernel::run(const Window &window, const ThreadInfo &info)
+{
+ ARM_COMPUTE_UNUSED(info);
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
+
+ if(_func != nullptr)
+ {
+ (this->*_func)(window);
+ }
+}
diff --git a/src/core/NEON/kernels/NEDepthwiseVectorToTensorKernel.cpp b/src/core/NEON/kernels/NEDepthwiseVectorToTensorKernel.cpp
index 9b36df3c39..8960d8a8af 100644
--- a/src/core/NEON/kernels/NEDepthwiseVectorToTensorKernel.cpp
+++ b/src/core/NEON/kernels/NEDepthwiseVectorToTensorKernel.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -37,14 +37,46 @@
using namespace arm_compute;
+template <typename T>
+void NEDepthwiseVectorToTensorKernel::vector_to_tensor(const Window &window)
+{
+ // const int input_w = _input->info()->dimension(0);
+ const int output_stride_x = _output->info()->strides_in_bytes().x();
+ const int output_stride_y = _output->info()->strides_in_bytes().y();
+ const int output_stride_z = _output->info()->strides_in_bytes().z();
+
+ // Setup output window
+ Window window_out(window);
+ window_out.set(Window::DimX, Window::Dimension(0, 0, 0));
+ window_out.set(Window::DimY, Window::Dimension(0, 0, 0));
+ window_out.set(Window::DimZ, Window::Dimension(0, 0, 0));
+
+ Iterator in(_input, window);
+ Iterator out(_output, window_out);
+
+ const int patch_size = _conv_dims.first * _conv_dims.second;
+
+ execute_window_loop(window, [&](const Coordinates & id)
+ {
+ const int z = id.x() / patch_size;
+ const int index2D = id.x() - z * patch_size;
+
+ auto input_ptr = reinterpret_cast<T *>(in.ptr());
+ auto output_ptr = reinterpret_cast<T *>(out.ptr() + index2D % _conv_dims.first * output_stride_x + index2D / _conv_dims.first * output_stride_y + z * output_stride_z);
+
+ *output_ptr = *input_ptr;
+ },
+ in, out);
+}
+
NEDepthwiseVectorToTensorKernel::NEDepthwiseVectorToTensorKernel()
- : _input(nullptr), _output(nullptr), _conv_dims()
+ : _func(nullptr), _input(nullptr), _output(nullptr), _conv_dims()
{
}
void NEDepthwiseVectorToTensorKernel::configure(const ITensor *input, ITensor *output, size_t conv_w, size_t conv_h)
{
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::S32, DataType::F16, DataType::F32);
ARM_COMPUTE_ERROR_ON_NULLPTR(output);
TensorShape output_shape = input->info()->tensor_shape();
@@ -53,7 +85,7 @@ void NEDepthwiseVectorToTensorKernel::configure(const ITensor *input, ITensor *o
output_shape.set(2, input->info()->tensor_shape()[0] / (conv_w * conv_h));
// Output auto inizialitation if not yet initialized
- auto_init_if_empty(*output->info(), output_shape, 1, input->info()->data_type(), input->info()->fixed_point_position());
+ auto_init_if_empty(*output->info(), input->info()->clone()->set_tensor_shape(output_shape));
ARM_COMPUTE_ERROR_ON_MISMATCHING_DIMENSIONS(output->info()->tensor_shape(), output_shape);
ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
@@ -63,6 +95,25 @@ void NEDepthwiseVectorToTensorKernel::configure(const ITensor *input, ITensor *o
_output = output;
_conv_dims = std::pair<size_t, size_t>(conv_w, conv_h);
+ // Set appropriate function to run
+ switch(input->info()->data_type())
+ {
+ case DataType::QASYMM8:
+ _func = &NEDepthwiseVectorToTensorKernel::vector_to_tensor<uint8_t>;
+ break;
+ case DataType::S32:
+ _func = &NEDepthwiseVectorToTensorKernel::vector_to_tensor<int32_t>;
+ break;
+ case DataType::F16:
+ _func = &NEDepthwiseVectorToTensorKernel::vector_to_tensor<half>;
+ break;
+ case DataType::F32:
+ _func = &NEDepthwiseVectorToTensorKernel::vector_to_tensor<float>;
+ break;
+ default:
+ ARM_COMPUTE_ERROR("Unsupported data type");
+ }
+
// Configure kernel window
Window win = calculate_max_window(*input->info(), Steps());
// The NEDepthwisevectorToTensorKernel doesn't need padding so update_window_and_padding() can be skipped
@@ -75,32 +126,10 @@ void NEDepthwiseVectorToTensorKernel::run(const Window &window, const ThreadInfo
{
ARM_COMPUTE_UNUSED(info);
ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
- // const int input_w = _input->info()->dimension(0);
- const int output_stride_x = _output->info()->strides_in_bytes().x();
- const int output_stride_y = _output->info()->strides_in_bytes().y();
- const int output_stride_z = _output->info()->strides_in_bytes().z();
-
- // Setup output window
- Window window_out(window);
- window_out.set(Window::DimX, Window::Dimension(0, 0, 0));
- window_out.set(Window::DimY, Window::Dimension(0, 0, 0));
- window_out.set(Window::DimZ, Window::Dimension(0, 0, 0));
-
- Iterator in(_input, window);
- Iterator out(_output, window_out);
-
- const int patch_size = _conv_dims.first * _conv_dims.second;
-
- execute_window_loop(window, [&](const Coordinates & id)
+ if(_func != nullptr)
{
- const int z = id.x() / patch_size;
- const int index2D = id.x() - z * patch_size;
-
- auto input_ptr = reinterpret_cast<float *>(in.ptr());
- auto output_ptr = reinterpret_cast<float *>(out.ptr() + index2D % _conv_dims.first * output_stride_x + index2D / _conv_dims.first * output_stride_y + z * output_stride_z);
-
- *output_ptr = *input_ptr;
- },
- in, out);
+ (this->*_func)(window);
+ }
}
diff --git a/src/core/NEON/kernels/NEDepthwiseWeightsReshapeKernel.cpp b/src/core/NEON/kernels/NEDepthwiseWeightsReshapeKernel.cpp
index 6585fdb8b8..36b17bfc4c 100644
--- a/src/core/NEON/kernels/NEDepthwiseWeightsReshapeKernel.cpp
+++ b/src/core/NEON/kernels/NEDepthwiseWeightsReshapeKernel.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -37,16 +37,59 @@
using namespace arm_compute;
+namespace
+{
+template <typename T>
+void weights_reshape(const ITensor *input, const ITensor *bias, ITensor *output, const Window &window)
+{
+ const int input_w = input->info()->dimension(0);
+ const int output_stride_x = output->info()->strides_in_bytes().x();
+ const int output_stride_y = output->info()->strides_in_bytes().y();
+
+ Window window_in(window);
+ // The first three dimensions of the input are increased by the inner loops
+ window_in.set(Window::DimX, Window::Dimension(0, input->info()->dimension(0), input->info()->dimension(0)));
+ window_in.set(Window::DimY, Window::Dimension(0, input->info()->dimension(1), 1));
+ window_in.set(Window::DimZ, Window::Dimension(0, input->info()->dimension(2), 1));
+
+ // Setup output window
+ Window window_out;
+ window_out.set(Window::DimX, Window::Dimension(0, 0, 0));
+ window_out.set(Window::DimY, Window::Dimension(0, 0, 0));
+
+ Iterator in(input, window_in);
+ Iterator out(output, window_out);
+
+ execute_window_loop(window_in, [&](const Coordinates & id)
+ {
+ auto input_ptr = reinterpret_cast<T *>(in.ptr());
+ auto output_ptr = reinterpret_cast<T *>(out.ptr() + id.y() * input_w * output_stride_x + id.z() * output_stride_y);
+
+ for(int i = 0; i < input_w; ++i, ++input_ptr)
+ {
+ *(output_ptr + i) = *input_ptr;
+ }
+
+ if(bias != nullptr)
+ {
+ *(output_ptr + input_w) = *(reinterpret_cast<T *>(bias->ptr_to_element(Coordinates(id.z()))));
+ }
+ },
+ in, out);
+}
+} // namespace
+
NEDepthwiseWeightsReshapeKernel::NEDepthwiseWeightsReshapeKernel()
- : _input(nullptr), _output(nullptr), _biases(nullptr)
+ : _func(nullptr), _input(nullptr), _output(nullptr), _biases(nullptr)
{
}
void NEDepthwiseWeightsReshapeKernel::configure(const ITensor *input, ITensor *output, const ITensor *biases)
{
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::F16, DataType::F32);
ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT(input, output);
+ ARM_COMPUTE_ERROR_ON(is_data_type_quantized_asymmetric(input->info()->data_type()) && (biases != nullptr));
ARM_COMPUTE_ERROR_ON(input->info()->dimension(2) != output->info()->dimension(1));
ARM_COMPUTE_ERROR_ON(output->info()->dimension(0) != (input->info()->dimension(0) * input->info()->dimension(1) + ((biases != nullptr) ? 1 : 0)));
@@ -62,6 +105,30 @@ void NEDepthwiseWeightsReshapeKernel::configure(const ITensor *input, ITensor *o
_output = output;
_biases = biases;
+ switch(_input->info()->element_size())
+ {
+ case 4:
+ {
+ _func = &weights_reshape<uint32_t>;
+ break;
+ }
+ case 2:
+ {
+ _func = &weights_reshape<uint16_t>;
+ break;
+ }
+ case 1:
+ {
+ _func = &weights_reshape<uint8_t>;
+ break;
+ }
+ default:
+ {
+ ARM_COMPUTE_ERROR_ON("Element size not supported");
+ break;
+ }
+ }
+
// Configure kernel window
Window win = calculate_max_window(*input->info(), Steps());
// The NEDepthwiseWeightsReshapeKernel doesn't need padding so update_window_and_padding() can be skipped
@@ -74,39 +141,10 @@ void NEDepthwiseWeightsReshapeKernel::run(const Window &window, const ThreadInfo
{
ARM_COMPUTE_UNUSED(info);
ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
- const int input_w = _input->info()->dimension(0);
- const int output_stride_x = _output->info()->strides_in_bytes().x();
- const int output_stride_y = _output->info()->strides_in_bytes().y();
-
- Window window_in(window);
- // The first three dimensions of the input are increased by the inner loops
- window_in.set(Window::DimX, Window::Dimension(0, _input->info()->dimension(0), _input->info()->dimension(0)));
- window_in.set(Window::DimY, Window::Dimension(0, _input->info()->dimension(1), 1));
- window_in.set(Window::DimZ, Window::Dimension(0, _input->info()->dimension(2), 1));
-
- // Setup output window
- Window window_out;
- window_out.set(Window::DimX, Window::Dimension(0, 0, 0));
- window_out.set(Window::DimY, Window::Dimension(0, 0, 0));
-
- Iterator in(_input, window_in);
- Iterator out(_output, window_out);
-
- execute_window_loop(window_in, [&](const Coordinates & id)
+ if(_func != nullptr)
{
- auto input_ptr = reinterpret_cast<float *>(in.ptr());
- auto output_ptr = reinterpret_cast<float *>(out.ptr() + id.y() * input_w * output_stride_x + id.z() * output_stride_y);
-
- for(int i = 0; i < input_w; ++i, ++input_ptr)
- {
- *(output_ptr + i) = *input_ptr;
- }
-
- if(_biases != nullptr)
- {
- *(output_ptr + input_w) = *(reinterpret_cast<float *>(_biases->ptr_to_element(Coordinates(id.z()))));
- }
- },
- in, out);
+ (*_func)(_input, _biases, _output, window);
+ }
}
diff --git a/src/core/NEON/kernels/NEGEMMMatrixVectorMultiplyKernel.cpp b/src/core/NEON/kernels/NEGEMMMatrixVectorMultiplyKernel.cpp
index fe79df2528..c1e975e77e 100644
--- a/src/core/NEON/kernels/NEGEMMMatrixVectorMultiplyKernel.cpp
+++ b/src/core/NEON/kernels/NEGEMMMatrixVectorMultiplyKernel.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2016, 2017 ARM Limited.
+ * Copyright (c) 2016-2018 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -39,24 +39,170 @@
using namespace arm_compute;
+template <typename I0, typename I1, typename O>
+void NEGEMMMatrixVectorMultiplyKernel::matrix_vector_multiply(const Window &window_in, const Window &window_w, const Window &window_out)
+{
+ ARM_COMPUTE_ERROR("Unsupported data types");
+ ARM_COMPUTE_UNUSED(window_in);
+ ARM_COMPUTE_UNUSED(window_w);
+ ARM_COMPUTE_UNUSED(window_out);
+}
+
+namespace arm_compute
+{
+template <>
+void NEGEMMMatrixVectorMultiplyKernel::matrix_vector_multiply<float, float, float>(const Window &window_in,
+ const Window &window_w,
+ const Window &window_out)
+{
+ Iterator in(_input0, window_in);
+ Iterator in2(_input1, window_w);
+ Iterator out(_output, window_out);
+
+ const int input_w = _input0->info()->dimension(0);
+ const int input_h = _input0->info()->dimension(1);
+ const int input_stride_x = _input0->info()->strides_in_bytes().x();
+ const int weights_stride_x = _input1->info()->strides_in_bytes().x();
+ const int weights_stride_y = _input1->info()->strides_in_bytes().y();
+ const int output_stride_x = _output->info()->strides_in_bytes().x();
+
+ execute_window_loop(window_in, [&](const Coordinates & id)
+ {
+ // Get pointers
+ const uint8_t *const input_ptr = in.ptr();
+ const uint8_t *const weights_ptr = in2.ptr() + id.z() * weights_stride_y;
+ auto output_ptr = reinterpret_cast<float *>(out.ptr() + (id.y() + id.z() * input_h) * output_stride_x);
+
+ float32x4_t row_dot = vdupq_n_f32(0.f);
+ for(int i = 0; i < input_w; i += 4)
+ {
+ const auto input = vld1q_f32(reinterpret_cast<const float *>(input_ptr + i * input_stride_x));
+ const auto weights = vld1q_f32(reinterpret_cast<const float *>(weights_ptr + i * weights_stride_x));
+ row_dot = vaddq_f32(row_dot, vmulq_f32(input, weights));
+ }
+
+ auto temp = vadd_f32(vget_high_f32(row_dot), vget_low_f32(row_dot));
+ temp = vpadd_f32(temp, temp);
+
+ *output_ptr = vget_lane_f32(temp, 0);
+ },
+ in, in2, out);
+}
+
+template <>
+void NEGEMMMatrixVectorMultiplyKernel::matrix_vector_multiply<uint8_t, uint8_t, int32_t>(const Window &window_in,
+ const Window &window_w,
+ const Window &window_out)
+{
+ Iterator in(_input0, window_in);
+ Iterator in2(_input1, window_w);
+ Iterator out(_output, window_out);
+
+ const int input_offset = -_input0->info()->quantization_info().offset;
+ const int weights_offset = -_input1->info()->quantization_info().offset;
+
+ const int input_w = _input0->info()->dimension(0);
+ const int input_h = _input0->info()->dimension(1);
+ const int input_stride_x = _input0->info()->strides_in_bytes().x();
+ const int weights_stride_x = _input1->info()->strides_in_bytes().x();
+ const int weights_stride_y = _input1->info()->strides_in_bytes().y();
+ const int output_stride_x = _output->info()->strides_in_bytes().x();
+ const int read_step = 16 / _input0->info()->element_size();
+
+ const int32x4_t v_input_offset = vdupq_n_s32(input_offset);
+ const int32x4_t v_weights_offset = vdupq_n_s32(weights_offset);
+
+ execute_window_loop(window_in, [&](const Coordinates & id)
+ {
+ // Get pointers
+ const uint8_t *const input_ptr = in.ptr();
+ const uint8_t *const weights_ptr = in2.ptr() + id.z() * weights_stride_y;
+ auto output_ptr = reinterpret_cast<int32_t *>(out.ptr() + (id.y() + id.z() * input_h) * output_stride_x);
+
+ int32x4_t row_dot = vdupq_n_s32(0);
+ for(int i = 0; i < input_w; i += read_step)
+ {
+ // Read values
+ const auto input = vld1q_u8(reinterpret_cast<const uint8_t *>(input_ptr + i * input_stride_x));
+ const auto weights = vld1q_u8(reinterpret_cast<const uint8_t *>(weights_ptr + i * weights_stride_x));
+
+ // Add offsets
+ const int32x4x4_t input_s32 =
+ {
+ {
+ vaddw_s16(v_input_offset, vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vget_low_u8(input))))),
+ vaddw_s16(v_input_offset, vreinterpret_s16_u16(vget_high_u16(vmovl_u8(vget_low_u8(input))))),
+ vaddw_s16(v_input_offset, vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vget_high_u8(input))))),
+ vaddw_s16(v_input_offset, vreinterpret_s16_u16(vget_high_u16(vmovl_u8(vget_high_u8(input)))))
+ }
+ };
+ const int32x4x4_t weights_s32 =
+ {
+ {
+ vaddw_s16(v_weights_offset, vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vget_low_u8(weights))))),
+ vaddw_s16(v_weights_offset, vreinterpret_s16_u16(vget_high_u16(vmovl_u8(vget_low_u8(weights))))),
+ vaddw_s16(v_weights_offset, vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vget_high_u8(weights))))),
+ vaddw_s16(v_weights_offset, vreinterpret_s16_u16(vget_high_u16(vmovl_u8(vget_high_u8(weights)))))
+ }
+ };
+
+ // Dot
+ row_dot = vaddq_s32(row_dot, vmulq_s32(input_s32.val[0], weights_s32.val[0]));
+ row_dot = vaddq_s32(row_dot, vmulq_s32(input_s32.val[1], weights_s32.val[1]));
+ row_dot = vaddq_s32(row_dot, vmulq_s32(input_s32.val[2], weights_s32.val[2]));
+ row_dot = vaddq_s32(row_dot, vmulq_s32(input_s32.val[3], weights_s32.val[3]));
+ }
+
+ // Reduction
+ auto temp = vadd_s32(vget_high_s32(row_dot), vget_low_s32(row_dot));
+ temp = vpadd_s32(temp, temp);
+
+ *output_ptr = vget_lane_s32(temp, 0);
+ },
+ in, in2, out);
+}
+} //namespace arm_compute
+
NEGEMMMatrixVectorMultiplyKernel::NEGEMMMatrixVectorMultiplyKernel()
- : _input0(nullptr), _input1(nullptr), _output(nullptr)
+ : _func(nullptr), _input0(nullptr), _input1(nullptr), _output(nullptr), _border_size(0)
+{
+}
+
+BorderSize NEGEMMMatrixVectorMultiplyKernel::border_size() const
{
+ return _border_size;
}
void NEGEMMMatrixVectorMultiplyKernel::configure(const ITensor *input0, const ITensor *input1, ITensor *output)
{
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input0, 1, DataType::F16, DataType::F32);
- ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input0, input1, output);
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input0, 1, DataType::QASYMM8, DataType::F32);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input0, input1);
ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT(input0, input1, output);
+ ARM_COMPUTE_ERROR_ON(is_data_type_quantized_asymmetric(input0->info()->data_type()) && (output->info()->data_type() != DataType::S32));
ARM_COMPUTE_ERROR_ON(input0->info()->dimension(2) != input1->info()->dimension(1));
_input0 = input0;
_input1 = input1;
_output = output;
+ // Set appropriate function to run
+ switch(input0->info()->data_type())
+ {
+ case DataType::QASYMM8:
+ _func = &NEGEMMMatrixVectorMultiplyKernel::matrix_vector_multiply<uint8_t, uint8_t, int32_t>;
+ break;
+ case DataType::F32:
+ _func = &NEGEMMMatrixVectorMultiplyKernel::matrix_vector_multiply<float, float, float>;
+ break;
+ default:
+ ARM_COMPUTE_ERROR("Unsupported data type");
+ }
+
// Configure kernel window
- const unsigned int num_elems_read_per_iteration = 4;
+ const unsigned int num_elems_read_per_iteration = 16 / _input0->info()->element_size();
+
+ const unsigned int border_x = ceil_to_multiple(input0->info()->dimension(0), num_elems_read_per_iteration) - input0->info()->dimension(0);
+ _border_size = BorderSize(0, border_x);
Window win = calculate_max_window(*input0->info(), Steps(num_elems_read_per_iteration));
@@ -75,6 +221,7 @@ void NEGEMMMatrixVectorMultiplyKernel::run(const Window &window, const ThreadInf
{
ARM_COMPUTE_UNUSED(info);
ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
Window window_slice = window.first_slice_window_3D();
@@ -96,36 +243,8 @@ void NEGEMMMatrixVectorMultiplyKernel::run(const Window &window, const ThreadInf
window_out.set(Window::DimY, Window::Dimension(0, 0, 0));
window_out.set(Window::DimZ, Window::Dimension(0, 0, 0));
- Iterator in(_input0, window_in);
- Iterator in2(_input1, window_weights);
- Iterator out(_output, window_out);
-
- const int input_w = _input0->info()->dimension(0);
- const int input_h = _input0->info()->dimension(1);
- const int input_stride_x = _input0->info()->strides_in_bytes().x();
- const int weights_stride_x = _input1->info()->strides_in_bytes().x();
- const int weights_stride_y = _input1->info()->strides_in_bytes().y();
- const int output_stride_x = _output->info()->strides_in_bytes().x();
-
- execute_window_loop(window_in, [&](const Coordinates & id)
+ if(_func != nullptr)
{
- // Get pointers
- const uint8_t *const input_ptr = in.ptr();
- const uint8_t *const weights_ptr = in2.ptr() + id.z() * weights_stride_y;
- auto output_ptr = reinterpret_cast<float *>(out.ptr() + (id.y() + id.z() * input_h) * output_stride_x);
-
- float32x4_t row_dot = vdupq_n_f32(0.f);
- for(int i = 0; i < input_w; i += 4)
- {
- const auto input = vld1q_f32(reinterpret_cast<const float *>(input_ptr + i * input_stride_x));
- const auto weights = vld1q_f32(reinterpret_cast<const float *>(weights_ptr + i * weights_stride_x));
- row_dot = vaddq_f32(row_dot, vmulq_f32(input, weights));
- }
-
- auto temp = vadd_f32(vget_high_f32(row_dot), vget_low_f32(row_dot));
- temp = vpadd_f32(temp, temp);
-
- *output_ptr = vget_lane_f32(temp, 0);
- },
- in, in2, out);
+ (this->*_func)(window_in, window_weights, window_out);
+ }
}