aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorMichele Di Giorgio <michele.digiorgio@arm.com>2019-12-20 10:02:17 +0000
committerGeorgios Pinitas <georgios.pinitas@arm.com>2019-12-20 12:20:23 +0000
commitdeb3ac461b2b8a4f85ff91b422b6e0ada3be1300 (patch)
tree028de69384aa3490d0bfac6a76d01acd19618a1b
parent0e4556ce45cb857ac44354df73f8228475f3ad3c (diff)
downloadComputeLibrary-deb3ac461b2b8a4f85ff91b422b6e0ada3be1300.tar.gz
COMPMID-2807: Add support for QASYMM8_SIGNED in NEGEMMMatrixVectorMultiplyKernel
Change-Id: I8d33969dfc61c9a3793954cc12d22f24fb9117f0 Signed-off-by: Michele Di Giorgio <michele.digiorgio@arm.com> Reviewed-on: https://review.mlplatform.org/c/2513 Tested-by: Arm Jenkins <bsgcomp@arm.com> Reviewed-by: Georgios Pinitas <georgios.pinitas@arm.com> Comments-Addressed: Arm Jenkins <bsgcomp@arm.com>
-rw-r--r--arm_compute/core/NEON/kernels/NEGEMMMatrixVectorMultiplyKernel.h8
-rw-r--r--src/core/NEON/kernels/NEGEMMMatrixVectorMultiplyKernel.cpp104
2 files changed, 94 insertions, 18 deletions
diff --git a/arm_compute/core/NEON/kernels/NEGEMMMatrixVectorMultiplyKernel.h b/arm_compute/core/NEON/kernels/NEGEMMMatrixVectorMultiplyKernel.h
index 63b42aae26..f5635dd58c 100644
--- a/arm_compute/core/NEON/kernels/NEGEMMMatrixVectorMultiplyKernel.h
+++ b/arm_compute/core/NEON/kernels/NEGEMMMatrixVectorMultiplyKernel.h
@@ -50,17 +50,17 @@ public:
NEGEMMMatrixVectorMultiplyKernel &operator=(NEGEMMMatrixVectorMultiplyKernel &&) = default;
/** Initialise the kernel's input and output.
*
- * @param[in] input0 First Input tensor. Data types supported: QASYMM8/F16/F32
+ * @param[in] input0 First Input tensor. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32
* @param[in] input1 Second Input tensor. Data types supported: same as @p input.
- * @param[out] output Output tensor which stores the interleaved matrix. Data type supported: same as @p input, S32 for QASYMM8 input.
+ * @param[out] output Output tensor which stores the interleaved matrix. Data type supported: same as @p input, S32 for QASYMM8/QASYMM8_SIGNED input.
*/
void configure(const ITensor *input0, const ITensor *input1, ITensor *output);
/** Static function to check if given info will lead to a valid configuration of @ref NEGEMMMatrixVectorMultiplyKernel
*
- * @param[in] input0 First Input tensor. Data types supported: QASYMM8/F16/F32
+ * @param[in] input0 First Input tensor. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32
* @param[in] input1 Second Input tensor. Data types supported: same as @p input.
- * @param[in] output Output tensor which stores the interleaved matrix. Data type supported: same as @p input, S32 for QASYMM8 input.
+ * @param[in] output Output tensor which stores the interleaved matrix. Data type supported: same as @p input, S32 for QASYMM8/QASYMM8_SIGNED input.
*
* @return a status
*/
diff --git a/src/core/NEON/kernels/NEGEMMMatrixVectorMultiplyKernel.cpp b/src/core/NEON/kernels/NEGEMMMatrixVectorMultiplyKernel.cpp
index 0e77ead72b..cf8411c55f 100644
--- a/src/core/NEON/kernels/NEGEMMMatrixVectorMultiplyKernel.cpp
+++ b/src/core/NEON/kernels/NEGEMMMatrixVectorMultiplyKernel.cpp
@@ -38,18 +38,23 @@
#include <cstdint>
#include <tuple>
-using namespace arm_compute;
-
+namespace arm_compute
+{
namespace
{
Status validate_arguments(const ITensorInfo *input0, const ITensorInfo *input1, const ITensorInfo *output)
{
ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(input0);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input0, 1, DataType::QASYMM8, DataType::F16, DataType::F32);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_NOT_IN(output, DataType::S32, DataType::F16, DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input0, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::F16, DataType::F32);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input0, input1);
- ARM_COMPUTE_RETURN_ERROR_ON(is_data_type_quantized_asymmetric(input0->data_type()) && (output->data_type() != DataType::S32));
- ARM_COMPUTE_RETURN_ERROR_ON(is_data_type_float(input0->data_type()) && (output->data_type() != input0->data_type()));
+ if(is_data_type_quantized_asymmetric(input0->data_type()))
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::S32);
+ }
+ else
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input0, output);
+ }
ARM_COMPUTE_RETURN_ERROR_ON(input0->num_dimensions() == input1->num_dimensions());
ARM_COMPUTE_RETURN_ERROR_ON(input0->dimension(2) != input1->dimension(1));
@@ -87,8 +92,6 @@ void NEGEMMMatrixVectorMultiplyKernel::matrix_vector_multiply(const Window &wind
ARM_COMPUTE_UNUSED(window_out);
}
-namespace arm_compute
-{
#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
template <>
void NEGEMMMatrixVectorMultiplyKernel::matrix_vector_multiply<half, half, half>(const Window &window_in,
@@ -242,7 +245,79 @@ void NEGEMMMatrixVectorMultiplyKernel::matrix_vector_multiply<uint8_t, uint8_t,
},
in, in2, out);
}
-} //namespace arm_compute
+
+template <>
+void NEGEMMMatrixVectorMultiplyKernel::matrix_vector_multiply<int8_t, int8_t, int32_t>(const Window &window_in,
+ const Window &window_w,
+ const Window &window_out)
+{
+ Iterator in(_input0, window_in);
+ Iterator in2(_input1, window_w);
+ Iterator out(_output, window_out);
+
+ const int input_offset = -_input0->info()->quantization_info().uniform().offset;
+ const int weights_offset = -_input1->info()->quantization_info().uniform().offset;
+
+ const int input_w = _input0->info()->dimension(0);
+ const int input_h = _input0->info()->dimension(1);
+ const int input_stride_x = _input0->info()->strides_in_bytes().x();
+ const int weights_stride_x = _input1->info()->strides_in_bytes().x();
+ const int weights_stride_y = _input1->info()->strides_in_bytes().y();
+ const int output_stride_x = _output->info()->strides_in_bytes().x();
+ const int read_step = 16 / _input0->info()->element_size();
+
+ const int32x4_t v_input_offset = vdupq_n_s32(input_offset);
+ const int32x4_t v_weights_offset = vdupq_n_s32(weights_offset);
+
+ execute_window_loop(window_in, [&](const Coordinates & id)
+ {
+ // Get pointers
+ const uint8_t *const input_ptr = in.ptr();
+ const uint8_t *const weights_ptr = in2.ptr() + id.z() * weights_stride_y;
+ auto output_ptr = reinterpret_cast<int32_t *>(out.ptr() + (id.y() + id.z() * input_h) * output_stride_x);
+
+ int32x4_t row_dot = vdupq_n_s32(0);
+ for(int i = 0; i < input_w; i += read_step)
+ {
+ // Read values
+ const auto input = vld1q_s8(reinterpret_cast<const int8_t *>(input_ptr + i * input_stride_x));
+ const auto weights = vld1q_s8(reinterpret_cast<const int8_t *>(weights_ptr + i * weights_stride_x));
+
+ // Add offsets
+ const int32x4x4_t input_s32 =
+ {
+ {
+ vaddw_s16(v_input_offset, vget_low_s16(vmovl_s8(vget_low_s8(input)))),
+ vaddw_s16(v_input_offset, vget_high_s16(vmovl_s8(vget_low_s8(input)))),
+ vaddw_s16(v_input_offset, vget_low_s16(vmovl_s8(vget_high_s8(input)))),
+ vaddw_s16(v_input_offset, vget_high_s16(vmovl_s8(vget_high_s8(input))))
+ }
+ };
+ const int32x4x4_t weights_s32 =
+ {
+ {
+ vaddw_s16(v_weights_offset, vget_low_s16(vmovl_s8(vget_low_s8(weights)))),
+ vaddw_s16(v_weights_offset, vget_high_s16(vmovl_s8(vget_low_s8(weights)))),
+ vaddw_s16(v_weights_offset, vget_low_s16(vmovl_s8(vget_high_s8(weights)))),
+ vaddw_s16(v_weights_offset, vget_high_s16(vmovl_s8(vget_high_s8(weights))))
+ }
+ };
+
+ // Dot
+ row_dot = vaddq_s32(row_dot, vmulq_s32(input_s32.val[0], weights_s32.val[0]));
+ row_dot = vaddq_s32(row_dot, vmulq_s32(input_s32.val[1], weights_s32.val[1]));
+ row_dot = vaddq_s32(row_dot, vmulq_s32(input_s32.val[2], weights_s32.val[2]));
+ row_dot = vaddq_s32(row_dot, vmulq_s32(input_s32.val[3], weights_s32.val[3]));
+ }
+
+ // Reduction
+ auto temp = vadd_s32(vget_high_s32(row_dot), vget_low_s32(row_dot));
+ temp = vpadd_s32(temp, temp);
+
+ *output_ptr = vget_lane_s32(temp, 0);
+ },
+ in, in2, out);
+}
NEGEMMMatrixVectorMultiplyKernel::NEGEMMMatrixVectorMultiplyKernel()
: _func(nullptr), _input0(nullptr), _input1(nullptr), _output(nullptr), _border_size(0)
@@ -257,7 +332,6 @@ BorderSize NEGEMMMatrixVectorMultiplyKernel::border_size() const
void NEGEMMMatrixVectorMultiplyKernel::configure(const ITensor *input0, const ITensor *input1, ITensor *output)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input0, input1, output);
-
ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input0->info(), input1->info(), output->info()));
_input0 = input0;
@@ -270,6 +344,9 @@ void NEGEMMMatrixVectorMultiplyKernel::configure(const ITensor *input0, const IT
case DataType::QASYMM8:
_func = &NEGEMMMatrixVectorMultiplyKernel::matrix_vector_multiply<uint8_t, uint8_t, int32_t>;
break;
+ case DataType::QASYMM8_SIGNED:
+ _func = &NEGEMMMatrixVectorMultiplyKernel::matrix_vector_multiply<int8_t, int8_t, int32_t>;
+ break;
#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
case DataType::F16:
_func = &NEGEMMMatrixVectorMultiplyKernel::matrix_vector_multiply<half, half, half>;
@@ -306,6 +383,7 @@ void NEGEMMMatrixVectorMultiplyKernel::run(const Window &window, const ThreadInf
ARM_COMPUTE_UNUSED(info);
ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
+ ARM_COMPUTE_ERROR_ON(_func == nullptr);
Window window_slice = window.first_slice_window_3D();
@@ -327,8 +405,6 @@ void NEGEMMMatrixVectorMultiplyKernel::run(const Window &window, const ThreadInf
window_out.set(Window::DimY, Window::Dimension(0, 0, 0));
window_out.set(Window::DimZ, Window::Dimension(0, 0, 0));
- if(_func != nullptr)
- {
- (this->*_func)(window_in, window_weights, window_out);
- }
+ (this->*_func)(window_in, window_weights, window_out);
}
+} // namespace arm_compute