Diffstat (limited to 'src')
-rw-r--r--  src/core/NEON/kernels/NEConvertQuantizedSignednessKernel.cpp | 136
-rw-r--r--  src/core/NEON/kernels/NEGEMMInterleave4x4Kernel.cpp | 4
-rw-r--r--  src/core/NEON/kernels/NEGEMMLowpMatrixMultiplyKernel.cpp | 5
-rw-r--r--  src/core/NEON/kernels/NEGEMMLowpOffsetContributionOutputStageKernel.cpp | 438
-rw-r--r--  src/core/NEON/kernels/NEGEMMLowpReductionKernel.cpp | 275
-rw-r--r--  src/core/NEON/kernels/NEGEMMTranspose1xWKernel.cpp | 2
-rw-r--r--  src/core/NEON/kernels/NEWeightsReshapeKernel.cpp | 2
-rw-r--r--  src/core/Utils.cpp | 4
-rw-r--r--  src/core/utils/quantization/AsymmHelpers.cpp | 38
-rw-r--r--  src/runtime/NEON/functions/NEGEMMAssemblyDispatch.cpp | 16
-rw-r--r--  src/runtime/NEON/functions/NEGEMMConvolutionLayer.cpp | 88
-rw-r--r--  src/runtime/NEON/functions/NEGEMMInterleave4x4.cpp | 7
-rw-r--r--  src/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.cpp | 148
13 files changed, 876 insertions, 287 deletions
diff --git a/src/core/NEON/kernels/NEConvertQuantizedSignednessKernel.cpp b/src/core/NEON/kernels/NEConvertQuantizedSignednessKernel.cpp
new file mode 100644
index 0000000000..39e030e434
--- /dev/null
+++ b/src/core/NEON/kernels/NEConvertQuantizedSignednessKernel.cpp
@@ -0,0 +1,136 @@
+/*
+ * Copyright (c) 2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/NEON/kernels/NEConvertQuantizedSignednessKernel.h"
+
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/NEON/wrapper/wrapper.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/Window.h"
+
+namespace arm_compute
+{
+namespace
+{
+Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED);
+
+ // Validate output if initialized
+ if(output->total_size() != 0)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(input->tensor_shape(), output->tensor_shape());
+ }
+
+ return Status{};
+}
+
+std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *output)
+{
+ // Output auto initialization if not yet initialized
+ {
+ const bool is_input_signed = input->data_type() == DataType::QASYMM8_SIGNED;
+ const DataType dt = is_input_signed ? DataType::QASYMM8 : DataType::QASYMM8_SIGNED;
+ const UniformQuantizationInfo qinfo = input->quantization_info().uniform();
+ const int offset_correction = is_input_signed ? -128 : 128;
+ const QuantizationInfo corrected_qinfo = QuantizationInfo(qinfo.scale, qinfo.offset + offset_correction);
+
+ auto_init_if_empty(*output, input->clone()->set_data_type(dt).set_quantization_info(corrected_qinfo));
+ }
+
+ return std::make_pair(Status{}, calculate_max_window(*output));
+}
+} // namespace
+
+NEConvertQuantizedSignednessKernel::NEConvertQuantizedSignednessKernel()
+ : _input(nullptr), _output(nullptr)
+{
+}
+
+void NEConvertQuantizedSignednessKernel::configure(const ITensor *input, ITensor *output)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
+ ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info()));
+
+ _input = input;
+ _output = output;
+
+ std::pair<Status, Window> win_config = validate_and_configure_window(input->info(), output->info());
+ ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
+ INEKernel::configure(win_config.second);
+}
+
+Status NEConvertQuantizedSignednessKernel::validate(const arm_compute::ITensorInfo *input, const arm_compute::ITensorInfo *output)
+{
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output));
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(), output->clone().get()).first);
+ return Status{};
+}
+
+void NEConvertQuantizedSignednessKernel::run(const Window &window, const ThreadInfo &info)
+{
+ ARM_COMPUTE_UNUSED(info);
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
+
+ Window win_collapsed = window.collapse_if_possible(window, Window::DimZ);
+ win_collapsed.set(Window::DimX, Window::Dimension(0, 1, 1));
+
+ Iterator input(_input, win_collapsed);
+ Iterator output(_output, win_collapsed);
+
+ const int window_step_x = 16;
+ const auto window_start_x = static_cast<int>(window.x().start());
+ const auto window_end_x = static_cast<int>(window.x().end());
+
+ const uint8_t mask = 128;
+ const auto vmask = wrapper::vdup_n(mask, wrapper::traits::vector_128_tag{});
+
+ execute_window_loop(win_collapsed, [&](const Coordinates &)
+ {
+ const auto input_ptr = reinterpret_cast<const uint8_t *>(input.ptr());
+ const auto output_ptr = reinterpret_cast<uint8_t *>(output.ptr());
+
+ // Compute 16 elements per iteration
+ int x = window_start_x;
+ for(; x <= (window_end_x - window_step_x); x += window_step_x)
+ {
+ const auto vin = wrapper::vloadq(input_ptr + x);
+ wrapper::vstore(output_ptr + x, wrapper::veor(vin, vmask));
+ }
+
+ // Compute left-over elements
+ for(; x < window_end_x; ++x)
+ {
+ const uint8_t in = *(reinterpret_cast<const uint8_t *>(input_ptr + x));
+ *(output_ptr + x) = in ^ mask;
+ }
+ },
+ input, output);
+}
+} // namespace arm_compute
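The new kernel converts between QASYMM8 and QASYMM8_SIGNED by XOR-ing every element with 0x80, which shifts each code point by 128; the configure step keeps the scale and moves the zero point by 128 accordingly (the offset_correction above). A minimal scalar sketch of the bit trick, plain C++ with no NEON wrappers and purely illustrative values:

#include <cassert>
#include <cstdint>

int main()
{
    for(int q = 0; q < 256; ++q)
    {
        const uint8_t qu = static_cast<uint8_t>(q);
        const int8_t  qs = static_cast<int8_t>(qu ^ 0x80);                  // signed view produced by the kernel
        assert(static_cast<int32_t>(qs) == static_cast<int32_t>(qu) - 128); // same code point, shifted by 128
        assert(static_cast<uint8_t>(qs ^ 0x80) == qu);                      // the mapping is its own inverse
    }
    return 0;
}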
diff --git a/src/core/NEON/kernels/NEGEMMInterleave4x4Kernel.cpp b/src/core/NEON/kernels/NEGEMMInterleave4x4Kernel.cpp
index c929983162..a9c04824ae 100644
--- a/src/core/NEON/kernels/NEGEMMInterleave4x4Kernel.cpp
+++ b/src/core/NEON/kernels/NEGEMMInterleave4x4Kernel.cpp
@@ -45,9 +45,7 @@ namespace
Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output)
{
//Note: ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(input) is not needed here as this kernel doesn't use NEON FP16 instructions.
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::U8, DataType::S8,
- DataType::U16, DataType::S16, DataType::U32, DataType::S32,
- DataType::F16, DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON(input->data_type() == DataType::UNKNOWN);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
if(output->total_size() != 0)
diff --git a/src/core/NEON/kernels/NEGEMMLowpMatrixMultiplyKernel.cpp b/src/core/NEON/kernels/NEGEMMLowpMatrixMultiplyKernel.cpp
index 6cec51d5a2..8f5a208cbb 100644
--- a/src/core/NEON/kernels/NEGEMMLowpMatrixMultiplyKernel.cpp
+++ b/src/core/NEON/kernels/NEGEMMLowpMatrixMultiplyKernel.cpp
@@ -722,8 +722,8 @@ namespace
{
Status validate_arguments(const ITensorInfo *input0, const ITensorInfo *input1, const ITensorInfo *output)
{
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input0, 1, DataType::QASYMM8, DataType::S8, DataType::U8);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input0, input1);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input0, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::S8, DataType::U8);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input1, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::QSYMM8_PER_CHANNEL, DataType::S8, DataType::U8);
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::S32);
TensorShape in0_shape = input0->tensor_shape();
@@ -917,6 +917,7 @@ void NEGEMMLowpMatrixMultiplyKernel::run(const Window &window, const ThreadInfo
switch(_input0->info()->data_type())
{
case DataType::S8:
+ case DataType::QASYMM8_SIGNED:
{
matrix_multiply_s8(ina, inb, out, width_b, out_stride, window);
break;
diff --git a/src/core/NEON/kernels/NEGEMMLowpOffsetContributionOutputStageKernel.cpp b/src/core/NEON/kernels/NEGEMMLowpOffsetContributionOutputStageKernel.cpp
index 46e53cec12..3ada3a3c4f 100644
--- a/src/core/NEON/kernels/NEGEMMLowpOffsetContributionOutputStageKernel.cpp
+++ b/src/core/NEON/kernels/NEGEMMLowpOffsetContributionOutputStageKernel.cpp
@@ -72,6 +72,58 @@ inline int32x4x4_t load(const int32_t *ptr, int32_t x)
};
}
+inline int32x4x4_t add_s32(int32x4x4_t a, int32x4_t b)
+{
+ return
+ {
+ {
+ vaddq_s32(a.val[0], b),
+ vaddq_s32(a.val[1], b),
+ vaddq_s32(a.val[2], b),
+ vaddq_s32(a.val[3], b)
+ }
+ };
+}
+
+inline int32x4x4_t add_s32(int32x4x4_t a, int32x4x4_t b)
+{
+ return
+ {
+ {
+ vaddq_s32(a.val[0], b.val[0]),
+ vaddq_s32(a.val[1], b.val[1]),
+ vaddq_s32(a.val[2], b.val[2]),
+ vaddq_s32(a.val[3], b.val[3])
+ }
+ };
+}
+
+inline int32x4x4_t mul_s32(int32x4x4_t &a, int32_t mul_scalar)
+{
+ return
+ {
+ {
+ vmulq_n_s32(a.val[0], mul_scalar),
+ vmulq_n_s32(a.val[1], mul_scalar),
+ vmulq_n_s32(a.val[2], mul_scalar),
+ vmulq_n_s32(a.val[3], mul_scalar)
+ }
+ };
+}
+
+inline int32x4x4_t mul_s32(int32x4x4_t &a, const int32_t *multiplier)
+{
+ return
+ {
+ {
+ vmulq_s32(a.val[0], vld1q_s32(multiplier)),
+ vmulq_s32(a.val[1], vld1q_s32(multiplier + 4)),
+ vmulq_s32(a.val[2], vld1q_s32(multiplier + 8)),
+ vmulq_s32(a.val[3], vld1q_s32(multiplier + 12))
+ }
+ };
+}
+
inline int32x4x4_t get_a_offset(const int32_t *vector_sum_col_ptr, int32_t a_offset, int32_t x)
{
int32x4x4_t a_offset_term_s32 = load(vector_sum_col_ptr, x);
@@ -141,6 +193,82 @@ inline uint8x16_t finalize_quantization_floating_point(int32x4x4_t &in_s32, int3
return out_u8;
}
+template <bool is_bounded_relu>
+inline int8x16_t finalize_quantization_floating_point(int32x4x4_t &in_s32, int32x4_t result_shift_s32, int8x16_t min_s8, int8x16_t max_s8)
+{
+ const static int32x4_t zero_s32 = vdupq_n_s32(0);
+
+ // Shift final result (negative value shift right)
+ in_s32.val[0] = vshlq_s32(in_s32.val[0], result_shift_s32);
+ in_s32.val[1] = vshlq_s32(in_s32.val[1], result_shift_s32);
+ in_s32.val[2] = vshlq_s32(in_s32.val[2], result_shift_s32);
+ in_s32.val[3] = vshlq_s32(in_s32.val[3], result_shift_s32);
+
+ // Saturate negative values
+ in_s32.val[0] = vmaxq_s32(in_s32.val[0], zero_s32);
+ in_s32.val[1] = vmaxq_s32(in_s32.val[1], zero_s32);
+ in_s32.val[2] = vmaxq_s32(in_s32.val[2], zero_s32);
+ in_s32.val[3] = vmaxq_s32(in_s32.val[3], zero_s32);
+
+ // Convert S32 to S16
+ const int16x8x2_t in_s16 =
+ {
+ {
+ vcombine_s16(vqmovn_s32(in_s32.val[0]), vqmovn_s32(in_s32.val[1])),
+ vcombine_s16(vqmovn_s32(in_s32.val[2]), vqmovn_s32(in_s32.val[3]))
+ }
+ };
+
+ // Convert S16 to S8
+ int8x16_t out_s8 = vcombine_s8(vqmovn_s16(in_s16.val[0]), vqmovn_s16(in_s16.val[1]));
+
+ if(is_bounded_relu)
+ {
+ out_s8 = vmaxq_s8(out_s8, min_s8);
+ out_s8 = vminq_s8(out_s8, max_s8);
+ }
+
+ return out_s8;
+}
+
+template <bool is_bounded_relu>
+inline int8x16_t finalize_quantization_floating_point(int32x4x4_t &in_s32, int32x4x4_t result_shift_s32, int8x16_t min_s8, int8x16_t max_s8)
+{
+ const static int32x4_t zero_s32 = vdupq_n_s32(0);
+
+ // Shift final result (negative value shift right)
+ in_s32.val[0] = vshlq_s32(in_s32.val[0], vnegq_s32(result_shift_s32.val[0]));
+ in_s32.val[1] = vshlq_s32(in_s32.val[1], vnegq_s32(result_shift_s32.val[1]));
+ in_s32.val[2] = vshlq_s32(in_s32.val[2], vnegq_s32(result_shift_s32.val[2]));
+ in_s32.val[3] = vshlq_s32(in_s32.val[3], vnegq_s32(result_shift_s32.val[3]));
+
+ // Saturate negative values
+ in_s32.val[0] = vmaxq_s32(in_s32.val[0], zero_s32);
+ in_s32.val[1] = vmaxq_s32(in_s32.val[1], zero_s32);
+ in_s32.val[2] = vmaxq_s32(in_s32.val[2], zero_s32);
+ in_s32.val[3] = vmaxq_s32(in_s32.val[3], zero_s32);
+
+ // Convert S32 to S16
+ const int16x8x2_t in_s16 =
+ {
+ {
+ vcombine_s16(vqmovn_s32(in_s32.val[0]), vqmovn_s32(in_s32.val[1])),
+ vcombine_s16(vqmovn_s32(in_s32.val[2]), vqmovn_s32(in_s32.val[3]))
+ }
+ };
+
+ // Convert S16 to S8
+ int8x16_t out_s8 = vcombine_s8(vqmovn_s16(in_s16.val[0]), vqmovn_s16(in_s16.val[1]));
+
+ if(is_bounded_relu)
+ {
+ out_s8 = vmaxq_s8(out_s8, min_s8);
+ out_s8 = vminq_s8(out_s8, max_s8);
+ }
+
+ return out_s8;
+}
+
inline Window get_win_vector_sum(const Window &window)
{
Window win_vector_sum(window);
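The two overloads added above are the signed counterparts of the existing QUANTIZE_DOWN ("floating point") finalization: shift the multiplied accumulator right, clamp negatives to zero, saturate to int8, then optionally apply bounded-ReLU limits. A scalar sketch of the same sequence, assuming shift is the non-negative right-shift amount; it mirrors the vector code rather than being copied from the library:

#include <algorithm>
#include <cstdint>

int8_t quantize_down_s8(int32_t acc, int32_t multiplier, int32_t shift,
                        int32_t min_bound, int32_t max_bound, bool bounded_relu)
{
    int32_t v = (acc * multiplier) >> shift;                      // integer multiply, then right shift
    v = std::max(v, 0);                                           // matches the vmaxq_s32(..., zero_s32) step
    if(bounded_relu)
    {
        v = std::max(min_bound, std::min(max_bound, v));          // activation bounds
    }
    return static_cast<int8_t>(std::max(-128, std::min(127, v))); // saturate as vqmovn_s32/vqmovn_s16 would
}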
@@ -172,50 +300,12 @@ inline Iterator get_bias_it(const Window &window, const ITensor *bias)
return bias_it;
}
-inline int32x4x4_t add_s32(int32x4x4_t a, int32x4_t b)
-{
- return
- {
- {
- vaddq_s32(a.val[0], b),
- vaddq_s32(a.val[1], b),
- vaddq_s32(a.val[2], b),
- vaddq_s32(a.val[3], b)
- }
- };
-}
-
-inline int32x4x4_t add_s32(int32x4x4_t a, int32x4x4_t b)
-{
- return
- {
- {
- vaddq_s32(a.val[0], b.val[0]),
- vaddq_s32(a.val[1], b.val[1]),
- vaddq_s32(a.val[2], b.val[2]),
- vaddq_s32(a.val[3], b.val[3])
- }
- };
-}
-
-inline int32x4x4_t mul_s32(int32x4x4_t &a, int32_t mul_scalar)
-{
- return
- {
- {
- vmulq_n_s32(a.val[0], mul_scalar),
- vmulq_n_s32(a.val[1], mul_scalar),
- vmulq_n_s32(a.val[2], mul_scalar),
- vmulq_n_s32(a.val[3], mul_scalar)
- }
- };
-}
-
template <bool has_a_offset, bool has_b_offset, bool has_bias, bool is_bounded_relu, bool is_fixed_point>
inline void run_offset_contribution_output_stage_window(const int32_t *vector_sum_col_ptr, const int32_t *vector_sum_row_ptr, const int32_t *bias_ptr, Iterator mm_result_it, Iterator out_it,
const int32x4_t result_offset_s32, const int32x4_t result_shift_s32, uint8x16_t min_u8, uint8x16_t max_u8,
int32_t a_offset, int32_t b_offset, int32_t k_offset,
- GEMMLowpOutputStageInfo output_stage, int window_step_x, int window_start_x, int window_end_x)
+ int32_t multiplier, int32_t shift, int32_t offset, int32_t min_bound, int32_t max_bound,
+ int window_step_x, int window_start_x, int window_end_x)
{
int32x4x4_t offset_term_s32 = { 0, 0, 0, 0 };
if(!is_fixed_point)
@@ -251,12 +341,12 @@ inline void run_offset_contribution_output_stage_window(const int32_t *vector_su
}
if(!is_fixed_point)
{
- in_s32 = mul_s32(in_s32, output_stage.gemmlowp_multiplier);
+ in_s32 = mul_s32(in_s32, multiplier);
}
if(is_fixed_point)
{
- vst1q_u8(out_it.ptr() + x, finalize_quantization<is_bounded_relu>(in_s32, output_stage.gemmlowp_multiplier, output_stage.gemmlowp_shift, result_offset_s32, min_u8, max_u8));
+ vst1q_u8(out_it.ptr() + x, finalize_quantization<is_bounded_relu>(in_s32, multiplier, shift, result_offset_s32, min_u8, max_u8));
}
else
{
@@ -280,24 +370,99 @@ inline void run_offset_contribution_output_stage_window(const int32_t *vector_su
if(is_fixed_point)
{
// Finalize and store the result
- *(out_it.ptr() + x) = finalize_quantization<is_bounded_relu>(in_value, output_stage.gemmlowp_multiplier, output_stage.gemmlowp_shift,
- output_stage.gemmlowp_offset, static_cast<uint8_t>(output_stage.gemmlowp_min_bound), static_cast<uint8_t>(output_stage.gemmlowp_max_bound));
+ *(out_it.ptr() + x) = finalize_quantization<is_bounded_relu>(in_value, multiplier, shift, offset, static_cast<uint8_t>(min_bound), static_cast<uint8_t>(max_bound));
}
else
{
// Finalize quantization
- in_value = (in_value * output_stage.gemmlowp_multiplier) >> output_stage.gemmlowp_shift;
+ in_value = (in_value * multiplier) >> shift;
// Bound and store the result
if(is_bounded_relu)
{
- in_value = static_cast<uint8_t>(std::max<int32_t>(output_stage.gemmlowp_min_bound, std::min<int32_t>(output_stage.gemmlowp_max_bound, in_value)));
+ in_value = static_cast<uint8_t>(std::max<int32_t>(min_bound, std::min<int32_t>(max_bound, in_value)));
}
*(out_it.ptr() + x) = static_cast<uint8_t>(std::max<int32_t>(0, std::min<int32_t>(255, in_value)));
}
}
}
+template <bool has_a_offset, bool has_bias, bool is_bounded_relu, bool is_fixed_point>
+inline void run_offset_contribution_output_stage_window_symm(const int32_t *vector_sum_col_ptr, const int32_t *bias_ptr, Iterator mm_result_it, Iterator out_it,
+ const int32_t *result_multipliers, const int32_t *result_shifts,
+ const int32x4_t result_offset, int8x16_t min_s8, int8x16_t max_s8,
+ int32_t a_offset, int32_t offset, int32_t min_bound, int32_t max_bound,
+ int window_step_x, int window_start_x, int window_end_x)
+{
+ int32x4x4_t offset_term_s32 = { 0, 0, 0, 0 };
+ if(!is_fixed_point)
+ {
+ // Combine quantization offset with other offsets.
+ offset_term_s32 = add_s32(offset_term_s32, result_offset);
+ }
+
+ int x = window_start_x;
+ for(; x <= (window_end_x - window_step_x); x += window_step_x)
+ {
+ int32x4x4_t in_s32 = load_results_input(mm_result_it, x);
+
+ if(has_a_offset)
+ {
+ in_s32 = add_s32(in_s32, get_a_offset(vector_sum_col_ptr, a_offset, x));
+ }
+ if(has_bias)
+ {
+ in_s32 = add_s32(in_s32, load(bias_ptr, x));
+ }
+ if(!is_fixed_point)
+ {
+ in_s32 = add_s32(in_s32, offset_term_s32);
+ in_s32 = mul_s32(in_s32, result_multipliers + x);
+ }
+
+ if(is_fixed_point)
+ {
+ vst1q_s8(reinterpret_cast<int8_t *>(out_it.ptr() + x), finalize_quantization_symm<is_bounded_relu>(in_s32, load(result_multipliers, x), load(result_shifts, x), result_offset, min_s8, max_s8));
+ }
+ else
+ {
+ vst1q_s8(reinterpret_cast<int8_t *>(out_it.ptr() + x), finalize_quantization_floating_point<is_bounded_relu>(in_s32, load(result_shifts, x), min_s8, max_s8));
+ }
+ }
+ // Compute left-over elements
+ for(; x < window_end_x; ++x)
+ {
+ int32_t in_value = *(reinterpret_cast<const int32_t *>(mm_result_it.ptr()) + x) + wrapper::vgetlane(offset_term_s32.val[0], 0);
+
+ if(has_a_offset)
+ {
+ in_value += (*(vector_sum_col_ptr + x) * a_offset);
+ }
+ if(has_bias)
+ {
+ in_value += *(bias_ptr + x);
+ }
+
+ if(is_fixed_point)
+ {
+ // Finalize and store the result
+ *(out_it.ptr() + x) = finalize_quantization<is_bounded_relu>(in_value, result_multipliers[x], result_shifts[x], offset, static_cast<int8_t>(min_bound), static_cast<int8_t>(max_bound));
+ }
+ else
+ {
+ // Finalize quantization
+ in_value = (in_value * result_multipliers[x]) >> (-result_shifts[x]);
+
+ // Bound and store the result
+ if(is_bounded_relu)
+ {
+ in_value = static_cast<int8_t>(std::max<int32_t>(min_bound, std::min<int32_t>(max_bound, in_value)));
+ }
+ *(out_it.ptr() + x) = static_cast<int8_t>(std::max<int32_t>(-128, std::min<int32_t>(127, in_value)));
+ }
+ }
+}
+
template <bool is_gemm3d, bool is_bounded_relu, bool is_fixed_point>
void run_offset_contribution_output_stage(const Window &window,
const ITensor *mm_result, const ITensor *vector_sum_col, const ITensor *vector_sum_row, const ITensor *bias, ITensor *output,
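These offset-contribution functions never requantize the GEMM from scratch: they correct the raw S32 product with per-row sums of A, per-column sums of B and a constant k_offset. That is sufficient because of the identity below (z_a, z_b are the zero points; the kernel's internal a_offset/b_offset follow the library's own sign convention):

    sum_k (a_ik - z_a) * (b_kj - z_b)
        = sum_k a_ik * b_kj  -  z_b * sum_k a_ik  -  z_a * sum_k b_kj  +  K * z_a * z_b

A tiny self-check of that identity with made-up values:

#include <cassert>
#include <cstdint>

int main()
{
    const int32_t a[3] = { 12, 250, 3 }, b[3] = { 7, 130, 255 }; // arbitrary quantized values
    const int32_t z_a = 10, z_b = 128, K = 3;                    // arbitrary zero points

    int32_t direct = 0, raw = 0, sum_a = 0, sum_b = 0;
    for(int k = 0; k < K; ++k)
    {
        direct += (a[k] - z_a) * (b[k] - z_b);
        raw    += a[k] * b[k];
        sum_a  += a[k];
        sum_b  += b[k];
    }
    assert(direct == raw - z_b * sum_a - z_a * sum_b + K * z_a * z_b);
    return 0;
}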
@@ -307,10 +472,16 @@ void run_offset_contribution_output_stage(const Window &window,
const int height_input = is_gemm3d ? mm_result->info()->dimension(1) : 0;
const int depth_input = is_gemm3d ? mm_result->info()->dimension(2) : 1;
- const int32x4_t result_offset_s32 = vdupq_n_s32(output_stage.gemmlowp_offset);
- const int32x4_t result_shift_s32 = vdupq_n_s32(is_fixed_point ? output_stage.gemmlowp_shift : -output_stage.gemmlowp_shift);
- const uint8x16_t min_u8 = vdupq_n_u8(static_cast<uint8_t>(output_stage.gemmlowp_min_bound));
- const uint8x16_t max_u8 = vdupq_n_u8(static_cast<uint8_t>(output_stage.gemmlowp_max_bound));
+ const int32_t multiplier = output_stage.gemmlowp_multiplier;
+ const int32_t shift = output_stage.gemmlowp_shift;
+ const int32_t offset = output_stage.gemmlowp_offset;
+ const int32_t min_bound = output_stage.gemmlowp_min_bound;
+ const int32_t max_bound = output_stage.gemmlowp_max_bound;
+
+ const int32x4_t result_offset_s32 = vdupq_n_s32(offset);
+ const int32x4_t result_shift_s32 = vdupq_n_s32(is_fixed_point ? shift : -shift);
+ const uint8x16_t min_u8 = vdupq_n_u8(static_cast<uint8_t>(min_bound));
+ const uint8x16_t max_u8 = vdupq_n_u8(static_cast<uint8_t>(max_bound));
const int window_step_x = 16;
const auto window_start_x = static_cast<int>(window.x().start());
@@ -349,7 +520,8 @@ void run_offset_contribution_output_stage(const Window &window,
run_offset_contribution_output_stage_window<true, true, true, is_bounded_relu, is_fixed_point>(vector_sum_col_ptr, vector_sum_row_ptr, reinterpret_cast<const int32_t *>(bias_it.ptr()), mm_result_it,
out_it,
result_offset_s32, result_shift_s32, min_u8, max_u8, a_offset, b_offset, k_offset,
- output_stage, window_step_x, window_start_x, window_end_x);
+ multiplier, shift, offset, min_bound, max_bound,
+ window_step_x, window_start_x, window_end_x);
},
vector_sum_col_it, vector_sum_row_it, bias_it, mm_result_it, out_it);
}
@@ -363,7 +535,8 @@ void run_offset_contribution_output_stage(const Window &window,
+ id.y() + (id.z() % depth_input) * height_input;
run_offset_contribution_output_stage_window<true, true, false, is_bounded_relu, is_fixed_point>(vector_sum_col_ptr, vector_sum_row_ptr, nullptr, mm_result_it, out_it,
result_offset_s32, result_shift_s32, min_u8, max_u8, a_offset, b_offset, k_offset,
- output_stage, window_step_x, window_start_x, window_end_x);
+ multiplier, shift, offset, min_bound, max_bound,
+ window_step_x, window_start_x, window_end_x);
},
vector_sum_col_it, vector_sum_row_it, mm_result_it, out_it);
}
@@ -386,7 +559,8 @@ void run_offset_contribution_output_stage(const Window &window,
+ id.y() + (id.z() % depth_input) * height_input;
run_offset_contribution_output_stage_window<false, true, true, is_bounded_relu, is_fixed_point>(nullptr, vector_sum_row_ptr, reinterpret_cast<const int32_t *>(bias_it.ptr()), mm_result_it, out_it,
result_offset_s32, result_shift_s32, min_u8, max_u8, a_offset, b_offset, k_offset,
- output_stage, window_step_x, window_start_x, window_end_x);
+ multiplier, shift, offset, min_bound, max_bound,
+ window_step_x, window_start_x, window_end_x);
},
vector_sum_row_it, bias_it, mm_result_it, out_it);
}
@@ -399,7 +573,8 @@ void run_offset_contribution_output_stage(const Window &window,
+ id.y() + (id.z() % depth_input) * height_input;
run_offset_contribution_output_stage_window<false, true, false, is_bounded_relu, is_fixed_point>(nullptr, vector_sum_row_ptr, nullptr, mm_result_it, out_it,
result_offset_s32, result_shift_s32, min_u8, max_u8, a_offset, b_offset, k_offset,
- output_stage, window_step_x, window_start_x, window_end_x);
+ multiplier, shift, offset, min_bound, max_bound,
+ window_step_x, window_start_x, window_end_x);
},
vector_sum_row_it, mm_result_it, out_it);
}
@@ -422,7 +597,8 @@ void run_offset_contribution_output_stage(const Window &window,
const auto vector_sum_col_ptr = reinterpret_cast<const int32_t *>(vector_sum_col_it.ptr() + batch_id * vector_sum_col_batch_offset);
run_offset_contribution_output_stage_window<true, false, true, is_bounded_relu, is_fixed_point>(vector_sum_col_ptr, nullptr, reinterpret_cast<const int32_t *>(bias_it.ptr()), mm_result_it, out_it,
result_offset_s32, result_shift_s32, min_u8, max_u8, a_offset, b_offset, k_offset,
- output_stage, window_step_x, window_start_x, window_end_x);
+ multiplier, shift, offset, min_bound, max_bound,
+ window_step_x, window_start_x, window_end_x);
},
vector_sum_col_it, bias_it, mm_result_it, out_it);
}
@@ -434,7 +610,8 @@ void run_offset_contribution_output_stage(const Window &window,
const auto vector_sum_col_ptr = reinterpret_cast<const int32_t *>(vector_sum_col_it.ptr() + batch_id * vector_sum_col_batch_offset);
run_offset_contribution_output_stage_window<true, false, false, is_bounded_relu, is_fixed_point>(vector_sum_col_ptr, nullptr, nullptr, mm_result_it, out_it,
result_offset_s32, result_shift_s32, min_u8, max_u8, a_offset, b_offset, k_offset,
- output_stage, window_step_x, window_start_x, window_end_x);
+ multiplier, shift, offset, min_bound, max_bound,
+ window_step_x, window_start_x, window_end_x);
},
vector_sum_col_it, mm_result_it, out_it);
}
@@ -448,7 +625,8 @@ void run_offset_contribution_output_stage(const Window &window,
{
run_offset_contribution_output_stage_window<false, false, true, is_bounded_relu, is_fixed_point>(nullptr, nullptr, reinterpret_cast<const int32_t *>(bias_it.ptr()), mm_result_it, out_it,
result_offset_s32, result_shift_s32, min_u8, max_u8, a_offset, b_offset, k_offset,
- output_stage, window_step_x, window_start_x, window_end_x);
+ multiplier, shift, offset, min_bound, max_bound,
+ window_step_x, window_start_x, window_end_x);
},
bias_it, mm_result_it, out_it);
}
@@ -458,7 +636,110 @@ void run_offset_contribution_output_stage(const Window &window,
{
run_offset_contribution_output_stage_window<false, false, false, is_bounded_relu, is_fixed_point>(nullptr, nullptr, nullptr, mm_result_it, out_it,
result_offset_s32, result_shift_s32, min_u8, max_u8, a_offset, b_offset, k_offset,
- output_stage, window_step_x, window_start_x, window_end_x);
+ multiplier, shift, offset, min_bound, max_bound,
+ window_step_x, window_start_x, window_end_x);
+ },
+ mm_result_it, out_it);
+ }
+ return;
+ }
+}
+
+template <bool is_gemm3d, bool is_bounded_relu, bool is_fixed_point>
+void run_offset_contribution_output_stage_symm(const Window &window,
+ const ITensor *mm_result, const ITensor *vector_sum_col, const ITensor *vector_sum_row, const ITensor *bias, ITensor *output,
+ int32_t a_offset, int32_t b_offset, int32_t k_offset, bool slide_vector_sum_col,
+ GEMMLowpOutputStageInfo output_stage)
+{
+ ARM_COMPUTE_UNUSED(vector_sum_row, b_offset, k_offset);
+
+ const int depth_input = is_gemm3d ? mm_result->info()->dimension(2) : 1;
+
+ const int32_t offset = output_stage.gemmlowp_offset;
+ const int32_t min_bound = output_stage.gemmlowp_min_bound;
+ const int32_t max_bound = output_stage.gemmlowp_max_bound;
+
+ const int32_t *result_multipliers = output_stage.gemmlowp_multipliers.data();
+ const int32_t *result_shifts = output_stage.gemmlowp_shifts.data();
+ const int32x4_t result_offset_s32 = vdupq_n_s32(offset);
+ const int8x16_t min_s8 = vdupq_n_s8(static_cast<int8_t>(min_bound));
+ const int8x16_t max_s8 = vdupq_n_s8(static_cast<int8_t>(max_bound));
+
+ const int window_step_x = 16;
+ const auto window_start_x = static_cast<int>(window.x().start());
+ const auto window_end_x = static_cast<int>(window.x().end());
+
+ Window win(window);
+ win.set(Window::DimX, Window::Dimension(0, 1, 1));
+
+ Window collapsed_window = win.collapse_if_possible(win, Window::DimZ);
+
+ Iterator mm_result_it(mm_result, win);
+ Iterator out_it(output, win);
+
+ if(a_offset != 0)
+ {
+ ARM_COMPUTE_ERROR_ON_NULLPTR(vector_sum_col);
+
+ Iterator vector_sum_col_it = get_vector_sum_col_it(collapsed_window, vector_sum_col);
+
+ // Offset in case vector_sum_col is batched
+ const int vector_sum_col_batch_offset = slide_vector_sum_col ? vector_sum_col->info()->strides_in_bytes().z() : 0;
+
+ if(bias != nullptr)
+ {
+ Iterator bias_it = get_bias_it(collapsed_window, bias);
+ execute_window_loop(collapsed_window, [&](const Coordinates & id)
+ {
+ const int batch_id = id.z() / depth_input;
+ const auto vector_sum_col_ptr = reinterpret_cast<const int32_t *>(vector_sum_col_it.ptr() + batch_id * vector_sum_col_batch_offset);
+ run_offset_contribution_output_stage_window_symm<true, true, is_bounded_relu, is_fixed_point>(vector_sum_col_ptr, reinterpret_cast<const int32_t *>(bias_it.ptr()), mm_result_it, out_it,
+ result_multipliers, result_shifts,
+ result_offset_s32, min_s8, max_s8,
+ a_offset, offset, min_bound, max_bound,
+ window_step_x, window_start_x, window_end_x);
+ },
+ vector_sum_col_it, bias_it, mm_result_it, out_it);
+ }
+ else
+ {
+ execute_window_loop(collapsed_window, [&](const Coordinates & id)
+ {
+ const int batch_id = id.z() / depth_input;
+ const auto vector_sum_col_ptr = reinterpret_cast<const int32_t *>(vector_sum_col_it.ptr() + batch_id * vector_sum_col_batch_offset);
+ run_offset_contribution_output_stage_window_symm<true, false, is_bounded_relu, is_fixed_point>(vector_sum_col_ptr, nullptr, mm_result_it, out_it,
+ result_multipliers, result_shifts,
+ result_offset_s32, min_s8, max_s8,
+ a_offset, offset, min_bound, max_bound,
+ window_step_x, window_start_x, window_end_x);
+ },
+ vector_sum_col_it, mm_result_it, out_it);
+ }
+ }
+ else
+ {
+ if(bias != nullptr)
+ {
+ Iterator bias_it = get_bias_it(collapsed_window, bias);
+ execute_window_loop(collapsed_window, [&](const Coordinates &)
+ {
+ run_offset_contribution_output_stage_window_symm<false, true, is_bounded_relu, is_fixed_point>(nullptr, reinterpret_cast<const int32_t *>(bias_it.ptr()), mm_result_it, out_it,
+ result_multipliers, result_shifts,
+ result_offset_s32, min_s8, max_s8,
+ a_offset, offset, min_bound, max_bound,
+ window_step_x, window_start_x, window_end_x);
+ },
+ bias_it, mm_result_it, out_it);
+ }
+ else
+ {
+ execute_window_loop(collapsed_window, [&](const Coordinates &)
+ {
+ run_offset_contribution_output_stage_window_symm<false, false, is_bounded_relu, is_fixed_point>(nullptr, nullptr, mm_result_it, out_it,
+ result_multipliers, result_shifts,
+ result_offset_s32, min_s8, max_s8,
+ a_offset, offset, min_bound, max_bound,
+ window_step_x, window_start_x, window_end_x);
},
mm_result_it, out_it);
}
@@ -470,8 +751,18 @@ Status validate_arguments(const ITensorInfo *mm_result, const ITensorInfo *vecto
int32_t a_offset, int32_t b_offset, GEMMLowpOutputStageInfo output_stage)
{
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(mm_result, 1, DataType::S32);
- ARM_COMPUTE_RETURN_ERROR_ON(output_stage.gemmlowp_max_bound > 255);
- ARM_COMPUTE_RETURN_ERROR_ON(output_stage.gemmlowp_min_bound < 0 || output_stage.gemmlowp_min_bound > output_stage.gemmlowp_max_bound);
+ if(output->data_type() == DataType::QASYMM8)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON(output_stage.gemmlowp_max_bound > 255);
+ ARM_COMPUTE_RETURN_ERROR_ON(output_stage.gemmlowp_min_bound < 0);
+ }
+ else
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON(output_stage.gemmlowp_max_bound > 127);
+ ARM_COMPUTE_RETURN_ERROR_ON(output_stage.gemmlowp_min_bound < -128);
+ ARM_COMPUTE_RETURN_ERROR_ON(mm_result->dimension(0) > 1 && output_stage.gemmlowp_multipliers.size() > 1 && b_offset != 0);
+ }
+ ARM_COMPUTE_RETURN_ERROR_ON(output_stage.gemmlowp_min_bound > output_stage.gemmlowp_max_bound);
ARM_COMPUTE_RETURN_ERROR_ON(output_stage.type != GEMMLowpOutputStageType::QUANTIZE_DOWN && output_stage.type != GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT);
if(bias != nullptr)
@@ -525,7 +816,7 @@ Status validate_arguments(const ITensorInfo *mm_result, const ITensorInfo *vecto
if(output->total_size() != 0)
{
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::QASYMM8);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(mm_result, output);
}
@@ -551,7 +842,7 @@ std::pair<Status, Window> validate_and_configure_window(ITensorInfo *mm_result,
}
NEGEMMLowpOffsetContributionOutputStageKernel::NEGEMMLowpOffsetContributionOutputStageFunction
-get_configured_function(const ITensor *mm_result, const ITensor *vector_sum_row, GEMMLowpOutputStageInfo output_stage)
+get_configured_function(const ITensor *mm_result, const ITensor *vector_sum_row, const ITensor *output, GEMMLowpOutputStageInfo output_stage)
{
static std::map<uint8_t, NEGEMMLowpOffsetContributionOutputStageKernel::NEGEMMLowpOffsetContributionOutputStageFunction> map_function =
{
@@ -562,7 +853,15 @@ get_configured_function(const ITensor *mm_result, const ITensor *vector_sum_row,
{ 4, &run_offset_contribution_output_stage<false, false, true> },
{ 5, &run_offset_contribution_output_stage<true, false, true> },
{ 6, &run_offset_contribution_output_stage<false, true, true> },
- { 7, &run_offset_contribution_output_stage<true, true, true> }
+ { 7, &run_offset_contribution_output_stage<true, true, true> },
+ { 8, &run_offset_contribution_output_stage_symm<false, false, false> },
+ { 9, &run_offset_contribution_output_stage_symm<true, false, false> },
+ { 10, &run_offset_contribution_output_stage_symm<false, true, false> },
+ { 11, &run_offset_contribution_output_stage_symm<true, true, false> },
+ { 12, &run_offset_contribution_output_stage_symm<false, false, true> },
+ { 13, &run_offset_contribution_output_stage_symm<true, false, true> },
+ { 14, &run_offset_contribution_output_stage_symm<false, true, true> },
+ { 15, &run_offset_contribution_output_stage_symm<true, true, true> }
};
// Check if input is a 3D reinterpretation
@@ -574,11 +873,15 @@ get_configured_function(const ITensor *mm_result, const ITensor *vector_sum_row,
const bool is_bounded_relu = ((output_stage.gemmlowp_min_bound != output_stage.gemmlowp_max_bound)
&& !(output_stage.gemmlowp_min_bound == 0 && output_stage.gemmlowp_max_bound == 255));
+ // Check if we need to perform fixed point requantization
const bool is_fixed_point = output_stage.type != GEMMLowpOutputStageType::QUANTIZE_DOWN;
+ // Check if symmetric per-channel execution
+ const bool is_symm = output->info()->data_type() == DataType::QASYMM8_SIGNED;
+
// key acts as a bitset, setting the first bit on reinterpret_as_3d,
// the second on is_bounded_relu, and the third on is_fixed_point.
- uint8_t key = (reinterpret_as_3d ? 1UL : 0UL) | ((is_bounded_relu ? 1UL : 0UL) << 1) | ((is_fixed_point ? 1UL : 0UL) << 2);
+ uint8_t key = (reinterpret_as_3d ? 1UL : 0UL) | ((is_bounded_relu ? 1UL : 0UL) << 1) | ((is_fixed_point ? 1UL : 0UL) << 2) | ((is_symm ? 1UL : 0UL) << 3);
return map_function.find(key)->second;
}
} // namespace
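get_configured_function() now folds a fourth flag into its lookup key: bit 3 (is_symm) selects the run_offset_contribution_output_stage_symm variants used for signed output, on top of the three bits listed in the comment above it. A minimal sketch of the key construction (hypothetical standalone helper, not library code):

#include <cstdint>

uint8_t make_key(bool reinterpret_as_3d, bool is_bounded_relu, bool is_fixed_point, bool is_symm)
{
    return static_cast<uint8_t>((reinterpret_as_3d ? 1U : 0U)
                                | ((is_bounded_relu ? 1U : 0U) << 1)
                                | ((is_fixed_point ? 1U : 0U) << 2)
                                | ((is_symm ? 1U : 0U) << 3));
}

// e.g. make_key(false, true, true, false) == 6  -> run_offset_contribution_output_stage<false, true, true>
//      make_key(false, true, true, true)  == 14 -> run_offset_contribution_output_stage_symm<false, true, true>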
@@ -591,8 +894,9 @@ NEGEMMLowpOffsetContributionOutputStageKernel::NEGEMMLowpOffsetContributionOutpu
}
void NEGEMMLowpOffsetContributionOutputStageKernel::configure(const ITensor *mm_result, const ITensor *vector_sum_col,
- const ITensor *vector_sum_row, const ITensor *bias, ITensor *output, int32_t k,
- int32_t a_offset, int32_t b_offset, GEMMLowpOutputStageInfo output_stage)
+ const ITensor *vector_sum_row, const ITensor *bias, ITensor *output,
+ int32_t k, int32_t a_offset, int32_t b_offset,
+ GEMMLowpOutputStageInfo output_stage)
{
// Perform validate step
ARM_COMPUTE_ERROR_ON_NULLPTR(mm_result, output);
@@ -627,7 +931,7 @@ void NEGEMMLowpOffsetContributionOutputStageKernel::configure(const ITensor *mm_
ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
INEKernel::configure(win_config.second);
- _function = get_configured_function(mm_result, vector_sum_row, output_stage);
+ _function = get_configured_function(mm_result, vector_sum_row, output, output_stage);
}
Status NEGEMMLowpOffsetContributionOutputStageKernel::validate(const ITensorInfo *mm_result, const ITensorInfo *vector_sum_col,
diff --git a/src/core/NEON/kernels/NEGEMMLowpReductionKernel.cpp b/src/core/NEON/kernels/NEGEMMLowpReductionKernel.cpp
index c1ee770db5..72632492d7 100644
--- a/src/core/NEON/kernels/NEGEMMLowpReductionKernel.cpp
+++ b/src/core/NEON/kernels/NEGEMMLowpReductionKernel.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017, 2018 ARM Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -27,13 +27,13 @@
#include "arm_compute/core/Error.h"
#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/NEON/wrapper/wrapper.h"
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/core/Utils.h"
#include "arm_compute/core/Validate.h"
#include "arm_compute/core/Window.h"
-#include <arm_neon.h>
#include <cstddef>
#include <cstdint>
@@ -48,7 +48,7 @@ namespace
{
Status validate_arguments_matrix_a_reduction(const ITensorInfo *input, const ITensorInfo *output)
{
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED);
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::S32);
return Status{};
@@ -72,7 +72,7 @@ std::pair<Status, Window> validate_and_configure_window_matrix_a_reduction(ITens
Status validate_arguments_matrix_b_reduction(const ITensorInfo *input, const ITensorInfo *output)
{
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::QSYMM8_PER_CHANNEL);
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::S32);
return Status{};
@@ -128,11 +128,12 @@ Status NEGEMMLowpMatrixAReductionKernel::validate(const ITensorInfo *mtx_a, cons
return Status{};
}
-void NEGEMMLowpMatrixAReductionKernel::run(const Window &window, const ThreadInfo &info)
+template <typename T>
+void NEGEMMLowpMatrixAReductionKernel::run_internal(const arm_compute::Window &window)
{
- ARM_COMPUTE_UNUSED(info);
- ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
- ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
+ // Intermediate and final accumulator types
+ using TIAcc = wrapper::traits::promote_t<T>;
+ using TAcc = wrapper::traits::promote_t<TIAcc>;
Window collapsed_window = window.collapse_if_possible(IKernel::window(), Window::DimY);
@@ -149,9 +150,9 @@ void NEGEMMLowpMatrixAReductionKernel::run(const Window &window, const ThreadInf
execute_window_loop(collapsed_window, [&](const Coordinates & id)
{
// Note: Since the input is unsigned char, we can safely use unsigned int for the accumulation
- uint32x4_t sum_row = vdupq_n_u32(0);
+ auto sum_row = wrapper::vdup_n(static_cast<TAcc>(0), wrapper::traits::vector_128_tag{});
- const uint8_t *matrix_a = (in.ptr() + (id.x() / 4) * _input->info()->strides_in_bytes()[1] + id.y() * _input->info()->strides_in_bytes()[2]);
+ const T *matrix_a = reinterpret_cast<const T *>((in.ptr() + (id.x() / 4) * _input->info()->strides_in_bytes()[1] + id.y() * _input->info()->strides_in_bytes()[2]));
#if __arm__
asm volatile("PLD [%0, #128*4]" ::"r"(matrix_a));
@@ -161,43 +162,41 @@ void NEGEMMLowpMatrixAReductionKernel::run(const Window &window, const ThreadInf
// This for loop performs 4 accumulations
for(; i <= (_k - 4); i += 4)
{
- const uint8x16_t a0_u8 = vld1q_u8(matrix_a + i * 4);
+ const auto a0_d8 = wrapper::vloadq(matrix_a + i * 4);
- // Convert U8 to U16
- uint16x4x4_t a0_u16 =
+ // Convert 8-bit to 16-bit
+ typename wrapper::traits::neon_bitvector<TIAcc, wrapper::traits::BitWidth::W64>::type a0_d16[4] =
{
- {
- vget_low_u16(vmovl_u8(vget_low_u8(a0_u8))),
- vget_high_u16(vmovl_u8(vget_low_u8(a0_u8))),
- vget_low_u16(vmovl_u8(vget_high_u8(a0_u8))),
- vget_high_u16(vmovl_u8(vget_high_u8(a0_u8)))
- }
+ wrapper::vgetlow(wrapper::vmovl(wrapper::vgetlow(a0_d8))),
+ wrapper::vgethigh(wrapper::vmovl(wrapper::vgetlow(a0_d8))),
+ wrapper::vgetlow(wrapper::vmovl((wrapper::vgethigh(a0_d8)))),
+ wrapper::vgethigh(wrapper::vmovl(wrapper::vgethigh(a0_d8)))
};
- // Accumulate to U16
- a0_u16.val[0] = vadd_u16(a0_u16.val[0], a0_u16.val[1]);
- a0_u16.val[0] = vadd_u16(a0_u16.val[0], a0_u16.val[2]);
- a0_u16.val[0] = vadd_u16(a0_u16.val[0], a0_u16.val[3]);
+ // Accumulate to 16-bit
+ a0_d16[0] = wrapper::vadd(a0_d16[0], a0_d16[1]);
+ a0_d16[0] = wrapper::vadd(a0_d16[0], a0_d16[2]);
+ a0_d16[0] = wrapper::vadd(a0_d16[0], a0_d16[3]);
- // Accumulate to U32
- sum_row = vaddw_u16(sum_row, a0_u16.val[0]);
+ // Accumulate to 32-bit
+ sum_row = wrapper::vaddw(sum_row, a0_d16[0]);
}
// This for loop performs the leftover accumulations
for(; i < _k; ++i)
{
- const uint8x8_t a0_u8 = vld1_u8(matrix_a + i * 4);
+ const auto a0_d8 = wrapper::vload(matrix_a + i * 4);
// Convert U8 to U16
- const uint16x4_t a0_u16 = vget_low_u16(vmovl_u8(a0_u8));
+ const auto a0_d16 = wrapper::vgetlow(wrapper::vmovl(a0_d8));
// Accumulate to U32
- sum_row = vaddw_u16(sum_row, a0_u16);
+ sum_row = wrapper::vaddw(sum_row, a0_d16);
}
auto vector_sum_row = reinterpret_cast<int32_t *>(out.ptr());
- vst1q_s32(vector_sum_row, vreinterpretq_s32_u32(sum_row));
+ wrapper::vstore(vector_sum_row, wrapper::vreinterpret_s32(sum_row));
},
in, out);
}
@@ -206,10 +205,10 @@ void NEGEMMLowpMatrixAReductionKernel::run(const Window &window, const ThreadInf
execute_window_loop(collapsed_window, [&](const Coordinates & id)
{
// Note: Since the input is unsigned char, we can safely use unsigned int for the accumulation
- uint32x4_t sum_row_u32 = vdupq_n_u32(0);
- uint32_t sum_row = 0;
+ auto vsum_row = wrapper::vdup_n(static_cast<TAcc>(0), wrapper::traits::vector_128_tag{});
+ TAcc sum_row = 0;
- const uint8_t *matrix_a = (in.ptr() + id.x() * _input->info()->strides_in_bytes()[1] + id.y() * _input->info()->strides_in_bytes()[2]);
+ const T *matrix_a = reinterpret_cast<const T *>((in.ptr() + id.x() * _input->info()->strides_in_bytes()[1] + id.y() * _input->info()->strides_in_bytes()[2]));
#if __arm__
asm volatile("PLD [%0, #128*4]" ::"r"(matrix_a));
@@ -219,37 +218,57 @@ void NEGEMMLowpMatrixAReductionKernel::run(const Window &window, const ThreadInf
// This for loop performs 16 accumulations
for(; i <= (_k - 16); i += 16)
{
- const uint8x16_t a0_u8 = vld1q_u8(matrix_a + i);
+ const auto a0_d8 = wrapper::vloadq(matrix_a + i);
// Partial accumulations in U16
- const uint16x8_t tmp_sum0 = vaddl_u8(vget_low_u8(a0_u8), vget_high_u8(a0_u8));
+ const auto tmp_sum0 = wrapper::vaddl(wrapper::vgetlow(a0_d8), wrapper::vgethigh(a0_d8));
// Accumulate to U32
- sum_row_u32 = vaddq_u32(sum_row_u32, vpaddlq_u16(tmp_sum0));
+ vsum_row = wrapper::vadd(vsum_row, wrapper::vpaddl(tmp_sum0));
}
// This for loop performs the leftover accumulations
for(; i < _k; ++i)
{
- sum_row += static_cast<uint32_t>(matrix_a[i]);
+ sum_row += static_cast<TAcc>(matrix_a[i]);
}
#if defined(__aarch64__)
// Reduction operation available on 64 bit architectures only
- sum_row += vaddvq_u32(sum_row_u32);
+ sum_row += wrapper::vaddv(vsum_row);
#else // __aarch64__
- uint32x2_t tmp = vpadd_u32(vget_high_u32(sum_row_u32), vget_low_u32(sum_row_u32));
- tmp = vpadd_u32(tmp, tmp);
+ auto tmp = wrapper::vpadd(wrapper::vgethigh(vsum_row), wrapper::vgetlow(vsum_row));
+ tmp = wrapper::vpadd(tmp, tmp);
- sum_row += vget_lane_u32(tmp, 0);
+ sum_row += wrapper::vgetlane(tmp, 0);
#endif // __aarch64__
- *(reinterpret_cast<int *>(out.ptr())) = static_cast<int>(sum_row);
+ *(reinterpret_cast<int *>(out.ptr())) = static_cast<int32_t>(sum_row);
},
in, out);
}
}
+void NEGEMMLowpMatrixAReductionKernel::run(const Window &window, const ThreadInfo &info)
+{
+ ARM_COMPUTE_UNUSED(info);
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
+
+ switch(_input->info()->data_type())
+ {
+ case DataType::QASYMM8:
+ run_internal<uint8_t>(window);
+ break;
+ case DataType::QASYMM8_SIGNED:
+ case DataType::QSYMM8_PER_CHANNEL:
+ run_internal<int8_t>(window);
+ break;
+ default:
+ ARM_COMPUTE_ERROR("Unsupported data type");
+ }
+}
+
void NEGEMMLowpMatrixBReductionKernel::configure(const ITensor *mtx_b, ITensor *vector_sum_col, int32_t num_mtx_b_rows, bool is_transposed1xW)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(mtx_b, vector_sum_col);
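run_internal<T>(), used by both reduction kernels, derives its intermediate and final accumulator types with wrapper::traits::promote_t, so uint8_t input is summed through uint16_t into uint32_t and int8_t input through int16_t into int32_t. A compile-time sketch of that promotion chain, with hypothetical trait names standing in for the wrapper traits:

#include <cstdint>
#include <type_traits>

template <typename T> struct promote;
template <> struct promote<uint8_t>  { using type = uint16_t; };
template <> struct promote<int8_t>   { using type = int16_t;  };
template <> struct promote<uint16_t> { using type = uint32_t; };
template <> struct promote<int16_t>  { using type = int32_t;  };
template <typename T> using promote_t = typename promote<T>::type;

static_assert(std::is_same<promote_t<promote_t<uint8_t>>, uint32_t>::value, "QASYMM8 accumulates in uint32_t");
static_assert(std::is_same<promote_t<promote_t<int8_t>>, int32_t>::value, "QASYMM8_SIGNED / QSYMM8_PER_CHANNEL accumulate in int32_t");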
@@ -276,11 +295,12 @@ Status NEGEMMLowpMatrixBReductionKernel::validate(const ITensorInfo *mtx_b, cons
return Status{};
}
-void NEGEMMLowpMatrixBReductionKernel::run(const Window &window, const ThreadInfo &info)
+template <typename T>
+void NEGEMMLowpMatrixBReductionKernel::run_internal(const Window &window, const ThreadInfo &info)
{
- ARM_COMPUTE_UNUSED(info);
- ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
- ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
+ // Intermediate and final accumulator types
+ using TIAcc = wrapper::traits::promote_t<T>;
+ using TAcc = wrapper::traits::promote_t<TIAcc>;
Window collapsed_window = window.collapse_if_possible(IKernel::window(), Window::DimY);
@@ -297,17 +317,15 @@ void NEGEMMLowpMatrixBReductionKernel::run(const Window &window, const ThreadInf
execute_window_loop(collapsed_window, [&](const Coordinates & id)
{
// Note: Since the input is unsigned char, we can safely use unsigned int for the accumulation
- uint32x4x4_t sum_col =
+ typename wrapper::traits::neon_bitvector<TAcc, wrapper::traits::BitWidth::W128>::type sum_col[4] =
{
- {
- vdupq_n_u32(0),
- vdupq_n_u32(0),
- vdupq_n_u32(0),
- vdupq_n_u32(0)
- }
+ wrapper::vdup_n(static_cast<TAcc>(0), wrapper::traits::vector_128_tag{}),
+ wrapper::vdup_n(static_cast<TAcc>(0), wrapper::traits::vector_128_tag{}),
+ wrapper::vdup_n(static_cast<TAcc>(0), wrapper::traits::vector_128_tag{}),
+ wrapper::vdup_n(static_cast<TAcc>(0), wrapper::traits::vector_128_tag{})
};
- const uint8_t *matrix_b = in.ptr() + (id.x() / 16) * _input->info()->strides_in_bytes()[1] + id.y() * _input->info()->strides_in_bytes()[2];
+ const auto *matrix_b = reinterpret_cast<const T *>(in.ptr() + (id.x() / 16) * _input->info()->strides_in_bytes()[1] + id.y() * _input->info()->strides_in_bytes()[2]);
#if __arm__
asm volatile("PLD [%0, #128*4]" ::"r"(matrix_b));
@@ -316,35 +334,28 @@ void NEGEMMLowpMatrixBReductionKernel::run(const Window &window, const ThreadInf
int i = 0;
for(; i < _k; ++i)
{
- const uint8x16_t b0_u8 = vld1q_u8(matrix_b + i * 16);
+ const auto b0_b8 = wrapper::vloadq(matrix_b + i * 16);
- // Convert S8 to U16
- const uint16x8x2_t b0_u16 =
+ // Convert 8bit to 16bit
+ const typename wrapper::traits::neon_bitvector<TIAcc, wrapper::traits::BitWidth::W128>::type b0_b16[2] =
{
- {
- vmovl_u8(vget_low_u8(b0_u8)),
- vmovl_u8(vget_high_u8(b0_u8))
- }
+ wrapper::vmovl(wrapper::vgetlow(b0_b8)),
+ wrapper::vmovl(wrapper::vgethigh(b0_b8))
};
// Accumulate to U32
- sum_col =
- {
- {
- vaddw_u16(sum_col.val[0], vget_low_u16(b0_u16.val[0])),
- vaddw_u16(sum_col.val[1], vget_high_u16(b0_u16.val[0])),
- vaddw_u16(sum_col.val[2], vget_low_u16(b0_u16.val[1])),
- vaddw_u16(sum_col.val[3], vget_high_u16(b0_u16.val[1]))
- }
- };
+ sum_col[0] = wrapper::vaddw(sum_col[0], wrapper::vgetlow(b0_b16[0]));
+ sum_col[1] = wrapper::vaddw(sum_col[1], wrapper::vgethigh(b0_b16[0]));
+ sum_col[2] = wrapper::vaddw(sum_col[2], wrapper::vgetlow(b0_b16[1]));
+ sum_col[3] = wrapper::vaddw(sum_col[3], wrapper::vgethigh(b0_b16[1]));
}
auto vector_sum_col = reinterpret_cast<int32_t *>(out.ptr());
- vst1q_s32(vector_sum_col + 0, vreinterpretq_s32_u32(sum_col.val[0]));
- vst1q_s32(vector_sum_col + 4, vreinterpretq_s32_u32(sum_col.val[1]));
- vst1q_s32(vector_sum_col + 8, vreinterpretq_s32_u32(sum_col.val[2]));
- vst1q_s32(vector_sum_col + 12, vreinterpretq_s32_u32(sum_col.val[3]));
+ wrapper::vstore(vector_sum_col + 0, wrapper::vreinterpret_s32(sum_col[0]));
+ wrapper::vstore(vector_sum_col + 4, wrapper::vreinterpret_s32(sum_col[1]));
+ wrapper::vstore(vector_sum_col + 8, wrapper::vreinterpret_s32(sum_col[2]));
+ wrapper::vstore(vector_sum_col + 12, wrapper::vreinterpret_s32(sum_col[3]));
},
in, out);
}
@@ -377,17 +388,15 @@ void NEGEMMLowpMatrixBReductionKernel::run(const Window &window, const ThreadInf
}
// Note: Since the input is unsigned char, we can safely use unsigned int for the accumulation
- uint32x4x4_t sum_col =
+ typename wrapper::traits::neon_bitvector<TAcc, wrapper::traits::BitWidth::W128>::type sum_col[4] =
{
- {
- vdupq_n_u32(0),
- vdupq_n_u32(0),
- vdupq_n_u32(0),
- vdupq_n_u32(0)
- }
+ wrapper::vdup_n(static_cast<TAcc>(0), wrapper::traits::vector_128_tag{}),
+ wrapper::vdup_n(static_cast<TAcc>(0), wrapper::traits::vector_128_tag{}),
+ wrapper::vdup_n(static_cast<TAcc>(0), wrapper::traits::vector_128_tag{}),
+ wrapper::vdup_n(static_cast<TAcc>(0), wrapper::traits::vector_128_tag{})
};
- const uint8_t *matrix_b = inb.ptr() + id.y() * _input->info()->strides_in_bytes()[2];
+ const auto *matrix_b = reinterpret_cast<const T *>(inb.ptr() + id.y() * _input->info()->strides_in_bytes()[2]);
#if __arm__
asm volatile("PLD [%0, #128*4]" ::"r"(matrix_b));
@@ -398,10 +407,10 @@ void NEGEMMLowpMatrixBReductionKernel::run(const Window &window, const ThreadInf
// This for loop performs 4 accumulations
for(; i <= (_k - 4); i += 4)
{
- const uint8x16_t b0_u8 = vld1q_u8(matrix_b + 0 * in_b_stride);
- const uint8x16_t b1_u8 = vld1q_u8(matrix_b + 1 * in_b_stride);
- const uint8x16_t b2_u8 = vld1q_u8(matrix_b + 2 * in_b_stride);
- const uint8x16_t b3_u8 = vld1q_u8(matrix_b + 3 * in_b_stride);
+ const auto b0_u8 = wrapper::vloadq(matrix_b + 0 * in_b_stride);
+ const auto b1_u8 = wrapper::vloadq(matrix_b + 1 * in_b_stride);
+ const auto b2_u8 = wrapper::vloadq(matrix_b + 2 * in_b_stride);
+ const auto b3_u8 = wrapper::vloadq(matrix_b + 3 * in_b_stride);
#if __arm__
asm volatile("PLD [%0, #128*1]" ::"r"(matrix_b + 1 * in_b_stride));
@@ -410,34 +419,27 @@ void NEGEMMLowpMatrixBReductionKernel::run(const Window &window, const ThreadInf
asm volatile("PLD [%0, #128*1]" ::"r"(matrix_b + 4 * in_b_stride));
#endif /* __arm__ */
- // Partial accumulation in u16
- uint16x8x2_t tmp_sum =
+ // Partial accumulation in 16bit
+ typename wrapper::traits::neon_bitvector<TIAcc, wrapper::traits::BitWidth::W128>::type tmp_sum[2] =
{
- {
- vdupq_n_u16(0),
- vdupq_n_u16(0)
- }
+ wrapper::vdup_n(static_cast<TIAcc>(0), wrapper::traits::vector_128_tag{}),
+ wrapper::vdup_n(static_cast<TIAcc>(0), wrapper::traits::vector_128_tag{})
};
- tmp_sum.val[0] = vaddw_u8(tmp_sum.val[0], vget_low_u8(b0_u8));
- tmp_sum.val[0] = vaddw_u8(tmp_sum.val[0], vget_low_u8(b1_u8));
- tmp_sum.val[0] = vaddw_u8(tmp_sum.val[0], vget_low_u8(b2_u8));
- tmp_sum.val[0] = vaddw_u8(tmp_sum.val[0], vget_low_u8(b3_u8));
- tmp_sum.val[1] = vaddw_u8(tmp_sum.val[1], vget_high_u8(b0_u8));
- tmp_sum.val[1] = vaddw_u8(tmp_sum.val[1], vget_high_u8(b1_u8));
- tmp_sum.val[1] = vaddw_u8(tmp_sum.val[1], vget_high_u8(b2_u8));
- tmp_sum.val[1] = vaddw_u8(tmp_sum.val[1], vget_high_u8(b3_u8));
-
- // Accumulate to U32
- sum_col =
- {
- {
- vaddw_u16(sum_col.val[0], vget_low_u16(tmp_sum.val[0])),
- vaddw_u16(sum_col.val[1], vget_high_u16(tmp_sum.val[0])),
- vaddw_u16(sum_col.val[2], vget_low_u16(tmp_sum.val[1])),
- vaddw_u16(sum_col.val[3], vget_high_u16(tmp_sum.val[1]))
- }
- };
+ tmp_sum[0] = wrapper::vaddw(tmp_sum[0], wrapper::vgetlow(b1_u8));
+ tmp_sum[0] = wrapper::vaddw(tmp_sum[0], wrapper::vgetlow(b0_u8));
+ tmp_sum[0] = wrapper::vaddw(tmp_sum[0], wrapper::vgetlow(b2_u8));
+ tmp_sum[0] = wrapper::vaddw(tmp_sum[0], wrapper::vgetlow(b3_u8));
+ tmp_sum[1] = wrapper::vaddw(tmp_sum[1], wrapper::vgethigh(b0_u8));
+ tmp_sum[1] = wrapper::vaddw(tmp_sum[1], wrapper::vgethigh(b1_u8));
+ tmp_sum[1] = wrapper::vaddw(tmp_sum[1], wrapper::vgethigh(b2_u8));
+ tmp_sum[1] = wrapper::vaddw(tmp_sum[1], wrapper::vgethigh(b3_u8));
+
+ // Accumulate to 32bit
+ sum_col[0] = wrapper::vaddw(sum_col[0], wrapper::vgetlow(tmp_sum[0]));
+ sum_col[1] = wrapper::vaddw(sum_col[1], wrapper::vgethigh(tmp_sum[0]));
+ sum_col[2] = wrapper::vaddw(sum_col[2], wrapper::vgetlow(tmp_sum[1]));
+ sum_col[3] = wrapper::vaddw(sum_col[3], wrapper::vgethigh(tmp_sum[1]));
matrix_b += 4 * in_b_stride;
}
@@ -445,38 +447,51 @@ void NEGEMMLowpMatrixBReductionKernel::run(const Window &window, const ThreadInf
// This for loop performs the leftover accumulations
for(; i < _k; ++i)
{
- const uint8x16_t b0_u8 = vld1q_u8(matrix_b + 0 * in_b_stride);
+ const auto b0_b8 = wrapper::vloadq(matrix_b + 0 * in_b_stride);
// Convert S8 to S16
- const uint16x8x2_t b0_u16 =
+ const typename wrapper::traits::neon_bitvector<TIAcc, wrapper::traits::BitWidth::W128>::type b0_b16[2]
{
- {
- vmovl_u8(vget_low_u8(b0_u8)),
- vmovl_u8(vget_high_u8(b0_u8))
- }
+ wrapper::vmovl(wrapper::vgetlow(b0_b8)),
+ wrapper::vmovl(wrapper::vgethigh(b0_b8))
};
- // Accumulate to U32
- sum_col =
- {
- {
- vaddw_u16(sum_col.val[0], vget_low_u16(b0_u16.val[0])),
- vaddw_u16(sum_col.val[1], vget_high_u16(b0_u16.val[0])),
- vaddw_u16(sum_col.val[2], vget_low_u16(b0_u16.val[1])),
- vaddw_u16(sum_col.val[3], vget_high_u16(b0_u16.val[1]))
- }
- };
+ // Accumulate to 32bit
+ sum_col[0] = wrapper::vaddw(sum_col[0], wrapper::vgetlow(b0_b16[0]));
+ sum_col[1] = wrapper::vaddw(sum_col[1], wrapper::vgethigh(b0_b16[0]));
+ sum_col[2] = wrapper::vaddw(sum_col[2], wrapper::vgetlow(b0_b16[1]));
+ sum_col[3] = wrapper::vaddw(sum_col[3], wrapper::vgethigh(b0_b16[1]));
matrix_b += in_b_stride;
}
auto vector_sum_col = reinterpret_cast<int32_t *>(out.ptr());
- vst1q_s32(vector_sum_col + 0, vreinterpretq_s32_u32(sum_col.val[0]));
- vst1q_s32(vector_sum_col + 4, vreinterpretq_s32_u32(sum_col.val[1]));
- vst1q_s32(vector_sum_col + 8, vreinterpretq_s32_u32(sum_col.val[2]));
- vst1q_s32(vector_sum_col + 12, vreinterpretq_s32_u32(sum_col.val[3]));
+ wrapper::vstore(vector_sum_col + 0, wrapper::vreinterpret_s32(sum_col[0]));
+ wrapper::vstore(vector_sum_col + 4, wrapper::vreinterpret_s32(sum_col[1]));
+ wrapper::vstore(vector_sum_col + 8, wrapper::vreinterpret_s32(sum_col[2]));
+ wrapper::vstore(vector_sum_col + 12, wrapper::vreinterpret_s32(sum_col[3]));
},
inb, out);
}
}
+
+void NEGEMMLowpMatrixBReductionKernel::run(const Window &window, const ThreadInfo &info)
+{
+ ARM_COMPUTE_UNUSED(info);
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
+
+ switch(_input->info()->data_type())
+ {
+ case DataType::QASYMM8:
+ run_internal<uint8_t>(window, info);
+ break;
+ case DataType::QASYMM8_SIGNED:
+ case DataType::QSYMM8_PER_CHANNEL:
+ run_internal<int8_t>(window, info);
+ break;
+ default:
+ ARM_COMPUTE_ERROR("Unsupported data type");
+ }
+}
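A side note on why the 16-bit staging used throughout these reductions is safe: in every loop above, at most four widened 8-bit lanes are folded into a 16-bit lane before the running total moves to 32 bits, so the intermediate accumulators cannot overflow. A compile-time check of that worst-case bound (not part of the patch):

#include <cstdint>
#include <limits>

static_assert(4 * 255 <= std::numeric_limits<uint16_t>::max(), "four u8 lanes fit in a u16 lane");
static_assert(4 * 127 <= std::numeric_limits<int16_t>::max(), "four positive s8 lanes fit in an s16 lane");
static_assert(4 * -128 >= std::numeric_limits<int16_t>::min(), "four negative s8 lanes fit in an s16 lane");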
diff --git a/src/core/NEON/kernels/NEGEMMTranspose1xWKernel.cpp b/src/core/NEON/kernels/NEGEMMTranspose1xWKernel.cpp
index 0ca7fd3dc8..ea3d32e628 100644
--- a/src/core/NEON/kernels/NEGEMMTranspose1xWKernel.cpp
+++ b/src/core/NEON/kernels/NEGEMMTranspose1xWKernel.cpp
@@ -55,7 +55,7 @@ TensorShape get_output_shape(const ITensorInfo *input)
Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output)
{
//Note: ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(input) is not needed here as this kernel doesn't use NEON FP16 instructions.
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::U8, DataType::S8,
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::QSYMM8_PER_CHANNEL, DataType::U8, DataType::S8,
DataType::U16, DataType::S16, DataType::U32, DataType::S32,
DataType::F16, DataType::F32);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
diff --git a/src/core/NEON/kernels/NEWeightsReshapeKernel.cpp b/src/core/NEON/kernels/NEWeightsReshapeKernel.cpp
index 624833adfb..649316442e 100644
--- a/src/core/NEON/kernels/NEWeightsReshapeKernel.cpp
+++ b/src/core/NEON/kernels/NEWeightsReshapeKernel.cpp
@@ -49,7 +49,7 @@ TensorShape get_output_shape(const ITensorInfo *input, bool has_bias)
Status validate_arguments(const ITensorInfo *input, const ITensorInfo *biases, const ITensorInfo *output)
{
//Note: ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(input) is not needed here as this kernel doesn't use NEON FP16 instructions.
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::F16, DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::QSYMM8_PER_CHANNEL, DataType::F16, DataType::F32);
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(output);
if(biases != nullptr)
diff --git a/src/core/Utils.cpp b/src/core/Utils.cpp
index fa335d757b..6d276d1322 100644
--- a/src/core/Utils.cpp
+++ b/src/core/Utils.cpp
@@ -162,6 +162,7 @@ const std::string &arm_compute::string_from_data_type(DataType dt)
{ DataType::QSYMM8_PER_CHANNEL, "QSYMM8_PER_CHANNEL" },
{ DataType::QASYMM8_PER_CHANNEL, "QASYMM8_PER_CHANNEL" },
{ DataType::QASYMM8, "QASYMM8" },
+ { DataType::QASYMM8_SIGNED, "QASYMM8_SIGNED" },
{ DataType::QSYMM16, "QSYMM16" },
{ DataType::QASYMM16, "QASYMM16" },
};
@@ -292,6 +293,7 @@ std::string arm_compute::string_from_pixel_value(const PixelValue &value, const
converted_string = ss.str();
break;
case DataType::S8:
+ case DataType::QASYMM8_SIGNED:
case DataType::QSYMM8_PER_CHANNEL:
// Needs conversion to 32 bit, otherwise interpreted as ASCII values
ss << int32_t(value.get<int8_t>());
@@ -448,6 +450,7 @@ void arm_compute::print_consecutive_elements(std::ostream &s, DataType dt, const
print_consecutive_elements_impl<uint8_t>(s, ptr, n, stream_width, element_delim);
break;
case DataType::S8:
+ case DataType::QASYMM8_SIGNED:
case DataType::QSYMM8_PER_CHANNEL:
print_consecutive_elements_impl<int8_t>(s, reinterpret_cast<const int8_t *>(ptr), n, stream_width, element_delim);
break;
@@ -485,6 +488,7 @@ int arm_compute::max_consecutive_elements_display_width(std::ostream &s, DataTyp
case DataType::QASYMM8_PER_CHANNEL:
return max_consecutive_elements_display_width_impl<uint8_t>(s, ptr, n);
case DataType::S8:
+ case DataType::QASYMM8_SIGNED:
case DataType::QSYMM8_PER_CHANNEL:
return max_consecutive_elements_display_width_impl<int8_t>(s, reinterpret_cast<const int8_t *>(ptr), n);
case DataType::U16:
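Aside on the int8_t handling added above: streaming an int8_t (the storage type behind S8, QASYMM8_SIGNED and QSYMM8_PER_CHANNEL) selects the character overload of operator<<, so without the int32_t conversion the value would print as an ASCII character instead of a number. A tiny self-contained illustration:

#include <cstdint>
#include <iostream>

int main()
{
    const int8_t q = 65;
    std::cout << q << "\n";          // prints 'A' - the char overload is selected
    std::cout << int32_t(q) << "\n"; // prints 65 - the numeric value
    return 0;
}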
diff --git a/src/core/utils/quantization/AsymmHelpers.cpp b/src/core/utils/quantization/AsymmHelpers.cpp
index cdd48972eb..386d75eca2 100644
--- a/src/core/utils/quantization/AsymmHelpers.cpp
+++ b/src/core/utils/quantization/AsymmHelpers.cpp
@@ -108,6 +108,44 @@ Status calculate_quantized_multiplier_greater_than_one(float multiplier,
return Status{};
}
+
+arm_compute::Status calculate_quantized_multipliers_less_than_one(const QuantizationInfo &iq_info,
+ const QuantizationInfo &wq_info,
+ const QuantizationInfo &oq_info,
+ GEMMLowpOutputStageInfo &stage_info)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON(iq_info.scale().empty());
+ ARM_COMPUTE_RETURN_ERROR_ON(wq_info.scale().empty());
+ ARM_COMPUTE_RETURN_ERROR_ON(oq_info.scale().empty());
+
+ const unsigned int size = wq_info.scale().size();
+
+ auto &quant_multipliers = stage_info.gemmlowp_multipliers;
+ auto &quant_shifts = stage_info.gemmlowp_shifts;
+ quant_multipliers.resize(size);
+ quant_shifts.resize(size);
+
+ const auto &w_scales = wq_info.scale();
+ const float i_scale = iq_info.scale().at(0);
+ const float o_scale = oq_info.scale().at(0);
+
+ for(unsigned int i = 0; i < size; ++i)
+ {
+ const float multiplier = i_scale * w_scales[i] / o_scale;
+ int quant_multiplier = 0;
+ int quant_shift = 0;
+ ARM_COMPUTE_RETURN_ON_ERROR(calculate_quantized_multiplier_less_than_one(multiplier, &quant_multiplier, &quant_shift));
+ quant_multipliers[i] = quant_multiplier;
+ quant_shifts[i] = quant_shift;
+ }
+
+    // Legacy fields: also populate the single multiplier/shift (element 0) for backward compatibility
+ stage_info.gemmlowp_shift = quant_shifts[0];
+ stage_info.gemmlowp_multiplier = quant_multipliers[0];
+
+ return Status{};
+}
+
std::pair<int, int> get_min_max_values_from_quantized_data_type(DataType data_type)
{
int min_quant_val = 0;
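For context on the helper added above: calculate_quantized_multipliers_less_than_one applies the existing scalar decomposition once per weight-channel scale. Each real multiplier m = i_scale * w_scale[i] / o_scale (expected in [0, 1)) is approximated as m ~= M * 2^-s, where M is a 32-bit fixed-point (Q0.31) multiplier and s a right shift; the first pair is also copied into the single-valued gemmlowp_multiplier/gemmlowp_shift fields. A standalone sketch of that decomposition (illustrative, not the library's exact implementation):

#include <cmath>
#include <cstdint>

// Decompose a multiplier in [0, 1) as quant_mult * 2^(-shift), with quant_mult in Q0.31.
bool quantize_multiplier_sketch(double multiplier, int32_t *quant_mult, int *shift)
{
    if(multiplier < 0.0 || multiplier >= 1.0)
    {
        return false;
    }
    if(multiplier == 0.0)
    {
        *quant_mult = 0;
        *shift      = 0;
        return true;
    }
    const double q = std::frexp(multiplier, shift); // multiplier = q * 2^exp, q in [0.5, 1)
    *shift          = -*shift;                      // store the exponent as a right shift
    int64_t q_fixed = static_cast<int64_t>(std::round(q * (1ll << 31)));
    if(q_fixed == (1ll << 31))                      // rounding can push q up to exactly 1.0
    {
        q_fixed /= 2;
        --*shift;
    }
    *quant_mult = static_cast<int32_t>(q_fixed);
    return true;
}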
diff --git a/src/runtime/NEON/functions/NEGEMMAssemblyDispatch.cpp b/src/runtime/NEON/functions/NEGEMMAssemblyDispatch.cpp
index b31ecb91e9..43e531579a 100644
--- a/src/runtime/NEON/functions/NEGEMMAssemblyDispatch.cpp
+++ b/src/runtime/NEON/functions/NEGEMMAssemblyDispatch.cpp
@@ -450,13 +450,24 @@ Status NEGEMMAssemblyDispatch::validate(const ITensorInfo *a, const ITensorInfo
#ifndef __aarch64__
ARM_COMPUTE_RETURN_ERROR_ON_MSG(a->data_type() == DataType::U8 || a->data_type() == DataType::S8 || a->data_type() == DataType::QASYMM8, "8bit integer types only supported for aarch64");
#endif /* __aarch64__ */
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(a, 1, DataType::F32, DataType::U8, DataType::QASYMM8, DataType::S8, DataType::F16);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(a, b);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(a, 1, DataType::U8, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::S8,
+ DataType::F16, DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(b, 1, DataType::U8, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::QSYMM8_PER_CHANNEL, DataType::S8,
+ DataType::F16, DataType::F32);
+ if(is_data_type_quantized_per_channel(b->data_type()))
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(a, 1, DataType::QASYMM8_SIGNED, DataType::S8);
+ }
+ else
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(a, b);
+ }
ARM_COMPUTE_RETURN_ERROR_ON_MSG(a->data_type() == DataType::F32 && d->data_type() != DataType::F32, "Only F32 output supported for F32 input");
ARM_COMPUTE_RETURN_ERROR_ON_MSG(a->data_type() == DataType::F16 && d->data_type() != DataType::F16, "Only F16 output supported for F16 input");
ARM_COMPUTE_RETURN_ERROR_ON_MSG(a->data_type() == DataType::U8 && d->data_type() != DataType::U32, "Only U32 output supported for U8 input");
ARM_COMPUTE_RETURN_ERROR_ON_MSG(a->data_type() == DataType::S8 && d->data_type() != DataType::S32, "Only S32 output supported for S8 input");
ARM_COMPUTE_RETURN_ERROR_ON_MSG(a->data_type() == DataType::QASYMM8 && d->data_type() != DataType::QASYMM8, "Only QASYMM8 output supported for QASYMM8 input");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(a->data_type() == DataType::QASYMM8_SIGNED && d->data_type() != DataType::S32, "Only S32 output supported for QASYMM8_SIGNED input");
return Status{};
}
@@ -495,6 +506,7 @@ void NEGEMMAssemblyDispatch::configure(const ITensor *a, const ITensor *b, const
}
break;
case DataType::S8:
+ case DataType::QASYMM8_SIGNED:
create_arm_gemm<int8_t, int32_t>(_arm_gemm, _memory_group, a, b, c, d, act, gemm_info, _weights_manager);
break;
#endif /* __aarch64__ */
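Read together, the NEGEMMAssemblyDispatch::validate() checks above pin down which output type each input type may produce on this path. A compact restatement of just those checks (QSYMM8_PER_CHANNEL weights are constrained separately to pair with S8 or QASYMM8_SIGNED inputs):

// Output type required for a given input type, mirroring the ARM_COMPUTE_RETURN_ERROR_ON_MSG checks above.
enum class DT { U8, S8, QASYMM8, QASYMM8_SIGNED, F16, F32, U32, S32 };

DT required_output_sketch(DT input)
{
    switch(input)
    {
        case DT::F32:            return DT::F32;
        case DT::F16:            return DT::F16;
        case DT::U8:             return DT::U32;
        case DT::S8:             return DT::S32;
        case DT::QASYMM8:        return DT::QASYMM8;
        case DT::QASYMM8_SIGNED: return DT::S32;
        default:                 return input;
    }
}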
diff --git a/src/runtime/NEON/functions/NEGEMMConvolutionLayer.cpp b/src/runtime/NEON/functions/NEGEMMConvolutionLayer.cpp
index f4377cdaf2..caff117e09 100644
--- a/src/runtime/NEON/functions/NEGEMMConvolutionLayer.cpp
+++ b/src/runtime/NEON/functions/NEGEMMConvolutionLayer.cpp
@@ -59,7 +59,7 @@ void NEConvolutionLayerReshapeWeights::configure(const ITensor *weights, const I
Status NEConvolutionLayerReshapeWeights::validate(const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output)
{
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(weights);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(weights, 1, DataType::QASYMM8, DataType::F16, DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(weights, 1, DataType::QASYMM8, DataType::QSYMM8_PER_CHANNEL, DataType::F16, DataType::F32);
ARM_COMPUTE_RETURN_ERROR_ON(weights->num_dimensions() > 4);
if(biases != nullptr)
@@ -114,18 +114,18 @@ void NEGEMMConvolutionLayer::configure_mm(const ITensor *input, const ITensor *w
{
// Since we need negative offsets for computing convolution, we need to change QuantizationInfo()
// Extract and negate input and weights offset
- const UniformQuantizationInfo iqinfo = input->info()->quantization_info().uniform();
- const UniformQuantizationInfo wqinfo = weights->info()->quantization_info().uniform();
-
- input->info()->set_quantization_info(QuantizationInfo(iqinfo.scale, -iqinfo.offset));
- weights->info()->set_quantization_info(QuantizationInfo(wqinfo.scale, -wqinfo.offset));
-
- const UniformQuantizationInfo oqinfo = (output->info()->total_size() == 0) ? iqinfo : output->info()->quantization_info().uniform();
-
- float multiplier = iqinfo.scale * wqinfo.scale / oqinfo.scale;
- int output_multiplier;
- int output_shift;
- quantization::calculate_quantized_multiplier_less_than_one(multiplier, &output_multiplier, &output_shift);
+ const QuantizationInfo iqinfo = input->info()->quantization_info();
+ const QuantizationInfo wqinfo = weights->info()->quantization_info();
+ const QuantizationInfo oqinfo = (output->info()->total_size() == 0) ? iqinfo : output->info()->quantization_info();
+ const UniformQuantizationInfo uiqinfo = iqinfo.uniform();
+ const UniformQuantizationInfo uoqinfo = oqinfo.uniform();
+
+ input->info()->set_quantization_info(QuantizationInfo(uiqinfo.scale, -uiqinfo.offset));
+ if(!is_data_type_quantized_per_channel(weights->info()->data_type()))
+ {
+ const UniformQuantizationInfo uwqinfo = wqinfo.uniform();
+ weights->info()->set_quantization_info(QuantizationInfo(uwqinfo.scale, -uwqinfo.offset));
+ }
// Merge activation with output stage
int min_activation = 0;
@@ -133,26 +133,25 @@ void NEGEMMConvolutionLayer::configure_mm(const ITensor *input, const ITensor *w
if(supported_acts.count(act_info.activation()) != 0)
{
- const int a_const_int = quantize_qasymm8(act_info.a(), oqinfo);
- const int b_const_int = quantize_qasymm8(act_info.b(), oqinfo);
+ const int a_const_int = quantize_qasymm8(act_info.a(), uoqinfo);
+ const int b_const_int = quantize_qasymm8(act_info.b(), uoqinfo);
- min_activation = act_info.activation() != ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU ? oqinfo.offset : b_const_int;
+ min_activation = act_info.activation() != ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU ? uoqinfo.offset : b_const_int;
max_activation = act_info.activation() == ActivationLayerInfo::ActivationFunction::RELU ? 255 : a_const_int;
}
GEMMLowpOutputStageInfo output_info;
- output_info.type = GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT;
- output_info.gemmlowp_offset = oqinfo.offset;
- output_info.gemmlowp_multiplier = output_multiplier;
- output_info.gemmlowp_shift = output_shift;
- output_info.gemmlowp_min_bound = min_activation;
- output_info.gemmlowp_max_bound = max_activation;
+ output_info.type = GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT;
+ output_info.gemmlowp_offset = uoqinfo.offset;
+ output_info.gemmlowp_min_bound = min_activation;
+ output_info.gemmlowp_max_bound = max_activation;
+ quantization::calculate_quantized_multipliers_less_than_one(iqinfo, wqinfo, oqinfo, output_info);
_mm_gemmlowp.configure(input, weights, biases, output, GEMMInfo(false, false, true, gemm_3d_depth, _skip_im2col, false, output_info));
     // Revert back QuantizationInfo as input and weights could be used in other convolution layers
- input->info()->set_quantization_info(QuantizationInfo(iqinfo.scale, iqinfo.offset));
- weights->info()->set_quantization_info(QuantizationInfo(wqinfo.scale, wqinfo.offset));
+ input->info()->set_quantization_info(iqinfo);
+ weights->info()->set_quantization_info(wqinfo);
}
else
{
@@ -176,20 +175,10 @@ Status NEGEMMConvolutionLayer::validate_mm(const ITensorInfo *input, const ITens
{
// Since we need negative offsets for computing convolution, we need to change QuantizationInfo()
// Extract and negate input and weights offset
- const UniformQuantizationInfo iqinfo = input->quantization_info().uniform();
- const UniformQuantizationInfo wqinfo = weights->quantization_info().uniform();
-
- std::unique_ptr<ITensorInfo> input_qa = input->clone();
- std::unique_ptr<ITensorInfo> weights_qa = weights->clone();
- input_qa->set_quantization_info(QuantizationInfo(iqinfo.scale, -iqinfo.offset));
- weights_qa->set_quantization_info(QuantizationInfo(wqinfo.scale, -wqinfo.offset));
-
- const UniformQuantizationInfo oqinfo = (output->total_size() == 0) ? iqinfo : output->quantization_info().uniform();
-
- float multiplier = iqinfo.scale * wqinfo.scale / oqinfo.scale;
- int output_multiplier;
- int output_shift;
- ARM_COMPUTE_RETURN_ON_ERROR(quantization::calculate_quantized_multiplier_less_than_one(multiplier, &output_multiplier, &output_shift));
+ const QuantizationInfo &iqinfo = input->quantization_info();
+ const QuantizationInfo &wqinfo = weights->quantization_info();
+ const QuantizationInfo &oqinfo = (output->total_size() == 0) ? iqinfo : output->quantization_info();
+ const UniformQuantizationInfo uoqinfo = oqinfo.uniform();
// Merge activation with output stage
int min_activation = 0;
@@ -201,22 +190,25 @@ Status NEGEMMConvolutionLayer::validate_mm(const ITensorInfo *input, const ITens
};
if(is_activation_enabled && supported_acts.count(act_info.activation()) != 0)
{
- const int a_const_int = quantize_qasymm8(act_info.a(), oqinfo);
- const int b_const_int = quantize_qasymm8(act_info.b(), oqinfo);
+ const int a_const_int = quantize_qasymm8(act_info.a(), uoqinfo);
+ const int b_const_int = quantize_qasymm8(act_info.b(), uoqinfo);
- min_activation = act_info.activation() != ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU ? oqinfo.offset : b_const_int;
+ min_activation = act_info.activation() != ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU ? uoqinfo.offset : b_const_int;
max_activation = act_info.activation() == ActivationLayerInfo::ActivationFunction::RELU ? 255 : a_const_int;
}
GEMMLowpOutputStageInfo output_info;
- output_info.type = GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT;
- output_info.gemmlowp_offset = oqinfo.offset;
- output_info.gemmlowp_multiplier = output_multiplier;
- output_info.gemmlowp_shift = output_shift;
- output_info.gemmlowp_min_bound = min_activation;
- output_info.gemmlowp_max_bound = max_activation;
+ output_info.type = GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT;
+ output_info.gemmlowp_offset = uoqinfo.offset;
+ output_info.gemmlowp_min_bound = min_activation;
+ output_info.gemmlowp_max_bound = max_activation;
+ ARM_COMPUTE_RETURN_ON_ERROR(quantization::calculate_quantized_multipliers_less_than_one(iqinfo, wqinfo, oqinfo, output_info));
// Perform validation step on GEMMLowp
+ std::unique_ptr<ITensorInfo> input_qa = input->clone();
+ std::unique_ptr<ITensorInfo> weights_qa = weights->clone();
+ input_qa->set_quantization_info(QuantizationInfo(iqinfo.uniform().scale, -iqinfo.uniform().offset));
+ weights_qa->set_quantization_info(QuantizationInfo(wqinfo.uniform().scale, -wqinfo.uniform().offset));
return NEGEMMLowpMatrixMultiplyCore::validate(input_qa.get(), weights_qa.get(), biases, output, GEMMInfo(false, false, true, gemm_3d_depth, skip_im2col, false, output_info));
}
else
@@ -396,7 +388,7 @@ Status NEGEMMConvolutionLayer::validate(const ITensorInfo *input, const ITensorI
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weights, output);
ARM_COMPUTE_RETURN_ERROR_ON_MSG(weights_info.are_reshaped(), "Weights already reshaped are not supported!");
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::F16, DataType::F32);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(weights, 1, DataType::QASYMM8, DataType::QSYMM8_PER_CHANNEL, DataType::F16, DataType::F32);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(input, weights);
ARM_COMPUTE_RETURN_ERROR_ON_MSG(num_groups > 1, "Grouping (num_groups != 1) is not supported on NEON");
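For context on the a_const_int / b_const_int values in the hunks above: fusing a (bounded) ReLU into the GEMMLowp output stage just clamps the requantized result to the activation bounds expressed in the output's quantized domain, which is why act_info.a() and act_info.b() are passed through quantize_qasymm8 with the uniform output quantization. A self-contained sketch of that conversion (the rounding policy here is an assumption of the sketch, not taken from the library):

#include <algorithm>
#include <cmath>
#include <cstdint>

// Quantize a real value to QASYMM8: q = clamp(round(x / scale) + offset, 0, 255).
uint8_t quantize_qasymm8_sketch(float x, float scale, int32_t offset)
{
    const int32_t q = static_cast<int32_t>(std::lround(x / scale)) + offset;
    return static_cast<uint8_t>(std::min(255, std::max(0, q)));
}

// E.g. for LU_BOUNDED_RELU: min_activation = quantize(act_info.b()) and
// max_activation = quantize(act_info.a()), matching the assignments above.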
diff --git a/src/runtime/NEON/functions/NEGEMMInterleave4x4.cpp b/src/runtime/NEON/functions/NEGEMMInterleave4x4.cpp
index 63f330be6c..a478fdd231 100644
--- a/src/runtime/NEON/functions/NEGEMMInterleave4x4.cpp
+++ b/src/runtime/NEON/functions/NEGEMMInterleave4x4.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -26,11 +26,12 @@
#include "arm_compute/core/NEON/kernels/NEGEMMInterleave4x4Kernel.h"
#include "support/ToolchainSupport.h"
-using namespace arm_compute;
-
+namespace arm_compute
+{
void NEGEMMInterleave4x4::configure(const ITensor *input, ITensor *output)
{
auto k = arm_compute::support::cpp14::make_unique<NEGEMMInterleave4x4Kernel>();
k->configure(input, output);
_kernel = std::move(k);
}
+} // namespace arm_compute
diff --git a/src/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.cpp b/src/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.cpp
index 617d66cf24..01a99f7aca 100644
--- a/src/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.cpp
+++ b/src/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.cpp
@@ -42,9 +42,9 @@ using namespace arm_compute::misc::shape_calculator;
NEGEMMLowpMatrixMultiplyCore::NEGEMMLowpMatrixMultiplyCore(std::shared_ptr<IMemoryManager> memory_manager)
: _memory_group(memory_manager), _asm_glue(memory_manager), _mm_kernel(nullptr), _mtx_a_reshape_kernel(nullptr), _mtx_b_reshape_kernel(nullptr), _mtx_a_reduction_kernel(), _mtx_b_reduction_kernel(),
- _offset_contribution_kernel(), _offset_contribution_output_stage_kernel(), _activation_func(), _vector_sum_col(), _vector_sum_row(), _tmp_a(), _tmp_b(), _mm_result_s32(), _original_b(nullptr),
- _a_offset(0), _b_offset(0), _run_vector_matrix_multiplication(false), _assembly_path(false), _fused_assembly_path(false), _reshape_b_only_on_first_run(false), _is_prepared(false),
- _fuse_output_stage(false), _run_activation(false)
+ _offset_contribution_kernel(), _offset_contribution_output_stage_kernel(), _activation_func(), _convert_to_signed_asymm(), _convert_from_signed_asymm(), _vector_sum_col(), _vector_sum_row(), _tmp_a(),
+ _tmp_b(), _mm_result_s32(), _signed_a(), _signed_output(), _original_b(nullptr), _a_offset(0), _b_offset(0), _run_vector_matrix_multiplication(false), _assembly_path(false),
+ _fused_assembly_path(false), _reshape_b_only_on_first_run(false), _is_prepared(false), _fuse_output_stage(false), _run_activation(false), _flip_signedness(false)
{
}
@@ -56,6 +56,7 @@ void NEGEMMLowpMatrixMultiplyCore::configure(const ITensor *a, const ITensor *b,
const ITensor *matrix_a = a;
const ITensor *matrix_b = b;
+ GEMMInfo info = gemm_info;
// Clear state
_mtx_a_reshape_kernel = nullptr;
@@ -65,13 +66,41 @@ void NEGEMMLowpMatrixMultiplyCore::configure(const ITensor *a, const ITensor *b,
_a_offset = a->info()->quantization_info().uniform().offset;
_b_offset = b->info()->quantization_info().uniform().offset;
_run_vector_matrix_multiplication = a->info()->dimension(1) < 2;
- _reshape_b_only_on_first_run = gemm_info.reshape_b_only_on_first_run();
+ _reshape_b_only_on_first_run = info.reshape_b_only_on_first_run();
_is_prepared = false;
_fused_assembly_path = false;
+ _flip_signedness = is_data_type_quantized_per_channel(b->info()->data_type()) && (a->info()->data_type() == DataType::QASYMM8) && _reshape_b_only_on_first_run;
_original_b = b;
+ const ITensor *a_to_use = a;
+
+    // Convert QASYMM8 -> QASYMM8_SIGNED and back
+ if(_flip_signedness)
+ {
+ const int32_t offset_correction = 128;
+ const DataType dt = DataType::QASYMM8_SIGNED;
+ const UniformQuantizationInfo iqinfo = a_to_use->info()->quantization_info().uniform();
+
+ _signed_a.allocator()->init(a_to_use->info()->clone()->set_data_type(dt).set_quantization_info(QuantizationInfo(iqinfo.scale, iqinfo.offset + offset_correction)));
+ _memory_group.manage(&_signed_a);
+ _convert_to_signed_asymm.configure(a_to_use, &_signed_a);
+ a_to_use = &_signed_a;
+ _a_offset = _signed_a.info()->quantization_info().uniform().offset;
+
+ const UniformQuantizationInfo oqinfo = output->info()->quantization_info().uniform();
+ _memory_group.manage(&_signed_output);
+ _signed_output.allocator()->init(output->info()->clone()->set_data_type(dt).set_quantization_info(QuantizationInfo(oqinfo.scale, oqinfo.offset - offset_correction)));
+
+ // Output stage correction
+ GEMMLowpOutputStageInfo output_stage_corr = info.gemmlowp_output_stage();
+ output_stage_corr.gemmlowp_offset = _signed_output.info()->quantization_info().uniform().offset;
+ output_stage_corr.gemmlowp_min_bound -= offset_correction;
+ output_stage_corr.gemmlowp_max_bound -= offset_correction;
+ info.set_gemmlowp_output_stage(output_stage_corr);
+ }
+
// If GEMMLowpOutputStage != NONE, fuse the offset contribution with the output stage
- if(gemm_info.gemmlowp_output_stage().type != GEMMLowpOutputStageType::NONE)
+ if(info.gemmlowp_output_stage().type != GEMMLowpOutputStageType::NONE)
{
_fuse_output_stage = true;
_memory_group.manage(&_mm_result_s32);
@@ -83,17 +112,18 @@ void NEGEMMLowpMatrixMultiplyCore::configure(const ITensor *a, const ITensor *b,
switch(a->info()->data_type())
{
case DataType::QASYMM8:
+ case DataType::QASYMM8_SIGNED:
case DataType::U8:
case DataType::S8:
{
- if(a->info()->data_type() == DataType::QASYMM8 && gemm_info.gemmlowp_output_stage().type == GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT)
+ if(a_to_use->info()->data_type() == DataType::QASYMM8 && info.gemmlowp_output_stage().type == GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT)
{
- _asm_glue.configure(a, b, c, output, gemm_info);
+ _asm_glue.configure(a_to_use, b, c, output, gemm_info);
_fused_assembly_path = _asm_glue.is_configured();
}
else
{
- _asm_glue.configure(a, b, nullptr, _fuse_output_stage ? &_mm_result_s32 : output, gemm_info);
+ _asm_glue.configure(a_to_use, b, nullptr, _fuse_output_stage ? &_mm_result_s32 : output, gemm_info);
}
_assembly_path = _asm_glue.is_configured();
break;
@@ -111,7 +141,7 @@ void NEGEMMLowpMatrixMultiplyCore::configure(const ITensor *a, const ITensor *b,
matrix_b = &_tmp_b;
// The interleaved output matrix will have the following shape: [ a_height * 4, ceil(a_width / 4.0f) ]
- TensorInfo a_info(compute_interleaved_shape(*a->info()), 1, a->info()->data_type(), a->info()->quantization_info());
+ TensorInfo a_info(compute_interleaved_shape(*a_to_use->info()), 1, a_to_use->info()->data_type(), a_to_use->info()->quantization_info());
// The transpose1xW output matrix will have the following shape: [ b_height * 16, ceil(b_width / 16.0f) ]
TensorInfo b_info(compute_transpose1xW_shape(*b->info()), 1, b->info()->data_type(), b->info()->quantization_info());
_tmp_a.allocator()->init(a_info);
@@ -125,7 +155,7 @@ void NEGEMMLowpMatrixMultiplyCore::configure(const ITensor *a, const ITensor *b,
// Configure interleave kernel
{
auto k = arm_compute::support::cpp14::make_unique<NEGEMMInterleave4x4Kernel>();
- k->configure(a, &_tmp_a);
+ k->configure(a_to_use, &_tmp_a);
_mtx_a_reshape_kernel = std::move(k);
}
@@ -151,19 +181,19 @@ void NEGEMMLowpMatrixMultiplyCore::configure(const ITensor *a, const ITensor *b,
}
// Configure Matrix B reduction kernel
- _mtx_b_reduction_kernel.configure(b, &_vector_sum_col, a->info()->dimension(0), false);
+ _mtx_b_reduction_kernel.configure(b, &_vector_sum_col, a_to_use->info()->dimension(0), false);
}
// Initialize Matrix A reduction kernel only if _b_offset is not equal to 0
if(_b_offset != 0)
{
- TensorInfo info_vector_sum_row(compute_reductionB_shape(*a->info()), 1, DataType::S32);
+ TensorInfo info_vector_sum_row(compute_reductionB_shape(*a_to_use->info()), 1, DataType::S32);
_vector_sum_row.allocator()->init(info_vector_sum_row);
_memory_group.manage(&_vector_sum_row);
// Configure matrix A reduction kernel
- _mtx_a_reduction_kernel.configure(a, &_vector_sum_row, a->info()->dimension(0), false);
+ _mtx_a_reduction_kernel.configure(a_to_use, &_vector_sum_row, a_to_use->info()->dimension(0), false);
}
if(_fuse_output_stage)
@@ -176,8 +206,17 @@ void NEGEMMLowpMatrixMultiplyCore::configure(const ITensor *a, const ITensor *b,
_mm_kernel = std::move(k);
}
- _offset_contribution_output_stage_kernel.configure(&_mm_result_s32, _a_offset == 0 ? nullptr : &_vector_sum_col, _b_offset == 0 ? nullptr : &_vector_sum_row, c, output, a->info()->dimension(0),
- _a_offset, _b_offset, gemm_info.gemmlowp_output_stage());
+ _offset_contribution_output_stage_kernel.configure(&_mm_result_s32,
+ _a_offset == 0 ? nullptr : &_vector_sum_col,
+ _b_offset == 0 ? nullptr : &_vector_sum_row, c,
+ _flip_signedness ? &_signed_output : output,
+ a->info()->dimension(0),
+ _a_offset, _b_offset, info.gemmlowp_output_stage());
+
+ if(_flip_signedness)
+ {
+ _convert_from_signed_asymm.configure(&_signed_output, output);
+ }
}
else
{
@@ -189,7 +228,7 @@ void NEGEMMLowpMatrixMultiplyCore::configure(const ITensor *a, const ITensor *b,
_mm_kernel = std::move(k);
}
// Configure offset contribution kernel
- _offset_contribution_kernel.configure(output, _a_offset == 0 ? nullptr : &_vector_sum_col, _b_offset == 0 ? nullptr : &_vector_sum_row, a->info()->dimension(0), _a_offset, _b_offset);
+ _offset_contribution_kernel.configure(output, _a_offset == 0 ? nullptr : &_vector_sum_col, _b_offset == 0 ? nullptr : &_vector_sum_row, a_to_use->info()->dimension(0), _a_offset, _b_offset);
}
}
@@ -228,22 +267,31 @@ void NEGEMMLowpMatrixMultiplyCore::configure(const ITensor *a, const ITensor *b,
{
_mm_result_s32.allocator()->allocate();
}
+
+ if(_flip_signedness)
+ {
+ _signed_a.allocator()->allocate();
+ _signed_output.allocator()->allocate();
+ }
}
Status NEGEMMLowpMatrixMultiplyCore::validate(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, const ITensorInfo *output, const GEMMInfo &gemm_info)
{
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(a, 1, DataType::QASYMM8);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(b, 1, DataType::QASYMM8, DataType::QSYMM8_PER_CHANNEL);
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::S32, DataType::QASYMM8);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(a, b);
ARM_COMPUTE_RETURN_ERROR_ON_MSG(c != nullptr && gemm_info.gemmlowp_output_stage().type == GEMMLowpOutputStageType::NONE, "Bias addition not supported in NEGEMMLowpMatrixMultiplyCore for output S32");
ARM_COMPUTE_RETURN_ERROR_ON_MSG((a)->dimension(0) != (b)->dimension(1),
"The product AB is defined only if the number of columns in A is equal to the number of rows in B");
ARM_COMPUTE_RETURN_ERROR_ON_MSG(gemm_info.is_a_reshaped(), "Matrix A already reshaped is not supported");
ARM_COMPUTE_RETURN_ERROR_ON_MSG(gemm_info.is_b_reshaped(), "Matrix B already reshaped is not supported");
+ GEMMInfo info = gemm_info;
const ITensorInfo *matrix_a_info = a;
const ITensorInfo *matrix_b_info = b;
+ const ITensorInfo *a_to_use = a;
+
TensorInfo tmp_a_info{};
TensorInfo tmp_b_info{};
TensorInfo mm_result_s32_info{};
@@ -251,31 +299,57 @@ Status NEGEMMLowpMatrixMultiplyCore::validate(const ITensorInfo *a, const ITenso
int32_t a_offset = a->quantization_info().uniform().offset;
int32_t b_offset = b->quantization_info().uniform().offset;
- bool fuse_output_stage = gemm_info.gemmlowp_output_stage().type != GEMMLowpOutputStageType::NONE;
+ bool fuse_output_stage = info.gemmlowp_output_stage().type != GEMMLowpOutputStageType::NONE;
if(fuse_output_stage)
{
auto_init_if_empty(mm_result_s32_info, a->clone()->set_tensor_shape(output->tensor_shape()).set_data_type(DataType::S32));
}
+ // Convert QASYMM8->QASYMM8_SIGNED
+ TensorInfo signed_a{};
+ TensorInfo signed_output{};
+ bool flip_signedness = is_data_type_quantized_per_channel(b->data_type()) && (a->data_type() == DataType::QASYMM8) && info.reshape_b_only_on_first_run();
+ if(flip_signedness)
+ {
+ const int32_t offset_correction = 128;
+ const DataType dt = DataType::QASYMM8_SIGNED;
+ const UniformQuantizationInfo iqinfo = a_to_use->quantization_info().uniform();
+
+ signed_a = a_to_use->clone()->set_data_type(dt).set_quantization_info(QuantizationInfo(iqinfo.scale, iqinfo.offset + offset_correction));
+ ARM_COMPUTE_RETURN_ON_ERROR(NEConvertQuantizedSignednessKernel::validate(a_to_use, &signed_a));
+ a_to_use = &signed_a;
+ a_offset = signed_a.quantization_info().uniform().offset;
+
+ const UniformQuantizationInfo oqinfo = output->quantization_info().uniform();
+ signed_output = output->clone()->set_data_type(dt).set_quantization_info(QuantizationInfo(oqinfo.scale, oqinfo.offset - offset_correction));
+
+ // Output stage correction
+ GEMMLowpOutputStageInfo output_stage_corr = info.gemmlowp_output_stage();
+ output_stage_corr.gemmlowp_offset = signed_output.quantization_info().uniform().offset;
+ output_stage_corr.gemmlowp_min_bound -= offset_correction;
+ output_stage_corr.gemmlowp_max_bound -= offset_correction;
+ info.set_gemmlowp_output_stage(output_stage_corr);
+ }
+
// Check if we need to run the optimized assembly kernel
bool run_optimised = false;
bool run_optimised_requantized = false;
- if(is_data_type_quantized_asymmetric(a->data_type()))
+ if(a_to_use->data_type() == DataType::QASYMM8 && info.gemmlowp_output_stage().type == GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT)
{
- run_optimised = bool(NEGEMMAssemblyDispatch::validate(a, b, c, output, gemm_info));
+ run_optimised = bool(NEGEMMAssemblyDispatch::validate(a_to_use, b, c, output, gemm_info));
run_optimised_requantized = run_optimised;
}
else
{
- run_optimised = bool(NEGEMMAssemblyDispatch::validate(a, b, nullptr, fuse_output_stage ? &mm_result_s32_info : output, gemm_info));
+ run_optimised = bool(NEGEMMAssemblyDispatch::validate(a_to_use, b, nullptr, fuse_output_stage ? &mm_result_s32_info : output, gemm_info));
}
if(run_optimised)
{
ARM_COMPUTE_RETURN_ERROR_ON(b->dimension(0) != output->dimension(0));
- if(gemm_info.depth_output_gemm3d() != 0)
+ if(info.depth_output_gemm3d() != 0)
{
- if(gemm_info.reinterpret_input_as_3d())
+ if(info.reinterpret_input_as_3d())
{
ARM_COMPUTE_RETURN_ERROR_ON(a->dimension(1) != output->dimension(1));
ARM_COMPUTE_RETURN_ERROR_ON(a->dimension(2) != output->dimension(2));
@@ -292,8 +366,8 @@ Status NEGEMMLowpMatrixMultiplyCore::validate(const ITensorInfo *a, const ITenso
}
else
{
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(gemm_info.reinterpret_input_as_3d(), "NEGEMM cannot reinterpret the input tensor as 3D");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(gemm_info.depth_output_gemm3d() != 0, "NEGEMM cannot reinterpret the output tensor as 3D");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(info.reinterpret_input_as_3d(), "NEGEMM cannot reinterpret the input tensor as 3D");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(info.depth_output_gemm3d() != 0, "NEGEMM cannot reinterpret the output tensor as 3D");
const bool run_vector_matrix_multiplication = a->dimension(1) < 2;
if(!run_vector_matrix_multiplication)
@@ -312,10 +386,10 @@ Status NEGEMMLowpMatrixMultiplyCore::validate(const ITensorInfo *a, const ITenso
shape_tmp_b.set(1, std::ceil(b->dimension(0) / 16.f));
// Validate interleave kernel
- auto_init_if_empty(tmp_a_info, a->clone()->set_tensor_shape(shape_tmp_a));
+ auto_init_if_empty(tmp_a_info, a_to_use->clone()->set_tensor_shape(shape_tmp_a));
auto_init_if_empty(tmp_b_info, b->clone()->set_tensor_shape(shape_tmp_b));
- ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMInterleave4x4Kernel::validate(a, &tmp_a_info));
+ ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMInterleave4x4Kernel::validate(a_to_use, &tmp_a_info));
ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMTranspose1xWKernel::validate(b, &tmp_b_info));
}
}
@@ -340,7 +414,7 @@ Status NEGEMMLowpMatrixMultiplyCore::validate(const ITensorInfo *a, const ITenso
info_vector_sum_row = TensorInfo(compute_reductionB_shape(*a), 1, DataType::S32);
// Configure matrix A reduction kernel
- ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMLowpMatrixAReductionKernel::validate(a, &info_vector_sum_row, a->dimension(0), false));
+ ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMLowpMatrixAReductionKernel::validate(a_to_use, &info_vector_sum_row, a->dimension(0), false));
}
if(fuse_output_stage)
@@ -354,8 +428,10 @@ Status NEGEMMLowpMatrixMultiplyCore::validate(const ITensorInfo *a, const ITenso
ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMLowpOffsetContributionOutputStageKernel::validate(&mm_result_s32_info,
a_offset == 0 ? nullptr : &info_vector_sum_col,
b_offset == 0 ? nullptr : &info_vector_sum_row,
- c, output, a_offset, b_offset,
- gemm_info.gemmlowp_output_stage()));
+ c,
+ flip_signedness ? &signed_output : output,
+ a_offset, b_offset,
+ info.gemmlowp_output_stage()));
}
else
{
@@ -397,6 +473,12 @@ void NEGEMMLowpMatrixMultiplyCore::run()
NEScheduler::get().schedule(_mtx_b_reshape_kernel.get(), Window::DimY);
}
+ // Convert QASYMM8->QASYMM8_SIGNED
+ if(_flip_signedness)
+ {
+ NEScheduler::get().schedule(&_convert_to_signed_asymm, Window::DimY);
+ }
+
// Run GEMM
if(_asm_glue.is_configured())
{
@@ -433,6 +515,12 @@ void NEGEMMLowpMatrixMultiplyCore::run()
}
}
+ // Convert QASYMM8_SIGNED->QASYMM8
+ if(_flip_signedness)
+ {
+ NEScheduler::get().schedule(&_convert_from_signed_asymm, Window::DimY);
+ }
+
// Run fused activation
if(_run_activation)
{
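A closing note on the _flip_signedness path threaded through configure(), validate() and run() above: the assembly dispatch only accepts S8 or QASYMM8_SIGNED inputs when the weights are QSYMM8_PER_CHANNEL (see the validate() change earlier in this diff), so a QASYMM8 input is re-expressed as QASYMM8_SIGNED before the GEMM and converted back after the output stage, with the quantization offset and the output-stage bounds corrected by the same offset_correction of 128. The value remapping itself is a shift by 128; a minimal sketch (illustrative, not the NEON implementation of NEConvertQuantizedSignednessKernel):

#include <cstdint>

// Subtracting 128 maps [0, 255] onto [-128, 127] while preserving ordering; the represented
// real value stays the same because the quantization offset is moved by the matching 128.
int8_t u8_to_s8_sketch(uint8_t v)
{
    return static_cast<int8_t>(static_cast<int32_t>(v) - 128);
}

uint8_t s8_to_u8_sketch(int8_t v)
{
    return static_cast<uint8_t>(static_cast<int32_t>(v) + 128);
}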