aboutsummaryrefslogtreecommitdiff
path: root/src/core/NEON/kernels/NEDirectConvolutionLayerOutputStageKernel.cpp
diff options
context:
space:
mode:
authorGeorgios Pinitas <georgios.pinitas@arm.com>2018-01-12 16:29:45 +0000
committerAnthony Barbier <anthony.barbier@arm.com>2018-11-02 16:43:42 +0000
commitf72f9367d1eddee91f15a64952b99ee6b80b821d (patch)
tree0d3296219ca7919c263b3701ab22b5468df86354 /src/core/NEON/kernels/NEDirectConvolutionLayerOutputStageKernel.cpp
parenta026e981c607272181292b044c91f73a27d2bcd9 (diff)
downloadComputeLibrary-f72f9367d1eddee91f15a64952b99ee6b80b821d.tar.gz
COMPMID-791: Adds support of QASYMM8 in NEDepthwiseConvolution3x3
Change-Id: I1a9ed6c3420ddf8978aeaad48d9915333b006b49 Reviewed-on: https://eu-gerrit-1.euhpc.arm.com/116374 Tested-by: Jenkins <bsgcomp@arm.com> Reviewed-by: Anthony Barbier <anthony.barbier@arm.com>
Diffstat (limited to 'src/core/NEON/kernels/NEDirectConvolutionLayerOutputStageKernel.cpp')
-rw-r--r--src/core/NEON/kernels/NEDirectConvolutionLayerOutputStageKernel.cpp144
1 files changed, 121 insertions, 23 deletions
diff --git a/src/core/NEON/kernels/NEDirectConvolutionLayerOutputStageKernel.cpp b/src/core/NEON/kernels/NEDirectConvolutionLayerOutputStageKernel.cpp
index 40abdb1672..52880a378f 100644
--- a/src/core/NEON/kernels/NEDirectConvolutionLayerOutputStageKernel.cpp
+++ b/src/core/NEON/kernels/NEDirectConvolutionLayerOutputStageKernel.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -27,6 +27,7 @@
#include "arm_compute/core/Error.h"
#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/NEON/NEAsymm.h"
#include "arm_compute/core/NEON/NEFixedPoint.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/core/Validate.h"
@@ -43,24 +44,26 @@ namespace
Status validate_arguments(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output)
{
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::QS16, DataType::F16, DataType::QS32, DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::QASYMM8,
+ DataType::QS16, DataType::F16,
+ DataType::QS32, DataType::S32, DataType::F32);
if(bias != nullptr)
{
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(bias, 1, DataType::QS8, DataType::QS16, DataType::F16, DataType::QS32, DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(bias, 1, DataType::QS8, DataType::QS16, DataType::F16, DataType::QS32, DataType::S32, DataType::F32);
- if(is_data_type_quantized(input->data_type()))
+ if(is_data_type_fixed_point(input->data_type()))
{
ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->data_type() == DataType::QS8 && bias->data_type() != DataType::QS8, "Wrong data type for bias");
ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->data_type() == DataType::QS16 && bias->data_type() != DataType::QS8, "Wrong data type for bias");
ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->data_type() == DataType::QS32 && bias->data_type() != DataType::QS16, "Wrong data type for bias");
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT_POSITION(input, bias);
}
else
{
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, bias);
}
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT_POSITION(input, bias);
ARM_COMPUTE_RETURN_ERROR_ON(bias->num_dimensions() > 1);
}
else
@@ -71,18 +74,22 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *bias, con
// Checks performed when output is configured
if((output != nullptr) && (output->total_size() != 0))
{
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::QS8, DataType::QS16, DataType::F32);
- if(is_data_type_quantized(input->data_type()))
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::QS8, DataType::QASYMM8, DataType::QS16, DataType::F32);
+ if(is_data_type_fixed_point(input->data_type()))
{
ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->data_type() == DataType::QS8 && output->data_type() != DataType::QS8, "Wrong data type for output");
ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->data_type() == DataType::QS16 && output->data_type() != DataType::QS8, "Wrong data type for output");
ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->data_type() == DataType::QS32 && output->data_type() != DataType::QS16, "Wrong data type for output");
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT_POSITION(input, output);
+ }
+ else if(is_data_type_quantized_asymmetric(output->data_type()))
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->data_type() == DataType::S32 && output->data_type() != DataType::QASYMM8, "Wrong data type for bias");
}
else
{
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
}
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT_POSITION(input, output);
}
return Status{};
@@ -90,8 +97,14 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *bias, con
std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *bias, ITensorInfo *output)
{
- bool window_changed = false;
- const unsigned int num_elems_processed_per_iteration = 16 / element_size_from_data_type(input->data_type());
+ bool window_changed = false;
+ unsigned int num_elems_processed_per_iteration = 16 / element_size_from_data_type(input->data_type());
+
+ // Update processed elements when input is S32 (comes from quantization input)
+ if(input->data_type() == DataType::S32)
+ {
+ num_elems_processed_per_iteration = 16;
+ }
// Configure kernel window
Window win = calculate_max_window(*input, Steps(num_elems_processed_per_iteration));
@@ -145,7 +158,6 @@ inline qint16x8_t internal_vld1q(const qint16_t *in)
{
return vld1q_qs16(in);
}
-
inline qint32x4_t internal_vld1q(const qint32_t *in)
{
return vld1q_s32(in);
@@ -168,7 +180,6 @@ inline void internal_vst1q(qint16_t *p, const qint16x8_t &v)
{
vst1q_qs16(p, v);
}
-
inline void internal_vst1q(qint32_t *p, const qint32x4_t &v)
{
vst1q_s32(p, v);
@@ -192,7 +203,6 @@ inline qint16x8_t internal_vdupq_n(qint16_t v)
{
return vdupq_n_qs16(v);
}
-
inline qint32x4_t internal_vdupq_n(qint32_t v)
{
return vdupq_n_qs32(v);
@@ -236,8 +246,13 @@ inline float16x8_t internal_vqaddq(const float16x8_t &x, const float16x8_t &y)
#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
template <typename T1, typename T2, bool in_place, bool has_bias>
-void output_stage(ITensor *input, const ITensor *bias, const Window window, ITensor *output)
+void output_stage(ITensor *input, const ITensor *bias, const Window &window, ITensor *output,
+ int result_fixedpoint_multiplier, int result_shift, int result_offset_after_shift)
{
+ ARM_COMPUTE_UNUSED(result_fixedpoint_multiplier);
+ ARM_COMPUTE_UNUSED(result_shift);
+ ARM_COMPUTE_UNUSED(result_offset_after_shift);
+
Iterator in(input, window);
if(in_place) // In place accumulate
@@ -283,31 +298,112 @@ void output_stage(ITensor *input, const ITensor *bias, const Window window, ITen
in, out);
}
}
+
+// QASYMM8 specializations
+template <>
+void output_stage<int32_t, uint8_t, false, true>(ITensor *input, const ITensor *bias, const Window &window, ITensor *output,
+ int result_fixedpoint_multiplier, int result_shift, int result_offset_after_shift)
+{
+ const int32x4_t result_offset_after_shift_s32 = vdupq_n_s32(result_offset_after_shift);
+ uint8x16_t min = vdupq_n_u8(0);
+ uint8x16_t max = vdupq_n_u8(255);
+
+ Iterator in(input, window);
+ Iterator out(output, window);
+
+ execute_window_loop(window, [&](const Coordinates & id)
+ {
+ // Get bias and pointer to input
+ const auto in_ptr = reinterpret_cast<int32_t *>(in.ptr());
+ int32x4x4_t v_in =
+ {
+ {
+ vld1q_s32(in_ptr),
+ vld1q_s32(in_ptr + 4),
+ vld1q_s32(in_ptr + 8),
+ vld1q_s32(in_ptr + 12)
+ }
+ };
+
+ // Accumulate bias
+ const auto vb = vdupq_n_s32(*reinterpret_cast<const int32_t *>(bias->ptr_to_element(Coordinates(id.z()))));
+ v_in =
+ {
+ {
+ vaddq_s32(v_in.val[0], vb),
+ vaddq_s32(v_in.val[1], vb),
+ vaddq_s32(v_in.val[2], vb),
+ vaddq_s32(v_in.val[3], vb)
+ }
+ };
+
+ const auto out_ptr = reinterpret_cast<uint8_t *>(out.ptr());
+ vst1q_u8(out_ptr, finalize_quantization<false>(v_in, result_fixedpoint_multiplier, result_shift, result_offset_after_shift_s32, min, max));
+ },
+ in, out);
+}
+template <>
+void output_stage<int32_t, uint8_t, false, false>(ITensor *input, const ITensor *bias, const Window &window, ITensor *output,
+ int result_fixedpoint_multiplier, int result_shift, int result_offset_after_shift)
+{
+ ARM_COMPUTE_UNUSED(bias);
+
+ const int32x4_t result_offset_after_shift_s32 = vdupq_n_s32(result_offset_after_shift);
+ uint8x16_t min = vdupq_n_u8(0);
+ uint8x16_t max = vdupq_n_u8(255);
+
+ Iterator in(input, window);
+ Iterator out(output, window);
+ execute_window_loop(window, [&](const Coordinates & id)
+ {
+ // Get bias and pointer to input
+ const auto in_ptr = reinterpret_cast<int32_t *>(in.ptr());
+ int32x4x4_t v_in =
+ {
+ {
+ vld1q_s32(in_ptr),
+ vld1q_s32(in_ptr + 4),
+ vld1q_s32(in_ptr + 8),
+ vld1q_s32(in_ptr + 12)
+ }
+ };
+
+ const auto out_ptr = reinterpret_cast<uint8_t *>(out.ptr());
+ vst1q_u8(out_ptr, finalize_quantization<false>(v_in, result_fixedpoint_multiplier, result_shift, result_offset_after_shift_s32, min, max));
+ },
+ in, out);
+}
} // namespace
NEDirectConvolutionLayerOutputStageKernel::NEDirectConvolutionLayerOutputStageKernel()
- : _func(nullptr), _input(nullptr), _bias(nullptr), _output(nullptr)
+ : _func(nullptr), _input(nullptr), _bias(nullptr), _output(nullptr), _result_fixedpoint_multiplier(0), _result_shift(0), _result_offset_after_shift(0)
{
}
-void NEDirectConvolutionLayerOutputStageKernel::configure(ITensor *input, const ITensor *bias, ITensor *output)
+void NEDirectConvolutionLayerOutputStageKernel::configure(ITensor *input, const ITensor *bias, ITensor *output,
+ int result_fixedpoint_multiplier, int result_shift, int result_offset_after_shift)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input);
// Auto-initialize output output if required
if(output != nullptr)
{
+ // Work out expected output data type
+ const DataType output_dt = (input->info()->data_type() == DataType::S32) ? DataType::QASYMM8 : input->info()->data_type();
// Output tensor auto initialization if not yet initialized
- auto_init_if_empty(*output->info(), *input->info());
+ auto_init_if_empty(*output->info(), input->info()->clone()->set_data_type(output_dt));
}
// Perform validation step
ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), (bias == nullptr) ? nullptr : bias->info(), (output == nullptr) ? nullptr : output->info()));
- _func = nullptr;
- _bias = bias;
- _input = input;
- _output = output;
+ _func = nullptr;
+ _bias = bias;
+ _input = input;
+ _output = output;
+ _result_fixedpoint_multiplier = result_fixedpoint_multiplier;
+ _result_shift = result_shift;
+ _result_offset_after_shift = result_offset_after_shift;
// Configure kernel window
auto win_config = validate_and_configure_window(input->info(), (bias == nullptr) ? nullptr : bias->info(), (output == nullptr) ? nullptr : output->info());
@@ -350,6 +446,9 @@ void NEDirectConvolutionLayerOutputStageKernel::configure(ITensor *input, const
_func = (output == nullptr) ? &output_stage<qint32_t, qint16_t, true, true> : &output_stage<qint32_t, qint16_t, false, true>;
break;
}
+ case DataType::S32:
+ _func = (bias == nullptr) ? &output_stage<int32_t, uint8_t, false, false> : &output_stage<int32_t, uint8_t, false, true>;
+ break;
#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
case DataType::F16:
{
@@ -365,7 +464,6 @@ void NEDirectConvolutionLayerOutputStageKernel::configure(ITensor *input, const
default:
{
ARM_COMPUTE_ERROR("Unsupported combination of types among the inputs.");
- break;
}
}
}
@@ -385,5 +483,5 @@ void NEDirectConvolutionLayerOutputStageKernel::run(const Window &window, const
ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
ARM_COMPUTE_ERROR_ON(_func == nullptr);
- (*_func)(_input, _bias, window, _output);
+ (*_func)(_input, _bias, window, _output, _result_fixedpoint_multiplier, _result_shift, _result_offset_after_shift);
}