Diffstat (limited to 'src/core/NEON/kernels/NEDirectConvolutionLayerOutputStageKernel.cpp')
-rw-r--r-- | src/core/NEON/kernels/NEDirectConvolutionLayerOutputStageKernel.cpp | 144
1 file changed, 121 insertions, 23 deletions
diff --git a/src/core/NEON/kernels/NEDirectConvolutionLayerOutputStageKernel.cpp b/src/core/NEON/kernels/NEDirectConvolutionLayerOutputStageKernel.cpp
index 40abdb1672..52880a378f 100644
--- a/src/core/NEON/kernels/NEDirectConvolutionLayerOutputStageKernel.cpp
+++ b/src/core/NEON/kernels/NEDirectConvolutionLayerOutputStageKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -27,6 +27,7 @@
 #include "arm_compute/core/Error.h"
 #include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/NEON/NEAsymm.h"
 #include "arm_compute/core/NEON/NEFixedPoint.h"
 #include "arm_compute/core/Types.h"
 #include "arm_compute/core/Validate.h"
@@ -43,24 +44,26 @@ namespace
 Status validate_arguments(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output)
 {
     ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input);
-    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::QS16, DataType::F16, DataType::QS32, DataType::F32);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::QASYMM8,
+                                                         DataType::QS16, DataType::F16,
+                                                         DataType::QS32, DataType::S32, DataType::F32);
 
     if(bias != nullptr)
     {
-        ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(bias, 1, DataType::QS8, DataType::QS16, DataType::F16, DataType::QS32, DataType::F32);
+        ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(bias, 1, DataType::QS8, DataType::QS16, DataType::F16, DataType::QS32, DataType::S32, DataType::F32);
 
-        if(is_data_type_quantized(input->data_type()))
+        if(is_data_type_fixed_point(input->data_type()))
         {
             ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->data_type() == DataType::QS8 && bias->data_type() != DataType::QS8, "Wrong data type for bias");
             ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->data_type() == DataType::QS16 && bias->data_type() != DataType::QS8, "Wrong data type for bias");
             ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->data_type() == DataType::QS32 && bias->data_type() != DataType::QS16, "Wrong data type for bias");
+            ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT_POSITION(input, bias);
         }
         else
         {
             ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, bias);
         }
 
-        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT_POSITION(input, bias);
         ARM_COMPUTE_RETURN_ERROR_ON(bias->num_dimensions() > 1);
     }
     else
@@ -71,18 +74,22 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *bias, con
     // Checks performed when output is configured
     if((output != nullptr) && (output->total_size() != 0))
     {
-        ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::QS8, DataType::QS16, DataType::F32);
-        if(is_data_type_quantized(input->data_type()))
+        ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::QS8, DataType::QASYMM8, DataType::QS16, DataType::F32);
+        if(is_data_type_fixed_point(input->data_type()))
         {
             ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->data_type() == DataType::QS8 && output->data_type() != DataType::QS8, "Wrong data type for output");
             ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->data_type() == DataType::QS16 && output->data_type() != DataType::QS8, "Wrong data type for output");
             ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->data_type() == DataType::QS32 && output->data_type() != DataType::QS16, "Wrong data type for output");
+            ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT_POSITION(input, output);
+        }
+        else if(is_data_type_quantized_asymmetric(output->data_type()))
+        {
+            ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->data_type() == DataType::S32 && output->data_type() != DataType::QASYMM8, "Wrong data type for bias");
         }
         else
         {
             ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
         }
 
-        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT_POSITION(input, output);
     }
     return Status{};
@@ -90,8 +97,14 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *bias, con
 
 std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *bias, ITensorInfo *output)
 {
-    bool               window_changed                    = false;
-    const unsigned int num_elems_processed_per_iteration = 16 / element_size_from_data_type(input->data_type());
+    bool         window_changed                    = false;
+    unsigned int num_elems_processed_per_iteration = 16 / element_size_from_data_type(input->data_type());
+
+    // Update processed elements when input is S32 (comes from quantization input)
+    if(input->data_type() == DataType::S32)
+    {
+        num_elems_processed_per_iteration = 16;
+    }
 
     // Configure kernel window
     Window win = calculate_max_window(*input, Steps(num_elems_processed_per_iteration));
@@ -145,7 +158,6 @@ inline qint16x8_t internal_vld1q(const qint16_t *in)
 {
     return vld1q_qs16(in);
 }
-
 inline qint32x4_t internal_vld1q(const qint32_t *in)
 {
     return vld1q_s32(in);
@@ -168,7 +180,6 @@ inline void internal_vst1q(qint16_t *p, const qint16x8_t &v)
 {
     vst1q_qs16(p, v);
 }
-
 inline void internal_vst1q(qint32_t *p, const qint32x4_t &v)
 {
     vst1q_s32(p, v);
@@ -192,7 +203,6 @@ inline qint16x8_t internal_vdupq_n(qint16_t v)
 {
     return vdupq_n_qs16(v);
 }
-
 inline qint32x4_t internal_vdupq_n(qint32_t v)
 {
     return vdupq_n_qs32(v);
@@ -236,8 +246,13 @@ inline float16x8_t internal_vqaddq(const float16x8_t &x, const float16x8_t &y)
 #endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
 
 template <typename T1, typename T2, bool in_place, bool has_bias>
-void output_stage(ITensor *input, const ITensor *bias, const Window window, ITensor *output)
+void output_stage(ITensor *input, const ITensor *bias, const Window &window, ITensor *output,
+                  int result_fixedpoint_multiplier, int result_shift, int result_offset_after_shift)
 {
+    ARM_COMPUTE_UNUSED(result_fixedpoint_multiplier);
+    ARM_COMPUTE_UNUSED(result_shift);
+    ARM_COMPUTE_UNUSED(result_offset_after_shift);
+
     Iterator in(input, window);
 
     if(in_place) // In place accumulate
@@ -283,31 +298,112 @@ void output_stage(ITensor *input, const ITensor *bias, const Window window, ITen
         in, out);
     }
 }
+
+// QASYMM8 specializations
+template <>
+void output_stage<int32_t, uint8_t, false, true>(ITensor *input, const ITensor *bias, const Window &window, ITensor *output,
+                                                 int result_fixedpoint_multiplier, int result_shift, int result_offset_after_shift)
+{
+    const int32x4_t result_offset_after_shift_s32 = vdupq_n_s32(result_offset_after_shift);
+    uint8x16_t      min                           = vdupq_n_u8(0);
+    uint8x16_t      max                           = vdupq_n_u8(255);
+
+    Iterator in(input, window);
+    Iterator out(output, window);
+
+    execute_window_loop(window, [&](const Coordinates & id)
+    {
+        // Get bias and pointer to input
+        const auto in_ptr = reinterpret_cast<int32_t *>(in.ptr());
+        int32x4x4_t v_in =
+        {
+            {
+                vld1q_s32(in_ptr),
+                vld1q_s32(in_ptr + 4),
+                vld1q_s32(in_ptr + 8),
+                vld1q_s32(in_ptr + 12)
+            }
+        };
+
+        // Accumulate bias
+        const auto vb = vdupq_n_s32(*reinterpret_cast<const int32_t *>(bias->ptr_to_element(Coordinates(id.z()))));
+        v_in =
+        {
+            {
+                vaddq_s32(v_in.val[0], vb),
+                vaddq_s32(v_in.val[1], vb),
+                vaddq_s32(v_in.val[2], vb),
+                vaddq_s32(v_in.val[3], vb)
+            }
+        };
+
+        const auto out_ptr = reinterpret_cast<uint8_t *>(out.ptr());
+        vst1q_u8(out_ptr, finalize_quantization<false>(v_in, result_fixedpoint_multiplier, result_shift, result_offset_after_shift_s32, min, max));
+    },
+    in, out);
+}
+template <>
+void output_stage<int32_t, uint8_t, false, false>(ITensor *input, const ITensor *bias, const Window &window, ITensor *output,
+                                                  int result_fixedpoint_multiplier, int result_shift, int result_offset_after_shift)
+{
+    ARM_COMPUTE_UNUSED(bias);
+
+    const int32x4_t result_offset_after_shift_s32 = vdupq_n_s32(result_offset_after_shift);
+    uint8x16_t      min                           = vdupq_n_u8(0);
+    uint8x16_t      max                           = vdupq_n_u8(255);
+
+    Iterator in(input, window);
+    Iterator out(output, window);
+    execute_window_loop(window, [&](const Coordinates & id)
+    {
+        // Get bias and pointer to input
+        const auto in_ptr = reinterpret_cast<int32_t *>(in.ptr());
+        int32x4x4_t v_in =
+        {
+            {
+                vld1q_s32(in_ptr),
+                vld1q_s32(in_ptr + 4),
+                vld1q_s32(in_ptr + 8),
+                vld1q_s32(in_ptr + 12)
+            }
+        };
+
+        const auto out_ptr = reinterpret_cast<uint8_t *>(out.ptr());
+        vst1q_u8(out_ptr, finalize_quantization<false>(v_in, result_fixedpoint_multiplier, result_shift, result_offset_after_shift_s32, min, max));
+    },
+    in, out);
+}
 } // namespace
 
 NEDirectConvolutionLayerOutputStageKernel::NEDirectConvolutionLayerOutputStageKernel()
-    : _func(nullptr), _input(nullptr), _bias(nullptr), _output(nullptr)
+    : _func(nullptr), _input(nullptr), _bias(nullptr), _output(nullptr), _result_fixedpoint_multiplier(0), _result_shift(0), _result_offset_after_shift(0)
 {
 }
 
-void NEDirectConvolutionLayerOutputStageKernel::configure(ITensor *input, const ITensor *bias, ITensor *output)
+void NEDirectConvolutionLayerOutputStageKernel::configure(ITensor *input, const ITensor *bias, ITensor *output,
+                                                          int result_fixedpoint_multiplier, int result_shift, int result_offset_after_shift)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(input);
 
     // Auto-initialize output output if required
     if(output != nullptr)
     {
+        // Work out expected output data type
+        const DataType output_dt = (input->info()->data_type() == DataType::S32) ? DataType::QASYMM8 : input->info()->data_type();
         // Output tensor auto initialization if not yet initialized
-        auto_init_if_empty(*output->info(), *input->info());
+        auto_init_if_empty(*output->info(), input->info()->clone()->set_data_type(output_dt));
     }
 
     // Perform validation step
     ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), (bias == nullptr) ? nullptr : bias->info(), (output == nullptr) ? nullptr : output->info()));
 
-    _func   = nullptr;
-    _bias   = bias;
-    _input  = input;
-    _output = output;
+    _func                         = nullptr;
+    _bias                         = bias;
+    _input                        = input;
+    _output                       = output;
+    _result_fixedpoint_multiplier = result_fixedpoint_multiplier;
+    _result_shift                 = result_shift;
+    _result_offset_after_shift    = result_offset_after_shift;
 
     // Configure kernel window
     auto win_config = validate_and_configure_window(input->info(), (bias == nullptr) ? nullptr : bias->info(), (output == nullptr) ? nullptr : output->info());
@@ -350,6 +446,9 @@ void NEDirectConvolutionLayerOutputStageKernel::configure(ITensor *input, const
             _func = (output == nullptr) ? &output_stage<qint32_t, qint16_t, true, true> : &output_stage<qint32_t, qint16_t, false, true>;
             break;
         }
+        case DataType::S32:
+            _func = (bias == nullptr) ? &output_stage<int32_t, uint8_t, false, false> : &output_stage<int32_t, uint8_t, false, true>;
+            break;
 #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
         case DataType::F16:
         {
@@ -365,7 +464,6 @@ void NEDirectConvolutionLayerOutputStageKernel::configure(ITensor *input, const
         default:
         {
             ARM_COMPUTE_ERROR("Unsupported combination of types among the inputs.");
-            break;
         }
     }
 }
@@ -385,5 +483,5 @@ void NEDirectConvolutionLayerOutputStageKernel::run(const Window &window, const
     ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
     ARM_COMPUTE_ERROR_ON(_func == nullptr);
 
-    (*_func)(_input, _bias, window, _output);
+    (*_func)(_input, _bias, window, _output, _result_fixedpoint_multiplier, _result_shift, _result_offset_after_shift);
 }
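For context on what the new S32 path computes: an asymmetrically quantized (QASYMM8) convolution accumulates into 32-bit integers, and this output stage turns each accumulator back into a uint8 value. Per element it adds the bias, rescales by a fixed-point multiplier and right shift (the integer stand-in for a floating-point scale), adds the output offset and clamps to [0, 255]. The standalone sketch below is only an illustration of that arithmetic under the usual gemmlowp-style conventions; it is not the library's implementation. The kernel does this sixteen elements at a time with NEON intrinsics inside finalize_quantization(), whose rounding details differ slightly, and the values used in main() are made up.

#include <algorithm>
#include <cstdint>
#include <cstdio>

// Rough scalar model of the S32 -> QASYMM8 requantization performed by the
// output stage (illustrative only).
uint8_t requantize_s32_to_qasymm8(int32_t acc, int32_t bias,
                                  int32_t result_fixedpoint_multiplier,
                                  int32_t result_shift,
                                  int32_t result_offset_after_shift)
{
    acc += bias; // bias accumulation, as in the has_bias specialization

    // Fixed-point multiply: high 32 bits of the doubled 64-bit product, with rounding.
    const int64_t prod  = static_cast<int64_t>(acc) * result_fixedpoint_multiplier;
    const int64_t nudge = (prod >= 0) ? (1LL << 30) : (1 - (1LL << 30));
    int32_t       v     = static_cast<int32_t>((prod + nudge) >> 31);

    // Rounding right shift by result_shift.
    if(result_shift > 0)
    {
        v = (v + (1 << (result_shift - 1))) >> result_shift;
    }

    // Add the output offset (zero-point) and clamp to the unsigned 8-bit range.
    v += result_offset_after_shift;
    return static_cast<uint8_t>(std::min(255, std::max(0, v)));
}

int main()
{
    // Hypothetical accumulator, bias and quantization parameters, chosen only to exercise the function.
    const uint8_t q = requantize_s32_to_qasymm8(1234, 56, 1717986918, 3, 10);
    std::printf("requantized value: %u\n", static_cast<unsigned>(q));
    return 0;
}

In the patch itself this work is done by the two output_stage<int32_t, uint8_t, false, ...> specializations, which configure() selects for DataType::S32 inputs using the result_fixedpoint_multiplier, result_shift and result_offset_after_shift parameters stored on the kernel.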