From 9247c92bd8c53be4d0c4ae931f51ca8f88e4150b Mon Sep 17 00:00:00 2001 From: Georgios Pinitas Date: Wed, 28 Jun 2017 18:29:47 +0100 Subject: COMPMID-428: Port NESoftmaxLayer to 16-bit fixed point. Change-Id: I65122950bab9124b9758c27096c0f458b77aeabb Reviewed-on: http://mpd-gerrit.cambridge.arm.com/79365 Reviewed-by: Moritz Pflanzer Tested-by: Kaizen Reviewed-by: Steven Niu --- arm_compute/core/FixedPoint.h | 18 ++ arm_compute/core/FixedPoint.inl | 17 +- arm_compute/core/NEON/NEFixedPoint.h | 1 + arm_compute/core/NEON/NEFixedPoint.inl | 64 ++++- .../core/NEON/kernels/NESoftmaxLayerKernel.h | 6 +- .../runtime/NEON/functions/NESoftmaxLayer.h | 2 +- src/core/NEON/kernels/NESoftmaxLayerKernel.cpp | 305 +++++++++++++++------ src/runtime/NEON/functions/NESoftmaxLayer.cpp | 4 +- tests/validation/NEON/SoftmaxLayer.cpp | 49 +++- 9 files changed, 360 insertions(+), 106 deletions(-) diff --git a/arm_compute/core/FixedPoint.h b/arm_compute/core/FixedPoint.h index 774125ec7d..f166d93c3e 100644 --- a/arm_compute/core/FixedPoint.h +++ b/arm_compute/core/FixedPoint.h @@ -29,6 +29,7 @@ namespace arm_compute using qint8_t = int8_t; /**< 8 bit fixed point scalar value */ using qint16_t = int16_t; /**< 16 bit fixed point scalar value */ using qint32_t = int32_t; /**< 32 bit fixed point scalar value */ +using qint64_t = int64_t; /**< 64 bit fixed point scalar value */ /** 8 bit fixed point scalar saturating shift left * @@ -100,6 +101,15 @@ qint8_t sqadd_qs8(qint8_t a, qint8_t b); */ qint16_t sqadd_qs16(qint16_t a, qint16_t b); +/** 32 bit fixed point scalar saturating add + * + * @param[in] a First 32 bit fixed point input + * @param[in] b Second 32 bit fixed point input + * + * @return The result of the 32 bit fixed point addition. The result is saturated in case of overflow + */ +qint32_t sqadd_qs32(qint32_t a, qint32_t b); + /** 8 bit fixed point scalar subtraction * * @param[in] a First 8 bit fixed point input @@ -332,6 +342,14 @@ qint16_t sqcvt_qs16_f32(float a, int fixed_point_position); * @return The narrowing conversion to 8 bit */ qint8_t sqmovn_qs16(qint16_t a); + +/** Scalar saturating move and narrow. + * + * @param[in] a Input to convert to 16 bit fixed point + * + * @return The narrowing conversion to 16 bit + */ +qint16_t sqmovn_qs32(qint32_t a); } #include "arm_compute/core/FixedPoint.inl" #endif /* __ARM_COMPUTE_FIXEDPOINT_H__ */ diff --git a/arm_compute/core/FixedPoint.inl b/arm_compute/core/FixedPoint.inl index fdbc3f0c06..b921b32ed9 100644 --- a/arm_compute/core/FixedPoint.inl +++ b/arm_compute/core/FixedPoint.inl @@ -90,13 +90,22 @@ inline qint8_t sqadd_qs8(qint8_t a, qint8_t b) inline qint16_t sqadd_qs16(qint16_t a, qint16_t b) { - // We need to store the temporary result in qint16_t otherwise we cannot evaluate the overflow + // We need to store the temporary result in qint32_t otherwise we cannot evaluate the overflow qint32_t tmp = (static_cast(a) + static_cast(b)); // Saturate the result in case of overflow and cast to qint16_t return saturate_convert(tmp); } +inline qint32_t sqadd_qs32(qint32_t a, qint32_t b) +{ + // We need to store the temporary result in qint64_t otherwise we cannot evaluate the overflow + qint64_t tmp = (static_cast(a) + static_cast(b)); + + // Saturate the result in case of overflow and cast to qint32_t + return saturate_convert(tmp); +} + inline qint8_t ssub_qs8(qint8_t a, qint8_t b) { return a - b; @@ -388,4 +397,10 @@ inline qint8_t sqmovn_qs16(qint16_t a) // Saturate the result in case of overflow and cast to qint8_t return saturate_convert(a); } + +inline qint16_t sqmovn_qs32(qint32_t a) +{ + // Saturate the result in case of overflow and cast to qint16_t + return saturate_convert(a); +} } diff --git a/arm_compute/core/NEON/NEFixedPoint.h b/arm_compute/core/NEON/NEFixedPoint.h index e30509cd0a..09579f9120 100644 --- a/arm_compute/core/NEON/NEFixedPoint.h +++ b/arm_compute/core/NEON/NEFixedPoint.h @@ -46,6 +46,7 @@ using qint16x8_t = int16x8_t; /**< 16 bit fixed point vector with 8 elements using qint16x8x2_t = int16x8x2_t; /**< 16 bit fixed point vector with 16 elements */ using qint16x8x3_t = int16x8x3_t; /**< 16 bit fixed point vector with 24 elements */ using qint16x8x4_t = int16x8x4_t; /**< 16 bit fixed point vector with 32 elements */ +using qint32x2_t = int32x2_t; /**< 32 bit fixed point vector with 2 elements */ using qint32x4_t = int32x4_t; /**< 32 bit fixed point vector with 4 elements */ /** Get the lower half of a 16 elements vector diff --git a/arm_compute/core/NEON/NEFixedPoint.inl b/arm_compute/core/NEON/NEFixedPoint.inl index b241dd5069..f62a338a61 100644 --- a/arm_compute/core/NEON/NEFixedPoint.inl +++ b/arm_compute/core/NEON/NEFixedPoint.inl @@ -384,6 +384,11 @@ inline qint16x4_t vqadd_qs16(qint16x4_t a, qint16x4_t b) return vqadd_s16(a, b); } +inline qint32x2_t vqadd_qs32(qint32x2_t a, qint32x2_t b) +{ + return vqadd_s32(a, b); +} + inline qint8x16_t vqaddq_qs8(qint8x16_t a, qint8x16_t b) { return vqaddq_s8(a, b); @@ -394,6 +399,11 @@ inline qint16x8_t vqaddq_qs16(qint16x8_t a, qint16x8_t b) return vqaddq_s16(a, b); } +inline qint32x4_t vqaddq_qs32(qint32x4_t a, qint32x4_t b) +{ + return vqaddq_s32(a, b); +} + inline int16x4_t vpaddl_qs8(qint8x8_t a) { return vpaddl_s8(a); @@ -1073,6 +1083,56 @@ inline qint16x4_t vrecip_qs16(qint16x4_t a, int fixed_point_position) return vshl_s16(x, shift_value); } +inline qint8x8_t vqrecip_qs8(qint8x8_t a, int fixed_point_position) +{ + // We need two bits to store 2, thus we can only support formats from Q2.5 to Q7.0 + const qint8x8_t const_48_over_17 = vdup_n_s8(0x5A >> (5 - fixed_point_position)); // 2.823 + const qint8x8_t const_32_over_17 = vdup_n_s8((0x3C >> (5 - fixed_point_position))); // 1.8823 + const qint8x8_t const_one = vdup_n_s8(1 << fixed_point_position); + + // Find shift value + const qint8x8_t shift_value = vqneg_s8(vsub_s8(vdup_n_s8(8), vqadd_s8(vclz_s8(a), vdup_n_s8(fixed_point_position)))); + const qint8x8_t temp = vqshl_s8(a, shift_value); + + qint8x8_t x = vqadd_s8(const_48_over_17, vqmul_qs8(temp, const_32_over_17, fixed_point_position)); + + uint8x8_t set_one = vcgt_s8(x, const_one); + x = vbsl_s8(set_one, const_one, x); + + // Use three iterations of Newton-Raphson method to get the result + x = vqadd_s8(x, vqmul_qs8(x, vqsub_s8(const_one, vqmul_qs8(temp, x, fixed_point_position)), fixed_point_position)); + x = vqadd_s8(x, vqmul_qs8(x, vqsub_s8(const_one, vqmul_qs8(temp, x, fixed_point_position)), fixed_point_position)); + x = vqadd_s8(x, vqmul_qs8(x, vqsub_s8(const_one, vqmul_qs8(temp, x, fixed_point_position)), fixed_point_position)); + + return vqshl_s8(x, shift_value); +} + +inline qint16x4_t vqrecip_qs16(qint16x4_t a, int fixed_point_position) +{ + // We need two bits to store 2, thus we can only support formats from Q2.13 to Q15.0 + const qint16x4_t const_48_over_17 = vdup_n_s16(0x5A5A >> (13 - fixed_point_position)); // 2.823 + const qint16x4_t const_32_over_17 = vdup_n_s16(0x3C3C >> (13 - fixed_point_position)); // 1.8823 + const qint16x4_t const_one = vdup_n_s16(1 << fixed_point_position); + + // Find shift value + const qint16x4_t shift_value = vqneg_s16(vqsub_s16(vdup_n_s16(8), vqadd_s16(vclz_s16(a), vdup_n_s16(fixed_point_position)))); + const qint16x4_t temp = vqshl_s16(a, shift_value); + + qint16x4_t x = vqadd_s16(const_48_over_17, vqmul_qs16(temp, const_32_over_17, fixed_point_position)); + + uint16x4_t set_one = vcgt_s16(x, const_one); + x = vbsl_s16(set_one, const_one, x); + + // Use five iterations of Newton-Raphson method to get the result + x = vqadd_s16(x, vmul_qs16(x, vqsub_s16(const_one, vqmul_qs16(temp, x, fixed_point_position)), fixed_point_position)); + x = vqadd_s16(x, vmul_qs16(x, vqsub_s16(const_one, vqmul_qs16(temp, x, fixed_point_position)), fixed_point_position)); + x = vqadd_s16(x, vmul_qs16(x, vqsub_s16(const_one, vqmul_qs16(temp, x, fixed_point_position)), fixed_point_position)); + x = vqadd_s16(x, vmul_qs16(x, vqsub_s16(const_one, vqmul_qs16(temp, x, fixed_point_position)), fixed_point_position)); + x = vqadd_s16(x, vmul_qs16(x, vqsub_s16(const_one, vqmul_qs16(temp, x, fixed_point_position)), fixed_point_position)); + + return vqshl_s16(x, shift_value); +} + inline qint8x16_t vrecipq_qs8(qint8x16_t a, int fixed_point_position) { // We need two bits to store 2, thus we can only support formats from Q2.5 to Q7.0 @@ -1817,7 +1877,7 @@ inline qint8x8_t vqtanh_qs8(qint8x8_t a, int fixed_point_position) qint8x8_t exp2x = vqexp_qs8(vqmul_qs8(const_two, a, fixed_point_position), fixed_point_position); qint8x8_t num = vqsub_qs8(exp2x, const_one); qint8x8_t den = vqadd_qs8(exp2x, const_one); - qint8x8_t tanh = vqmul_qs8(num, vrecip_qs8(den, fixed_point_position), fixed_point_position); + qint8x8_t tanh = vqmul_qs8(num, vqrecip_qs8(den, fixed_point_position), fixed_point_position); return tanh; } @@ -1830,7 +1890,7 @@ inline qint16x4_t vqtanh_qs16(qint16x4_t a, int fixed_point_position) qint16x4_t exp2x = vqexp_qs16(vqmul_qs16(const_two, a, fixed_point_position), fixed_point_position); qint16x4_t num = vqsub_qs16(exp2x, const_one); qint16x4_t den = vqadd_qs16(exp2x, const_one); - qint16x4_t tanh = vqmul_qs16(num, vrecip_qs16(den, fixed_point_position), fixed_point_position); + qint16x4_t tanh = vqmul_qs16(num, vqrecip_qs16(den, fixed_point_position), fixed_point_position); return tanh; } diff --git a/arm_compute/core/NEON/kernels/NESoftmaxLayerKernel.h b/arm_compute/core/NEON/kernels/NESoftmaxLayerKernel.h index ab626ad5ec..53eef8d665 100644 --- a/arm_compute/core/NEON/kernels/NESoftmaxLayerKernel.h +++ b/arm_compute/core/NEON/kernels/NESoftmaxLayerKernel.h @@ -39,7 +39,7 @@ public: NELogits1DMaxKernel(); /** Set the input and output tensors. * - * @param[in] input Source tensor. Data types supported: QS8, F32. + * @param[in] input Source tensor. Data types supported: QS8/QS16/F32. * @param[out] output Destination tensor. Data types supported: same as @p input */ void configure(const ITensor *input, ITensor *output); @@ -74,7 +74,7 @@ public: ~NELogits1DShiftExpSumKernel() = default; /** Set the input and output tensors. * - * @param[in] input Source tensor. Data types supported: QS8, F32. + * @param[in] input Source tensor. Data types supported: QS8/QS16/F32. * @param[in] max Max values tensor. Data types supported: same as @p input. * @param[out] output Destination tensor. Data types supported: same as @p input. * @param[out] sum Sum of 1D logits tensor. Data types supported: same as @p input. @@ -113,7 +113,7 @@ public: ~NELogits1DNormKernel() = default; /** Set the input and output tensors. * - * @param[in] input Source tensor. Data types supported: QS8, F32. + * @param[in] input Source tensor. Data types supported: QS8/QS16/F32. * @param[in] sum Sum tensor. The number of dimensions should be dim(input)-1. Data types supported: same as @p input. * @param[out] output Destination tensor. Data types supported: same as @p input. */ diff --git a/arm_compute/runtime/NEON/functions/NESoftmaxLayer.h b/arm_compute/runtime/NEON/functions/NESoftmaxLayer.h index dc84dec0e4..44a69d8c19 100644 --- a/arm_compute/runtime/NEON/functions/NESoftmaxLayer.h +++ b/arm_compute/runtime/NEON/functions/NESoftmaxLayer.h @@ -50,7 +50,7 @@ public: NESoftmaxLayer(); /** Set the input and output tensors. * - * @param[in] input Source tensor. Data types supported: QS8/F32. + * @param[in] input Source tensor. Data types supported: QS8/QS16/F32. * @param[out] output Destination tensor. Data types supported: same as @p input. */ void configure(ITensor *input, ITensor *output); diff --git a/src/core/NEON/kernels/NESoftmaxLayerKernel.cpp b/src/core/NEON/kernels/NESoftmaxLayerKernel.cpp index 854fd84845..fe62d7b575 100644 --- a/src/core/NEON/kernels/NESoftmaxLayerKernel.cpp +++ b/src/core/NEON/kernels/NESoftmaxLayerKernel.cpp @@ -43,7 +43,7 @@ using namespace arm_compute; namespace { -void logits_1d_max_f32(const ITensor *in, ITensor *out, const Window &window) +void logits_1d_max_qs8(const ITensor *in, ITensor *out, const Window &window) { Window in_slice = window.first_slice_window_1D(); @@ -56,25 +56,57 @@ void logits_1d_max_f32(const ITensor *in, ITensor *out, const Window &window) Iterator input(in, in_slice); Iterator output(out, max_slice); - float32x4_t vec_max = vdupq_n_f32(-FLT_MAX); + qint8x16_t vec_max = vdupq_n_s8(std::numeric_limits::lowest()); execute_window_loop(in_slice, [&](const Coordinates & id) { - const auto in_ptr = reinterpret_cast(input.ptr()); - const float32x4_t current_value = vld1q_f32(in_ptr); - vec_max = vmaxq_f32(vec_max, current_value); + const auto in_ptr = reinterpret_cast(input.ptr()); + const qint8x16_t current_value = vld1q_qs8(in_ptr); + vec_max = vmaxq_qs8(vec_max, current_value); }, input); - float32x2_t carry_max = vpmax_f32(vget_high_f32(vec_max), vget_low_f32(vec_max)); - carry_max = vpmax_f32(carry_max, carry_max); + qint8x8_t carry_max = vpmax_qs8(vget_high_s8(vec_max), vget_low_s8(vec_max)); + carry_max = vpmax_qs8(carry_max, carry_max); + carry_max = vpmax_qs8(carry_max, carry_max); + carry_max = vpmax_qs8(carry_max, carry_max); - *(reinterpret_cast(output.ptr())) = vget_lane_f32(carry_max, 0); + *(reinterpret_cast(output.ptr())) = vget_lane_s8(carry_max, 0); } while(window.slide_window_slice_1D(in_slice) && window.slide_window_slice_1D(max_slice)); } +void logits_1d_max_qs16(const ITensor *in, ITensor *out, const Window &window) +{ + Window in_slice = window.first_slice_window_1D(); -void logits_1d_max_qs8(const ITensor *in, ITensor *out, const Window &window) + Window window_max(window); + window_max.set(Window::DimX, Window::Dimension(0, 0, 0)); + Window max_slice = window_max.first_slice_window_1D(); + + do + { + Iterator input(in, in_slice); + Iterator output(out, max_slice); + + qint16x8_t vec_max = vdupq_n_qs16(std::numeric_limits::lowest()); + + execute_window_loop(in_slice, [&](const Coordinates & id) + { + const auto in_ptr = reinterpret_cast(input.ptr()); + const qint16x8_t current_value = vld1q_qs16(in_ptr); + vec_max = vmaxq_qs16(vec_max, current_value); + }, + input); + + qint16x4_t carry_max = vpmax_qs16(vget_high_qs16(vec_max), vget_low_qs16(vec_max)); + carry_max = vpmax_qs16(carry_max, carry_max); + carry_max = vpmax_qs16(carry_max, carry_max); + + *(reinterpret_cast(output.ptr())) = vget_lane_s16(carry_max, 0); + } + while(window.slide_window_slice_1D(in_slice) && window.slide_window_slice_1D(max_slice)); +} +void logits_1d_max_f32(const ITensor *in, ITensor *out, const Window &window) { Window in_slice = window.first_slice_window_1D(); @@ -87,22 +119,20 @@ void logits_1d_max_qs8(const ITensor *in, ITensor *out, const Window &window) Iterator input(in, in_slice); Iterator output(out, max_slice); - qint8x16_t vec_max = vdupq_n_s8(-1); + float32x4_t vec_max = vdupq_n_f32(-FLT_MAX); execute_window_loop(in_slice, [&](const Coordinates & id) { - const auto in_ptr = reinterpret_cast(input.ptr()); - const qint8x16_t current_value = vld1q_qs8(in_ptr); - vec_max = vmaxq_qs8(vec_max, current_value); + const auto in_ptr = reinterpret_cast(input.ptr()); + const float32x4_t current_value = vld1q_f32(in_ptr); + vec_max = vmaxq_f32(vec_max, current_value); }, input); - qint8x8_t carry_max = vpmax_qs8(vget_high_s8(vec_max), vget_low_s8(vec_max)); - carry_max = vpmax_qs8(carry_max, carry_max); - carry_max = vpmax_qs8(carry_max, carry_max); - carry_max = vpmax_qs8(carry_max, carry_max); + float32x2_t carry_max = vpmax_f32(vget_high_f32(vec_max), vget_low_f32(vec_max)); + carry_max = vpmax_f32(carry_max, carry_max); - *(reinterpret_cast(output.ptr())) = vget_lane_s8(carry_max, 0); + *(reinterpret_cast(output.ptr())) = vget_lane_f32(carry_max, 0); } while(window.slide_window_slice_1D(in_slice) && window.slide_window_slice_1D(max_slice)); } @@ -120,7 +150,7 @@ BorderSize NELogits1DMaxKernel::border_size() const void NELogits1DMaxKernel::configure(const ITensor *input, ITensor *output) { - ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::F32); + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::QS16, DataType::F32); ARM_COMPUTE_ERROR_ON_NULLPTR(output); // Softmax across the x dimension @@ -135,17 +165,18 @@ void NELogits1DMaxKernel::configure(const ITensor *input, ITensor *output) ARM_COMPUTE_ERROR_ON_MISMATCHING_DIMENSIONS(output->info()->tensor_shape(), output_shape); const int input_width = input->info()->valid_region().shape.x(); - unsigned int num_elems_processed_per_iteration = 0; + unsigned int num_elems_processed_per_iteration = 16 / data_size_from_type(input->info()->data_type()); switch(input->info()->data_type()) { case DataType::QS8: - _func = &logits_1d_max_qs8; - num_elems_processed_per_iteration = 16; + _func = &logits_1d_max_qs8; + break; + case DataType::QS16: + _func = &logits_1d_max_qs16; break; case DataType::F32: - num_elems_processed_per_iteration = 4; - _func = &logits_1d_max_f32; + _func = &logits_1d_max_f32; break; default: ARM_COMPUTE_ERROR("Unsupported data type."); @@ -180,7 +211,7 @@ void NELogits1DMaxKernel::run(const Window &window) namespace { -void logits_1d_shift_exp_sum_f32(const ITensor *in, const ITensor *max, ITensor *out, ITensor *sum, const Window &window) +void logits_1d_shift_exp_sum_qs8(const ITensor *in, const ITensor *max, ITensor *out, ITensor *sum, const Window &window) { Window window_max(window); window_max.set(Window::DimX, Window::Dimension(0, 0, 0)); @@ -188,9 +219,10 @@ void logits_1d_shift_exp_sum_f32(const ITensor *in, const ITensor *max, ITensor Window max_slice = window_max.first_slice_window_1D(); Window in_slice = window.first_slice_window_1D(); - constexpr int step = 4; - const int long_steps = in->info()->valid_region().shape.x() / step; - const int small_steps = in->info()->valid_region().shape.x() % step; + constexpr int step = 8; + const int long_steps = in->info()->valid_region().shape.x() / step; + const int small_steps = in->info()->valid_region().shape.x() % step; + const int fixed_point_position = in->info()->fixed_point_position(); do { @@ -200,48 +232,48 @@ void logits_1d_shift_exp_sum_f32(const ITensor *in, const ITensor *max, ITensor Iterator _sum(sum, max_slice); // Get pointers - auto in_ptr = reinterpret_cast(input.ptr()); - auto exp_ptr = reinterpret_cast(exp.ptr()); + auto in_ptr = reinterpret_cast(input.ptr()); + auto exp_ptr = reinterpret_cast(exp.ptr()); // Init sum to zero - float32x4_t vec_sum_value = vdupq_n_f32(0.0f); + qint16x8_t vec_sum_value = vdupq_n_qs16(0); // Get max value - const auto max_ptr = reinterpret_cast(_max.ptr()); - const float32x4_t vec_max = vdupq_n_f32(*max_ptr); + const auto max_ptr = reinterpret_cast(_max.ptr()); + const qint8x8_t vec_max = vdup_n_qs8(*max_ptr); // Run neon loop for(int i = 0; i < long_steps; ++i) { - float32x4_t vec_elements = vld1q_f32(in_ptr); - vec_elements = vsubq_f32(vec_elements, vec_max); - vec_elements = vexpq_f32(vec_elements); + qint8x8_t vec_elements = vld1_qs8(in_ptr); + vec_elements = vqsub_qs8(vec_elements, vec_max); + vec_elements = vqexp_qs8(vec_elements, fixed_point_position); - vst1q_f32(exp_ptr, vec_elements); - vec_sum_value = vaddq_f32(vec_elements, vec_sum_value); + vst1_qs8(exp_ptr, vec_elements); + vec_sum_value = vqaddq_qs16(vec_sum_value, vmovl_s8(vec_elements)); in_ptr += step; exp_ptr += step; } - // Reduce sum - float32x2_t carry_addition = vpadd_f32(vget_high_f32(vec_sum_value), vget_low_f32(vec_sum_value)); - carry_addition = vpadd_f32(carry_addition, carry_addition); - float sum = vget_lane_f32(carry_addition, 0); + const qint16x4_t sum_red = vqadd_qs16(vget_low_s16(vec_sum_value), vget_high_s16(vec_sum_value)); + const qint16_t sum0 = sqadd_qs16(vget_lane_s16(sum_red, 0), vget_lane_s16(sum_red, 1)); + const qint16_t sum1 = sqadd_qs16(vget_lane_s16(sum_red, 2), vget_lane_s16(sum_red, 3)); + qint16_t sum = sqadd_qs16(sum0, sum1); // Run remaining elements for(int i = 0; i < small_steps; ++i) { - float element = std::exp(in_ptr[i] - *max_ptr); - exp_ptr[i] = element; - sum += element; + qint8_t element = sqexp_qs8(sqsub_qs8(in_ptr[i], *max_ptr), fixed_point_position); + exp_ptr[i] = element; + sum = sqadd_qs16(sum, element); } - *(reinterpret_cast(_sum.ptr())) = sum; + *(reinterpret_cast(_sum.ptr())) = sqmovn_qs16(sum); } while(window.slide_window_slice_1D(in_slice) && window.slide_window_slice_1D(max_slice)); } -void logits_1d_shift_exp_sum_qs8(const ITensor *in, const ITensor *max, ITensor *out, ITensor *sum, const Window &window) +void logits_1d_shift_exp_sum_qs16(const ITensor *in, const ITensor *max, ITensor *out, ITensor *sum, const Window &window) { Window window_max(window); window_max.set(Window::DimX, Window::Dimension(0, 0, 0)); @@ -249,7 +281,7 @@ void logits_1d_shift_exp_sum_qs8(const ITensor *in, const ITensor *max, ITensor Window max_slice = window_max.first_slice_window_1D(); Window in_slice = window.first_slice_window_1D(); - constexpr int step = 8; + constexpr int step = 4; const int long_steps = in->info()->valid_region().shape.x() / step; const int small_steps = in->info()->valid_region().shape.x() % step; const int fixed_point_position = in->info()->fixed_point_position(); @@ -262,44 +294,103 @@ void logits_1d_shift_exp_sum_qs8(const ITensor *in, const ITensor *max, ITensor Iterator _sum(sum, max_slice); // Get pointers - auto in_ptr = reinterpret_cast(input.ptr()); - auto exp_ptr = reinterpret_cast(exp.ptr()); + auto in_ptr = reinterpret_cast(input.ptr()); + auto exp_ptr = reinterpret_cast(exp.ptr()); // Init sum to zero - qint16x8_t vec_sum_value = vdupq_n_qs16(0); + qint32x4_t vec_sum_value = vdupq_n_qs32(0); // Get max value - const auto max_ptr = reinterpret_cast(_max.ptr()); - const qint8x8_t vec_max = vdup_n_qs8(*max_ptr); + const auto max_ptr = reinterpret_cast(_max.ptr()); + const qint16x4_t vec_max = vdup_n_qs16(*max_ptr); // Run neon loop for(int i = 0; i < long_steps; ++i) { - qint8x8_t vec_elements = vld1_qs8(in_ptr); - vec_elements = vqsub_qs8(vec_elements, vec_max); - vec_elements = vqexp_qs8(vec_elements, fixed_point_position); + qint16x4_t vec_elements = vld1_qs16(in_ptr); + vec_elements = vqsub_qs16(vec_elements, vec_max); + vec_elements = vqexp_qs16(vec_elements, fixed_point_position); - vst1_qs8(exp_ptr, vec_elements); - vec_sum_value = vqaddq_qs16(vec_sum_value, vmovl_s8(vec_elements)); + vst1_qs16(exp_ptr, vec_elements); + vec_sum_value = vqaddq_qs32(vec_sum_value, vmovl_s16(vec_elements)); in_ptr += step; exp_ptr += step; } // Reduce sum - const qint16x4_t sum_red = vqadd_qs16(vget_low_s16(vec_sum_value), vget_high_s16(vec_sum_value)); - const qint16_t sum0 = sqadd_qs16(vget_lane_s16(sum_red, 0), vget_lane_s16(sum_red, 1)); - const qint16_t sum1 = sqadd_qs16(vget_lane_s16(sum_red, 2), vget_lane_s16(sum_red, 3)); - qint16_t sum = sqadd_qs16(sum0, sum1); + qint32x2_t carry_addition = vqadd_qs32(vget_high_s32(vec_sum_value), vget_low_s32(vec_sum_value)); + qint32_t sum = vget_lane_s32(carry_addition, 0) + vget_lane_s32(carry_addition, 1); // Run remaining elements for(int i = 0; i < small_steps; ++i) { - qint8_t element = sqexp_qs8(sqsub_qs8(in_ptr[i], *max_ptr), fixed_point_position); - exp_ptr[i] = element; - sum = sqadd_qs16(sum, element); + qint16_t element = sqexp_qs16(sqsub_qs16(in_ptr[i], *max_ptr), fixed_point_position); + exp_ptr[i] = element; + sum = sqadd_qs32(sum, element); } - *(reinterpret_cast(_sum.ptr())) = sqmovn_qs16(sum); + *(reinterpret_cast(_sum.ptr())) = sqmovn_qs32(sum); + } + while(window.slide_window_slice_1D(in_slice) && window.slide_window_slice_1D(max_slice)); +} +void logits_1d_shift_exp_sum_f32(const ITensor *in, const ITensor *max, ITensor *out, ITensor *sum, const Window &window) +{ + Window window_max(window); + window_max.set(Window::DimX, Window::Dimension(0, 0, 0)); + + Window max_slice = window_max.first_slice_window_1D(); + Window in_slice = window.first_slice_window_1D(); + + constexpr int step = 4; + const int long_steps = in->info()->valid_region().shape.x() / step; + const int small_steps = in->info()->valid_region().shape.x() % step; + + do + { + Iterator input(in, in_slice); + Iterator exp(out, in_slice); + Iterator _max(max, max_slice); + Iterator _sum(sum, max_slice); + + // Get pointers + auto in_ptr = reinterpret_cast(input.ptr()); + auto exp_ptr = reinterpret_cast(exp.ptr()); + + // Init sum to zero + float32x4_t vec_sum_value = vdupq_n_f32(0.0f); + + // Get max value + const auto max_ptr = reinterpret_cast(_max.ptr()); + const float32x4_t vec_max = vdupq_n_f32(*max_ptr); + + // Run neon loop + for(int i = 0; i < long_steps; ++i) + { + float32x4_t vec_elements = vld1q_f32(in_ptr); + vec_elements = vsubq_f32(vec_elements, vec_max); + vec_elements = vexpq_f32(vec_elements); + + vst1q_f32(exp_ptr, vec_elements); + vec_sum_value = vaddq_f32(vec_elements, vec_sum_value); + + in_ptr += step; + exp_ptr += step; + } + + // Reduce sum + float32x2_t carry_addition = vpadd_f32(vget_high_f32(vec_sum_value), vget_low_f32(vec_sum_value)); + carry_addition = vpadd_f32(carry_addition, carry_addition); + float sum = vget_lane_f32(carry_addition, 0); + + // Run remaining elements + for(int i = 0; i < small_steps; ++i) + { + float element = std::exp(in_ptr[i] - *max_ptr); + exp_ptr[i] = element; + sum += element; + } + + *(reinterpret_cast(_sum.ptr())) = sum; } while(window.slide_window_slice_1D(in_slice) && window.slide_window_slice_1D(max_slice)); } @@ -312,7 +403,7 @@ NELogits1DShiftExpSumKernel::NELogits1DShiftExpSumKernel() void NELogits1DShiftExpSumKernel::configure(const ITensor *input, const ITensor *max, ITensor *output, ITensor *sum) { - ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::F32); + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::QS16, DataType::F32); ARM_COMPUTE_ERROR_ON_NULLPTR(max, sum, output); // Output auto initialization if not yet initialized @@ -331,6 +422,9 @@ void NELogits1DShiftExpSumKernel::configure(const ITensor *input, const ITensor case DataType::QS8: _func = &logits_1d_shift_exp_sum_qs8; break; + case DataType::QS16: + _func = &logits_1d_shift_exp_sum_qs16; + break; case DataType::F32: _func = &logits_1d_shift_exp_sum_f32; break; @@ -369,37 +463,39 @@ void NELogits1DShiftExpSumKernel::run(const Window &window) namespace { -void logits_1d_norm_f32(const ITensor *in, const ITensor *sum, ITensor *out, const Window &window) +void logits_1d_norm_qs8(const ITensor *in, const ITensor *sum, ITensor *out, const Window &window) { Window window_sum(window); window_sum.set(Window::DimX, Window::Dimension(0, 0, 0)); Window sum_slice = window_sum.first_slice_window_1D(); Window in_slice = window.first_slice_window_1D(); + const int fixed_point_position = in->info()->fixed_point_position(); + do { Iterator input(in, in_slice); Iterator _sum(sum, sum_slice); Iterator output(out, in_slice); - const float sum_value = *reinterpret_cast(_sum.ptr()); - const float32x4_t vec_sum_inversed = vdupq_n_f32(1.0f / sum_value); + const int8_t sum_value = *reinterpret_cast(_sum.ptr()); + const qint8x16_t vec_sum_inversed = vqrecipq_qs8(vdupq_n_qs8(sum_value), fixed_point_position); execute_window_loop(in_slice, [&](const Coordinates & id) { - const auto in_ptr = reinterpret_cast(input.ptr()); - const auto out_ptr = reinterpret_cast(output.ptr()); + const auto in_ptr = reinterpret_cast(input.ptr()); + const auto out_ptr = reinterpret_cast(output.ptr()); - const float32x4_t vec_in = vld1q_f32(in_ptr); - const float32x4_t normalized_value = vmulq_f32(vec_in, vec_sum_inversed); + const qint8x16_t vec_in = vld1q_qs8(in_ptr); + const qint8x16_t normalized_value = vqmulq_qs8(vec_in, vec_sum_inversed, fixed_point_position); - vst1q_f32(out_ptr, normalized_value); + vst1q_qs8(out_ptr, normalized_value); }, input, output); } while(window.slide_window_slice_1D(in_slice) && window.slide_window_slice_1D(sum_slice)); } -void logits_1d_norm_qs8(const ITensor *in, const ITensor *sum, ITensor *out, const Window &window) +void logits_1d_norm_qs16(const ITensor *in, const ITensor *sum, ITensor *out, const Window &window) { Window window_sum(window); window_sum.set(Window::DimX, Window::Dimension(0, 0, 0)); @@ -414,18 +510,48 @@ void logits_1d_norm_qs8(const ITensor *in, const ITensor *sum, ITensor *out, con Iterator _sum(sum, sum_slice); Iterator output(out, in_slice); - const int8_t sum_value = *reinterpret_cast(_sum.ptr()); - const qint8x16_t vec_sum_inversed = vqrecipq_qs8(vdupq_n_qs8(sum_value), fixed_point_position); + const int16_t sum_value = *reinterpret_cast(_sum.ptr()); + const qint16x8_t vec_sum_inversed = vqrecipq_qs16(vdupq_n_qs16(sum_value), fixed_point_position); execute_window_loop(in_slice, [&](const Coordinates & id) { - const auto in_ptr = reinterpret_cast(input.ptr()); - const auto out_ptr = reinterpret_cast(output.ptr()); + const auto in_ptr = reinterpret_cast(input.ptr()); + const auto out_ptr = reinterpret_cast(output.ptr()); - const qint8x16_t vec_in = vld1q_qs8(in_ptr); - const qint8x16_t normalized_value = vqmulq_qs8(vec_in, vec_sum_inversed, fixed_point_position); + const qint16x8_t vec_in = vld1q_qs16(in_ptr); + const qint16x8_t normalized_value = vqmulq_qs16(vec_in, vec_sum_inversed, fixed_point_position); - vst1q_qs8(out_ptr, normalized_value); + vst1q_qs16(out_ptr, normalized_value); + }, + input, output); + } + while(window.slide_window_slice_1D(in_slice) && window.slide_window_slice_1D(sum_slice)); +} +void logits_1d_norm_f32(const ITensor *in, const ITensor *sum, ITensor *out, const Window &window) +{ + Window window_sum(window); + window_sum.set(Window::DimX, Window::Dimension(0, 0, 0)); + Window sum_slice = window_sum.first_slice_window_1D(); + Window in_slice = window.first_slice_window_1D(); + + do + { + Iterator input(in, in_slice); + Iterator _sum(sum, sum_slice); + Iterator output(out, in_slice); + + const float sum_value = *reinterpret_cast(_sum.ptr()); + const float32x4_t vec_sum_inversed = vdupq_n_f32(1.0f / sum_value); + + execute_window_loop(in_slice, [&](const Coordinates & id) + { + const auto in_ptr = reinterpret_cast(input.ptr()); + const auto out_ptr = reinterpret_cast(output.ptr()); + + const float32x4_t vec_in = vld1q_f32(in_ptr); + const float32x4_t normalized_value = vmulq_f32(vec_in, vec_sum_inversed); + + vst1q_f32(out_ptr, normalized_value); }, input, output); } @@ -440,7 +566,7 @@ NELogits1DNormKernel::NELogits1DNormKernel() void NELogits1DNormKernel::configure(const ITensor *input, const ITensor *sum, ITensor *output) { - ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32, DataType::QS8); + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::QS16, DataType::F32); ARM_COMPUTE_ERROR_ON_NULLPTR(sum, output); // Output auto initialization if not yet initialized @@ -455,17 +581,18 @@ void NELogits1DNormKernel::configure(const ITensor *input, const ITensor *sum, I _output = output; // Configure kernel window - unsigned int num_elems_processed_per_iteration = 0; + unsigned int num_elems_processed_per_iteration = 16 / data_size_from_type(input->info()->data_type()); switch(input->info()->data_type()) { case DataType::QS8: - _func = &logits_1d_norm_qs8; - num_elems_processed_per_iteration = 16; + _func = &logits_1d_norm_qs8; + break; + case DataType::QS16: + _func = &logits_1d_norm_qs16; break; case DataType::F32: - num_elems_processed_per_iteration = 4; - _func = &logits_1d_norm_f32; + _func = &logits_1d_norm_f32; break; default: ARM_COMPUTE_ERROR("Unsupported data type."); diff --git a/src/runtime/NEON/functions/NESoftmaxLayer.cpp b/src/runtime/NEON/functions/NESoftmaxLayer.cpp index ca81b95473..7dfa927981 100644 --- a/src/runtime/NEON/functions/NESoftmaxLayer.cpp +++ b/src/runtime/NEON/functions/NESoftmaxLayer.cpp @@ -38,7 +38,7 @@ NESoftmaxLayer::NESoftmaxLayer() void NESoftmaxLayer::configure(ITensor *input, ITensor *output) { - ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::F32); + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::QS16, DataType::F32); // Create intermediate tensors shapes TensorInfo tensor_info_tmp(input->info()->tensor_shape(), input->info()->num_channels(), input->info()->data_type(), input->info()->fixed_point_position()); @@ -54,7 +54,7 @@ void NESoftmaxLayer::configure(ITensor *input, ITensor *output) _max_kernel.configure(input, &_max); _shift_exp_sum_kernel.configure(input, &_max, &_tmp, &_sum); _norm_kernel.configure(&_tmp, &_sum, output); - _fill_border_kernel.configure(input, _max_kernel.border_size(), BorderMode::CONSTANT, PixelValue(-FLT_MAX)); + _fill_border_kernel.configure(input, _max_kernel.border_size(), BorderMode::REPLICATE); // Allocate intermediate tensors _tmp.allocator()->allocate(); diff --git a/tests/validation/NEON/SoftmaxLayer.cpp b/tests/validation/NEON/SoftmaxLayer.cpp index 549463962a..9ac81af278 100644 --- a/tests/validation/NEON/SoftmaxLayer.cpp +++ b/tests/validation/NEON/SoftmaxLayer.cpp @@ -161,36 +161,69 @@ BOOST_DATA_TEST_CASE(RunLarge, LargeShapes() * CNNFloatDataTypes(), shape, dt) BOOST_AUTO_TEST_SUITE_END() BOOST_AUTO_TEST_SUITE(Quantized) +BOOST_AUTO_TEST_SUITE(QS8) // Testing for fixed point position [1,6) as reciprocal limits the maximum fixed point position to 5 BOOST_TEST_DECORATOR(*boost::unit_test::label("precommit")) -BOOST_DATA_TEST_CASE(RunSmall, SmallShapes() * CNNFixedPointDataTypes() * boost::unit_test::data::xrange(1, 6), - shape, dt, fixed_point_position) +BOOST_DATA_TEST_CASE(RunSmall, SmallShapes() * boost::unit_test::data::xrange(1, 6), + shape, fixed_point_position) { // Compute function - Tensor dst = compute_softmax_layer(shape, dt, fixed_point_position); + Tensor dst = compute_softmax_layer(shape, DataType::QS8, fixed_point_position); // Compute reference - RawTensor ref_dst = Reference::compute_reference_softmax_layer(shape, dt, fixed_point_position); + RawTensor ref_dst = Reference::compute_reference_softmax_layer(shape, DataType::QS8, fixed_point_position); // Validate output validate(NEAccessor(dst), ref_dst, tolerance_fixed_point); } BOOST_TEST_DECORATOR(*boost::unit_test::label("nightly")) -BOOST_DATA_TEST_CASE(RunLarge, LargeShapes() * CNNFixedPointDataTypes() * boost::unit_test::data::xrange(1, 6), - shape, dt, fixed_point_position) +BOOST_DATA_TEST_CASE(RunLarge, LargeShapes() * boost::unit_test::data::xrange(1, 6), + shape, fixed_point_position) { // Compute function - Tensor dst = compute_softmax_layer(shape, dt, fixed_point_position); + Tensor dst = compute_softmax_layer(shape, DataType::QS8, fixed_point_position); // Compute reference - RawTensor ref_dst = Reference::compute_reference_softmax_layer(shape, dt, fixed_point_position); + RawTensor ref_dst = Reference::compute_reference_softmax_layer(shape, DataType::QS8, fixed_point_position); // Validate output validate(NEAccessor(dst), ref_dst, tolerance_fixed_point); } BOOST_AUTO_TEST_SUITE_END() +BOOST_AUTO_TEST_SUITE(QS16) +// Testing for fixed point position [1,14) as reciprocal limits the maximum fixed point position to 14 +BOOST_TEST_DECORATOR(*boost::unit_test::label("precommit")) +BOOST_DATA_TEST_CASE(RunSmall, SmallShapes() * boost::unit_test::data::xrange(1, 14), + shape, fixed_point_position) +{ + // Compute function + Tensor dst = compute_softmax_layer(shape, DataType::QS16, fixed_point_position); + + // Compute reference + RawTensor ref_dst = Reference::compute_reference_softmax_layer(shape, DataType::QS16, fixed_point_position); + + // Validate output + validate(NEAccessor(dst), ref_dst, tolerance_fixed_point); +} + +BOOST_TEST_DECORATOR(*boost::unit_test::label("nightly")) +BOOST_DATA_TEST_CASE(RunLarge, LargeShapes() * boost::unit_test::data::xrange(1, 14), + shape, fixed_point_position) +{ + // Compute function + Tensor dst = compute_softmax_layer(shape, DataType::QS16, fixed_point_position); + + // Compute reference + RawTensor ref_dst = Reference::compute_reference_softmax_layer(shape, DataType::QS16, fixed_point_position); + + // Validate output + validate(NEAccessor(dst), ref_dst, tolerance_fixed_point); +} +BOOST_AUTO_TEST_SUITE_END() +BOOST_AUTO_TEST_SUITE_END() + BOOST_AUTO_TEST_SUITE_END() BOOST_AUTO_TEST_SUITE_END() #endif /* DOXYGEN_SKIP_THIS */ -- cgit v1.2.1