From 7485d5a62685cb745ab50e970adb722cb71557ac Mon Sep 17 00:00:00 2001
From: Vidhya Sudhan Loganathan
Date: Wed, 4 Jul 2018 09:34:00 +0100
Subject: COMPMID-970 : Remove QS8 / QS16 support

Removed fixed point related code.

Change-Id: I487acf138dace3b0450e0d72ca7071eaec254566
Reviewed-on: https://eu-gerrit-1.euhpc.arm.com/137678
Tested-by: Jenkins
Reviewed-by: Anthony Barbier
---
 src/core/NEON/kernels/NESoftmaxLayerKernel.cpp | 166 ++-----------------------
 1 file changed, 7 insertions(+), 159 deletions(-)

diff --git a/src/core/NEON/kernels/NESoftmaxLayerKernel.cpp b/src/core/NEON/kernels/NESoftmaxLayerKernel.cpp
index d91efd267f..9946f002de 100644
--- a/src/core/NEON/kernels/NESoftmaxLayerKernel.cpp
+++ b/src/core/NEON/kernels/NESoftmaxLayerKernel.cpp
@@ -194,56 +194,7 @@ T sqadd(T a, T b);
 template <typename T>
 T sqsub(T a, T b);
 template <typename T>
-T sqmul(T a, T b, int fixed_point_position);
-
-#define DECLARE_NEON_FUNCTIONS_FOR_FIXED_POINT(TYPET, TYPEU, TAGT, TAGU) \
-    inline vec_8_byte_t vqsub(vec_8_byte_t a, vec_8_byte_t b) \
-    { \
-        return vqsub_##TAGT(a, b); \
-    } \
-    inline vec_8_byte_t vqadd(vec_8_byte_t a, vec_8_byte_t b) \
-    { \
-        return vqadd_##TAGU(a, b); \
-    } \
-    inline vec_16_byte_t vqadd(vec_16_byte_t a, vec_16_byte_t b) \
-    { \
-        return vqaddq_##TAGU(a, b); \
-    } \
-    inline vec_8_byte_t vqexp(vec_8_byte_t vec, int fixed_point_position) \
-    { \
-        return vqexp_q##TAGT(vec, fixed_point_position); \
-    } \
-    inline auto vmovl(vec_8_byte_t vec)->decltype(vmovl_##TAGT(vec)) \
-    { \
-        return vmovl_##TAGT(vec); \
-    } \
-    inline vec_16_byte_t vqrecip(vec_16_byte_t vec, int fixed_point_position) \
-    { \
-        return vqrecipq_q##TAGT(vec, fixed_point_position); \
-    } \
-    inline vec_16_byte_t vqmul(vec_16_byte_t a, vec_16_byte_t b, int fixed_point_position) \
-    { \
-        return vqmulq_q##TAGT(a, b, fixed_point_position); \
-    } \
-    template <> \
-    inline TYPEU sqadd(TYPEU a, TYPEU b) \
-    { \
-        return sqadd_q##TAGU(a, b); \
-    } \
-    inline TYPET sqexp(TYPET val, int fixed_point_position) \
-    { \
-        return sqexp_q##TAGT(val, fixed_point_position); \
-    } \
-    template <> \
-    inline TYPET sqsub(TYPET a, TYPET b) \
-    { \
-        return sqsub_q##TAGT(a, b); \
-    } \
-    template <> \
-    inline TYPET sqmul(TYPET a, TYPET b, int fixed_point_position) \
-    { \
-        return sqmul_q##TAGT(a, b, fixed_point_position); \
-    }
+T sqmul(T a, T b);
 
 #define DECLARE_NEON_FUNCTIONS_FOR_FLOAT(TYPE, TAG) \
     inline vec_8_byte_t vadd(vec_8_byte_t a, vec_8_byte_t b) \
@@ -278,9 +229,6 @@ DECLARE_NEON_FUNCTIONS_FOR_TYPE(float16_t, f16)
 #endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
 DECLARE_NEON_FUNCTIONS_FOR_TYPE(float, f32)
 
-DECLARE_NEON_FUNCTIONS_FOR_FIXED_POINT(int8_t, int16_t, s8, s16)
-DECLARE_NEON_FUNCTIONS_FOR_FIXED_POINT(int16_t, int32_t, s16, s32)
-
 #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
 DECLARE_NEON_FUNCTIONS_FOR_FLOAT(float16_t, f16)
 #endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
@@ -373,16 +321,15 @@ namespace
 Status validate_arguments_logits_1d_max(const ITensorInfo &input, const ITensorInfo &output)
 {
 #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&input, 1, DataType::QASYMM8, DataType::QS8, DataType::QS16, DataType::F16, DataType::F32);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&input, 1, DataType::QASYMM8, DataType::F16, DataType::F32);
 #else /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
-    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&input, 1, DataType::QASYMM8, DataType::QS8, DataType::QS16, DataType::F32);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&input, 1, DataType::QASYMM8, DataType::F32);
 #endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
 
     // Validate in case of configured output
     if(output.total_size() != 0)
     {
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&input, &output);
-        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT_POSITION(&input, &output);
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(&input, &output);
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output.tensor_shape(), TensorShape(input.tensor_shape()).set(0, 1));
     }
@@ -395,7 +342,7 @@ std::pair validate_and_configure_window_logits_1d_max(ITensorInf
     // Softmax across the x dimension
     const TensorShape output_shape = TensorShape(input.tensor_shape()).set(0, 1);
     // Output auto initialization if not yet initialized
-    auto_init_if_empty(output, output_shape, 1, input.data_type(), input.fixed_point_position(), input.quantization_info());
+    auto_init_if_empty(output, output_shape, 1, input.data_type(), input.quantization_info());
 
     // Configure kernel window
     const int input_width = input.valid_region().shape.x();
@@ -488,12 +435,6 @@ void NELogits1DMaxKernel::configure(const ITensor *input, ITensor *output)
         case DataType::QASYMM8:
             _func = &logits_1d_max;
             break;
-        case DataType::QS8:
-            _func = &logits_1d_max;
-            break;
-        case DataType::QS16:
-            _func = &logits_1d_max;
-            break;
 #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
         case DataType::F16:
             _func = &logits_1d_max;
@@ -543,11 +484,12 @@ namespace
 Status validate_arguments_logits_softmax(const ITensorInfo &input, const ITensorInfo &max,
                                          const ITensorInfo &output, const float beta, const ITensorInfo &tmp)
 {
+    ARM_COMPUTE_UNUSED(beta);
     // Check input
 #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&input, 1, DataType::QASYMM8, DataType::QS8, DataType::QS16, DataType::F16, DataType::F32);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&input, 1, DataType::QASYMM8, DataType::F16, DataType::F32);
 #else /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
-    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&input, 1, DataType::QASYMM8, DataType::QS8, DataType::QS16, DataType::F32);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&input, 1, DataType::QASYMM8, DataType::F32);
 #endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
 
     const bool is_quantized_asymmetric = is_data_type_quantized_asymmetric(input.data_type());
@@ -555,7 +497,6 @@ Status validate_arguments_logits_softmax(const ITensorInfo &input, const ITensor
     // Check max
     ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&input, &max);
     ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(TensorShape(input.tensor_shape()).set(0, 1), max.tensor_shape());
-    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT_POSITION(&input, &max);
     ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(&input, &max);
 
     // Check output if configured
@@ -564,19 +505,14 @@ Status validate_arguments_logits_softmax(const ITensorInfo &input, const ITensor
         const QuantizationInfo output_quantization = is_quantized_asymmetric ? QuantizationInfo(1.f / 256.f, 0) : output.quantization_info();
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&input, &output);
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(&input, &output);
-        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT_POSITION(&input, &output);
         ARM_COMPUTE_RETURN_ERROR_ON(output.quantization_info() != output_quantization);
     }
 
-    // Check beta
-    ARM_COMPUTE_RETURN_ERROR_ON((beta != 1.0f) && is_data_type_fixed_point(input.data_type()));
-
     // Check tmp if configured
     if(tmp.total_size() != 0)
     {
         const DataType tmp_data_type = is_quantized_asymmetric ? DataType::F32 : input.data_type();
         ARM_COMPUTE_RETURN_ERROR_ON(tmp.data_type() != tmp_data_type);
-        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT_POSITION(&input, &tmp);
         // We could potentially reduce tmp memory if we could predict or make an assumption
         // on the maximum number of threads that will run in parallel.
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(&input, &tmp);
@@ -727,88 +663,6 @@ void logits_1d_softmax_qasymm8(const ITensor &in, const ITensor &max, void *cons
     in_it, max_it, out_it);
 }
 
-template <typename T, typename U>
-void logits_1d_softmax_fixed_point(const ITensor &in, const ITensor &max, void *const tmp,
-                                   ITensor &out, const float /*beta*/, const Window &window)
-{
-    const int start_x     = in.info()->valid_region().anchor.x();
-    const int input_width = in.info()->valid_region().shape.x();
-
-    const int fixed_point_position = in.info()->fixed_point_position();
-
-    Iterator in_it(&in, window);
-    Iterator max_it(&max, window);
-    Iterator out_it(&out, window);
-
-    execute_window_loop(window, [&](const Coordinates &)
-    {
-        /* Get pointers */
-        const auto in_ptr  = reinterpret_cast(in_it.ptr()) + start_x;
-        const auto out_ptr = reinterpret_cast(out_it.ptr()) + start_x;
-        const auto tmp_ptr = reinterpret_cast(tmp);
-
-        vec_16_byte_t vec_sum_inversed;
-
-        /* Compute exponentials and sum */
-        {
-            /* Get max value */
-            const auto max_val = *reinterpret_cast(max_it.ptr());
-            const auto vec_max = vdup_n>(max_val);
-
-            /* Init sum to zero */
-            auto vec_sum = vdup_n>(0);
-
-            /* Loop over row and compute exponentials and sum */
-            int           i        = 0;
-            constexpr int vec_size = vec_size_of(vec_sum);
-            for(; i <= (input_width - vec_size); i += vec_size)
-            {
-                auto vec_elements = vld>(in_ptr + i);
-                vec_elements      = vqsub(vec_elements, vec_max);
-                vec_elements      = vqexp(vec_elements, fixed_point_position);
-                vec_sum           = vqadd(vec_sum, vmovl(vec_elements));
-                vst(tmp_ptr + i, vec_elements);
-            }
-            /* Reduce sum */
-            const vec_8_byte_t sum_8_byte = vqadd(vget_high(vec_sum), vget_low(vec_sum));
-            U sum = reduce_add(sqadd, sum_8_byte);
-
-            /* Run remaining elements */
-            for(; i < input_width; ++i)
-            {
-                T element  = sqexp(sqsub(in_ptr[i], max_val), fixed_point_position);
-                sum        = sqadd(sum, element);
-                tmp_ptr[i] = element;
-            }
-
-            const auto qsum  = utility::saturate_cast(sum);
-            vec_sum_inversed = vqrecip(vdup_n>(qsum), fixed_point_position);
-        }
-
-        /* Normalize exponentials */
-        {
-            /* Loop over row and compute softmax */
-            int           i        = 0;
-            constexpr int vec_size = vec_size_of(vec_sum_inversed);
-            for(; i <= (input_width - vec_size); i += vec_size)
-            {
-                const auto vec_in                    = vld>(tmp_ptr + i);
-                const vec_16_byte_t normalized_value = vqmul(vec_in, vec_sum_inversed, fixed_point_position);
-                vst(out_ptr + i, normalized_value);
-            }
-
-            const T sum_inversed = vget_lane<0>(vec_sum_inversed);
-
-            /* Run remaining elements */
-            for(; i < input_width; ++i)
-            {
-                out_ptr[i] = sqmul(tmp_ptr[i], sum_inversed, fixed_point_position);
-            }
-        }
-    },
-    in_it, max_it, out_it);
-}
-
 template <typename T>
 void logits_1d_softmax_float(const ITensor &in, const ITensor &max, void *const tmp,
                              ITensor &out, const float beta, const Window &window)
@@ -908,12 +762,6 @@ void NELogits1DSoftmaxKernel::configure(const ITensor *input, const ITensor *max
         case DataType::QASYMM8:
             _func = &logits_1d_softmax_qasymm8;
             break;
-        case DataType::QS8:
-            _func = &logits_1d_softmax_fixed_point;
-            break;
-        case DataType::QS16:
-            _func = &logits_1d_softmax_fixed_point;
-            break;
 #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
         case DataType::F16:
             _func = &logits_1d_softmax_float;
--
cgit v1.2.1